Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32f_asin_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
70#include <inttypes.h>
71#include <math.h>
72#include <stdio.h>
73
74/* This is the number of terms of Taylor series to evaluate, increase this for more
75 * accuracy*/
76#define ASIN_TERMS 2
77
78#ifndef INCLUDED_volk_32f_asin_32f_a_H
79#define INCLUDED_volk_32f_asin_32f_a_H
80
81#if LV_HAVE_AVX2 && LV_HAVE_FMA
82#include <immintrin.h>
83
84static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
85 const float* aVector,
86 unsigned int num_points)
87{
88 float* bPtr = bVector;
89 const float* aPtr = aVector;
90
91 unsigned int number = 0;
92 unsigned int eighthPoints = num_points / 8;
93 int i, j;
94
95 __m256 aVal, pio2, x, y, z, arcsine;
96 __m256 fzeroes, fones, ftwos, ffours, condition;
97
98 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
99 fzeroes = _mm256_setzero_ps();
100 fones = _mm256_set1_ps(1.0);
101 ftwos = _mm256_set1_ps(2.0);
102 ffours = _mm256_set1_ps(4.0);
103
104 for (; number < eighthPoints; number++) {
105 aVal = _mm256_load_ps(aPtr);
106 aVal = _mm256_div_ps(aVal,
107 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
108 _mm256_sub_ps(fones, aVal))));
109 z = aVal;
110 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
111 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
112 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
113 x = _mm256_add_ps(
114 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
115
116 for (i = 0; i < 2; i++) {
117 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
118 }
119 x = _mm256_div_ps(fones, x);
120 y = fzeroes;
121 for (j = ASIN_TERMS - 1; j >= 0; j--) {
122 y = _mm256_fmadd_ps(
123 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
124 }
125
126 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
127 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
128
129 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
130 arcsine = y;
131 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
132 arcsine = _mm256_sub_ps(arcsine,
133 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
134
135 _mm256_store_ps(bPtr, arcsine);
136 aPtr += 8;
137 bPtr += 8;
138 }
139
140 number = eighthPoints * 8;
141 for (; number < num_points; number++) {
142 *bPtr++ = asin(*aPtr++);
143 }
144}
145
146#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
147
148
149#ifdef LV_HAVE_AVX
150#include <immintrin.h>
151
152static inline void
153volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
154{
155 float* bPtr = bVector;
156 const float* aPtr = aVector;
157
158 unsigned int number = 0;
159 unsigned int eighthPoints = num_points / 8;
160 int i, j;
161
162 __m256 aVal, pio2, x, y, z, arcsine;
163 __m256 fzeroes, fones, ftwos, ffours, condition;
164
165 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
166 fzeroes = _mm256_setzero_ps();
167 fones = _mm256_set1_ps(1.0);
168 ftwos = _mm256_set1_ps(2.0);
169 ffours = _mm256_set1_ps(4.0);
170
171 for (; number < eighthPoints; number++) {
172 aVal = _mm256_load_ps(aPtr);
173 aVal = _mm256_div_ps(aVal,
174 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
175 _mm256_sub_ps(fones, aVal))));
176 z = aVal;
177 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
178 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
179 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
180 x = _mm256_add_ps(
181 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
182
183 for (i = 0; i < 2; i++) {
184 x = _mm256_add_ps(x,
185 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
186 }
187 x = _mm256_div_ps(fones, x);
188 y = fzeroes;
189 for (j = ASIN_TERMS - 1; j >= 0; j--) {
190 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
191 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
192 }
193
194 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
195 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
196
197 y = _mm256_add_ps(
198 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
199 arcsine = y;
200 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
201 arcsine = _mm256_sub_ps(arcsine,
202 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
203
204 _mm256_store_ps(bPtr, arcsine);
205 aPtr += 8;
206 bPtr += 8;
207 }
208
209 number = eighthPoints * 8;
210 for (; number < num_points; number++) {
211 *bPtr++ = asin(*aPtr++);
212 }
213}
214
215#endif /* LV_HAVE_AVX for aligned */
216
217#ifdef LV_HAVE_SSE4_1
218#include <smmintrin.h>
219
220static inline void
221volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
222{
223 float* bPtr = bVector;
224 const float* aPtr = aVector;
225
226 unsigned int number = 0;
227 unsigned int quarterPoints = num_points / 4;
228 int i, j;
229
230 __m128 aVal, pio2, x, y, z, arcsine;
231 __m128 fzeroes, fones, ftwos, ffours, condition;
232
233 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
234 fzeroes = _mm_setzero_ps();
235 fones = _mm_set1_ps(1.0);
236 ftwos = _mm_set1_ps(2.0);
237 ffours = _mm_set1_ps(4.0);
238
239 for (; number < quarterPoints; number++) {
240 aVal = _mm_load_ps(aPtr);
241 aVal = _mm_div_ps(
242 aVal,
243 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
244 z = aVal;
245 condition = _mm_cmplt_ps(z, fzeroes);
246 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
247 condition = _mm_cmplt_ps(z, fones);
248 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
249
250 for (i = 0; i < 2; i++) {
251 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
252 }
253 x = _mm_div_ps(fones, x);
254 y = fzeroes;
255 for (j = ASIN_TERMS - 1; j >= 0; j--) {
256 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
257 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
258 }
259
260 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
261 condition = _mm_cmpgt_ps(z, fones);
262
263 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
264 arcsine = y;
265 condition = _mm_cmplt_ps(aVal, fzeroes);
266 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
267
268 _mm_store_ps(bPtr, arcsine);
269 aPtr += 4;
270 bPtr += 4;
271 }
272
273 number = quarterPoints * 4;
274 for (; number < num_points; number++) {
275 *bPtr++ = asinf(*aPtr++);
276 }
277}
278
279#endif /* LV_HAVE_SSE4_1 for aligned */
280
281#endif /* INCLUDED_volk_32f_asin_32f_a_H */
282
283#ifndef INCLUDED_volk_32f_asin_32f_u_H
284#define INCLUDED_volk_32f_asin_32f_u_H
285
286#if LV_HAVE_AVX2 && LV_HAVE_FMA
287#include <immintrin.h>
288
289static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
290 const float* aVector,
291 unsigned int num_points)
292{
293 float* bPtr = bVector;
294 const float* aPtr = aVector;
295
296 unsigned int number = 0;
297 unsigned int eighthPoints = num_points / 8;
298 int i, j;
299
300 __m256 aVal, pio2, x, y, z, arcsine;
301 __m256 fzeroes, fones, ftwos, ffours, condition;
302
303 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
304 fzeroes = _mm256_setzero_ps();
305 fones = _mm256_set1_ps(1.0);
306 ftwos = _mm256_set1_ps(2.0);
307 ffours = _mm256_set1_ps(4.0);
308
309 for (; number < eighthPoints; number++) {
310 aVal = _mm256_loadu_ps(aPtr);
311 aVal = _mm256_div_ps(aVal,
312 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
313 _mm256_sub_ps(fones, aVal))));
314 z = aVal;
315 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
316 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
317 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
318 x = _mm256_add_ps(
319 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
320
321 for (i = 0; i < 2; i++) {
322 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
323 }
324 x = _mm256_div_ps(fones, x);
325 y = fzeroes;
326 for (j = ASIN_TERMS - 1; j >= 0; j--) {
327 y = _mm256_fmadd_ps(
328 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
329 }
330
331 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
332 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
333
334 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
335 arcsine = y;
336 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
337 arcsine = _mm256_sub_ps(arcsine,
338 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
339
340 _mm256_storeu_ps(bPtr, arcsine);
341 aPtr += 8;
342 bPtr += 8;
343 }
344
345 number = eighthPoints * 8;
346 for (; number < num_points; number++) {
347 *bPtr++ = asin(*aPtr++);
348 }
349}
350
351#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
352
353
354#ifdef LV_HAVE_AVX
355#include <immintrin.h>
356
357static inline void
358volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
359{
360 float* bPtr = bVector;
361 const float* aPtr = aVector;
362
363 unsigned int number = 0;
364 unsigned int eighthPoints = num_points / 8;
365 int i, j;
366
367 __m256 aVal, pio2, x, y, z, arcsine;
368 __m256 fzeroes, fones, ftwos, ffours, condition;
369
370 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
371 fzeroes = _mm256_setzero_ps();
372 fones = _mm256_set1_ps(1.0);
373 ftwos = _mm256_set1_ps(2.0);
374 ffours = _mm256_set1_ps(4.0);
375
376 for (; number < eighthPoints; number++) {
377 aVal = _mm256_loadu_ps(aPtr);
378 aVal = _mm256_div_ps(aVal,
379 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
380 _mm256_sub_ps(fones, aVal))));
381 z = aVal;
382 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
383 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
384 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
385 x = _mm256_add_ps(
386 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
387
388 for (i = 0; i < 2; i++) {
389 x = _mm256_add_ps(x,
390 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
391 }
392 x = _mm256_div_ps(fones, x);
393 y = fzeroes;
394 for (j = ASIN_TERMS - 1; j >= 0; j--) {
395 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
396 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
397 }
398
399 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
400 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
401
402 y = _mm256_add_ps(
403 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
404 arcsine = y;
405 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
406 arcsine = _mm256_sub_ps(arcsine,
407 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
408
409 _mm256_storeu_ps(bPtr, arcsine);
410 aPtr += 8;
411 bPtr += 8;
412 }
413
414 number = eighthPoints * 8;
415 for (; number < num_points; number++) {
416 *bPtr++ = asin(*aPtr++);
417 }
418}
419
420#endif /* LV_HAVE_AVX for unaligned */
421
422
423#ifdef LV_HAVE_SSE4_1
424#include <smmintrin.h>
425
426static inline void
427volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
428{
429 float* bPtr = bVector;
430 const float* aPtr = aVector;
431
432 unsigned int number = 0;
433 unsigned int quarterPoints = num_points / 4;
434 int i, j;
435
436 __m128 aVal, pio2, x, y, z, arcsine;
437 __m128 fzeroes, fones, ftwos, ffours, condition;
438
439 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
440 fzeroes = _mm_setzero_ps();
441 fones = _mm_set1_ps(1.0);
442 ftwos = _mm_set1_ps(2.0);
443 ffours = _mm_set1_ps(4.0);
444
445 for (; number < quarterPoints; number++) {
446 aVal = _mm_loadu_ps(aPtr);
447 aVal = _mm_div_ps(
448 aVal,
449 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
450 z = aVal;
451 condition = _mm_cmplt_ps(z, fzeroes);
452 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
453 condition = _mm_cmplt_ps(z, fones);
454 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
455
456 for (i = 0; i < 2; i++) {
457 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
458 }
459 x = _mm_div_ps(fones, x);
460 y = fzeroes;
461 for (j = ASIN_TERMS - 1; j >= 0; j--) {
462 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
463 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
464 }
465
466 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
467 condition = _mm_cmpgt_ps(z, fones);
468
469 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
470 arcsine = y;
471 condition = _mm_cmplt_ps(aVal, fzeroes);
472 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
473
474 _mm_storeu_ps(bPtr, arcsine);
475 aPtr += 4;
476 bPtr += 4;
477 }
478
479 number = quarterPoints * 4;
480 for (; number < num_points; number++) {
481 *bPtr++ = asinf(*aPtr++);
482 }
483}
484
485#endif /* LV_HAVE_SSE4_1 for unaligned */
486
487#ifdef LV_HAVE_GENERIC
488
489static inline void
490volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
491{
492 float* bPtr = bVector;
493 const float* aPtr = aVector;
494 unsigned int number = 0;
495
496 for (number = 0; number < num_points; number++) {
497 *bPtr++ = asinf(*aPtr++);
498 }
499}
500#endif /* LV_HAVE_GENERIC */
501
502#endif /* INCLUDED_volk_32f_asin_32f_u_H */
#define ASIN_TERMS
Definition: volk_32f_asin_32f.h:76
static void volk_32f_asin_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:358
static void volk_32f_asin_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:153
static void volk_32f_asin_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:490
for i
Definition: volk_config_fixed.tmpl.h:25