Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
volk_32f_log2_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_log2_32f_a_H
#define INCLUDED_volk_32f_log2_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define LOG_POLY_DEGREE 6

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++)
        *bPtr++ = log2f_non_ieee(*aPtr++);
}
#endif /* LV_HAVE_GENERIC */
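/*
 * The SIMD kernels below approximate log2(x) by splitting the IEEE-754 single
 * into exponent and mantissa, since log2(2^e * m) = e + log2(m) with m in [1, 2).
 * A minimal scalar sketch of that decomposition (log2_decompose_sketch is
 * illustrative only and not part of this kernel; it assumes a normalized,
 * positive input):
 *
 *   #include <string.h>  // memcpy
 *
 *   static inline float log2_decompose_sketch(float x)
 *   {
 *       uint32_t bits;
 *       memcpy(&bits, &x, sizeof(bits));                     // bit-exact reinterpret
 *       int exponent = (int)((bits >> 23) & 0xff) - 127;     // remove the 127 bias
 *       uint32_t mbits = (bits & 0x007fffffu) | 0x3f800000u; // force exponent of 1.0f
 *       float mantissa;                                      // 1.0f <= mantissa < 2.0f
 *       memcpy(&mantissa, &mbits, sizeof(mantissa));
 *       return (float)exponent + log2f(mantissa);            // exact reference form
 *   }
 *
 * The kernels replace the log2f(mantissa) term with a low-degree polynomial
 * selected by LOG_POLY_DEGREE.
 */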

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
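/*
 * The POLY5_* macros expand to a Horner-scheme evaluation of the degree-5
 * polynomial p(m) = c0 + c1*m + ... + c5*m^5, and the final fused multiply-add
 * forms exponent + p(m) * (m - 1), where p(m) * (m - 1) approximates log2(m)
 * on [1, 2).  A scalar sketch of the same arithmetic (horner5_sketch is
 * illustrative only, not part of this kernel):
 *
 *   static inline float horner5_sketch(float m)
 *   {
 *       // same coefficients as the LOG_POLY_DEGREE == 6 branch above
 *       float p = -3.4436006e-2f;
 *       p = fmaf(p, m, 3.1821337e-1f);
 *       p = fmaf(p, m, -1.2315303f);
 *       p = fmaf(p, m, 2.5988452f);
 *       p = fmaf(p, m, -3.3241990f);
 *       p = fmaf(p, m, 3.1157899f);
 *       return p;   // approximates log2(m) / (m - 1) for m in [1, 2)
 *   }
 */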

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for aligned */
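/*
 * Worked example of the decomposition used by every SIMD kernel in this file
 * (illustrative numbers): 10.0f is stored as 1.25 * 2^3, so the biased
 * exponent field holds 130 and the kernel computes
 *
 *     exp      = 130 - 127 = 3
 *     frac     = 1.25
 *     log2(x) ~= exp + p(frac) * (frac - 1) ~= 3 + 0.32193 ~= 3.32193
 *
 * which matches log2f(10.0f) up to the accuracy of the LOG_POLY_DEGREE fit.
 */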

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_store_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for aligned */
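/*
 * The *_a_* kernels in this half of the header use aligned loads and stores
 * (_mm_load_ps / _mm256_load_ps), so their buffers must satisfy the machine
 * alignment.  A minimal allocation sketch, assuming the caller has access to
 * VOLK's allocation helpers (declared via <volk/volk.h>):
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int n = 1024;
 *   float* in  = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());
 *   float* out = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());
 *   // ... fill `in` with positive values, run the kernel, then ...
 *   volk_free(in);
 *   volk_free(out);
 */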

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/* these macros allow us to embed logs in other kernels */
#define VLOG2Q_NEON_PREAMBLE()                                   \
    int32x4_t one = vdupq_n_s32(0x000800000); /* 1 << 23 */      \
    /* minimax polynomial */                                     \
    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585);           \
    float32x4_t p1 = vdupq_n_f32(6.1129631282966113);            \
    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207);           \
    float32x4_t p3 = vdupq_n_f32(3.2865287703753912);            \
    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635);           \
    float32x4_t p5 = vdupq_n_f32(0.2751487703421256);            \
    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);           \
    int32x4_t exp_mask = vdupq_n_s32(0x7f800000);                \
    int32x4_t sig_mask = vdupq_n_s32(0x007fffff);                \
    int32x4_t exp_bias = vdupq_n_s32(127);


#define VLOG2Q_NEON_F32(log2_approx, aval)                                  \
    int32x4_t exponent_i = vandq_s32(aval, exp_mask);                       \
    int32x4_t significand_i = vandq_s32(aval, sig_mask);                    \
    exponent_i = vshrq_n_s32(exponent_i, 23);                               \
                                                                            \
    /* extract the exponent and significand;                                \
       we can treat this as fixed point to save ~9% on the                  \
       conversion + float add */                                            \
    significand_i = vorrq_s32(one, significand_i);                          \
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23);         \
    /* debias the exponent and convert to float */                          \
    exponent_i = vsubq_s32(exponent_i, exp_bias);                           \
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);                     \
                                                                            \
    /* put the significand through a polynomial fit of log2(x) on [1, 2]    \
       and add the result to the exponent */                                \
    log2_approx = vaddq_f32(exponent_f, p0); /* p0 */                       \
    float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */           \
    log2_approx = vaddq_f32(log2_approx, tmp1);                             \
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */  \
    tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */                             \
    log2_approx = vaddq_f32(log2_approx, tmp1);                             \
                                                                            \
    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */          \
    tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */                             \
    log2_approx = vaddq_f32(log2_approx, tmp1);                             \
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */                  \
    tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */                             \
    log2_approx = vaddq_f32(log2_approx, tmp1);                             \
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */                  \
    tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */                             \
    log2_approx = vaddq_f32(log2_approx, tmp1);                             \
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */                  \
    tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */                             \
    log2_approx = vaddq_f32(log2_approx, tmp1);

static inline void
volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    int32x4_t aval;
    float32x4_t log2_approx;

    VLOG2Q_NEON_PREAMBLE()
    // lms
    // p0 = vdupq_n_f32(-1.649132280361871);
    // p1 = vdupq_n_f32(1.995047138579499);
    // p2 = vdupq_n_f32(-0.336914839219728);

    // keep in mind a single precision float is represented as
    // (-1)^sign * 2^exp * 1.significand, so the log2 is
    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23))
    for (number = 0; number < quarterPoints; ++number) {
        // load float into an int register without conversion
        aval = vld1q_s32((int*)aPtr);

        VLOG2Q_NEON_F32(log2_approx, aval)

        vst1q_f32(bPtr, log2_approx);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_NEON */
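/*
 * The NEON path converts the significand with vcvtq_n_f32_s32(x, 23): the 23
 * mantissa bits plus the restored leading one are treated as a Q23 fixed-point
 * number, which lands the mantissa in [1, 2) without a separate divide.  A
 * scalar sketch of the same step (fixed_point_mantissa_sketch is illustrative
 * only, not part of this kernel):
 *
 *   static inline float fixed_point_mantissa_sketch(uint32_t float_bits)
 *   {
 *       int32_t sig = (int32_t)(float_bits & 0x007fffff) | (1 << 23); // 1.m as Q23
 *       return (float)sig * (1.0f / (float)(1 << 23));                // scale by 2^-23
 *   }
 */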


#endif /* INCLUDED_volk_32f_log2_32f_a_H */

#ifndef INCLUDED_volk_32f_log2_32f_u_H
#define INCLUDED_volk_32f_log2_32f_u_H


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        float const result = log2f(*aPtr++);
        *bPtr++ = isinf(result) ? -127.0f : result;
    }
}

#endif /* LV_HAVE_GENERIC */
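/*
 * Note that the unaligned generic kernel above clamps log2f(0.0f) = -inf to
 * -127.0f.  That is consistent with the bit-twiddling SIMD kernels, which see
 * an exponent field of 0 for a zero input and therefore produce an unbiased
 * exponent of -127 rather than -inf.  A quick illustration (not part of the
 * kernel):
 *
 *   float x = 0.0f, y;
 *   volk_32f_log2_32f_u_generic(&y, &x, 1);   // y == -127.0f, not -inf
 */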


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_storeu_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for unaligned */


#endif /* INCLUDED_volk_32f_log2_32f_u_H */
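/*
 * Callers normally do not pick one of the per-architecture functions above by
 * hand; they call the generated VOLK dispatcher, which selects the best
 * implementation for the running machine at runtime.  A minimal usage sketch,
 * assuming the caller includes <volk/volk.h> and links against libvolk:
 *
 *   #include <volk/volk.h>
 *
 *   float in[8] = { 1.0f, 2.0f, 4.0f, 8.0f, 16.0f, 32.0f, 64.0f, 128.0f };
 *   float out[8];
 *   volk_32f_log2_32f(out, in, 8);   // out = 0..7; powers of two hit the exponent exactly
 */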