#ifndef INCLUDED_volk_32f_log2_32f_a_H
#define INCLUDED_volk_32f_log2_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define LOG_POLY_DEGREE 6
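/*
 * Each kernel below computes log2() by splitting the IEEE-754 float into its
 * exponent and mantissa fields, using log2(2^e * 1.m) = e + log2(1.m): the
 * exponent is debiased and added to a polynomial approximation of log2(1.m)
 * on [1, 2). LOG_POLY_DEGREE selects the order of that polynomial fit.
 */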
#ifdef LV_HAVE_GENERIC

static inline void volk_32f_log2_32f_generic(float* bVector,
                                             const float* aVector,
                                             unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++)
        *bPtr++ = log2f_non_ieee(*aPtr++);
}

#endif /* LV_HAVE_GENERIC */
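/*
 * Minimal caller-side sketch (not part of this header): the generated VOLK
 * dispatcher volk_32f_log2_32f() selects one of the kernels below at runtime.
 * Assumes the usual <volk/volk.h> entry points volk_malloc()/volk_free() and
 * volk_get_alignment().
 *
 *   unsigned int n = 1024;
 *   float* in  = (float*)volk_malloc(sizeof(float) * n, volk_get_alignment());
 *   float* out = (float*)volk_malloc(sizeof(float) * n, volk_get_alignment());
 *   for (unsigned int i = 0; i < n; i++)
 *       in[i] = (float)(i + 1);
 *   volk_32f_log2_32f(out, in, n);   // out[i] ~= log2(in[i])
 *   volk_free(in);
 *   volk_free(out);
 */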
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
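/*
 * The POLYn_* macros expand recursively into a Horner evaluation, e.g.
 * POLY2_FMAAVX2(x, c0, c1, c2) computes c0 + x*(c1 + x*c2), with each step
 * fused into a single _mm256_fmadd_ps.
 */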
static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Extract the mantissa and OR in the implicit leading one: frac is in [1, 2)
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
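/*
 * These POLYn_AVX2 variants are the separate _mm256_mul_ps/_mm256_add_ps
 * counterparts of the FMA macros above, used by the plain-AVX2 kernels.
 */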
static inline void volk_32f_log2_32f_a_avx2(float* bVector,
                                            const float* aVector,
                                            unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Extract the mantissa and OR in the implicit leading one: frac is in [1, 2)
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for aligned */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
static inline void volk_32f_log2_32f_a_sse4_1(float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Extract the mantissa and OR in the implicit leading one: frac is in [1, 2)
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#endif

        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_store_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for aligned */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/* these macros allow us to embed logs in other kernels */
#define VLOG2Q_NEON_PREAMBLE()                          \
    int32x4_t one = vdupq_n_s32(0x00800000);            \
    /* minimax polynomial for log2(x) on [1, 2) */      \
    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585);  \
    float32x4_t p1 = vdupq_n_f32(6.1129631282966113);   \
    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207);  \
    float32x4_t p3 = vdupq_n_f32(3.2865287703753912);   \
    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635);  \
    float32x4_t p5 = vdupq_n_f32(0.2751487703421256);   \
    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);  \
    int32x4_t exp_mask = vdupq_n_s32(0x7f800000);       \
    int32x4_t sig_mask = vdupq_n_s32(0x007fffff);       \
    int32x4_t exp_bias = vdupq_n_s32(127);
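/*
 * Note: 'one' above is bit 23, the position of the implicit leading one of
 * the IEEE-754 significand, and p0..p6 are the coefficients of a degree-6
 * polynomial fit of log2() evaluated on the normalized significand in [1, 2).
 */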
#define VLOG2Q_NEON_F32(log2_approx, aval)                               \
    int32x4_t exponent_i = vandq_s32(aval, exp_mask);                    \
    int32x4_t significand_i = vandq_s32(aval, sig_mask);                 \
    exponent_i = vshrq_n_s32(exponent_i, 23);                            \
                                                                         \
    /* OR in the implicit leading one and treat the significand as       \
       fixed point with 23 fractional bits, so the convert lands in      \
       [1, 2) directly */                                                \
    significand_i = vorrq_s32(one, significand_i);                       \
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23);      \
    /* debias the exponent and convert it to float */                    \
    exponent_i = vsubq_s32(exponent_i, exp_bias);                        \
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);                  \
                                                                         \
    /* evaluate the polynomial fit of log2(x) on [1, 2) term by term     \
       and accumulate it onto the exponent */                            \
    log2_approx = vaddq_f32(exponent_f, p0);                             \
    float32x4_t tmp1 = vmulq_f32(significand_f, p1);                     \
    log2_approx = vaddq_f32(log2_approx, tmp1);                          \
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f);         \
    tmp1 = vmulq_f32(sig_2, p2);                                         \
    log2_approx = vaddq_f32(log2_approx, tmp1);                          \
                                                                         \
    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f);                 \
    tmp1 = vmulq_f32(sig_3, p3);                                         \
    log2_approx = vaddq_f32(log2_approx, tmp1);                          \
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2);                         \
    tmp1 = vmulq_f32(sig_4, p4);                                         \
    log2_approx = vaddq_f32(log2_approx, tmp1);                          \
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2);                         \
    tmp1 = vmulq_f32(sig_5, p5);                                         \
    log2_approx = vaddq_f32(log2_approx, tmp1);                          \
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3);                         \
    tmp1 = vmulq_f32(sig_6, p6);                                         \
    log2_approx = vaddq_f32(log2_approx, tmp1);
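/*
 * VLOG2Q_NEON_F32 expects 'aval' to hold the raw IEEE-754 bit patterns of
 * four floats (loaded as an int32x4_t, see vld1q_s32 below) and requires the
 * variables declared by VLOG2Q_NEON_PREAMBLE() to be in scope; the result is
 * written to 'log2_approx' as a float32x4_t.
 */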
static inline void volk_32f_log2_32f_neon(float* bVector,
                                          const float* aVector,
                                          unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    int32x4_t aval;
    float32x4_t log2_approx;

    VLOG2Q_NEON_PREAMBLE()

    // a single-precision float is (-1)^sign * 2^exp * 1.significand, so
    // log2(2^exp * sig) = exponent + log2(1 + significand / (1 << 23))
    for (number = 0; number < quarterPoints; ++number) {
        // load the floats into an int register without conversion
        aval = vld1q_s32((int*)aPtr);

        VLOG2Q_NEON_F32(log2_approx, aval)

        vst1q_f32(bPtr, log2_approx);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32f_log2_32f_a_H */
#ifndef INCLUDED_volk_32f_log2_32f_u_H
#define INCLUDED_volk_32f_log2_32f_u_H

#ifdef LV_HAVE_GENERIC
static inline void volk_32f_log2_32f_u_generic(float* bVector,
                                               const float* aVector,
                                               unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        float const result = log2f(*aPtr++);
        *bPtr++ = isinf(result) ? -127.0f : result;
    }
}

#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
static inline void volk_32f_log2_32f_u_sse4_1(float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Extract the mantissa and OR in the implicit leading one: frac is in [1, 2)
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#endif

        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_storeu_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for unaligned */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Extract the mantissa and OR in the implicit leading one: frac is in [1, 2)
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
static inline void volk_32f_log2_32f_u_avx2(float* bVector,
                                            const float* aVector,
                                            unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Extract the mantissa and OR in the implicit leading one: frac is in [1, 2)
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for unaligned */

#endif /* INCLUDED_volk_32f_log2_32f_u_H */