#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H

#include <math.h>
#include <volk/volk_complex.h>

/* Renormalize the running phase once per ROTATOR_RELOAD samples so that
 * accumulated rounding error cannot change its magnitude. */
#define ROTATOR_RELOAD 512
#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
                                                           const lv_32fc_t* inVector,
                                                           const lv_32fc_t phase_inc,
                                                           lv_32fc_t* phase,
                                                           unsigned int num_points)
{
    unsigned int i = 0;
    int j = 0;
    /* Rotate full blocks of ROTATOR_RELOAD samples, renormalizing the phase
     * after each block. */
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    /* Remaining samples that do not fill a whole block. */
    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
}
#endif /* LV_HAVE_GENERIC */
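/*
 * Illustrative usage (a sketch, not part of this header): the public VOLK
 * dispatcher volk_32fc_s32fc_x2_rotator_32fc() selects one of the kernels in
 * this file at runtime.  The rotator multiplies every input sample by a
 * running unit-magnitude phasor and writes the final phasor back through
 * 'phase', so consecutive buffers continue the oscillation seamlessly.  The
 * 0.1 rad/sample shift below is made up for the example.
 *
 *   #include <volk/volk.h>
 *   #include <math.h>
 *
 *   void shift_block(lv_32fc_t* out, const lv_32fc_t* in,
 *                    unsigned int n, lv_32fc_t* phase)
 *   {
 *       const lv_32fc_t phase_inc = lv_cmake(cosf(0.1f), sinf(0.1f));
 *       volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, phase, n);
 *   }
 */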
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
                                                        const lv_32fc_t* inVector,
                                                        const lv_32fc_t phase_inc,
                                                        lv_32fc_t* phase,
                                                        unsigned int num_points)
{
    lv_32fc_t* outputVectorPtr = outVector;
    const lv_32fc_t* inputVectorPtr = inVector;
    lv_32fc_t incr = 1;
    /* Four consecutive phase values, one per NEON lane. */
    lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;

    /* Stagger the four lanes: lane k starts at (*phase) * phase_inc^k, so the
     * per-iteration phase increment becomes phase_inc^4. */
    for (i = 0; i < 4; ++i) {
        phasePtr[i] *= incr;
        incr *= (phase_inc);
    }

    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);

    /* ... main loop over blocks of four complex samples (elided in this listing) ... */

        input_vec = vld2q_f32((float*)inputVectorPtr);
        output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
        phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
        vst2q_f32((float*)outputVectorPtr, output_vec);

        inputVectorPtr += 4;
        outputVectorPtr += 4;

        /* Once per block: pull the four phase lanes back to unit magnitude. */
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);

    /* ... loop over the remaining groups of four samples (elided) ... */

        input_vec = vld2q_f32((float*)inputVectorPtr);
        output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
        phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
        vst2q_f32((float*)outputVectorPtr, output_vec);

        inputVectorPtr += 4;
        outputVectorPtr += 4;

        /* ... recompute inv_mag for the updated phase (elided) ... */
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);

    vst2q_f32((float*)phasePtr, phase_vec);

    /* Scalar tail for the last num_points % 4 samples. */
    for (i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (phase_inc);
    }

    (*phase) = phasePtr[0];
}
#endif /* LV_HAVE_NEON */
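/*
 * The x86 kernels below all expand the complex product the same way: with
 * interleaved (re, im) pairs in a register, _mm_moveldup_ps duplicates the
 * real parts of the phase, _mm_movehdup_ps duplicates its imaginary parts,
 * the 0xB1 shuffle swaps re/im within each input pair, and _mm_addsub_ps
 * applies the subtract/add pair in a single instruction.  A scalar sketch of
 * the identity being computed (variable names are illustrative only):
 *
 *   out_re = a_re * p_re - a_im * p_im;
 *   out_im = a_im * p_re + a_re * p_im;
 *
 * The same pattern, applied to the phase register and the per-iteration
 * increment, advances the phase itself.
 */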
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    /* Two consecutive phase values, one per complex lane of an __m128. */
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    /* Stagger the two lanes; incr ends up holding phase_inc^2. */
    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    /* ... main loop over blocks of two complex samples (elided in this listing) ... */

        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);           /* rotated output samples */
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p); /* advance the phase */

        _mm_store_ps((float*)cPtr, z);

    /* Once per block: renormalize both phase lanes to unit magnitude. */
    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm_hadd_ps(tmp1, tmp1);
    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm_sqrt_ps(tmp1);
    phase_Val = _mm_div_ps(phase_Val, tmp2);

    /* ... loop over the remaining pairs of samples (elided) ... */

        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm_hadd_ps(tmp1, tmp1);
    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm_sqrt_ps(tmp1);
    phase_Val = _mm_div_ps(phase_Val, tmp2);

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    /* Odd sample count: rotate the final sample with scalar arithmetic. */
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#endif /* LV_HAVE_SSE4_1 */
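/*
 * Why the mul/hadd/shuffle/sqrt/div sequence above: multiplying the phase by
 * phase_inc thousands of times lets rounding error creep into its magnitude,
 * so the kernels periodically (the ROTATOR_RELOAD constants set the block
 * size) rescale each phase lane back to unit length.  A scalar sketch of what
 * those intrinsics compute per complex lane (names illustrative only):
 *
 *   float mag = sqrtf(p_re * p_re + p_im * p_im);
 *   p_re /= mag;
 *   p_im /= mag;
 *
 * The NEON kernel does the same job with _vmagnitudesquaredq_f32 and
 * _vinvsqrtq_f32, and the plain AVX kernels delegate it to
 * _mm256_normalize_ps from volk_avx_intrinsics.h.
 */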
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    /* ... main loop over blocks of two complex samples (elided in this listing);
       identical to the aligned kernel above except for the unaligned loads and
       stores ... */

        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

    /* Once per block: renormalize both phase lanes to unit magnitude. */
    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm_hadd_ps(tmp1, tmp1);
    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm_sqrt_ps(tmp1);
    phase_Val = _mm_div_ps(phase_Val, tmp2);

    /* ... loop over the remaining pairs of samples (elided) ... */

        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm_hadd_ps(tmp1, tmp1);
    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm_sqrt_ps(tmp1);
    phase_Val = _mm_div_ps(phase_Val, tmp2);

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    /* Odd sample count: rotate the final sample with scalar arithmetic. */
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#endif /* LV_HAVE_SSE4_1 */
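/*
 * The AVX kernels below come in aligned (_a_) and unaligned (_u_) flavors: the
 * _a_ versions use _mm256_load_ps/_mm256_store_ps, which require 32-byte
 * aligned input and output buffers, while the _u_ versions use the unaligned
 * loadu/storeu forms.  The non-FMA AVX kernels also lean on the shared helpers
 * _mm256_complexmul_ps() and _mm256_normalize_ps() from volk_avx_intrinsics.h
 * instead of spelling out the shuffle/addsub sequence by hand.
 */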
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    /* Stagger the four lanes; incr ends up holding phase_inc^4. */
    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    const __m256 inc_Val =
        _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                      lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    /* ... main loop (elided): rotate four samples per iteration and renormalize
       the phase via _mm256_normalize_ps() once per block ... */

        aVal = _mm256_load_ps((float*)aPtr);
        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
        _mm256_store_ps((float*)cPtr, z);

    /* ... loop over the remaining groups of four samples (elided) ... */

        aVal = _mm256_load_ps((float*)aPtr);
        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
        _mm256_store_ps((float*)cPtr, z);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    /* ... scalar tail for any remaining samples (elided) ... */
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    const __m256 inc_Val =
        _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                      lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    /* ... main loop (elided): same as the aligned kernel above but with
       unaligned loads and stores ... */

        aVal = _mm256_loadu_ps((float*)aPtr);
        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
        _mm256_storeu_ps((float*)cPtr, z);

    /* ... loop over the remaining groups of four samples (elided) ... */

        aVal = _mm256_loadu_ps((float*)aPtr);
        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
        _mm256_storeu_ps((float*)cPtr, z);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    /* ... scalar tail for any remaining samples (elided) ... */
}
#endif /* LV_HAVE_AVX */
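/*
 * The FMA variants below fold the first multiply of the complex product into
 * _mm256_fmaddsub_ps(a, b, c), which computes a[i]*b[i] - c[i] in even lanes
 * and a[i]*b[i] + c[i] in odd lanes, saving one _mm256_mul_ps per complex
 * multiply compared with the mul + addsub sequence used above.
 */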
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    __VOLK_ATTR_ALIGNED(32)
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    /* Stagger the four lanes; incr ends up holding phase_inc^4. */
    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);
    inc_Val =
        _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                      lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    /* ... main loop over blocks of four complex samples (elided in this listing) ... */

        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);           /* rotated output samples */
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); /* advance the phase */

        _mm256_store_ps((float*)cPtr, z);

    /* Once per block: renormalize the four phase lanes to unit magnitude. */
    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    /* ... loop over the remaining groups of four samples (elided) ... */

        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    /* Scalar tail for the last num_points % 4 samples. */
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val =
        _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                      lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    /* ... main loop over blocks of four complex samples (elided in this listing);
       identical to the aligned kernel above except for the unaligned loads and
       stores ... */

        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

    /* Once per block: renormalize the four phase lanes to unit magnitude. */
    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    /* ... loop over the remaining groups of four samples (elided) ... */

        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    /* Scalar tail for the last num_points % 4 samples. */
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */