76#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
77#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
/*!
 * \brief Multiplies the input complex vector by a complex scalar and stores
 *        the result in cVector (unaligned loads/stores, AVX + FMA).
 *
 * \param cVector    output vector of num_points complex values
 * \param aVector    input vector of num_points complex values
 * \param scalar     complex scalar multiplier
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; /* 0..3 tail points left over */
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    /* Broadcast the scalar's real and imaginary parts across the register:
     * yl = sr,sr,sr,sr,sr,sr,sr,sr and yh = si,si,si,si,si,si,si,si */
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        /* Load four complex points as ar,ai,br,bi,cr,ci,dr,di */
        x = _mm256_loadu_ps((float*)a);

        tmp1 = x;

        /* Swap real/imag within each complex pair: ai,ar,bi,br,... */
        x = _mm256_shuffle_ps(x, x, 0xB1);

        tmp2 = _mm256_mul_ps(x, yh); /* ai*si, ar*si, bi*si, br*si, ... */

        /* Fused: (ar*sr - ai*si), (ai*sr + ar*si), ... — the complex product */
        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);

        _mm256_storeu_ps((float*)c, z);

        a += 4;
        c += 4;
    }

    /* Scalar tail for the remaining 0..3 points. */
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
 * \brief Multiplies the input complex vector by a complex scalar and stores
 *        the result in cVector (unaligned loads/stores, AVX).
 *
 * \param cVector    output vector of num_points complex values
 * \param aVector    input vector of num_points complex values
 * \param scalar     complex scalar multiplier
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; /* 0..3 tail points left over */
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    /* Broadcast scalar real part into yl, imaginary part into yh. */
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        /* Load four complex points as ar,ai,br,bi,cr,ci,dr,di */
        x = _mm256_loadu_ps((float*)a);

        tmp1 = _mm256_mul_ps(x, yl); /* ar*sr, ai*sr, br*sr, bi*sr, ... */

        /* Swap real/imag within each complex pair: ai,ar,bi,br,... */
        x = _mm256_shuffle_ps(x, x, 0xB1);

        tmp2 = _mm256_mul_ps(x, yh); /* ai*si, ar*si, bi*si, br*si, ... */

        /* (ar*sr - ai*si), (ai*sr + ar*si), ... — the complex product */
        z = _mm256_addsub_ps(tmp1, tmp2);

        _mm256_storeu_ps((float*)c, z);

        a += 4;
        c += 4;
    }

    /* Scalar tail for the remaining 0..3 points. */
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
 * \brief Multiplies the input complex vector by a complex scalar and stores
 *        the result in cVector (unaligned loads/stores, SSE3).
 *
 * \param cVector    output vector of num_points complex values
 * \param aVector    input vector of num_points complex values
 * \param scalar     complex scalar multiplier
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    /* Broadcast scalar real part into yl, imaginary part into yh. */
    yl = _mm_set_ps1(lv_creal(scalar));
    yh = _mm_set_ps1(lv_cimag(scalar));

    for (; number < halfPoints; number++) {
        /* Load two complex points as ar,ai,br,bi */
        x = _mm_loadu_ps((float*)a);

        tmp1 = _mm_mul_ps(x, yl); /* ar*sr, ai*sr, br*sr, bi*sr */

        /* Swap real/imag within each complex pair: ai,ar,bi,br */
        x = _mm_shuffle_ps(x, x, 0xB1);

        tmp2 = _mm_mul_ps(x, yh); /* ai*si, ar*si, bi*si, br*si */

        /* (ar*sr - ai*si), (ai*sr + ar*si), ... — the complex product */
        z = _mm_addsub_ps(tmp1, tmp2);

        _mm_storeu_ps((float*)c, z);

        a += 2;
        c += 2;
    }

    /* Odd num_points leaves one final point for the scalar path. */
    if ((num_points % 2) != 0) {
        *c = (*a) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
 * \brief Multiplies the input complex vector by a complex scalar and stores
 *        the result in cVector (portable scalar fallback).
 *
 * \param cVector    output vector of num_points complex values
 * \param aVector    input vector of num_points complex values
 * \param scalar     complex scalar multiplier
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t scalar,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;

    /* 8-way manually unrolled main loop. */
    while (number >= 8) {
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        number -= 8;
    }

    /* Clean up the remaining 0..7 points. */
    while (number-- > 0) {
        *cPtr++ = *aPtr++ * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
248#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
249#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
/*!
 * \brief Multiplies the input complex vector by a complex scalar and stores
 *        the result in cVector (aligned loads/stores, AVX + FMA).
 *
 * \param cVector    output vector of num_points complex values (32-byte aligned)
 * \param aVector    input vector of num_points complex values (32-byte aligned)
 * \param scalar     complex scalar multiplier
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; /* 0..3 tail points left over */
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    /* Broadcast scalar real part into yl, imaginary part into yh. */
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        /* Aligned load of four complex points: ar,ai,br,bi,cr,ci,dr,di */
        x = _mm256_load_ps((float*)a);

        tmp1 = x;

        /* Swap real/imag within each complex pair: ai,ar,bi,br,... */
        x = _mm256_shuffle_ps(x, x, 0xB1);

        tmp2 = _mm256_mul_ps(x, yh); /* ai*si, ar*si, bi*si, br*si, ... */

        /* Fused: (ar*sr - ai*si), (ai*sr + ar*si), ... — the complex product */
        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);

        _mm256_store_ps((float*)c, z);

        a += 4;
        c += 4;
    }

    /* Scalar tail for the remaining 0..3 points. */
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
 * \brief Multiplies the input complex vector by a complex scalar and stores
 *        the result in cVector (aligned loads/stores, AVX).
 *
 * \param cVector    output vector of num_points complex values (32-byte aligned)
 * \param aVector    input vector of num_points complex values (32-byte aligned)
 * \param scalar     complex scalar multiplier
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; /* 0..3 tail points left over */
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    /* Broadcast scalar real part into yl, imaginary part into yh. */
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        /* Aligned load of four complex points: ar,ai,br,bi,cr,ci,dr,di */
        x = _mm256_load_ps((float*)a);

        tmp1 = _mm256_mul_ps(x, yl); /* ar*sr, ai*sr, br*sr, bi*sr, ... */

        /* Swap real/imag within each complex pair: ai,ar,bi,br,... */
        x = _mm256_shuffle_ps(x, x, 0xB1);

        tmp2 = _mm256_mul_ps(x, yh); /* ai*si, ar*si, bi*si, br*si, ... */

        /* (ar*sr - ai*si), (ai*sr + ar*si), ... — the complex product */
        z = _mm256_addsub_ps(tmp1, tmp2);

        _mm256_store_ps((float*)c, z);

        a += 4;
        c += 4;
    }

    /* Scalar tail for the remaining 0..3 points. */
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
 * \brief Multiplies the input complex vector by a complex scalar and stores
 *        the result in cVector (aligned loads/stores, SSE3).
 *
 * \param cVector    output vector of num_points complex values (16-byte aligned)
 * \param aVector    input vector of num_points complex values (16-byte aligned)
 * \param scalar     complex scalar multiplier
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    /* Broadcast scalar real part into yl, imaginary part into yh. */
    yl = _mm_set_ps1(lv_creal(scalar));
    yh = _mm_set_ps1(lv_cimag(scalar));

    for (; number < halfPoints; number++) {
        /* Aligned load of two complex points: ar,ai,br,bi */
        x = _mm_load_ps((float*)a);

        tmp1 = _mm_mul_ps(x, yl); /* ar*sr, ai*sr, br*sr, bi*sr */

        /* Swap real/imag within each complex pair: ai,ar,bi,br */
        x = _mm_shuffle_ps(x, x, 0xB1);

        tmp2 = _mm_mul_ps(x, yh); /* ai*si, ar*si, bi*si, br*si */

        /* (ar*sr - ai*si), (ai*sr + ar*si), ... — the complex product */
        z = _mm_addsub_ps(tmp1, tmp2);

        _mm_store_ps((float*)c, z);

        a += 2;
        c += 2;
    }

    /* Odd num_points leaves one final point for the scalar path. */
    if ((num_points % 2) != 0) {
        *c = (*a) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
/*!
 * \brief Multiplies the input complex vector by a complex scalar and stores
 *        the result in cVector (ARM NEON).
 *
 * \param cVector    output vector of num_points complex values
 * \param aVector    input vector of num_points complex values
 * \param scalar     complex scalar multiplier
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, scalar_val;
    float32x4x2_t tmp_imag;

    /* Broadcast the scalar: val[0] holds the real part in all four lanes,
     * val[1] the imaginary part. */
    scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);

    for (number = 0; number < quarter_points; ++number) {
        /* De-interleaved load: val[0] = four reals, val[1] = four imags. */
        a_val = vld2q_f32((float*)aPtr);

        tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); /* ai*sr */
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); /* ar*sr */

        /* imag: ai*sr + ar*si ; real: ar*sr - ai*si */
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);

        /* Re-interleaving store of the four products. */
        vst2q_f32((float*)cPtr, tmp_imag);

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the remaining 0..3 points. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = *aPtr++ * scalar;
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
/*!
 * \brief Multiplies the input complex vector by a complex scalar and stores
 *        the result in cVector (portable scalar fallback, aligned variant).
 *
 * \param cVector    output vector of num_points complex values
 * \param aVector    input vector of num_points complex values
 * \param scalar     complex scalar multiplier
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;

    /* 8-way manually unrolled main loop. */
    while (number >= 8) {
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        number -= 8;
    }

    /* Clean up the remaining 0..7 points. */
    while (number-- > 0) {
        *cPtr++ = *aPtr++ * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
static void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:428
static void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:218
static void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:131
static void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:175
static void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:304
static void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:348
static void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:392
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25