Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32fc_x2_multiply_32fc.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

70#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
71#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
72
73#include <float.h>
74#include <inttypes.h>
75#include <stdio.h>
76#include <volk/volk_complex.h>
77
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Multiplies the two input complex vectors element-wise:
 *        cVector[i] = aVector[i] * bVector[i].
 *
 * Unaligned AVX2+FMA implementation: each iteration loads 4 complex floats
 * (8 floats) per operand, forms the cross products with a shuffle, and uses
 * fmaddsub to produce the real/imaginary parts in one instruction. The
 * remaining (num_points % 4) values are handled with scalar complex multiplies.
 *
 * \param cVector    output buffer (num_points complex floats)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to multiply
 */
static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t* bVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {

        const __m256 x =
            _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        const __m256 y =
            _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di

        const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        const __m256 z = _mm256_fmaddsub_ps(
            x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the last num_points % 4 values.
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Multiplies the two input complex vectors element-wise:
 *        cVector[i] = aVector[i] * bVector[i].
 *
 * Unaligned AVX implementation: processes 4 complex floats per iteration via
 * the shared _mm256_complexmul_ps helper, with a scalar loop for the
 * remaining (num_points % 4) values.
 *
 * \param cVector    output buffer (num_points complex floats)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to multiply
 */
static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps(
            (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        y = _mm256_loadu_ps(
            (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
        z = _mm256_complexmul_ps(x, y);
        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the last num_points % 4 values.
    number = quarterPoints * 4;

    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

/*!
 * \brief Multiplies the two input complex vectors element-wise:
 *        cVector[i] = aVector[i] * bVector[i].
 *
 * Unaligned SSE3 implementation: processes 2 complex floats per iteration via
 * the shared _mm_complexmul_ps helper; at most one leftover value is handled
 * with a scalar complex multiply.
 *
 * \param cVector    output buffer (num_points complex floats)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to multiply
 */
static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
        z = _mm_complexmul_ps(x, y);
        _mm_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        b += 2;
        c += 2;
    }

    // Odd num_points leaves exactly one value to finish scalar.
    if ((num_points % 2) != 0) {
        *c = (*a) * (*b);
    }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_GENERIC

/*!
 * \brief Multiplies the two input complex vectors element-wise:
 *        cVector[i] = aVector[i] * bVector[i].
 *
 * Portable reference implementation: one scalar complex multiply per element.
 *
 * \param cVector    output buffer (num_points complex floats)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to multiply
 */
static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */


224#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
225#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
226#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
227
228#include <float.h>
229#include <inttypes.h>
230#include <stdio.h>
231#include <volk/volk_complex.h>
232
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Multiplies the two input complex vectors element-wise:
 *        cVector[i] = aVector[i] * bVector[i].
 *
 * Aligned AVX2+FMA implementation (buffers must be 32-byte aligned): each
 * iteration loads 4 complex floats per operand, forms the cross products with
 * a shuffle, and uses fmaddsub to produce the real/imaginary parts in one
 * instruction. The remaining (num_points % 4) values are handled with scalar
 * complex multiplies.
 *
 * \param cVector    output buffer (num_points complex floats, 32-byte aligned)
 * \param aVector    first input vector (32-byte aligned)
 * \param bVector    second input vector (32-byte aligned)
 * \param num_points number of complex values to multiply
 */
static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t* bVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {

        const __m256 x =
            _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        const __m256 y =
            _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di

        const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        const __m256 z = _mm256_fmaddsub_ps(
            x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        _mm256_store_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the last num_points % 4 values.
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Multiplies the two input complex vectors element-wise:
 *        cVector[i] = aVector[i] * bVector[i].
 *
 * Aligned AVX implementation (buffers must be 32-byte aligned): processes 4
 * complex floats per iteration via the shared _mm256_complexmul_ps helper,
 * with a scalar loop for the remaining (num_points % 4) values.
 *
 * \param cVector    output buffer (num_points complex floats, 32-byte aligned)
 * \param aVector    first input vector (32-byte aligned)
 * \param bVector    second input vector (32-byte aligned)
 * \param num_points number of complex values to multiply
 */
static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
        z = _mm256_complexmul_ps(x, y);
        _mm256_store_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the last num_points % 4 values.
    number = quarterPoints * 4;

    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

/*!
 * \brief Multiplies the two input complex vectors element-wise:
 *        cVector[i] = aVector[i] * bVector[i].
 *
 * Aligned SSE3 implementation (buffers must be 16-byte aligned): processes 2
 * complex floats per iteration via the shared _mm_complexmul_ps helper; at
 * most one leftover value is handled with a scalar complex multiply.
 *
 * \param cVector    output buffer (num_points complex floats, 16-byte aligned)
 * \param aVector    first input vector (16-byte aligned)
 * \param bVector    second input vector (16-byte aligned)
 * \param num_points number of complex values to multiply
 */
static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
        z = _mm_complexmul_ps(x, y);
        _mm_store_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        b += 2;
        c += 2;
    }

    // Odd num_points leaves exactly one value to finish scalar.
    if ((num_points % 2) != 0) {
        *c = (*a) * (*b);
    }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_GENERIC

/*!
 * \brief Multiplies the two input complex vectors element-wise:
 *        cVector[i] = aVector[i] * bVector[i].
 *
 * Portable reference implementation (aligned variant): one scalar complex
 * multiply per element.
 *
 * \param cVector    output buffer (num_points complex floats)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to multiply
 */
static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t* bVector,
                                                        unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Multiplies the two input complex vectors element-wise:
 *        cVector[i] = aVector[i] * bVector[i].
 *
 * NEON implementation: vld2q_f32 deinterleaves 4 complex values into separate
 * real and imaginary lanes, the four partial products are formed explicitly,
 * and vst2q_f32 re-interleaves the result. The remaining (num_points % 4)
 * values are handled with scalar complex multiplies.
 *
 * \param cVector    output buffer (num_points complex floats)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to multiply
 */
static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val, c_val;
    float32x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        // store the results
        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
        vst2q_f32((float*)cVector, c_val);

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    // Scalar tail for the last num_points % 4 values.
    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON

/*!
 * \brief Multiplies the two input complex vectors element-wise:
 *        cVector[i] = aVector[i] * bVector[i].
 *
 * Alternative NEON implementation that uses fused multiply-accumulate
 * (vmlaq_f32) and multiply-subtract (vmlsq_f32) to reduce the instruction
 * count compared to the plain NEON kernel. The remaining (num_points % 4)
 * values are handled with scalar complex multiplies.
 *
 * \param cVector    output buffer (num_points complex floats)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to multiply
 */
static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
                                                            const lv_32fc_t* aVector,
                                                            const lv_32fc_t* bVector,
                                                            unsigned int num_points)
{
    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val;
    float32x4x2_t tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // do the first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        // store
        vst2q_f32((float*)cVector, tmp_imag);
        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    // Scalar tail for the last num_points % 4 values.
    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEONV7

/* Hand-written NEONv7 assembly kernel; declared extern here and defined
 * elsewhere in the library (presumably an assembly source — confirm in the
 * build tree). Same element-wise complex multiply contract as the kernels
 * above. */
extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const lv_32fc_t* bVector,
                                                 unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */


478#ifdef LV_HAVE_ORC
479
480extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
481 const lv_32fc_t* aVector,
482 const lv_32fc_t* bVector,
483 unsigned int num_points);
484
485static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
486 const lv_32fc_t* aVector,
487 const lv_32fc_t* bVector,
488 unsigned int num_points)
489{
490 volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
491}
492
493#endif /* LV_HAVE_ORC */
494
495#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
static void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:326
static void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:135
static void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:207
static void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:428
static void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:379
static void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:290
static void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:174
static void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:359
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:32
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
float complex lv_32fc_t
Definition: volk_complex.h:65
static __m128 _mm_complexmul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:32