Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
63#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
64#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
65
66#include <stdio.h>
67#include <volk/volk_common.h>
68
69#ifdef LV_HAVE_GENERIC
70
72 const lv_32fc_t* input,
73 const float* taps,
74 unsigned int num_points)
75{
76
77 float res[2];
78 float *realpt = &res[0], *imagpt = &res[1];
79 const float* aPtr = (float*)input;
80 const float* bPtr = taps;
81 unsigned int number = 0;
82
83 *realpt = 0;
84 *imagpt = 0;
85
86 for (number = 0; number < num_points; number++) {
87 *realpt += ((*aPtr++) * (*bPtr));
88 *imagpt += ((*aPtr++) * (*bPtr++));
89 }
90
91 *result = *(lv_32fc_t*)(&res[0]);
92}
93
94#endif /*LV_HAVE_GENERIC*/
95
96#if LV_HAVE_AVX2 && LV_HAVE_FMA
97
98#include <immintrin.h>
99
100static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
101 const lv_32fc_t* input,
102 const float* taps,
103 unsigned int num_points)
104{
105
106 unsigned int number = 0;
107 const unsigned int sixteenthPoints = num_points / 16;
108
109 float res[2];
110 float *realpt = &res[0], *imagpt = &res[1];
111 const float* aPtr = (float*)input;
112 const float* bPtr = taps;
113
114 __m256 a0Val, a1Val, a2Val, a3Val;
115 __m256 b0Val, b1Val, b2Val, b3Val;
116 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
117
118 __m256 dotProdVal0 = _mm256_setzero_ps();
119 __m256 dotProdVal1 = _mm256_setzero_ps();
120 __m256 dotProdVal2 = _mm256_setzero_ps();
121 __m256 dotProdVal3 = _mm256_setzero_ps();
122
123 for (; number < sixteenthPoints; number++) {
124
125 a0Val = _mm256_load_ps(aPtr);
126 a1Val = _mm256_load_ps(aPtr + 8);
127 a2Val = _mm256_load_ps(aPtr + 16);
128 a3Val = _mm256_load_ps(aPtr + 24);
129
130 x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
131 x1Val = _mm256_load_ps(bPtr + 8);
132 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
133 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
134 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
135 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
136
137 // TODO: it may be possible to rearrange swizzling to better pipeline data
138 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
139 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
140 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
141 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
142
143 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
144 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
145 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
146 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
147
148 aPtr += 32;
149 bPtr += 16;
150 }
151
152 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
153 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
154 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
155
156 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
157
158 _mm256_store_ps(dotProductVector,
159 dotProdVal0); // Store the results back into the dot product vector
160
161 *realpt = dotProductVector[0];
162 *imagpt = dotProductVector[1];
163 *realpt += dotProductVector[2];
164 *imagpt += dotProductVector[3];
165 *realpt += dotProductVector[4];
166 *imagpt += dotProductVector[5];
167 *realpt += dotProductVector[6];
168 *imagpt += dotProductVector[7];
169
170 number = sixteenthPoints * 16;
171 for (; number < num_points; number++) {
172 *realpt += ((*aPtr++) * (*bPtr));
173 *imagpt += ((*aPtr++) * (*bPtr++));
174 }
175
176 *result = *(lv_32fc_t*)(&res[0]);
177}
178
179#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
180
181#ifdef LV_HAVE_AVX
182
183#include <immintrin.h>
184
186 const lv_32fc_t* input,
187 const float* taps,
188 unsigned int num_points)
189{
190
191 unsigned int number = 0;
192 const unsigned int sixteenthPoints = num_points / 16;
193
194 float res[2];
195 float *realpt = &res[0], *imagpt = &res[1];
196 const float* aPtr = (float*)input;
197 const float* bPtr = taps;
198
199 __m256 a0Val, a1Val, a2Val, a3Val;
200 __m256 b0Val, b1Val, b2Val, b3Val;
201 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
202 __m256 c0Val, c1Val, c2Val, c3Val;
203
204 __m256 dotProdVal0 = _mm256_setzero_ps();
205 __m256 dotProdVal1 = _mm256_setzero_ps();
206 __m256 dotProdVal2 = _mm256_setzero_ps();
207 __m256 dotProdVal3 = _mm256_setzero_ps();
208
209 for (; number < sixteenthPoints; number++) {
210
211 a0Val = _mm256_load_ps(aPtr);
212 a1Val = _mm256_load_ps(aPtr + 8);
213 a2Val = _mm256_load_ps(aPtr + 16);
214 a3Val = _mm256_load_ps(aPtr + 24);
215
216 x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
217 x1Val = _mm256_load_ps(bPtr + 8);
218 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
219 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
220 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
221 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
222
223 // TODO: it may be possible to rearrange swizzling to better pipeline data
224 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
225 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
226 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
227 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
228
229 c0Val = _mm256_mul_ps(a0Val, b0Val);
230 c1Val = _mm256_mul_ps(a1Val, b1Val);
231 c2Val = _mm256_mul_ps(a2Val, b2Val);
232 c3Val = _mm256_mul_ps(a3Val, b3Val);
233
234 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
235 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
236 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
237 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
238
239 aPtr += 32;
240 bPtr += 16;
241 }
242
243 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
244 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
245 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
246
247 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
248
249 _mm256_store_ps(dotProductVector,
250 dotProdVal0); // Store the results back into the dot product vector
251
252 *realpt = dotProductVector[0];
253 *imagpt = dotProductVector[1];
254 *realpt += dotProductVector[2];
255 *imagpt += dotProductVector[3];
256 *realpt += dotProductVector[4];
257 *imagpt += dotProductVector[5];
258 *realpt += dotProductVector[6];
259 *imagpt += dotProductVector[7];
260
261 number = sixteenthPoints * 16;
262 for (; number < num_points; number++) {
263 *realpt += ((*aPtr++) * (*bPtr));
264 *imagpt += ((*aPtr++) * (*bPtr++));
265 }
266
267 *result = *(lv_32fc_t*)(&res[0]);
268}
269
270#endif /*LV_HAVE_AVX*/
271
272
273#ifdef LV_HAVE_SSE
274
275
277 const lv_32fc_t* input,
278 const float* taps,
279 unsigned int num_points)
280{
281
282 unsigned int number = 0;
283 const unsigned int sixteenthPoints = num_points / 8;
284
285 float res[2];
286 float *realpt = &res[0], *imagpt = &res[1];
287 const float* aPtr = (float*)input;
288 const float* bPtr = taps;
289
290 __m128 a0Val, a1Val, a2Val, a3Val;
291 __m128 b0Val, b1Val, b2Val, b3Val;
292 __m128 x0Val, x1Val, x2Val, x3Val;
293 __m128 c0Val, c1Val, c2Val, c3Val;
294
295 __m128 dotProdVal0 = _mm_setzero_ps();
296 __m128 dotProdVal1 = _mm_setzero_ps();
297 __m128 dotProdVal2 = _mm_setzero_ps();
298 __m128 dotProdVal3 = _mm_setzero_ps();
299
300 for (; number < sixteenthPoints; number++) {
301
302 a0Val = _mm_load_ps(aPtr);
303 a1Val = _mm_load_ps(aPtr + 4);
304 a2Val = _mm_load_ps(aPtr + 8);
305 a3Val = _mm_load_ps(aPtr + 12);
306
307 x0Val = _mm_load_ps(bPtr);
308 x1Val = _mm_load_ps(bPtr);
309 x2Val = _mm_load_ps(bPtr + 4);
310 x3Val = _mm_load_ps(bPtr + 4);
311 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
312 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
313 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
314 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
315
316 c0Val = _mm_mul_ps(a0Val, b0Val);
317 c1Val = _mm_mul_ps(a1Val, b1Val);
318 c2Val = _mm_mul_ps(a2Val, b2Val);
319 c3Val = _mm_mul_ps(a3Val, b3Val);
320
321 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
322 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
323 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
324 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
325
326 aPtr += 16;
327 bPtr += 8;
328 }
329
330 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
331 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
332 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
333
334 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
335
336 _mm_store_ps(dotProductVector,
337 dotProdVal0); // Store the results back into the dot product vector
338
339 *realpt = dotProductVector[0];
340 *imagpt = dotProductVector[1];
341 *realpt += dotProductVector[2];
342 *imagpt += dotProductVector[3];
343
344 number = sixteenthPoints * 8;
345 for (; number < num_points; number++) {
346 *realpt += ((*aPtr++) * (*bPtr));
347 *imagpt += ((*aPtr++) * (*bPtr++));
348 }
349
350 *result = *(lv_32fc_t*)(&res[0]);
351}
352
353#endif /*LV_HAVE_SSE*/
354
355#if LV_HAVE_AVX2 && LV_HAVE_FMA
356
357#include <immintrin.h>
358
359static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
360 const lv_32fc_t* input,
361 const float* taps,
362 unsigned int num_points)
363{
364
365 unsigned int number = 0;
366 const unsigned int sixteenthPoints = num_points / 16;
367
368 float res[2];
369 float *realpt = &res[0], *imagpt = &res[1];
370 const float* aPtr = (float*)input;
371 const float* bPtr = taps;
372
373 __m256 a0Val, a1Val, a2Val, a3Val;
374 __m256 b0Val, b1Val, b2Val, b3Val;
375 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
376
377 __m256 dotProdVal0 = _mm256_setzero_ps();
378 __m256 dotProdVal1 = _mm256_setzero_ps();
379 __m256 dotProdVal2 = _mm256_setzero_ps();
380 __m256 dotProdVal3 = _mm256_setzero_ps();
381
382 for (; number < sixteenthPoints; number++) {
383
384 a0Val = _mm256_loadu_ps(aPtr);
385 a1Val = _mm256_loadu_ps(aPtr + 8);
386 a2Val = _mm256_loadu_ps(aPtr + 16);
387 a3Val = _mm256_loadu_ps(aPtr + 24);
388
389 x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
390 x1Val = _mm256_load_ps(bPtr + 8);
391 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
392 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
393 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
394 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
395
396 // TODO: it may be possible to rearrange swizzling to better pipeline data
397 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
398 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
399 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
400 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
401
402 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
403 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
404 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
405 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
406
407 aPtr += 32;
408 bPtr += 16;
409 }
410
411 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
412 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
413 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
414
415 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
416
417 _mm256_store_ps(dotProductVector,
418 dotProdVal0); // Store the results back into the dot product vector
419
420 *realpt = dotProductVector[0];
421 *imagpt = dotProductVector[1];
422 *realpt += dotProductVector[2];
423 *imagpt += dotProductVector[3];
424 *realpt += dotProductVector[4];
425 *imagpt += dotProductVector[5];
426 *realpt += dotProductVector[6];
427 *imagpt += dotProductVector[7];
428
429 number = sixteenthPoints * 16;
430 for (; number < num_points; number++) {
431 *realpt += ((*aPtr++) * (*bPtr));
432 *imagpt += ((*aPtr++) * (*bPtr++));
433 }
434
435 *result = *(lv_32fc_t*)(&res[0]);
436}
437
438#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
439
440#ifdef LV_HAVE_AVX
441
442#include <immintrin.h>
443
445 const lv_32fc_t* input,
446 const float* taps,
447 unsigned int num_points)
448{
449
450 unsigned int number = 0;
451 const unsigned int sixteenthPoints = num_points / 16;
452
453 float res[2];
454 float *realpt = &res[0], *imagpt = &res[1];
455 const float* aPtr = (float*)input;
456 const float* bPtr = taps;
457
458 __m256 a0Val, a1Val, a2Val, a3Val;
459 __m256 b0Val, b1Val, b2Val, b3Val;
460 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
461 __m256 c0Val, c1Val, c2Val, c3Val;
462
463 __m256 dotProdVal0 = _mm256_setzero_ps();
464 __m256 dotProdVal1 = _mm256_setzero_ps();
465 __m256 dotProdVal2 = _mm256_setzero_ps();
466 __m256 dotProdVal3 = _mm256_setzero_ps();
467
468 for (; number < sixteenthPoints; number++) {
469
470 a0Val = _mm256_loadu_ps(aPtr);
471 a1Val = _mm256_loadu_ps(aPtr + 8);
472 a2Val = _mm256_loadu_ps(aPtr + 16);
473 a3Val = _mm256_loadu_ps(aPtr + 24);
474
475 x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
476 x1Val = _mm256_loadu_ps(bPtr + 8);
477 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
478 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
479 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
480 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
481
482 // TODO: it may be possible to rearrange swizzling to better pipeline data
483 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
484 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
485 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
486 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
487
488 c0Val = _mm256_mul_ps(a0Val, b0Val);
489 c1Val = _mm256_mul_ps(a1Val, b1Val);
490 c2Val = _mm256_mul_ps(a2Val, b2Val);
491 c3Val = _mm256_mul_ps(a3Val, b3Val);
492
493 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
494 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
495 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
496 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
497
498 aPtr += 32;
499 bPtr += 16;
500 }
501
502 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
503 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
504 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
505
506 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
507
508 _mm256_store_ps(dotProductVector,
509 dotProdVal0); // Store the results back into the dot product vector
510
511 *realpt = dotProductVector[0];
512 *imagpt = dotProductVector[1];
513 *realpt += dotProductVector[2];
514 *imagpt += dotProductVector[3];
515 *realpt += dotProductVector[4];
516 *imagpt += dotProductVector[5];
517 *realpt += dotProductVector[6];
518 *imagpt += dotProductVector[7];
519
520 number = sixteenthPoints * 16;
521 for (; number < num_points; number++) {
522 *realpt += ((*aPtr++) * (*bPtr));
523 *imagpt += ((*aPtr++) * (*bPtr++));
524 }
525
526 *result = *(lv_32fc_t*)(&res[0]);
527}
528#endif /*LV_HAVE_AVX*/
529
530#ifdef LV_HAVE_NEON
531#include <arm_neon.h>
532
533static inline void
535 const lv_32fc_t* __restrict input,
536 const float* __restrict taps,
537 unsigned int num_points)
538{
539
540 unsigned int number;
541 const unsigned int quarterPoints = num_points / 8;
542
543 float res[2];
544 float *realpt = &res[0], *imagpt = &res[1];
545 const float* inputPtr = (float*)input;
546 const float* tapsPtr = taps;
547 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
548 float accVector_real[4];
549 float accVector_imag[4];
550
551 float32x4x2_t inputVector0, inputVector1;
552 float32x4_t tapsVector0, tapsVector1;
553 float32x4_t tmp_real0, tmp_imag0;
554 float32x4_t tmp_real1, tmp_imag1;
555 float32x4_t real_accumulator0, imag_accumulator0;
556 float32x4_t real_accumulator1, imag_accumulator1;
557
558 // zero out accumulators
559 // take a *float, return float32x4_t
560 real_accumulator0 = vld1q_f32(zero);
561 imag_accumulator0 = vld1q_f32(zero);
562 real_accumulator1 = vld1q_f32(zero);
563 imag_accumulator1 = vld1q_f32(zero);
564
565 for (number = 0; number < quarterPoints; number++) {
566 // load doublewords and duplicate in to second lane
567 tapsVector0 = vld1q_f32(tapsPtr);
568 tapsVector1 = vld1q_f32(tapsPtr + 4);
569
570 // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
571 inputVector0 = vld2q_f32(inputPtr);
572 inputVector1 = vld2q_f32(inputPtr + 8);
573 // inputVector is now a struct of two vectors, 0th is real, 1st is imag
574
575 tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
576 tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
577
578 tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
579 tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
580
581 real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
582 imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
583
584 real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
585 imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
586
587 tapsPtr += 8;
588 inputPtr += 16;
589 }
590
591 real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
592 imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
593 // void vst1q_f32( float32_t * ptr, float32x4_t val);
594 // store results back to a complex (array of 2 floats)
595 vst1q_f32(accVector_real, real_accumulator0);
596 vst1q_f32(accVector_imag, imag_accumulator0);
597 *realpt =
598 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
599
600 *imagpt =
601 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
602
603 // clean up the remainder
604 for (number = quarterPoints * 8; number < num_points; number++) {
605 *realpt += ((*inputPtr++) * (*tapsPtr));
606 *imagpt += ((*inputPtr++) * (*tapsPtr++));
607 }
608
609 *result = *(lv_32fc_t*)(&res[0]);
610}
611
612#endif /*LV_HAVE_NEON*/
613
614#ifdef LV_HAVE_NEON
615#include <arm_neon.h>
616
617static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
618 const lv_32fc_t* __restrict input,
619 const float* __restrict taps,
620 unsigned int num_points)
621{
622
623 unsigned int number;
624 const unsigned int quarterPoints = num_points / 4;
625
626 float res[2];
627 float *realpt = &res[0], *imagpt = &res[1];
628 const float* inputPtr = (float*)input;
629 const float* tapsPtr = taps;
630 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
631 float accVector_real[4];
632 float accVector_imag[4];
633
634 float32x4x2_t inputVector;
635 float32x4_t tapsVector;
636 float32x4_t tmp_real, tmp_imag;
637 float32x4_t real_accumulator, imag_accumulator;
638
639
640 // zero out accumulators
641 // take a *float, return float32x4_t
642 real_accumulator = vld1q_f32(zero);
643 imag_accumulator = vld1q_f32(zero);
644
645 for (number = 0; number < quarterPoints; number++) {
646 // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
647 // load doublewords and duplicate in to second lane
648 tapsVector = vld1q_f32(tapsPtr);
649
650 // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
651 inputVector = vld2q_f32(inputPtr);
652
653 tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
654 tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
655
656 real_accumulator = vaddq_f32(real_accumulator, tmp_real);
657 imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
658
659
660 tapsPtr += 4;
661 inputPtr += 8;
662 }
663
664 // store results back to a complex (array of 2 floats)
665 vst1q_f32(accVector_real, real_accumulator);
666 vst1q_f32(accVector_imag, imag_accumulator);
667 *realpt =
668 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
669
670 *imagpt =
671 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
672
673 // clean up the remainder
674 for (number = quarterPoints * 4; number < num_points; number++) {
675 *realpt += ((*inputPtr++) * (*tapsPtr));
676 *imagpt += ((*inputPtr++) * (*tapsPtr++));
677 }
678
679 *result = *(lv_32fc_t*)(&res[0]);
680}
681
682#endif /*LV_HAVE_NEON*/
683
#ifdef LV_HAVE_NEONV7
/* Declared here, defined outside this header (presumably in an assembly
 * source, going by the name - confirm against the build). Same parameter
 * list as the C kernels in this file. */
extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const float* taps,
                                                  unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_NEONV7
/* Externally defined variant; the "vmla" suffix suggests a multiply-accumulate
 * based implementation - confirm against its source. */
extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const float* taps,
                                                      unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_NEONV7
/* Externally defined variant; the "pipeline" suffix suggests a software
 * pipelined implementation - confirm against its source. */
extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const float* taps,
                                                       unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/
704
705#ifdef LV_HAVE_SSE
706
708 const lv_32fc_t* input,
709 const float* taps,
710 unsigned int num_points)
711{
712
713 unsigned int number = 0;
714 const unsigned int sixteenthPoints = num_points / 8;
715
716 float res[2];
717 float *realpt = &res[0], *imagpt = &res[1];
718 const float* aPtr = (float*)input;
719 const float* bPtr = taps;
720
721 __m128 a0Val, a1Val, a2Val, a3Val;
722 __m128 b0Val, b1Val, b2Val, b3Val;
723 __m128 x0Val, x1Val, x2Val, x3Val;
724 __m128 c0Val, c1Val, c2Val, c3Val;
725
726 __m128 dotProdVal0 = _mm_setzero_ps();
727 __m128 dotProdVal1 = _mm_setzero_ps();
728 __m128 dotProdVal2 = _mm_setzero_ps();
729 __m128 dotProdVal3 = _mm_setzero_ps();
730
731 for (; number < sixteenthPoints; number++) {
732
733 a0Val = _mm_loadu_ps(aPtr);
734 a1Val = _mm_loadu_ps(aPtr + 4);
735 a2Val = _mm_loadu_ps(aPtr + 8);
736 a3Val = _mm_loadu_ps(aPtr + 12);
737
738 x0Val = _mm_loadu_ps(bPtr);
739 x1Val = _mm_loadu_ps(bPtr);
740 x2Val = _mm_loadu_ps(bPtr + 4);
741 x3Val = _mm_loadu_ps(bPtr + 4);
742 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
743 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
744 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
745 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
746
747 c0Val = _mm_mul_ps(a0Val, b0Val);
748 c1Val = _mm_mul_ps(a1Val, b1Val);
749 c2Val = _mm_mul_ps(a2Val, b2Val);
750 c3Val = _mm_mul_ps(a3Val, b3Val);
751
752 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
753 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
754 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
755 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
756
757 aPtr += 16;
758 bPtr += 8;
759 }
760
761 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
762 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
763 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
764
765 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
766
767 _mm_store_ps(dotProductVector,
768 dotProdVal0); // Store the results back into the dot product vector
769
770 *realpt = dotProductVector[0];
771 *imagpt = dotProductVector[1];
772 *realpt += dotProductVector[2];
773 *imagpt += dotProductVector[3];
774
775 number = sixteenthPoints * 8;
776 for (; number < num_points; number++) {
777 *realpt += ((*aPtr++) * (*bPtr));
778 *imagpt += ((*aPtr++) * (*bPtr++));
779 }
780
781 *result = *(lv_32fc_t*)(&res[0]);
782}
783
784#endif /*LV_HAVE_SSE*/
785
786
787#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
static void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:444
static void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:276
static void volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:534
static void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:617
static void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:71
static void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:185
static void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:707
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
float complex lv_32fc_t
Definition: volk_complex.h:65