Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
71#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
72#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
73
74#include <stdio.h>
75#include <volk/volk_common.h>
76
77
78#ifdef LV_HAVE_GENERIC
79
80
81static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
82 const float* input,
83 const float* taps,
84 unsigned int num_points)
85{
86
87 float dotProduct = 0;
88 const float* aPtr = input;
89 const float* bPtr = taps;
90 unsigned int number = 0;
91
92 for (number = 0; number < num_points; number++) {
93 dotProduct += ((*aPtr++) * (*bPtr++));
94 }
95
96 *result = dotProduct;
97}
98
99#endif /*LV_HAVE_GENERIC*/
100
101
102#ifdef LV_HAVE_SSE
103
104
105static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
106 const float* input,
107 const float* taps,
108 unsigned int num_points)
109{
110
111 unsigned int number = 0;
112 const unsigned int sixteenthPoints = num_points / 16;
113
114 float dotProduct = 0;
115 const float* aPtr = input;
116 const float* bPtr = taps;
117
118 __m128 a0Val, a1Val, a2Val, a3Val;
119 __m128 b0Val, b1Val, b2Val, b3Val;
120 __m128 c0Val, c1Val, c2Val, c3Val;
121
122 __m128 dotProdVal0 = _mm_setzero_ps();
123 __m128 dotProdVal1 = _mm_setzero_ps();
124 __m128 dotProdVal2 = _mm_setzero_ps();
125 __m128 dotProdVal3 = _mm_setzero_ps();
126
127 for (; number < sixteenthPoints; number++) {
128
129 a0Val = _mm_loadu_ps(aPtr);
130 a1Val = _mm_loadu_ps(aPtr + 4);
131 a2Val = _mm_loadu_ps(aPtr + 8);
132 a3Val = _mm_loadu_ps(aPtr + 12);
133 b0Val = _mm_loadu_ps(bPtr);
134 b1Val = _mm_loadu_ps(bPtr + 4);
135 b2Val = _mm_loadu_ps(bPtr + 8);
136 b3Val = _mm_loadu_ps(bPtr + 12);
137
138 c0Val = _mm_mul_ps(a0Val, b0Val);
139 c1Val = _mm_mul_ps(a1Val, b1Val);
140 c2Val = _mm_mul_ps(a2Val, b2Val);
141 c3Val = _mm_mul_ps(a3Val, b3Val);
142
143 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
144 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
145 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
146 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
147
148 aPtr += 16;
149 bPtr += 16;
150 }
151
152 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
153 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
154 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
155
156 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
157
158 _mm_store_ps(dotProductVector,
159 dotProdVal0); // Store the results back into the dot product vector
160
161 dotProduct = dotProductVector[0];
162 dotProduct += dotProductVector[1];
163 dotProduct += dotProductVector[2];
164 dotProduct += dotProductVector[3];
165
166 number = sixteenthPoints * 16;
167 for (; number < num_points; number++) {
168 dotProduct += ((*aPtr++) * (*bPtr++));
169 }
170
171 *result = dotProduct;
172}
173
174#endif /*LV_HAVE_SSE*/
175
176#ifdef LV_HAVE_SSE3
177
178#include <pmmintrin.h>
179
180static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
181 const float* input,
182 const float* taps,
183 unsigned int num_points)
184{
185 unsigned int number = 0;
186 const unsigned int sixteenthPoints = num_points / 16;
187
188 float dotProduct = 0;
189 const float* aPtr = input;
190 const float* bPtr = taps;
191
192 __m128 a0Val, a1Val, a2Val, a3Val;
193 __m128 b0Val, b1Val, b2Val, b3Val;
194 __m128 c0Val, c1Val, c2Val, c3Val;
195
196 __m128 dotProdVal0 = _mm_setzero_ps();
197 __m128 dotProdVal1 = _mm_setzero_ps();
198 __m128 dotProdVal2 = _mm_setzero_ps();
199 __m128 dotProdVal3 = _mm_setzero_ps();
200
201 for (; number < sixteenthPoints; number++) {
202
203 a0Val = _mm_loadu_ps(aPtr);
204 a1Val = _mm_loadu_ps(aPtr + 4);
205 a2Val = _mm_loadu_ps(aPtr + 8);
206 a3Val = _mm_loadu_ps(aPtr + 12);
207 b0Val = _mm_loadu_ps(bPtr);
208 b1Val = _mm_loadu_ps(bPtr + 4);
209 b2Val = _mm_loadu_ps(bPtr + 8);
210 b3Val = _mm_loadu_ps(bPtr + 12);
211
212 c0Val = _mm_mul_ps(a0Val, b0Val);
213 c1Val = _mm_mul_ps(a1Val, b1Val);
214 c2Val = _mm_mul_ps(a2Val, b2Val);
215 c3Val = _mm_mul_ps(a3Val, b3Val);
216
217 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
218 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
219 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
220 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
221
222 aPtr += 16;
223 bPtr += 16;
224 }
225
226 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
227 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
228 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
229
230 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
231 _mm_store_ps(dotProductVector,
232 dotProdVal0); // Store the results back into the dot product vector
233
234 dotProduct = dotProductVector[0];
235 dotProduct += dotProductVector[1];
236 dotProduct += dotProductVector[2];
237 dotProduct += dotProductVector[3];
238
239 number = sixteenthPoints * 16;
240 for (; number < num_points; number++) {
241 dotProduct += ((*aPtr++) * (*bPtr++));
242 }
243
244 *result = dotProduct;
245}
246
247#endif /*LV_HAVE_SSE3*/
248
249#ifdef LV_HAVE_SSE4_1
250
251#include <smmintrin.h>
252
253static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
254 const float* input,
255 const float* taps,
256 unsigned int num_points)
257{
258 unsigned int number = 0;
259 const unsigned int sixteenthPoints = num_points / 16;
260
261 float dotProduct = 0;
262 const float* aPtr = input;
263 const float* bPtr = taps;
264
265 __m128 aVal1, bVal1, cVal1;
266 __m128 aVal2, bVal2, cVal2;
267 __m128 aVal3, bVal3, cVal3;
268 __m128 aVal4, bVal4, cVal4;
269
270 __m128 dotProdVal = _mm_setzero_ps();
271
272 for (; number < sixteenthPoints; number++) {
273
274 aVal1 = _mm_loadu_ps(aPtr);
275 aPtr += 4;
276 aVal2 = _mm_loadu_ps(aPtr);
277 aPtr += 4;
278 aVal3 = _mm_loadu_ps(aPtr);
279 aPtr += 4;
280 aVal4 = _mm_loadu_ps(aPtr);
281 aPtr += 4;
282
283 bVal1 = _mm_loadu_ps(bPtr);
284 bPtr += 4;
285 bVal2 = _mm_loadu_ps(bPtr);
286 bPtr += 4;
287 bVal3 = _mm_loadu_ps(bPtr);
288 bPtr += 4;
289 bVal4 = _mm_loadu_ps(bPtr);
290 bPtr += 4;
291
292 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
293 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
294 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
295 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
296
297 cVal1 = _mm_or_ps(cVal1, cVal2);
298 cVal3 = _mm_or_ps(cVal3, cVal4);
299 cVal1 = _mm_or_ps(cVal1, cVal3);
300
301 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
302 }
303
304 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
305 _mm_store_ps(dotProductVector,
306 dotProdVal); // Store the results back into the dot product vector
307
308 dotProduct = dotProductVector[0];
309 dotProduct += dotProductVector[1];
310 dotProduct += dotProductVector[2];
311 dotProduct += dotProductVector[3];
312
313 number = sixteenthPoints * 16;
314 for (; number < num_points; number++) {
315 dotProduct += ((*aPtr++) * (*bPtr++));
316 }
317
318 *result = dotProduct;
319}
320
321#endif /*LV_HAVE_SSE4_1*/
322
323#ifdef LV_HAVE_AVX
324
325#include <immintrin.h>
326
327static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
328 const float* input,
329 const float* taps,
330 unsigned int num_points)
331{
332
333 unsigned int number = 0;
334 const unsigned int sixteenthPoints = num_points / 16;
335
336 float dotProduct = 0;
337 const float* aPtr = input;
338 const float* bPtr = taps;
339
340 __m256 a0Val, a1Val;
341 __m256 b0Val, b1Val;
342 __m256 c0Val, c1Val;
343
344 __m256 dotProdVal0 = _mm256_setzero_ps();
345 __m256 dotProdVal1 = _mm256_setzero_ps();
346
347 for (; number < sixteenthPoints; number++) {
348
349 a0Val = _mm256_loadu_ps(aPtr);
350 a1Val = _mm256_loadu_ps(aPtr + 8);
351 b0Val = _mm256_loadu_ps(bPtr);
352 b1Val = _mm256_loadu_ps(bPtr + 8);
353
354 c0Val = _mm256_mul_ps(a0Val, b0Val);
355 c1Val = _mm256_mul_ps(a1Val, b1Val);
356
357 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
358 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
359
360 aPtr += 16;
361 bPtr += 16;
362 }
363
364 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
365
366 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
367
368 _mm256_storeu_ps(dotProductVector,
369 dotProdVal0); // Store the results back into the dot product vector
370
371 dotProduct = dotProductVector[0];
372 dotProduct += dotProductVector[1];
373 dotProduct += dotProductVector[2];
374 dotProduct += dotProductVector[3];
375 dotProduct += dotProductVector[4];
376 dotProduct += dotProductVector[5];
377 dotProduct += dotProductVector[6];
378 dotProduct += dotProductVector[7];
379
380 number = sixteenthPoints * 16;
381 for (; number < num_points; number++) {
382 dotProduct += ((*aPtr++) * (*bPtr++));
383 }
384
385 *result = dotProduct;
386}
387
388#endif /*LV_HAVE_AVX*/
389
390#if LV_HAVE_AVX2 && LV_HAVE_FMA
391#include <immintrin.h>
392static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
393 const float* input,
394 const float* taps,
395 unsigned int num_points)
396{
397 unsigned int number;
398 const unsigned int eighthPoints = num_points / 8;
399
400 const float* aPtr = input;
401 const float* bPtr = taps;
402
403 __m256 dotProdVal = _mm256_setzero_ps();
404 __m256 aVal1, bVal1;
405
406 for (number = 0; number < eighthPoints; number++) {
407
408 aVal1 = _mm256_loadu_ps(aPtr);
409 bVal1 = _mm256_loadu_ps(bPtr);
410 aPtr += 8;
411 bPtr += 8;
412
413 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
414 }
415
416 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
417 _mm256_storeu_ps(dotProductVector,
418 dotProdVal); // Store the results back into the dot product vector
419
420 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
421 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
422 dotProductVector[6] + dotProductVector[7];
423
424 for (number = eighthPoints * 8; number < num_points; number++) {
425 dotProduct += ((*aPtr++) * (*bPtr++));
426 }
427
428 *result = dotProduct;
429}
430#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
431
432#if LV_HAVE_AVX512F
433#include <immintrin.h>
434static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
435 const float* input,
436 const float* taps,
437 unsigned int num_points)
438{
439 unsigned int number;
440 const unsigned int sixteenthPoints = num_points / 16;
441
442 const float* aPtr = input;
443 const float* bPtr = taps;
444
445 __m512 dotProdVal = _mm512_setzero_ps();
446 __m512 aVal1, bVal1;
447
448 for (number = 0; number < sixteenthPoints; number++) {
449
450 aVal1 = _mm512_loadu_ps(aPtr);
451 bVal1 = _mm512_loadu_ps(bPtr);
452 aPtr += 16;
453 bPtr += 16;
454
455 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
456 }
457
458 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
459 _mm512_storeu_ps(dotProductVector,
460 dotProdVal); // Store the results back into the dot product vector
461
462 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
463 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
464 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
465 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
466 dotProductVector[12] + dotProductVector[13] +
467 dotProductVector[14] + dotProductVector[15];
468
469 for (number = sixteenthPoints * 16; number < num_points; number++) {
470 dotProduct += ((*aPtr++) * (*bPtr++));
471 }
472
473 *result = dotProduct;
474}
475#endif /* LV_HAVE_AVX512F */
476
477#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
478
479#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
480#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
481
482#include <stdio.h>
483#include <volk/volk_common.h>
484
485
486#ifdef LV_HAVE_GENERIC
487
488
489static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
490 const float* input,
491 const float* taps,
492 unsigned int num_points)
493{
494
495 float dotProduct = 0;
496 const float* aPtr = input;
497 const float* bPtr = taps;
498 unsigned int number = 0;
499
500 for (number = 0; number < num_points; number++) {
501 dotProduct += ((*aPtr++) * (*bPtr++));
502 }
503
504 *result = dotProduct;
505}
506
507#endif /*LV_HAVE_GENERIC*/
508
509
510#ifdef LV_HAVE_SSE
511
512
513static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
514 const float* input,
515 const float* taps,
516 unsigned int num_points)
517{
518
519 unsigned int number = 0;
520 const unsigned int sixteenthPoints = num_points / 16;
521
522 float dotProduct = 0;
523 const float* aPtr = input;
524 const float* bPtr = taps;
525
526 __m128 a0Val, a1Val, a2Val, a3Val;
527 __m128 b0Val, b1Val, b2Val, b3Val;
528 __m128 c0Val, c1Val, c2Val, c3Val;
529
530 __m128 dotProdVal0 = _mm_setzero_ps();
531 __m128 dotProdVal1 = _mm_setzero_ps();
532 __m128 dotProdVal2 = _mm_setzero_ps();
533 __m128 dotProdVal3 = _mm_setzero_ps();
534
535 for (; number < sixteenthPoints; number++) {
536
537 a0Val = _mm_load_ps(aPtr);
538 a1Val = _mm_load_ps(aPtr + 4);
539 a2Val = _mm_load_ps(aPtr + 8);
540 a3Val = _mm_load_ps(aPtr + 12);
541 b0Val = _mm_load_ps(bPtr);
542 b1Val = _mm_load_ps(bPtr + 4);
543 b2Val = _mm_load_ps(bPtr + 8);
544 b3Val = _mm_load_ps(bPtr + 12);
545
546 c0Val = _mm_mul_ps(a0Val, b0Val);
547 c1Val = _mm_mul_ps(a1Val, b1Val);
548 c2Val = _mm_mul_ps(a2Val, b2Val);
549 c3Val = _mm_mul_ps(a3Val, b3Val);
550
551 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
552 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
553 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
554 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
555
556 aPtr += 16;
557 bPtr += 16;
558 }
559
560 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
561 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
562 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
563
564 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
565
566 _mm_store_ps(dotProductVector,
567 dotProdVal0); // Store the results back into the dot product vector
568
569 dotProduct = dotProductVector[0];
570 dotProduct += dotProductVector[1];
571 dotProduct += dotProductVector[2];
572 dotProduct += dotProductVector[3];
573
574 number = sixteenthPoints * 16;
575 for (; number < num_points; number++) {
576 dotProduct += ((*aPtr++) * (*bPtr++));
577 }
578
579 *result = dotProduct;
580}
581
582#endif /*LV_HAVE_SSE*/
583
584#ifdef LV_HAVE_SSE3
585
586#include <pmmintrin.h>
587
588static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
589 const float* input,
590 const float* taps,
591 unsigned int num_points)
592{
593 unsigned int number = 0;
594 const unsigned int sixteenthPoints = num_points / 16;
595
596 float dotProduct = 0;
597 const float* aPtr = input;
598 const float* bPtr = taps;
599
600 __m128 a0Val, a1Val, a2Val, a3Val;
601 __m128 b0Val, b1Val, b2Val, b3Val;
602 __m128 c0Val, c1Val, c2Val, c3Val;
603
604 __m128 dotProdVal0 = _mm_setzero_ps();
605 __m128 dotProdVal1 = _mm_setzero_ps();
606 __m128 dotProdVal2 = _mm_setzero_ps();
607 __m128 dotProdVal3 = _mm_setzero_ps();
608
609 for (; number < sixteenthPoints; number++) {
610
611 a0Val = _mm_load_ps(aPtr);
612 a1Val = _mm_load_ps(aPtr + 4);
613 a2Val = _mm_load_ps(aPtr + 8);
614 a3Val = _mm_load_ps(aPtr + 12);
615 b0Val = _mm_load_ps(bPtr);
616 b1Val = _mm_load_ps(bPtr + 4);
617 b2Val = _mm_load_ps(bPtr + 8);
618 b3Val = _mm_load_ps(bPtr + 12);
619
620 c0Val = _mm_mul_ps(a0Val, b0Val);
621 c1Val = _mm_mul_ps(a1Val, b1Val);
622 c2Val = _mm_mul_ps(a2Val, b2Val);
623 c3Val = _mm_mul_ps(a3Val, b3Val);
624
625 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
626 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
627 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
628 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
629
630 aPtr += 16;
631 bPtr += 16;
632 }
633
634 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
635 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
636 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
637
638 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
639 _mm_store_ps(dotProductVector,
640 dotProdVal0); // Store the results back into the dot product vector
641
642 dotProduct = dotProductVector[0];
643 dotProduct += dotProductVector[1];
644 dotProduct += dotProductVector[2];
645 dotProduct += dotProductVector[3];
646
647 number = sixteenthPoints * 16;
648 for (; number < num_points; number++) {
649 dotProduct += ((*aPtr++) * (*bPtr++));
650 }
651
652 *result = dotProduct;
653}
654
655#endif /*LV_HAVE_SSE3*/
656
657#ifdef LV_HAVE_SSE4_1
658
659#include <smmintrin.h>
660
661static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
662 const float* input,
663 const float* taps,
664 unsigned int num_points)
665{
666 unsigned int number = 0;
667 const unsigned int sixteenthPoints = num_points / 16;
668
669 float dotProduct = 0;
670 const float* aPtr = input;
671 const float* bPtr = taps;
672
673 __m128 aVal1, bVal1, cVal1;
674 __m128 aVal2, bVal2, cVal2;
675 __m128 aVal3, bVal3, cVal3;
676 __m128 aVal4, bVal4, cVal4;
677
678 __m128 dotProdVal = _mm_setzero_ps();
679
680 for (; number < sixteenthPoints; number++) {
681
682 aVal1 = _mm_load_ps(aPtr);
683 aPtr += 4;
684 aVal2 = _mm_load_ps(aPtr);
685 aPtr += 4;
686 aVal3 = _mm_load_ps(aPtr);
687 aPtr += 4;
688 aVal4 = _mm_load_ps(aPtr);
689 aPtr += 4;
690
691 bVal1 = _mm_load_ps(bPtr);
692 bPtr += 4;
693 bVal2 = _mm_load_ps(bPtr);
694 bPtr += 4;
695 bVal3 = _mm_load_ps(bPtr);
696 bPtr += 4;
697 bVal4 = _mm_load_ps(bPtr);
698 bPtr += 4;
699
700 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
701 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
702 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
703 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
704
705 cVal1 = _mm_or_ps(cVal1, cVal2);
706 cVal3 = _mm_or_ps(cVal3, cVal4);
707 cVal1 = _mm_or_ps(cVal1, cVal3);
708
709 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
710 }
711
712 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
713 _mm_store_ps(dotProductVector,
714 dotProdVal); // Store the results back into the dot product vector
715
716 dotProduct = dotProductVector[0];
717 dotProduct += dotProductVector[1];
718 dotProduct += dotProductVector[2];
719 dotProduct += dotProductVector[3];
720
721 number = sixteenthPoints * 16;
722 for (; number < num_points; number++) {
723 dotProduct += ((*aPtr++) * (*bPtr++));
724 }
725
726 *result = dotProduct;
727}
728
729#endif /*LV_HAVE_SSE4_1*/
730
731#ifdef LV_HAVE_AVX
732
733#include <immintrin.h>
734
735static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
736 const float* input,
737 const float* taps,
738 unsigned int num_points)
739{
740
741 unsigned int number = 0;
742 const unsigned int sixteenthPoints = num_points / 16;
743
744 float dotProduct = 0;
745 const float* aPtr = input;
746 const float* bPtr = taps;
747
748 __m256 a0Val, a1Val;
749 __m256 b0Val, b1Val;
750 __m256 c0Val, c1Val;
751
752 __m256 dotProdVal0 = _mm256_setzero_ps();
753 __m256 dotProdVal1 = _mm256_setzero_ps();
754
755 for (; number < sixteenthPoints; number++) {
756
757 a0Val = _mm256_load_ps(aPtr);
758 a1Val = _mm256_load_ps(aPtr + 8);
759 b0Val = _mm256_load_ps(bPtr);
760 b1Val = _mm256_load_ps(bPtr + 8);
761
762 c0Val = _mm256_mul_ps(a0Val, b0Val);
763 c1Val = _mm256_mul_ps(a1Val, b1Val);
764
765 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
766 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
767
768 aPtr += 16;
769 bPtr += 16;
770 }
771
772 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
773
774 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
775
776 _mm256_store_ps(dotProductVector,
777 dotProdVal0); // Store the results back into the dot product vector
778
779 dotProduct = dotProductVector[0];
780 dotProduct += dotProductVector[1];
781 dotProduct += dotProductVector[2];
782 dotProduct += dotProductVector[3];
783 dotProduct += dotProductVector[4];
784 dotProduct += dotProductVector[5];
785 dotProduct += dotProductVector[6];
786 dotProduct += dotProductVector[7];
787
788 number = sixteenthPoints * 16;
789 for (; number < num_points; number++) {
790 dotProduct += ((*aPtr++) * (*bPtr++));
791 }
792
793 *result = dotProduct;
794}
795#endif /*LV_HAVE_AVX*/
796
797
798#if LV_HAVE_AVX2 && LV_HAVE_FMA
799#include <immintrin.h>
800static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
801 const float* input,
802 const float* taps,
803 unsigned int num_points)
804{
805 unsigned int number;
806 const unsigned int eighthPoints = num_points / 8;
807
808 const float* aPtr = input;
809 const float* bPtr = taps;
810
811 __m256 dotProdVal = _mm256_setzero_ps();
812 __m256 aVal1, bVal1;
813
814 for (number = 0; number < eighthPoints; number++) {
815
816 aVal1 = _mm256_load_ps(aPtr);
817 bVal1 = _mm256_load_ps(bPtr);
818 aPtr += 8;
819 bPtr += 8;
820
821 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
822 }
823
824 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
825 _mm256_store_ps(dotProductVector,
826 dotProdVal); // Store the results back into the dot product vector
827
828 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
829 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
830 dotProductVector[6] + dotProductVector[7];
831
832 for (number = eighthPoints * 8; number < num_points; number++) {
833 dotProduct += ((*aPtr++) * (*bPtr++));
834 }
835
836 *result = dotProduct;
837}
838#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
839
840#if LV_HAVE_AVX512F
841#include <immintrin.h>
842static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
843 const float* input,
844 const float* taps,
845 unsigned int num_points)
846{
847 unsigned int number;
848 const unsigned int sixteenthPoints = num_points / 16;
849
850 const float* aPtr = input;
851 const float* bPtr = taps;
852
853 __m512 dotProdVal = _mm512_setzero_ps();
854 __m512 aVal1, bVal1;
855
856 for (number = 0; number < sixteenthPoints; number++) {
857
858 aVal1 = _mm512_load_ps(aPtr);
859 bVal1 = _mm512_load_ps(bPtr);
860 aPtr += 16;
861 bPtr += 16;
862
863 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
864 }
865
866 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
867 _mm512_store_ps(dotProductVector,
868 dotProdVal); // Store the results back into the dot product vector
869
870 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
871 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
872 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
873 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
874 dotProductVector[12] + dotProductVector[13] +
875 dotProductVector[14] + dotProductVector[15];
876
877 for (number = sixteenthPoints * 16; number < num_points; number++) {
878 dotProduct += ((*aPtr++) * (*bPtr++));
879 }
880
881 *result = dotProduct;
882}
883#endif /* LV_HAVE_AVX512F */
884
885#ifdef LV_HAVE_NEON
886#include <arm_neon.h>
887
888static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
889 const float* input,
890 const float* taps,
891 unsigned int num_points)
892{
893
894 unsigned int quarter_points = num_points / 16;
895 float dotProduct = 0;
896 const float* aPtr = input;
897 const float* bPtr = taps;
898 unsigned int number = 0;
899
900 float32x4x4_t a_val, b_val, accumulator0;
901 accumulator0.val[0] = vdupq_n_f32(0);
902 accumulator0.val[1] = vdupq_n_f32(0);
903 accumulator0.val[2] = vdupq_n_f32(0);
904 accumulator0.val[3] = vdupq_n_f32(0);
905 // factor of 4 loop unroll with independent accumulators
906 // uses 12 out of 16 neon q registers
907 for (number = 0; number < quarter_points; ++number) {
908 a_val = vld4q_f32(aPtr);
909 b_val = vld4q_f32(bPtr);
910 accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
911 accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
912 accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
913 accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
914 aPtr += 16;
915 bPtr += 16;
916 }
917 accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
918 accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
919 accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
920 __VOLK_ATTR_ALIGNED(32) float accumulator[4];
921 vst1q_f32(accumulator, accumulator0.val[0]);
922 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
923
924 for (number = quarter_points * 16; number < num_points; number++) {
925 dotProduct += ((*aPtr++) * (*bPtr++));
926 }
927
928 *result = dotProduct;
929}
930
931#endif
932
933
934#ifdef LV_HAVE_NEON
935static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
936 const float* input,
937 const float* taps,
938 unsigned int num_points)
939{
940
941 unsigned int quarter_points = num_points / 8;
942 float dotProduct = 0;
943 const float* aPtr = input;
944 const float* bPtr = taps;
945 unsigned int number = 0;
946
947 float32x4x2_t a_val, b_val, accumulator_val;
948 accumulator_val.val[0] = vdupq_n_f32(0);
949 accumulator_val.val[1] = vdupq_n_f32(0);
950 // factor of 2 loop unroll with independent accumulators
951 for (number = 0; number < quarter_points; ++number) {
952 a_val = vld2q_f32(aPtr);
953 b_val = vld2q_f32(bPtr);
954 accumulator_val.val[0] =
955 vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
956 accumulator_val.val[1] =
957 vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
958 aPtr += 8;
959 bPtr += 8;
960 }
961 accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
962 __VOLK_ATTR_ALIGNED(32) float accumulator[4];
963 vst1q_f32(accumulator, accumulator_val.val[0]);
964 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
965
966 for (number = quarter_points * 8; number < num_points; number++) {
967 dotProduct += ((*aPtr++) * (*bPtr++));
968 }
969
970 *result = dotProduct;
971}
972
973#endif /* LV_HAVE_NEON */
974
975#ifdef LV_HAVE_NEONV7
976extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
977 const float* aVector,
978 const float* bVector,
979 unsigned int num_points);
980#endif /* LV_HAVE_NEONV7 */
981
982#ifdef LV_HAVE_NEONV7
983extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
984 const float* aVector,
985 const float* bVector,
986 unsigned int num_points);
987#endif /* LV_HAVE_NEONV7 */
988
989#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
static void volk_32f_x2_dot_prod_32f_a_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:735
static void volk_32f_x2_dot_prod_32f_a_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:513
static void volk_32f_x2_dot_prod_32f_u_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:105
static void volk_32f_x2_dot_prod_32f_u_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:327
static void volk_32f_x2_dot_prod_32f_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:81
static void volk_32f_x2_dot_prod_32f_u_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:180
static void volk_32f_x2_dot_prod_32f_a_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:489
static void volk_32f_x2_dot_prod_32f_a_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:588
static void volk_32f_x2_dot_prod_32f_neonopts(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:888
static void volk_32f_x2_dot_prod_32f_neon(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:935
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56