Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
volk_16ic_x2_dot_prod_16ic.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
#define INCLUDED_volk_16ic_x2_dot_prod_16ic_H

#include <volk/saturation_arithmetic.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
                                                       const lv_16sc_t* in_a,
                                                       const lv_16sc_t* in_b,
                                                       unsigned int num_points)
{
    result[0] = lv_cmake((int16_t)0, (int16_t)0);
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        lv_16sc_t tmp = in_a[n] * in_b[n];
        result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
                             sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
    }
}

#endif /*LV_HAVE_GENERIC*/


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
            a = _mm_load_si128(
                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_load_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
                                         // zeros, and store the results in dst.
            real = _mm_subs_epi16(c, c_sr);

            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        a = _mm_or_si128(realcacc, imagcacc);

        _mm_store_si128((__m128i*)dotProductVector,
                        a); // Store the results back into the dot product vector

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
            a = _mm_loadu_si128(
                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_loadu_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
                                         // zeros, and store the results in dst.
            real = _mm_subs_epi16(c, c_sr);

            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        result = _mm_or_si128(realcacc, imagcacc);

        _mm_storeu_si128((__m128i*)dotProductVector,
                         result); // Store the results back into the dot product vector

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_loadu_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_loadu_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
                                            // in zeros, and store the results in dst.
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_storeu_si256((__m256i*)dotProductVector,
                            result); // Store the results back into the dot product vector

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_load_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_load_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
                                            // in zeros, and store the results in dst.
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_store_si256((__m256i*)dotProductVector,
                           result); // Store the results back into the dot product vector

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
                                                    const lv_16sc_t* in_a,
                                                    const lv_16sc_t* in_b,
                                                    unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    *out = lv_cmake((int16_t)0, (int16_t)0);

    if (quarter_points > 0) {
        // for 2-lane vectors, 1st lane holds the real part,
        // 2nd lane holds the imaginary part
        int16x4x2_t a_val, b_val, c_val, accumulator;
        int16x4x2_t tmp_real, tmp_imag;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
        accumulator.val[0] = vdup_n_s16(0);
        accumulator.val[1] = vdup_n_s16(0);
        lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

        for (number = 0; number < quarter_points; ++number) {
            a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
            b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
            __VOLK_PREFETCH(a_ptr + 8);
            __VOLK_PREFETCH(b_ptr + 8);

            // multiply the real*real and imag*imag to get real result
            // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
            // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

            // Multiply cross terms to get the imaginary result
            // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
            // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

            c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
            c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);

            accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
            accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);

            a_ptr += 4;
            b_ptr += 4;
        }

        vst2_s16((int16_t*)accum_result, accumulator);
        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
        }

        *out = dotProduct;
    }

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
                                                        const lv_16sc_t* in_a,
                                                        const lv_16sc_t* in_b,
                                                        unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    int16x4x2_t a_val, b_val, accumulator;
    int16x4x2_t tmp;
    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
    accumulator.val[0] = vdup_n_s16(0);
    accumulator.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
        tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);

        accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
        accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }

    vst2_s16((int16_t*)accum_result, accumulator);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
                                                           const lv_16sc_t* in_a,
                                                           const lv_16sc_t* in_b,
                                                           unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    int16x4x2_t a_val, b_val, accumulator1, accumulator2;

    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
    accumulator1.val[0] = vdup_n_s16(0);
    accumulator1.val[1] = vdup_n_s16(0);
    accumulator2.val[0] = vdup_n_s16(0);
    accumulator2.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        a_ptr += 4;
        b_ptr += 4;
    }

    accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);

    vst2_s16((int16_t*)accum_result, accumulator1);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/
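
Example usage, a minimal sketch that is not part of the header above: it assumes the standard VOLK public API, namely <volk/volk.h>, the volk_16ic_x2_dot_prod_16ic() dispatcher that selects one of the protokernels in this file at runtime, and volk_get_alignment()/volk_malloc()/volk_free() for aligned buffers. The buffer length of 16 points and the sample values are arbitrary illustration choices.

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 16;
    const size_t alignment = volk_get_alignment();

    // Aligned buffers let the dispatcher pick an aligned (_a_) protokernel.
    lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t result;
    unsigned int n;

    for (n = 0; n < num_points; n++) {
        in_a[n] = lv_cmake((int16_t)1, (int16_t)1);  // 1 + 1j
        in_b[n] = lv_cmake((int16_t)2, (int16_t)-1); // 2 - 1j, so each product is 3 + 1j
    }

    // result = sum over n of in_a[n] * in_b[n], accumulated with 16-bit saturation
    volk_16ic_x2_dot_prod_16ic(&result, in_a, in_b, num_points);

    printf("dot product = %d %+dj\n", (int)lv_creal(result), (int)lv_cimag(result));

    volk_free(in_a);
    volk_free(in_b);
    return 0;
}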