Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_x3_sum_of_poly_32f.h
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
84#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
85#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
86
87#include <inttypes.h>
88#include <stdio.h>
89#include <volk/volk_complex.h>
90
91#ifndef MAX
92#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
93#endif
94
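/*
 * Summary (inferred from the implementations below; the original Doxygen
 * block is not reproduced here): with c = center_point_array and
 * x_i = max(src0[i], *cutoff), every protokernel computes
 *
 *   *target = sum over i of (c[0]*x_i + c[1]*x_i^2 + c[2]*x_i^3 + c[3]*x_i^4)
 *             + num_points * c[4]
 *
 * i.e. a fourth-order polynomial of the clamped input summed over all points,
 * with the constant term c[4] contributed once per point.
 */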
95#ifdef LV_HAVE_SSE3
96#include <pmmintrin.h>
97#include <xmmintrin.h>
98
99static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
100 float* src0,
101 float* center_point_array,
102 float* cutoff,
103 unsigned int num_points)
104{
105 float result = 0.0f;
106 float fst = 0.0f;
107 float sq = 0.0f;
108 float thrd = 0.0f;
109 float frth = 0.0f;
110
111 __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
112
113 xmm9 = _mm_setzero_ps();
114 xmm1 = _mm_setzero_ps();
115 xmm0 = _mm_load1_ps(&center_point_array[0]);
116 xmm6 = _mm_load1_ps(&center_point_array[1]);
117 xmm7 = _mm_load1_ps(&center_point_array[2]);
118 xmm8 = _mm_load1_ps(&center_point_array[3]);
119 xmm10 = _mm_load1_ps(cutoff);
120
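    // The main loop below is unrolled 2x (8 floats per iteration) and keeps two
    // independent accumulators (xmm9 and xmm1) to shorten the add dependency chain.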
121 int bound = num_points / 8;
122 int leftovers = num_points - 8 * bound;
123 int i = 0;
124 for (; i < bound; ++i) {
125 // 1st
126 xmm2 = _mm_load_ps(src0);
127 xmm2 = _mm_max_ps(xmm10, xmm2);
128 xmm3 = _mm_mul_ps(xmm2, xmm2);
129 xmm4 = _mm_mul_ps(xmm2, xmm3);
130 xmm5 = _mm_mul_ps(xmm3, xmm3);
131
132 xmm2 = _mm_mul_ps(xmm2, xmm0);
133 xmm3 = _mm_mul_ps(xmm3, xmm6);
134 xmm4 = _mm_mul_ps(xmm4, xmm7);
135 xmm5 = _mm_mul_ps(xmm5, xmm8);
136
137 xmm2 = _mm_add_ps(xmm2, xmm3);
138 xmm3 = _mm_add_ps(xmm4, xmm5);
139
140 src0 += 4;
141
142 xmm9 = _mm_add_ps(xmm2, xmm9);
143 xmm9 = _mm_add_ps(xmm3, xmm9);
144
145 // 2nd
146 xmm2 = _mm_load_ps(src0);
147 xmm2 = _mm_max_ps(xmm10, xmm2);
148 xmm3 = _mm_mul_ps(xmm2, xmm2);
149 xmm4 = _mm_mul_ps(xmm2, xmm3);
150 xmm5 = _mm_mul_ps(xmm3, xmm3);
151
152 xmm2 = _mm_mul_ps(xmm2, xmm0);
153 xmm3 = _mm_mul_ps(xmm3, xmm6);
154 xmm4 = _mm_mul_ps(xmm4, xmm7);
155 xmm5 = _mm_mul_ps(xmm5, xmm8);
156
157 xmm2 = _mm_add_ps(xmm2, xmm3);
158 xmm3 = _mm_add_ps(xmm4, xmm5);
159
160 src0 += 4;
161
162 xmm1 = _mm_add_ps(xmm2, xmm1);
163 xmm1 = _mm_add_ps(xmm3, xmm1);
164 }
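    // Reduce: three horizontal adds collapse the two partial-sum registers
    // into a single scalar, which is stored to result from the low lane.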
165 xmm2 = _mm_hadd_ps(xmm9, xmm1);
166 xmm3 = _mm_hadd_ps(xmm2, xmm2);
167 xmm4 = _mm_hadd_ps(xmm3, xmm3);
168 _mm_store_ss(&result, xmm4);
169
170 for (i = 0; i < leftovers; ++i) {
171 fst = *src0++;
172 fst = MAX(fst, *cutoff);
173 sq = fst * fst;
174 thrd = fst * sq;
175 frth = sq * sq;
176 result += (center_point_array[0] * fst + center_point_array[1] * sq +
177 center_point_array[2] * thrd + center_point_array[3] * frth);
178 }
179
180 result += (float)(num_points)*center_point_array[4];
181 *target = result;
182}
183
184
185#endif /*LV_HAVE_SSE3*/
186
187#if LV_HAVE_AVX && LV_HAVE_FMA
188#include <immintrin.h>
189
190static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
191 float* src0,
192 float* center_point_array,
193 float* cutoff,
194 unsigned int num_points)
195{
196 const unsigned int eighth_points = num_points / 8;
197 float fst = 0.0;
198 float sq = 0.0;
199 float thrd = 0.0;
200 float frth = 0.0;
201
202 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
203 __m256 target_vec;
204 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
205
206 cpa0 = _mm256_set1_ps(center_point_array[0]);
207 cpa1 = _mm256_set1_ps(center_point_array[1]);
208 cpa2 = _mm256_set1_ps(center_point_array[2]);
209 cpa3 = _mm256_set1_ps(center_point_array[3]);
210 cutoff_vec = _mm256_set1_ps(*cutoff);
211 target_vec = _mm256_setzero_ps();
212
213 unsigned int i;
214
215 for (i = 0; i < eighth_points; ++i) {
216 x_to_1 = _mm256_load_ps(src0);
217 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
218 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
219 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
220 // x^1 * x^3 is slightly faster than x^2 * x^2
221 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
222
223 x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
224 x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
225
226 x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
227 x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
228 // this is slightly faster than result += (x_to_1 + x_to_3)
229 target_vec = _mm256_add_ps(x_to_1, target_vec);
230 target_vec = _mm256_add_ps(x_to_3, target_vec);
231
232 src0 += 8;
233 }
234
235 // the hadd for vector reduction has very very slight impact @ 50k iters
236 __VOLK_ATTR_ALIGNED(32) float temp_results[8];
237 target_vec = _mm256_hadd_ps(
238 target_vec,
239 target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
240 _mm256_store_ps(temp_results, target_vec);
241 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
242
243 for (i = eighth_points * 8; i < num_points; ++i) {
244 fst = *src0++;
245 fst = MAX(fst, *cutoff);
246 sq = fst * fst;
247 thrd = fst * sq;
248 frth = sq * sq;
249 *target += (center_point_array[0] * fst + center_point_array[1] * sq +
250 center_point_array[2] * thrd + center_point_array[3] * frth);
251 }
252 *target += (float)(num_points)*center_point_array[4];
253}
254#endif // LV_HAVE_AVX && LV_HAVE_FMA
255
256#ifdef LV_HAVE_AVX
257#include <immintrin.h>
258
259static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
260 float* src0,
261 float* center_point_array,
262 float* cutoff,
263 unsigned int num_points)
264{
265 const unsigned int eighth_points = num_points / 8;
266 float fst = 0.0;
267 float sq = 0.0;
268 float thrd = 0.0;
269 float frth = 0.0;
270
271 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
272 __m256 target_vec;
273 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
274
275 cpa0 = _mm256_set1_ps(center_point_array[0]);
276 cpa1 = _mm256_set1_ps(center_point_array[1]);
277 cpa2 = _mm256_set1_ps(center_point_array[2]);
278 cpa3 = _mm256_set1_ps(center_point_array[3]);
279 cutoff_vec = _mm256_set1_ps(*cutoff);
280 target_vec = _mm256_setzero_ps();
281
282 unsigned int i;
283
284 for (i = 0; i < eighth_points; ++i) {
285 x_to_1 = _mm256_load_ps(src0);
286 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
287 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
288 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
289 // x^1 * x^3 is slightly faster than x^2 * x^2
290 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
291
292 x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
293 x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
294 x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
295 x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
296
297 x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
298 x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
299 // this is slightly faster than result += (x_to_1 + x_to_3)
300 target_vec = _mm256_add_ps(x_to_1, target_vec);
301 target_vec = _mm256_add_ps(x_to_3, target_vec);
302
303 src0 += 8;
304 }
305
306 // the hadd for vector reduction has very very slight impact @ 50k iters
307 __VOLK_ATTR_ALIGNED(32) float temp_results[8];
308 target_vec = _mm256_hadd_ps(
309 target_vec,
310 target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
311 _mm256_store_ps(temp_results, target_vec);
312 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
313
314 for (i = eighth_points * 8; i < num_points; ++i) {
315 fst = *src0++;
316 fst = MAX(fst, *cutoff);
317 sq = fst * fst;
318 thrd = fst * sq;
319 frth = sq * sq;
320 *target += (center_point_array[0] * fst + center_point_array[1] * sq +
321 center_point_array[2] * thrd + center_point_array[3] * frth);
322 }
323 *target += (float)(num_points)*center_point_array[4];
324}
325#endif // LV_HAVE_AVX
326
327
328#ifdef LV_HAVE_GENERIC
329
330static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
331 float* src0,
332 float* center_point_array,
333 float* cutoff,
334 unsigned int num_points)
335{
336 const unsigned int eighth_points = num_points / 8;
337
338 float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
339 float fst = 0.0f;
340 float sq = 0.0f;
341 float thrd = 0.0f;
342 float frth = 0.0f;
343
344 unsigned int i = 0;
345 unsigned int k = 0;
346 for (i = 0; i < eighth_points; ++i) {
347 for (k = 0; k < 8; ++k) {
348 fst = *src0++;
349 fst = MAX(fst, *cutoff);
350 sq = fst * fst;
351 thrd = fst * sq;
352 frth = fst * thrd;
353 result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
354 result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
355 }
356 }
357 for (k = 0; k < 8; k += 2)
358 result[k] = result[k] + result[k + 1];
359
360 *target = result[0] + result[2] + result[4] + result[6];
361
362 for (i = eighth_points * 8; i < num_points; ++i) {
363 fst = *src0++;
364 fst = MAX(fst, *cutoff);
365 sq = fst * fst;
366 thrd = fst * sq;
367 frth = fst * thrd;
368 *target += (center_point_array[0] * fst + center_point_array[1] * sq +
369 center_point_array[2] * thrd + center_point_array[3] * frth);
370 }
371 *target += (float)(num_points)*center_point_array[4];
372}
373
374#endif /*LV_HAVE_GENERIC*/
375
376#ifdef LV_HAVE_NEON
377#include <arm_neon.h>
378
379static inline void
380volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
381 float* __restrict src0,
382 float* __restrict center_point_array,
383 float* __restrict cutoff,
384 unsigned int num_points)
385{
386 unsigned int i;
387 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
388
389 float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
390 float32x2_t cutoff_vector;
391 float32x2x2_t x_low, x_high;
392 float32x4_t x_qvector, c_qvector, cpa_qvector;
393 float accumulator;
394 float res_accumulators[4];
395
396 c_qvector = vld1q_f32(zero);
397 // load the cutoff in to a vector
398 cutoff_vector = vdup_n_f32(*cutoff);
399 // ... center point array
400 cpa_qvector = vld1q_f32(center_point_array);
401
402 for (i = 0; i < num_points; ++i) {
403 // load x (src0)
404 x_to_1 = vdup_n_f32(*src0++);
405
406 // Get a vector of max(src0, cutoff)
407 x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
408 x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
409 x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
410 x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
411 // zip up doubles to interleave
412 x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
413 x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
414 // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
415 x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
416 // now we finally have [x^4 | x^3 | x^2 | x] !
417
418 c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
419 }
420 // there should be better vector reduction techniques
421 vst1q_f32(res_accumulators, c_qvector);
422 accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
423 res_accumulators[3];
424
425 *target = accumulator + (float)num_points * center_point_array[4];
426}
427
428#endif /* LV_HAVE_NEON */
429
430
431#ifdef LV_HAVE_NEON
432
433static inline void
434volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
435 float* __restrict src0,
436 float* __restrict center_point_array,
437 float* __restrict cutoff,
438 unsigned int num_points)
439{
440 unsigned int i;
441 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
442
443 float accumulator;
444
445 float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
446 accumulator1_vec = vld1q_f32(zero);
447 accumulator2_vec = vld1q_f32(zero);
448 accumulator3_vec = vld1q_f32(zero);
449 accumulator4_vec = vld1q_f32(zero);
450 float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
451 float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
452
453 // load the cutoff in to a vector
454 cutoff_vector = vdupq_n_f32(*cutoff);
455 // ... center point array
456 cpa_0 = vdupq_n_f32(center_point_array[0]);
457 cpa_1 = vdupq_n_f32(center_point_array[1]);
458 cpa_2 = vdupq_n_f32(center_point_array[2]);
459 cpa_3 = vdupq_n_f32(center_point_array[3]);
460
461 // nathan is not sure why this is slower *and* wrong compared to neonvertfma
462 for (i = 0; i < num_points / 4; ++i) {
463 // load x
464 x_to_1 = vld1q_f32(src0);
465
466 // Get a vector of max(src0, cutoff)
467 x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
468 x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
469 x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
470 x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
471 x_to_1 = vmulq_f32(x_to_1, cpa_0);
472 x_to_2 = vmulq_f32(x_to_2, cpa_1);
473 x_to_3 = vmulq_f32(x_to_3, cpa_2);
474 x_to_4 = vmulq_f32(x_to_4, cpa_3);
475 accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
476 accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
477 accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
478 accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
479
480 src0 += 4;
481 }
482 accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
483 accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
484 accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
485
486 __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
487 vst1q_f32(res_accumulators, accumulator1_vec);
488 accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
489 res_accumulators[3];
490
491 float fst = 0.0;
492 float sq = 0.0;
493 float thrd = 0.0;
494 float frth = 0.0;
495
496 for (i = 4 * (num_points / 4); i < num_points; ++i) {
497 fst = *src0++;
498 fst = MAX(fst, *cutoff);
499
500 sq = fst * fst;
501 thrd = fst * sq;
502 frth = sq * sq;
503 // fifth = sq * thrd;
504
505 accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
506 center_point_array[2] * thrd + center_point_array[3] * frth); //+
507 }
508
509 *target = accumulator + (float)num_points * center_point_array[4];
510}
511
512#endif /* LV_HAVE_NEON */
513
514#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
515
516#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
517#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
518
519#include <inttypes.h>
520#include <stdio.h>
521#include <volk/volk_complex.h>
522
523#ifndef MAX
524#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
525#endif
526
527#if LV_HAVE_AVX && LV_HAVE_FMA
528#include <immintrin.h>
529
530static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
531 float* src0,
532 float* center_point_array,
533 float* cutoff,
534 unsigned int num_points)
535{
536 const unsigned int eighth_points = num_points / 8;
537 float fst = 0.0;
538 float sq = 0.0;
539 float thrd = 0.0;
540 float frth = 0.0;
541
542 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
543 __m256 target_vec;
544 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
545
546 cpa0 = _mm256_set1_ps(center_point_array[0]);
547 cpa1 = _mm256_set1_ps(center_point_array[1]);
548 cpa2 = _mm256_set1_ps(center_point_array[2]);
549 cpa3 = _mm256_set1_ps(center_point_array[3]);
550 cutoff_vec = _mm256_set1_ps(*cutoff);
551 target_vec = _mm256_setzero_ps();
552
553 unsigned int i;
554
555 for (i = 0; i < eighth_points; ++i) {
556 x_to_1 = _mm256_loadu_ps(src0);
557 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
558 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
559 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
560 // x^1 * x^3 is slightly faster than x^2 * x^2
561 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
562
563 x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
564 x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
565
566 x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
567 x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
568 // this is slightly faster than result += (x_to_1 + x_to_3)
569 target_vec = _mm256_add_ps(x_to_1, target_vec);
570 target_vec = _mm256_add_ps(x_to_3, target_vec);
571
572 src0 += 8;
573 }
574
575 // the hadd for vector reduction has very very slight impact @ 50k iters
576 __VOLK_ATTR_ALIGNED(32) float temp_results[8];
577 target_vec = _mm256_hadd_ps(
578 target_vec,
579 target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
580 _mm256_storeu_ps(temp_results, target_vec);
581 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
582
583 for (i = eighth_points * 8; i < num_points; ++i) {
584 fst = *src0++;
585 fst = MAX(fst, *cutoff);
586 sq = fst * fst;
587 thrd = fst * sq;
588 frth = sq * sq;
589 *target += (center_point_array[0] * fst + center_point_array[1] * sq +
590 center_point_array[2] * thrd + center_point_array[3] * frth);
591 }
592
593 *target += (float)(num_points)*center_point_array[4];
594}
595#endif // LV_HAVE_AVX && LV_HAVE_FMA
596
597#ifdef LV_HAVE_AVX
598#include <immintrin.h>
599
600static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
601 float* src0,
602 float* center_point_array,
603 float* cutoff,
604 unsigned int num_points)
605{
606 const unsigned int eighth_points = num_points / 8;
607 float fst = 0.0;
608 float sq = 0.0;
609 float thrd = 0.0;
610 float frth = 0.0;
611
612 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
613 __m256 target_vec;
614 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
615
616 cpa0 = _mm256_set1_ps(center_point_array[0]);
617 cpa1 = _mm256_set1_ps(center_point_array[1]);
618 cpa2 = _mm256_set1_ps(center_point_array[2]);
619 cpa3 = _mm256_set1_ps(center_point_array[3]);
620 cutoff_vec = _mm256_set1_ps(*cutoff);
621 target_vec = _mm256_setzero_ps();
622
623 unsigned int i;
624
625 for (i = 0; i < eighth_points; ++i) {
626 x_to_1 = _mm256_loadu_ps(src0);
627 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
628 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
629 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
630 // x^1 * x^3 is slightly faster than x^2 * x^2
631 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
632
633 x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
634 x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
635 x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
636 x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
637
638 x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
639 x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
640 // this is slightly faster than result += (x_to_1 + x_to_3)
641 target_vec = _mm256_add_ps(x_to_1, target_vec);
642 target_vec = _mm256_add_ps(x_to_3, target_vec);
643
644 src0 += 8;
645 }
646
647 // the hadd for vector reduction has very very slight impact @ 50k iters
648 __VOLK_ATTR_ALIGNED(32) float temp_results[8];
649 target_vec = _mm256_hadd_ps(
650 target_vec,
651 target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
652 _mm256_storeu_ps(temp_results, target_vec);
653 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
654
655 for (i = eighth_points * 8; i < num_points; ++i) {
656 fst = *src0++;
657 fst = MAX(fst, *cutoff);
658 sq = fst * fst;
659 thrd = fst * sq;
660 frth = sq * sq;
661
662 *target += (center_point_array[0] * fst + center_point_array[1] * sq +
663 center_point_array[2] * thrd + center_point_array[3] * frth);
664 }
665
666 *target += (float)(num_points)*center_point_array[4];
667}
668#endif // LV_HAVE_AVX
669
670#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/
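
For reference, a minimal usage sketch follows. It assumes the standard VOLK dispatcher and allocation helpers declared in volk/volk.h (volk_32f_x3_sum_of_poly_32f, volk_get_alignment, volk_malloc, volk_free); the coefficient values and buffer size are illustrative only.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const unsigned int num_points = 1024;
    float* in = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());

    // c[0..3] weight x^1..x^4, c[4] is the per-point constant term
    float center_point_array[5] = { 1.0f, 0.5f, 0.25f, 0.125f, 0.0f };
    float cutoff = 1e-30f; // inputs are clamped to at least this value
    float result = 0.0f;

    for (unsigned int i = 0; i < num_points; ++i)
        in[i] = (float)i / (float)num_points;

    // the dispatcher selects the best aligned/unaligned protokernel at runtime
    volk_32f_x3_sum_of_poly_32f(&result, in, center_point_array, &cutoff, num_points);

    printf("sum of poly = %f\n", result);
    volk_free(in);
    return 0;
}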