Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32f_index_max_32u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
65#ifndef INCLUDED_volk_32f_index_max_32u_a_H
66#define INCLUDED_volk_32f_index_max_32u_a_H
67
68#include <inttypes.h>
69#include <stdio.h>
70#include <volk/volk_common.h>
71
72#ifdef LV_HAVE_SSE4_1
73#include <smmintrin.h>
74
75static inline void
76volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
77{
78 if (num_points > 0) {
79 uint32_t number = 0;
80 const uint32_t quarterPoints = num_points / 4;
81
82 float* inputPtr = (float*)src0;
83
84 __m128 indexIncrementValues = _mm_set1_ps(4);
85 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
86
87 float max = src0[0];
88 float index = 0;
89 __m128 maxValues = _mm_set1_ps(max);
90 __m128 maxValuesIndex = _mm_setzero_ps();
91 __m128 compareResults;
92 __m128 currentValues;
93
94 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
95 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
96
97 for (; number < quarterPoints; number++) {
98
99 currentValues = _mm_load_ps(inputPtr);
100 inputPtr += 4;
101 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
102
103 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
104
105 maxValuesIndex =
106 _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
107 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
108 }
109
110 // Calculate the largest value from the remaining 4 points
111 _mm_store_ps(maxValuesBuffer, maxValues);
112 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
113
114 for (number = 0; number < 4; number++) {
115 if (maxValuesBuffer[number] > max) {
116 index = maxIndexesBuffer[number];
117 max = maxValuesBuffer[number];
118 } else if (maxValuesBuffer[number] == max) {
119 if (index > maxIndexesBuffer[number])
120 index = maxIndexesBuffer[number];
121 }
122 }
123
124 number = quarterPoints * 4;
125 for (; number < num_points; number++) {
126 if (src0[number] > max) {
127 index = number;
128 max = src0[number];
129 }
130 }
131 target[0] = (uint32_t)index;
132 }
133}
134
135#endif /*LV_HAVE_SSE4_1*/
136
137
138#ifdef LV_HAVE_SSE
139
140#include <xmmintrin.h>
141
142static inline void
143volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
144{
145 if (num_points > 0) {
146 uint32_t number = 0;
147 const uint32_t quarterPoints = num_points / 4;
148
149 float* inputPtr = (float*)src0;
150
151 __m128 indexIncrementValues = _mm_set1_ps(4);
152 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
153
154 float max = src0[0];
155 float index = 0;
156 __m128 maxValues = _mm_set1_ps(max);
157 __m128 maxValuesIndex = _mm_setzero_ps();
158 __m128 compareResults;
159 __m128 currentValues;
160
161 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
162 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
163
164 for (; number < quarterPoints; number++) {
165
166 currentValues = _mm_load_ps(inputPtr);
167 inputPtr += 4;
168 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
169
170 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
171
172 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
173 _mm_andnot_ps(compareResults, maxValuesIndex));
174
175 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
176 _mm_andnot_ps(compareResults, maxValues));
177 }
178
179 // Calculate the largest value from the remaining 4 points
180 _mm_store_ps(maxValuesBuffer, maxValues);
181 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
182
183 for (number = 0; number < 4; number++) {
184 if (maxValuesBuffer[number] > max) {
185 index = maxIndexesBuffer[number];
186 max = maxValuesBuffer[number];
187 } else if (maxValuesBuffer[number] == max) {
188 if (index > maxIndexesBuffer[number])
189 index = maxIndexesBuffer[number];
190 }
191 }
192
193 number = quarterPoints * 4;
194 for (; number < num_points; number++) {
195 if (src0[number] > max) {
196 index = number;
197 max = src0[number];
198 }
199 }
200 target[0] = (uint32_t)index;
201 }
202}
203
204#endif /*LV_HAVE_SSE*/
205
206
207#ifdef LV_HAVE_AVX
208#include <immintrin.h>
209
210static inline void
211volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
212{
213 if (num_points > 0) {
214 uint32_t number = 0;
215 const uint32_t quarterPoints = num_points / 8;
216
217 float* inputPtr = (float*)src0;
218
219 __m256 indexIncrementValues = _mm256_set1_ps(8);
220 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
221
222 float max = src0[0];
223 float index = 0;
224 __m256 maxValues = _mm256_set1_ps(max);
225 __m256 maxValuesIndex = _mm256_setzero_ps();
226 __m256 compareResults;
227 __m256 currentValues;
228
229 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
230 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
231
232 for (; number < quarterPoints; number++) {
233 currentValues = _mm256_load_ps(inputPtr);
234 inputPtr += 8;
235 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
236 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
237 maxValuesIndex =
238 _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
239 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
240 }
241
242 // Calculate the largest value from the remaining 8 points
243 _mm256_store_ps(maxValuesBuffer, maxValues);
244 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
245
246 for (number = 0; number < 8; number++) {
247 if (maxValuesBuffer[number] > max) {
248 index = maxIndexesBuffer[number];
249 max = maxValuesBuffer[number];
250 } else if (maxValuesBuffer[number] == max) {
251 if (index > maxIndexesBuffer[number])
252 index = maxIndexesBuffer[number];
253 }
254 }
255
256 number = quarterPoints * 8;
257 for (; number < num_points; number++) {
258 if (src0[number] > max) {
259 index = number;
260 max = src0[number];
261 }
262 }
263 target[0] = (uint32_t)index;
264 }
265}
266
267#endif /*LV_HAVE_AVX*/
268
269
270#ifdef LV_HAVE_NEON
271#include <arm_neon.h>
272
/*
 * NEON kernel: stores in target[0] the index of the largest value among the
 * first num_points floats of src0.
 *
 * target      receives the index of the maximum; left untouched when
 *             num_points is 0.
 * src0        input samples, read four at a time with vld1q_f32.
 * num_points  number of samples in src0.
 *
 * When the maximum occurs more than once, the lowest index is reported.
 * NOTE(review): lane indices are advanced as single-precision floats before
 * being converted to integers, so very large num_points may not be indexed
 * exactly - confirm the supported input size.
 */
static inline void
volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points > 0) {
        uint32_t number = 0;
        const uint32_t quarterPoints = num_points / 4;

        /* vld1q_f32 only reads, so the const qualifier of src0 is kept. */
        const float* inputPtr = src0;
        float32x4_t indexIncrementValues = vdupq_n_f32(4);
        /* Lanes start at -4..-1 so the first increment produces indices 0..3. */
        float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
        float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);

        float max = src0[0];
        float index = 0;
        float32x4_t maxValues = vdupq_n_f32(max);
        uint32x4_t maxValuesIndex = vmovq_n_u32(0);
        uint32x4_t compareResults;
        uint32x4_t currentIndexes_u;
        float32x4_t currentValues;

        __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
        __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];

        for (; number < quarterPoints; number++) {
            currentValues = vld1q_f32(inputPtr);
            inputPtr += 4;
            currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
            currentIndexes_u = vcvtq_u32_f32(currentIndexes);
            /* Mask is set where the new sample does NOT beat the lane's best. */
            compareResults = vcleq_f32(currentValues, maxValues);
            /* Keep the old index under the mask, take the new one elsewhere. */
            maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex),
                                       vbicq_u32(currentIndexes_u, compareResults));
            maxValues = vmaxq_f32(currentValues, maxValues);
        }

        /* Collapse the four per-lane winners into a single scalar winner. */
        vst1q_f32(maxValuesBuffer, maxValues);
        vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
        for (number = 0; number < 4; number++) {
            if (maxValuesBuffer[number] > max) {
                index = maxIndexesBuffer[number];
                max = maxValuesBuffer[number];
            } else if (maxValuesBuffer[number] == max) {
                /*
                 * Compare against the spilled buffer rather than subscripting
                 * the vector register, which is not portable NEON usage.
                 * Same value seen in another lane: keep the earlier position.
                 */
                if (index > maxIndexesBuffer[number])
                    index = maxIndexesBuffer[number];
            }
        }

        /* Scalar pass over the samples that did not fill a whole vector. */
        number = quarterPoints * 4;
        for (; number < num_points; number++) {
            if (src0[number] > max) {
                index = number;
                max = src0[number];
            }
        }
        target[0] = (uint32_t)index;
    }
}
331
332#endif /*LV_HAVE_NEON*/
333
334
335#ifdef LV_HAVE_GENERIC
336
/*
 * Portable scalar kernel: stores in target[0] the index of the largest value
 * among the first num_points floats of src0.
 *
 * target      receives the index of the maximum; left untouched when
 *             num_points is 0.
 * src0        input samples.
 * num_points  number of samples in src0.
 *
 * The comparison is strict, so when the maximum occurs more than once the
 * first occurrence is reported.
 */
static inline void
volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points == 0) {
        return;
    }

    uint32_t bestIndex = 0;
    float bestValue = src0[0];

    /* Element 0 is the initial candidate, so scanning starts at 1. */
    for (uint32_t k = 1; k < num_points; ++k) {
        if (src0[k] > bestValue) {
            bestValue = src0[k];
            bestIndex = k;
        }
    }

    target[0] = bestIndex;
}
355
356#endif /*LV_HAVE_GENERIC*/
357
358
359#endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
360
361
362#ifndef INCLUDED_volk_32f_index_max_32u_u_H
363#define INCLUDED_volk_32f_index_max_32u_u_H
364
365#include <inttypes.h>
366#include <stdio.h>
367#include <volk/volk_common.h>
368
369
370#ifdef LV_HAVE_AVX
371#include <immintrin.h>
372
373static inline void
374volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
375{
376 if (num_points > 0) {
377 uint32_t number = 0;
378 const uint32_t quarterPoints = num_points / 8;
379
380 float* inputPtr = (float*)src0;
381
382 __m256 indexIncrementValues = _mm256_set1_ps(8);
383 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
384
385 float max = src0[0];
386 float index = 0;
387 __m256 maxValues = _mm256_set1_ps(max);
388 __m256 maxValuesIndex = _mm256_setzero_ps();
389 __m256 compareResults;
390 __m256 currentValues;
391
392 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
393 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
394
395 for (; number < quarterPoints; number++) {
396 currentValues = _mm256_loadu_ps(inputPtr);
397 inputPtr += 8;
398 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
399 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
400 maxValuesIndex =
401 _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
402 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
403 }
404
405 // Calculate the largest value from the remaining 8 points
406 _mm256_store_ps(maxValuesBuffer, maxValues);
407 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
408
409 for (number = 0; number < 8; number++) {
410 if (maxValuesBuffer[number] > max) {
411 index = maxIndexesBuffer[number];
412 max = maxValuesBuffer[number];
413 } else if (maxValuesBuffer[number] == max) {
414 if (index > maxIndexesBuffer[number])
415 index = maxIndexesBuffer[number];
416 }
417 }
418
419 number = quarterPoints * 8;
420 for (; number < num_points; number++) {
421 if (src0[number] > max) {
422 index = number;
423 max = src0[number];
424 }
425 }
426 target[0] = (uint32_t)index;
427 }
428}
429
430#endif /*LV_HAVE_AVX*/
431
432
433#ifdef LV_HAVE_SSE4_1
434#include <smmintrin.h>
435
436static inline void
437volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
438{
439 if (num_points > 0) {
440 uint32_t number = 0;
441 const uint32_t quarterPoints = num_points / 4;
442
443 float* inputPtr = (float*)src0;
444
445 __m128 indexIncrementValues = _mm_set1_ps(4);
446 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
447
448 float max = src0[0];
449 float index = 0;
450 __m128 maxValues = _mm_set1_ps(max);
451 __m128 maxValuesIndex = _mm_setzero_ps();
452 __m128 compareResults;
453 __m128 currentValues;
454
455 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
456 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
457
458 for (; number < quarterPoints; number++) {
459 currentValues = _mm_loadu_ps(inputPtr);
460 inputPtr += 4;
461 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
462 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
463 maxValuesIndex =
464 _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
465 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
466 }
467
468 // Calculate the largest value from the remaining 4 points
469 _mm_store_ps(maxValuesBuffer, maxValues);
470 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
471
472 for (number = 0; number < 4; number++) {
473 if (maxValuesBuffer[number] > max) {
474 index = maxIndexesBuffer[number];
475 max = maxValuesBuffer[number];
476 } else if (maxValuesBuffer[number] == max) {
477 if (index > maxIndexesBuffer[number])
478 index = maxIndexesBuffer[number];
479 }
480 }
481
482 number = quarterPoints * 4;
483 for (; number < num_points; number++) {
484 if (src0[number] > max) {
485 index = number;
486 max = src0[number];
487 }
488 }
489 target[0] = (uint32_t)index;
490 }
491}
492
493#endif /*LV_HAVE_SSE4_1*/
494
495#ifdef LV_HAVE_SSE
496#include <xmmintrin.h>
497
498static inline void
499volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
500{
501 if (num_points > 0) {
502 uint32_t number = 0;
503 const uint32_t quarterPoints = num_points / 4;
504
505 float* inputPtr = (float*)src0;
506
507 __m128 indexIncrementValues = _mm_set1_ps(4);
508 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
509
510 float max = src0[0];
511 float index = 0;
512 __m128 maxValues = _mm_set1_ps(max);
513 __m128 maxValuesIndex = _mm_setzero_ps();
514 __m128 compareResults;
515 __m128 currentValues;
516
517 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
518 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
519
520 for (; number < quarterPoints; number++) {
521 currentValues = _mm_loadu_ps(inputPtr);
522 inputPtr += 4;
523 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
524 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
525 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
526 _mm_andnot_ps(compareResults, maxValuesIndex));
527 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
528 _mm_andnot_ps(compareResults, maxValues));
529 }
530
531 // Calculate the largest value from the remaining 4 points
532 _mm_store_ps(maxValuesBuffer, maxValues);
533 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
534
535 for (number = 0; number < 4; number++) {
536 if (maxValuesBuffer[number] > max) {
537 index = maxIndexesBuffer[number];
538 max = maxValuesBuffer[number];
539 } else if (maxValuesBuffer[number] == max) {
540 if (index > maxIndexesBuffer[number])
541 index = maxIndexesBuffer[number];
542 }
543 }
544
545 number = quarterPoints * 4;
546 for (; number < num_points; number++) {
547 if (src0[number] > max) {
548 index = number;
549 max = src0[number];
550 }
551 }
552 target[0] = (uint32_t)index;
553 }
554}
555
556#endif /*LV_HAVE_SSE*/
557
558#endif /*INCLUDED_volk_32f_index_max_32u_u_H*/
static void volk_32f_index_max_32u_neon(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:274
static void volk_32f_index_max_32u_a_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:143
static void volk_32f_index_max_32u_u_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:499
static void volk_32f_index_max_32u_generic(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:338
static void volk_32f_index_max_32u_a_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:211
static void volk_32f_index_max_32u_u_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:374
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
for i
Definition: volk_config_fixed.tmpl.h:25