Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
volk_32fc_index_min_32u.h
/* -*- c++ -*- */
/*
 * Copyright 2021 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * VOLK is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * VOLK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

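Orientation: this kernel returns argmin_i |source[i]|^2, the index of the complex point with minimum magnitude. A minimal usage sketch via the public dispatcher follows; it is illustrative only and assumes the standard VOLK public API (volk.h, volk_malloc/volk_free, volk_get_alignment, lv_cmake), with the dispatcher prototype matching the protokernels below:

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const uint32_t N = 16;
    const size_t alignment = volk_get_alignment();
    lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    uint32_t* min_idx = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);

    for (uint32_t i = 0; i < N; ++i) {
        // magnitude decreases towards i == 8, where it is zero
        float re = (float)(i < 8 ? 8 - i : i - 8);
        in[i] = lv_cmake(re, 0.0f);
    }

    volk_32fc_index_min_32u(min_idx, in, N);
    printf("index of min magnitude: %u\n", (unsigned)*min_idx); // expected: 8

    volk_free(in);
    volk_free(min_idx);
    return 0;
}
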
#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
#define INCLUDED_volk_32fc_index_min_32u_a_H

#include <float.h> // FLT_MAX
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration, current_indices holds the indices
     * of the complex numbers just loaded from memory. The explanation for the
     * odd order is given in the implementation of
     * vector_32fc_index_min_variant0().
     */
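    /* In short: the helper computes |z|^2 mul/hadd-style (cf. the SSE3 kernel
     * below), and _mm256_hadd_ps interleaves the two 128-bit lanes, so the
     * eight loaded points land in element order 0, 1, 4, 5, 2, 3, 6, 7;
     * hence the initializer below. */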
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration, current_indices holds the indices
     * of the complex numbers just loaded from memory. The explanation for the
     * odd order is given in the implementation of
     * vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
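Variant 0 and variant 1 are line-for-line identical at this level; they differ only inside the volk_avx2_intrinsics.h helpers (vector_32fc_index_min_variant0() vs. vector_32fc_index_min_variant1()), which implement the same per-element compare-and-select with different instruction sequences, presumably so that volk_profile can pick whichever mix is faster on a given CPU.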

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_set_ps1(FLT_MAX);

    int bound = num_points >> 2;

    for (int i = 0; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);

        source += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if ((num_points >> 1) & 1) {
        xmm2 = _mm_load_ps((float*)source);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        source += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_points & 1) {
        sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
                  lv_cimag(source[0]) * lv_cimag(source[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_min_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
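The cmpeq/cmpgt mask arithmetic in the SSE3 kernel is a branch-free select. Per 32-bit lane it behaves like the scalar update below (an illustrative model, not part of the header; lane_argmin_update is a hypothetical name):

#include <stdint.h>

/* Scalar model of one lane of the SSE3 loop body. After m = min(v, m):
 *   eq = (v == m): v is the new minimum, take its index
 *   gt = (v >  m): the old minimum survives, keep its index
 * The masked add realizes a branch-free select, mirroring the
 * _mm_cmpeq_ps/_mm_cmpgt_ps + _mm_and_si128 + _mm_add_epi32 sequence. */
static inline uint32_t
lane_argmin_update(float v, uint32_t v_idx, float* m, uint32_t m_idx)
{
    const float old_m = *m;
    *m = (v < old_m) ? v : old_m;                     /* _mm_min_ps   */
    const uint32_t eq = (v == *m) ? 0xFFFFFFFFu : 0u; /* _mm_cmpeq_ps */
    const uint32_t gt = (v > *m) ? 0xFFFFFFFFu : 0u;  /* _mm_cmpgt_ps */
    return (v_idx & eq) + (m_idx & gt);               /* and + add    */
}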

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_min_32u_generic(uint32_t* target,
                                                   const lv_32fc_t* source,
                                                   uint32_t num_points)
{
    float sq_dist = 0.0;
    float min = FLT_MAX;
    uint32_t index = 0;

    for (uint32_t i = 0; i < num_points; ++i) {
        sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
                  lv_cimag(source[i]) * lv_cimag(source[i]);

        if (sq_dist < min) {
            index = i;
            min = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/
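Any protokernel above can be sanity-checked against the generic reference. A hedged sketch (volk_32fc_index_min_32u_check is a hypothetical helper, not part of VOLK; since several points may tie for the minimum magnitude and implementations may legally return different tied indices, the magnitudes at the returned indices are compared instead):

/* Requires num_points > 0 and, for the SSE3 kernel, aligned data. */
static inline int volk_32fc_index_min_32u_check(const lv_32fc_t* data, uint32_t n)
{
    uint32_t ref = 0, got = 0;
    volk_32fc_index_min_32u_generic(&ref, data, n);
#ifdef LV_HAVE_SSE3
    volk_32fc_index_min_32u_a_sse3(&got, data, n);
#else
    got = ref;
#endif
    const float mag_ref = lv_creal(data[ref]) * lv_creal(data[ref]) +
                          lv_cimag(data[ref]) * lv_cimag(data[ref]);
    const float mag_got = lv_creal(data[got]) * lv_creal(data[got]) +
                          lv_cimag(data[got]) * lv_cimag(data[got]);
    return mag_ref == mag_got;
}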

#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/

#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
#define INCLUDED_volk_32fc_index_min_32u_u_H

#include <float.h> // FLT_MAX
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration, current_indices holds the indices
     * of the complex numbers just loaded from memory. The explanation for the
     * odd order is given in the implementation of
     * vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration, current_indices holds the indices
     * of the complex numbers just loaded from memory. The explanation for the
     * odd order is given in the implementation of
     * vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
                                                const lv_32fc_t* source,
                                                uint32_t num_points)
{
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* sourcePtr = source;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_min_indices = vec_indices;

    if (num_points) {
        float min = FLT_MAX;
        uint32_t index = 0;

        float32x4_t vec_min = vdupq_n_f32(FLT_MAX);

        for (uint32_t number = 0; number < quarter_points; number++) {
            // Load complex and compute magnitude squared
            const float32x4_t vec_mag2 =
                _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
            __VOLK_PREFETCH(sourcePtr += 4);
            // a < b?
            const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
            vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
            vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
        }
        uint32_t tmp_min_indices[4];
        float tmp_min[4];
        vst1q_u32(tmp_min_indices, vec_min_indices);
        vst1q_f32(tmp_min, vec_min);

        for (int i = 0; i < 4; i++) {
            if (tmp_min[i] < min) {
                min = tmp_min[i];
                index = tmp_min_indices[i];
            }
        }

        // Deal with the rest
        for (uint32_t number = quarter_points * 4; number < num_points; number++) {
            const float re = lv_creal(*sourcePtr);
            const float im = lv_cimag(*sourcePtr);
            const float sq_dist = re * re + im * im;
            if (sq_dist < min) {
                min = sq_dist;
                index = number;
            }
            sourcePtr++;
        }
        *target = index;
    }
}

#endif /*LV_HAVE_NEON*/

#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/