/* Vector Optimized Library of Kernels (VOLK) 2.5.1
 * volk_32fc_index_max_16u.h: architecture-tuned kernels that write the
 * index (as uint16_t) of the complex sample with the largest magnitude.
 */
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * AVX2 variant 0, aligned loads: writes to *target the index of the sample
 * in src0 with the largest squared magnitude (re^2 + im^2).
 *
 * \param target     output: 16-bit index of the max-magnitude sample
 * \param src0       input buffer of complex floats (32-byte aligned)
 * \param num_points number of samples to scan; clamped to USHRT_MAX so the
 *                   result always fits into the 16-bit output
 */
static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    /* The result index must fit into 16 bits. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    /* Process 8 complex samples (two 256-bit loads) per iteration. */
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    /* determine maximum value and index in the result of the vectorized loop */
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    /* handle tail not processed by the vectorized loop */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
144
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * AVX2 variant 1, aligned loads: writes to *target the index of the sample
 * in src0 with the largest squared magnitude (re^2 + im^2). Differs from
 * variant 0 only in which AVX2 max/index update helper it calls.
 *
 * \param target     output: 16-bit index of the max-magnitude sample
 * \param src0       input buffer of complex floats (32-byte aligned)
 * \param num_points number of samples to scan; clamped to USHRT_MAX so the
 *                   result always fits into the 16-bit output
 */
static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    /* The result index must fit into 16 bits. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    /* Process 8 complex samples (two 256-bit loads) per iteration. */
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    /* determine maximum value and index in the result of the vectorized loop */
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    /* handle tail not processed by the vectorized loop */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
204
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

/*
 * SSE3 implementation, aligned loads: writes to *target the index of the
 * sample in src0 with the largest squared magnitude (re^2 + im^2).
 * num_points is clamped to USHRT_MAX so the index fits into 16 bits.
 * Tracks four running maxima (one per SSE lane) plus their indices, then
 * reduces the four lanes at the end.
 */
static inline void
volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    /* The result index must fit into 16 bits. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const uint32_t num_bytes = num_points * 8; /* 8 bytes per complex float */

    /* bit128 unions let the same 128-bit value be used both as a float
     * comparison mask and as integer data for index arithmetic. */
    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5; /* main loop: 32 bytes = 4 samples per pass */
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);  /* indices of the 4 samples in flight */
    xmm9 = _mm_setzero_si128();         /* per-lane index of the current max */
    xmm10 = _mm_setr_epi32(4, 4, 4, 4); /* index increment per iteration */
    xmm3 = _mm_setzero_ps();            /* per-lane running max of |x|^2 */

    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        /* horizontal add pairs re^2+im^2 into 4 squared magnitudes */
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        /* cmpeq: lanes where the new value is the (new) max -> take the new
         * index from xmm8; cmplt: lanes where it lost -> keep the old index
         * from xmm9. The masked pieces are recombined by the add below. */
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    /* tail: two remaining samples (16 bytes), handled in the low lanes */
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        /* duplicate the low two candidate indices into all four lanes so the
         * same mask/blend scheme as the main loop can be reused */
        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2); /* only 2 samples consumed here */

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    /* tail: one final sample (8 bytes), compared in lane 0 only */
    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3; /* previous maxima */

        xmm3 = _mm_max_ss(xmm3, xmm2); /* only lane 0 can change */

        /* mask roles are swapped vs. the loop above: cmplt (old < new) means
         * the NEW sample won -> take its index; cmpeq (old == new) means the
         * old max survived -> keep the old index. */
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00); /* broadcast candidate index */

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    /* reduce the four lanes to a single max value / index pair */
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
322
#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar reference implementation: writes to *target the index of
 * the sample in src0 with the largest squared magnitude (re^2 + im^2).
 * num_points is clamped to USHRT_MAX so the index fits into 16 bits.
 */
static inline void
volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    /* The result index must fit into 16 bits. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    float best_sq_dist = 0.0;  /* largest |x|^2 seen so far */
    uint16_t best_index = 0;   /* its position in src0 */

    for (uint32_t k = 0; k < num_points; ++k) {
        const float sq_dist =
            lv_creal(src0[k]) * lv_creal(src0[k]) + lv_cimag(src0[k]) * lv_cimag(src0[k]);

        if (sq_dist > best_sq_dist) {
            best_sq_dist = sq_dist;
            best_index = k;
        }
    }
    target[0] = best_index;
}

#endif /*LV_HAVE_GENERIC*/
350
#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/

#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

361
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * AVX2 variant 0, unaligned loads: writes to *target the index of the sample
 * in src0 with the largest squared magnitude (re^2 + im^2).
 *
 * \param target     output: 16-bit index of the max-magnitude sample
 * \param src0       input buffer of complex floats (no alignment required)
 * \param num_points number of samples to scan; clamped to USHRT_MAX so the
 *                   result always fits into the 16-bit output
 */
static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    /* The result index must fit into 16 bits. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    /* Process 8 complex samples (two 256-bit loads) per iteration. */
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    /* determine maximum value and index in the result of the vectorized loop */
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    /* handle tail not processed by the vectorized loop */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
421
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * AVX2 variant 1, unaligned loads: writes to *target the index of the sample
 * in src0 with the largest squared magnitude (re^2 + im^2). Differs from
 * variant 0 only in which AVX2 max/index update helper it calls.
 *
 * \param target     output: 16-bit index of the max-magnitude sample
 * \param src0       input buffer of complex floats (no alignment required)
 * \param num_points number of samples to scan; clamped to USHRT_MAX so the
 *                   result always fits into the 16-bit output
 */
static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    /* The result index must fit into 16 bits. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    /* Process 8 complex samples (two 256-bit loads) per iteration. */
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    /* determine maximum value and index in the result of the vectorized loop */
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    /* handle tail not processed by the vectorized loop */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/
/* External definitions referenced above (Doxygen cross-reference residue,
 * repaired into a comment):
 * - union bit128 { float f[4]; uint32_t i[4]; __m128 float_vec;
 *   __m128i int_vec; } and the bit128_p(x) cast helper -- volk_common.h
 * - __VOLK_ATTR_ALIGNED(x) -- volk_common.h
 * - lv_32fc_t (float complex), lv_creal(x), lv_cimag(x) -- volk_complex.h
 * - vector_32fc_index_max_variant0() / vector_32fc_index_max_variant1()
 *   -- volk_avx2_intrinsics.h
 */