#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H
#include <inttypes.h>
#include <limits.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The unusual ordering is explained
     * in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
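/*
 * Note (editorial): vector_32fc_index_max_variant0() is a helper defined in
 * volk/volk_avx2_intrinsics.h. Roughly, it squares and horizontally adds the
 * two input registers to obtain |x|^2 for eight complex points at once,
 * compares those against the per-lane running maxima, blends the winning
 * entries of current_indices into max_indices, and then advances
 * current_indices by indices_increment.
 */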
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The unusual ordering is explained
     * in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
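/*
 * Note (editorial): variant 0 and variant 1 share this exact structure; they
 * differ only in how the compare-and-select step inside
 * vector_32fc_index_max_variant0()/variant1() (volk/volk_avx2_intrinsics.h) is
 * implemented, so VOLK's profiling can pick whichever is faster on a given CPU.
 */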
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void
volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5;
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_setzero_ps();
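    /*
     * Main loop (editorial comment): each iteration consumes four complex
     * points. xmm3 accumulates the four per-lane maxima of |x|^2, xmm8 holds
     * the indices of the candidates just loaded, and xmm9 the indices of the
     * current maxima. After _mm_max_ps, cmpeq marks lanes where the new values
     * won (or tied) and cmplt the lanes where the old maxima survived.
     */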
    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }
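    /*
     * Tail handling (editorial comment): if two complex points (16 bytes)
     * remain, they are processed as a half-width vector; the low half of xmm8
     * is duplicated so the index arithmetic keeps working with a stride of 2.
     */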
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }
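    /*
     * Final odd point (editorial comment): a single remaining complex value
     * (8 bytes) has its |x|^2 computed in scalar form, broadcast with
     * _mm_load1_ps, and folded into the running maxima via _mm_max_ss, so only
     * lane 0 can change.
     */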
    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }
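    /*
     * Horizontal reduction (editorial comment): the four lane maxima in xmm3
     * and their indices in xmm9 are spilled into the bit128 unions and reduced
     * with scalar ternaries to a single winning index in target[0].
     */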
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_max_16u_generic(uint16_t* target,
                                                   lv_32fc_t* src0,
                                                   uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint16_t index = 0;

    for (uint32_t i = 0; i < num_bytes >> 3; ++i) {
        sq_dist =
            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}
#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
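/*
 * Usage sketch (editorial, illustrative; the buffer name is hypothetical).
 * Application code normally calls the volk_32fc_index_max_16u() dispatcher
 * from <volk/volk.h>, which forwards to the fastest of the kernels above for
 * the running CPU:
 *
 *   uint16_t index = 0;
 *   lv_32fc_t* samples = (lv_32fc_t*)volk_malloc(
 *       num_points * sizeof(lv_32fc_t), volk_get_alignment());
 *   // ... fill samples ...
 *   volk_32fc_index_max_16u(&index, samples, num_points);
 *   // index is the position of the sample with the largest |x|^2
 *   volk_free(samples);
 */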
#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H

#include <inttypes.h>
#include <limits.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The unusual ordering is explained
     * in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
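/*
 * Note (editorial): the _u_ variants are identical to their _a_ counterparts
 * except that they load with _mm256_loadu_ps, so src0 need not be 32-byte
 * aligned; the local reduction buffers remain aligned via __VOLK_ATTR_ALIGNED.
 */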
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The unusual ordering is explained
     * in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/