46#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
47#define INCLUDED_volk_32fc_convert_16ic_a_H
/*
 * volk_32fc_convert_16ic_a_avx2 -- AVX2 kernel, aligned load/store variant.
 *
 * Reads the input as a flat float array, clamps every value to
 * [SHRT_MIN, SHRT_MAX], converts to 32-bit integers, packs to int16_t and
 * writes the result to outputVector. A scalar loop handles whatever the
 * vector loop leaves over.
 *
 * NOTE(review): this is a listing fragment. Every line starts with a number
 * (apparently the original file's line number) and several numbered lines
 * are absent: the `inputVector` parameter, the declarations of `i`, `aux`,
 * `ret1` and `ret2`, the opening/closing braces, and parts of the tail loop.
 * Restore the full text before compiling.
 */
56static inline void volk_32fc_convert_16ic_a_avx2(
lv_16sc_t* outputVector,
58 unsigned int num_points)
/* One vector iteration covers 8 points = 16 floats (the tail loop below
 * starts at avx_iters * 16 and runs to num_points * 2). */
60 const unsigned int avx_iters = num_points / 8;
62 float* inputVectorPtr = (
float*)inputVector;
63 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of the int16_t output, as floats. */
66 const float min_val = (float)SHRT_MIN;
67 const float max_val = (float)SHRT_MAX;
69 __m256 inputVal1, inputVal2;
70 __m256i intInputVal1, intInputVal2;
/* Limits broadcast to all eight lanes for the vector clamp. */
72 const __m256 vmin_val = _mm256_set1_ps(min_val);
73 const __m256 vmax_val = _mm256_set1_ps(max_val);
76 for (
i = 0;
i < avx_iters;
i++) {
77 inputVal1 = _mm256_load_ps((
float*)inputVectorPtr);
/* NOTE(review): as listed, both loads read the same address. The numbered
 * lines around this load (78, 80-82) are missing and presumably advance
 * inputVectorPtr -- confirm against the original file. */
79 inputVal2 = _mm256_load_ps((
float*)inputVectorPtr);
/* Clamp: min against the upper limit, then max against the lower limit. */
84 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
85 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
87 intInputVal1 = _mm256_cvtps_epi32(ret1);
88 intInputVal2 = _mm256_cvtps_epi32(ret2);
/* Pack the two 8 x int32 vectors into one 16 x int16 vector. The pack works
 * within each 128-bit half, so the 0xd8 permute (64-bit element order
 * 0, 2, 1, 3) is presumably what restores source order -- see the Intel
 * intrinsics documentation. */
90 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
91 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
93 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
94 outputVectorPtr += 16;
/* Scalar tail for the values the vector loop did not cover.
 * NOTE(review): only the lower-bound test survives in this listing; the
 * upper-bound test and both clamp assignments are on missing lines and
 * presumably limit aux to [min_val, max_val] -- confirm. */
97 for (
i = avx_iters * 16;
i < num_points * 2;
i++) {
98 aux = *inputVectorPtr++;
101 else if (aux < min_val)
103 *outputVectorPtr++ = (int16_t)
rintf(aux);
109#include <emmintrin.h>
/*
 * SSE2 float -> int16_t conversion kernel using aligned loads and stores
 * (_mm_load_ps / _mm_store_si128).
 *
 * NOTE(review): the opening lines of this function are not in the listing.
 * The cross-reference index at the end of the listing places
 * volk_32fc_convert_16ic_a_sse2 at line 111, so this is presumably that
 * function -- confirm. The declarations of `i`, `aux`, `ret1`, `ret2`, the
 * braces, and parts of the tail loop are missing as well.
 */
113 unsigned int num_points)
/* One vector iteration covers 4 points = 8 floats (the tail loop below
 * starts at sse_iters * 8 and runs to num_points * 2). */
115 const unsigned int sse_iters = num_points / 4;
117 float* inputVectorPtr = (
float*)inputVector;
118 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of the int16_t output, as floats. */
121 const float min_val = (float)SHRT_MIN;
122 const float max_val = (float)SHRT_MAX;
124 __m128 inputVal1, inputVal2;
125 __m128i intInputVal1, intInputVal2;
/* Limits broadcast to all four lanes for the vector clamp. */
127 const __m128 vmin_val = _mm_set_ps1(min_val);
128 const __m128 vmax_val = _mm_set_ps1(max_val);
131 for (
i = 0;
i < sse_iters;
i++) {
132 inputVal1 = _mm_load_ps((
float*)inputVectorPtr);
/* NOTE(review): as listed, both loads read the same address. The numbered
 * lines around this load (133, 135-137) are missing and presumably advance
 * inputVectorPtr -- confirm against the original file. */
134 inputVal2 = _mm_load_ps((
float*)inputVectorPtr);
/* Clamp: min against the upper limit, then max against the lower limit. */
139 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
140 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
142 intInputVal1 = _mm_cvtps_epi32(ret1);
143 intInputVal2 = _mm_cvtps_epi32(ret2);
/* Pack the two 4 x int32 vectors into one 8 x int16 vector. */
145 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
147 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
148 outputVectorPtr += 8;
/* Scalar tail for the values the vector loop did not cover.
 * NOTE(review): only the lower-bound test survives in this listing; the
 * upper-bound test and both clamp assignments are on missing lines and
 * presumably limit aux to [min_val, max_val] -- confirm. */
151 for (
i = sse_iters * 8;
i < num_points * 2;
i++) {
152 aux = *inputVectorPtr++;
155 else if (aux < min_val)
157 *outputVectorPtr++ = (int16_t)
rintf(aux);
/*
 * VCVTRQ_S32_F32(res, val): issues one VCVTR.S32.F32 instruction per lane
 * index 0..3, writing res[k] from val[k] (the lane-3 statement shows both
 * operands in full).
 *
 * NOTE(review): the input-operand lines for lanes 0, 1 and 2 (numbered
 * 169-170, 173-174, 177-178) are missing from this listing; they presumably
 * mirror lane 3 -- confirm. VCVTR is understood to round using the rounding
 * mode held in the FPSCR rather than truncating; verify against the ARM
 * reference before relying on it.
 */
166#define VCVTRQ_S32_F32(res, val) \
167 __VOLK_ASM("VCVTR.S32.F32 %[r0], %[v0]\n\t" \
168 : [r0] "=w"(res[0]) \
171 __VOLK_ASM("VCVTR.S32.F32 %[r1], %[v1]\n\t" \
172 : [r1] "=w"(res[1]) \
175 __VOLK_ASM("VCVTR.S32.F32 %[r2], %[v2]\n\t" \
176 : [r2] "=w"(res[2]) \
179 __VOLK_ASM("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3] "=w"(res[3]) : [v3] "w"(val[3]) :);
/*
 * volk_32fc_convert_16ic_neon -- NEON kernel that converts via the
 * VCVTRQ_S32_F32 macro.
 *
 * Reads the input as a flat float32_t array, clamps every value to
 * [SHRT_MIN, SHRT_MAX], converts to 32-bit integers, narrows to int16_t and
 * writes the result to outputVector. A scalar loop handles whatever the
 * vector loop leaves over.
 *
 * NOTE(review): this is a listing fragment. Every line starts with a number
 * (apparently the original file's line number) and several numbered lines
 * are absent: the `inputVector` parameter, the declarations of `i`, `aux`
 * and `res`, the braces, and parts of the tail loop.
 */
181static inline void volk_32fc_convert_16ic_neon(
lv_16sc_t* outputVector,
183 unsigned int num_points)
/* One vector iteration covers 4 points = 8 floats (the tail loop below
 * starts at neon_iters * 8 and runs to num_points * 2). */
186 const unsigned int neon_iters = num_points / 4;
188 float32_t* inputVectorPtr = (float32_t*)inputVector;
189 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of the int16_t output, as floats. */
191 const float min_val_f = (float)SHRT_MIN;
192 const float max_val_f = (float)SHRT_MAX;
/* The same limits broadcast to all four lanes. */
196 const float32x4_t min_val = vmovq_n_f32(min_val_f);
197 const float32x4_t max_val = vmovq_n_f32(max_val_f);
198 float32x4_t ret1, ret2, a, b;
200 int32x4_t toint_a = { 0, 0, 0, 0 };
201 int32x4_t toint_b = { 0, 0, 0, 0 };
202 int16x4_t intInputVal1, intInputVal2;
205 for (
i = 0;
i < neon_iters;
i++) {
206 a = vld1q_f32((
const float32_t*)(inputVectorPtr));
/* NOTE(review): as listed, both loads read the same address. The numbered
 * lines around this load (207, 209-211) are missing and presumably advance
 * inputVectorPtr -- confirm against the original file. */
208 b = vld1q_f32((
const float32_t*)(inputVectorPtr));
/* Clamp: min against the upper limit, then max against the lower limit. */
212 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
213 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
/* Lane-by-lane float -> int32 conversion through inline assembly. */
216 VCVTRQ_S32_F32(toint_a, ret1);
217 VCVTRQ_S32_F32(toint_b, ret2);
/* Narrow each int32x4 to int16x4, then join the halves into one int16x8. */
219 intInputVal1 = vqmovn_s32(toint_a);
220 intInputVal2 = vqmovn_s32(toint_b);
222 res = vcombine_s16(intInputVal1, intInputVal2);
223 vst1q_s16((int16_t*)outputVectorPtr, res);
224 outputVectorPtr += 8;
/* Scalar tail for the values the vector loop did not cover.
 * NOTE(review): only the lower-bound test survives in this listing; the
 * upper-bound test and both clamp assignments are on missing lines and
 * presumably limit aux to [min_val_f, max_val_f] -- confirm. */
227 for (
i = neon_iters * 8;
i < num_points * 2;
i++) {
228 aux = *inputVectorPtr++;
231 else if (aux < min_val_f)
233 *outputVectorPtr++ = (int16_t)
rintf(aux);
/*
 * volk_32fc_convert_16ic_neonv8 -- NEON kernel that converts with intrinsics
 * only (vrndiq_f32 followed by vcvtq_s32_f32), no inline assembly.
 *
 * Reads the input as a flat float32_t array, clamps every value to
 * [SHRT_MIN, SHRT_MAX], converts to 32-bit integers, narrows to int16_t and
 * writes the result to outputVector. A scalar loop handles whatever the
 * vector loop leaves over.
 *
 * NOTE(review): this is a listing fragment. Every line starts with a number
 * (apparently the original file's line number) and several numbered lines
 * are absent: the `inputVector` parameter, the declarations of `i`, `aux`
 * and `res`, the braces, and parts of the tail loop.
 */
243static inline void volk_32fc_convert_16ic_neonv8(
lv_16sc_t* outputVector,
245 unsigned int num_points)
/* One vector iteration covers 4 points = 8 floats (the tail loop below
 * starts at neon_iters * 8 and runs to num_points * 2). */
247 const unsigned int neon_iters = num_points / 4;
249 float32_t* inputVectorPtr = (float32_t*)inputVector;
250 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of the int16_t output, as floats. */
252 const float min_val_f = (float)SHRT_MIN;
253 const float max_val_f = (float)SHRT_MAX;
/* The same limits broadcast to all four lanes. */
257 const float32x4_t min_val = vmovq_n_f32(min_val_f);
258 const float32x4_t max_val = vmovq_n_f32(max_val_f);
259 float32x4_t ret1, ret2, a, b;
261 int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
262 int16x4_t intInputVal1, intInputVal2;
265 for (
i = 0;
i < neon_iters;
i++) {
266 a = vld1q_f32((
const float32_t*)(inputVectorPtr));
/* NOTE(review): as listed, both loads read the same address. The numbered
 * lines around this load (267, 269-271) are missing and presumably advance
 * inputVectorPtr -- confirm against the original file. */
268 b = vld1q_f32((
const float32_t*)(inputVectorPtr));
/* Clamp: min against the upper limit, then max against the lower limit. */
272 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
273 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
/* Round to an integral float first, then convert to int32. Presumably the
 * explicit rounding step is there because the plain conversion would
 * otherwise truncate -- confirm against the Arm intrinsics reference. */
276 toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
277 toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
/* Narrow each int32x4 to int16x4, then join the halves into one int16x8. */
279 intInputVal1 = vqmovn_s32(toint_a);
280 intInputVal2 = vqmovn_s32(toint_b);
282 res = vcombine_s16(intInputVal1, intInputVal2);
283 vst1q_s16((int16_t*)outputVectorPtr, res);
284 outputVectorPtr += 8;
/* Scalar tail for the values the vector loop did not cover.
 * NOTE(review): only the lower-bound test survives in this listing; the
 * upper-bound test and both clamp assignments are on missing lines and
 * presumably limit aux to [min_val_f, max_val_f] -- confirm. */
287 for (
i = neon_iters * 8;
i < num_points * 2;
i++) {
288 aux = *inputVectorPtr++;
291 else if (aux < min_val_f)
293 *outputVectorPtr++ = (int16_t)
rintf(aux);
299#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar float -> int16_t conversion: walks all num_points * 2
 * floats of the input, one value per iteration, and writes each as int16_t
 * after rounding with rintf().
 *
 * NOTE(review): the opening lines of this function are not in the listing.
 * The cross-reference index at the end of the listing places
 * volk_32fc_convert_16ic_generic at line 301, so this is presumably that
 * function -- confirm. The declarations of `i` and `aux`, the braces, and
 * parts of the loop body are missing as well.
 */
303 unsigned int num_points)
305 float* inputVectorPtr = (
float*)inputVector;
306 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of the int16_t output, as floats. */
307 const float min_val = (float)SHRT_MIN;
308 const float max_val = (float)SHRT_MAX;
/* NOTE(review): only the lower-bound test survives in this listing; the
 * upper-bound test and both clamp assignments are on missing lines and
 * presumably limit aux to [min_val, max_val] -- confirm. */
311 for (
i = 0;
i < num_points * 2;
i++) {
312 aux = *inputVectorPtr++;
315 else if (aux < min_val)
317 *outputVectorPtr++ = (int16_t)
rintf(aux);
324#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
325#define INCLUDED_volk_32fc_convert_16ic_u_H
333#include <immintrin.h>
/*
 * volk_32fc_convert_16ic_u_avx2 -- AVX2 kernel, unaligned load/store variant
 * (_mm256_loadu_ps / _mm256_storeu_si256).
 *
 * Reads the input as a flat float array, clamps every value to
 * [SHRT_MIN, SHRT_MAX], converts to 32-bit integers, packs to int16_t and
 * writes the result to outputVector. A scalar loop handles whatever the
 * vector loop leaves over.
 *
 * NOTE(review): this is a listing fragment. Every line starts with a number
 * (apparently the original file's line number) and several numbered lines
 * are absent: the `inputVector` parameter, the declarations of `i`, `aux`,
 * `ret1` and `ret2`, the braces, and parts of the tail loop.
 */
335static inline void volk_32fc_convert_16ic_u_avx2(
lv_16sc_t* outputVector,
337 unsigned int num_points)
/* One vector iteration covers 8 points = 16 floats (the tail loop below
 * starts at avx_iters * 16 and runs to num_points * 2). */
339 const unsigned int avx_iters = num_points / 8;
341 float* inputVectorPtr = (
float*)inputVector;
342 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of the int16_t output, as floats. */
345 const float min_val = (float)SHRT_MIN;
346 const float max_val = (float)SHRT_MAX;
348 __m256 inputVal1, inputVal2;
349 __m256i intInputVal1, intInputVal2;
/* Limits broadcast to all eight lanes for the vector clamp. */
351 const __m256 vmin_val = _mm256_set1_ps(min_val);
352 const __m256 vmax_val = _mm256_set1_ps(max_val);
355 for (
i = 0;
i < avx_iters;
i++) {
356 inputVal1 = _mm256_loadu_ps((
float*)inputVectorPtr);
/* NOTE(review): as listed, both loads read the same address. The numbered
 * lines around this load (357, 359-361) are missing and presumably advance
 * inputVectorPtr -- confirm against the original file. */
358 inputVal2 = _mm256_loadu_ps((
float*)inputVectorPtr);
/* Clamp: min against the upper limit, then max against the lower limit. */
363 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
364 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
366 intInputVal1 = _mm256_cvtps_epi32(ret1);
367 intInputVal2 = _mm256_cvtps_epi32(ret2);
/* Pack the two 8 x int32 vectors into one 16 x int16 vector. The pack works
 * within each 128-bit half, so the 0xd8 permute (64-bit element order
 * 0, 2, 1, 3) is presumably what restores source order -- see the Intel
 * intrinsics documentation. */
369 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
370 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
372 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
373 outputVectorPtr += 16;
/* Scalar tail for the values the vector loop did not cover.
 * NOTE(review): only the lower-bound test survives in this listing; the
 * upper-bound test and both clamp assignments are on missing lines and
 * presumably limit aux to [min_val, max_val] -- confirm. */
376 for (
i = avx_iters * 16;
i < num_points * 2;
i++) {
377 aux = *inputVectorPtr++;
380 else if (aux < min_val)
382 *outputVectorPtr++ = (int16_t)
rintf(aux);
389#include <emmintrin.h>
/*
 * SSE2 float -> int16_t conversion kernel using unaligned loads and stores
 * (_mm_loadu_ps / _mm_storeu_si128).
 *
 * NOTE(review): the opening lines of this function are not in the listing.
 * The cross-reference index at the end of the listing places
 * volk_32fc_convert_16ic_u_sse2 at line 391, so this is presumably that
 * function -- confirm. The declarations of `i`, `aux`, `ret1`, `ret2`, the
 * braces, and parts of the tail loop are missing as well.
 */
393 unsigned int num_points)
/* One vector iteration covers 4 points = 8 floats (the tail loop below
 * starts at sse_iters * 8 and runs to num_points * 2). */
395 const unsigned int sse_iters = num_points / 4;
397 float* inputVectorPtr = (
float*)inputVector;
398 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of the int16_t output, as floats. */
401 const float min_val = (float)SHRT_MIN;
402 const float max_val = (float)SHRT_MAX;
404 __m128 inputVal1, inputVal2;
405 __m128i intInputVal1, intInputVal2;
/* Limits broadcast to all four lanes for the vector clamp. */
407 const __m128 vmin_val = _mm_set_ps1(min_val);
408 const __m128 vmax_val = _mm_set_ps1(max_val);
411 for (
i = 0;
i < sse_iters;
i++) {
412 inputVal1 = _mm_loadu_ps((
float*)inputVectorPtr);
/* NOTE(review): as listed, both loads read the same address. The numbered
 * lines around this load (413, 415-417) are missing and presumably advance
 * inputVectorPtr -- confirm against the original file. */
414 inputVal2 = _mm_loadu_ps((
float*)inputVectorPtr);
/* Clamp: min against the upper limit, then max against the lower limit. */
419 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
420 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
422 intInputVal1 = _mm_cvtps_epi32(ret1);
423 intInputVal2 = _mm_cvtps_epi32(ret2);
/* Pack the two 4 x int32 vectors into one 8 x int16 vector. */
425 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
427 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
428 outputVectorPtr += 8;
/* Scalar tail for the values the vector loop did not cover.
 * NOTE(review): only the lower-bound test survives in this listing; the
 * upper-bound test and both clamp assignments are on missing lines and
 * presumably limit aux to [min_val, max_val] -- confirm. */
431 for (
i = sse_iters * 8;
i < num_points * 2;
i++) {
432 aux = *inputVectorPtr++;
435 else if (aux < min_val)
437 *outputVectorPtr++ = (int16_t)
rintf(aux);
static float rintf(float x)
Definition: config.h:37
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:111
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:391
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:301
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
float complex lv_32fc_t
Definition: volk_complex.h:65
short complex lv_16sc_t
Definition: volk_complex.h:62
for i
Definition: volk_config_fixed.tmpl.h:25