Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32f_s32f_convert_8i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
73#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
74#define INCLUDED_volk_32f_s32f_convert_8i_u_H
75
76#include <inttypes.h>
77#include <stdio.h>
78
79static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
80{
81 float min_val = INT8_MIN;
82 float max_val = INT8_MAX;
83 if (in > max_val) {
84 *out = (int8_t)(max_val);
85 } else if (in < min_val) {
86 *out = (int8_t)(min_val);
87 } else {
88 *out = (int8_t)(rintf(in));
89 }
90}
91
92#ifdef LV_HAVE_AVX2
93#include <immintrin.h>
94
95static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
96 const float* inputVector,
97 const float scalar,
98 unsigned int num_points)
99{
100 unsigned int number = 0;
101
102 const unsigned int thirtysecondPoints = num_points / 32;
103
104 const float* inputVectorPtr = (const float*)inputVector;
105 int8_t* outputVectorPtr = outputVector;
106
107 float min_val = INT8_MIN;
108 float max_val = INT8_MAX;
109 float r;
110
111 __m256 vScalar = _mm256_set1_ps(scalar);
112 __m256 inputVal1, inputVal2, inputVal3, inputVal4;
113 __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
114 __m256 vmin_val = _mm256_set1_ps(min_val);
115 __m256 vmax_val = _mm256_set1_ps(max_val);
116 __m256i intInputVal;
117
118 for (; number < thirtysecondPoints; number++) {
119 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
120 inputVectorPtr += 8;
121 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
122 inputVectorPtr += 8;
123 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
124 inputVectorPtr += 8;
125 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
126 inputVectorPtr += 8;
127
128 inputVal1 = _mm256_max_ps(
129 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
130 inputVal2 = _mm256_max_ps(
131 _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
132 inputVal3 = _mm256_max_ps(
133 _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
134 inputVal4 = _mm256_max_ps(
135 _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
136
137 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
138 intInputVal2 = _mm256_cvtps_epi32(inputVal2);
139 intInputVal3 = _mm256_cvtps_epi32(inputVal3);
140 intInputVal4 = _mm256_cvtps_epi32(inputVal4);
141
142 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
143 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
144 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
145 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
146
147 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
148 intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
149
150 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
151 outputVectorPtr += 32;
152 }
153
154 number = thirtysecondPoints * 32;
155 for (; number < num_points; number++) {
156 r = inputVector[number] * scalar;
157 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
158 }
159}
160
161#endif /* LV_HAVE_AVX2 */
162
163
164#ifdef LV_HAVE_SSE2
165#include <emmintrin.h>
166
167static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
168 const float* inputVector,
169 const float scalar,
170 unsigned int num_points)
171{
172 unsigned int number = 0;
173
174 const unsigned int sixteenthPoints = num_points / 16;
175
176 const float* inputVectorPtr = (const float*)inputVector;
177 int8_t* outputVectorPtr = outputVector;
178
179 float min_val = INT8_MIN;
180 float max_val = INT8_MAX;
181 float r;
182
183 __m128 vScalar = _mm_set_ps1(scalar);
184 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
185 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
186 __m128 vmin_val = _mm_set_ps1(min_val);
187 __m128 vmax_val = _mm_set_ps1(max_val);
188
189 for (; number < sixteenthPoints; number++) {
190 inputVal1 = _mm_loadu_ps(inputVectorPtr);
191 inputVectorPtr += 4;
192 inputVal2 = _mm_loadu_ps(inputVectorPtr);
193 inputVectorPtr += 4;
194 inputVal3 = _mm_loadu_ps(inputVectorPtr);
195 inputVectorPtr += 4;
196 inputVal4 = _mm_loadu_ps(inputVectorPtr);
197 inputVectorPtr += 4;
198
199 inputVal1 =
200 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
201 inputVal2 =
202 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
203 inputVal3 =
204 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
205 inputVal4 =
206 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
207
208 intInputVal1 = _mm_cvtps_epi32(inputVal1);
209 intInputVal2 = _mm_cvtps_epi32(inputVal2);
210 intInputVal3 = _mm_cvtps_epi32(inputVal3);
211 intInputVal4 = _mm_cvtps_epi32(inputVal4);
212
213 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
214 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
215
216 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
217
218 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
219 outputVectorPtr += 16;
220 }
221
222 number = sixteenthPoints * 16;
223 for (; number < num_points; number++) {
224 r = inputVector[number] * scalar;
225 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
226 }
227}
228
229#endif /* LV_HAVE_SSE2 */
230
231
232#ifdef LV_HAVE_SSE
233#include <xmmintrin.h>
234
235static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
236 const float* inputVector,
237 const float scalar,
238 unsigned int num_points)
239{
240 unsigned int number = 0;
241 size_t inner_loop;
242
243 const unsigned int quarterPoints = num_points / 4;
244
245 const float* inputVectorPtr = (const float*)inputVector;
246 int8_t* outputVectorPtr = outputVector;
247
248 float min_val = INT8_MIN;
249 float max_val = INT8_MAX;
250 float r;
251
252 __m128 vScalar = _mm_set_ps1(scalar);
253 __m128 ret;
254 __m128 vmin_val = _mm_set_ps1(min_val);
255 __m128 vmax_val = _mm_set_ps1(max_val);
256
257 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
258
259 for (; number < quarterPoints; number++) {
260 ret = _mm_loadu_ps(inputVectorPtr);
261 inputVectorPtr += 4;
262
263 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
264
265 _mm_store_ps(outputFloatBuffer, ret);
266 for (inner_loop = 0; inner_loop < 4; inner_loop++) {
267 *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
268 }
269 }
270
271 number = quarterPoints * 4;
272 for (; number < num_points; number++) {
273 r = inputVector[number] * scalar;
274 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
275 }
276}
277
278#endif /* LV_HAVE_SSE */
279
280
281#ifdef LV_HAVE_GENERIC
282
/*
 * Portable reference kernel: multiply each input float by `scalar` and
 * convert it to int8_t via volk_32f_s32f_convert_8i_single().
 *
 * outputVector: destination; num_points int8_t values are written.
 * inputVector:  source; num_points floats are read.
 * scalar:       factor applied to every input before conversion.
 * num_points:   number of samples to convert.
 */
static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(outputVector + idx, scaled);
    }
}
297
298#endif /* LV_HAVE_GENERIC */
299
300
301#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
302#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
303#define INCLUDED_volk_32f_s32f_convert_8i_a_H
304
305#include <inttypes.h>
306#include <stdio.h>
307#include <volk/volk_common.h>
308
309#ifdef LV_HAVE_AVX2
310#include <immintrin.h>
311
312static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
313 const float* inputVector,
314 const float scalar,
315 unsigned int num_points)
316{
317 unsigned int number = 0;
318
319 const unsigned int thirtysecondPoints = num_points / 32;
320
321 const float* inputVectorPtr = (const float*)inputVector;
322 int8_t* outputVectorPtr = outputVector;
323
324 float min_val = INT8_MIN;
325 float max_val = INT8_MAX;
326 float r;
327
328 __m256 vScalar = _mm256_set1_ps(scalar);
329 __m256 inputVal1, inputVal2, inputVal3, inputVal4;
330 __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
331 __m256 vmin_val = _mm256_set1_ps(min_val);
332 __m256 vmax_val = _mm256_set1_ps(max_val);
333 __m256i intInputVal;
334
335 for (; number < thirtysecondPoints; number++) {
336 inputVal1 = _mm256_load_ps(inputVectorPtr);
337 inputVectorPtr += 8;
338 inputVal2 = _mm256_load_ps(inputVectorPtr);
339 inputVectorPtr += 8;
340 inputVal3 = _mm256_load_ps(inputVectorPtr);
341 inputVectorPtr += 8;
342 inputVal4 = _mm256_load_ps(inputVectorPtr);
343 inputVectorPtr += 8;
344
345 inputVal1 = _mm256_max_ps(
346 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
347 inputVal2 = _mm256_max_ps(
348 _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
349 inputVal3 = _mm256_max_ps(
350 _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
351 inputVal4 = _mm256_max_ps(
352 _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
353
354 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
355 intInputVal2 = _mm256_cvtps_epi32(inputVal2);
356 intInputVal3 = _mm256_cvtps_epi32(inputVal3);
357 intInputVal4 = _mm256_cvtps_epi32(inputVal4);
358
359 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
360 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
361 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
362 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
363
364 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
365 intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
366
367 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
368 outputVectorPtr += 32;
369 }
370
371 number = thirtysecondPoints * 32;
372 for (; number < num_points; number++) {
373 r = inputVector[number] * scalar;
374 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
375 }
376}
377
378#endif /* LV_HAVE_AVX2 */
379
380
381#ifdef LV_HAVE_SSE2
382#include <emmintrin.h>
383
384static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
385 const float* inputVector,
386 const float scalar,
387 unsigned int num_points)
388{
389 unsigned int number = 0;
390
391 const unsigned int sixteenthPoints = num_points / 16;
392
393 const float* inputVectorPtr = (const float*)inputVector;
394 int8_t* outputVectorPtr = outputVector;
395
396 float min_val = INT8_MIN;
397 float max_val = INT8_MAX;
398 float r;
399
400 __m128 vScalar = _mm_set_ps1(scalar);
401 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
402 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
403 __m128 vmin_val = _mm_set_ps1(min_val);
404 __m128 vmax_val = _mm_set_ps1(max_val);
405
406 for (; number < sixteenthPoints; number++) {
407 inputVal1 = _mm_load_ps(inputVectorPtr);
408 inputVectorPtr += 4;
409 inputVal2 = _mm_load_ps(inputVectorPtr);
410 inputVectorPtr += 4;
411 inputVal3 = _mm_load_ps(inputVectorPtr);
412 inputVectorPtr += 4;
413 inputVal4 = _mm_load_ps(inputVectorPtr);
414 inputVectorPtr += 4;
415
416 inputVal1 =
417 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
418 inputVal2 =
419 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
420 inputVal3 =
421 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
422 inputVal4 =
423 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
424
425 intInputVal1 = _mm_cvtps_epi32(inputVal1);
426 intInputVal2 = _mm_cvtps_epi32(inputVal2);
427 intInputVal3 = _mm_cvtps_epi32(inputVal3);
428 intInputVal4 = _mm_cvtps_epi32(inputVal4);
429
430 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
431 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
432
433 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
434
435 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
436 outputVectorPtr += 16;
437 }
438
439 number = sixteenthPoints * 16;
440 for (; number < num_points; number++) {
441 r = inputVector[number] * scalar;
442 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
443 }
444}
445#endif /* LV_HAVE_SSE2 */
446
447
448#ifdef LV_HAVE_SSE
449#include <xmmintrin.h>
450
451static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
452 const float* inputVector,
453 const float scalar,
454 unsigned int num_points)
455{
456 unsigned int number = 0;
457 size_t inner_loop;
458
459 const unsigned int quarterPoints = num_points / 4;
460
461 const float* inputVectorPtr = (const float*)inputVector;
462
463 float min_val = INT8_MIN;
464 float max_val = INT8_MAX;
465 float r;
466
467 int8_t* outputVectorPtr = outputVector;
468 __m128 vScalar = _mm_set_ps1(scalar);
469 __m128 ret;
470 __m128 vmin_val = _mm_set_ps1(min_val);
471 __m128 vmax_val = _mm_set_ps1(max_val);
472
473 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
474
475 for (; number < quarterPoints; number++) {
476 ret = _mm_load_ps(inputVectorPtr);
477 inputVectorPtr += 4;
478
479 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
480
481 _mm_store_ps(outputFloatBuffer, ret);
482 for (inner_loop = 0; inner_loop < 4; inner_loop++) {
483 *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
484 }
485 }
486
487 number = quarterPoints * 4;
488 for (; number < num_points; number++) {
489 r = inputVector[number] * scalar;
490 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
491 }
492}
493
494#endif /* LV_HAVE_SSE */
495
496
497#ifdef LV_HAVE_GENERIC
498
/*
 * Portable reference kernel (aligned-protocol entry point): multiply each
 * input float by `scalar` and convert it to int8_t via
 * volk_32f_s32f_convert_8i_single(). No alignment-sensitive access is made.
 *
 * outputVector: destination; num_points int8_t values are written.
 * inputVector:  source; num_points floats are read.
 * scalar:       factor applied to every input before conversion.
 * num_points:   number of samples to convert.
 */
static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(outputVector + idx, scaled);
    }
}
513
514#endif /* LV_HAVE_GENERIC */
515
516
517#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
static float rintf(float x)
Definition: config.h:37
static void volk_32f_s32f_convert_8i_a_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:384
static void volk_32f_s32f_convert_8i_single(int8_t *out, const float in)
Definition: volk_32f_s32f_convert_8i.h:79
static void volk_32f_s32f_convert_8i_u_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:235
static void volk_32f_s32f_convert_8i_a_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:499
static void volk_32f_s32f_convert_8i_a_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:451
static void volk_32f_s32f_convert_8i_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:283
static void volk_32f_s32f_convert_8i_u_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:167
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56