Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
volk_32f_binary_slicer_8i.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

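/*
 * volk_32f_binary_slicer_8i: slices a buffer of floats into hard decisions,
 * writing 1 for every input >= 0 and 0 for every negative input.
 *
 * Minimal usage sketch (illustrative only; assumes the standard VOLK
 * dispatcher volk_32f_binary_slicer_8i() and the volk_malloc()/volk_free()
 * helpers from <volk/volk.h>; buffer names are hypothetical):
 *
 *   unsigned int num_points = 64;
 *   float* soft = (float*)volk_malloc(sizeof(float) * num_points, volk_get_alignment());
 *   int8_t* bits = (int8_t*)volk_malloc(sizeof(int8_t) * num_points, volk_get_alignment());
 *   // ... fill 'soft' with soft symbols ...
 *   volk_32f_binary_slicer_8i(bits, soft, num_points); // bits[i] = (soft[i] >= 0) ? 1 : 0
 *   volk_free(soft);
 *   volk_free(bits);
 */
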
#ifndef INCLUDED_volk_32f_binary_slicer_8i_H
#define INCLUDED_volk_32f_binary_slicer_8i_H


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_GENERIC

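// Branchless variant: stores the comparison result directly instead of
// branching on it, which may help on inputs whose sign changes unpredictably.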
static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
                                                                const float* aVector,
                                                                unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0);
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_load_ps(aPtr);
        a1_val = _mm256_load_ps(aPtr + 8);
        a2_val = _mm256_load_ps(aPtr + 16);
        a3_val = _mm256_load_ps(aPtr + 24);

        // compare >= 0; return float
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to 32i and >> 31
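        // (an all-ones compare mask is a NaN bit pattern, which
        // _mm256_cvtps_epi32 converts to 0x80000000; the logical shift by 31
        // then leaves exactly 1, while an all-zeros mask stays 0)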
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        // pack into 8-bit results
        // res0: (after packs_epi32)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // res2:
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        // shuffle the lanes
        // res0: (after packs_epi16)
        // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
        // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
        // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // shuffle bytes within lanes
        // res0: (after shuffle_epi8)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

        _mm256_store_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    for (number = n32points * 32; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

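// Same algorithm as the aligned AVX2 kernel above, but using unaligned loads
// and stores, so the buffers need not be 32-byte aligned.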
static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_loadu_ps(aPtr);
        a1_val = _mm256_loadu_ps(aPtr + 8);
        a2_val = _mm256_loadu_ps(aPtr + 16);
        a3_val = _mm256_loadu_ps(aPtr + 24);

        // compare >= 0; return float
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to 32i and >> 31
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        // pack into 8-bit results
        // res0: (after packs_epi32)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // res2:
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        // shuffle the lanes
        // res0: (after packs_epi16)
        // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
        // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
        // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // shuffle bytes within lanes
        // res0: (after shuffle_epi8)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

        _mm256_storeu_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    for (number = n32points * 32; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    unsigned int n16points = num_points / 16;
    __m128 a0_val, a1_val, a2_val, a3_val;
    __m128 res0_f, res1_f, res2_f, res3_f;
    __m128i res0_i, res1_i, res2_i, res3_i;
    __m128 zero_val;
    zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < n16points; number++) {
        a0_val = _mm_load_ps(aPtr);
        a1_val = _mm_load_ps(aPtr + 4);
        a2_val = _mm_load_ps(aPtr + 8);
        a3_val = _mm_load_ps(aPtr + 12);

        // compare >= 0; return float
        res0_f = _mm_cmpge_ps(a0_val, zero_val);
        res1_f = _mm_cmpge_ps(a1_val, zero_val);
        res2_f = _mm_cmpge_ps(a2_val, zero_val);
        res3_f = _mm_cmpge_ps(a3_val, zero_val);

        // convert to 32i and >> 31
        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm_packs_epi32(res0_i, res1_i);
        res2_i = _mm_packs_epi32(res2_i, res3_i);

        // pack into 8-bit results
        res0_i = _mm_packs_epi16(res0_i, res2_i);

        _mm_store_si128((__m128i*)cPtr, res0_i);

        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

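// Unaligned-pointer variant of the SSE2 kernel above (loadu/storeu in place
// of the aligned load/store).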
static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    unsigned int n16points = num_points / 16;
    __m128 a0_val, a1_val, a2_val, a3_val;
    __m128 res0_f, res1_f, res2_f, res3_f;
    __m128i res0_i, res1_i, res2_i, res3_i;
    __m128 zero_val;
    zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < n16points; number++) {
        a0_val = _mm_loadu_ps(aPtr);
        a1_val = _mm_loadu_ps(aPtr + 4);
        a2_val = _mm_loadu_ps(aPtr + 8);
        a3_val = _mm_loadu_ps(aPtr + 12);

        // compare >= 0; return float
        res0_f = _mm_cmpge_ps(a0_val, zero_val);
        res1_f = _mm_cmpge_ps(a1_val, zero_val);
        res2_f = _mm_cmpge_ps(a2_val, zero_val);
        res3_f = _mm_cmpge_ps(a3_val, zero_val);

        // convert to 32i and >> 31
        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm_packs_epi32(res0_i, res1_i);
        res2_i = _mm_packs_epi32(res2_i, res3_i);

        // pack into 8-bit results
        res0_i = _mm_packs_epi16(res0_i, res2_i);

        _mm_storeu_si128((__m128i*)cPtr, res0_i);

        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n16points = num_points / 16;

    float32x4x2_t input_val0, input_val1;
    float32x4_t zero_val;
    uint32x4x2_t res0_u32, res1_u32;
    uint16x4x2_t res0_u16x4, res1_u16x4;
    uint16x8x2_t res_u16x8;
    uint8x8x2_t res_u8;
    uint8x8_t one;

    zero_val = vdupq_n_f32(0.0);
    one = vdup_n_u8(0x01);

    // TODO: this is a good candidate for asm because the vcombines
    // can be eliminated simply by picking dst registers that are
    // adjacent.
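    // Note: vld2q_f32 de-interleaves even- and odd-indexed samples into
    // val[0]/val[1]; both halves are sliced identically and vst2_u8
    // re-interleaves them on store, so the output ordering matches the input.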
    for (number = 0; number < n16points; number++) {
        input_val0 = vld2q_f32(aPtr);
        input_val1 = vld2q_f32(aPtr + 8);

        // test against 0; return uint32
        res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
        res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
        res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
        res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);

        // narrow uint32 -> uint16, followed by combine to 8-element vectors
        res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
        res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
        res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
        res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);

        res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
        res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);

        // narrow uint16x8 -> uint8x8
        res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
        res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
        // we *could* load twice as much data and do another vcombine here to
        // get a uint8x16x2 vector and still only do 2 vandqs and a single
        // store, but that turns out to be ~16% slower than this version on
        // zc702. It's possible that register contention in the GCC scheduler
        // slows it down and that a hand-written asm version with quad-word u8
        // registers is much faster.

        res_u8.val[0] = vand_u8(one, res_u8.val[0]);
        res_u8.val[1] = vand_u8(one, res_u8.val[1]);

        vst2_u8((unsigned char*)cPtr, res_u8);
        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_NEON */


#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */