Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_64u_byteswap.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
66#ifndef INCLUDED_volk_64u_byteswap_u_H
67#define INCLUDED_volk_64u_byteswap_u_H
68
69#include <inttypes.h>
70#include <stdio.h>
71
72#ifdef LV_HAVE_SSE2
73#include <emmintrin.h>
74
/*
 * Reverses the byte order of each of the num_points 64-bit values stored at
 * intsToSwap, in place.
 *
 * Two values are handled per iteration with unaligned SSE2 loads/stores
 * (_mm_loadu_si128 / _mm_storeu_si128); an odd trailing value is handled with
 * scalar code.
 */
static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    const __m128i keepByte2 = _mm_set1_epi32(0x00FF0000);
    const __m128i keepByte1 = _mm_set1_epi32(0x0000FF00);
    const unsigned int vectorIters = num_points / 2;
    unsigned int iter;

    for (iter = 0; iter < vectorIters; ++iter) {
        const __m128i in = _mm_loadu_si128((__m128i*)wordPtr);

        /* Outer bytes of every 32-bit lane trade places through full shifts. */
        const __m128i outer =
            _mm_or_si128(_mm_slli_epi32(in, 24), _mm_srli_epi32(in, 24));
        /* Inner bytes move by one position and are masked into place. */
        const __m128i innerUp = _mm_and_si128(_mm_slli_epi32(in, 8), keepByte2);
        const __m128i innerDown = _mm_and_si128(_mm_srli_epi32(in, 8), keepByte1);
        const __m128i lanesSwapped =
            _mm_or_si128(_mm_or_si128(outer, innerUp), innerDown);

        /* Exchange the two 32-bit lanes that make up each 64-bit value. */
        _mm_storeu_si128((__m128i*)wordPtr,
                         _mm_shuffle_epi32(lanesSwapped, _MM_SHUFFLE(2, 3, 0, 1)));
        wordPtr += 4;
    }

    /* The vector loop covers an even count, so at most one value is left. */
    if (num_points % 2 != 0) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
    }
}
123#endif /* LV_HAVE_SSE2 */
124
125
126#ifdef LV_HAVE_GENERIC
127
/*
 * Portable in-place byte reversal of num_points 64-bit values at intsToSwap.
 *
 * Each value is treated as two consecutive 32-bit words: the bytes inside
 * each word are reversed and the two words are written back in exchanged
 * positions.
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    unsigned int remaining = num_points;

    while (remaining > 0) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));

        wordPtr += 2;
        --remaining;
    }
}
147#endif /* LV_HAVE_GENERIC */
148
149#if LV_HAVE_AVX2
150#include <immintrin.h>
/*
 * Reverses the byte order of each of the num_points 64-bit values stored at
 * intsToSwap, in place.
 *
 * Four values are handled per iteration with one byte shuffle. Loads and
 * stores use the aligned forms (_mm256_load_si256 / _mm256_store_si256), so
 * intsToSwap has to satisfy their 32-byte alignment requirement. Up to three
 * trailing values are handled with scalar code.
 */
static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    /* Destination byte i of every 64-bit value is read from byte 7 - i of it. */
    const __m256i reverseLanes = _mm256_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0,
                                                  15, 14, 13, 12, 11, 10, 9, 8,
                                                  23, 22, 21, 20, 19, 18, 17, 16,
                                                  31, 30, 29, 28, 27, 26, 25, 24);
    const unsigned int vectorIters = num_points / 4;
    unsigned int leftover = num_points % 4;
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    unsigned int iter;

    for (iter = 0; iter < vectorIters; ++iter) {
        __m256i* const slot = (__m256i*)wordPtr;

        _mm256_store_si256(slot,
                           _mm256_shuffle_epi8(_mm256_load_si256(slot), reverseLanes));

        /* Four 64-bit values are eight 32-bit words. */
        wordPtr += 8;
    }

    /* Scalar path for the values that do not fill a whole vector. */
    while (leftover > 0) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0x000000ff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0x000000ff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));

        wordPtr += 2;
        --leftover;
    }
}
194
195#endif /* LV_HAVE_AVX2 */
196
197
198#if LV_HAVE_SSSE3
199#include <tmmintrin.h>
/*
 * Reverses the byte order of each of the num_points 64-bit values stored at
 * intsToSwap, in place.
 *
 * Two values are handled per iteration with one byte shuffle. Loads and
 * stores use the aligned forms (_mm_load_si128 / _mm_store_si128), so
 * intsToSwap has to satisfy their 16-byte alignment requirement. An odd
 * trailing value is handled with scalar code.
 */
static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    /* Destination byte i of every 64-bit value is read from byte 7 - i of it. */
    const __m128i reverseLanes =
        _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
    const unsigned int vectorIters = num_points / 2;
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    unsigned int iter;

    for (iter = 0; iter < vectorIters; ++iter) {
        __m128i* const slot = (__m128i*)wordPtr;

        _mm_store_si128(slot, _mm_shuffle_epi8(_mm_load_si128(slot), reverseLanes));

        /* Two 64-bit values are four 32-bit words. */
        wordPtr += 4;
    }

    /* The vector loop covers an even count, so at most one value is left. */
    if (num_points % 2 != 0) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0x000000ff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0x000000ff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
    }
}
242#endif /* LV_HAVE_SSSE3 */
243
244
245#ifdef LV_HAVE_NEONV8
246#include <arm_neon.h>
247
/*
 * Reverses the byte order of each of the num_points 64-bit values stored at
 * intsToSwap, in place, using AArch64 NEON table lookups.
 *
 * Four values (32 bytes) are handled per iteration as two 16-byte registers;
 * up to three trailing values are handled with scalar code.
 */
static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
{
    uint32_t* inputPtr = (uint32_t*)intsToSwap;
    const unsigned int n4points = num_points / 4;
    /* Per 16-byte register: reverse bytes 0-7 and, separately, bytes 8-15. */
    const uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
    uint8x16_t first, second;

    unsigned int number = 0;
    for (number = 0; number < n4points; ++number) {
        __VOLK_PREFETCH(inputPtr + 8);

        /*
         * Plain loads keep the eight bytes of every 64-bit value contiguous
         * inside one register, which is what idx expects. A de-interleaving
         * load (vld2q_u8) would split each value's even and odd bytes across
         * two registers and the lookup would no longer reverse whole values.
         */
        first = vld1q_u8((uint8_t*)inputPtr);
        second = vld1q_u8((uint8_t*)(inputPtr + 4));

        first = vqtbl1q_u8(first, idx);
        second = vqtbl1q_u8(second, idx);

        vst1q_u8((uint8_t*)inputPtr, first);
        vst1q_u8((uint8_t*)(inputPtr + 4), second);

        /* Four 64-bit values are eight 32-bit words. */
        inputPtr += 8;
    }

    /* Scalar path for the values that do not fill a whole iteration. */
    for (number = n4points * 4; number < num_points; ++number) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];

        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

        /* Write the reversed words back in exchanged positions. */
        *inputPtr++ = output2;
        *inputPtr++ = output1;
    }
}
279#else
280#ifdef LV_HAVE_NEON
281#include <arm_neon.h>
282
/*
 * Reverses the byte order of each of the num_points 64-bit values stored at
 * intsToSwap, in place, using NEON table lookups.
 *
 * Four values (32 bytes) are handled per iteration; up to three trailing
 * values are handled with scalar code.
 */
static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
{
    uint32_t* inputPtr = (uint32_t*)intsToSwap;
    unsigned int number = 0;
    const unsigned int n4points = num_points / 4;

    uint8x8x4_t input_table;
    uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
    uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;

    /* these magic numbers are used as byte-indices in the LUT.
       they are pre-computed to save time. A simple C program
       can calculate them; for example for lookup01:
      uint8_t chars[8] = {25, 17, 9, 1, 24, 16, 8, 0};
      for(ii=0; ii < 8; ++ii) {
          index += ((uint64_t)(*(chars+ii))) << (ii*8);
      }
      vld4_u8 de-interleaves, so table entry 8*k + i holds input byte 4*i + k;
      the indices above therefore select input bytes 7,6,5,4,3,2,1,0.
      Each following constant adds 2 to every index, i.e. moves on by one
      64-bit value.
    */
    int_lookup01 = vcreate_u8(2269495096316185);
    int_lookup23 = vcreate_u8(146949840772469531);
    int_lookup45 = vcreate_u8(291630186448622877);
    int_lookup67 = vcreate_u8(436310532124776223);

    for (number = 0; number < n4points; ++number) {
        input_table = vld4_u8((uint8_t*)inputPtr);
        swapped_int01 = vtbl4_u8(input_table, int_lookup01);
        swapped_int23 = vtbl4_u8(input_table, int_lookup23);
        swapped_int45 = vtbl4_u8(input_table, int_lookup45);
        swapped_int67 = vtbl4_u8(input_table, int_lookup67);
        vst1_u8((uint8_t*)inputPtr, swapped_int01);
        vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
        vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);

        /*
         * The iteration consumed and rewrote 32 bytes, i.e. eight 32-bit
         * words. Advancing by less would re-swap values that are already
         * done and leave the end of the buffer untouched.
         */
        inputPtr += 8;
    }

    /* Scalar path for the values that do not fill a whole iteration. */
    for (number = n4points * 4; number < num_points; ++number) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];

        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

        /* Write the reversed words back in exchanged positions. */
        *inputPtr++ = output2;
        *inputPtr++ = output1;
    }
}
333#endif /* LV_HAVE_NEON */
334#endif
335
336#endif /* INCLUDED_volk_64u_byteswap_u_H */
337#ifndef INCLUDED_volk_64u_byteswap_a_H
338#define INCLUDED_volk_64u_byteswap_a_H
339
340#include <inttypes.h>
341#include <stdio.h>
342
343
344#ifdef LV_HAVE_SSE2
345#include <emmintrin.h>
346
/*
 * Reverses the byte order of each of the num_points 64-bit values stored at
 * intsToSwap, in place.
 *
 * Two values are handled per iteration. Loads and stores use the aligned
 * forms (_mm_load_si128 / _mm_store_si128), so intsToSwap has to satisfy
 * their 16-byte alignment requirement. An odd trailing value is handled with
 * scalar code.
 */
static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    const __m128i keepByte2 = _mm_set1_epi32(0x00FF0000);
    const __m128i keepByte1 = _mm_set1_epi32(0x0000FF00);
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    unsigned int pairsLeft = num_points / 2;

    while (pairsLeft > 0) {
        __m128i* const slot = (__m128i*)wordPtr;
        const __m128i in = _mm_load_si128(slot);

        /* Reverse the four bytes inside every 32-bit lane. */
        __m128i swapped = _mm_or_si128(_mm_slli_epi32(in, 24), _mm_srli_epi32(in, 24));
        swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_slli_epi32(in, 8), keepByte2));
        swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi32(in, 8), keepByte1));

        /* Exchange the two 32-bit lanes that make up each 64-bit value. */
        swapped = _mm_shuffle_epi32(swapped, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_store_si128(slot, swapped);
        wordPtr += 4;
        --pairsLeft;
    }

    /* The vector loop covers an even count, so at most one value is left. */
    if (num_points % 2 != 0) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
    }
}
395#endif /* LV_HAVE_SSE2 */
396
397#if LV_HAVE_AVX2
398#include <immintrin.h>
/*
 * Reverses the byte order of each of the num_points 64-bit values stored at
 * intsToSwap, in place.
 *
 * Four values are handled per iteration with one byte shuffle, using
 * unaligned loads/stores (_mm256_loadu_si256 / _mm256_storeu_si256). Up to
 * three trailing values are handled with scalar code.
 */
static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    /* Destination byte i of every 64-bit value is read from byte 7 - i of it. */
    const __m256i reverseLanes = _mm256_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0,
                                                  15, 14, 13, 12, 11, 10, 9, 8,
                                                  23, 22, 21, 20, 19, 18, 17, 16,
                                                  31, 30, 29, 28, 27, 26, 25, 24);
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    unsigned int setsLeft = num_points / 4;
    unsigned int leftover = num_points % 4;

    while (setsLeft > 0) {
        __m256i* const slot = (__m256i*)wordPtr;
        const __m256i in = _mm256_loadu_si256(slot);

        _mm256_storeu_si256(slot, _mm256_shuffle_epi8(in, reverseLanes));

        /* Four 64-bit values are eight 32-bit words. */
        wordPtr += 8;
        --setsLeft;
    }

    /* Scalar path for the values that do not fill a whole vector. */
    for (; leftover > 0; --leftover) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0x000000ff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0x000000ff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
        wordPtr += 2;
    }
}
441
442#endif /* LV_HAVE_AVX2 */
443
444
445#if LV_HAVE_SSSE3
446#include <tmmintrin.h>
/*
 * Reverses the byte order of each of the num_points 64-bit values stored at
 * intsToSwap, in place.
 *
 * Two values are handled per iteration with one byte shuffle, using
 * unaligned loads/stores (_mm_loadu_si128 / _mm_storeu_si128). An odd
 * trailing value is handled with scalar code.
 */
static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    /* Destination byte i of every 64-bit value is read from byte 7 - i of it. */
    const __m128i reverseLanes =
        _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    unsigned int pairsLeft = num_points / 2;

    while (pairsLeft > 0) {
        __m128i* const slot = (__m128i*)wordPtr;
        const __m128i in = _mm_loadu_si128(slot);

        _mm_storeu_si128(slot, _mm_shuffle_epi8(in, reverseLanes));

        /* Two 64-bit values are four 32-bit words. */
        wordPtr += 4;
        --pairsLeft;
    }

    /* The vector loop covers an even count, so at most one value is left. */
    if (num_points % 2 != 0) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0x000000ff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0x000000ff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
    }
}
488#endif /* LV_HAVE_SSSE3 */
489
490#ifdef LV_HAVE_GENERIC
491
/*
 * Portable in-place byte reversal of num_points 64-bit values at intsToSwap.
 *
 * Every value is processed as a pair of 32-bit words: each word has its four
 * bytes reversed, then the pair is stored back with the words exchanged.
 */
static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
                                               unsigned int num_points)
{
    uint32_t* const words = (uint32_t*)intsToSwap;
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        uint32_t* const pair = words + 2 * (uint64_t)idx;
        const uint32_t first = pair[0];
        const uint32_t second = pair[1];

        const uint32_t firstReversed =
            (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) |
             ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
        const uint32_t secondReversed =
            (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) |
             ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));

        pair[0] = secondReversed;
        pair[1] = firstReversed;
    }
}
511#endif /* LV_HAVE_GENERIC */
512
513
514#endif /* INCLUDED_volk_64u_byteswap_a_H */
static void volk_64u_byteswap_a_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:492
static void volk_64u_byteswap_a_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:200
static void volk_64u_byteswap_a_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:347
static void volk_64u_byteswap_u_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:447
static void volk_64u_byteswap_u_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:75
static void volk_64u_byteswap_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:128
static void volk_64u_byteswap_neon(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:283
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62