Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32u_byteswap.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
66#ifndef INCLUDED_volk_32u_byteswap_u_H
67#define INCLUDED_volk_32u_byteswap_u_H
68
69#include <inttypes.h>
70#include <stdio.h>
71
72#if LV_HAVE_AVX2
73#include <immintrin.h>
/*
 * Reverse the byte order of every 32-bit word in a buffer, in place, using
 * AVX2 with unaligned loads and stores.
 *
 * intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *             byte-swapped value. No alignment is required.
 * num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points)
{
    unsigned int number;

    const unsigned int nPerSet = 8; /* 32-bit words per 256-bit register */
    const unsigned int nSets = num_points / nPerSet;

    uint32_t* inputPtr = intsToSwap;

    /* Byte-selection table: reverses the four bytes of each 32-bit word.
     * _mm256_shuffle_epi8 selects bytes independently inside each 128-bit
     * half and only looks at the low four bits of each control byte, so the
     * entries 16..31 behave as 0..15 within the upper half. */
    const uint8_t shuffleVector[32] = { 3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
                                        8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
                                        21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };

    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)shuffleVector);

    for (number = 0; number < nSets; number++) {
        /* Load eight 32-bit values; inputPtr is advanced only after the
         * store because the swap is done in place. */
        const __m256i input = _mm256_loadu_si256((const __m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        /* Store the results over the words that were just read. */
        _mm256_storeu_si256((__m256i*)inputPtr, output);
        inputPtr += nPerSet;
    }

    /* Byteswap any remaining points (fewer than eight) one word at a time. */
    for (number = nSets * nPerSet; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
110#endif /* LV_HAVE_AVX2 */
111
112
113#ifdef LV_HAVE_SSE2
114#include <emmintrin.h>
115
/*
 * Reverse the byte order of every 32-bit word in a buffer, in place, using
 * SSE2 shifts and masks with unaligned loads and stores.
 *
 * intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *             byte-swapped value. No alignment is required.
 * num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    unsigned int number;

    uint32_t* inputPtr = intsToSwap;

    /* Masks that keep only the byte that lands in bits 16..23 after the
     * left shift by 8, and in bits 8..15 after the right shift by 8. */
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);

    const unsigned int quarterPoints = num_points / 4;
    for (number = 0; number < quarterPoints; number++) {
        /* Load four 32-bit values; inputPtr is advanced only after the
         * store because the swap is done in place. */
        const __m128i input = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Do the four shifts. The 24-bit shifts already isolate a single
         * byte; the 8-bit shifts still need masking. */
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);

        /* Or the four bytes back together. */
        __m128i output = _mm_or_si128(byte1, byte4);
        output = _mm_or_si128(output, byte2);
        output = _mm_or_si128(output, byte3);

        /* Store the results over the words that were just read. */
        _mm_storeu_si128((__m128i*)inputPtr, output);
        inputPtr += 4;
    }

    /* Byteswap any remaining points (fewer than four) one word at a time. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
155#endif /* LV_HAVE_SSE2 */
156
157
158#ifdef LV_HAVE_NEON
159#include <arm_neon.h>
160
/*
 * Reverse the byte order of every 32-bit word in a buffer, in place, using
 * NEON table lookups.
 *
 * intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *             byte-swapped value.
 * num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;
    unsigned int number;
    const unsigned int n8points = num_points / 8;

    /* Byte indices into the 32-byte table produced by vld4_u8. That load
     * de-interleaves with a stride of four, so table entry (8 * k + w) holds
     * byte k of word w. Reading entries 24+w, 16+w, 8+w, w therefore yields
     * word w with its bytes reversed. Each 64-bit constant packs the eight
     * indices for two words, first index in the least significant byte;
     * e.g. the first one encodes {24, 16, 8, 0, 25, 17, 9, 1}. */
    const uint8x8_t int_lookup01 = vcreate_u8(0x0109111900081018ULL);
    const uint8x8_t int_lookup23 = vcreate_u8(0x030B131B020A121AULL);
    const uint8x8_t int_lookup45 = vcreate_u8(0x050D151D040C141CULL);
    const uint8x8_t int_lookup67 = vcreate_u8(0x070F171F060E161EULL);

    for (number = 0; number < n8points; ++number) {
        /* All eight words are read into the table before any store, so
         * writing back over the same memory is safe. */
        const uint8x8x4_t input_table = vld4_u8((uint8_t*)inputPtr);
        const uint8x8_t swapped_int01 = vtbl4_u8(input_table, int_lookup01);
        const uint8x8_t swapped_int23 = vtbl4_u8(input_table, int_lookup23);
        const uint8x8_t swapped_int45 = vtbl4_u8(input_table, int_lookup45);
        const uint8x8_t swapped_int67 = vtbl4_u8(input_table, int_lookup67);
        vst1_u8((uint8_t*)inputPtr, swapped_int01);
        vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
        vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);

        inputPtr += 8;
    }

    /* Byteswap any remaining points (fewer than eight) one word at a time. */
    for (number = n8points * 8; number < num_points; ++number) {
        uint32_t output = *inputPtr;
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr = output;
        inputPtr++;
    }
}
207#endif /* LV_HAVE_NEON */
208
209#ifdef LV_HAVE_NEONV8
210#include <arm_neon.h>
211
/*
 * Reverse the byte order of every 32-bit word in a buffer, in place, using
 * the 128-bit table lookup vqtbl1q_u8.
 *
 * intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *             byte-swapped value.
 * num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;
    const unsigned int n8points = num_points / 8;
    uint8x16_t input;
    /* Byte indices that reverse the four bytes of each of the four words
     * held in one 128-bit register. */
    const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };

    unsigned int number;
    /* Each iteration handles eight words as two 128-bit halves. */
    for (number = 0; number < n8points; ++number) {
        /* NOTE(review): __VOLK_PREFETCH comes from volk_common.h; presumably
         * a prefetch hint for the next eight words - confirm there. */
        __VOLK_PREFETCH(inputPtr + 8);
        input = vld1q_u8((uint8_t*)inputPtr);
        input = vqtbl1q_u8(input, idx);
        vst1q_u8((uint8_t*)inputPtr, input);
        inputPtr += 4;

        input = vld1q_u8((uint8_t*)inputPtr);
        input = vqtbl1q_u8(input, idx);
        vst1q_u8((uint8_t*)inputPtr, input);
        inputPtr += 4;
    }

    /* Byteswap any remaining points (fewer than eight) one word at a time. */
    for (number = n8points * 8; number < num_points; ++number) {
        uint32_t output = *inputPtr;

        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr++ = output;
    }
}
242#endif /* LV_HAVE_NEONV8 */
243
244
245#ifdef LV_HAVE_GENERIC
246
/*
 * Reverse the byte order of every 32-bit word in a buffer, in place, using
 * plain C shifts and masks.
 *
 * intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *             byte-swapped value.
 * num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
                                             unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;

    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint32_t output = *inputPtr;
        /* Move each byte to its mirrored position and recombine. */
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr = output;
        inputPtr++;
    }
}
262#endif /* LV_HAVE_GENERIC */
263
264
265#endif /* INCLUDED_volk_32u_byteswap_u_H */
266#ifndef INCLUDED_volk_32u_byteswap_a_H
267#define INCLUDED_volk_32u_byteswap_a_H
268
269#include <inttypes.h>
270#include <stdio.h>
271
272
273#if LV_HAVE_AVX2
274#include <immintrin.h>
/*
 * Reverse the byte order of every 32-bit word in a buffer, in place, using
 * AVX2 with aligned loads and stores.
 *
 * intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *             byte-swapped value. Must be 32-byte aligned, as required by
 *             _mm256_load_si256 / _mm256_store_si256.
 * num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points)
{
    unsigned int number;

    const unsigned int nPerSet = 8; /* 32-bit words per 256-bit register */
    const unsigned int nSets = num_points / nPerSet;

    uint32_t* inputPtr = intsToSwap;

    /* Byte-selection table: reverses the four bytes of each 32-bit word.
     * _mm256_shuffle_epi8 selects bytes independently inside each 128-bit
     * half and only looks at the low four bits of each control byte, so the
     * entries 16..31 behave as 0..15 within the upper half. */
    const uint8_t shuffleVector[32] = { 3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
                                        8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
                                        21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };

    /* The table itself is a plain local array, so it is loaded unaligned. */
    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)shuffleVector);

    for (number = 0; number < nSets; number++) {
        /* Load eight 32-bit values; inputPtr is advanced only after the
         * store because the swap is done in place. */
        const __m256i input = _mm256_load_si256((const __m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        /* Store the results over the words that were just read. */
        _mm256_store_si256((__m256i*)inputPtr, output);
        inputPtr += nPerSet;
    }

    /* Byteswap any remaining points (fewer than eight) one word at a time. */
    for (number = nSets * nPerSet; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
311#endif /* LV_HAVE_AVX2 */
312
313
314#ifdef LV_HAVE_SSE2
315#include <emmintrin.h>
316
317
/*
 * Reverse the byte order of every 32-bit word in a buffer, in place, using
 * SSE2 shifts and masks with aligned loads and stores.
 *
 * intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *             byte-swapped value. Must be 16-byte aligned, as required by
 *             _mm_load_si128 / _mm_store_si128.
 * num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    unsigned int number;

    uint32_t* inputPtr = intsToSwap;

    /* Masks that keep only the byte that lands in bits 16..23 after the
     * left shift by 8, and in bits 8..15 after the right shift by 8. */
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);

    const unsigned int quarterPoints = num_points / 4;
    for (number = 0; number < quarterPoints; number++) {
        /* Load four 32-bit values; inputPtr is advanced only after the
         * store because the swap is done in place. */
        const __m128i input = _mm_load_si128((const __m128i*)inputPtr);

        /* Do the four shifts. The 24-bit shifts already isolate a single
         * byte; the 8-bit shifts still need masking. */
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);

        /* Or the four bytes back together. */
        __m128i output = _mm_or_si128(byte1, byte4);
        output = _mm_or_si128(output, byte2);
        output = _mm_or_si128(output, byte3);

        /* Store the results over the words that were just read. */
        _mm_store_si128((__m128i*)inputPtr, output);
        inputPtr += 4;
    }

    /* Byteswap any remaining points (fewer than four) one word at a time. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
357#endif /* LV_HAVE_SSE2 */
358
359
360#ifdef LV_HAVE_GENERIC
361
/*
 * Reverse the byte order of every 32-bit word in a buffer, in place, using
 * plain C shifts and masks. This is the portable kernel in the aligned
 * ("_a") group of this header; the code itself uses only ordinary word
 * accesses.
 *
 * intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *             byte-swapped value.
 * num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
                                               unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;

    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint32_t output = *inputPtr;
        /* Move each byte to its mirrored position and recombine. */
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr = output;
        inputPtr++;
    }
}
377#endif /* LV_HAVE_GENERIC */
378
379
380#endif /* INCLUDED_volk_32u_byteswap_a_H */
static void volk_32u_byteswap_neon(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:161
static void volk_32u_byteswap_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:247
static void volk_32u_byteswap_a_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:362
static void volk_32u_byteswap_u_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:116
static void volk_32u_byteswap_a_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:318
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62