Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_16u_byteswap.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
53#ifndef INCLUDED_volk_16u_byteswap_u_H
54#define INCLUDED_volk_16u_byteswap_u_H
55
56#include <inttypes.h>
57#include <stdio.h>
58
59#ifdef LV_HAVE_GENERIC
60
/*
 * Portable byte swap: exchanges the high and low byte of every 16-bit value
 * in intsToSwap, in place.
 *
 * intsToSwap  buffer holding num_points 16-bit values; overwritten with the
 *             swapped values.
 * num_points  number of 16-bit values to process; 0 leaves the buffer
 *             untouched.
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
                                             unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    for (unsigned int point = 0; point < num_points; point++) {
        const uint16_t value = *inputPtr;
        /* High byte moves down, low byte moves up. The operands are promoted
         * to int, so narrow the result back explicitly. */
        *inputPtr = (uint16_t)(((value >> 8) & 0xff) | ((value << 8) & 0xff00));
        inputPtr++;
    }
}
72#endif /* LV_HAVE_GENERIC */
73
74
75#if LV_HAVE_AVX2
76#include <immintrin.h>
77static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
78{
79 unsigned int number;
80
81 const unsigned int nPerSet = 16;
82 const uint64_t nSets = num_points / nPerSet;
83
84 uint16_t* inputPtr = (uint16_t*)intsToSwap;
85
86 const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
87 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
88 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
89
90 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
91
92 for (number = 0; number < nSets; number++) {
93 // Load the 32t values, increment inputPtr later since we're doing it in-place.
94 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
95 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
96
97 // Store the results
98 _mm256_store_si256((__m256i*)inputPtr, output);
99 inputPtr += nPerSet;
100 }
101
102 // Byteswap any remaining points:
103 for (number = nPerSet * nSets; number < num_points; number++) {
104 uint16_t outputVal = *inputPtr;
105 outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
106 *inputPtr = outputVal;
107 inputPtr++;
108 }
109}
110#endif /* LV_HAVE_AVX2 */
111
112
113#if LV_HAVE_AVX2
114#include <immintrin.h>
115static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
116{
117 unsigned int number;
118
119 const unsigned int nPerSet = 16;
120 const uint64_t nSets = num_points / nPerSet;
121
122 uint16_t* inputPtr = (uint16_t*)intsToSwap;
123
124 const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
125 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
126 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
127
128 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
129
130 for (number = 0; number < nSets; number++) {
131 // Load the 32t values, increment inputPtr later since we're doing it in-place.
132 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
133 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
134
135 // Store the results
136 _mm256_storeu_si256((__m256i*)inputPtr, output);
137 inputPtr += nPerSet;
138 }
139
140 // Byteswap any remaining points:
141 for (number = nPerSet * nSets; number < num_points; number++) {
142 uint16_t outputVal = *inputPtr;
143 outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
144 *inputPtr = outputVal;
145 inputPtr++;
146 }
147}
148#endif /* LV_HAVE_AVX2 */
149
150
151#ifdef LV_HAVE_SSE2
152#include <emmintrin.h>
153
/*
 * SSE2 byte swap for 16-bit values, in place, using unaligned loads/stores.
 *
 * intsToSwap  buffer holding num_points 16-bit values; overwritten with the
 *             swapped values. No alignment beyond that of uint16_t is
 *             required (_mm_loadu_si128 / _mm_storeu_si128 are used).
 * num_points  number of 16-bit values to process.
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    const unsigned int eighthPoints = num_points / 8;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        /* Load eight 16-bit values; the pointer is advanced only after the
         * store because the operation is in place. */
        const __m128i input = _mm_loadu_si128((const __m128i*)inputPtr);
        /* Shift each 16-bit element both ways and OR the halves together. */
        const __m128i left = _mm_slli_epi16(input, 8);
        const __m128i right = _mm_srli_epi16(input, 8);
        const __m128i output = _mm_or_si128(left, right);

        _mm_storeu_si128((__m128i*)inputPtr, output);
        inputPtr += 8;
    }

    /* Byteswap any remaining points with scalar code. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        const uint16_t value = *inputPtr;
        *inputPtr = (uint16_t)(((value >> 8) & 0xff) | ((value << 8) & 0xff00));
        inputPtr++;
    }
}
183#endif /* LV_HAVE_SSE2 */
184
185
186#endif /* INCLUDED_volk_16u_byteswap_u_H */
187#ifndef INCLUDED_volk_16u_byteswap_a_H
188#define INCLUDED_volk_16u_byteswap_a_H
189
190#include <inttypes.h>
191#include <stdio.h>
192
193#ifdef LV_HAVE_SSE2
194#include <emmintrin.h>
195
/*
 * SSE2 byte swap for 16-bit values, in place, using aligned loads/stores.
 *
 * intsToSwap  buffer holding num_points 16-bit values; overwritten with the
 *             swapped values. The vector loop uses _mm_load_si128 /
 *             _mm_store_si128, so the buffer must be 16-byte aligned.
 * num_points  number of 16-bit values to process.
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    const unsigned int eighthPoints = num_points / 8;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        /* Load eight 16-bit values; the pointer is advanced only after the
         * store because the operation is in place. */
        const __m128i input = _mm_load_si128((const __m128i*)inputPtr);
        /* Shift each 16-bit element both ways and OR the halves together. */
        const __m128i left = _mm_slli_epi16(input, 8);
        const __m128i right = _mm_srli_epi16(input, 8);
        const __m128i output = _mm_or_si128(left, right);

        _mm_store_si128((__m128i*)inputPtr, output);
        inputPtr += 8;
    }

    /* Byteswap any remaining points with the scalar kernel. */
    volk_16u_byteswap_generic(inputPtr, num_points - eighthPoints * 8);
}
218#endif /* LV_HAVE_SSE2 */
219
220#ifdef LV_HAVE_NEON
221#include <arm_neon.h>
222
/*
 * NEON byte swap for 16-bit values, in place.
 *
 * intsToSwap  buffer holding num_points 16-bit values; overwritten with the
 *             swapped values.
 * num_points  number of 16-bit values to process.
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    uint16_t* inputPtr = intsToSwap;

    for (unsigned int number = 0; number < eighth_points; number++) {
        const uint16x8_t input = vld1q_u16(inputPtr);
        /* Reverse the two bytes inside every 16-bit element. This replaces a
         * shift-right-insert / shift-left-insert pair that used `output` as
         * its own destination operand before it had ever been written. */
        const uint16x8_t output =
            vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(input)));
        vst1q_u16(inputPtr, output);
        inputPtr += 8;
    }

    /* Byteswap any remaining points with the scalar kernel. */
    volk_16u_byteswap_generic(inputPtr, num_points - eighth_points * 8);
}
240#endif /* LV_HAVE_NEON */
241
242#ifdef LV_HAVE_NEON
243#include <arm_neon.h>
244
/*
 * NEON byte swap for 16-bit values, in place, implemented with a table lookup.
 * Sixteen values (32 bytes) are processed per iteration; the remainder is
 * handled by the scalar kernel.
 *
 * intsToSwap  buffer holding num_points 16-bit values; overwritten with the
 *             swapped values.
 * num_points  number of 16-bit values to process.
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
                                                unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    const unsigned int n16points = num_points / 16;

    /* Byte indices into the 32-byte table produced by vld4_u8, pre-computed
       to save time. vld4_u8 de-interleaves with stride 4, so input byte b
       sits at table index (b % 4) * 8 + b / 4. Each lookup vector yields
       eight consecutive output bytes, i.e. input bytes with every adjacent
       pair exchanged. A simple C program can calculate them; for example
       for lookup01:
         uint8_t chars[8] = {8, 0, 24, 16, 9, 1, 25, 17};
         for(ii=0; ii < 8; ++ii) {
           index += ((uint64_t)(*(chars+ii))) << (ii*8);
         }
       Each following constant is the previous one with 2 added to every byte.
    */
    const uint8x8_t int_lookup01 = vcreate_u8(0x1119010910180008ULL);
    const uint8x8_t int_lookup23 = vcreate_u8(0x131B030B121A020AULL);
    const uint8x8_t int_lookup45 = vcreate_u8(0x151D050D141C040CULL);
    const uint8x8_t int_lookup67 = vcreate_u8(0x171F070F161E060EULL);

    for (unsigned int number = 0; number < n16points; ++number) {
        /* All four lookups read the table before anything is stored, so the
         * in-place update cannot corrupt its own input. */
        const uint8x8x4_t input_table = vld4_u8((uint8_t*)inputPtr);
        const uint8x8_t swapped_int01 = vtbl4_u8(input_table, int_lookup01);
        const uint8x8_t swapped_int23 = vtbl4_u8(input_table, int_lookup23);
        const uint8x8_t swapped_int45 = vtbl4_u8(input_table, int_lookup45);
        const uint8x8_t swapped_int67 = vtbl4_u8(input_table, int_lookup67);
        vst1_u8((uint8_t*)inputPtr, swapped_int01);
        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
        vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
        vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);

        inputPtr += 16;
    }

    /* Byteswap any remaining points with the scalar kernel. */
    volk_16u_byteswap_generic(inputPtr, num_points - n16points * 16);
}
285#endif /* LV_HAVE_NEON */
286
287#ifdef LV_HAVE_GENERIC
288
/*
 * Portable byte swap registered under the aligned ("_a_") kernel name:
 * exchanges the high and low byte of every 16-bit value in intsToSwap,
 * in place. The body performs plain scalar accesses only.
 *
 * intsToSwap  buffer holding num_points 16-bit values; overwritten with the
 *             swapped values.
 * num_points  number of 16-bit values to process; 0 leaves the buffer
 *             untouched.
 */
static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
                                               unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    for (unsigned int point = 0; point < num_points; point++) {
        const uint16_t value = *inputPtr;
        /* High byte moves down, low byte moves up. The operands are promoted
         * to int, so narrow the result back explicitly. */
        *inputPtr = (uint16_t)(((value >> 8) & 0xff) | ((value << 8) & 0xff00));
        inputPtr++;
    }
}
300#endif /* LV_HAVE_GENERIC */
301
302#ifdef LV_HAVE_ORC
303
/* Defined outside this header; only its declaration is visible here. */
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);

/*
 * ORC entry point: forwards both arguments unchanged to
 * volk_16u_byteswap_a_orc_impl.
 * NOTE(review): the wrapper is named "_u_" while the implementation it calls
 * is named "_a_" - confirm that the implementation has no alignment
 * requirement on intsToSwap.
 */
static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
{
    volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
}
309#endif /* LV_HAVE_ORC */
310
311
312#endif /* INCLUDED_volk_16u_byteswap_a_H */
static void volk_16u_byteswap_u_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:154
static void volk_16u_byteswap_a_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:289
static void volk_16u_byteswap_neon(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:223
static void volk_16u_byteswap_a_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:196
static void volk_16u_byteswap_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:61
static void volk_16u_byteswap_neon_table(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:245