Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_16ic_deinterleave_16i_x2.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
54#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
55#define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
56
57#include <inttypes.h>
58#include <stdio.h>
59#ifdef LV_HAVE_AVX2
60#include <immintrin.h>
61
62static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
63 int16_t* qBuffer,
64 const lv_16sc_t* complexVector,
65 unsigned int num_points)
66{
67 unsigned int number = 0;
68 const int8_t* complexVectorPtr = (int8_t*)complexVector;
69 int16_t* iBufferPtr = iBuffer;
70 int16_t* qBufferPtr = qBuffer;
71
72 __m256i MoveMask = _mm256_set_epi8(15,
73 14,
74 11,
75 10,
76 7,
77 6,
78 3,
79 2,
80 13,
81 12,
82 9,
83 8,
84 5,
85 4,
86 1,
87 0,
88 15,
89 14,
90 11,
91 10,
92 7,
93 6,
94 3,
95 2,
96 13,
97 12,
98 9,
99 8,
100 5,
101 4,
102 1,
103 0);
104
105 __m256i iMove2, iMove1;
106 __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
107
108 unsigned int sixteenthPoints = num_points / 16;
109
110 for (number = 0; number < sixteenthPoints; number++) {
111 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
112 complexVectorPtr += 32;
113 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
114 complexVectorPtr += 32;
115
116 iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
117 iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
118
119 iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
120 _mm256_permute4x64_epi64(iMove2, 0x80),
121 0x30);
122 qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
123 _mm256_permute4x64_epi64(iMove2, 0xd0),
124 0x30);
125
126 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
127 _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
128
129 iBufferPtr += 16;
130 qBufferPtr += 16;
131 }
132
133 number = sixteenthPoints * 16;
134 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
135 for (; number < num_points; number++) {
136 *iBufferPtr++ = *int16ComplexVectorPtr++;
137 *qBufferPtr++ = *int16ComplexVectorPtr++;
138 }
139}
140#endif /* LV_HAVE_AVX2 */
141
142#ifdef LV_HAVE_SSSE3
143#include <tmmintrin.h>
144
145static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer,
146 int16_t* qBuffer,
147 const lv_16sc_t* complexVector,
148 unsigned int num_points)
149{
150 unsigned int number = 0;
151 const int8_t* complexVectorPtr = (int8_t*)complexVector;
152 int16_t* iBufferPtr = iBuffer;
153 int16_t* qBufferPtr = qBuffer;
154
155 __m128i iMoveMask1 = _mm_set_epi8(
156 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
157 __m128i iMoveMask2 = _mm_set_epi8(
158 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
159
160 __m128i qMoveMask1 = _mm_set_epi8(
161 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
162 __m128i qMoveMask2 = _mm_set_epi8(
163 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
164
165 __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
166
167 unsigned int eighthPoints = num_points / 8;
168
169 for (number = 0; number < eighthPoints; number++) {
170 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
171 complexVectorPtr += 16;
172 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
173 complexVectorPtr += 16;
174
175 iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
176 _mm_shuffle_epi8(complexVal2, iMoveMask2));
177 qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
178 _mm_shuffle_epi8(complexVal2, qMoveMask2));
179
180 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
181 _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
182
183 iBufferPtr += 8;
184 qBufferPtr += 8;
185 }
186
187 number = eighthPoints * 8;
188 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
189 for (; number < num_points; number++) {
190 *iBufferPtr++ = *int16ComplexVectorPtr++;
191 *qBufferPtr++ = *int16ComplexVectorPtr++;
192 }
193}
194#endif /* LV_HAVE_SSSE3 */
195
196#ifdef LV_HAVE_SSE2
197#include <emmintrin.h>
198
199static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer,
200 int16_t* qBuffer,
201 const lv_16sc_t* complexVector,
202 unsigned int num_points)
203{
204 unsigned int number = 0;
205 const int16_t* complexVectorPtr = (int16_t*)complexVector;
206 int16_t* iBufferPtr = iBuffer;
207 int16_t* qBufferPtr = qBuffer;
208 __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
209 qComplexVal2, iOutputVal, qOutputVal;
210 __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
211 __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
212
213 unsigned int eighthPoints = num_points / 8;
214
215 for (number = 0; number < eighthPoints; number++) {
216 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
217 complexVectorPtr += 8;
218 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
219 complexVectorPtr += 8;
220
221 iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
222
223 iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
224
225 iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
226
227 iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
228
229 iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
230
231 iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
232
233 iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
234 _mm_and_si128(iComplexVal2, highMask));
235
236 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
237
238 qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
239
240 qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
241
242 qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
243
244 qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
245
246 qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
247
248 qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
249
250 qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
251 _mm_and_si128(qComplexVal2, highMask));
252
253 _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
254
255 iBufferPtr += 8;
256 qBufferPtr += 8;
257 }
258
259 number = eighthPoints * 8;
260 for (; number < num_points; number++) {
261 *iBufferPtr++ = *complexVectorPtr++;
262 *qBufferPtr++ = *complexVectorPtr++;
263 }
264}
265#endif /* LV_HAVE_SSE2 */
266
267#ifdef LV_HAVE_GENERIC
268
269static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
270 int16_t* qBuffer,
271 const lv_16sc_t* complexVector,
272 unsigned int num_points)
273{
274 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
275 int16_t* iBufferPtr = iBuffer;
276 int16_t* qBufferPtr = qBuffer;
277 unsigned int number;
278 for (number = 0; number < num_points; number++) {
279 *iBufferPtr++ = *complexVectorPtr++;
280 *qBufferPtr++ = *complexVectorPtr++;
281 }
282}
283#endif /* LV_HAVE_GENERIC */
284
285#ifdef LV_HAVE_ORC
286
287extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
288 int16_t* qBuffer,
289 const lv_16sc_t* complexVector,
290 unsigned int num_points);
291static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
292 int16_t* qBuffer,
293 const lv_16sc_t* complexVector,
294 unsigned int num_points)
295{
296 volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
297}
298#endif /* LV_HAVE_ORC */
299
300#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */
301
302
303#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
304#define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
305
306#include <inttypes.h>
307#include <stdio.h>
308#ifdef LV_HAVE_AVX2
309#include <immintrin.h>
310
311static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
312 int16_t* qBuffer,
313 const lv_16sc_t* complexVector,
314 unsigned int num_points)
315{
316 unsigned int number = 0;
317 const int8_t* complexVectorPtr = (int8_t*)complexVector;
318 int16_t* iBufferPtr = iBuffer;
319 int16_t* qBufferPtr = qBuffer;
320
321 __m256i MoveMask = _mm256_set_epi8(15,
322 14,
323 11,
324 10,
325 7,
326 6,
327 3,
328 2,
329 13,
330 12,
331 9,
332 8,
333 5,
334 4,
335 1,
336 0,
337 15,
338 14,
339 11,
340 10,
341 7,
342 6,
343 3,
344 2,
345 13,
346 12,
347 9,
348 8,
349 5,
350 4,
351 1,
352 0);
353
354 __m256i iMove2, iMove1;
355 __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
356
357 unsigned int sixteenthPoints = num_points / 16;
358
359 for (number = 0; number < sixteenthPoints; number++) {
360 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
361 complexVectorPtr += 32;
362 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
363 complexVectorPtr += 32;
364
365 iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
366 iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
367
368 iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
369 _mm256_permute4x64_epi64(iMove2, 0x80),
370 0x30);
371 qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
372 _mm256_permute4x64_epi64(iMove2, 0xd0),
373 0x30);
374
375 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
376 _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
377
378 iBufferPtr += 16;
379 qBufferPtr += 16;
380 }
381
382 number = sixteenthPoints * 16;
383 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
384 for (; number < num_points; number++) {
385 *iBufferPtr++ = *int16ComplexVectorPtr++;
386 *qBufferPtr++ = *int16ComplexVectorPtr++;
387 }
388}
389#endif /* LV_HAVE_AVX2 */
390
391#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */
static void volk_16ic_deinterleave_16i_x2_generic(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:269
static void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:199
static void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:145
short complex lv_16sc_t
Definition: volk_complex.h:62