Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_8i_s32f_convert_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
54#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
55#define INCLUDED_volk_8i_s32f_convert_32f_u_H
56
57#include <inttypes.h>
58#include <stdio.h>
59
60#ifdef LV_HAVE_AVX2
61#include <immintrin.h>
62
/*
 * Convert signed 8-bit samples to floats, scaling each by 1/scalar (AVX2,
 * unaligned loads and stores).
 *
 * outputVector: receives num_points floats.
 * inputVector:  num_points signed 8-bit samples.
 * scalar:       divisor; every sample is multiplied by its reciprocal.
 * num_points:   number of samples to convert.
 *
 * Sixteen samples are handled per iteration; whatever is left over is
 * converted one sample at a time.
 */
static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const float reciprocal = (float)(1.0 / scalar);
    const __m256 vReciprocal = _mm256_set1_ps(reciprocal);
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    float* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < blockCount; block++) {
        /* One 128-bit load supplies both groups of eight samples. */
        const __m128i bytes = _mm_loadu_si128((const __m128i*)src);
        /* Move bytes 8..15 down so the widening conversion can see them. */
        const __m128i upperBytes = _mm_srli_si128(bytes, 8);

        const __m256 lowFloats = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(bytes));
        const __m256 highFloats =
            _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(upperBytes));

        _mm256_storeu_ps(dst, _mm256_mul_ps(lowFloats, vReciprocal));
        _mm256_storeu_ps(dst + 8, _mm256_mul_ps(highFloats, vReciprocal));

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the final num_points % 16 samples. */
    for (idx = blockCount * 16; idx < num_points; idx++) {
        outputVector[idx] = (float)(inputVector[idx]) * reciprocal;
    }
}
103#endif /* LV_HAVE_AVX2 */
104
105
106#ifdef LV_HAVE_SSE4_1
107#include <smmintrin.h>
108
/*
 * Convert signed 8-bit samples to floats, scaling each by 1/scalar (SSE4.1,
 * unaligned loads and stores).
 *
 * outputVector: receives num_points floats.
 * inputVector:  num_points signed 8-bit samples.
 * scalar:       divisor; every sample is multiplied by its reciprocal.
 * num_points:   number of samples to convert.
 *
 * Each iteration loads sixteen samples and emits them as four groups of four
 * floats; leftover samples are converted one at a time.
 */
static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const float reciprocal = (float)(1.0 / scalar);
    const __m128 vReciprocal = _mm_set1_ps(reciprocal);
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    float* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < blockCount; block++) {
        __m128i bytes = _mm_loadu_si128((const __m128i*)src);
        unsigned int quad;

        for (quad = 0; quad < 4; quad++) {
            /* Widen the four lowest bytes, convert, scale and store. */
            const __m128 floats = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(bytes));
            _mm_storeu_ps(dst, _mm_mul_ps(floats, vReciprocal));
            dst += 4;

            /* Bring the next four bytes into the low positions. */
            bytes = _mm_srli_si128(bytes, 4);
        }

        src += 16;
    }

    /* Scalar tail for the final num_points % 16 samples. */
    for (idx = blockCount * 16; idx < num_points; idx++) {
        outputVector[idx] = (float)(inputVector[idx]) * reciprocal;
    }
}
163#endif /* LV_HAVE_SSE4_1 */
164
165#ifdef LV_HAVE_GENERIC
166
/*
 * Convert signed 8-bit samples to floats, scaling each by 1/scalar (portable
 * C implementation).
 *
 * outputVector: receives num_points floats.
 * inputVector:  num_points signed 8-bit samples.
 * scalar:       divisor; every sample is multiplied by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* The reciprocal is computed once so the loop only multiplies. */
    const float reciprocal = (float)(1.0 / scalar);
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        outputVector[idx] = (float)inputVector[idx] * reciprocal;
    }
}
181#endif /* LV_HAVE_GENERIC */
182
183
184#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
185
186#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
187#define INCLUDED_volk_8i_s32f_convert_32f_a_H
188
189#include <inttypes.h>
190#include <stdio.h>
191
192#ifdef LV_HAVE_AVX2
193#include <immintrin.h>
194
/*
 * Convert signed 8-bit samples to floats, scaling each by 1/scalar (AVX2,
 * aligned loads and stores).
 *
 * outputVector: receives num_points floats; written with the aligned 256-bit
 *               store, so it must be 32-byte aligned.
 * inputVector:  num_points signed 8-bit samples; read with the aligned
 *               128-bit load, so it must be 16-byte aligned.
 * scalar:       divisor; every sample is multiplied by its reciprocal.
 * num_points:   number of samples to convert.
 *
 * Sixteen samples are handled per iteration; whatever is left over is
 * converted one sample at a time.
 */
static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const float reciprocal = (float)(1.0 / scalar);
    const __m256 vReciprocal = _mm256_set1_ps(reciprocal);
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    float* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < blockCount; block++) {
        /* One 128-bit load supplies both groups of eight samples. */
        const __m128i bytes = _mm_load_si128((const __m128i*)src);
        /* Move bytes 8..15 down so the widening conversion can see them. */
        const __m128i upperBytes = _mm_srli_si128(bytes, 8);

        const __m256 lowFloats = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(bytes));
        const __m256 highFloats =
            _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(upperBytes));

        _mm256_store_ps(dst, _mm256_mul_ps(lowFloats, vReciprocal));
        _mm256_store_ps(dst + 8, _mm256_mul_ps(highFloats, vReciprocal));

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the final num_points % 16 samples. */
    for (idx = blockCount * 16; idx < num_points; idx++) {
        outputVector[idx] = (float)(inputVector[idx]) * reciprocal;
    }
}
235#endif /* LV_HAVE_AVX2 */
236
237#ifdef LV_HAVE_SSE4_1
238#include <smmintrin.h>
239
/*
 * Convert signed 8-bit samples to floats, scaling each by 1/scalar (SSE4.1,
 * aligned loads and stores).
 *
 * outputVector: receives num_points floats; written with the aligned 128-bit
 *               store, so it must be 16-byte aligned.
 * inputVector:  num_points signed 8-bit samples; read with the aligned
 *               128-bit load, so it must be 16-byte aligned.
 * scalar:       divisor; every sample is multiplied by its reciprocal.
 * num_points:   number of samples to convert.
 *
 * Each iteration loads sixteen samples and emits them as four groups of four
 * floats; leftover samples are converted one at a time.
 */
static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const float reciprocal = (float)(1.0 / scalar);
    const __m128 vReciprocal = _mm_set1_ps(reciprocal);
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    float* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < blockCount; block++) {
        __m128i bytes = _mm_load_si128((const __m128i*)src);
        unsigned int quad;

        for (quad = 0; quad < 4; quad++) {
            /* Widen the four lowest bytes, convert, scale and store. */
            const __m128 floats = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(bytes));
            _mm_store_ps(dst, _mm_mul_ps(floats, vReciprocal));
            dst += 4;

            /* Bring the next four bytes into the low positions. */
            bytes = _mm_srli_si128(bytes, 4);
        }

        src += 16;
    }

    /* Scalar tail for the final num_points % 16 samples. */
    for (idx = blockCount * 16; idx < num_points; idx++) {
        outputVector[idx] = (float)(inputVector[idx]) * reciprocal;
    }
}
294#endif /* LV_HAVE_SSE4_1 */
295
296#ifdef LV_HAVE_NEON
297#include <arm_neon.h>
298
/*
 * Convert signed 8-bit samples to floats, scaling each by 1/scalar (NEON).
 *
 * outputVector: receives num_points floats.
 * inputVector:  num_points signed 8-bit samples.
 * scalar:       divisor; every sample is multiplied by its reciprocal.
 * num_points:   number of samples to convert.
 *
 * Sixteen samples are loaded per iteration, widened 8 -> 16 -> 32 bits,
 * converted to float and scaled; leftover samples are converted one at a
 * time.
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    const float reciprocal = (float)(1.0 / scalar);
    const float32x4_t vReciprocal = vdupq_n_f32(reciprocal);
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    float* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < blockCount; block++) {
        const int8x16_t bytes = vld1q_s8(src);

        /* First widening step: two vectors of eight 16-bit values. */
        const int16x8_t firstHalf = vmovl_s8(vget_low_s8(bytes));
        const int16x8_t secondHalf = vmovl_s8(vget_high_s8(bytes));

        /* Second widening step: four vectors of four 32-bit values. */
        const int32x4_t quad0 = vmovl_s16(vget_low_s16(firstHalf));
        const int32x4_t quad1 = vmovl_s16(vget_high_s16(firstHalf));
        const int32x4_t quad2 = vmovl_s16(vget_low_s16(secondHalf));
        const int32x4_t quad3 = vmovl_s16(vget_high_s16(secondHalf));

        vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(quad0), vReciprocal));
        vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(quad1), vReciprocal));
        vst1q_f32(dst + 8, vmulq_f32(vcvtq_f32_s32(quad2), vReciprocal));
        vst1q_f32(dst + 12, vmulq_f32(vcvtq_f32_s32(quad3), vReciprocal));

        src += 16;
        dst += 16;
    }

    /* Scalar tail continues from where the vector loop stopped. */
    for (idx = blockCount * 16; idx < num_points; idx++) {
        *dst++ = (float)(*src++) * reciprocal;
    }
}
347
348#endif /* LV_HAVE_NEON */
349
350#ifdef LV_HAVE_GENERIC
351
/*
 * Convert signed 8-bit samples to floats, scaling each by 1/scalar (portable
 * C implementation listed with the aligned kernels; it uses plain element
 * accesses only).
 *
 * outputVector: receives num_points floats.
 * inputVector:  num_points signed 8-bit samples.
 * scalar:       divisor; every sample is multiplied by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
                                                      const int8_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const float reciprocal = (float)(1.0 / scalar);
    const int8_t* src = inputVector;
    float* dst = outputVector;
    unsigned int remaining;

    /* Walk both buffers front to back until every sample is converted. */
    for (remaining = num_points; remaining != 0; remaining--) {
        *dst++ = (float)(*src++) * reciprocal;
    }
}
366#endif /* LV_HAVE_GENERIC */
367
368
369#ifdef LV_HAVE_ORC
/*
 * ORC-backed conversion routine. Only the declaration is visible here; the
 * definition lives outside this header.
 */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                unsigned int num_points);

/*
 * ORC entry point: forwards to volk_8i_s32f_convert_32f_a_orc_impl, passing
 * the reciprocal of scalar in the scalar argument.
 *
 * NOTE(review): presumably the ORC implementation multiplies each sample by
 * the value it receives -- confirm against the ORC source.
 *
 * outputVector: forwarded unchanged.
 * inputVector:  forwarded unchanged.
 * scalar:       divisor; its reciprocal is what the implementation receives.
 * num_points:   forwarded unchanged.
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const float reciprocal = (float)(1.0 / scalar);

    volk_8i_s32f_convert_32f_a_orc_impl(
        outputVector, inputVector, reciprocal, num_points);
}
383#endif /* LV_HAVE_ORC */
384
385
386#endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
static void volk_8i_s32f_convert_32f_a_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:352
static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:167
static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:299