volk_32f_8u_polarbutterfly_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#include <math.h>
#include <stdbool.h> /* bool is used by the generic kernel below */
#include <string.h>  /* memcpy is used by the AVX kernels below */
#include <volk/volk_8u_x2_encodeframepolar_8u.h>

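/*
 * llr_odd() is the min-sum approximation of the boxplus (f) operation on
 * two LLRs: sign(la) * sign(lb) * min(|la|, |lb|).
 * Example: llr_odd(-2.0f, 0.5f) == -0.5f.
 */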
static inline float llr_odd(const float la, const float lb)
{
    const float ala = fabsf(la);
    const float alb = fabsf(lb);
    return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
}

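/*
 * llr_odd_stages() applies llr_odd() over consecutive decoder stages. The
 * llrs buffer is organized as consecutive blocks of frame_size floats, one
 * block per stage; the result for stage s (2^s values) is computed pairwise
 * from the LLRs of stage s + 1.
 */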
static inline void llr_odd_stages(
    float* llrs, int min_stage, const int depth, const int frame_size, const int row)
{
    int loop_stage = depth - 1;
    float* dst_llr_ptr;
    float* src_llr_ptr;
    int stage_size = 0x01 << loop_stage;

    int el;
    while (min_stage <= loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el++) {
            *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
            src_llr_ptr += 2;
        }

        --loop_stage;
        stage_size >>= 1;
    }
}

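/*
 * llr_even() is the g-function of the polar butterfly: depending on the
 * partial-sum bit f it returns lb + la (f == 0) or lb - la (f != 0).
 * Example: llr_even(1.0f, 3.0f, 0) == 4.0f, llr_even(1.0f, 3.0f, 1) == 2.0f.
 */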
static inline float llr_even(const float la, const float lb, const unsigned char f)
{
    switch (f) {
    case 0:
        return lb + la;
    default:
        return lb - la;
    }
}

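/*
 * even_u_values() gathers every second bit, starting at u[1]:
 * u = {u0, u1, u2, u3} yields u_even = {u1, u3}.
 */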
static inline void
even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
{
    u++;
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_even++ = *u;
        u += 2;
    }
}

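/*
 * odd_xor_even_values() XORs adjacent bit pairs:
 * u = {u0, u1, u2, u3} yields u_xor = {u0 ^ u1, u2 ^ u3}.
 */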
static inline void
odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
{
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_xor++ = *u ^ *(u + 1);
        u += 2;
    }
}

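/*
 * calculate_max_stage_depth_for_row() returns how many stages must be
 * recomputed for a row. It equals the number of trailing zero bits of row,
 * capped at frame_exp - 1 because the last stage holds the received channel
 * values. Example: frame_exp = 10, row = 4 (binary 100) gives 2.
 */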
static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
{
    int max_stage_depth = 0;
    int half_stage_size = 0x01;
    int stage_size = half_stage_size << 1;
    while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values.
        if (!(row % stage_size < half_stage_size)) {
            break;
        }
        half_stage_size <<= 1;
        stage_size <<= 1;
        max_stage_depth++;
    }
    return max_stage_depth;
}

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
                                                          unsigned char* u,
                                                          const int frame_exp,
                                                          const int stage,
                                                          const int u_num,
                                                          const int row)
{
    const int frame_size = 0x01 << frame_exp;
    const int next_stage = stage + 1;

    const int half_stage_size = 0x01 << stage;
    const int stage_size = half_stage_size << 1;

    const bool is_upper_stage_half = row % stage_size < half_stage_size;

    // this is a natural bit order implementation.
    float* next_llrs = llrs + frame_size; // LLRs are stored in one consecutive array.
    float* call_row_llr = llrs + row;

    const int section = row - (row % stage_size);
    const int jump_size = ((row % half_stage_size) << 1) % stage_size;

    const int next_upper_row = section + jump_size;
    const int next_lower_row = next_upper_row + 1;

    const float* upper_right_llr_ptr = next_llrs + next_upper_row;
    const float* lower_right_llr_ptr = next_llrs + next_lower_row;

    if (!is_upper_stage_half) {
        const int u_pos = u_num >> stage;
        const unsigned char f = u[u_pos - 1];
        *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
        return;
    }

    if (frame_exp > next_stage) {
        unsigned char* u_half = u + frame_size;
        odd_xor_even_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);

        even_u_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
    }

    *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
}

#endif /* LV_HAVE_GENERIC */

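/*
 * Illustration only: a minimal, hypothetical successive-cancellation
 * decoder loop built on the generic kernel above, loosely modeled on how
 * GNU Radio's polar decoder drives it. Assumed, not guaranteed by this
 * header: llrs holds (frame_exp + 1) * (1 << frame_exp) floats with the
 * received channel LLRs in the last block, u provides at least
 * 3 * (1 << frame_exp) bytes of decision/scratch space, and a positive
 * LLR maps to bit 0.
 */
#ifdef LV_HAVE_GENERIC
static inline void
polar_sc_decode_example(float* llrs, unsigned char* u, const int frame_exp)
{
    const int frame_size = 0x01 << frame_exp;
    int row;
    for (row = 0; row < frame_size; row++) {
        // update the decision LLR for bit 'row'; 'row' bits are decided so far.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, row, row);
        // hard decision; a real decoder would force frozen bits to 0 instead.
        u[row] = llrs[row] > 0.0f ? 0 : 1;
    }
}
#endif
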
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
                                                        unsigned char* u,
                                                        const int frame_exp,
                                                        const int stage,
                                                        const int u_num,
                                                        const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // the vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // not necessary for row zero, i.e. the first bit to be decoded.
        // first do the bit combination for all stages:
        // effectively re-encode the already decoded bits.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        if (stage_size > 15) {
            volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
        } else {
            volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
        }

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        for (p = 0; p < stage_size; p += 8) {
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    _mm256_zeroall(); // zero the YMM registers to avoid AVX/SSE transition penalties.

    int el;
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // for stages < 3 the vectors are too small; finish with the scalar code.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}

#endif /* LV_HAVE_AVX */

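/*
 * The AVX2 kernel below is identical to the AVX one except that the f-bit
 * combination step uses _mm256_polar_fsign_add_llrs_avx2() in place of the
 * AVX helper _mm256_polar_fsign_add_llrs().
 */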
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
                                                         unsigned char* u,
                                                         const int frame_exp,
                                                         const int stage,
                                                         const int u_num,
                                                         const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // the vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // not necessary for row zero, i.e. the first bit to be decoded.
        // first do the bit combination for all stages:
        // effectively re-encode the already decoded bits.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        if (stage_size > 15) {
            volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
        } else {
            volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
        }

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        for (p = 0; p < stage_size; p += 8) {
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    _mm256_zeroall(); // zero the YMM registers to avoid AVX/SSE transition penalties.

    int el;
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // for stages < 3 the vectors are too small; finish with the scalar code.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}

#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */