Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_avx_intrinsics.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2015 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
23/*
24 * This file is intended to hold composite AVX helpers, i.e. small routines
25 * built from several AVX intrinsics. Use them in VOLK kernels instead of
25 * copy-pasting the same intrinsic sequences.
26 */
27
28#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
29#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
30#include <immintrin.h>
31
32static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
33{
34 __m256 yl, yh, tmp1, tmp2;
35 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
36 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
37 tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
38 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
39 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
40
41 // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
42 return _mm256_addsub_ps(tmp1, tmp2);
43}
44
45static inline __m256 _mm256_conjugate_ps(__m256 x)
46{
47 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
48 return _mm256_xor_ps(x, conjugator); // conjugate y
49}
50
51static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
52{
53 const __m256 nswap = _mm256_permute_ps(x, 0xb1);
54 const __m256 dreal = _mm256_moveldup_ps(y);
55 const __m256 dimag = _mm256_movehdup_ps(y);
56
57 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
58 const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
59 const __m256 multreal = _mm256_mul_ps(x, dreal);
60 const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
61 return _mm256_add_ps(multreal, multimag);
62}
63
64static inline __m256 _mm256_normalize_ps(__m256 val)
65{
66 __m256 tmp1 = _mm256_mul_ps(val, val);
67 tmp1 = _mm256_hadd_ps(tmp1, tmp1);
68 tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
69 tmp1 = _mm256_sqrt_ps(tmp1);
70 return _mm256_div_ps(val, tmp1);
71}
72
73static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
74{
75 __m256 complex1, complex2;
76 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
77 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
78 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
79 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
80 return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
81}
82
83static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
84{
85 return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
86}
87
88static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
89 const __m256 symbols1,
90 const __m256 points0,
91 const __m256 points1,
92 const __m256 scalar)
93{
94 /*
95 * Calculate: |y - x|^2 * SNR_lin
96 * Consider 'symbolsX' and 'pointsX' to be complex float
97 * 'symbolsX' are 'y' and 'pointsX' are 'x'
98 */
99 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
100 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
101 const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
102 return _mm256_mul_ps(norms, scalar);
103}
104
105static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
106{
107 __m256 sign_mask_dummy = _mm256_setzero_ps();
108 const __m128i zeros = _mm_set1_epi8(0x00);
109 const __m128i sign_extract = _mm_set1_epi8(0x80);
110 const __m128i shuffle_mask0 = _mm_setr_epi8(0xff,
111 0xff,
112 0xff,
113 0x00,
114 0xff,
115 0xff,
116 0xff,
117 0x01,
118 0xff,
119 0xff,
120 0xff,
121 0x02,
122 0xff,
123 0xff,
124 0xff,
125 0x03);
126 const __m128i shuffle_mask1 = _mm_setr_epi8(0xff,
127 0xff,
128 0xff,
129 0x04,
130 0xff,
131 0xff,
132 0xff,
133 0x05,
134 0xff,
135 0xff,
136 0xff,
137 0x06,
138 0xff,
139 0xff,
140 0xff,
141 0x07);
142
143 fbits = _mm_cmpgt_epi8(fbits, zeros);
144 fbits = _mm_and_si128(fbits, sign_extract);
145 __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
146 __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
147
148 __m256 sign_mask =
149 _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
150 return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
151 // // This is the desired function call. Though it seems to be missing in GCC.
152 // // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
153 // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1),
154 // _mm_castsi128_ps(sign_bits0));
155}
156
157static inline void
158_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
159{
160 // deinterleave values
161 __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
162 __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
163 *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
164 *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
165}
166
/*
 * Deinterleave src0/src1 into two vectors (see _mm256_polar_deinterleave) and
 * combine them element-wise: the result's magnitude is min(|llr0|, |llr1|) and
 * its sign bit is the XOR of the two input sign bits.
 */
static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_bit = _mm256_set1_ps(-0.0f);
    // Every bit except the sign bit.
    const __m256 magnitude_bits =
        _mm256_andnot_ps(sign_bit, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    const __m256 sign0 = _mm256_and_ps(llr0, sign_bit);
    const __m256 sign1 = _mm256_and_ps(llr1, sign_bit);
    const __m256 abs0 = _mm256_and_ps(llr0, magnitude_bits);
    const __m256 abs1 = _mm256_and_ps(llr1, magnitude_bits);

    const __m256 result_sign = _mm256_xor_ps(sign0, sign1);
    const __m256 result_abs = _mm256_min_ps(abs0, abs1);
    return _mm256_or_ps(result_abs, result_sign);
}
183
184static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
185{
186 // prepare sign mask for correct +-
187 __m256 sign_mask = _mm256_polar_sign_mask(fbits);
188
189 __m256 llr0, llr1;
190 _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
191
192 // calculate result
193 llr0 = _mm256_xor_ps(llr0, sign_mask);
194 __m256 dst = _mm256_add_ps(llr0, llr1);
195 return dst;
196}
197
199 __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
200{
201 aux = _mm256_mul_ps(aux, val);
202 aux = _mm256_sub_ps(aux, acc);
203 aux = _mm256_mul_ps(aux, aux);
204 aux = _mm256_mul_ps(aux, rec);
205 return _mm256_add_ps(sq_acc, aux);
206}
207
208#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
static __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:73
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:83
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:158
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:51
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:198
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:32
static __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:167
static __m256 _mm256_conjugate_ps(__m256 x)
Definition: volk_avx_intrinsics.h:45
static __m256 _mm256_normalize_ps(__m256 val)
Definition: volk_avx_intrinsics.h:64
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx_intrinsics.h:88
static __m256 _mm256_polar_sign_mask(__m128i fbits)
Definition: volk_avx_intrinsics.h:105
static __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx_intrinsics.h:184