28#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
29#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
/* Statements of _mm256_complexmul_ps(__m256 x, __m256 y).
 * NOTE(review): the signature line and braces are absent from this listing.
 * Presumably x and y each hold four interleaved (re, im) float pairs -- confirm
 * against callers. Under that layout the result is the element-wise complex
 * product x * y. */
34 __m256 yl, yh, tmp1, tmp2;
/* yl = even-indexed lanes of y duplicated into each pair: y0,y0,y2,y2,... */
35 yl = _mm256_moveldup_ps(y);
/* yh = odd-indexed lanes of y duplicated into each pair: y1,y1,y3,y3,... */
36 yh = _mm256_movehdup_ps(y);
/* tmp1 = x0*y0, x1*y0, x2*y2, x3*y2, ... */
37 tmp1 = _mm256_mul_ps(x, yl);
/* 0xB1 swaps the two floats inside every adjacent pair: x1,x0,x3,x2,... */
38 x = _mm256_shuffle_ps(x, x, 0xB1);
/* tmp2 = x1*y1, x0*y1, x3*y3, x2*y3, ... */
39 tmp2 = _mm256_mul_ps(x, yh);
/* addsub subtracts in even lanes and adds in odd lanes:
 * even = x0*y0 - x1*y1, odd = x1*y0 + x0*y1. */
42 return _mm256_addsub_ps(tmp1, tmp2);
/* Statements of _mm256_conjugate_ps(__m256 x).
 * NOTE(review): the signature line and braces are absent from this listing.
 * The mask carries -0.f (sign bit only) in every odd lane, so the XOR flips the
 * sign of lanes 1, 3, 5, 7 and leaves the even lanes untouched. Presumably odd
 * lanes are imaginary parts -- confirm against callers. */
47 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
48 return _mm256_xor_ps(x, conjugator);
/* Statements of _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y).
 * NOTE(review): the signature line and braces are absent from this listing.
 * Presumably x and y hold interleaved (re, im) float pairs -- confirm against
 * callers. Under that layout the result is x * conj(y) for each pair. */
/* nswap = x with the two floats of each pair swapped: x1,x0,x3,x2,... */
53 const __m256 nswap = _mm256_permute_ps(x, 0xb1);
/* dreal = even lanes of y duplicated: y0,y0,y2,y2,... */
54 const __m256 dreal = _mm256_moveldup_ps(y);
/* dimag = odd lanes of y duplicated: y1,y1,y3,y3,... */
55 const __m256 dimag = _mm256_movehdup_ps(y);
/* Sign-bit mask for the odd lanes only. */
57 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
/* dimagconj = y1,-y1,y3,-y3,... */
58 const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
/* multreal = x0*y0, x1*y0, ... */
59 const __m256 multreal = _mm256_mul_ps(x, dreal);
/* multimag = x1*y1, -x0*y1, ... */
60 const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
/* even = x0*y0 + x1*y1, odd = x1*y0 - x0*y1. */
61 return _mm256_add_ps(multreal, multimag);
/* Statements of _mm256_normalize_ps(__m256 val).
 * NOTE(review): the signature line and braces are absent from this listing.
 * Divides every adjacent pair of floats in val by the square root of that
 * pair's sum of squares. A pair that is entirely zero yields 0/0 here; no
 * guard against that is visible in these lines. */
/* Square every lane. */
66 __m256 tmp1 = _mm256_mul_ps(val, val);
/* Per 128-bit half: (t0+t1, t2+t3, t0+t1, t2+t3). */
67 tmp1 = _mm256_hadd_ps(tmp1, tmp1);
/* Select elements 0,2 then 1,3 so each pair sum sits in both lanes of its
 * own pair: (s01, s01, s23, s23). */
68 tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0));
69 tmp1 = _mm256_sqrt_ps(tmp1);
70 return _mm256_div_ps(val, tmp1);
/* Statements of _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2).
 * NOTE(review): the signature line and braces are absent from this listing.
 * Returns, in order, the sum of squares of each adjacent float pair: lanes 0-3
 * come from the four pairs of cplxValue1, lanes 4-7 from the four pairs of
 * cplxValue2. */
75 __m256 complex1, complex2;
/* Square every lane of both inputs in place (parameters are by-value copies). */
76 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
77 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
/* 0x20: low 128 bits of cplxValue1 followed by low 128 bits of cplxValue2. */
78 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
/* 0x31: high 128 bits of cplxValue1 followed by high 128 bits of cplxValue2. */
79 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
/* hadd works per 128-bit half, so the regrouping above makes the pair sums
 * come out in source order. */
80 return _mm256_hadd_ps(complex1, complex2);
89 const __m256 symbols1,
/* Fragment of _mm256_scaled_norm_dist_ps(symbols0, symbols1, points0, points1,
 * scalar). NOTE(review): most of the signature and at least one statement are
 * absent from this listing -- `norms` is used below but never defined in the
 * visible lines; restore the missing statement from the original header
 * before relying on this text. */
/* Lane-wise differences between the symbol and point vectors. */
99 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
100 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
/* Lane-wise scaling of `norms` by `scalar`. */
102 return _mm256_mul_ps(norms, scalar);
/* Fragment of _mm256_polar_sign_mask(__m128i fbits).
 * NOTE(review): the signature line, the remaining initialisers of both shuffle
 * masks, and the declaration that receives the first insertf128 result
 * (`sign_mask`) are absent from this listing. */
/* All-zero 256-bit vector used as the base for the first 128-bit insert. */
107 __m256 sign_mask_dummy = _mm256_setzero_ps();
108 const __m128i zeros = _mm_set1_epi8(0x00);
/* 0x80 in every byte: the top bit of each byte. */
109 const __m128i sign_extract = _mm_set1_epi8(0x80);
/* NOTE(review): initialiser list truncated here. In _mm_shuffle_epi8 a control
 * byte with its top bit set (such as 0xff) produces a zero output byte. */
110 const __m128i shuffle_mask0 = _mm_setr_epi8(0xff,
126 const __m128i shuffle_mask1 = _mm_setr_epi8(0xff,
/* Bytes of fbits that are > 0 (signed compare) become 0xFF, others 0x00. */
143 fbits = _mm_cmpgt_epi8(fbits, zeros);
/* Keep only the top bit: 0x80 where the byte was > 0, else 0x00. */
144 fbits = _mm_and_si128(fbits, sign_extract);
/* Rearrange those bytes according to the two (truncated) shuffle masks. */
145 __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
146 __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
/* sign_bits0 goes into the low 128 bits; presumably assigned to `sign_mask`
 * on the missing preceding line -- confirm. */
149 _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
/* sign_bits1 goes into the high 128 bits of sign_mask. */
150 return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
/* Statements of _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1,
 * __m256 src0, __m256 src1).
 * NOTE(review): the signature line and braces are absent from this listing.
 * Writes the even-indexed floats of src0 followed by those of src1 to *llr0,
 * and the odd-indexed floats of src0 followed by those of src1 to *llr1. */
/* part0 = low halves (src0.lo, src1.lo); part1 = high halves (src0.hi, src1.hi). */
161 __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
162 __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
/* 0x88 picks elements 0 and 2 from each operand within each 128-bit half. */
163 *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
/* 0xdd picks elements 1 and 3 from each operand within each 128-bit half. */
164 *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
/* Fragment of _mm256_polar_minsum_llrs(__m256 src0, __m256 src1).
 * NOTE(review): the signature line, the declarations of llr0/llr1, and the
 * left-hand sides that receive the `sign` and `dst` expressions below are
 * absent from this listing. */
/* -0.0f has only the sign bit set, so this mask selects sign bits. */
169 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
/* All-ones AND NOT sign_mask: every bit except the sign bit. */
170 const __m256 abs_mask =
171 _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
/* XOR of the two sign bits per lane (presumably assigned to `sign` -- confirm). */
178 _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
/* Lane-wise minimum of the two absolute values (presumably assigned to `dst`
 * -- confirm). */
180 _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
/* Combine the magnitude bits with the computed sign bit. */
181 return _mm256_or_ps(dst, sign);
/* Fragment of _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1,
 * __m128i fbits).
 * NOTE(review): the signature line, the definitions of sign_mask, llr0 and
 * llr1, and the return statement are absent from this listing. */
/* Flip the bits of llr0 that are set in sign_mask. */
193 llr0 = _mm256_xor_ps(llr0, sign_mask);
/* Lane-wise sum of the modified llr0 and llr1. */
194 __m256 dst = _mm256_add_ps(llr0, llr1);
199 __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
/* Statements of _mm256_accumulate_square_sum_ps.
 * NOTE(review): the first signature line and the braces are absent from this
 * listing. Lane-wise, the function returns
 *     sq_acc + rec * (aux * val - acc)^2
 * `aux` is a by-value parameter reused as scratch storage. */
201 aux = _mm256_mul_ps(aux, val);
202 aux = _mm256_sub_ps(aux, acc);
/* Square the difference. */
203 aux = _mm256_mul_ps(aux, aux);
204 aux = _mm256_mul_ps(aux, rec);
205 return _mm256_add_ps(sq_acc, aux);
static __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:73
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:83
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:158
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:51
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:198
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:32
static __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:167
static __m256 _mm256_conjugate_ps(__m256 x)
Definition: volk_avx_intrinsics.h:45
static __m256 _mm256_normalize_ps(__m256 val)
Definition: volk_avx_intrinsics.h:64
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx_intrinsics.h:88
static __m256 _mm256_polar_sign_mask(__m128i fbits)
Definition: volk_avx_intrinsics.h:105
static __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx_intrinsics.h:184