/* Natural logarithm of 2. The SIMD kernels use A / Mln2 as their multiplier. */
#define Mln2 0.6931471805f

/*
 * 127 * 2^23, i.e. the bit pattern of 1.0f read as a 32-bit integer.
 * The SIMD kernels use B - C as their additive offset.
 */
#define B 1065353216.0f

/*
 * NOTE(review): the kernels in this file also use the macros A and C. Their
 * definitions are not visible in this part of the file; confirm they are
 * defined before any kernel that uses them.
 */
#ifndef INCLUDED_volk_32f_expfast_32f_a_H
#define INCLUDED_volk_32f_expfast_32f_a_H

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Fast approximate exp() over a float vector (aligned, AVX + FMA).
 *
 * Each group of eight inputs is evaluated as (A / Mln2) * x + (B - C) with a
 * fused multiply-add, converted to 32-bit integers, and those integer bits
 * are then reinterpreted as floats. Any elements left over after the last
 * full group of eight are computed with expf().
 *
 * \param bVector    Output buffer; written with _mm256_store_ps, which
 *                   requires a 32-byte aligned address.
 * \param aVector    Input buffer; read with _mm256_load_ps, which requires a
 *                   32-byte aligned address.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    const __m256 scale = _mm256_set1_ps(A / Mln2);
    const __m256 offset = _mm256_set1_ps(B - C);
    const float* aPtr = aVector;
    float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        /* The integer result is used purely as a float bit pattern. */
        const __m256i expBits =
            _mm256_cvtps_epi32(_mm256_fmadd_ps(scale, aVal, offset));

        _mm256_store_ps(bPtr, _mm256_castsi256_ps(expBits));

        /* Step both cursors to the next group of eight floats. */
        aPtr += 8;
        bPtr += 8;
    }

    /* Tail: fewer than eight elements remain. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Fast approximate exp() over a float vector (aligned, AVX).
 *
 * Same computation as the AVX + FMA kernel, but the multiply and the add are
 * issued as two separate instructions. Leftover elements are computed with
 * expf().
 *
 * \param bVector    Output buffer; written with _mm256_store_ps, which
 *                   requires a 32-byte aligned address.
 * \param aVector    Input buffer; read with _mm256_load_ps, which requires a
 *                   32-byte aligned address.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_expfast_32f_a_avx(float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    const __m256 scale = _mm256_set1_ps(A / Mln2);
    const __m256 offset = _mm256_set1_ps(B - C);
    const float* aPtr = aVector;
    float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        /* The integer result is used purely as a float bit pattern. */
        const __m256i expBits =
            _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(scale, aVal), offset));

        _mm256_store_ps(bPtr, _mm256_castsi256_ps(expBits));

        /* Step both cursors to the next group of eight floats. */
        aPtr += 8;
        bPtr += 8;
    }

    /* Tail: fewer than eight elements remain. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Fast approximate exp() over a float vector (aligned, SSE4.1).
 *
 * Each group of four inputs is evaluated as (A / Mln2) * x + (B - C),
 * converted to 32-bit integers, and those integer bits are reinterpreted as
 * floats. Leftover elements are computed with expf().
 *
 * \param bVector    Output buffer; written with _mm_store_ps, which requires
 *                   a 16-byte aligned address.
 * \param aVector    Input buffer; read with _mm_load_ps, which requires a
 *                   16-byte aligned address.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
                                                 const float* aVector,
                                                 unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    const __m128 scale = _mm_set1_ps(A / Mln2);
    const __m128 offset = _mm_set1_ps(B - C);
    const float* aPtr = aVector;
    float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        /* The integer result is used purely as a float bit pattern. */
        const __m128i expBits =
            _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(scale, aVal), offset));

        _mm_store_ps(bPtr, _mm_castsi128_ps(expBits));

        /* Step both cursors to the next group of four floats. */
        aPtr += 4;
        bPtr += 4;
    }

    /* Tail: fewer than four elements remain. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_expfast_32f_a_H */
#ifndef INCLUDED_volk_32f_expfast_32f_u_H
#define INCLUDED_volk_32f_expfast_32f_u_H

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Fast approximate exp() over a float vector (unaligned, AVX + FMA).
 *
 * Each group of eight inputs is evaluated as (A / Mln2) * x + (B - C) with a
 * fused multiply-add, converted to 32-bit integers, and those integer bits
 * are then reinterpreted as floats. Any elements left over after the last
 * full group of eight are computed with expf().
 *
 * \param bVector    Output buffer; written with unaligned stores.
 * \param aVector    Input buffer; read with unaligned loads.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    const __m256 scale = _mm256_set1_ps(A / Mln2);
    const __m256 offset = _mm256_set1_ps(B - C);
    const float* aPtr = aVector;
    float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        /* The integer result is used purely as a float bit pattern. */
        const __m256i expBits =
            _mm256_cvtps_epi32(_mm256_fmadd_ps(scale, aVal, offset));

        _mm256_storeu_ps(bPtr, _mm256_castsi256_ps(expBits));

        /* Step both cursors to the next group of eight floats. */
        aPtr += 8;
        bPtr += 8;
    }

    /* Tail: fewer than eight elements remain. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Fast approximate exp() over a float vector (unaligned, AVX).
 *
 * Same computation as the AVX + FMA kernel, but the multiply and the add are
 * issued as two separate instructions. Leftover elements are computed with
 * expf().
 *
 * \param bVector    Output buffer; written with unaligned stores.
 * \param aVector    Input buffer; read with unaligned loads.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_expfast_32f_u_avx(float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    const __m256 scale = _mm256_set1_ps(A / Mln2);
    const __m256 offset = _mm256_set1_ps(B - C);
    const float* aPtr = aVector;
    float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        /* The integer result is used purely as a float bit pattern. */
        const __m256i expBits =
            _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(scale, aVal), offset));

        _mm256_storeu_ps(bPtr, _mm256_castsi256_ps(expBits));

        /* Step both cursors to the next group of eight floats. */
        aPtr += 8;
        bPtr += 8;
    }

    /* Tail: fewer than eight elements remain. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Fast approximate exp() over a float vector (unaligned, SSE4.1).
 *
 * Each group of four inputs is evaluated as (A / Mln2) * x + (B - C),
 * converted to 32-bit integers, and those integer bits are reinterpreted as
 * floats. Leftover elements are computed with expf().
 *
 * \param bVector    Output buffer; written with unaligned stores.
 * \param aVector    Input buffer; read with unaligned loads.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
                                                 const float* aVector,
                                                 unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    const __m128 scale = _mm_set1_ps(A / Mln2);
    const __m128 offset = _mm_set1_ps(B - C);
    const float* aPtr = aVector;
    float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_loadu_ps(aPtr);
        /* The integer result is used purely as a float bit pattern. */
        const __m128i expBits =
            _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(scale, aVal), offset));

        _mm_storeu_ps(bPtr, _mm_castsi128_ps(expBits));

        /* Step both cursors to the next group of four floats. */
        aPtr += 4;
        bPtr += 4;
    }

    /* Tail: fewer than four elements remain. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable kernel: calls expf() on every element.
 *
 * Unlike the SIMD kernels, this one performs no bit-level approximation.
 *
 * \param bVector    Output buffer, num_points floats.
 * \param aVector    Input buffer, num_points floats.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_expfast_32f_generic(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    const float* aPtr = aVector;
    float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_expfast_32f_u_H */
/*
 * Cross-reference index (residue of the generated documentation listing):
 *
 *   Mln2  macro, defined at volk_32f_expfast_32f.h:69
 *   B     macro, defined at volk_32f_expfast_32f.h:71
 *   A     macro, defined at volk_32f_expfast_32f.h:70
 *   static void volk_32f_expfast_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
 *         defined at volk_32f_expfast_32f.h:232
 *   static void volk_32f_expfast_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
 *         defined at volk_32f_expfast_32f.h:303
 *   C     macro, defined at volk_32f_expfast_32f.h:72
 *   static void volk_32f_expfast_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
 *         defined at volk_32f_expfast_32f.h:120
 */