Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32fc_32f_add_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
74#ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H
75#define INCLUDED_volk_32fc_32f_add_32fc_u_H
76
77#ifdef LV_HAVE_GENERIC
78
79static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector,
80 const lv_32fc_t* aVector,
81 const float* bVector,
82 unsigned int num_points)
83{
84 lv_32fc_t* cPtr = cVector;
85 const lv_32fc_t* aPtr = aVector;
86 const float* bPtr = bVector;
87 unsigned int number = 0;
88
89 for (number = 0; number < num_points; number++) {
90 *cPtr++ = (*aPtr++) + (*bPtr++);
91 }
92}
93#endif /* LV_HAVE_GENERIC */
94
95
96#ifdef LV_HAVE_AVX
97#include <immintrin.h>
98
99static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector,
100 const lv_32fc_t* aVector,
101 const float* bVector,
102 unsigned int num_points)
103{
104 unsigned int number = 0;
105 const unsigned int eighthPoints = num_points / 8;
106
107 lv_32fc_t* cPtr = cVector;
108 const lv_32fc_t* aPtr = aVector;
109 const float* bPtr = bVector;
110
111 __m256 aVal1, aVal2, bVal, cVal1, cVal2;
112 __m256 cpx_b1, cpx_b2;
113 __m256 zero;
114 zero = _mm256_setzero_ps();
115 __m256 tmp1, tmp2;
116 for (; number < eighthPoints; number++) {
117
118 aVal1 = _mm256_loadu_ps((float*)aPtr);
119 aVal2 = _mm256_loadu_ps((float*)(aPtr + 4));
120 bVal = _mm256_loadu_ps(bPtr);
121 cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
122 cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
123
124 tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
125 tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
126
127 cVal1 = _mm256_add_ps(aVal1, tmp1);
128 cVal2 = _mm256_add_ps(aVal2, tmp2);
129
130 _mm256_storeu_ps((float*)cPtr,
131 cVal1); // Store the results back into the C container
132 _mm256_storeu_ps((float*)(cPtr + 4),
133 cVal2); // Store the results back into the C container
134
135 aPtr += 8;
136 bPtr += 8;
137 cPtr += 8;
138 }
139
140 number = eighthPoints * 8;
141 for (; number < num_points; number++) {
142 *cPtr++ = (*aPtr++) + (*bPtr++);
143 }
144}
145#endif /* LV_HAVE_AVX */
146
147#ifdef LV_HAVE_AVX
148#include <immintrin.h>
149
150static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector,
151 const lv_32fc_t* aVector,
152 const float* bVector,
153 unsigned int num_points)
154{
155 unsigned int number = 0;
156 const unsigned int eighthPoints = num_points / 8;
157
158 lv_32fc_t* cPtr = cVector;
159 const lv_32fc_t* aPtr = aVector;
160 const float* bPtr = bVector;
161
162 __m256 aVal1, aVal2, bVal, cVal1, cVal2;
163 __m256 cpx_b1, cpx_b2;
164 __m256 zero;
165 zero = _mm256_setzero_ps();
166 __m256 tmp1, tmp2;
167 for (; number < eighthPoints; number++) {
168
169 aVal1 = _mm256_load_ps((float*)aPtr);
170 aVal2 = _mm256_load_ps((float*)(aPtr + 4));
171 bVal = _mm256_load_ps(bPtr);
172 cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
173 cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
174
175 tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
176 tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
177
178 cVal1 = _mm256_add_ps(aVal1, tmp1);
179 cVal2 = _mm256_add_ps(aVal2, tmp2);
180
181 _mm256_store_ps((float*)cPtr,
182 cVal1); // Store the results back into the C container
183 _mm256_store_ps((float*)(cPtr + 4),
184 cVal2); // Store the results back into the C container
185
186 aPtr += 8;
187 bPtr += 8;
188 cPtr += 8;
189 }
190
191 number = eighthPoints * 8;
192 for (; number < num_points; number++) {
193 *cPtr++ = (*aPtr++) + (*bPtr++);
194 }
195}
196#endif /* LV_HAVE_AVX */
197
198#ifdef LV_HAVE_NEON
199#include <arm_neon.h>
200
201static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector,
202 const lv_32fc_t* aVector,
203 const float* bVector,
204 unsigned int num_points)
205{
206 lv_32fc_t* cPtr = cVector;
207 const lv_32fc_t* aPtr = aVector;
208 const float* bPtr = bVector;
209
210 float32x4x4_t aVal0, aVal1;
211 float32x4x2_t bVal0, bVal1;
212
213 const unsigned int sixteenthPoints = num_points / 16;
214 unsigned int number = 0;
215 for (; number < sixteenthPoints; number++) {
216 aVal0 = vld4q_f32((const float*)aPtr);
217 aPtr += 8;
218 aVal1 = vld4q_f32((const float*)aPtr);
219 aPtr += 8;
220 __VOLK_PREFETCH(aPtr + 16);
221
222 bVal0 = vld2q_f32((const float*)bPtr);
223 bPtr += 8;
224 bVal1 = vld2q_f32((const float*)bPtr);
225 bPtr += 8;
226 __VOLK_PREFETCH(bPtr + 16);
227
228 aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
229 aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);
230
231 aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);
232 aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
233
234 vst4q_f32((float*)(cPtr), aVal0);
235 cPtr += 8;
236 vst4q_f32((float*)(cPtr), aVal1);
237 cPtr += 8;
238 }
239
240 for (number = sixteenthPoints * 16; number < num_points; number++) {
241 *cPtr++ = (*aPtr++) + (*bPtr++);
242 }
243}
244#endif /* LV_HAVE_NEON */
245
246
247#endif /* INCLUDED_volk_32fc_32f_add_32fc_u_H */
static void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:150
static void volk_32fc_32f_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:79
static void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:99
static void volk_32fc_32f_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:201
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
float complex lv_32fc_t
Definition: volk_complex.h:65