Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32fc_x2_add_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
74#ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H
75#define INCLUDED_volk_32fc_x2_add_32fc_u_H
76
77#ifdef LV_HAVE_AVX
78#include <immintrin.h>
79
80static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector,
81 const lv_32fc_t* aVector,
82 const lv_32fc_t* bVector,
83 unsigned int num_points)
84{
85 unsigned int number = 0;
86 const unsigned int quarterPoints = num_points / 4;
87
88 lv_32fc_t* cPtr = cVector;
89 const lv_32fc_t* aPtr = aVector;
90 const lv_32fc_t* bPtr = bVector;
91
92 __m256 aVal, bVal, cVal;
93 for (; number < quarterPoints; number++) {
94
95 aVal = _mm256_loadu_ps((float*)aPtr);
96 bVal = _mm256_loadu_ps((float*)bPtr);
97
98 cVal = _mm256_add_ps(aVal, bVal);
99
100 _mm256_storeu_ps((float*)cPtr,
101 cVal); // Store the results back into the C container
102
103 aPtr += 4;
104 bPtr += 4;
105 cPtr += 4;
106 }
107
108 number = quarterPoints * 4;
109 for (; number < num_points; number++) {
110 *cPtr++ = (*aPtr++) + (*bPtr++);
111 }
112}
113#endif /* LV_HAVE_AVX */
114
115
116#ifdef LV_HAVE_AVX
117#include <immintrin.h>
118
119static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector,
120 const lv_32fc_t* aVector,
121 const lv_32fc_t* bVector,
122 unsigned int num_points)
123{
124 unsigned int number = 0;
125 const unsigned int quarterPoints = num_points / 4;
126
127 lv_32fc_t* cPtr = cVector;
128 const lv_32fc_t* aPtr = aVector;
129 const lv_32fc_t* bPtr = bVector;
130
131 __m256 aVal, bVal, cVal;
132 for (; number < quarterPoints; number++) {
133
134 aVal = _mm256_load_ps((float*)aPtr);
135 bVal = _mm256_load_ps((float*)bPtr);
136
137 cVal = _mm256_add_ps(aVal, bVal);
138
139 _mm256_store_ps((float*)cPtr,
140 cVal); // Store the results back into the C container
141
142 aPtr += 4;
143 bPtr += 4;
144 cPtr += 4;
145 }
146
147 number = quarterPoints * 4;
148 for (; number < num_points; number++) {
149 *cPtr++ = (*aPtr++) + (*bPtr++);
150 }
151}
152#endif /* LV_HAVE_AVX */
153
154
155#ifdef LV_HAVE_SSE
156#include <xmmintrin.h>
157
158static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector,
159 const lv_32fc_t* aVector,
160 const lv_32fc_t* bVector,
161 unsigned int num_points)
162{
163 unsigned int number = 0;
164 const unsigned int halfPoints = num_points / 2;
165
166 lv_32fc_t* cPtr = cVector;
167 const lv_32fc_t* aPtr = aVector;
168 const lv_32fc_t* bPtr = bVector;
169
170 __m128 aVal, bVal, cVal;
171 for (; number < halfPoints; number++) {
172
173 aVal = _mm_loadu_ps((float*)aPtr);
174 bVal = _mm_loadu_ps((float*)bPtr);
175
176 cVal = _mm_add_ps(aVal, bVal);
177
178 _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container
179
180 aPtr += 2;
181 bPtr += 2;
182 cPtr += 2;
183 }
184
185 number = halfPoints * 2;
186 for (; number < num_points; number++) {
187 *cPtr++ = (*aPtr++) + (*bPtr++);
188 }
189}
190#endif /* LV_HAVE_SSE */
191
192
193#ifdef LV_HAVE_GENERIC
194
195static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector,
196 const lv_32fc_t* aVector,
197 const lv_32fc_t* bVector,
198 unsigned int num_points)
199{
200 lv_32fc_t* cPtr = cVector;
201 const lv_32fc_t* aPtr = aVector;
202 const lv_32fc_t* bPtr = bVector;
203 unsigned int number = 0;
204
205 for (number = 0; number < num_points; number++) {
206 *cPtr++ = (*aPtr++) + (*bPtr++);
207 }
208}
209#endif /* LV_HAVE_GENERIC */
210
211
212#ifdef LV_HAVE_SSE
213#include <xmmintrin.h>
214
215static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector,
216 const lv_32fc_t* aVector,
217 const lv_32fc_t* bVector,
218 unsigned int num_points)
219{
220 unsigned int number = 0;
221 const unsigned int halfPoints = num_points / 2;
222
223 lv_32fc_t* cPtr = cVector;
224 const lv_32fc_t* aPtr = aVector;
225 const lv_32fc_t* bPtr = bVector;
226
227 __m128 aVal, bVal, cVal;
228 for (; number < halfPoints; number++) {
229 aVal = _mm_load_ps((float*)aPtr);
230 bVal = _mm_load_ps((float*)bPtr);
231
232 cVal = _mm_add_ps(aVal, bVal);
233
234 _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
235
236 aPtr += 2;
237 bPtr += 2;
238 cPtr += 2;
239 }
240
241 number = halfPoints * 2;
242 for (; number < num_points; number++) {
243 *cPtr++ = (*aPtr++) + (*bPtr++);
244 }
245}
246#endif /* LV_HAVE_SSE */
247
248
249#ifdef LV_HAVE_NEON
250#include <arm_neon.h>
251
252static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector,
253 const lv_32fc_t* aVector,
254 const lv_32fc_t* bVector,
255 unsigned int num_points)
256{
257 unsigned int number = 0;
258 const unsigned int halfPoints = num_points / 2;
259
260 lv_32fc_t* cPtr = cVector;
261 const lv_32fc_t* aPtr = aVector;
262 const lv_32fc_t* bPtr = bVector;
263 float32x4_t aVal, bVal, cVal;
264 for (number = 0; number < halfPoints; number++) {
265 // Load in to NEON registers
266 aVal = vld1q_f32((const float32_t*)(aPtr));
267 bVal = vld1q_f32((const float32_t*)(bPtr));
268 __VOLK_PREFETCH(aPtr + 2);
269 __VOLK_PREFETCH(bPtr + 2);
270
271 // vector add
272 cVal = vaddq_f32(aVal, bVal);
273 // Store the results back into the C container
274 vst1q_f32((float*)(cPtr), cVal);
275
276 aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd
277 bPtr += 2;
278 cPtr += 2;
279 }
280
281 number = halfPoints * 2; // should be = num_points
282 for (; number < num_points; number++) {
283 *cPtr++ = (*aPtr++) + (*bPtr++);
284 }
285}
286
287#endif /* LV_HAVE_NEON */
288
289
290#endif /* INCLUDED_volk_32fc_x2_add_32fc_u_H */
static void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:215
static void volk_32fc_x2_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:195
static void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:158
static void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:80
static void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:119
static void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:252
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
float complex lv_32fc_t
Definition: volk_complex.h:65