Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32f_x2_multiply_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
71#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
72#define INCLUDED_volk_32f_x2_multiply_32f_u_H
73
74#include <inttypes.h>
75#include <stdio.h>
76
77#ifdef LV_HAVE_SSE
78#include <xmmintrin.h>
79
/*
 * Element-wise product of two float buffers: cVector[i] = aVector[i] * bVector[i]
 * for i in [0, num_points).
 *
 * SSE variant built on the unaligned load/store intrinsics, so the buffers do
 * not have to be 16-byte aligned.
 *
 * cVector    - destination buffer, receives num_points products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Number of leading elements that can be handled four at a time. */
    const unsigned int simdCount = (num_points / 4) * 4;
    unsigned int idx;

    for (idx = 0; idx < simdCount; idx += 4) {
        const __m128 lhs = _mm_loadu_ps(aVector + idx);
        const __m128 rhs = _mm_loadu_ps(bVector + idx);

        _mm_storeu_ps(cVector + idx, _mm_mul_ps(lhs, rhs));
    }

    /* Scalar clean-up for the last (num_points % 4) elements. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
112#endif /* LV_HAVE_SSE */
113
114#ifdef LV_HAVE_AVX512F
115#include <immintrin.h>
116
/*
 * Element-wise product of two float buffers: cVector[i] = aVector[i] * bVector[i]
 * for i in [0, num_points).
 *
 * AVX-512F variant built on the unaligned load/store intrinsics, so the
 * buffers do not have to be 64-byte aligned.
 *
 * cVector    - destination buffer, receives num_points products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    enum { LANES = 16 }; /* floats held by one __m512 register */

    float* out = cVector;
    const float* lhs = aVector;
    const float* rhs = bVector;
    unsigned int blocks = num_points / LANES;
    unsigned int leftover = num_points % LANES;

    /* Full 16-float blocks. */
    for (; blocks != 0; --blocks) {
        const __m512 x = _mm512_loadu_ps(lhs);
        const __m512 y = _mm512_loadu_ps(rhs);

        _mm512_storeu_ps(out, _mm512_mul_ps(x, y));

        lhs += LANES;
        rhs += LANES;
        out += LANES;
    }

    /* Remaining elements that do not fill a whole register. */
    for (; leftover != 0; --leftover) {
        *out++ = (*lhs++) * (*rhs++);
    }
}
149#endif /* LV_HAVE_AVX512F */
150
151#ifdef LV_HAVE_AVX
152#include <immintrin.h>
153
/*
 * Element-wise product of two float buffers: cVector[i] = aVector[i] * bVector[i]
 * for i in [0, num_points).
 *
 * AVX variant built on the unaligned load/store intrinsics, so the buffers do
 * not have to be 32-byte aligned.
 *
 * cVector    - destination buffer, receives num_points products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Everything below this index is processed eight floats at a time. */
    const unsigned int simdCount = num_points - (num_points % 8);
    unsigned int idx = 0;

    while (idx < simdCount) {
        const __m256 lhs = _mm256_loadu_ps(&aVector[idx]);
        const __m256 rhs = _mm256_loadu_ps(&bVector[idx]);
        const __m256 product = _mm256_mul_ps(lhs, rhs);

        _mm256_storeu_ps(&cVector[idx], product);
        idx += 8;
    }

    /* Scalar clean-up for the last (num_points % 8) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] * bVector[idx];
        idx++;
    }
}
186#endif /* LV_HAVE_AVX */
187
188
189#ifdef LV_HAVE_GENERIC
190
/*
 * Portable scalar implementation of the element-wise product:
 * cVector[i] = aVector[i] * bVector[i] for i in [0, num_points).
 *
 * cVector    - destination buffer, receives num_points products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
205#endif /* LV_HAVE_GENERIC */
206
207
208#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
209
210
211#ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
212#define INCLUDED_volk_32f_x2_multiply_32f_a_H
213
214#include <inttypes.h>
215#include <stdio.h>
216
217#ifdef LV_HAVE_SSE
218#include <xmmintrin.h>
219
/*
 * Element-wise product of two float buffers: cVector[i] = aVector[i] * bVector[i]
 * for i in [0, num_points).
 *
 * SSE variant built on the aligned load/store intrinsics (_mm_load_ps /
 * _mm_store_ps); all three buffers must therefore be 16-byte aligned.
 *
 * cVector    - destination buffer, receives num_points products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    enum { LANES = 4 }; /* floats held by one __m128 register */

    float* out = cVector;
    const float* lhs = aVector;
    const float* rhs = bVector;
    unsigned int blocks = num_points / LANES;
    unsigned int leftover = num_points % LANES;

    /* Full 4-float blocks. */
    for (; blocks != 0; --blocks) {
        const __m128 x = _mm_load_ps(lhs);
        const __m128 y = _mm_load_ps(rhs);

        _mm_store_ps(out, _mm_mul_ps(x, y));

        lhs += LANES;
        rhs += LANES;
        out += LANES;
    }

    /* Remaining elements that do not fill a whole register. */
    for (; leftover != 0; --leftover) {
        *out++ = (*lhs++) * (*rhs++);
    }
}
252#endif /* LV_HAVE_SSE */
253
254#ifdef LV_HAVE_AVX512F
255#include <immintrin.h>
256
/*
 * Element-wise product of two float buffers: cVector[i] = aVector[i] * bVector[i]
 * for i in [0, num_points).
 *
 * AVX-512F variant built on the aligned load/store intrinsics
 * (_mm512_load_ps / _mm512_store_ps); all three buffers must therefore be
 * 64-byte aligned.
 *
 * cVector    - destination buffer, receives num_points products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    /* Number of leading elements that can be handled sixteen at a time. */
    const unsigned int simdCount = (num_points / 16) * 16;
    unsigned int idx;

    for (idx = 0; idx < simdCount; idx += 16) {
        const __m512 lhs = _mm512_load_ps(aVector + idx);
        const __m512 rhs = _mm512_load_ps(bVector + idx);

        _mm512_store_ps(cVector + idx, _mm512_mul_ps(lhs, rhs));
    }

    /* Scalar clean-up for the last (num_points % 16) elements. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
289#endif /* LV_HAVE_AVX512F */
290
291
292#ifdef LV_HAVE_AVX
293#include <immintrin.h>
294
/*
 * Element-wise product of two float buffers: cVector[i] = aVector[i] * bVector[i]
 * for i in [0, num_points).
 *
 * AVX variant built on the aligned load/store intrinsics (_mm256_load_ps /
 * _mm256_store_ps); all three buffers must therefore be 32-byte aligned.
 *
 * cVector    - destination buffer, receives num_points products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    enum { LANES = 8 }; /* floats held by one __m256 register */

    float* out = cVector;
    const float* lhs = aVector;
    const float* rhs = bVector;
    unsigned int blocks = num_points / LANES;
    unsigned int leftover = num_points % LANES;

    /* Full 8-float blocks. */
    for (; blocks != 0; --blocks) {
        const __m256 x = _mm256_load_ps(lhs);
        const __m256 y = _mm256_load_ps(rhs);

        _mm256_store_ps(out, _mm256_mul_ps(x, y));

        lhs += LANES;
        rhs += LANES;
        out += LANES;
    }

    /* Remaining elements that do not fill a whole register. */
    for (; leftover != 0; --leftover) {
        *out++ = (*lhs++) * (*rhs++);
    }
}
327#endif /* LV_HAVE_AVX */
328
329
330#ifdef LV_HAVE_NEON
331#include <arm_neon.h>
332
/*
 * Element-wise product of two float buffers: cVector[i] = aVector[i] * bVector[i]
 * for i in [0, num_points).
 *
 * NEON variant: four floats per iteration through vld1q_f32 / vmulq_f32 /
 * vst1q_f32, followed by a scalar loop for the remainder.
 *
 * cVector    - destination buffer, receives num_points products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    /* Everything below this index is processed four floats at a time. */
    const unsigned int simd_count = num_points - (num_points % 4);
    unsigned int idx = 0;

    while (idx < simd_count) {
        const float32x4_t lhs = vld1q_f32(aVector + idx);
        const float32x4_t rhs = vld1q_f32(bVector + idx);

        vst1q_f32(cVector + idx, vmulq_f32(lhs, rhs));
        idx += 4;
    }

    /* Scalar clean-up for the last (num_points % 4) elements. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
354#endif /* LV_HAVE_NEON */
355
356
357#ifdef LV_HAVE_GENERIC
358
/*
 * Portable scalar implementation of the element-wise product:
 * cVector[i] = aVector[i] * bVector[i] for i in [0, num_points).
 * The body uses only plain scalar loads and stores.
 *
 * cVector    - destination buffer, receives num_points products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    float* out = cVector;
    const float* lhs = aVector;
    const float* rhs = bVector;
    unsigned int remaining = num_points;

    while (remaining != 0) {
        *out++ = (*lhs++) * (*rhs++);
        --remaining;
    }
}
373#endif /* LV_HAVE_GENERIC */
374
375
376#ifdef LV_HAVE_ORC
/*
 * ORC-backed implementation; its definition lives outside this header and is
 * only declared here.
 */
extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points);

/*
 * Thin wrapper that forwards all four arguments, unchanged, to
 * volk_32f_x2_multiply_32f_a_orc_impl.
 *
 * NOTE(review): this wrapper carries the "_u_" name while the function it
 * calls is named "_a_"; confirm the implementation accepts unaligned buffers.
 *
 * cVector    - destination buffer
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
389#endif /* LV_HAVE_ORC */
390
391
392#endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */
static void volk_32f_x2_multiply_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:80
static void volk_32f_x2_multiply_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:191
static void volk_32f_x2_multiply_32f_a_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:359
static void volk_32f_x2_multiply_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:220
static void volk_32f_x2_multiply_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:295
static void volk_32f_x2_multiply_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:154
static void volk_32f_x2_multiply_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:333