Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32f_64f_add_64f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
73#ifndef INCLUDED_volk_32f_64f_add_64f_H
74#define INCLUDED_volk_32f_64f_add_64f_H
75
76#include <inttypes.h>
77
78#ifdef LV_HAVE_GENERIC
79
static inline void volk_32f_64f_add_64f_generic(double* cVector,
                                                const float* aVector,
                                                const double* bVector,
                                                unsigned int num_points)
{
    /* Portable reference kernel: widen each float in aVector to double,
     * add the corresponding element of bVector, and store in cVector.
     * Processes num_points elements; no alignment requirements. */
    for (unsigned int i = 0; i < num_points; i++) {
        cVector[i] = (double)aVector[i] + bVector[i];
    }
}
94
95#endif /* LV_HAVE_GENERIC */
96
97#ifdef LV_HAVE_NEONV8
98#include <arm_neon.h>
99
/* ARMv8 NEON kernel: cVector[i] = (double)aVector[i] + bVector[i].
 * Processes 2 elements per iteration (a float64x2_t holds two doubles);
 * any odd trailing element is handled by the scalar tail loop. */
static inline void volk_32f_64f_add_64f_neon(double* cVector,
                                             const float* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    float64x2_t aVal, bVal, cVal;
    float32x2_t aVal1;
    for (number = 0; number < half_points; number++) {
        // Load in to NEON registers
        aVal1 = vld1_f32(aPtr);
        bVal = vld1q_f64(bPtr);
        __VOLK_PREFETCH(aPtr + 2);
        __VOLK_PREFETCH(bPtr + 2);
        aPtr += 2; // 2 elements consumed per iteration (vaddq_f64 adds 2 doubles)
        bPtr += 2;

        // Widen the two floats to two doubles
        aVal = vcvt_f64_f32(aVal1);
        // vector add
        cVal = vaddq_f64(aVal, bVal);
        // Store the results back into the C container
        vst1q_f64(cPtr, cVal);

        cPtr += 2;
    }

    // Scalar tail: handles the final element when num_points is odd
    number = half_points * 2;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
138
139#endif /* LV_HAVE_NEONV8 */
140
141#ifdef LV_HAVE_AVX
142
143#include <immintrin.h>
144#include <xmmintrin.h>
145
/* AVX kernel (unaligned variant): cVector[i] = (double)aVector[i] + bVector[i].
 * Processes 8 elements per iteration: one 256-bit float load is split into
 * two 128-bit halves, each widened to four doubles and added to the matching
 * 256-bit double load. Uses unaligned loads/stores, so the pointers carry no
 * alignment requirement. Remaining 0-7 elements go through the scalar tail. */
static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {

        aVal = _mm256_loadu_ps(aPtr);    // 8 floats
        bVal1 = _mm256_loadu_pd(bPtr);   // 4 doubles (low half)
        bVal2 = _mm256_loadu_pd(bPtr + 4); // 4 doubles (high half)

        // Split the 8 floats into two 128-bit lanes of 4 floats each
        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        // Widen each lane of 4 floats to 4 doubles
        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_add_pd(aDbl1, bVal1);
        cVal2 = _mm256_add_pd(aDbl2, bVal2);

        _mm256_storeu_pd(cPtr,
                         cVal1); // Store the results back into the C container
        _mm256_storeu_pd(cPtr + 4,
                         cVal2); // Store the results back into the C container

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    // Scalar tail: handles the final num_points % 8 elements
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
191
192#endif /* LV_HAVE_AVX */
193
194#ifdef LV_HAVE_AVX
195
196#include <immintrin.h>
197#include <xmmintrin.h>
198
/* AVX kernel (aligned variant): cVector[i] = (double)aVector[i] + bVector[i].
 * Identical algorithm to the unaligned (_u_) variant — 8 elements per
 * iteration, floats widened to doubles lane by lane — but uses aligned
 * loads/stores: aVector, bVector and cVector MUST be 32-byte aligned or
 * the aligned AVX accesses will fault. Tail elements use the scalar loop. */
static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {

        aVal = _mm256_load_ps(aPtr);     // 8 floats (32-byte aligned)
        bVal1 = _mm256_load_pd(bPtr);    // 4 doubles (low half)
        bVal2 = _mm256_load_pd(bPtr + 4); // 4 doubles (high half)

        // Split the 8 floats into two 128-bit lanes of 4 floats each
        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        // Widen each lane of 4 floats to 4 doubles
        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_add_pd(aDbl1, bVal1);
        cVal2 = _mm256_add_pd(aDbl2, bVal2);

        _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
        _mm256_store_pd(cPtr + 4,
                        cVal2); // Store the results back into the C container

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    // Scalar tail: handles the final num_points % 8 elements
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
243
244#endif /* LV_HAVE_AVX */
245
246#endif /* INCLUDED_volk_32f_64f_add_64f_u_H */
static void volk_32f_64f_add_64f_a_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:199
static void volk_32f_64f_add_64f_u_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:146
static void volk_32f_64f_add_64f_generic(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:80
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62