Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
volk_32fc_x2_square_dist_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
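/*!
 * \page volk_32fc_x2_square_dist_32f
 *
 * \b Overview
 *
 * Computes the squared Euclidean distance from the single complex point
 * src0[0] to every entry of the complex vector points, writing one float
 * result per point:
 *
 *     target[i] = |src0[0] - points[i]|^2
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0,
 *                                   lv_32fc_t* points, unsigned int num_points)
 * \endcode
 *
 * \b Inputs
 * \li src0: the reference point; only src0[0] is read
 * \li points: vector of complex points to measure against src0[0]
 * \li num_points: number of entries in points (and in target)
 *
 * \b Outputs
 * \li target: vector of squared distances
 *
 * (Summary comment added for clarity; the behavior is that of the generic
 * implementation below.)
 */
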
#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9, xmm10;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    /* The main loop handles 8 points per iteration; the leftover loops
       below handle a remaining group of 4, 2, and 1 points, respectively. */
    int bound = num_bytes >> 6;
    int leftovers0 = (num_bytes >> 5) & 1;
    int leftovers1 = (num_bytes >> 4) & 1;
    int leftovers2 = (num_bytes >> 3) & 1;
    int i = 0;

    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    xmm2 = _mm256_load_ps((float*)&points[0]);
    /* Duplicate src0[0] across both halves of xmm0, then broadcast it into
       all four complex lanes of xmm1. */
    xmm0 = _mm_load_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
    xmm3 = _mm256_load_ps((float*)&points[4]);

    for (; i < bound; ++i) {
        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        points += 8;
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm2 = _mm256_load_ps((float*)&points[0]);

        /* hadd sums re^2 + im^2 for each point; the cross-lane permute
           restores the natural output order. */
        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm3 = _mm256_load_ps((float*)&points[4]);

        _mm256_store_ps(target, xmm4);

        target += 8;
    }

    for (i = 0; i < leftovers0; ++i) {

        xmm2 = _mm256_load_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_store_ps(target, xmm9);

        target += 4;
    }

    for (i = 0; i < leftovers1; ++i) {
        xmm9 = _mm_load_ps((float*)&points[0]);

        xmm10 = _mm_sub_ps(xmm0, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        _mm_storeh_pi((__m64*)target, xmm10);

        target += 2;
    }

    for (i = 0; i < leftovers2; ++i) {

        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 5;
    int i = 0;

    /* Duplicate src0[0] into both complex lanes of xmm1. */
    xmm1 = _mm_setzero_ps();
    xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
    xmm2 = _mm_load_ps((float*)&points[0]);
    xmm1 = _mm_movelh_ps(xmm1, xmm1);
    xmm3 = _mm_load_ps((float*)&points[2]);

    /* Four points per iteration. The final full iteration is peeled off
       below because the loop body preloads the next four points, which
       would read past the end of the input on the last pass. */
    for (; i < bound - 1; ++i) {
        xmm4 = _mm_sub_ps(xmm1, xmm2);
        xmm5 = _mm_sub_ps(xmm1, xmm3);
        points += 4;
        xmm6 = _mm_mul_ps(xmm4, xmm4);
        xmm7 = _mm_mul_ps(xmm5, xmm5);

        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_hadd_ps(xmm6, xmm7);

        xmm3 = _mm_load_ps((float*)&points[2]);

        _mm_store_ps(target, xmm4);

        target += 4;
    }

    /* Peeled final group of four points. */
    xmm4 = _mm_sub_ps(xmm1, xmm2);
    xmm5 = _mm_sub_ps(xmm1, xmm3);

    points += 4;
    xmm6 = _mm_mul_ps(xmm4, xmm4);
    xmm7 = _mm_mul_ps(xmm5, xmm5);

    xmm4 = _mm_hadd_ps(xmm6, xmm7);

    _mm_store_ps(target, xmm4);

    target += 4;

    /* Two remaining points. */
    if (num_bytes >> 4 & 1) {

        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_sub_ps(xmm1, xmm2);

        points += 2;

        xmm6 = _mm_mul_ps(xmm4, xmm4);

        xmm4 = _mm_hadd_ps(xmm6, xmm6);

        _mm_storeh_pi((__m64*)target, xmm4);

        target += 2;
    }

    /* One remaining point, handled scalar. */
    if (num_bytes >> 3 & 1) {

        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    float32x4x2_t a_vec, b_vec;
    float32x4x2_t diff_vec;
    float32x4_t tmp, tmp1, dist_sq;
    /* Splat the real and imaginary parts of src0[0] into separate vectors. */
    a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
    a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
    for (number = 0; number < quarter_points; ++number) {
        /* vld2q_f32 deinterleaves four complex points: val[0] receives the
           real parts, val[1] the imaginary parts. */
        b_vec = vld2q_f32((float*)points);
        diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
        diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
        tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
        tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);

        dist_sq = vaddq_f32(tmp, tmp1);
        vst1q_f32(target, dist_sq);
        points += 4;
        target += 4;
    }
    /* Scalar tail for the remaining (num_points % 4) points. */
    for (number = quarter_points * 4; number < num_points; ++number) {
        lv_32fc_t diff = src0[0] - *points++;
        *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
                                                        lv_32fc_t* src0,
                                                        lv_32fc_t* points,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    lv_32fc_t diff;
    float sq_dist;
    unsigned int i = 0;

    for (; i < (num_bytes >> 3); ++i) {
        diff = src0[0] - points[i];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[i] = sq_dist;
    }
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/

#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Same algorithm as the aligned AVX2 version above, but with unaligned
   loads and stores. */
static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 6;
    /* Up to three remaining points are handled scalar. */
    int leftovers1 = (num_bytes >> 3) & 0b11;
    int i = 0;

    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    xmm2 = _mm256_loadu_ps((float*)&points[0]);
    /* Broadcast src0[0] into all four complex lanes of xmm1. */
    xmm0 = _mm_loadu_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
    xmm3 = _mm256_loadu_ps((float*)&points[4]);

    for (; i < bound; ++i) {
        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        points += 8;
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm3 = _mm256_loadu_ps((float*)&points[4]);

        _mm256_storeu_ps(target, xmm4);

        target += 8;
    }

    /* Four remaining points. */
    if (num_bytes >> 5 & 1) {

        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_storeu_ps(target, xmm9);

        target += 4;
    }

    for (i = 0; i < leftovers1; ++i) {

        diff = src0[0] - points[0];
        points += 1;

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
        target += 1;
    }
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
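
In application code this kernel is normally reached through the VOLK dispatcher, volk_32fc_x2_square_dist_32f, which selects the fastest implementation available on the host CPU at runtime. A minimal usage sketch follows; the QPSK constellation and the sample value are illustrative and not part of this header, while volk_malloc, volk_free, volk_get_alignment, and lv_cmake are VOLK's standard allocation and complex-literal helpers.

/* Illustrative example: squared distances from one received sample to the
 * four QPSK constellation points. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 4;
    const size_t alignment = volk_get_alignment();

    /* Reference constellation: the four QPSK points. */
    lv_32fc_t* points =
        (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
    points[0] = lv_cmake(1.f, 1.f);
    points[1] = lv_cmake(-1.f, 1.f);
    points[2] = lv_cmake(-1.f, -1.f);
    points[3] = lv_cmake(1.f, -1.f);

    /* Received sample; the kernel reads only src0[0]. */
    lv_32fc_t* src0 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t), alignment);
    src0[0] = lv_cmake(0.9f, -1.1f);

    float* target = (float*)volk_malloc(num_points * sizeof(float), alignment);

    volk_32fc_x2_square_dist_32f(target, src0, points, num_points);

    for (unsigned int i = 0; i < num_points; ++i)
        printf("squared distance to point %u: %f\n", i, target[i]);

    volk_free(target);
    volk_free(src0);
    volk_free(points);
    return 0;
}

Taking the index of the smallest entry of target then yields the nearest constellation point, which is the typical use of this kernel in hard-decision demodulation.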