Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32f_s32f_s32f_mod_range_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 Copyright (C) 2017 Free Software Foundation, Inc.
4
5 This file is part of libVOLK
6
7 All rights reserved.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU Lesser General Public License version 2.1, as
11 published by the Free Software Foundation. This program is
12 distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
19*/
20
43#ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
44#define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
45
46#ifdef LV_HAVE_GENERIC
47
/*
 * Portable scalar kernel: wraps every input sample into the closed interval
 * [lower_bound, upper_bound] by adding or subtracting a whole number of
 * interval widths (upper_bound - lower_bound).
 *
 * Samples that already satisfy lower_bound <= val <= upper_bound are copied
 * unchanged; the comparisons are strict, so the bounds themselves pass through.
 *
 * outputVector  destination, num_points floats are written
 * inputVector   source, num_points floats are read
 * lower_bound   lower edge of the target interval
 * upper_bound   upper edge of the target interval
 * num_points    number of samples to process (0 is a no-op)
 *
 * The code divides by (upper_bound - lower_bound), so the two bounds must
 * differ for the result to be meaningful.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
                                                            const float* inputVector,
                                                            const float lower_bound,
                                                            const float upper_bound,
                                                            unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr;
    const float distance = upper_bound - lower_bound;

    for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
        float val = *inPtr;
        if (val < lower_bound) {
            // how far below the interval we are (positive)
            float excess = lower_bound - val;
            // whole interval widths contained in that excess (truncated)
            signed int count = (int)(excess / distance);
            // one extra width is always needed to get back inside
            *outPtr = val + (count + 1) * distance;
        } else if (val > upper_bound) {
            // how far above the interval we are (positive)
            float excess = val - upper_bound;
            signed int count = (int)(excess / distance);
            *outPtr = val - (count + 1) * distance;
        } else {
            // already inside the interval: plain copy
            *outPtr = val;
        }
        outPtr++;
    }
}
73#endif /* LV_HAVE_GENERIC */
74
75
76#ifdef LV_HAVE_AVX
77#include <xmmintrin.h>
78
/*
 * AVX kernel for unaligned buffers: wraps every input sample into
 * [lower_bound, upper_bound] by adding or subtracting a whole number of
 * interval widths. Eight floats are processed per iteration using unaligned
 * loads/stores; the remaining num_points % 8 samples are handed to the
 * generic scalar kernel.
 *
 * outputVector  destination, num_points floats are written
 * inputVector   source, num_points floats are read
 * lower_bound   lower edge of the target interval
 * upper_bound   upper edge of the target interval
 * num_points    number of samples to process
 *
 * NOTE(review): the _mm256_* intrinsics are declared by <immintrin.h>, while
 * this section of the file includes only <xmmintrin.h> - confirm that
 * <immintrin.h> is pulled in elsewhere before this header.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
                                                          const float* inputVector,
                                                          const float lower_bound,
                                                          const float upper_bound,
                                                          unsigned int num_points)
{
    const __m256 lower = _mm256_set1_ps(lower_bound);
    const __m256 upper = _mm256_set1_ps(upper_bound);
    const __m256 distance = _mm256_sub_ps(upper, lower);
    __m256 input, output;
    __m256 is_smaller, is_bigger;
    __m256 excess, adj;

    const float* inPtr = inputVector;
    float* outPtr = outputVector;
    const size_t eight_points = num_points / 8;
    for (size_t counter = 0; counter < eight_points; counter++) {
        input = _mm256_loadu_ps(inPtr);
        // calculate per-lane masks: input < lower, input > upper
        is_smaller = _mm256_cmp_ps(
            input, lower, _CMP_LT_OQ); // 0x11: less than, ordered, non-signalling
        is_bigger = _mm256_cmp_ps(
            input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
        // find out how far we are out-of-bound - positive values, 0 for lanes
        // that are already inside the interval
        excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
        excess =
            _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
        // how many do we have to add? (int(excess/distance+1)*distance)
        excess = _mm256_div_ps(excess, distance);
        // round down: truncating float -> int32 -> float round trip
        excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
        // plus 1
        adj = _mm256_set1_ps(1.0f);
        excess = _mm256_add_ps(excess, adj);
        // get the sign right; adj still holds 1.0f in all eight lanes here.
        // Result: +1 below the interval, -1 above it, 0 inside (no change).
        adj = _mm256_and_ps(adj, is_smaller);
        adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
        // scale by distance, sign
        excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
        output = _mm256_add_ps(input, excess);
        _mm256_storeu_ps(outPtr, output);
        inPtr += 8;
        outPtr += 8;
    }

    // leftover samples that do not fill a whole vector: scalar kernel
    volk_32f_s32f_s32f_mod_range_32f_generic(
        outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
}
/*
 * AVX kernel using aligned vector loads/stores (_mm256_load_ps /
 * _mm256_store_ps): wraps every input sample into [lower_bound, upper_bound]
 * by adding or subtracting a whole number of interval widths. Eight floats
 * are processed per iteration; the remaining num_points % 8 samples are
 * handed to the generic scalar kernel.
 *
 * outputVector  destination, num_points floats are written; must satisfy the
 *               alignment requirement of _mm256_store_ps
 * inputVector   source, num_points floats are read; must satisfy the
 *               alignment requirement of _mm256_load_ps
 * lower_bound   lower edge of the target interval
 * upper_bound   upper edge of the target interval
 * num_points    number of samples to process
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
                                                          const float* inputVector,
                                                          const float lower_bound,
                                                          const float upper_bound,
                                                          unsigned int num_points)
{
    const __m256 lower = _mm256_set1_ps(lower_bound);
    const __m256 upper = _mm256_set1_ps(upper_bound);
    const __m256 distance = _mm256_sub_ps(upper, lower);
    __m256 input, output;
    __m256 is_smaller, is_bigger;
    __m256 excess, adj;

    const float* inPtr = inputVector;
    float* outPtr = outputVector;
    const size_t eight_points = num_points / 8;
    for (size_t counter = 0; counter < eight_points; counter++) {
        input = _mm256_load_ps(inPtr);
        // calculate per-lane masks: input < lower, input > upper
        is_smaller = _mm256_cmp_ps(
            input, lower, _CMP_LT_OQ); // 0x11: less than, ordered, non-signalling
        is_bigger = _mm256_cmp_ps(
            input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
        // find out how far we are out-of-bound - positive values, 0 for lanes
        // that are already inside the interval
        excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
        excess =
            _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
        // how many do we have to add? (int(excess/distance+1)*distance)
        excess = _mm256_div_ps(excess, distance);
        // round down: truncating float -> int32 -> float round trip
        excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
        // plus 1
        adj = _mm256_set1_ps(1.0f);
        excess = _mm256_add_ps(excess, adj);
        // get the sign right; adj still holds 1.0f in all eight lanes here.
        // Result: +1 below the interval, -1 above it, 0 inside (no change).
        adj = _mm256_and_ps(adj, is_smaller);
        adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
        // scale by distance, sign
        excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
        output = _mm256_add_ps(input, excess);
        _mm256_store_ps(outPtr, output);
        inPtr += 8;
        outPtr += 8;
    }

    // leftover samples that do not fill a whole vector: scalar kernel
    volk_32f_s32f_s32f_mod_range_32f_generic(
        outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
}
175#endif /* LV_HAVE_AVX */
176
177
178#ifdef LV_HAVE_SSE2
179#include <xmmintrin.h>
180
/*
 * SSE2 kernel for unaligned buffers: wraps every input sample into
 * [lower_bound, upper_bound] by adding or subtracting a whole number of
 * interval widths. Four floats are processed per iteration; the remaining
 * num_points % 4 samples are handed to the generic scalar kernel.
 *
 * Unaligned load/store intrinsics (_mm_loadu_ps / _mm_storeu_ps) are used so
 * that this "_u_" variant does not depend on 16-byte aligned buffers.
 *
 * outputVector  destination, num_points floats are written
 * inputVector   source, num_points floats are read
 * lower_bound   lower edge of the target interval
 * upper_bound   upper edge of the target interval
 * num_points    number of samples to process
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
                                                           const float* inputVector,
                                                           const float lower_bound,
                                                           const float upper_bound,
                                                           unsigned int num_points)
{
    const __m128 lower = _mm_set_ps1(lower_bound);
    const __m128 upper = _mm_set_ps1(upper_bound);
    const __m128 distance = _mm_sub_ps(upper, lower);
    __m128 input, output;
    __m128 is_smaller, is_bigger;
    __m128 excess, adj;

    const float* inPtr = inputVector;
    float* outPtr = outputVector;
    const size_t quarter_points = num_points / 4;
    for (size_t counter = 0; counter < quarter_points; counter++) {
        // unaligned load: inPtr carries no alignment guarantee in this variant
        input = _mm_loadu_ps(inPtr);
        // calculate per-lane masks: input < lower, input > upper
        is_smaller = _mm_cmplt_ps(input, lower);
        is_bigger = _mm_cmpgt_ps(input, upper);
        // find out how far we are out-of-bound - positive values, 0 for lanes
        // that are already inside the interval
        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
        // how many do we have to add? (int(excess/distance+1)*distance)
        excess = _mm_div_ps(excess, distance);
        // round down: truncating float -> int32 -> float round trip
        excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
        // plus 1
        adj = _mm_set_ps1(1.0f);
        excess = _mm_add_ps(excess, adj);
        // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}.
        // Result: +1 below the interval, -1 above it, 0 inside (no change).
        adj = _mm_and_ps(adj, is_smaller);
        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
        // scale by distance, sign
        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
        output = _mm_add_ps(input, excess);
        // unaligned store, matching the unaligned load above
        _mm_storeu_ps(outPtr, output);
        inPtr += 4;
        outPtr += 4;
    }

    // leftover samples that do not fill a whole vector: scalar kernel
    volk_32f_s32f_s32f_mod_range_32f_generic(
        outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
}
/*
 * SSE2 kernel using aligned vector loads/stores (_mm_load_ps / _mm_store_ps):
 * wraps every input sample into [lower_bound, upper_bound] by adding or
 * subtracting a whole number of interval widths. Four floats are processed
 * per iteration; the remaining num_points % 4 samples are handed to the
 * generic scalar kernel.
 *
 * outputVector  destination, num_points floats are written; must satisfy the
 *               alignment requirement of _mm_store_ps
 * inputVector   source, num_points floats are read; must satisfy the
 *               alignment requirement of _mm_load_ps
 * lower_bound   lower edge of the target interval
 * upper_bound   upper edge of the target interval
 * num_points    number of samples to process
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
                                                           const float* inputVector,
                                                           const float lower_bound,
                                                           const float upper_bound,
                                                           unsigned int num_points)
{
    const __m128 lower = _mm_set_ps1(lower_bound);
    const __m128 upper = _mm_set_ps1(upper_bound);
    const __m128 distance = _mm_sub_ps(upper, lower);
    __m128 input, output;
    __m128 is_smaller, is_bigger;
    __m128 excess, adj;

    const float* inPtr = inputVector;
    float* outPtr = outputVector;
    const size_t quarter_points = num_points / 4;
    for (size_t counter = 0; counter < quarter_points; counter++) {
        input = _mm_load_ps(inPtr);
        // calculate per-lane masks: input < lower, input > upper
        is_smaller = _mm_cmplt_ps(input, lower);
        is_bigger = _mm_cmpgt_ps(input, upper);
        // find out how far we are out-of-bound - positive values, 0 for lanes
        // that are already inside the interval
        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
        // how many do we have to add? (int(excess/distance+1)*distance)
        excess = _mm_div_ps(excess, distance);
        // round down: the SSE2 truncating conversion handles all four lanes,
        // then the integers are converted back to float
        excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
        // plus 1
        adj = _mm_set_ps1(1.0f);
        excess = _mm_add_ps(excess, adj);
        // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}.
        // Result: +1 below the interval, -1 above it, 0 inside (no change).
        adj = _mm_and_ps(adj, is_smaller);
        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
        // scale by distance, sign
        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
        output = _mm_add_ps(input, excess);
        _mm_store_ps(outPtr, output);
        inPtr += 4;
        outPtr += 4;
    }

    // leftover samples that do not fill a whole vector: scalar kernel
    volk_32f_s32f_s32f_mod_range_32f_generic(
        outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
}
272#endif /* LV_HAVE_SSE2 */
273
274#ifdef LV_HAVE_SSE
275#include <xmmintrin.h>
276
/*
 * SSE kernel for unaligned buffers: wraps every input sample into
 * [lower_bound, upper_bound] by adding or subtracting a whole number of
 * interval widths. Four floats are processed per iteration; the remaining
 * num_points % 4 samples are handed to the generic scalar kernel.
 *
 * Unaligned load/store intrinsics (_mm_loadu_ps / _mm_storeu_ps) are used so
 * that this "_u_" variant does not depend on 16-byte aligned buffers.
 *
 * outputVector  destination, num_points floats are written
 * inputVector   source, num_points floats are read
 * lower_bound   lower edge of the target interval
 * upper_bound   upper edge of the target interval
 * num_points    number of samples to process
 *
 * NOTE(review): __m128i and _mm_cvttps_epi32 are SSE2 (<emmintrin.h>)
 * features although this kernel sits in the LV_HAVE_SSE section - confirm
 * that SSE2 is always available where this is compiled.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
                                                          const float* inputVector,
                                                          const float lower_bound,
                                                          const float upper_bound,
                                                          unsigned int num_points)
{
    const __m128 lower = _mm_set_ps1(lower_bound);
    const __m128 upper = _mm_set_ps1(upper_bound);
    const __m128 distance = _mm_sub_ps(upper, lower);
    __m128 input, output;
    __m128 is_smaller, is_bigger;
    __m128 excess, adj;
    __m128i rounddown;

    const float* inPtr = inputVector;
    float* outPtr = outputVector;
    const size_t quarter_points = num_points / 4;
    for (size_t counter = 0; counter < quarter_points; counter++) {
        // unaligned load: inPtr carries no alignment guarantee in this variant
        input = _mm_loadu_ps(inPtr);
        // calculate per-lane masks: input < lower, input > upper
        is_smaller = _mm_cmplt_ps(input, lower);
        is_bigger = _mm_cmpgt_ps(input, upper);
        // find out how far we are out-of-bound - positive values, 0 for lanes
        // that are already inside the interval
        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
        // how many do we have to add? (int(excess/distance+1)*distance)
        excess = _mm_div_ps(excess, distance);
        // round down: truncate to int32, then convert back to float
        rounddown = _mm_cvttps_epi32(excess);
        excess = _mm_cvtepi32_ps(rounddown);
        // plus 1
        adj = _mm_set_ps1(1.0f);
        excess = _mm_add_ps(excess, adj);
        // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}.
        // Result: +1 below the interval, -1 above it, 0 inside (no change).
        adj = _mm_and_ps(adj, is_smaller);
        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
        // scale by distance, sign
        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
        output = _mm_add_ps(input, excess);
        // unaligned store, matching the unaligned load above
        _mm_storeu_ps(outPtr, output);
        inPtr += 4;
        outPtr += 4;
    }

    // leftover samples that do not fill a whole vector: scalar kernel
    volk_32f_s32f_s32f_mod_range_32f_generic(
        outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
}
/*
 * SSE kernel using aligned vector loads/stores (_mm_load_ps / _mm_store_ps):
 * wraps every input sample into [lower_bound, upper_bound] by adding or
 * subtracting a whole number of interval widths. Four floats are processed
 * per iteration; the remaining num_points % 4 samples are handed to the
 * generic scalar kernel.
 *
 * outputVector  destination, num_points floats are written; must satisfy the
 *               alignment requirement of _mm_store_ps
 * inputVector   source, num_points floats are read; must satisfy the
 *               alignment requirement of _mm_load_ps
 * lower_bound   lower edge of the target interval
 * upper_bound   upper edge of the target interval
 * num_points    number of samples to process
 *
 * NOTE(review): __m128i and _mm_cvttps_epi32 are SSE2 (<emmintrin.h>)
 * features although this kernel sits in the LV_HAVE_SSE section - confirm
 * that SSE2 is always available where this is compiled.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
                                                          const float* inputVector,
                                                          const float lower_bound,
                                                          const float upper_bound,
                                                          unsigned int num_points)
{
    const __m128 lower = _mm_set_ps1(lower_bound);
    const __m128 upper = _mm_set_ps1(upper_bound);
    const __m128 distance = _mm_sub_ps(upper, lower);
    __m128 input, output;
    __m128 is_smaller, is_bigger;
    __m128 excess, adj;
    __m128i rounddown;

    const float* inPtr = inputVector;
    float* outPtr = outputVector;
    const size_t quarter_points = num_points / 4;
    for (size_t counter = 0; counter < quarter_points; counter++) {
        input = _mm_load_ps(inPtr);
        // calculate per-lane masks: input < lower, input > upper
        is_smaller = _mm_cmplt_ps(input, lower);
        is_bigger = _mm_cmpgt_ps(input, upper);
        // find out how far we are out-of-bound - positive values, 0 for lanes
        // that are already inside the interval
        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
        // how many do we have to add? (int(excess/distance+1)*distance)
        excess = _mm_div_ps(excess, distance);
        // round down: truncate to int32, then convert back to float
        rounddown = _mm_cvttps_epi32(excess);
        excess = _mm_cvtepi32_ps(rounddown);
        // plus 1
        adj = _mm_set_ps1(1.0f);
        excess = _mm_add_ps(excess, adj);
        // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}.
        // Result: +1 below the interval, -1 above it, 0 inside (no change).
        adj = _mm_and_ps(adj, is_smaller);
        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
        // scale by distance, sign
        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
        output = _mm_add_ps(input, excess);
        _mm_store_ps(outPtr, output);
        inPtr += 4;
        outPtr += 4;
    }

    // leftover samples that do not fill a whole vector: scalar kernel
    volk_32f_s32f_s32f_mod_range_32f_generic(
        outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
}
371#endif /* LV_HAVE_SSE */
372
373
374#endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:277
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:127
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:324
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:181
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:48
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:79
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:226