43#ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
44#define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
/*
 * Scalar range-wrapping kernel: parameter list and the two out-of-range branches.
 * A sample below lower_bound or above upper_bound is moved by (count + 1) whole
 * interval widths, where the width is (upper_bound - lower_bound).
 * NOTE(review): the embedded line numbers skip (52 -> 54, 54 -> 56, 58 -> 60) and
 * the excerpt stops after line 67, so the declarations of inPtr and val, the
 * handling of in-range samples and the advance of outPtr are not shown here.
 */
49 const float* inputVector,
50 const float lower_bound,
51 const float upper_bound,
52 unsigned int num_points)
54 float* outPtr = outputVector;
/* Width of the target interval; every correction is a multiple of it. */
56 const float distance = upper_bound - lower_bound;
/* Visit the num_points input samples one at a time. */
58 for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
/* NOTE(review): val is presumably the current sample (*inPtr); its declaration is not shown. */
60 if (val < lower_bound) {
/* Undershoot as a positive distance below the interval. */
61 float excess = lower_bound - val;
/* Whole interval widths contained in the undershoot (truncated toward zero).
 * NOTE(review): the cast is undefined when excess / distance does not fit in an
 * int, which includes distance == 0 - confirm the callers' value ranges. */
62 signed int count = (int)(excess / distance);
/* Shift up by one more width than the truncated count. */
63 *outPtr = val + (count + 1) * distance;
64 }
else if (val > upper_bound) {
/* Mirror image of the branch above: overshoot beyond upper_bound, shifted down. */
65 float excess = val - upper_bound;
66 signed int count = (int)(excess / distance);
67 *outPtr = val - (count + 1) * distance;
/*
 * Unaligned AVX kernel (parameter list and loop body): handles eight floats per
 * iteration and moves out-of-range lanes by whole multiples of
 * (upper_bound - lower_bound).
 * NOTE(review): the embedded line numbers skip in several places (e.g. 87 -> 89,
 * 103 -> 105, 119 -> 125), so some declarations, the pointer advances and the
 * name of the final call are not shown in this excerpt.
 */
80 const float* inputVector,
81 const float lower_bound,
82 const float upper_bound,
83 unsigned int num_points)
/* Broadcast both bounds and the interval width to all eight lanes. */
85 const __m256 lower = _mm256_set1_ps(lower_bound);
86 const __m256 upper = _mm256_set1_ps(upper_bound);
87 const __m256 distance = _mm256_sub_ps(upper, lower);
89 __m256 is_smaller, is_bigger;
92 const float* inPtr = inputVector;
93 float* outPtr = outputVector;
/* Number of complete 8-float groups covered by the vector loop. */
94 const size_t eight_points = num_points / 8;
95 for (
size_t counter = 0; counter < eight_points; counter++) {
/* Load eight floats with no alignment requirement on inPtr. */
96 input = _mm256_loadu_ps(inPtr);
/* Lane masks (all bits set where true): input < lower, input > upper. */
98 is_smaller = _mm256_cmp_ps(
99 input, lower, _CMP_LT_OQ);
100 is_bigger = _mm256_cmp_ps(
101 input, upper, _CMP_GT_OQ);
/* Positive undershoot for too-small lanes, zero bits everywhere else. */
103 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
/* Merge in the positive overshoot of too-large lanes.
 * NOTE(review): the assignment target sits on a line that is not shown; presumably excess. */
105 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
/* Express the out-of-range distance in units of the interval width. */
107 excess = _mm256_div_ps(excess, distance);
/* Truncate toward zero through a round trip via 32-bit integers. */
109 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
/* One more interval width than the truncated count. */
111 adj = _mm256_set1_ps(1.0f);
112 excess = _mm256_add_ps(excess, adj);
/* Reuse adj as a per-lane sign: +1 for too-small, -1 for too-large, 0 for in-range lanes. */
114 adj = _mm256_and_ps(adj, is_smaller);
115 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
/* Signed correction in input units; in-range lanes are multiplied by 0. */
117 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
118 output = _mm256_add_ps(input, excess);
/* Store eight floats with no alignment requirement on outPtr. */
119 _mm256_storeu_ps(outPtr, output);
/* Arguments of the follow-up call that covers the num_points % 8 leftover samples.
 * NOTE(review): the callee name is not shown; presumably the scalar kernel. */
125 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
/*
 * Aligned AVX kernel (parameter list and loop body): same arithmetic as the
 * unaligned AVX variant, but it uses the aligned load/store intrinsics, which
 * require inputVector and outputVector to be 32-byte aligned.
 * NOTE(review): the embedded line numbers skip in several places (e.g. 137 -> 140,
 * 151 -> 153, 167 -> 173), so the declaration of excess/adj, the pointer advances
 * and the name of the final call are not shown in this excerpt.
 */
128 const float* inputVector,
129 const float lower_bound,
130 const float upper_bound,
131 unsigned int num_points)
/* Broadcast both bounds and the interval width to all eight lanes. */
133 const __m256 lower = _mm256_set1_ps(lower_bound);
134 const __m256 upper = _mm256_set1_ps(upper_bound);
135 const __m256 distance = _mm256_sub_ps(upper, lower);
136 __m256 input, output;
137 __m256 is_smaller, is_bigger;
140 const float* inPtr = inputVector;
141 float* outPtr = outputVector;
/* Number of complete 8-float groups covered by the vector loop. */
142 const size_t eight_points = num_points / 8;
143 for (
size_t counter = 0; counter < eight_points; counter++) {
/* Aligned load: inPtr must be 32-byte aligned here. */
144 input = _mm256_load_ps(inPtr);
/* Lane masks (all bits set where true): input < lower, input > upper. */
146 is_smaller = _mm256_cmp_ps(
147 input, lower, _CMP_LT_OQ);
148 is_bigger = _mm256_cmp_ps(
149 input, upper, _CMP_GT_OQ);
/* Positive undershoot for too-small lanes, zero bits everywhere else. */
151 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
/* Merge in the positive overshoot of too-large lanes.
 * NOTE(review): the assignment target sits on a line that is not shown; presumably excess. */
153 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
/* Express the out-of-range distance in units of the interval width. */
155 excess = _mm256_div_ps(excess, distance);
/* Truncate toward zero through a round trip via 32-bit integers. */
157 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
/* One more interval width than the truncated count. */
159 adj = _mm256_set1_ps(1.0f);
160 excess = _mm256_add_ps(excess, adj);
/* Reuse adj as a per-lane sign: +1 for too-small, -1 for too-large, 0 for in-range lanes. */
162 adj = _mm256_and_ps(adj, is_smaller);
163 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
/* Signed correction in input units; in-range lanes are multiplied by 0. */
165 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
166 output = _mm256_add_ps(input, excess);
/* Aligned store: outPtr must be 32-byte aligned here. */
167 _mm256_store_ps(outPtr, output);
/* Arguments of the follow-up call that covers the num_points % 8 leftover samples.
 * NOTE(review): the callee name is not shown; presumably the scalar kernel. */
173 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
179#include <xmmintrin.h>
/**
 * Wrap every input sample into [lower_bound, upper_bound] (SSE2, unaligned buffers).
 *
 * Samples already inside the interval are copied unchanged. A sample outside it
 * is shifted by (trunc(d / width) + 1) interval widths toward the interval, where
 * d is its distance to the violated bound and width = upper_bound - lower_bound.
 *
 * Four floats are processed per iteration; the remaining num_points % 4 samples
 * are handed to the generic kernel.
 *
 * @param outputVector  destination buffer, num_points floats, no alignment required
 * @param inputVector   source buffer, num_points floats, no alignment required
 * @param lower_bound   lower end of the target interval
 * @param upper_bound   upper end of the target interval; assumed greater than
 *                      lower_bound (a zero width divides by zero) - TODO confirm
 *                      this is guaranteed by callers
 * @param num_points    number of samples to process
 *
 * NOTE(review): the width count goes through a 32-bit integer conversion, so
 * samples more than about 2^31 interval widths out of range are not wrapped
 * correctly.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
                                                           const float* inputVector,
                                                           const float lower_bound,
                                                           const float upper_bound,
                                                           unsigned int num_points)
{
    const __m128 lower = _mm_set_ps1(lower_bound);
    const __m128 upper = _mm_set_ps1(upper_bound);
    const __m128 distance = _mm_sub_ps(upper, lower);
    __m128 input, output;
    __m128 is_smaller, is_bigger;
    __m128 excess, adj;

    const float* inPtr = inputVector;
    float* outPtr = outputVector;
    const size_t quarter_points = num_points / 4;
    for (size_t counter = 0; counter < quarter_points; counter++) {
        /* This is the unaligned kernel, so the buffers carry no 16-byte alignment
         * guarantee: _mm_load_ps would fault on such data, use the unaligned load. */
        input = _mm_loadu_ps(inPtr);
        /* Lane masks (all bits set where true): input < lower, input > upper. */
        is_smaller = _mm_cmplt_ps(input, lower);
        is_bigger = _mm_cmpgt_ps(input, upper);
        /* Positive out-of-range distance per lane, zero for in-range lanes. */
        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
        /* Distance in units of the interval width, truncated toward zero. */
        excess = _mm_div_ps(excess, distance);
        excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
        /* One more interval width than the truncated count. */
        adj = _mm_set_ps1(1.0f);
        excess = _mm_add_ps(excess, adj);
        /* Reuse adj as a per-lane sign: +1 too small, -1 too large, 0 in range. */
        adj = _mm_and_ps(adj, is_smaller);
        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
        /* Signed correction in input units; in-range lanes are multiplied by 0. */
        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
        output = _mm_add_ps(input, excess);
        /* Unaligned store for the same reason as the load above. */
        _mm_storeu_ps(outPtr, output);
        inPtr += 4;
        outPtr += 4;
    }

    /* Scalar fallback for the samples that do not fill a whole vector. */
    volk_32f_s32f_s32f_mod_range_32f_generic(
        outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
}
/*
 * Aligned SSE2 kernel (parameter list and loop body): handles four floats per
 * iteration and moves out-of-range lanes by whole multiples of
 * (upper_bound - lower_bound). The aligned load/store intrinsics used here
 * require inputVector and outputVector to be 16-byte aligned.
 * NOTE(review): the embedded line numbers skip in several places (e.g. 236 -> 239,
 * 264 -> 270), so the declaration of excess/adj, the pointer advances and the
 * name of the final call are not shown in this excerpt.
 */
227 const float* inputVector,
228 const float lower_bound,
229 const float upper_bound,
230 unsigned int num_points)
/* Broadcast both bounds and the interval width to all four lanes. */
232 const __m128 lower = _mm_set_ps1(lower_bound);
233 const __m128 upper = _mm_set_ps1(upper_bound);
234 const __m128 distance = _mm_sub_ps(upper, lower);
235 __m128 input, output;
236 __m128 is_smaller, is_bigger;
239 const float* inPtr = inputVector;
240 float* outPtr = outputVector;
/* Number of complete 4-float groups covered by the vector loop. */
241 const size_t quarter_points = num_points / 4;
242 for (
size_t counter = 0; counter < quarter_points; counter++) {
/* Aligned load: inPtr must be 16-byte aligned here. */
243 input = _mm_load_ps(inPtr);
/* Lane masks (all bits set where true): input < lower, input > upper. */
245 is_smaller = _mm_cmplt_ps(input, lower);
246 is_bigger = _mm_cmpgt_ps(input, upper);
/* Positive out-of-range distance per lane, zero bits for in-range lanes. */
248 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
249 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
/* Express the out-of-range distance in units of the interval width. */
251 excess = _mm_div_ps(excess, distance);
/* Truncate toward zero through a round trip via 32-bit integers. */
254 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
/* One more interval width than the truncated count. */
256 adj = _mm_set_ps1(1.0f);
257 excess = _mm_add_ps(excess, adj);
/* Reuse adj as a per-lane sign: +1 for too-small, -1 for too-large, 0 for in-range lanes. */
259 adj = _mm_and_ps(adj, is_smaller);
260 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
/* Signed correction in input units; in-range lanes are multiplied by 0. */
262 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
263 output = _mm_add_ps(input, excess);
/* Aligned store: outPtr must be 16-byte aligned here. */
264 _mm_store_ps(outPtr, output);
/* Arguments of the follow-up call that covers the num_points % 4 leftover samples.
 * NOTE(review): the callee name is not shown; presumably the scalar kernel. */
270 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
275#include <xmmintrin.h>
/**
 * Wrap every input sample into [lower_bound, upper_bound] (SSE, unaligned buffers).
 *
 * Samples already inside the interval are copied unchanged. A sample outside it
 * is shifted by (trunc(d / width) + 1) interval widths toward the interval, where
 * d is its distance to the violated bound and width = upper_bound - lower_bound.
 *
 * Four floats are processed per iteration; the remaining num_points % 4 samples
 * are handed to the generic kernel.
 *
 * @param outputVector  destination buffer, num_points floats, no alignment required
 * @param inputVector   source buffer, num_points floats, no alignment required
 * @param lower_bound   lower end of the target interval
 * @param upper_bound   upper end of the target interval; assumed greater than
 *                      lower_bound (a zero width divides by zero) - TODO confirm
 *                      this is guaranteed by callers
 * @param num_points    number of samples to process
 *
 * NOTE(review): _mm_cvttps_epi32 / _mm_cvtepi32_ps and __m128i are SSE2
 * facilities; confirm that the build guard for this "sse" kernel really implies
 * SSE2 support.
 * NOTE(review): the width count goes through a 32-bit integer conversion, so
 * samples more than about 2^31 interval widths out of range are not wrapped
 * correctly.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
                                                          const float* inputVector,
                                                          const float lower_bound,
                                                          const float upper_bound,
                                                          unsigned int num_points)
{
    const __m128 lower = _mm_set_ps1(lower_bound);
    const __m128 upper = _mm_set_ps1(upper_bound);
    const __m128 distance = _mm_sub_ps(upper, lower);
    __m128 input, output;
    __m128 is_smaller, is_bigger;
    __m128 excess, adj;
    __m128i rounddown;

    const float* inPtr = inputVector;
    float* outPtr = outputVector;
    const size_t quarter_points = num_points / 4;
    for (size_t counter = 0; counter < quarter_points; counter++) {
        /* This is the unaligned kernel, so the buffers carry no 16-byte alignment
         * guarantee: _mm_load_ps would fault on such data, use the unaligned load. */
        input = _mm_loadu_ps(inPtr);
        /* Lane masks (all bits set where true): input < lower, input > upper. */
        is_smaller = _mm_cmplt_ps(input, lower);
        is_bigger = _mm_cmpgt_ps(input, upper);
        /* Positive out-of-range distance per lane, zero for in-range lanes. */
        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
        /* Distance in units of the interval width, truncated toward zero. */
        excess = _mm_div_ps(excess, distance);
        rounddown = _mm_cvttps_epi32(excess);
        excess = _mm_cvtepi32_ps(rounddown);
        /* One more interval width than the truncated count. */
        adj = _mm_set_ps1(1.0f);
        excess = _mm_add_ps(excess, adj);
        /* Reuse adj as a per-lane sign: +1 too small, -1 too large, 0 in range. */
        adj = _mm_and_ps(adj, is_smaller);
        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
        /* Signed correction in input units; in-range lanes are multiplied by 0. */
        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
        output = _mm_add_ps(input, excess);
        /* Unaligned store for the same reason as the load above. */
        _mm_storeu_ps(outPtr, output);
        inPtr += 4;
        outPtr += 4;
    }

    /* Scalar fallback for the samples that do not fill a whole vector. */
    volk_32f_s32f_s32f_mod_range_32f_generic(
        outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
}
/*
 * Aligned SSE kernel (parameter list and loop body): handles four floats per
 * iteration and moves out-of-range lanes by whole multiples of
 * (upper_bound - lower_bound). The aligned load/store intrinsics used here
 * require inputVector and outputVector to be 16-byte aligned.
 * NOTE(review): the embedded line numbers skip in several places (e.g. 334 -> 338,
 * 363 -> 369), so the declarations of excess/adj/rounddown, the pointer advances
 * and the name of the final call are not shown in this excerpt.
 */
325 const float* inputVector,
326 const float lower_bound,
327 const float upper_bound,
328 unsigned int num_points)
/* Broadcast both bounds and the interval width to all four lanes. */
330 const __m128 lower = _mm_set_ps1(lower_bound);
331 const __m128 upper = _mm_set_ps1(upper_bound);
332 const __m128 distance = _mm_sub_ps(upper, lower);
333 __m128 input, output;
334 __m128 is_smaller, is_bigger;
338 const float* inPtr = inputVector;
339 float* outPtr = outputVector;
/* Number of complete 4-float groups covered by the vector loop. */
340 const size_t quarter_points = num_points / 4;
341 for (
size_t counter = 0; counter < quarter_points; counter++) {
/* Aligned load: inPtr must be 16-byte aligned here. */
342 input = _mm_load_ps(inPtr);
/* Lane masks (all bits set where true): input < lower, input > upper. */
344 is_smaller = _mm_cmplt_ps(input, lower);
345 is_bigger = _mm_cmpgt_ps(input, upper);
/* Positive out-of-range distance per lane, zero bits for in-range lanes. */
347 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
348 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
/* Express the out-of-range distance in units of the interval width. */
350 excess = _mm_div_ps(excess, distance);
/* Truncate toward zero through a round trip via 32-bit integers.
 * NOTE(review): both conversion intrinsics are SSE2; confirm the build guard of
 * this "sse" kernel implies SSE2 support. */
352 rounddown = _mm_cvttps_epi32(excess);
353 excess = _mm_cvtepi32_ps(rounddown);
/* One more interval width than the truncated count. */
355 adj = _mm_set_ps1(1.0f);
356 excess = _mm_add_ps(excess, adj);
/* Reuse adj as a per-lane sign: +1 for too-small, -1 for too-large, 0 for in-range lanes. */
358 adj = _mm_and_ps(adj, is_smaller);
359 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
/* Signed correction in input units; in-range lanes are multiplied by 0. */
361 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
362 output = _mm_add_ps(input, excess);
/* Aligned store: outPtr must be 16-byte aligned here. */
363 _mm_store_ps(outPtr, output);
/* Arguments of the follow-up call that covers the num_points % 4 leftover samples.
 * NOTE(review): the callee name is not shown; presumably the scalar kernel. */
369 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:277
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:127
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:324
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:181
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:48
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:79
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:226