Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
volk_16i_x4_quad_max_star_16i.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H

#include <inttypes.h>
#include <stdio.h>

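/*
 * Overview (descriptive summary of the implementations below): for each of
 * the num_points input elements, the kernel selects between src0[i] and
 * src1[i], selects between src2[i] and src3[i], and then selects between the
 * two winners, writing the final value to target[i]. Every selection is
 * decided by the sign of the 16-bit difference of the two candidates, so
 * pairs whose difference overflows a signed short may not yield the true
 * maximum.
 */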
#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
                                                         short* src0,
                                                         short* src1,
                                                         short* src2,
                                                         short* src3,
                                                         unsigned int num_points)
{
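    /* Each 128-bit iteration below consumes 8 int16 values from every input;
     * the (num_points % 8) leftover elements are handled by the scalar tail
     * loop at the end of the function. */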
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = (num_bytes >> 4);
    int bound_copy = bound;
    int leftovers = (num_bytes >> 1) & 7;

    __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
    p_target = (__m128i*)target;
    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;

    __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

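    /* Branch-free selection: _mm_cmpgt_epi16 turns the sign of the pairwise
     * difference into an all-ones/all-zeros mask, AND/ANDNOT pick the winning
     * operand from each pair, and a second round of the same trick picks the
     * overall winner of the two intermediate results. */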
    while (bound_copy > 0) {
        xmm1 = _mm_load_si128(p_src0);
        xmm2 = _mm_load_si128(p_src1);
        xmm3 = _mm_load_si128(p_src2);
        xmm4 = _mm_load_si128(p_src3);

        xmm5 = _mm_setzero_si128();
        xmm6 = _mm_setzero_si128();
        xmm7 = xmm1;
        xmm8 = xmm3;

        xmm1 = _mm_sub_epi16(xmm2, xmm1);

        xmm3 = _mm_sub_epi16(xmm4, xmm3);

        xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
        xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);

        xmm2 = _mm_and_si128(xmm5, xmm2);
        xmm4 = _mm_and_si128(xmm6, xmm4);
        xmm5 = _mm_andnot_si128(xmm5, xmm7);
        xmm6 = _mm_andnot_si128(xmm6, xmm8);

        xmm5 = _mm_add_epi16(xmm2, xmm5);
        xmm6 = _mm_add_epi16(xmm4, xmm6);

        xmm1 = _mm_xor_si128(xmm1, xmm1);
        xmm2 = xmm5;
        xmm5 = _mm_sub_epi16(xmm6, xmm5);
        p_src0 += 1;
        bound_copy -= 1;

        xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
        p_src1 += 1;

        xmm6 = _mm_and_si128(xmm1, xmm6);

        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        p_src2 += 1;

        xmm1 = _mm_add_epi16(xmm6, xmm1);
        p_src3 += 1;

        _mm_store_si128(p_target, xmm1);
        p_target += 1;
    }

    /*__VOLK_ASM __VOLK_VOLATILE
      (
      "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
      "cmp $0, %[bound]\n\t"
      "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"

      "movaps (%[src0]), %%xmm1\n\t"
      "movaps (%[src1]), %%xmm2\n\t"
      "movaps (%[src2]), %%xmm3\n\t"
      "movaps (%[src3]), %%xmm4\n\t"

      "pxor %%xmm5, %%xmm5\n\t"
      "pxor %%xmm6, %%xmm6\n\t"
      "movaps %%xmm1, %%xmm7\n\t"
      "movaps %%xmm3, %%xmm8\n\t"
      "psubw %%xmm2, %%xmm1\n\t"
      "psubw %%xmm4, %%xmm3\n\t"

      "pcmpgtw %%xmm1, %%xmm5\n\t"
      "pcmpgtw %%xmm3, %%xmm6\n\t"

      "pand %%xmm5, %%xmm2\n\t"
      "pand %%xmm6, %%xmm4\n\t"
      "pandn %%xmm7, %%xmm5\n\t"
      "pandn %%xmm8, %%xmm6\n\t"

      "paddw %%xmm2, %%xmm5\n\t"
      "paddw %%xmm4, %%xmm6\n\t"

      "pxor %%xmm1, %%xmm1\n\t"
      "movaps %%xmm5, %%xmm2\n\t"

      "psubw %%xmm6, %%xmm5\n\t"
      "add $16, %[src0]\n\t"
      "add $-1, %[bound]\n\t"

      "pcmpgtw %%xmm5, %%xmm1\n\t"
      "add $16, %[src1]\n\t"

      "pand %%xmm1, %%xmm6\n\t"

      "pandn %%xmm2, %%xmm1\n\t"
      "add $16, %[src2]\n\t"

      "paddw %%xmm6, %%xmm1\n\t"
      "add $16, %[src3]\n\t"

      "movaps %%xmm1, (%[target])\n\t"
      "addw $16, %[target]\n\t"
      "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"

      "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
      :
      :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
      [src3]"r"(src3), [target]"r"(target)
      :
      );
    */

    short temp0 = 0;
    short temp1 = 0;
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
    return;
}

#endif /*LV_HAVE_SSE2*/

#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
                                                       short* src0,
                                                       short* src1,
                                                       short* src2,
                                                       short* src3,
                                                       unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned i;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
    int16x8_t diff12, diff34;
    int16x8_t comp0, comp1, comp2, comp3;
    int16x8_t result1_vec, result2_vec;
    int16x8_t zeros;
    zeros = vdupq_n_s16(0);
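    /* Same mask-and-add selection as the SSE2 path: vcgeq/vcltq produce
     * complementary masks from the sign of the difference, each candidate is
     * ANDed with its mask, and adding the two masked vectors keeps only the
     * selected elements. */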
    for (i = 0; i < eighth_points; ++i) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        diff12 = vsubq_s16(src0_vec, src1_vec);
        diff34 = vsubq_s16(src2_vec, src3_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
        comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
        comp0 = vandq_s16(src0_vec, comp0);
        comp1 = vandq_s16(src1_vec, comp1);
        comp2 = vandq_s16(src2_vec, comp2);
        comp3 = vandq_s16(src3_vec, comp3);

        result1_vec = vaddq_s16(comp0, comp1);
        result2_vec = vaddq_s16(comp2, comp3);

        diff12 = vsubq_s16(result1_vec, result2_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp0 = vandq_s16(result1_vec, comp0);
        comp1 = vandq_s16(result2_vec, comp1);
        result1_vec = vaddq_s16(comp0, comp1);
        vst1q_s16(target, result1_vec);
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    short temp0 = 0;
    short temp1 = 0;
    for (i = eighth_points * 8; i < num_points; ++i) {
        temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
#endif /* LV_HAVE_NEON */

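/* Plain-C reference implementation; in VOLK the generic protokernel is what
 * the dispatcher typically falls back to when no architecture-specific
 * version applies. */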
#ifdef LV_HAVE_GENERIC
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
                                                          short* src0,
                                                          short* src1,
                                                          short* src2,
                                                          short* src3,
                                                          unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = num_bytes >> 1;

    short temp0 = 0;
    short temp1 = 0;
    for (i = 0; i < bound; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}

#endif /*LV_HAVE_GENERIC*/
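
/*
 * Usage sketch (illustrative only, not part of the original header): one
 * plausible way to call this kernel through the generated VOLK dispatcher.
 * It assumes <volk/volk.h> provides volk_16i_x4_quad_max_star_16i() as well
 * as volk_malloc(), volk_free() and volk_get_alignment(); the buffer size and
 * fill values are made up for the example.
 *
 * #include <volk/volk.h>
 * #include <stdio.h>
 *
 * int main(void)
 * {
 *     const unsigned int num_points = 64;
 *     size_t alignment = volk_get_alignment();
 *     short* target = (short*)volk_malloc(num_points * sizeof(short), alignment);
 *     short* src0 = (short*)volk_malloc(num_points * sizeof(short), alignment);
 *     short* src1 = (short*)volk_malloc(num_points * sizeof(short), alignment);
 *     short* src2 = (short*)volk_malloc(num_points * sizeof(short), alignment);
 *     short* src3 = (short*)volk_malloc(num_points * sizeof(short), alignment);
 *
 *     for (unsigned int i = 0; i < num_points; ++i) {
 *         src0[i] = (short)i;
 *         src1[i] = (short)(num_points - i);
 *         src2[i] = (short)(i / 2);
 *         src3[i] = (short)(3 * i);
 *     }
 *
 *     // target[i] receives the element selected from the four inputs
 *     volk_16i_x4_quad_max_star_16i(target, src0, src1, src2, src3, num_points);
 *
 *     printf("target[0] = %d\n", target[0]);
 *
 *     volk_free(target);
 *     volk_free(src0);
 *     volk_free(src1);
 *     volk_free(src2);
 *     volk_free(src3);
 *     return 0;
 * }
 */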

#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/