Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_16i_max_star_horizontal_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
53#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
54#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
55
56#include <volk/volk_common.h>
57
58#include <inttypes.h>
59#include <stdio.h>
60
61
62#ifdef LV_HAVE_SSSE3
63
64#include <emmintrin.h>
65#include <tmmintrin.h>
66#include <xmmintrin.h>
67
/*
 * SSSE3 implementation for 16-byte aligned buffers.
 *
 * src0 is read as consecutive (a, b) pairs of int16_t; for each pair one
 * value is written to target: a when the 16-bit wrapped difference a - b is
 * not negative, b otherwise. This gives the same results as the scalar
 * expression used for the tail, ((int16_t)(a - b) > 0) ? a : b, because both
 * choices are equal when the difference is zero.
 *
 * target      output, num_points / 2 values; accessed with aligned 128-bit
 *             stores in the main loop, so it must be 16-byte aligned.
 * src0        input, num_points values; accessed with aligned 128-bit loads,
 *             so it must be 16-byte aligned.
 * num_points  number of int16_t values in src0 (not the number of pairs).
 *             NOTE(review): an odd count makes the scalar tail read one
 *             element past the end of src0, as in the original code.
 */
static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    /* pshufb controls. Indices 0,1,4,5,8,9,12,13 gather the first element of
     * every pair; a control byte of 0xff (high bit set) yields a zero byte.
     * shufmask0 packs into the low half of the result, shufmask1 into the
     * high half, so the two shuffled vectors can simply be added together. */
    static const uint8_t shufmask0[16] = { 0x00, 0x01, 0x04, 0x05, 0x08, 0x09,
                                           0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff,
                                           0xff, 0xff, 0xff, 0xff };
    static const uint8_t shufmask1[16] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                           0xff, 0xff, 0x00, 0x01, 0x04, 0x05,
                                           0x08, 0x09, 0x0c, 0x0d };
    /* Added to the shuffle control where the difference is negative: moving
     * each byte index forward by 2 selects the second element of the pair. */
    static const uint8_t andmask0[16] = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
                                          0x02, 0x02, 0x00, 0x00, 0x00, 0x00,
                                          0x00, 0x00, 0x00, 0x00 };
    static const uint8_t andmask1[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                                          0x00, 0x00, 0x02, 0x02, 0x02, 0x02,
                                          0x02, 0x02, 0x02, 0x02 };

    /* The tables are plain uint8_t arrays with no alignment guarantee, so
     * they are fetched with unaligned loads. */
    const __m128i shuf_lo = _mm_loadu_si128((const __m128i*)shufmask0);
    const __m128i shuf_hi = _mm_loadu_si128((const __m128i*)shufmask1);
    const __m128i step_lo = _mm_loadu_si128((const __m128i*)andmask0);
    const __m128i step_hi = _mm_loadu_si128((const __m128i*)andmask1);
    const __m128i zero = _mm_setzero_si128();

    /* Work split, computed directly from the element count:
     *   bound        - iterations consuming 16 inputs (two vectors) each
     *   intermediate - 1 if one more full vector of 8 inputs remains
     *   leftovers    - remaining inputs (< 8) handled by the scalar loop */
    const unsigned int bound = num_points >> 4;
    const unsigned int intermediate = (num_points >> 3) & 1u;
    const unsigned int leftovers = num_points & 7u;
    const unsigned int tail_start = (bound << 4) + (intermediate << 3);

    const __m128i* p_src0 = (const __m128i*)src0;
    __m128i* p_target = (__m128i*)target;
    unsigned int i;

    for (i = 0; i < bound; ++i) {
        const __m128i in_lo = _mm_load_si128(p_src0);
        const __m128i in_hi = _mm_load_si128(p_src0 + 1);

        /* Lanes 0-3: pair differences of in_lo; lanes 4-7: those of in_hi. */
        const __m128i diff = _mm_hsub_epi16(in_lo, in_hi);
        /* All-ones in every lane whose difference is negative. */
        const __m128i is_neg = _mm_cmpgt_epi16(zero, diff);

        const __m128i ctl_lo = _mm_add_epi8(_mm_and_si128(is_neg, step_lo), shuf_lo);
        const __m128i ctl_hi = _mm_add_epi8(_mm_and_si128(is_neg, step_hi), shuf_hi);

        const __m128i out_lo = _mm_shuffle_epi8(in_lo, ctl_lo);
        const __m128i out_hi = _mm_shuffle_epi8(in_hi, ctl_hi);

        /* The halves occupy disjoint lanes, so the add merely merges them. */
        _mm_store_si128(p_target, _mm_add_epi16(out_lo, out_hi));

        p_src0 += 2;
        p_target += 1;
    }

    if (intermediate) {
        const __m128i in_lo = _mm_load_si128(p_src0);

        /* Only lanes 0-3 are consumed below, so the same vector is passed as
         * both operands rather than a register that may never have been
         * written. */
        const __m128i diff = _mm_hsub_epi16(in_lo, in_lo);
        const __m128i is_neg = _mm_cmpgt_epi16(zero, diff);
        const __m128i ctl_lo = _mm_add_epi8(_mm_and_si128(is_neg, step_lo), shuf_lo);

        /* Four results = 8 bytes: store just the low half of the vector. */
        _mm_storel_epi64(p_target, _mm_shuffle_epi8(in_lo, ctl_lo));
    }

    /* Scalar tail for the last (fewer than 8) inputs. */
    for (i = 0; i < leftovers; i += 2) {
        const unsigned int idx = tail_start + i;
        target[idx >> 1] =
            ((int16_t)(src0[idx] - src0[idx + 1]) > 0) ? src0[idx] : src0[idx + 1];
    }
}
166
167#endif /*LV_HAVE_SSSE3*/
168
169#ifdef LV_HAVE_NEON
170
171#include <arm_neon.h>
/*
 * NEON implementation.
 *
 * src0 is read as consecutive (a, b) pairs of int16_t; each pair produces one
 * output value in target: a when the 16-bit difference a - b is >= 0, b
 * otherwise. num_points is the number of int16_t values in src0.
 */
static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
                                                         int16_t* src0,
                                                         unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 16;
    const unsigned int remainder = num_points % 16;
    const int16x8_t zero = vdupq_n_s16(0);
    unsigned int blk;
    unsigned int k;

    /* 16 inputs (8 pairs) in, 8 outputs out per iteration. */
    for (blk = 0; blk < full_blocks; ++blk) {
        /* De-interleaving load: val[0] holds the first element of every pair,
         * val[1] the second. */
        const int16x8x2_t pairs = vld2q_s16(src0);
        const int16x8_t delta = vsubq_s16(pairs.val[0], pairs.val[1]);
        /* All-ones lanes where the first element is to be kept. */
        const uint16x8_t keep_first = vcgeq_s16(delta, zero);

        /* Lane-wise select: first element where the mask is set, second
         * element everywhere else. */
        vst1q_s16(target, vbslq_s16(keep_first, pairs.val[0], pairs.val[1]));

        src0 += 16;
        target += 8;
    }

    /* Scalar handling of whatever is left; both pointers were advanced past
     * the vectorised part above. */
    for (k = 0; k < remainder; k += 2) {
        if ((int16_t)(src0[k] - src0[k + 1]) > 0) {
            target[k >> 1] = src0[k];
        } else {
            target[k >> 1] = src0[k + 1];
        }
    }
}
203#endif /* LV_HAVE_NEON */
204
205#ifdef LV_HAVE_NEONV7
/* Prototype only: this variant is declared extern, so its body is not in this
 * header (presumably a hand-written NEON assembly source, going by the
 * "neonasm" suffix -- confirm against the build). It takes the same
 * (target, src0, num_points) argument list as the other variants declared in
 * this file. */
206extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
207 int16_t* src0,
208 unsigned int num_points);
209#endif /* LV_HAVE_NEONV7 */
210
211#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar implementation.
 *
 * src0 is read as consecutive (a, b) pairs of int16_t. For each pair one
 * value is written to target: a if (int16_t)(a - b) > 0, otherwise b. The
 * comparison is made on the difference narrowed back to 16 bits, so it is not
 * a plain max() for pairs whose true difference does not fit in int16_t.
 *
 * target      output, one value per input pair.
 * src0        input, num_points values.
 * num_points  number of int16_t values in src0 (not the number of pairs).
 *             NOTE(review): an odd count makes the last iteration read one
 *             element past the end of src0, as in the original code.
 */
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    unsigned int i;

    /* Index with the unsigned element count itself: going through a byte
     * count and a signed int bound would wrap/truncate for very large
     * num_points. */
    for (i = 0; i < num_points; i += 2) {
        target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}
226
227#endif /*LV_HAVE_GENERIC*/
228
229#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
static void volk_16i_max_star_horizontal_16i_neon(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:172
static void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:68
static void volk_16i_max_star_horizontal_16i_generic(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:212
#define bit128_p(x)
Definition: volk_common.h:142
for i
Definition: volk_config_fixed.tmpl.h:25