Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
volk_8u_x2_encodeframepolar_8u.h
1/* -*- c++ -*- */
2/*
3 * Copyright 2015 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
23/*
24 * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
25 */
26
27#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
28#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
29#include <string.h>
30
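/*
 * Branchless log2 for power-of-two inputs, using the bit-hack referenced
 * below. Illustrative values: log2_of_power_of_2(2) == 1 and
 * log2_of_power_of_2(1024) == 10. The result is only meaningful when 'val'
 * is a power of two, which holds for every valid polar frame size.
 */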
31static inline unsigned int log2_of_power_of_2(unsigned int val)
32{
33 // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
34 static const unsigned int b[] = {
35 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
36 };
37
38 unsigned int res = (val & b[0]) != 0;
39 res |= ((val & b[4]) != 0) << 4;
40 res |= ((val & b[3]) != 0) << 3;
41 res |= ((val & b[2]) != 0) << 2;
42 res |= ((val & b[1]) != 0) << 1;
43 return res;
44}
45
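/*
 * One encoder butterfly stage. For every branch and every position 'bit'
 * within the lower half of that branch:
 *   frame[bit]              = temp[2 * bit] ^ temp[2 * bit + 1]
 *   frame[bit + frame_half] = temp[2 * bit + 1]
 * i.e. the 2x2 polar kernel G_2 = [[1, 0], [1, 1]] applied to consecutive
 * input pairs, with the two outputs split into the lower and upper half of
 * the branch.
 */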
46static inline void encodepolar_single_stage(unsigned char* frame_ptr,
47 const unsigned char* temp_ptr,
48 const unsigned int num_branches,
49 const unsigned int frame_half)
50{
51 unsigned int branch, bit;
52 for (branch = 0; branch < num_branches; ++branch) {
53 for (bit = 0; bit < frame_half; ++bit) {
54 *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
55 *(frame_ptr + frame_half) = *(temp_ptr + 1);
56 ++frame_ptr;
57 temp_ptr += 2;
58 }
59 frame_ptr += frame_half;
60 }
61}
62
63#ifdef LV_HAVE_GENERIC
64
65static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
66 unsigned char* temp,
67 unsigned int frame_size)
68{
69 unsigned int stage = log2_of_power_of_2(frame_size);
70 unsigned int frame_half = frame_size >> 1;
71 unsigned int num_branches = 1;
72
73 while (stage) {
74 // encode stage
75 encodepolar_single_stage(frame, temp, num_branches, frame_half);
76 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
77
78 // update all the parameters.
79 num_branches = num_branches << 1;
80 frame_half = frame_half >> 1;
81 --stage;
82 }
83}
84#endif /* LV_HAVE_GENERIC */
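/*
 * Illustrative usage sketch (not normative; the kernel documentation lives
 * in 'volk_8u_x3_encodepolar_8u_x2.h'). It assumes the generated dispatcher
 * volk_8u_x2_encodeframepolar_8u() and volk_malloc()/volk_get_alignment()/
 * volk_free() from <volk/volk.h>; frame_size must be a power of two.
 *
 *   unsigned int frame_size = 1024;
 *   size_t alignment = volk_get_alignment();
 *   unsigned char* frame = (unsigned char*)volk_malloc(frame_size, alignment);
 *   unsigned char* temp = (unsigned char*)volk_malloc(frame_size, alignment);
 *   // fill 'temp' with the frame bits to encode (one value per byte), then:
 *   volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);
 *   // 'frame' now holds the encoded frame; 'temp' is clobbered as scratch.
 *   volk_free(temp);
 *   volk_free(frame);
 */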
85
86#ifdef LV_HAVE_SSSE3
87#include <tmmintrin.h>
88
89static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
90 unsigned char* temp,
91 unsigned int frame_size)
92{
93 const unsigned int po2 = log2_of_power_of_2(frame_size);
94
95 unsigned int stage = po2;
96 unsigned char* frame_ptr = frame;
97 unsigned char* temp_ptr = temp;
98
99 unsigned int frame_half = frame_size >> 1;
100 unsigned int num_branches = 1;
101 unsigned int branch;
102 unsigned int bit;
103
104 // prepare constants
105 const __m128i mask_stage1 = _mm_set_epi8(0x0,
106 0xFF,
107 0x0,
108 0xFF,
109 0x0,
110 0xFF,
111 0x0,
112 0xFF,
113 0x0,
114 0xFF,
115 0x0,
116 0xFF,
117 0x0,
118 0xFF,
119 0x0,
120 0xFF);
121
122 // get some SIMD registers to play with.
123 __m128i r_frame0, r_temp0, shifted;
124
125 {
126 __m128i r_frame1, r_temp1;
127 const __m128i shuffle_separate =
128 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
129
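 // Stages with frame_half >= 16: every 16-byte load covers 8 input pairs.
 // XOR-ing each even byte with its odd neighbour leaves u0 ^ u1 in the
 // even slots and u1 in the odd slots; shuffle_separate then gathers the
 // XOR results into the low 8 bytes and the pass-through values into the
 // high 8 bytes, so the 64-bit unpacks can write both half-branches at once.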
130 while (stage > 4) {
131 frame_ptr = frame;
132 temp_ptr = temp;
133
 134 // For stage = 5 a branch has 32 elements, so higher stages are even larger.
135 for (branch = 0; branch < num_branches; ++branch) {
136 for (bit = 0; bit < frame_half; bit += 16) {
137 r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
138 temp_ptr += 16;
139 r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
140 temp_ptr += 16;
141
142 shifted = _mm_srli_si128(r_temp0, 1);
143 shifted = _mm_and_si128(shifted, mask_stage1);
144 r_temp0 = _mm_xor_si128(shifted, r_temp0);
145 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
146
147 shifted = _mm_srli_si128(r_temp1, 1);
148 shifted = _mm_and_si128(shifted, mask_stage1);
149 r_temp1 = _mm_xor_si128(shifted, r_temp1);
150 r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
151
152 r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
153 _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
154
155 r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
156 _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
157 frame_ptr += 16;
158 }
159
160 frame_ptr += frame_half;
161 }
162 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
163
164 num_branches = num_branches << 1;
165 frame_half = frame_half >> 1;
166 stage--;
167 }
168 }
169
 170 // This last part requires frames of at least 16 bytes (elements).
 171 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
172
173 // reset pointers to correct positions.
174 frame_ptr = frame;
175 temp_ptr = temp;
176
177 // prefetch first chunk
178 __VOLK_PREFETCH(temp_ptr);
179
180 const __m128i shuffle_stage4 =
181 _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
182 const __m128i mask_stage4 = _mm_set_epi8(0x0,
183 0x0,
184 0x0,
185 0x0,
186 0x0,
187 0x0,
188 0x0,
189 0x0,
190 0xFF,
191 0xFF,
192 0xFF,
193 0xFF,
194 0xFF,
195 0xFF,
196 0xFF,
197 0xFF);
198 const __m128i mask_stage3 = _mm_set_epi8(0x0,
199 0x0,
200 0x0,
201 0x0,
202 0xFF,
203 0xFF,
204 0xFF,
205 0xFF,
206 0x0,
207 0x0,
208 0x0,
209 0x0,
210 0xFF,
211 0xFF,
212 0xFF,
213 0xFF);
214 const __m128i mask_stage2 = _mm_set_epi8(0x0,
215 0x0,
216 0xFF,
217 0xFF,
218 0x0,
219 0x0,
220 0xFF,
221 0xFF,
222 0x0,
223 0x0,
224 0xFF,
225 0xFF,
226 0x0,
227 0x0,
228 0xFF,
229 0xFF);
230
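 // Stages 4..1 are finished entirely inside one 16-byte register per
 // branch: shuffle_stage4 applies the length-16 bit-reversal permutation,
 // then each shift/AND/XOR triple below performs one butterfly stage, with
 // mask_stageN keeping only the lower half of every 2^N-byte block.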
231 for (branch = 0; branch < num_branches; ++branch) {
232 r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
233
234 // prefetch next chunk
235 temp_ptr += 16;
236 __VOLK_PREFETCH(temp_ptr);
237
238 // shuffle once for bit-reversal.
239 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
240
241 shifted = _mm_srli_si128(r_temp0, 8);
242 shifted = _mm_and_si128(shifted, mask_stage4);
243 r_frame0 = _mm_xor_si128(shifted, r_temp0);
244
245 shifted = _mm_srli_si128(r_frame0, 4);
246 shifted = _mm_and_si128(shifted, mask_stage3);
247 r_frame0 = _mm_xor_si128(shifted, r_frame0);
248
249 shifted = _mm_srli_si128(r_frame0, 2);
250 shifted = _mm_and_si128(shifted, mask_stage2);
251 r_frame0 = _mm_xor_si128(shifted, r_frame0);
252
253 shifted = _mm_srli_si128(r_frame0, 1);
254 shifted = _mm_and_si128(shifted, mask_stage1);
255 r_frame0 = _mm_xor_si128(shifted, r_frame0);
256
257 // store result of chunk.
258 _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
259 frame_ptr += 16;
260 }
261}
262
263#endif /* LV_HAVE_SSSE3 */
264
265#ifdef LV_HAVE_AVX2
266#include <immintrin.h>
267
268static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
269 unsigned char* temp,
270 unsigned int frame_size)
271{
272 const unsigned int po2 = log2_of_power_of_2(frame_size);
273
274 unsigned int stage = po2;
275 unsigned char* frame_ptr = frame;
276 unsigned char* temp_ptr = temp;
277
278 unsigned int frame_half = frame_size >> 1;
279 unsigned int num_branches = 1;
280 unsigned int branch;
281 unsigned int bit;
282
283 // prepare constants
284 const __m256i mask_stage1 = _mm256_set_epi8(0x0,
285 0xFF,
286 0x0,
287 0xFF,
288 0x0,
289 0xFF,
290 0x0,
291 0xFF,
292 0x0,
293 0xFF,
294 0x0,
295 0xFF,
296 0x0,
297 0xFF,
298 0x0,
299 0xFF,
300 0x0,
301 0xFF,
302 0x0,
303 0xFF,
304 0x0,
305 0xFF,
306 0x0,
307 0xFF,
308 0x0,
309 0xFF,
310 0x0,
311 0xFF,
312 0x0,
313 0xFF,
314 0x0,
315 0xFF);
316
317 const __m128i mask_stage0 = _mm_set_epi8(0x0,
318 0xFF,
319 0x0,
320 0xFF,
321 0x0,
322 0xFF,
323 0x0,
324 0xFF,
325 0x0,
326 0xFF,
327 0x0,
328 0xFF,
329 0x0,
330 0xFF,
331 0x0,
332 0xFF);
333 // get some SIMD registers to play with.
334 __m256i r_frame0, r_temp0, shifted;
335 __m128i r_temp2, r_frame2, shifted2;
336 {
337 __m256i r_frame1, r_temp1;
338 __m128i r_frame3, r_temp3;
339 const __m256i shuffle_separate = _mm256_setr_epi8(0,
340 2,
341 4,
342 6,
343 8,
344 10,
345 12,
346 14,
347 1,
348 3,
349 5,
350 7,
351 9,
352 11,
353 13,
354 15,
355 0,
356 2,
357 4,
358 6,
359 8,
360 10,
361 12,
362 14,
363 1,
364 3,
365 5,
366 7,
367 9,
368 11,
369 13,
370 15);
371 const __m128i shuffle_separate128 =
372 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
373
374 while (stage > 4) {
375 frame_ptr = frame;
376 temp_ptr = temp;
377
 378 // For stage = 5 a branch has 32 elements, so higher stages are even larger.
379 for (branch = 0; branch < num_branches; ++branch) {
380 for (bit = 0; bit < frame_half; bit += 32) {
381 if ((frame_half - bit) <
 382 32) // only 16 bytes remain in this half-branch, not 32; use 128-bit ops
383 {
384 r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
385 temp_ptr += 16;
386 r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
387 temp_ptr += 16;
388
389 shifted2 = _mm_srli_si128(r_temp2, 1);
390 shifted2 = _mm_and_si128(shifted2, mask_stage0);
391 r_temp2 = _mm_xor_si128(shifted2, r_temp2);
392 r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
393
394 shifted2 = _mm_srli_si128(r_temp3, 1);
395 shifted2 = _mm_and_si128(shifted2, mask_stage0);
396 r_temp3 = _mm_xor_si128(shifted2, r_temp3);
397 r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
398
399 r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
400 _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
401
402 r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
403 _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
404 frame_ptr += 16;
405 break;
406 }
407 r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
408 temp_ptr += 32;
409 r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
410 temp_ptr += 32;
411
412 shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
413 shifted = _mm256_and_si256(shifted, mask_stage1);
414 r_temp0 = _mm256_xor_si256(shifted, r_temp0);
415 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
416
417 shifted = _mm256_srli_si256(r_temp1, 1);
418 shifted = _mm256_and_si256(shifted, mask_stage1);
419 r_temp1 = _mm256_xor_si256(shifted, r_temp1);
420 r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
421
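 // The 256-bit unpacks interleave the 64-bit halves per 128-bit lane;
 // permute4x64 with 0xd8 (qword order 0, 2, 1, 3) regroups them so each
 // half-branch is stored contiguously.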
422 r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
423 r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
424 r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
425 r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
426
427 _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
428
429 _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
430 frame_ptr += 32;
431 }
432
433 frame_ptr += frame_half;
434 }
435 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
436
437 num_branches = num_branches << 1;
438 frame_half = frame_half >> 1;
439 stage--;
440 }
441 }
442
 443 // This last part requires frames of at least 32 bytes (elements).
 444 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
445
446 // reset pointers to correct positions.
447 frame_ptr = frame;
448 temp_ptr = temp;
449
450 // prefetch first chunk
451 __VOLK_PREFETCH(temp_ptr);
452
453 const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
454 8,
455 4,
456 12,
457 2,
458 10,
459 6,
460 14,
461 1,
462 9,
463 5,
464 13,
465 3,
466 11,
467 7,
468 15,
469 0,
470 8,
471 4,
472 12,
473 2,
474 10,
475 6,
476 14,
477 1,
478 9,
479 5,
480 13,
481 3,
482 11,
483 7,
484 15);
485 const __m256i mask_stage4 = _mm256_set_epi8(0x0,
486 0x0,
487 0x0,
488 0x0,
489 0x0,
490 0x0,
491 0x0,
492 0x0,
493 0xFF,
494 0xFF,
495 0xFF,
496 0xFF,
497 0xFF,
498 0xFF,
499 0xFF,
500 0xFF,
501 0x0,
502 0x0,
503 0x0,
504 0x0,
505 0x0,
506 0x0,
507 0x0,
508 0x0,
509 0xFF,
510 0xFF,
511 0xFF,
512 0xFF,
513 0xFF,
514 0xFF,
515 0xFF,
516 0xFF);
517 const __m256i mask_stage3 = _mm256_set_epi8(0x0,
518 0x0,
519 0x0,
520 0x0,
521 0xFF,
522 0xFF,
523 0xFF,
524 0xFF,
525 0x0,
526 0x0,
527 0x0,
528 0x0,
529 0xFF,
530 0xFF,
531 0xFF,
532 0xFF,
533 0x0,
534 0x0,
535 0x0,
536 0x0,
537 0xFF,
538 0xFF,
539 0xFF,
540 0xFF,
541 0x0,
542 0x0,
543 0x0,
544 0x0,
545 0xFF,
546 0xFF,
547 0xFF,
548 0xFF);
549 const __m256i mask_stage2 = _mm256_set_epi8(0x0,
550 0x0,
551 0xFF,
552 0xFF,
553 0x0,
554 0x0,
555 0xFF,
556 0xFF,
557 0x0,
558 0x0,
559 0xFF,
560 0xFF,
561 0x0,
562 0x0,
563 0xFF,
564 0xFF,
565 0x0,
566 0x0,
567 0xFF,
568 0xFF,
569 0x0,
570 0x0,
571 0xFF,
572 0xFF,
573 0x0,
574 0x0,
575 0xFF,
576 0xFF,
577 0x0,
578 0x0,
579 0xFF,
580 0xFF);
581
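 // Same in-register stage-4..1 scheme as the SSSE3 kernel, applied per
 // 128-bit lane; each 256-bit load therefore finishes two 16-byte branches
 // at once, hence the num_branches / 2 loop count.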
582 for (branch = 0; branch < num_branches / 2; ++branch) {
583 r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
584
585 // prefetch next chunk
586 temp_ptr += 32;
587 __VOLK_PREFETCH(temp_ptr);
588
589 // shuffle once for bit-reversal.
590 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
591
592 shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
593 shifted = _mm256_and_si256(shifted, mask_stage4);
594 r_frame0 = _mm256_xor_si256(shifted, r_temp0);
595
596
597 shifted = _mm256_srli_si256(r_frame0, 4);
598 shifted = _mm256_and_si256(shifted, mask_stage3);
599 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
600
601 shifted = _mm256_srli_si256(r_frame0, 2);
602 shifted = _mm256_and_si256(shifted, mask_stage2);
603 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
604
605 shifted = _mm256_srli_si256(r_frame0, 1);
606 shifted = _mm256_and_si256(shifted, mask_stage1);
607 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
608
609 // store result of chunk.
610 _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
611 frame_ptr += 32;
612 }
613}
614#endif /* LV_HAVE_AVX2 */
615
616#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
617
618#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
619#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
620
621#ifdef LV_HAVE_SSSE3
622#include <tmmintrin.h>
623
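/*
 * Identical to volk_8u_x2_encodeframepolar_8u_u_ssse3 above, except that it
 * uses aligned load/store intrinsics and therefore requires 16-byte aligned
 * 'frame' and 'temp' buffers.
 */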
624static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
625 unsigned char* temp,
626 unsigned int frame_size)
627{
628 const unsigned int po2 = log2_of_power_of_2(frame_size);
629
630 unsigned int stage = po2;
631 unsigned char* frame_ptr = frame;
632 unsigned char* temp_ptr = temp;
633
634 unsigned int frame_half = frame_size >> 1;
635 unsigned int num_branches = 1;
636 unsigned int branch;
637 unsigned int bit;
638
639 // prepare constants
640 const __m128i mask_stage1 = _mm_set_epi8(0x0,
641 0xFF,
642 0x0,
643 0xFF,
644 0x0,
645 0xFF,
646 0x0,
647 0xFF,
648 0x0,
649 0xFF,
650 0x0,
651 0xFF,
652 0x0,
653 0xFF,
654 0x0,
655 0xFF);
656
657 // get some SIMD registers to play with.
658 __m128i r_frame0, r_temp0, shifted;
659
660 {
661 __m128i r_frame1, r_temp1;
662 const __m128i shuffle_separate =
663 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
664
665 while (stage > 4) {
666 frame_ptr = frame;
667 temp_ptr = temp;
668
 669 // For stage = 5 a branch has 32 elements, so higher stages are even larger.
670 for (branch = 0; branch < num_branches; ++branch) {
671 for (bit = 0; bit < frame_half; bit += 16) {
672 r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
673 temp_ptr += 16;
674 r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
675 temp_ptr += 16;
676
677 shifted = _mm_srli_si128(r_temp0, 1);
678 shifted = _mm_and_si128(shifted, mask_stage1);
679 r_temp0 = _mm_xor_si128(shifted, r_temp0);
680 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
681
682 shifted = _mm_srli_si128(r_temp1, 1);
683 shifted = _mm_and_si128(shifted, mask_stage1);
684 r_temp1 = _mm_xor_si128(shifted, r_temp1);
685 r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
686
687 r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
688 _mm_store_si128((__m128i*)frame_ptr, r_frame0);
689
690 r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
691 _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
692 frame_ptr += 16;
693 }
694
695 frame_ptr += frame_half;
696 }
697 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
698
699 num_branches = num_branches << 1;
700 frame_half = frame_half >> 1;
701 stage--;
702 }
703 }
704
 705 // This last part requires frames of at least 16 bytes (elements).
 706 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
707
708 // reset pointers to correct positions.
709 frame_ptr = frame;
710 temp_ptr = temp;
711
712 // prefetch first chunk
713 __VOLK_PREFETCH(temp_ptr);
714
715 const __m128i shuffle_stage4 =
716 _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
717 const __m128i mask_stage4 = _mm_set_epi8(0x0,
718 0x0,
719 0x0,
720 0x0,
721 0x0,
722 0x0,
723 0x0,
724 0x0,
725 0xFF,
726 0xFF,
727 0xFF,
728 0xFF,
729 0xFF,
730 0xFF,
731 0xFF,
732 0xFF);
733 const __m128i mask_stage3 = _mm_set_epi8(0x0,
734 0x0,
735 0x0,
736 0x0,
737 0xFF,
738 0xFF,
739 0xFF,
740 0xFF,
741 0x0,
742 0x0,
743 0x0,
744 0x0,
745 0xFF,
746 0xFF,
747 0xFF,
748 0xFF);
749 const __m128i mask_stage2 = _mm_set_epi8(0x0,
750 0x0,
751 0xFF,
752 0xFF,
753 0x0,
754 0x0,
755 0xFF,
756 0xFF,
757 0x0,
758 0x0,
759 0xFF,
760 0xFF,
761 0x0,
762 0x0,
763 0xFF,
764 0xFF);
765
766 for (branch = 0; branch < num_branches; ++branch) {
767 r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
768
769 // prefetch next chunk
770 temp_ptr += 16;
771 __VOLK_PREFETCH(temp_ptr);
772
773 // shuffle once for bit-reversal.
774 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
775
776 shifted = _mm_srli_si128(r_temp0, 8);
777 shifted = _mm_and_si128(shifted, mask_stage4);
778 r_frame0 = _mm_xor_si128(shifted, r_temp0);
779
780 shifted = _mm_srli_si128(r_frame0, 4);
781 shifted = _mm_and_si128(shifted, mask_stage3);
782 r_frame0 = _mm_xor_si128(shifted, r_frame0);
783
784 shifted = _mm_srli_si128(r_frame0, 2);
785 shifted = _mm_and_si128(shifted, mask_stage2);
786 r_frame0 = _mm_xor_si128(shifted, r_frame0);
787
788 shifted = _mm_srli_si128(r_frame0, 1);
789 shifted = _mm_and_si128(shifted, mask_stage1);
790 r_frame0 = _mm_xor_si128(shifted, r_frame0);
791
792 // store result of chunk.
793 _mm_store_si128((__m128i*)frame_ptr, r_frame0);
794 frame_ptr += 16;
795 }
796}
797#endif /* LV_HAVE_SSSE3 */
798
799#ifdef LV_HAVE_AVX2
800#include <immintrin.h>
801
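/*
 * Identical to volk_8u_x2_encodeframepolar_8u_u_avx2 above, except that it
 * uses aligned load/store intrinsics and therefore requires 32-byte aligned
 * 'frame' and 'temp' buffers.
 */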
802static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
803 unsigned char* temp,
804 unsigned int frame_size)
805{
806 const unsigned int po2 = log2_of_power_of_2(frame_size);
807
808 unsigned int stage = po2;
809 unsigned char* frame_ptr = frame;
810 unsigned char* temp_ptr = temp;
811
812 unsigned int frame_half = frame_size >> 1;
813 unsigned int num_branches = 1;
814 unsigned int branch;
815 unsigned int bit;
816
817 // prepare constants
818 const __m256i mask_stage1 = _mm256_set_epi8(0x0,
819 0xFF,
820 0x0,
821 0xFF,
822 0x0,
823 0xFF,
824 0x0,
825 0xFF,
826 0x0,
827 0xFF,
828 0x0,
829 0xFF,
830 0x0,
831 0xFF,
832 0x0,
833 0xFF,
834 0x0,
835 0xFF,
836 0x0,
837 0xFF,
838 0x0,
839 0xFF,
840 0x0,
841 0xFF,
842 0x0,
843 0xFF,
844 0x0,
845 0xFF,
846 0x0,
847 0xFF,
848 0x0,
849 0xFF);
850
851 const __m128i mask_stage0 = _mm_set_epi8(0x0,
852 0xFF,
853 0x0,
854 0xFF,
855 0x0,
856 0xFF,
857 0x0,
858 0xFF,
859 0x0,
860 0xFF,
861 0x0,
862 0xFF,
863 0x0,
864 0xFF,
865 0x0,
866 0xFF);
867 // get some SIMD registers to play with.
868 __m256i r_frame0, r_temp0, shifted;
869 __m128i r_temp2, r_frame2, shifted2;
870 {
871 __m256i r_frame1, r_temp1;
872 __m128i r_frame3, r_temp3;
873 const __m256i shuffle_separate = _mm256_setr_epi8(0,
874 2,
875 4,
876 6,
877 8,
878 10,
879 12,
880 14,
881 1,
882 3,
883 5,
884 7,
885 9,
886 11,
887 13,
888 15,
889 0,
890 2,
891 4,
892 6,
893 8,
894 10,
895 12,
896 14,
897 1,
898 3,
899 5,
900 7,
901 9,
902 11,
903 13,
904 15);
905 const __m128i shuffle_separate128 =
906 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
907
908 while (stage > 4) {
909 frame_ptr = frame;
910 temp_ptr = temp;
911
 912 // For stage = 5 a branch has 32 elements, so higher stages are even larger.
913 for (branch = 0; branch < num_branches; ++branch) {
914 for (bit = 0; bit < frame_half; bit += 32) {
915 if ((frame_half - bit) <
 916 32) // only 16 bytes remain in this half-branch, not 32; use 128-bit ops
917 {
918 r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
919 temp_ptr += 16;
920 r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
921 temp_ptr += 16;
922
923 shifted2 = _mm_srli_si128(r_temp2, 1);
924 shifted2 = _mm_and_si128(shifted2, mask_stage0);
925 r_temp2 = _mm_xor_si128(shifted2, r_temp2);
926 r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
927
928 shifted2 = _mm_srli_si128(r_temp3, 1);
929 shifted2 = _mm_and_si128(shifted2, mask_stage0);
930 r_temp3 = _mm_xor_si128(shifted2, r_temp3);
931 r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
932
933 r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
934 _mm_store_si128((__m128i*)frame_ptr, r_frame2);
935
936 r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
937 _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
938 frame_ptr += 16;
939 break;
940 }
941 r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
942 temp_ptr += 32;
943 r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
944 temp_ptr += 32;
945
946 shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
947 shifted = _mm256_and_si256(shifted, mask_stage1);
948 r_temp0 = _mm256_xor_si256(shifted, r_temp0);
949 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
950
951 shifted = _mm256_srli_si256(r_temp1, 1);
952 shifted = _mm256_and_si256(shifted, mask_stage1);
953 r_temp1 = _mm256_xor_si256(shifted, r_temp1);
954 r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
955
956 r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
957 r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
958 r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
959 r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
960
961 _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
962
963 _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
964 frame_ptr += 32;
965 }
966
967 frame_ptr += frame_half;
968 }
969 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
970
971 num_branches = num_branches << 1;
972 frame_half = frame_half >> 1;
973 stage--;
974 }
975 }
976
 977 // This last part requires frames of at least 32 bytes (elements).
 978 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
979
980 // reset pointers to correct positions.
981 frame_ptr = frame;
982 temp_ptr = temp;
983
984 // prefetch first chunk.
985 __VOLK_PREFETCH(temp_ptr);
986
987 const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
988 8,
989 4,
990 12,
991 2,
992 10,
993 6,
994 14,
995 1,
996 9,
997 5,
998 13,
999 3,
1000 11,
1001 7,
1002 15,
1003 0,
1004 8,
1005 4,
1006 12,
1007 2,
1008 10,
1009 6,
1010 14,
1011 1,
1012 9,
1013 5,
1014 13,
1015 3,
1016 11,
1017 7,
1018 15);
1019 const __m256i mask_stage4 = _mm256_set_epi8(0x0,
1020 0x0,
1021 0x0,
1022 0x0,
1023 0x0,
1024 0x0,
1025 0x0,
1026 0x0,
1027 0xFF,
1028 0xFF,
1029 0xFF,
1030 0xFF,
1031 0xFF,
1032 0xFF,
1033 0xFF,
1034 0xFF,
1035 0x0,
1036 0x0,
1037 0x0,
1038 0x0,
1039 0x0,
1040 0x0,
1041 0x0,
1042 0x0,
1043 0xFF,
1044 0xFF,
1045 0xFF,
1046 0xFF,
1047 0xFF,
1048 0xFF,
1049 0xFF,
1050 0xFF);
1051 const __m256i mask_stage3 = _mm256_set_epi8(0x0,
1052 0x0,
1053 0x0,
1054 0x0,
1055 0xFF,
1056 0xFF,
1057 0xFF,
1058 0xFF,
1059 0x0,
1060 0x0,
1061 0x0,
1062 0x0,
1063 0xFF,
1064 0xFF,
1065 0xFF,
1066 0xFF,
1067 0x0,
1068 0x0,
1069 0x0,
1070 0x0,
1071 0xFF,
1072 0xFF,
1073 0xFF,
1074 0xFF,
1075 0x0,
1076 0x0,
1077 0x0,
1078 0x0,
1079 0xFF,
1080 0xFF,
1081 0xFF,
1082 0xFF);
1083 const __m256i mask_stage2 = _mm256_set_epi8(0x0,
1084 0x0,
1085 0xFF,
1086 0xFF,
1087 0x0,
1088 0x0,
1089 0xFF,
1090 0xFF,
1091 0x0,
1092 0x0,
1093 0xFF,
1094 0xFF,
1095 0x0,
1096 0x0,
1097 0xFF,
1098 0xFF,
1099 0x0,
1100 0x0,
1101 0xFF,
1102 0xFF,
1103 0x0,
1104 0x0,
1105 0xFF,
1106 0xFF,
1107 0x0,
1108 0x0,
1109 0xFF,
1110 0xFF,
1111 0x0,
1112 0x0,
1113 0xFF,
1114 0xFF);
1115
1116 for (branch = 0; branch < num_branches / 2; ++branch) {
1117 r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
1118
1119 // prefetch next chunk
1120 temp_ptr += 32;
1121 __VOLK_PREFETCH(temp_ptr);
1122
1123 // shuffle once for bit-reversal.
1124 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
1125
1126 shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
1127 shifted = _mm256_and_si256(shifted, mask_stage4);
1128 r_frame0 = _mm256_xor_si256(shifted, r_temp0);
1129
1130 shifted = _mm256_srli_si256(r_frame0, 4);
1131 shifted = _mm256_and_si256(shifted, mask_stage3);
1132 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1133
1134 shifted = _mm256_srli_si256(r_frame0, 2);
1135 shifted = _mm256_and_si256(shifted, mask_stage2);
1136 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1137
1138 shifted = _mm256_srli_si256(r_frame0, 1);
1139 shifted = _mm256_and_si256(shifted, mask_stage1);
1140 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1141
1142 // store result of chunk.
1143 _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
1144 frame_ptr += 32;
1145 }
1146}
1147#endif /* LV_HAVE_AVX2 */
1148
1149
1150#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */