#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

#if (CRYPTOPP_GCC_VERSION >= 40900)
# define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
#else
# define GCC_NO_UBSAN
#endif

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput)
CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel)
CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter)
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection)
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers)

ANONYMOUS_NAMESPACE_END
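// Each AdvancedProcessBlocks*() template below drives a cipher's bulk
// processing: it consumes whole blocks from inBlocks, optionally XORs
// xorBlocks before or after the cipher (BT_XorInput), treats inBlocks as an
// in-place counter when BT_InBlockIsCounter is set, honors
// BT_ReverseDirection and BT_DontIncrementInOutPointers, and returns the
// number of bytes it did not process. A minimal, illustrative call for a
// 128-bit block cipher follows; MyCipher_Enc_2_Blocks and
// MyCipher_Enc_6_Blocks are hypothetical round functions, not library code:
//
//   size_t remaining = AdvancedProcessBlocks128_6x2_NEON(
//       MyCipher_Enc_2_Blocks, MyCipher_Enc_6_Blocks,
//       roundKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);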
NAMESPACE_BEGIN(CryptoPP)

#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)

template <typename F2, typename F6>
inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
    const word32 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const word32 s_zero32x4[]   = {0, 0, 0, 0};
    const word32 s_one32x4[]    = {0, 0, 0, 1<<24};
    const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
    const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
#else
    const word32 s_zero32x4[]   = {0, 0, 0, 0};
    const word32 s_one32x4[]    = {0, 0, 0, 1};
    const word32 s_one32x4_1b[] = {0, 0, 0, 1};
    const word32 s_one32x4_2b[] = {0, 2, 0, 2};
#endif
    const ptrdiff_t blockSize = 8;
    const ptrdiff_t neonBlockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*neonBlockSize)
        {
            uint32x4_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // For 64-bit block ciphers the counter is 8 bytes. Load it
                // once, duplicate it into both halves of the vector, then
                // increment the low counter by 0 and the high counter by 1.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(vld1q_u32(s_one32x4_1b),
                    vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // After the initial increment of {0,1} the remaining counters
                // increment by {2,2}.
                const uint32x4_t be2 = vld1q_u32(s_one32x4_2b);
                block1 = vaddq_u32(be2, block0);
                block2 = vaddq_u32(be2, block1);
                block3 = vaddq_u32(be2, block2);
                block4 = vaddq_u32(be2, block3);
                block5 = vaddq_u32(be2, block4);

                // Write the next counter back to the (non-const) counter array.
                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
            outBlocks += outIncrement;

            length -= 6*neonBlockSize;
        }
        while (length >= 2*neonBlockSize)
        {
            uint32x4_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Load the 8-byte counter into both halves of the vector,
                // then increment the low counter by 0 and the high by 1.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(vld1q_u32(s_one32x4_1b),
                    vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // After the initial increment of {0,1} the remaining counter
                // increments by {2,2}.
                const uint32x4_t be2 = vld1q_u32(s_one32x4_2b);
                block1 = vaddq_u32(be2, block0);

                // Write the next counter back to the (non-const) counter array.
                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks += outIncrement;

            length -= 2*neonBlockSize;
        }
    }
    if (length)
    {
        // Adjust to the real 8-byte block size for the tail loop.
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks -= inIncrement;
            xorBlocks -= xorIncrement;
            outBlocks -= outIncrement;
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }
    }
    while (length >= blockSize)
    {
        uint32x4_t block, zero = vld1q_u32(s_zero32x4);

        const uint8x8_t v = vld1_u8(inBlocks);
        block = vreinterpretq_u32_u8(vcombine_u8(v,v));

        if (xorInput)
        {
            const uint8x8_t x = vld1_u8(xorBlocks);
            block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
        }

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[7]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
        {
            const uint8x8_t x = vld1_u8(xorBlocks);
            block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
        }

        vst1_u8(const_cast<byte*>(outBlocks),
            vget_low_u8(vreinterpretq_u8_u32(block)));

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
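// The func2 argument passed to AdvancedProcessBlocks64_6x2_NEON above must
// transform two packed 64-bit blocks in place, so it takes its uint32x4_t
// arguments by reference. A minimal sketch of the expected shape, with a
// hypothetical cipher (not library code):
//
//   inline void MyCipher64_Enc_2_Blocks(uint32x4_t &block0, uint32x4_t &block1,
//       const word32 *subkeys, unsigned int rounds)
//   {
//       // apply the cipher rounds to both 8-byte blocks in each vector
//   }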
template <typename F1, typename F6>
inline size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
    const word32 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const word32 s_zero32x4[]   = {0, 0, 0, 0};
    const word32 s_one32x4[]    = {0, 0, 0, 1<<24};
    const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
    const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
#else
    const word32 s_zero32x4[]   = {0, 0, 0, 0};
    const word32 s_one32x4[]    = {0, 0, 0, 1};
    const word32 s_one32x4_1b[] = {0, 0, 0, 1};
    const word32 s_one32x4_2b[] = {0, 2, 0, 2};
#endif
    const ptrdiff_t blockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

                block1 = vaddq_u64(block0, be);
                block2 = vaddq_u64(block1, be);
                block3 = vaddq_u64(block2, be);
                block4 = vaddq_u64(block3, be);
                block5 = vaddq_u64(block4, be);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, be)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks += outIncrement;

            length -= 6*blockSize;
        }
    }
    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
template <typename F2, typename F6>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
    const word64 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const word32 s_zero32x4[]   = {0, 0, 0, 0};
    const word32 s_one32x4[]    = {0, 0, 0, 1<<24};
    const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
    const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
#else
    const word32 s_zero32x4[]   = {0, 0, 0, 0};
    const word32 s_one32x4[]    = {0, 0, 0, 1};
    const word32 s_one32x4_1b[] = {0, 0, 0, 1};
    const word32 s_one32x4_2b[] = {0, 2, 0, 2};
#endif
    const ptrdiff_t blockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

                block1 = vaddq_u64(block0, be);
                block2 = vaddq_u64(block1, be);
                block3 = vaddq_u64(block2, be);
                block4 = vaddq_u64(block3, be);
                block5 = vaddq_u64(block4, be);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, be)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks += outIncrement;

            length -= 6*blockSize;
        }
        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, be);

                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, be)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks += outIncrement;

            length -= 2*blockSize;
        }
    }
    while (length >= blockSize)
    {
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// SunCC workaround for the constness of the subkey tables.
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
# define MAYBE_CONST const
# define MAYBE_UNCONST_CAST(T, x) (x)
#endif

#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif

template <typename F2, typename F6>
inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
    const word32 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);
    CRYPTOPP_ALIGN_DATA(16)
    const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
    CRYPTOPP_ALIGN_DATA(16)
    const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};

    const ptrdiff_t blockSize = 8;
    const ptrdiff_t xmmBlockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*xmmBlockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // For 64-bit block ciphers the counter is 8 bytes. The dup
                // load places two counters in the XMM word; increment the
                // low counter by 0 and the high counter by 1.
                block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128(
                    _mm_loaddup_pd(CONST_DOUBLE_CAST(inBlocks))));

                // After the initial increment of {0,1} the remaining counters
                // increment by {2,2}.
                const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b);
                block1 = _mm_add_epi32(be2, block0);
                block2 = _mm_add_epi32(be2, block1);
                block3 = _mm_add_epi32(be2, block2);
                block4 = _mm_add_epi32(be2, block3);
                block5 = _mm_add_epi32(be2, block4);

                // Write the next counter back to the (non-const) counter array.
                _mm_store_sd(DOUBLE_CAST(inBlocks),
                    _mm_castsi128_pd(_mm_add_epi32(be2, block5)));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks += outIncrement;

            length -= 6*xmmBlockSize;
        }
        while (length >= 2*xmmBlockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Load the 8-byte counter into both halves of the XMM word,
                // then increment the low counter by 0 and the high by 1.
                block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128(
                    _mm_loaddup_pd(CONST_DOUBLE_CAST(inBlocks))));

                // After the initial increment of {0,1} the remaining counter
                // increments by {2,2}.
                const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b);
                block1 = _mm_add_epi32(be2, block0);

                // Write the next counter back to the (non-const) counter array.
                _mm_store_sd(DOUBLE_CAST(inBlocks),
                    _mm_castsi128_pd(_mm_add_epi64(be2, block1)));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks += outIncrement;

            length -= 2*xmmBlockSize;
        }
    }
    if (length)
    {
        // Adjust to the real 8-byte block size for the tail loop.
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks -= inIncrement;
            xorBlocks -= xorIncrement;
            outBlocks -= outIncrement;
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }
    }
    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_castpd_si128(
            // UBsan false positive; mem_addr can be unaligned.
            _mm_load_sd(CONST_DOUBLE_CAST(inBlocks)));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_castpd_si128(
                // UBsan false positive; mem_addr can be unaligned.
                _mm_load_sd(CONST_DOUBLE_CAST(xorBlocks))));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[7]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_castpd_si128(
                // UBsan false positive; mem_addr can be unaligned.
                _mm_load_sd(CONST_DOUBLE_CAST(xorBlocks))));

        // UBsan false positive; mem_addr can be unaligned.
        _mm_store_sd(DOUBLE_CAST(outBlocks), _mm_castsi128_pd(block));

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
template <typename F2, typename F6>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
    const word64 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    CRYPTOPP_ALIGN_DATA(16)
    const word32 s_one32x4[] = {0, 0, 0, 1<<24};
    const ptrdiff_t blockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const __m128i be1 = *CONST_M128_CAST(s_one32x4);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, be1);
                block2 = _mm_add_epi32(block1, be1);
                block3 = _mm_add_epi32(block2, be1);
                block4 = _mm_add_epi32(block3, be1);
                block5 = _mm_add_epi32(block4, be1);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks += outIncrement;

            length -= 6*blockSize;
        }
        while (length >= 2*blockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const __m128i be1 = *CONST_M128_CAST(s_one32x4);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, be1);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks += outIncrement;

            length -= 2*blockSize;
        }
    }
    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
template <typename F1, typename F4>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
    MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    CRYPTOPP_ALIGN_DATA(16)
    const word32 s_one32x4[] = {0, 0, 0, 1<<24};
    const ptrdiff_t blockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                const __m128i be1 = *CONST_M128_CAST(s_one32x4);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, be1);
                block2 = _mm_add_epi32(block1, be1);
                block3 = _mm_add_epi32(block2, be1);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks += outIncrement;

            length -= 4*blockSize;
        }
    }
    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
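// Example wiring of the 4x1 SSE template into a cipher's
// AdvancedProcessBlocks() override. Illustrative only; MyCipher and its
// round functions are hypothetical names, not part of the library:
//
//   size_t MyCipher::Enc::AdvancedProcessBlocks(const byte *inBlocks,
//       const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
//   {
//       return AdvancedProcessBlocks128_4x1_SSE(MyCipher_Enc_Block,
//           MyCipher_Enc_4_Blocks, m_rkeys, m_rounds,
//           inBlocks, xorBlocks, outBlocks, length, flags);
//   }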
#endif  // CRYPTOPP_SSSE3_AVAILABLE

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)

template <typename F1, typename F6>
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
    const word32 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const ptrdiff_t blockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5, temp;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VectorLoad(inBlocks);
                block1 = VectorAdd(block0, s_one);
                block2 = VectorAdd(block1, s_one);
                block3 = VectorAdd(block2, s_one);
                block4 = VectorAdd(block3, s_one);
                block5 = VectorAdd(block4, s_one);
                temp = VectorAdd(block5, s_one);
                VectorStore(temp, const_cast<byte*>(inBlocks));
            }
            else
            {
                block0 = VectorLoad(inBlocks);
                inBlocks += inIncrement;
                block1 = VectorLoad(inBlocks);
                inBlocks += inIncrement;
                block2 = VectorLoad(inBlocks);
                inBlocks += inIncrement;
                block3 = VectorLoad(inBlocks);
                inBlocks += inIncrement;
                block4 = VectorLoad(inBlocks);
                inBlocks += inIncrement;
                block5 = VectorLoad(inBlocks);
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = VectorXor(block0, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
                block1 = VectorXor(block1, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
                block2 = VectorXor(block2, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
                block3 = VectorXor(block3, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
                block4 = VectorXor(block4, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
                block5 = VectorXor(block5, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VectorXor(block0, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
                block1 = VectorXor(block1, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
                block2 = VectorXor(block2, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
                block3 = VectorXor(block3, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
                block4 = VectorXor(block4, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
                block5 = VectorXor(block5, VectorLoad(xorBlocks));
                xorBlocks += xorIncrement;
            }

            VectorStore(block0, outBlocks);
            outBlocks += outIncrement;
            VectorStore(block1, outBlocks);
            outBlocks += outIncrement;
            VectorStore(block2, outBlocks);
            outBlocks += outIncrement;
            VectorStore(block3, outBlocks);
            outBlocks += outIncrement;
            VectorStore(block4, outBlocks);
            outBlocks += outIncrement;
            VectorStore(block5, outBlocks);
            outBlocks += outIncrement;

            length -= 6*blockSize;
        }
    }
    while (length >= blockSize)
    {
        uint32x4_p block = VectorLoad(inBlocks);

        if (xorInput)
            block = VectorXor(block, VectorLoad(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VectorXor(block, VectorLoad(xorBlocks));

        VectorStore(block, outBlocks);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}

#endif  // CRYPTOPP_ALTIVEC_AVAILABLE

NAMESPACE_END

#endif  // CRYPTOPP_ADVANCED_SIMD_TEMPLATES