#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

// GCC_NO_UBSAN disables undefined behavior sanitizer instrumentation on a
// function; the attribute is available beginning with GCC 4.9.
#if (CRYPTOPP_GCC_VERSION >= 40900)
# define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
#else
# define GCC_NO_UBSAN
#endif

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

ANONYMOUS_NAMESPACE_END
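// The AdvancedProcessBlocks* templates below follow the calling convention of
// BlockTransformation::AdvancedProcessBlocks: "length" is a byte count, "flags"
// carries BT_InBlockIsCounter, BT_XorInput, BT_ReverseDirection, BT_AllowParallel
// and BT_DontIncrementInOutPointers, and the return value is the number of bytes
// left unprocessed. See BlockTransformation::AdvancedProcessBlocks in cryptlib.h
// for the authoritative flag semantics.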
#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <typename F2, typename F6>
inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
    const word32 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const word32 s_zero32x4[] = {0, 0, 0, 0};
    const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
    const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
#else
    const word32 s_zero32x4[] = {0, 0, 0, 0};
    const word32 s_one32x4_1b[] = {0, 0, 0, 1};
    const word32 s_one32x4_2b[] = {0, 2, 0, 2};
#endif

    const ptrdiff_t blockSize = 8;
    const ptrdiff_t neonBlockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*neonBlockSize)
        {
            uint32x4_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // For 64-bit block ciphers the counter block is 8 bytes. Duplicate it
                // into both halves of the register, then bump the high half by one so
                // each NEON register holds two consecutive counter blocks.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(vld1q_u32(s_one32x4_1b),
                    vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // Subsequent counters advance by {+2,+2}.
                const uint32x4_t be2 = vld1q_u32(s_one32x4_2b);
                block1 = vaddq_u32(be2, block0);
                block2 = vaddq_u32(be2, block1);
                block3 = vaddq_u32(be2, block2);
                block4 = vaddq_u32(be2, block3);
                block5 = vaddq_u32(be2, block4);

                // Write the next counter back for the caller.
                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
            outBlocks += outIncrement;

            length -= 6*neonBlockSize;
        }
        while (length >= 2*neonBlockSize)
        {
            uint32x4_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(vld1q_u32(s_one32x4_1b),
                    vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                const uint32x4_t be2 = vld1q_u32(s_one32x4_2b);
                block1 = vaddq_u32(be2, block0);

                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks += outIncrement;

            length -= 2*neonBlockSize;
        }
    }
    // The tail handles single 8-byte blocks; drop the increments from the
    // paired 16-byte stride down to the real block size.
    if (flags & BT_ReverseDirection)
    {
        inIncrement += inIncrement ? blockSize : 0;
        xorIncrement += xorIncrement ? blockSize : 0;
        outIncrement += outIncrement ? blockSize : 0;
        inBlocks -= inIncrement;
        xorBlocks -= xorIncrement;
        outBlocks -= outIncrement;
    }
    else
    {
        inIncrement -= inIncrement ? blockSize : 0;
        xorIncrement -= xorIncrement ? blockSize : 0;
        outIncrement -= outIncrement ? blockSize : 0;
    }
    while (length >= blockSize)
    {
        uint32x4_t block, zero = vld1q_u32(s_zero32x4);
        const uint8x8_t v = vld1_u8(inBlocks);
        block = vreinterpretq_u32_u8(vcombine_u8(v,v));

        if (xorInput)
        {
            const uint8x8_t x = vld1_u8(xorBlocks);
            block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
        }

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[7]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
        {
            const uint8x8_t x = vld1_u8(xorBlocks);
            block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
        }

        vst1_u8(const_cast<byte*>(outBlocks),
            vget_low_u8(vreinterpretq_u8_u32(block)));

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
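// A hypothetical usage sketch (the kernel and member names below are examples,
// not part of Crypto++): a cipher with 64-bit blocks supplies NEON kernels that
// transform two and six packed blocks in place, then forwards its work to the
// template above.
//
//   inline void MyCipher64_Enc2_NEON(uint32x4_t &block0, uint32x4_t &block1,
//       const word32 *subKeys, unsigned int rounds);
//   inline void MyCipher64_Enc6_NEON(uint32x4_t &block0, uint32x4_t &block1,
//       uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4,
//       uint32x4_t &block5, const word32 *subKeys, unsigned int rounds);
//
//   size_t MyCipher64::Enc::AdvancedProcessBlocks(const byte *inBlocks,
//       const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
//   {
//       return AdvancedProcessBlocks64_6x2_NEON(MyCipher64_Enc2_NEON,
//           MyCipher64_Enc6_NEON, m_rkeys.begin(), m_rounds,
//           inBlocks, xorBlocks, outBlocks, length, flags);
//   }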
template <typename F1, typename F6>
inline size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
    const word32 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const word32 s_zero32x4[] = {0, 0, 0, 0};
    const word32 s_one32x4[] = {0, 0, 0, 1<<24};
#else
    const word32 s_zero32x4[] = {0, 0, 0, 0};
    const word32 s_one32x4[] = {0, 0, 0, 1};
#endif

    const ptrdiff_t blockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

                block1 = vaddq_u64(block0, be);
                block2 = vaddq_u64(block1, be);
                block3 = vaddq_u64(block2, be);
                block4 = vaddq_u64(block3, be);
                block5 = vaddq_u64(block4, be);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, be)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks += outIncrement;

            length -= 6*blockSize;
        }
    }
    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
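// Note on the single-block tails: AdvancedProcessBlocks128_NEON1x6 hands a lone
// block to func1 directly, while the 6x2 templates (which have no one-block
// kernel) pair the lone block with a zero block and call func2, discarding the
// second result.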
template <typename F2, typename F6>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
    const word64 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const word32 s_one32x4[] = {0, 0, 0, 1<<24};
#else
    const word32 s_one32x4[] = {0, 0, 0, 1};
#endif
    const ptrdiff_t blockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

                block1 = vaddq_u64(block0, be);
                block2 = vaddq_u64(block1, be);
                block3 = vaddq_u64(block2, be);
                block4 = vaddq_u64(block3, be);
                block5 = vaddq_u64(block4, be);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, be)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks += outIncrement;

            length -= 6*blockSize;
        }
        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, be);

                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, be)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks += outIncrement;
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks += outIncrement;

            length -= 2*blockSize;
        }
    }
    while (length >= blockSize)
    {
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// Some SunCC releases need a non-const subkey pointer; MAYBE_CONST and
// MAYBE_UNCONST_CAST paper over the difference.
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
# define MAYBE_CONST const
# define MAYBE_UNCONST_CAST(T, x) (x)
#endif

#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif
#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif

template <typename F2, typename F6>
inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
    const word32 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ALIGN_DATA(16)
    const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
    CRYPTOPP_ALIGN_DATA(16)
    const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};

    const ptrdiff_t blockSize = 8;
    const ptrdiff_t xmmBlockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*xmmBlockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Load the 8-byte counter into both halves of the XMM word, then
                // bump the high half by one so the register holds two counters.
                block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128(
                    _mm_loaddup_pd(CONST_DOUBLE_CAST(inBlocks))));

                // Subsequent counters advance by {+2,+2}.
                const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b);
                block1 = _mm_add_epi32(be2, block0);
                block2 = _mm_add_epi32(be2, block1);
                block3 = _mm_add_epi32(be2, block2);
                block4 = _mm_add_epi32(be2, block3);
                block5 = _mm_add_epi32(be2, block4);

                // Write the next counter back for the caller.
                _mm_store_sd(DOUBLE_CAST(inBlocks),
                    _mm_castsi128_pd(_mm_add_epi32(be2, block5)));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks += outIncrement;

            length -= 6*xmmBlockSize;
        }
        while (length >= 2*xmmBlockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128(
                    _mm_loaddup_pd(CONST_DOUBLE_CAST(inBlocks))));

                const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b);
                block1 = _mm_add_epi32(be2, block0);

                _mm_store_sd(DOUBLE_CAST(inBlocks),
                    _mm_castsi128_pd(_mm_add_epi64(be2, block1)));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks += outIncrement;

            length -= 2*xmmBlockSize;
        }
    }
    // The tail handles single 8-byte blocks; drop the increments from the
    // paired 16-byte stride down to the real block size.
    if (flags & BT_ReverseDirection)
    {
        inIncrement += inIncrement ? blockSize : 0;
        xorIncrement += xorIncrement ? blockSize : 0;
        outIncrement += outIncrement ? blockSize : 0;
        inBlocks -= inIncrement;
        xorBlocks -= xorIncrement;
        outBlocks -= outIncrement;
    }
    else
    {
        inIncrement -= inIncrement ? blockSize : 0;
        xorIncrement -= xorIncrement ? blockSize : 0;
        outIncrement -= outIncrement ? blockSize : 0;
    }
    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_castpd_si128(
            _mm_load_sd(CONST_DOUBLE_CAST(inBlocks)));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_castpd_si128(
                _mm_load_sd(CONST_DOUBLE_CAST(xorBlocks))));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[7]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_castpd_si128(
                _mm_load_sd(CONST_DOUBLE_CAST(xorBlocks))));

        _mm_store_sd(DOUBLE_CAST(outBlocks), _mm_castsi128_pd(block));

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
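// AdvancedProcessBlocks64_6x2_SSE is the only template above tagged GCC_NO_UBSAN,
// presumably because it moves 8-byte blocks through double loads and stores
// (_mm_loaddup_pd, _mm_load_sd, _mm_store_sd) on byte pointers, which the
// undefined behavior sanitizer on some GCC releases reports as misaligned or
// type-punned access.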
template <typename F2, typename F6>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
    const word64 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ALIGN_DATA(16)
    const word32 s_one32x4[] = {0, 0, 0, 1<<24};

    const ptrdiff_t blockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const __m128i be1 = *CONST_M128_CAST(s_one32x4);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, be1);
                block2 = _mm_add_epi32(block1, be1);
                block3 = _mm_add_epi32(block2, be1);
                block4 = _mm_add_epi32(block3, be1);
                block5 = _mm_add_epi32(block4, be1);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks += outIncrement;

            length -= 6*blockSize;
        }
        while (length >= 2*blockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const __m128i be1 = *CONST_M128_CAST(s_one32x4);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, be1);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks += outIncrement;

            length -= 2*blockSize;
        }
    }
    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
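// How the counter constant works for the 128-bit SSE templates: s_one32x4[] =
// {0, 0, 0, 1<<24} puts 0x01 in the last byte of the XMM word on a little-endian
// host, so _mm_add_epi32 (and vaddq_u64 in the NEON variants) bumps byte 15 of the
// block, the least significant byte of the big-endian counter. A carry out of that
// byte is not propagated here, so the caller (for example CTR mode) must keep a
// single call from wrapping the low counter byte.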
template <typename F1, typename F4>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
    MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ALIGN_DATA(16)
    const word32 s_one32x4[] = {0, 0, 0, 1<<24};

    const ptrdiff_t blockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                const __m128i be1 = *CONST_M128_CAST(s_one32x4);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, be1);
                block2 = _mm_add_epi32(block1, be1);
                block3 = _mm_add_epi32(block2, be1);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks += inIncrement;
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks += outIncrement;
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks += outIncrement;

            length -= 4*blockSize;
        }
    }
    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
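// A second hypothetical sketch (the names below are examples, not Crypto++ APIs):
// a 128-bit block cipher wires one-block and four-block SSE kernels into
// AdvancedProcessBlocks128_4x1_SSE. MAYBE_UNCONST_CAST only matters on SunCC,
// where MAYBE_CONST is empty and the subkey parameter loses its const.
//
//   size_t MyCipher128::Enc::AdvancedProcessBlocks(const byte *inBlocks,
//       const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
//   {
//       return AdvancedProcessBlocks128_4x1_SSE(MyCipher128_Enc1_SSE,
//           MyCipher128_Enc4_SSE, MAYBE_UNCONST_CAST(word32*, m_rkeys.begin()),
//           m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
//   }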
#endif  // CRYPTOPP_SSSE3_AVAILABLE

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)

template <typename F1, typename F6>
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
    const word32 *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif
    const ptrdiff_t blockSize = 16;

    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
    if (flags & BT_ReverseDirection)
    {
        inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }
    if (flags & BT_AllowParallel)

        while (length >= 6*blockSize)

            uint32x4_p block0, block1, block2, block3, block4, block5, temp;

            if (flags & BT_InBlockIsCounter)

            inBlocks += inIncrement;
            inBlocks += inIncrement;
            inBlocks += inIncrement;
            inBlocks += inIncrement;
            inBlocks += inIncrement;
            inBlocks += inIncrement;

            xorBlocks += xorIncrement;
            xorBlocks += xorIncrement;
            xorBlocks += xorIncrement;
            xorBlocks += xorIncrement;
            xorBlocks += xorIncrement;
            xorBlocks += xorIncrement;

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            xorBlocks += xorIncrement;
            xorBlocks += xorIncrement;
            xorBlocks += xorIncrement;
            xorBlocks += xorIncrement;
            xorBlocks += xorIncrement;
            xorBlocks += xorIncrement;

            outBlocks += outIncrement;
            outBlocks += outIncrement;
            outBlocks += outIncrement;
            outBlocks += outIncrement;
            outBlocks += outIncrement;
            outBlocks += outIncrement;

            length -= 6*blockSize;
    while (length >= blockSize)
    {
        uint32x4_p block = VectorLoad(inBlocks);
        if (xorInput)
            block = VectorXor(block, VectorLoad(xorBlocks));
        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;
        func1(block, subKeys, rounds);
        if (xorOutput)
            block = VectorXor(block, VectorLoad(xorBlocks));
        VectorStore(block, outBlocks);
        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
#endif  // CRYPTOPP_ALTIVEC_AVAILABLE

#endif  // CRYPTOPP_ADVANCED_SIMD_TEMPLATES