Crypto++ 7.0
Free C++ class library of cryptographic schemes
adv-simd.h
1 // adv-simd.h - written and placed in the public domain by Jeffrey Walton
2 //
3 // The SIMD-based implementations for ciphers that use SSE, NEON and Power7
4 // share a common pattern. Namely, each provides a specialized implementation
5 // of AdvancedProcessBlocks which processes multiple blocks using hardware
6 // acceleration. After several implementations we noticed a lot of copy and
7 // paste occurring. adv-simd.h provides templates to avoid the copy and paste.
8 //
9 // There are 7 templates provided in this file. The number following the
10 // function name is the block size of the cipher, and the name following
11 // that is the arrangement and acceleration. For example, 4x1_SSE means
12 // Intel SSE using two encrypt (or decrypt) functions: one that operates
13 // on 4 blocks at a time, and one that operates on 1 block at a time.
14 // The templates provided are:
15 //
16 // * AdvancedProcessBlocks64_6x2_SSE
17 // * AdvancedProcessBlocks128_4x1_SSE
18 // * AdvancedProcessBlocks128_6x2_SSE
19 // * AdvancedProcessBlocks64_6x2_NEON
20 // * AdvancedProcessBlocks128_NEON1x6
21 // * AdvancedProcessBlocks128_6x2_NEON
22 // * AdvancedProcessBlocks128_6x1_ALTIVEC
23 //
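// A typical use of these templates (a sketch for illustration only, not taken
// from any particular cipher source file) is for a cipher's
// AdvancedProcessBlocks override to dispatch to the matching template, passing
// its single- and multi-block SIMD routines together with the round key table.
// All names prefixed "MyCipher128", plus m_rkeys and m_rounds, are hypothetical
// placeholders:
//
//   size_t MyCipher128::Enc::AdvancedProcessBlocks(const byte *inBlocks,
//       const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
//   {
//   #if (CRYPTOPP_SSSE3_AVAILABLE)
//       if (HasSSSE3())
//           return AdvancedProcessBlocks128_6x2_SSE(MyCipher128_Enc_2_Blocks,
//               MyCipher128_Enc_6_Blocks, m_rkeys, m_rounds, inBlocks,
//               xorBlocks, outBlocks, length, flags);
//   #endif
//       return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks,
//           outBlocks, length, flags);
//   }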
24 
25 #ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
26 #define CRYPTOPP_ADVANCED_SIMD_TEMPLATES
27 
28 #include "config.h"
29 #include "misc.h"
30 #include "stdcpp.h"
31 
32 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
33 # include <arm_neon.h>
34 #endif
35 
36 #if (CRYPTOPP_SSSE3_AVAILABLE)
37 # include <emmintrin.h>
38 # include <pmmintrin.h>
39 # include <tmmintrin.h>
40 #endif
41 
42 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
43 # include "ppc-simd.h"
44 #endif
45 
46 // https://www.spinics.net/lists/gcchelp/msg47735.html and
47 // https://www.spinics.net/lists/gcchelp/msg47749.html
48 #if (CRYPTOPP_GCC_VERSION >= 40900)
49 # define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
50 #else
51 # define GCC_NO_UBSAN
52 #endif
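// Note: GCC_NO_UBSAN is applied below only to AdvancedProcessBlocks64_6x2_SSE.
// That template reads and writes 8-byte sub-blocks through _mm_loaddup_pd,
// _mm_load_sd and _mm_store_sd, and UBsan reports those accesses as unaligned
// even though the intrinsics accept unaligned addresses (see the threads above).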
53 
54 // ************************ All block ciphers *********************** //
55 
56 ANONYMOUS_NAMESPACE_BEGIN
57 
58 using CryptoPP::BlockTransformation;
59 
60 CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput)
61 CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel)
62 CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter)
63 CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection)
64 CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers)
65 
66 ANONYMOUS_NAMESPACE_END
67 
68 // *************************** ARM NEON ************************** //
69 
70 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
71 
72 NAMESPACE_BEGIN(CryptoPP)
73 
74 template <typename F2, typename F6>
75 inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
76  const word32 *subKeys, size_t rounds, const byte *inBlocks,
77  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
78 {
79  CRYPTOPP_ASSERT(subKeys);
80  CRYPTOPP_ASSERT(inBlocks);
81  CRYPTOPP_ASSERT(outBlocks);
82  CRYPTOPP_ASSERT(length >= 8);
83 
84 #if defined(CRYPTOPP_LITTLE_ENDIAN)
85  const word32 s_zero32x4[] = {0, 0, 0, 0};
86  const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
87  const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
88 #else
89  const word32 s_zero32x4[] = {0, 0, 0, 0};
90  const word32 s_one32x4_1b[] = {0, 0, 0, 1};
91  const word32 s_one32x4_2b[] = {0, 2, 0, 2};
92 #endif
93 
94  const ptrdiff_t blockSize = 8;
95  const ptrdiff_t neonBlockSize = 16;
96 
97  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
98  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
99  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;
100 
101  // Clang and Coverity generate findings when xorBlocks is used as a flag.
102  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
103  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
104 
105  if (flags & BT_ReverseDirection)
106  {
107  inBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
108  xorBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
109  outBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
110  inIncrement = 0-inIncrement;
111  xorIncrement = 0-xorIncrement;
112  outIncrement = 0-outIncrement;
113  }
114 
115  if (flags & BT_AllowParallel)
116  {
117  while (length >= 6*neonBlockSize)
118  {
119  uint32x4_t block0, block1, block2, block3, block4, block5;
120  if (flags & BT_InBlockIsCounter)
121  {
122  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
123  // After the dup load we have two counters in the NEON word. Then we need
124  // to increment the low ctr by 0 and the high ctr by 1.
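 // Concretely: vcombine_u8(ctr,ctr) places two copies of the 8-byte counter
 // in the 128-bit register. s_one32x4_1b adds 1 in the last byte of the
 // register, so the low copy stays at C while the high copy becomes C+1
 // (the counter is big-endian, hence the 1<<24 constant on little-endian
 // targets). s_one32x4_2b then advances both counters by 2 each iteration.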
125  const uint8x8_t ctr = vld1_u8(inBlocks);
126  block0 = vaddq_u32(vld1q_u32(s_one32x4_1b),
127  vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));
128 
129  // After initial increment of {0,1} remaining counters increment by {2,2}.
130  const uint32x4_t be2 = vld1q_u32(s_one32x4_2b);
131  block1 = vaddq_u32(be2, block0);
132  block2 = vaddq_u32(be2, block1);
133  block3 = vaddq_u32(be2, block2);
134  block4 = vaddq_u32(be2, block3);
135  block5 = vaddq_u32(be2, block4);
136 
137  vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
138  vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
139  }
140  else
141  {
142  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
143  inBlocks += inIncrement;
144  block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
145  inBlocks += inIncrement;
146  block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
147  inBlocks += inIncrement;
148  block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
149  inBlocks += inIncrement;
150  block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
151  inBlocks += inIncrement;
152  block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
153  inBlocks += inIncrement;
154  }
155 
156  if (xorInput)
157  {
158  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
159  xorBlocks += xorIncrement;
160  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
161  xorBlocks += xorIncrement;
162  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
163  xorBlocks += xorIncrement;
164  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
165  xorBlocks += xorIncrement;
166  block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
167  xorBlocks += xorIncrement;
168  block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
169  xorBlocks += xorIncrement;
170  }
171 
172  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
173 
174  if (xorOutput)
175  {
176  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
177  xorBlocks += xorIncrement;
178  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
179  xorBlocks += xorIncrement;
180  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
181  xorBlocks += xorIncrement;
182  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
183  xorBlocks += xorIncrement;
184  block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
185  xorBlocks += xorIncrement;
186  block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
187  xorBlocks += xorIncrement;
188  }
189 
190  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
191  outBlocks += outIncrement;
192  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
193  outBlocks += outIncrement;
194  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
195  outBlocks += outIncrement;
196  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
197  outBlocks += outIncrement;
198  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
199  outBlocks += outIncrement;
200  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
201  outBlocks += outIncrement;
202 
203  length -= 6*neonBlockSize;
204  }
205 
206  while (length >= 2*neonBlockSize)
207  {
208  uint32x4_t block0, block1;
209  if (flags & BT_InBlockIsCounter)
210  {
211  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
212  // After the dup load we have two counters in the NEON word. Then we need
213  // to increment the low ctr by 0 and the high ctr by 1.
214  const uint8x8_t ctr = vld1_u8(inBlocks);
215  block0 = vaddq_u32(vld1q_u32(s_one32x4_1b),
216  vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));
217 
218  // After initial increment of {0,1} remaining counters increment by {2,2}.
219  const uint32x4_t be2 = vld1q_u32(s_one32x4_2b);
220  block1 = vaddq_u32(be2, block0);
221 
222  vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
223  vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
224  }
225  else
226  {
227  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
228  inBlocks += inIncrement;
229  block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
230  inBlocks += inIncrement;
231  }
232 
233  if (xorInput)
234  {
235  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
236  xorBlocks += xorIncrement;
237  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
238  xorBlocks += xorIncrement;
239  }
240 
241  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
242 
243  if (xorOutput)
244  {
245  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
246  xorBlocks += xorIncrement;
247  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
248  xorBlocks += xorIncrement;
249  }
250 
251  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
252  outBlocks += outIncrement;
253  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
254  outBlocks += outIncrement;
255 
256  length -= 2*neonBlockSize;
257  }
258  }
259 
260  if (length)
261  {
262  // Adjust to real block size
263  if (flags & BT_ReverseDirection)
264  {
265  inIncrement += inIncrement ? blockSize : 0;
266  xorIncrement += xorIncrement ? blockSize : 0;
267  outIncrement += outIncrement ? blockSize : 0;
268  inBlocks -= inIncrement;
269  xorBlocks -= xorIncrement;
270  outBlocks -= outIncrement;
271  }
272  else
273  {
274  inIncrement -= inIncrement ? blockSize : 0;
275  xorIncrement -= xorIncrement ? blockSize : 0;
276  outIncrement -= outIncrement ? blockSize : 0;
277  }
278 
279  while (length >= blockSize)
280  {
281  uint32x4_t block, zero = vld1q_u32(s_zero32x4);
282 
283  const uint8x8_t v = vld1_u8(inBlocks);
284  block = vreinterpretq_u32_u8(vcombine_u8(v,v));
285 
286  if (xorInput)
287  {
288  const uint8x8_t x = vld1_u8(xorBlocks);
289  block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
290  }
291 
292  if (flags & BT_InBlockIsCounter)
293  const_cast<byte *>(inBlocks)[7]++;
294 
295  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
296 
297  if (xorOutput)
298  {
299  const uint8x8_t x = vld1_u8(xorBlocks);
300  block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
301  }
302 
303  vst1_u8(const_cast<byte*>(outBlocks),
304  vget_low_u8(vreinterpretq_u8_u32(block)));
305 
306  inBlocks += inIncrement;
307  outBlocks += outIncrement;
308  xorBlocks += xorIncrement;
309  length -= blockSize;
310  }
311  }
312 
313  return length;
314 }
315 
316 template <typename F1, typename F6>
317 inline size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
318  const word32 *subKeys, size_t rounds, const byte *inBlocks,
319  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
320 {
321  CRYPTOPP_ASSERT(subKeys);
322  CRYPTOPP_ASSERT(inBlocks);
323  CRYPTOPP_ASSERT(outBlocks);
324  CRYPTOPP_ASSERT(length >= 16);
325 
326 #if defined(CRYPTOPP_LITTLE_ENDIAN)
327  const word32 s_zero32x4[] = {0, 0, 0, 0};
328  const word32 s_one32x4[] = {0, 0, 0, 1<<24};
329 #else
330  const word32 s_zero32x4[] = {0, 0, 0, 0};
331  const word32 s_one32x4[] = {0, 0, 0, 1};
332 #endif
333 
334  const ptrdiff_t blockSize = 16;
335  // const ptrdiff_t neonBlockSize = 16;
336 
337  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
338  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
339  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
340 
341  // Clang and Coverity generate findings when xorBlocks is used as a flag.
342  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
343  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
344 
345  if (flags & BT_ReverseDirection)
346  {
347  inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
348  xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
349  outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
350  inIncrement = 0-inIncrement;
351  xorIncrement = 0-xorIncrement;
352  outIncrement = 0-outIncrement;
353  }
354 
355  if (flags & BT_AllowParallel)
356  {
357  while (length >= 6*blockSize)
358  {
359  uint64x2_t block0, block1, block2, block3, block4, block5;
360  if (flags & BT_InBlockIsCounter)
361  {
362  const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
363  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
364 
365  block1 = vaddq_u64(block0, be);
366  block2 = vaddq_u64(block1, be);
367  block3 = vaddq_u64(block2, be);
368  block4 = vaddq_u64(block3, be);
369  block5 = vaddq_u64(block4, be);
370  vst1q_u8(const_cast<byte*>(inBlocks),
371  vreinterpretq_u8_u64(vaddq_u64(block5, be)));
372  }
373  else
374  {
375  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
376  inBlocks += inIncrement;
377  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
378  inBlocks += inIncrement;
379  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
380  inBlocks += inIncrement;
381  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
382  inBlocks += inIncrement;
383  block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
384  inBlocks += inIncrement;
385  block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
386  inBlocks += inIncrement;
387  }
388 
389  if (xorInput)
390  {
391  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
392  xorBlocks += xorIncrement;
393  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
394  xorBlocks += xorIncrement;
395  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
396  xorBlocks += xorIncrement;
397  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
398  xorBlocks += xorIncrement;
399  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
400  xorBlocks += xorIncrement;
401  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
402  xorBlocks += xorIncrement;
403  }
404 
405  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
406 
407  if (xorOutput)
408  {
409  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
410  xorBlocks += xorIncrement;
411  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
412  xorBlocks += xorIncrement;
413  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
414  xorBlocks += xorIncrement;
415  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
416  xorBlocks += xorIncrement;
417  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
418  xorBlocks += xorIncrement;
419  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
420  xorBlocks += xorIncrement;
421  }
422 
423  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
424  outBlocks += outIncrement;
425  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
426  outBlocks += outIncrement;
427  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
428  outBlocks += outIncrement;
429  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
430  outBlocks += outIncrement;
431  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
432  outBlocks += outIncrement;
433  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
434  outBlocks += outIncrement;
435 
436  length -= 6*blockSize;
437  }
438  }
439 
440  while (length >= blockSize)
441  {
442  uint64x2_t block;
443  block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
444 
445  if (xorInput)
446  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
447 
448  if (flags & BT_InBlockIsCounter)
449  const_cast<byte *>(inBlocks)[15]++;
450 
451  func1(block, subKeys, static_cast<unsigned int>(rounds));
452 
453  if (xorOutput)
454  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
455 
456  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
457 
458  inBlocks += inIncrement;
459  outBlocks += outIncrement;
460  xorBlocks += xorIncrement;
461  length -= blockSize;
462  }
463 
464  return length;
465 }
466 
467 template <typename F2, typename F6>
468 inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
469  const word64 *subKeys, size_t rounds, const byte *inBlocks,
470  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
471 {
472  CRYPTOPP_ASSERT(subKeys);
473  CRYPTOPP_ASSERT(inBlocks);
474  CRYPTOPP_ASSERT(outBlocks);
475  CRYPTOPP_ASSERT(length >= 16);
476 
477 #if defined(CRYPTOPP_LITTLE_ENDIAN)
478  const word32 s_one32x4[] = {0, 0, 0, 1<<24};
479 #else
480  const word32 s_one32x4[] = {0, 0, 0, 1};
481 #endif
482 
483  const ptrdiff_t blockSize = 16;
484  // const ptrdiff_t neonBlockSize = 16;
485 
486  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
487  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
488  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
489 
490  // Clang and Coverity generate findings when xorBlocks is used as a flag.
491  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
492  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
493 
494  if (flags & BT_ReverseDirection)
495  {
496  inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
497  xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
498  outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
499  inIncrement = 0-inIncrement;
500  xorIncrement = 0-xorIncrement;
501  outIncrement = 0-outIncrement;
502  }
503 
504  if (flags & BT_AllowParallel)
505  {
506  while (length >= 6*blockSize)
507  {
508  uint64x2_t block0, block1, block2, block3, block4, block5;
509  if (flags & BT_InBlockIsCounter)
510  {
511  const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
512  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
513 
514  block1 = vaddq_u64(block0, be);
515  block2 = vaddq_u64(block1, be);
516  block3 = vaddq_u64(block2, be);
517  block4 = vaddq_u64(block3, be);
518  block5 = vaddq_u64(block4, be);
519  vst1q_u8(const_cast<byte*>(inBlocks),
520  vreinterpretq_u8_u64(vaddq_u64(block5, be)));
521  }
522  else
523  {
524  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
525  inBlocks += inIncrement;
526  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
527  inBlocks += inIncrement;
528  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
529  inBlocks += inIncrement;
530  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
531  inBlocks += inIncrement;
532  block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
533  inBlocks += inIncrement;
534  block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
535  inBlocks += inIncrement;
536  }
537 
538  if (xorInput)
539  {
540  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
541  xorBlocks += xorIncrement;
542  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
543  xorBlocks += xorIncrement;
544  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
545  xorBlocks += xorIncrement;
546  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
547  xorBlocks += xorIncrement;
548  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
549  xorBlocks += xorIncrement;
550  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
551  xorBlocks += xorIncrement;
552  }
553 
554  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
555 
556  if (xorOutput)
557  {
558  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
559  xorBlocks += xorIncrement;
560  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
561  xorBlocks += xorIncrement;
562  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
563  xorBlocks += xorIncrement;
564  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
565  xorBlocks += xorIncrement;
566  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
567  xorBlocks += xorIncrement;
568  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
569  xorBlocks += xorIncrement;
570  }
571 
572  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
573  outBlocks += outIncrement;
574  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
575  outBlocks += outIncrement;
576  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
577  outBlocks += outIncrement;
578  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
579  outBlocks += outIncrement;
580  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
581  outBlocks += outIncrement;
582  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
583  outBlocks += outIncrement;
584 
585  length -= 6*blockSize;
586  }
587 
588  while (length >= 2*blockSize)
589  {
590  uint64x2_t block0, block1;
591  if (flags & BT_InBlockIsCounter)
592  {
593  const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
594  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
595  block1 = vaddq_u64(block0, be);
596 
597  vst1q_u8(const_cast<byte*>(inBlocks),
598  vreinterpretq_u8_u64(vaddq_u64(block1, be)));
599  }
600  else
601  {
602  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
603  inBlocks += inIncrement;
604  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
605  inBlocks += inIncrement;
606  }
607 
608  if (xorInput)
609  {
610  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
611  xorBlocks += xorIncrement;
612  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
613  xorBlocks += xorIncrement;
614  }
615 
616  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
617 
618  if (xorOutput)
619  {
620  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
621  xorBlocks += xorIncrement;
622  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
623  xorBlocks += xorIncrement;
624  }
625 
626  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
627  outBlocks += outIncrement;
628  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
629  outBlocks += outIncrement;
630 
631  length -= 2*blockSize;
632  }
633  }
634 
635  while (length >= blockSize)
636  {
637  uint64x2_t block, zero = {0,0};
638  block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
639 
640  if (xorInput)
641  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
642 
643  if (flags & BT_InBlockIsCounter)
644  const_cast<byte *>(inBlocks)[15]++;
645 
646  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
647 
648  if (xorOutput)
649  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
650 
651  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
652 
653  inBlocks += inIncrement;
654  outBlocks += outIncrement;
655  xorBlocks += xorIncrement;
656  length -= blockSize;
657  }
658 
659  return length;
660 }
661 
662 NAMESPACE_END // CryptoPP
663 
664 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
665 
666 // *************************** Intel SSE ************************** //
667 
668 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
669 
670 // Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
671 #if (__SUNPRO_CC >= 0x5130)
672 # define MAYBE_CONST
673 # define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
674 #else
675 # define MAYBE_CONST const
676 # define MAYBE_UNCONST_CAST(T, x) (x)
677 #endif
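// MAYBE_CONST is used below on the subkey parameter of
// AdvancedProcessBlocks128_4x1_SSE. Under SunCC 5.13 and later the const
// qualifier is dropped, and MAYBE_UNCONST_CAST gives callers a matching
// const_cast so a const key table can still be passed. For all other
// compilers MAYBE_CONST expands to const and MAYBE_UNCONST_CAST is an
// identity macro.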
678 
679 // Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
680 #ifndef M128_CAST
681 # define M128_CAST(x) ((__m128i *)(void *)(x))
682 #endif
683 #ifndef CONST_M128_CAST
684 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
685 #endif
686 
687 // GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
688 #ifndef DOUBLE_CAST
689 # define DOUBLE_CAST(x) ((double *)(void *)(x))
690 #endif
691 #ifndef CONST_DOUBLE_CAST
692 # define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
693 #endif
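// Note: the casts above route the pointer conversion through void* so that
// byte buffers can be used with the __m128i and double load/store intrinsics
// without tripping the compiler issues referenced in the links.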
694 
695 NAMESPACE_BEGIN(CryptoPP)
696 
697 template <typename F2, typename F6>
698 inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
699  const word32 *subKeys, size_t rounds, const byte *inBlocks,
700  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
701 {
702  CRYPTOPP_ASSERT(subKeys);
703  CRYPTOPP_ASSERT(inBlocks);
704  CRYPTOPP_ASSERT(outBlocks);
705  CRYPTOPP_ASSERT(length >= 8);
706 
707  CRYPTOPP_ALIGN_DATA(16)
708  const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
709  CRYPTOPP_ALIGN_DATA(16)
710  const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
711 
712  const ptrdiff_t blockSize = 8;
713  const ptrdiff_t xmmBlockSize = 16;
714 
715  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
716  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
717  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
718 
719  // Clang and Coverity generate findings when xorBlocks is used as a flag.
720  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
721  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
722 
723  if (flags & BT_ReverseDirection)
724  {
725  inBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
726  xorBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
727  outBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
728  inIncrement = 0-inIncrement;
729  xorIncrement = 0-xorIncrement;
730  outIncrement = 0-outIncrement;
731  }
732 
733  if (flags & BT_AllowParallel)
734  {
735  while (length >= 6*xmmBlockSize)
736  {
737  __m128i block0, block1, block2, block3, block4, block5;
738  if (flags & BT_InBlockIsCounter)
739  {
740  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
741  // After the dup load we have two counters in the XMM word. Then we need
742  // to increment the low ctr by 0 and the high ctr by 1.
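 // Concretely: _mm_loaddup_pd reads the 8-byte counter once and replicates
 // it into both 64-bit halves of the XMM register; adding s_one32x4_1b
 // leaves the low half at C and bumps the last byte of the high half, so
 // the register holds the big-endian counters {C, C+1}. The be2 constant
 // below then advances both halves by 2 per iteration.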
743  block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128(
744  _mm_loaddup_pd(CONST_DOUBLE_CAST(inBlocks))));
745 
746  // After initial increment of {0,1} remaining counters increment by {2,2}.
747  const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b);
748  block1 = _mm_add_epi32(be2, block0);
749  block2 = _mm_add_epi32(be2, block1);
750  block3 = _mm_add_epi32(be2, block2);
751  block4 = _mm_add_epi32(be2, block3);
752  block5 = _mm_add_epi32(be2, block4);
753 
754  // Store the next counter. UBsan false positive; mem_addr can be unaligned.
755  _mm_store_sd(DOUBLE_CAST(inBlocks),
756  _mm_castsi128_pd(_mm_add_epi32(be2, block5)));
757  }
758  else
759  {
760  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
761  inBlocks += inIncrement;
762  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
763  inBlocks += inIncrement;
764  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
765  inBlocks += inIncrement;
766  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
767  inBlocks += inIncrement;
768  block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
769  inBlocks += inIncrement;
770  block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
771  inBlocks += inIncrement;
772  }
773 
774  if (xorInput)
775  {
776  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
777  xorBlocks += xorIncrement;
778  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
779  xorBlocks += xorIncrement;
780  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
781  xorBlocks += xorIncrement;
782  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
783  xorBlocks += xorIncrement;
784  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
785  xorBlocks += xorIncrement;
786  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
787  xorBlocks += xorIncrement;
788  }
789 
790  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
791 
792  if (xorOutput)
793  {
794  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
795  xorBlocks += xorIncrement;
796  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
797  xorBlocks += xorIncrement;
798  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
799  xorBlocks += xorIncrement;
800  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
801  xorBlocks += xorIncrement;
802  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
803  xorBlocks += xorIncrement;
804  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
805  xorBlocks += xorIncrement;
806  }
807 
808  _mm_storeu_si128(M128_CAST(outBlocks), block0);
809  outBlocks += outIncrement;
810  _mm_storeu_si128(M128_CAST(outBlocks), block1);
811  outBlocks += outIncrement;
812  _mm_storeu_si128(M128_CAST(outBlocks), block2);
813  outBlocks += outIncrement;
814  _mm_storeu_si128(M128_CAST(outBlocks), block3);
815  outBlocks += outIncrement;
816  _mm_storeu_si128(M128_CAST(outBlocks), block4);
817  outBlocks += outIncrement;
818  _mm_storeu_si128(M128_CAST(outBlocks), block5);
819  outBlocks += outIncrement;
820 
821  length -= 6*xmmBlockSize;
822  }
823 
824  while (length >= 2*xmmBlockSize)
825  {
826  __m128i block0, block1;
827  if (flags & BT_InBlockIsCounter)
828  {
829  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
830  // After the dup load we have two counters in the XMM word. Then we need
831  // to increment the low ctr by 0 and the high ctr by 1.
832  block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128(
833  _mm_loaddup_pd(CONST_DOUBLE_CAST(inBlocks))));
834 
835  // After initial increment of {0,1} remaining counters increment by {2,2}.
836  const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b);
837  block1 = _mm_add_epi32(be2, block0);
838 
839  // Store the next counter. UBsan false positive; mem_addr can be unaligned.
840  _mm_store_sd(DOUBLE_CAST(inBlocks),
841  _mm_castsi128_pd(_mm_add_epi64(be2, block1)));
842  }
843  else
844  {
845  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
846  inBlocks += inIncrement;
847  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
848  inBlocks += inIncrement;
849  }
850 
851  if (xorInput)
852  {
853  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
854  xorBlocks += xorIncrement;
855  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
856  xorBlocks += xorIncrement;
857  }
858 
859  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
860 
861  if (xorOutput)
862  {
863  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
864  xorBlocks += xorIncrement;
865  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
866  xorBlocks += xorIncrement;
867  }
868 
869  _mm_storeu_si128(M128_CAST(outBlocks), block0);
870  outBlocks += outIncrement;
871  _mm_storeu_si128(M128_CAST(outBlocks), block1);
872  outBlocks += outIncrement;
873 
874  length -= 2*xmmBlockSize;
875  }
876  }
877 
878  if (length)
879  {
880  // Adjust to real block size
881  if (flags & BT_ReverseDirection)
882  {
883  inIncrement += inIncrement ? blockSize : 0;
884  xorIncrement += xorIncrement ? blockSize : 0;
885  outIncrement += outIncrement ? blockSize : 0;
886  inBlocks -= inIncrement;
887  xorBlocks -= xorIncrement;
888  outBlocks -= outIncrement;
889  }
890  else
891  {
892  inIncrement -= inIncrement ? blockSize : 0;
893  xorIncrement -= xorIncrement ? blockSize : 0;
894  outIncrement -= outIncrement ? blockSize : 0;
895  }
896 
897  while (length >= blockSize)
898  {
899  __m128i block, zero = _mm_setzero_si128();
900  block = _mm_castpd_si128(
901  // UBsan false positive; mem_addr can be unaligned.
902  _mm_load_sd(CONST_DOUBLE_CAST(inBlocks)));
903 
904  if (xorInput)
905  {
906  block = _mm_xor_si128(block, _mm_castpd_si128(
907  // UBsan false positive; mem_addr can be unaligned.
908  _mm_load_sd(CONST_DOUBLE_CAST(xorBlocks))));
909  }
910 
911  if (flags & BT_InBlockIsCounter)
912  const_cast<byte *>(inBlocks)[7]++;
913 
914  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
915 
916  if (xorOutput)
917  {
918  block = _mm_xor_si128(block, _mm_castpd_si128(
919  // UBsan false positive; mem_addr can be unaligned.
920  _mm_load_sd(CONST_DOUBLE_CAST(xorBlocks))));
921  }
922 
923  // UBsan false positive; mem_addr can be unaligned.
924  _mm_store_sd(DOUBLE_CAST(outBlocks), _mm_castsi128_pd(block));
925 
926  inBlocks += inIncrement;
927  outBlocks += outIncrement;
928  xorBlocks += xorIncrement;
929  length -= blockSize;
930  }
931  }
932 
933  return length;
934 }
935 
936 template <typename F2, typename F6>
937 inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
938  const word64 *subKeys, size_t rounds, const byte *inBlocks,
939  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
940 {
941  CRYPTOPP_ASSERT(subKeys);
942  CRYPTOPP_ASSERT(inBlocks);
943  CRYPTOPP_ASSERT(outBlocks);
944  CRYPTOPP_ASSERT(length >= 16);
945 
946  CRYPTOPP_ALIGN_DATA(16)
947  const word32 s_one32x4[] = {0, 0, 0, 1<<24};
948 
949  const ptrdiff_t blockSize = 16;
950  // const ptrdiff_t xmmBlockSize = 16;
951 
952  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
953  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
954  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
955 
956  // Clang and Coverity generate findings when xorBlocks is used as a flag.
957  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
958  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
959 
960  if (flags & BT_ReverseDirection)
961  {
962  inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
963  xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
964  outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
965  inIncrement = 0-inIncrement;
966  xorIncrement = 0-xorIncrement;
967  outIncrement = 0-outIncrement;
968  }
969 
970  if (flags & BT_AllowParallel)
971  {
972  while (length >= 6*blockSize)
973  {
974  __m128i block0, block1, block2, block3, block4, block5;
975  if (flags & BT_InBlockIsCounter)
976  {
977  const __m128i be1 = *CONST_M128_CAST(s_one32x4);
978  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
979  block1 = _mm_add_epi32(block0, be1);
980  block2 = _mm_add_epi32(block1, be1);
981  block3 = _mm_add_epi32(block2, be1);
982  block4 = _mm_add_epi32(block3, be1);
983  block5 = _mm_add_epi32(block4, be1);
984  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1));
985  }
986  else
987  {
988  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
989  inBlocks += inIncrement;
990  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
991  inBlocks += inIncrement;
992  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
993  inBlocks += inIncrement;
994  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
995  inBlocks += inIncrement;
996  block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
997  inBlocks += inIncrement;
998  block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
999  inBlocks += inIncrement;
1000  }
1001 
1002  if (xorInput)
1003  {
1004  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1005  xorBlocks += xorIncrement;
1006  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1007  xorBlocks += xorIncrement;
1008  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1009  xorBlocks += xorIncrement;
1010  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1011  xorBlocks += xorIncrement;
1012  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1013  xorBlocks += xorIncrement;
1014  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1015  xorBlocks += xorIncrement;
1016  }
1017 
1018  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1019 
1020  if (xorOutput)
1021  {
1022  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1023  xorBlocks += xorIncrement;
1024  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1025  xorBlocks += xorIncrement;
1026  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1027  xorBlocks += xorIncrement;
1028  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1029  xorBlocks += xorIncrement;
1030  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1031  xorBlocks += xorIncrement;
1032  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1033  xorBlocks += xorIncrement;
1034  }
1035 
1036  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1037  outBlocks += outIncrement;
1038  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1039  outBlocks += outIncrement;
1040  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1041  outBlocks += outIncrement;
1042  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1043  outBlocks += outIncrement;
1044  _mm_storeu_si128(M128_CAST(outBlocks), block4);
1045  outBlocks += outIncrement;
1046  _mm_storeu_si128(M128_CAST(outBlocks), block5);
1047  outBlocks += outIncrement;
1048 
1049  length -= 6*blockSize;
1050  }
1051 
1052  while (length >= 2*blockSize)
1053  {
1054  __m128i block0, block1;
1055  if (flags & BT_InBlockIsCounter)
1056  {
1057  const __m128i be1 = *CONST_M128_CAST(s_one32x4);
1058  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1059  block1 = _mm_add_epi32(block0, be1);
1060  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1));
1061  }
1062  else
1063  {
1064  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1065  inBlocks += inIncrement;
1066  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1067  inBlocks += inIncrement;
1068  }
1069 
1070  if (xorInput)
1071  {
1072  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1073  xorBlocks += xorIncrement;
1074  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1075  xorBlocks += xorIncrement;
1076  }
1077 
1078  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1079 
1080  if (xorOutput)
1081  {
1082  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1083  xorBlocks += xorIncrement;
1084  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1085  xorBlocks += xorIncrement;
1086  }
1087 
1088  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1089  outBlocks += outIncrement;
1090  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1091  outBlocks += outIncrement;
1092 
1093  length -= 2*blockSize;
1094  }
1095  }
1096 
1097  while (length >= blockSize)
1098  {
1099  __m128i block, zero = _mm_setzero_si128();
1100  block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1101 
1102  if (xorInput)
1103  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1104 
1105  if (flags & BT_InBlockIsCounter)
1106  const_cast<byte *>(inBlocks)[15]++;
1107 
1108  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
1109 
1110  if (xorOutput)
1111  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1112 
1113  _mm_storeu_si128(M128_CAST(outBlocks), block);
1114 
1115  inBlocks += inIncrement;
1116  outBlocks += outIncrement;
1117  xorBlocks += xorIncrement;
1118  length -= blockSize;
1119  }
1120 
1121  return length;
1122 }
1123 
1124 template <typename F1, typename F4>
1125 inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
1126  MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
1127  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1128 {
1129  CRYPTOPP_ASSERT(subKeys);
1130  CRYPTOPP_ASSERT(inBlocks);
1131  CRYPTOPP_ASSERT(outBlocks);
1132  CRYPTOPP_ASSERT(length >= 16);
1133 
1134  CRYPTOPP_ALIGN_DATA(16)
1135  const word32 s_one32x4[] = {0, 0, 0, 1<<24};
1136 
1137  const ptrdiff_t blockSize = 16;
1138  // const ptrdiff_t xmmBlockSize = 16;
1139 
1140  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1141  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1142  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1143 
1144  // Clang and Coverity generate findings when xorBlocks is used as a flag.
1145  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1146  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1147 
1148  if (flags & BT_ReverseDirection)
1149  {
1150  inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1151  xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1152  outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1153  inIncrement = 0-inIncrement;
1154  xorIncrement = 0-xorIncrement;
1155  outIncrement = 0-outIncrement;
1156  }
1157 
1158  if (flags & BT_AllowParallel)
1159  {
1160  while (length >= 4*blockSize)
1161  {
1162  __m128i block0, block1, block2, block3;
1163  if (flags & BT_InBlockIsCounter)
1164  {
1165  const __m128i be1 = *CONST_M128_CAST(s_one32x4);
1166  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1167  block1 = _mm_add_epi32(block0, be1);
1168  block2 = _mm_add_epi32(block1, be1);
1169  block3 = _mm_add_epi32(block2, be1);
1170  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1));
1171  }
1172  else
1173  {
1174  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1175  inBlocks += inIncrement;
1176  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1177  inBlocks += inIncrement;
1178  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1179  inBlocks += inIncrement;
1180  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1181  inBlocks += inIncrement;
1182  }
1183 
1184  if (xorInput)
1185  {
1186  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1187  xorBlocks += xorIncrement;
1188  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1189  xorBlocks += xorIncrement;
1190  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1191  xorBlocks += xorIncrement;
1192  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1193  xorBlocks += xorIncrement;
1194  }
1195 
1196  func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1197 
1198  if (xorOutput)
1199  {
1200  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1201  xorBlocks += xorIncrement;
1202  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1203  xorBlocks += xorIncrement;
1204  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1205  xorBlocks += xorIncrement;
1206  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1207  xorBlocks += xorIncrement;
1208  }
1209 
1210  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1211  outBlocks += outIncrement;
1212  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1213  outBlocks += outIncrement;
1214  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1215  outBlocks += outIncrement;
1216  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1217  outBlocks += outIncrement;
1218 
1219  length -= 4*blockSize;
1220  }
1221  }
1222 
1223  while (length >= blockSize)
1224  {
1225  __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1226 
1227  if (xorInput)
1228  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1229 
1230  if (flags & BT_InBlockIsCounter)
1231  const_cast<byte *>(inBlocks)[15]++;
1232 
1233  func1(block, subKeys, static_cast<unsigned int>(rounds));
1234 
1235  if (xorOutput)
1236  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1237 
1238  _mm_storeu_si128(M128_CAST(outBlocks), block);
1239 
1240  inBlocks += inIncrement;
1241  outBlocks += outIncrement;
1242  xorBlocks += xorIncrement;
1243  length -= blockSize;
1244  }
1245 
1246  return length;
1247 }
1248 
1249 NAMESPACE_END // CryptoPP
1250 
1251 #endif // CRYPTOPP_SSSE3_AVAILABLE
1252 
1253 // *********************** Altivec/Power 4 ********************** //
1254 
1255 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
1256 
1257 NAMESPACE_BEGIN(CryptoPP)
1258 
1259 template <typename F1, typename F6>
1260 inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
1261  const word32 *subKeys, size_t rounds, const byte *inBlocks,
1262  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1263 {
1264  CRYPTOPP_ASSERT(subKeys);
1265  CRYPTOPP_ASSERT(inBlocks);
1266  CRYPTOPP_ASSERT(outBlocks);
1267  CRYPTOPP_ASSERT(length >= 16);
1268 
1269 #if defined(CRYPTOPP_LITTLE_ENDIAN)
1270  const uint32x4_p s_one = {1,0,0,0};
1271 #else
1272  const uint32x4_p s_one = {0,0,0,1};
1273 #endif
1274 
1275  const ptrdiff_t blockSize = 16;
1276  // const ptrdiff_t vexBlockSize = 16;
1277 
1278  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1279  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1280  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1281 
1282  // Clang and Coverity generate findings when xorBlocks is used as a flag.
1283  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1284  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1285 
1286  if (flags & BT_ReverseDirection)
1287  {
1288  inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1289  xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1290  outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1291  inIncrement = 0-inIncrement;
1292  xorIncrement = 0-xorIncrement;
1293  outIncrement = 0-outIncrement;
1294  }
1295 
1296  if (flags & BT_AllowParallel)
1297  {
1298  while (length >= 6*blockSize)
1299  {
1300  uint32x4_p block0, block1, block2, block3, block4, block5, temp;
1301 
1302  if (flags & BT_InBlockIsCounter)
1303  {
1304  block0 = VectorLoad(inBlocks);
1305  block1 = VectorAdd(block0, s_one);
1306  block2 = VectorAdd(block1, s_one);
1307  block3 = VectorAdd(block2, s_one);
1308  block4 = VectorAdd(block3, s_one);
1309  block5 = VectorAdd(block4, s_one);
1310  temp = VectorAdd(block5, s_one);
1311  VectorStore(temp, const_cast<byte*>(inBlocks));
1312  }
1313  else
1314  {
1315  block0 = VectorLoad(inBlocks);
1316  inBlocks += inIncrement;
1317  block1 = VectorLoad(inBlocks);
1318  inBlocks += inIncrement;
1319  block2 = VectorLoad(inBlocks);
1320  inBlocks += inIncrement;
1321  block3 = VectorLoad(inBlocks);
1322  inBlocks += inIncrement;
1323  block4 = VectorLoad(inBlocks);
1324  inBlocks += inIncrement;
1325  block5 = VectorLoad(inBlocks);
1326  inBlocks += inIncrement;
1327  }
1328 
1329  if (xorInput)
1330  {
1331  block0 = VectorXor(block0, VectorLoad(xorBlocks));
1332  xorBlocks += xorIncrement;
1333  block1 = VectorXor(block1, VectorLoad(xorBlocks));
1334  xorBlocks += xorIncrement;
1335  block2 = VectorXor(block2, VectorLoad(xorBlocks));
1336  xorBlocks += xorIncrement;
1337  block3 = VectorXor(block3, VectorLoad(xorBlocks));
1338  xorBlocks += xorIncrement;
1339  block4 = VectorXor(block4, VectorLoad(xorBlocks));
1340  xorBlocks += xorIncrement;
1341  block5 = VectorXor(block5, VectorLoad(xorBlocks));
1342  xorBlocks += xorIncrement;
1343  }
1344 
1345  func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
1346 
1347  if (xorOutput)
1348  {
1349  block0 = VectorXor(block0, VectorLoad(xorBlocks));
1350  xorBlocks += xorIncrement;
1351  block1 = VectorXor(block1, VectorLoad(xorBlocks));
1352  xorBlocks += xorIncrement;
1353  block2 = VectorXor(block2, VectorLoad(xorBlocks));
1354  xorBlocks += xorIncrement;
1355  block3 = VectorXor(block3, VectorLoad(xorBlocks));
1356  xorBlocks += xorIncrement;
1357  block4 = VectorXor(block4, VectorLoad(xorBlocks));
1358  xorBlocks += xorIncrement;
1359  block5 = VectorXor(block5, VectorLoad(xorBlocks));
1360  xorBlocks += xorIncrement;
1361  }
1362 
1363  VectorStore(block0, outBlocks);
1364  outBlocks += outIncrement;
1365  VectorStore(block1, outBlocks);
1366  outBlocks += outIncrement;
1367  VectorStore(block2, outBlocks);
1368  outBlocks += outIncrement;
1369  VectorStore(block3, outBlocks);
1370  outBlocks += outIncrement;
1371  VectorStore(block4, outBlocks);
1372  outBlocks += outIncrement;
1373  VectorStore(block5, outBlocks);
1374  outBlocks += outIncrement;
1375 
1376  length -= 6*blockSize;
1377  }
1378  }
1379 
1380  while (length >= blockSize)
1381  {
1382  uint32x4_p block = VectorLoad(inBlocks);
1383 
1384  if (xorInput)
1385  block = VectorXor(block, VectorLoad(xorBlocks));
1386 
1387  if (flags & BT_InBlockIsCounter)
1388  const_cast<byte *>(inBlocks)[15]++;
1389 
1390  func1(block, subKeys, rounds);
1391 
1392  if (xorOutput)
1393  block = VectorXor(block, VectorLoad(xorBlocks));
1394 
1395  VectorStore(block, outBlocks);
1396 
1397  inBlocks += inIncrement;
1398  outBlocks += outIncrement;
1399  xorBlocks += xorIncrement;
1400  length -= blockSize;
1401  }
1402 
1403  return length;
1404 }
1405 
1406 NAMESPACE_END // CryptoPP
1407 
1408 #endif // CRYPTOPP_ALTIVEC_AVAILABLE
1409 
1410 #endif // CRYPTOPP_ADVANCED_SIMD_TEMPLATES