Crypto++ 6.1
Free C++ class library of cryptographic schemes
simon-simd.cpp
1 // simon-simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
5 // source file is needed because additional CXXFLAGS are required to enable
6 // the appropriate instruction sets in some build configurations.
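// As an illustration, GCC and Clang typically need per-file flags such as
// -mssse3 and -msse4.1 for the x86 paths below (MSVC enables these intrinsics
// without extra flags), while AArch64 compilers provide NEON by default.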
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "simon.h"
12 #include "misc.h"
13 #include "adv-simd.h"
14 
15 // Uncomment for benchmarking C++ against SSE or NEON.
16 // Do so in both simon.cpp and simon-simd.cpp.
17 // #undef CRYPTOPP_SSSE3_AVAILABLE
18 // #undef CRYPTOPP_SSE41_AVAILABLE
19 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
20 
21 #if (CRYPTOPP_SSSE3_AVAILABLE)
22 # include <pmmintrin.h>
23 # include <tmmintrin.h>
24 #endif
25 
26 #if (CRYPTOPP_SSE41_AVAILABLE)
27 # include <smmintrin.h>
28 #endif
29 
30 #if defined(__AVX512F__) && defined(__AVX512VL__)
31 # define CRYPTOPP_AVX512_ROTATE 1
32 # include <immintrin.h>
33 #endif
34 
35 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
36 # include <arm_neon.h>
37 #endif
38 
39 // Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
40 // compilers don't follow ACLE conventions for the include.
41 #if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
42 # include <stdint.h>
43 # include <arm_acle.h>
44 #endif
45 
46 // https://www.spinics.net/lists/gcchelp/msg47735.html and
47 // https://www.spinics.net/lists/gcchelp/msg47749.html
48 #if (CRYPTOPP_GCC_VERSION >= 40900)
49 # define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
50 #else
51 # define GCC_NO_UBSAN
52 #endif
53 
54 ANONYMOUS_NAMESPACE_BEGIN
55 
56 using CryptoPP::byte;
57 using CryptoPP::word32;
58 using CryptoPP::word64;
59 using CryptoPP::rotlFixed;
60 using CryptoPP::rotrFixed;
61 using CryptoPP::vec_swap; // SunCC
62 
63 // *************************** ARM NEON ************************** //
64 
65 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
66 
67 template <class T>
68 inline T UnpackHigh32(const T& a, const T& b)
69 {
70  const uint32x2_t x(vget_high_u32((uint32x4_t)a));
71  const uint32x2_t y(vget_high_u32((uint32x4_t)b));
72  const uint32x2x2_t r = vzip_u32(x, y);
73  return (T)vcombine_u32(r.val[0], r.val[1]);
74 }
75 
76 template <class T>
77 inline T UnpackLow32(const T& a, const T& b)
78 {
79  const uint32x2_t x(vget_low_u32((uint32x4_t)a));
80  const uint32x2_t y(vget_low_u32((uint32x4_t)b));
81  const uint32x2x2_t r = vzip_u32(x, y);
82  return (T)vcombine_u32(r.val[0], r.val[1]);
83 }
84 
85 template <unsigned int R>
86 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
87 {
88  const uint32x4_t a(vshlq_n_u32(val, R));
89  const uint32x4_t b(vshrq_n_u32(val, 32 - R));
90  return vorrq_u32(a, b);
91 }
92 
93 template <unsigned int R>
94 inline uint32x4_t RotateRight32(const uint32x4_t& val)
95 {
96  const uint32x4_t a(vshlq_n_u32(val, 32 - R));
97  const uint32x4_t b(vshrq_n_u32(val, R));
98  return vorrq_u32(a, b);
99 }
100 
101 #if defined(__aarch32__) || defined(__aarch64__)
102 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
103 template <>
104 inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
105 {
106 #if defined(CRYPTOPP_BIG_ENDIAN)
107  const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
108  const uint8x16_t mask = vld1q_u8(maskb);
109 #else
110  const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
111  const uint8x16_t mask = vld1q_u8(maskb);
112 #endif
113 
114  return vreinterpretq_u32_u8(
115  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
116 }
117 
118 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
119 template <>
120 inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
121 {
122 #if defined(CRYPTOPP_BIG_ENDIAN)
123  const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
124  const uint8x16_t mask = vld1q_u8(maskb);
125 #else
126  const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
127  const uint8x16_t mask = vld1q_u8(maskb);
128 #endif
129 
130  return vreinterpretq_u32_u8(
131  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
132 }
133 #endif
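// Note on the masks above: the table lookup moves whole bytes rather than
// shifting bits. For a little-endian 32-bit lane holding bytes b0..b3, a left
// rotate by 8 produces [b3 b0 b1 b2] (index pattern 3,0,1,2) and a right
// rotate by 8 produces [b1 b2 b3 b0] (index pattern 1,2,3,0), which is exactly
// what the little-endian masks encode; the big-endian masks apply the same
// permutation with the byte order of each lane reversed.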
134 
135 inline uint32x4_t SIMON64_f(const uint32x4_t& val)
136 {
137  return veorq_u32(RotateLeft32<2>(val),
138  vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
139 }
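// A scalar sketch of the same round function, for reference (illustrative
// helper, not called below): each 32-bit lane of the vector code above
// computes f(x) = ROL2(x) ^ (ROL1(x) & ROL8(x)).
inline word32 SIMON64_f_Reference(word32 x)
{
 const word32 r1 = (x << 1) | (x >> 31);  // ROL1
 const word32 r2 = (x << 2) | (x >> 30);  // ROL2
 const word32 r8 = (x << 8) | (x >> 24);  // ROL8
 return r2 ^ (r1 & r8);
}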
140 
141 inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
142  const word32 *subkeys, unsigned int rounds)
143 {
144  // Rearrange the data for vectorization. The incoming data was read into
145  // a little-endian word array. Depending on the number of blocks it needs to
146  // be permuted to the following. If only a single block is available then
147  // a Zero block is provided to promote vectorization.
148  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
149  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
150  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
151 
152  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
153  {
154  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
155  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
156 
157  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
158  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
159  }
160 
161  if (rounds & 1)
162  {
163  const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);
164 
165  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
166  std::swap(x1, y1);
167  }
168 
169  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
170  block0 = UnpackLow32(y1, x1);
171  block1 = UnpackHigh32(y1, x1);
172 }
173 
174 inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
175  const word32 *subkeys, unsigned int rounds)
176 {
177  // Rearrange the data for vectorization. The incoming data was read into
178  // a little-endian word array. Depending on the number of blocks it needs to
179  // be permuted to the following. If only a single block is available then
180  // a Zero block is provided to promote vectorization.
181  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
182  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
183  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
184 
185  if (rounds & 1)
186  {
187  std::swap(x1, y1);
188  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
189 
190  y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
191  rounds--;
192  }
193 
194  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
195  {
196  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
197  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
198 
199  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
200  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
201  }
202 
203  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
204  block0 = UnpackLow32(y1, x1);
205  block1 = UnpackHigh32(y1, x1);
206 }
207 
208 inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
209  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
210  const word32 *subkeys, unsigned int rounds)
211 {
212  // Rearrange the data for vectorization. The incoming data was read into
213  // a little-endian word array. Depending on the number of blocks it needs to
214  // be permuted to the following. If only a single block is available then
215  // a Zero block is provided to promote vectorization.
216  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
217  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
218  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
219  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
220  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
221  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
222  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
223 
224  for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
225  {
226  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
227  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
228  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
229  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);
230 
231  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
232  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
233  x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
234  x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
235  }
236 
237  if (rounds & 1)
238  {
239  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
240 
241  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
242  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
243  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
244  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
245  }
246 
247  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
248  block0 = UnpackLow32(y1, x1);
249  block1 = UnpackHigh32(y1, x1);
250  block2 = UnpackLow32(y2, x2);
251  block3 = UnpackHigh32(y2, x2);
252  block4 = UnpackLow32(y3, x3);
253  block5 = UnpackHigh32(y3, x3);
254 }
255 
256 inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
257  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
258  const word32 *subkeys, unsigned int rounds)
259 {
260  // Rearrange the data for vectorization. The incoming data was read into
261  // a little-endian word array. Depending on the number of blocks it needs to
262  // be permuted to the following. If only a single block is available then
263  // a Zero block is provided to promote vectorization.
264  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
265  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
266  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
267  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
268  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
269  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
270  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
271 
272  if (rounds & 1)
273  {
274  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
275  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
276 
277  y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
278  y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
279  y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
280  rounds--;
281  }
282 
283  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
284  {
285  const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
286  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
287  x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
288  x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);
289 
290  const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
291  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
292  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
293  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
294  }
295 
296  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
297  block0 = UnpackLow32(y1, x1);
298  block1 = UnpackHigh32(y1, x1);
299  block2 = UnpackLow32(y2, x2);
300  block3 = UnpackHigh32(y2, x2);
301  block4 = UnpackLow32(y3, x3);
302  block5 = UnpackHigh32(y3, x3);
303 }
304 
305 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
306 
307 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
308 
309 template <class T>
310 inline T UnpackHigh64(const T& a, const T& b)
311 {
312  const uint64x1_t x(vget_high_u64((uint64x2_t)a));
313  const uint64x1_t y(vget_high_u64((uint64x2_t)b));
314  return (T)vcombine_u64(x, y);
315 }
316 
317 template <class T>
318 inline T UnpackLow64(const T& a, const T& b)
319 {
320  const uint64x1_t x(vget_low_u64((uint64x2_t)a));
321  const uint64x1_t y(vget_low_u64((uint64x2_t)b));
322  return (T)vcombine_u64(x, y);
323 }
324 
325 template <unsigned int R>
326 inline uint64x2_t RotateLeft64(const uint64x2_t& val)
327 {
328  const uint64x2_t a(vshlq_n_u64(val, R));
329  const uint64x2_t b(vshrq_n_u64(val, 64 - R));
330  return vorrq_u64(a, b);
331 }
332 
333 template <unsigned int R>
334 inline uint64x2_t RotateRight64(const uint64x2_t& val)
335 {
336  const uint64x2_t a(vshlq_n_u64(val, 64 - R));
337  const uint64x2_t b(vshrq_n_u64(val, R));
338  return vorrq_u64(a, b);
339 }
340 
341 #if defined(__aarch32__) || defined(__aarch64__)
342 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
343 template <>
344 inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
345 {
346 #if defined(CRYPTOPP_BIG_ENDIAN)
347  const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
348  const uint8x16_t mask = vld1q_u8(maskb);
349 #else
350  const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
351  const uint8x16_t mask = vld1q_u8(maskb);
352 #endif
353 
354  return vreinterpretq_u64_u8(
355  vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
356 }
357 
358 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
359 template <>
360 inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
361 {
362 #if defined(CRYPTOPP_BIG_ENDIAN)
363  const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
364  const uint8x16_t mask = vld1q_u8(maskb);
365 #else
366  const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
367  const uint8x16_t mask = vld1q_u8(maskb);
368 #endif
369 
370  return vreinterpretq_u64_u8(
371  vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
372 }
373 #endif
374 
375 inline uint64x2_t SIMON128_f(const uint64x2_t& val)
376 {
377  return veorq_u64(RotateLeft64<2>(val),
378  vandq_u64(RotateLeft64<1>(val), RotateLeft64<8>(val)));
379 }
380 
381 inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
382  const word64 *subkeys, unsigned int rounds)
383 {
384  // Rearrange the data for vectorization. The incoming data was read into
385  // a little-endian word array. Depending on the number of blocks it needs to
386  // be permuted to the following.
387  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
388  uint64x2_t x1 = UnpackHigh64(block0, block1);
389  uint64x2_t y1 = UnpackLow64(block0, block1);
390 
391  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
392  {
393  const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
394  y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
395 
396  const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
397  x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
398  }
399 
400  if (rounds & 1)
401  {
402  const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1);
403 
404  y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
405  std::swap(x1, y1);
406  }
407 
408  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
409  block0 = UnpackLow64(y1, x1);
410  block1 = UnpackHigh64(y1, x1);
411 }
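// For SIMON-128 the number of rounds depends on the key size (68, 69 or 72
// for 128-, 192- and 256-bit keys), so the odd-round tail above is exercised
// by SIMON-128/192; SIMON-64 uses 42 or 44 rounds, so its equivalent tail is
// not taken with standard parameters.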
412 
413 inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
414  uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
415  const word64 *subkeys, unsigned int rounds)
416 {
417  // Rearrange the data for vectorization. The incoming data was read into
418  // a little-endian word array. Depending on the number of blocks it needs to
419  // be permuted to the following.
420  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
421  uint64x2_t x1 = UnpackHigh64(block0, block1);
422  uint64x2_t y1 = UnpackLow64(block0, block1);
423  uint64x2_t x2 = UnpackHigh64(block2, block3);
424  uint64x2_t y2 = UnpackLow64(block2, block3);
425  uint64x2_t x3 = UnpackHigh64(block4, block5);
426  uint64x2_t y3 = UnpackLow64(block4, block5);
427 
428  for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
429  {
430  const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
431  y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
432  y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1);
433  y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1);
434 
435  const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
436  x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
437  x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2);
438  x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2);
439  }
440 
441  if (rounds & 1)
442  {
443  const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);
444 
445  y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
446  y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk);
447  y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk);
448  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
449  }
450 
451  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
452  block0 = UnpackLow64(y1, x1);
453  block1 = UnpackHigh64(y1, x1);
454  block2 = UnpackLow64(y2, x2);
455  block3 = UnpackHigh64(y2, x2);
456  block4 = UnpackLow64(y3, x3);
457  block5 = UnpackHigh64(y3, x3);
458 }
459 
460 inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
461  const word64 *subkeys, unsigned int rounds)
462 {
463  // Rearrange the data for vectorization. The incoming data was read into
464  // a little-endian word array. Depending on the number of blocks it needs to
465  // be permuted to the following.
466  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
467  uint64x2_t x1 = UnpackHigh64(block0, block1);
468  uint64x2_t y1 = UnpackLow64(block0, block1);
469 
470  if (rounds & 1)
471  {
472  std::swap(x1, y1);
473  const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);
474 
475  y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
476  rounds--;
477  }
478 
479  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
480  {
481  const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1);
482  x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
483 
484  const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i);
485  y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
486  }
487 
488  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
489  block0 = UnpackLow64(y1, x1);
490  block1 = UnpackHigh64(y1, x1);
491 }
492 
493 inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
494  uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
495  const word64 *subkeys, unsigned int rounds)
496 {
497  // Rearrange the data for vectorization. The incoming data was read into
498  // a little-endian word array. Depending on the number of blocks it needs to
499  // be permuted to the following.
500  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
501  uint64x2_t x1 = UnpackHigh64(block0, block1);
502  uint64x2_t y1 = UnpackLow64(block0, block1);
503  uint64x2_t x2 = UnpackHigh64(block2, block3);
504  uint64x2_t y2 = UnpackLow64(block2, block3);
505  uint64x2_t x3 = UnpackHigh64(block4, block5);
506  uint64x2_t y3 = UnpackLow64(block4, block5);
507 
508  if (rounds & 1)
509  {
510  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
511  const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);
512 
513  y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
514  y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2));
515  y3 = veorq_u64(veorq_u64(y3, rk), SIMON128_f(x3));
516  rounds--;
517  }
518 
519  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
520  {
521  const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1);
522  x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
523  x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1);
524  x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1);
525 
526  const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i);
527  y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
528  y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2);
529  y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
530  }
531 
532  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
533  block0 = UnpackLow64(y1, x1);
534  block1 = UnpackHigh64(y1, x1);
535  block2 = UnpackLow64(y2, x2);
536  block3 = UnpackHigh64(y2, x2);
537  block4 = UnpackLow64(y3, x3);
538  block5 = UnpackHigh64(y3, x3);
539 }
540 
541 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
542 
543 // ***************************** IA-32 ***************************** //
544 
545 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
546 
547 // Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
548 #ifndef M128_CAST
549 # define M128_CAST(x) ((__m128i *)(void *)(x))
550 #endif
551 #ifndef CONST_M128_CAST
552 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
553 #endif
554 
555 // GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
556 #ifndef DOUBLE_CAST
557 # define DOUBLE_CAST(x) ((double *)(void *)(x))
558 #endif
559 #ifndef CONST_DOUBLE_CAST
560 # define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
561 #endif
562 
563 inline void Swap128(__m128i& a,__m128i& b)
564 {
565 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
566  // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
567  // SunCC 12.1 - 12.3 fail to consume the swap, while SunCC 12.4 consumes it without -std=c++11.
568  vec_swap(a, b);
569 #else
570  std::swap(a, b);
571 #endif
572 }
573 
574 #if defined(CRYPTOPP_AVX512_ROTATE)
575 template <unsigned int R>
576 inline __m128i RotateLeft64(const __m128i& val)
577 {
578  return _mm_rol_epi64(val, R);
579 }
580 
581 template <unsigned int R>
582 inline __m128i RotateRight64(const __m128i& val)
583 {
584  return _mm_ror_epi64(val, R);
585 }
586 #else
587 template <unsigned int R>
588 inline __m128i RotateLeft64(const __m128i& val)
589 {
590  return _mm_or_si128(
591  _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
592 }
593 
594 template <unsigned int R>
595 inline __m128i RotateRight64(const __m128i& val)
596 {
597  return _mm_or_si128(
598  _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
599 }
600 
601 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
602 template <>
603 inline __m128i RotateLeft64<8>(const __m128i& val)
604 {
605  const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
606  return _mm_shuffle_epi8(val, mask);
607 }
608 
609 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
610 template <>
611 inline __m128i RotateRight64<8>(const __m128i& val)
612 {
613  const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
614  return _mm_shuffle_epi8(val, mask);
615 }
616 #endif // CRYPTOPP_AVX512_ROTATE
617 
618 inline __m128i SIMON128_f(const __m128i& v)
619 {
620  return _mm_xor_si128(RotateLeft64<2>(v),
621  _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v)));
622 }
623 
624 inline void GCC_NO_UBSAN SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
625  const word64 *subkeys, unsigned int rounds)
626 {
627  // Rearrange the data for vectorization. The incoming data was read into
628  // a little-endian word array. Depending on the number of blocks it needs to
629  // be permuted to the following.
630  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
631  __m128i x1 = _mm_unpackhi_epi64(block0, block1);
632  __m128i y1 = _mm_unpacklo_epi64(block0, block1);
633 
634  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
635  {
636  const __m128i rk1 = _mm_castpd_si128(
637  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
638  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
639 
640  const __m128i rk2 = _mm_castpd_si128(
641  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
642  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
643  }
644 
645  if (rounds & 1)
646  {
647  const __m128i rk = _mm_castpd_si128(
648  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+rounds-1)));
649 
650  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
651  Swap128(x1, y1);
652  }
653 
654  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
655  block0 = _mm_unpacklo_epi64(y1, x1);
656  block1 = _mm_unpackhi_epi64(y1, x1);
657 }
658 
659 inline void GCC_NO_UBSAN SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
660  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
661  const word64 *subkeys, unsigned int rounds)
662 {
663  // Rearrange the data for vectorization. The incoming data was read into
664  // a little-endian word array. Depending on the number of blocks it needs to
665  // be permuted to the following.
666  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
667  __m128i x1 = _mm_unpackhi_epi64(block0, block1);
668  __m128i y1 = _mm_unpacklo_epi64(block0, block1);
669  __m128i x2 = _mm_unpackhi_epi64(block2, block3);
670  __m128i y2 = _mm_unpacklo_epi64(block2, block3);
671  __m128i x3 = _mm_unpackhi_epi64(block4, block5);
672  __m128i y3 = _mm_unpacklo_epi64(block4, block5);
673 
674  for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
675  {
676  const __m128i rk1 = _mm_castpd_si128(
677  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
678  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
679  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
680  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);
681 
682  const __m128i rk2 = _mm_castpd_si128(
683  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
684  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
685  x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
686  x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
687  }
688 
689  if (rounds & 1)
690  {
691  const __m128i rk = _mm_castpd_si128(
692  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
693  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
694  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
695  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
696  Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
697  }
698 
699  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
700  block0 = _mm_unpacklo_epi64(y1, x1);
701  block1 = _mm_unpackhi_epi64(y1, x1);
702  block2 = _mm_unpacklo_epi64(y2, x2);
703  block3 = _mm_unpackhi_epi64(y2, x2);
704  block4 = _mm_unpacklo_epi64(y3, x3);
705  block5 = _mm_unpackhi_epi64(y3, x3);
706 }
707 
708 inline void GCC_NO_UBSAN SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
709  const word64 *subkeys, unsigned int rounds)
710 {
711  // Rearrange the data for vectorization. The incoming data was read into
712  // a little-endian word array. Depending on the number of blocks it needs to
713  // be permuted to the following.
714  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
715  __m128i x1 = _mm_unpackhi_epi64(block0, block1);
716  __m128i y1 = _mm_unpacklo_epi64(block0, block1);
717 
718  if (rounds & 1)
719  {
720  const __m128i rk = _mm_castpd_si128(
721  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
722 
723  Swap128(x1, y1);
724  y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
725  rounds--;
726  }
727 
728  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
729  {
730  const __m128i rk1 = _mm_castpd_si128(
731  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
732  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
733 
734  const __m128i rk2 = _mm_castpd_si128(
735  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
736  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
737  }
738 
739  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
740  block0 = _mm_unpacklo_epi64(y1, x1);
741  block1 = _mm_unpackhi_epi64(y1, x1);
742 }
743 
744 inline void GCC_NO_UBSAN SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
745  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
746  const word64 *subkeys, unsigned int rounds)
747 {
748  // Rearrange the data for vectorization. The incoming data was read into
749  // a little-endian word array. Depending on the number of blocks it needs to
750  // be permuted to the following.
751  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
752  __m128i x1 = _mm_unpackhi_epi64(block0, block1);
753  __m128i y1 = _mm_unpacklo_epi64(block0, block1);
754  __m128i x2 = _mm_unpackhi_epi64(block2, block3);
755  __m128i y2 = _mm_unpacklo_epi64(block2, block3);
756  __m128i x3 = _mm_unpackhi_epi64(block4, block5);
757  __m128i y3 = _mm_unpacklo_epi64(block4, block5);
758 
759  if (rounds & 1)
760  {
761  const __m128i rk = _mm_castpd_si128(
762  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
763 
764  Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
765  y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
766  y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2));
767  y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3));
768  rounds--;
769  }
770 
771  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
772  {
773  const __m128i rk1 = _mm_castpd_si128(
774  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
775  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
776  x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
777  x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);
778 
779  const __m128i rk2 = _mm_castpd_si128(
780  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
781  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
782  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
783  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);
784  }
785 
786  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
787  block0 = _mm_unpacklo_epi64(y1, x1);
788  block1 = _mm_unpackhi_epi64(y1, x1);
789  block2 = _mm_unpacklo_epi64(y2, x2);
790  block3 = _mm_unpackhi_epi64(y2, x2);
791  block4 = _mm_unpacklo_epi64(y3, x3);
792  block5 = _mm_unpackhi_epi64(y3, x3);
793 }
794 
795 #endif // CRYPTOPP_SSSE3_AVAILABLE
796 
797 #if defined(CRYPTOPP_SSE41_AVAILABLE)
798 
799 template <unsigned int R>
800 inline __m128i RotateLeft32(const __m128i& val)
801 {
802  return _mm_or_si128(
803  _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
804 }
805 
806 template <unsigned int R>
807 inline __m128i RotateRight32(const __m128i& val)
808 {
809  return _mm_or_si128(
810  _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
811 }
812 
813 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
814 template <>
815 inline __m128i RotateLeft32<8>(const __m128i& val)
816 {
817  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
818  return _mm_shuffle_epi8(val, mask);
819 }
820 
821 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
822 template <>
823 inline __m128i RotateRight32<8>(const __m128i& val)
824 {
825  const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
826  return _mm_shuffle_epi8(val, mask);
827 }
828 
829 inline __m128i SIMON64_f(const __m128i& v)
830 {
831  return _mm_xor_si128(RotateLeft32<2>(v),
832  _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
833 }
834 
835 inline void GCC_NO_UBSAN SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
836  const word32 *subkeys, unsigned int rounds)
837 {
838  // Rearrange the data for vectorization. The incoming data was read into
839  // a little-endian word array. Depending on the number of blocks it needs to
840  // be permuted to the following. Thanks to Peter Cordes for help with the
841  // SSE permutes below.
842  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
843  const __m128 t0 = _mm_castsi128_ps(block0);
844  const __m128 t1 = _mm_castsi128_ps(block1);
845  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
846  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
847 
848  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
849  {
850  const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
851  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
852 
853  const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
854  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
855  }
856 
857  if (rounds & 1)
858  {
859  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
860  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
861  Swap128(x1, y1);
862  }
863 
864  // This is roughly the SSE equivalent of ARM vzip32
865  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
866  block0 = _mm_unpacklo_epi32(y1, x1);
867  block1 = _mm_unpackhi_epi32(y1, x1);
868 }
869 
870 inline void GCC_NO_UBSAN SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
871  const word32 *subkeys, unsigned int rounds)
872 {
873  // Rearrange the data for vectorization. The incoming data was read into
874  // a little-endian word array. Depending on the number of blocks it needs to
875  // be permuted to the following. Thanks to Peter Cordes for help with the
876  // SSE permutes below.
877  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
878  const __m128 t0 = _mm_castsi128_ps(block0);
879  const __m128 t1 = _mm_castsi128_ps(block1);
880  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
881  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
882 
883  if (rounds & 1)
884  {
885  Swap128(x1, y1);
886  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
887  y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
888  rounds--;
889  }
890 
891  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
892  {
893  const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
894  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
895 
896  const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
897  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
898  }
899 
900  // This is roughly the SSE equivalent of ARM vzip32
901  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
902  block0 = _mm_unpacklo_epi32(y1, x1);
903  block1 = _mm_unpackhi_epi32(y1, x1);
904 }
905 
906 inline void GCC_NO_UBSAN SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
907  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
908  const word32 *subkeys, unsigned int rounds)
909 {
910  // Rearrange the data for vectorization. The incoming data was read into
911  // a little-endian word array. Depending on the number of blocks it needs to
912  // be permuted to the following. Thanks to Peter Cordes for help with the
913  // SSE permutes below.
914  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
915  const __m128 t0 = _mm_castsi128_ps(block0);
916  const __m128 t1 = _mm_castsi128_ps(block1);
917  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
918  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
919 
920  const __m128 t2 = _mm_castsi128_ps(block2);
921  const __m128 t3 = _mm_castsi128_ps(block3);
922  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
923  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
924 
925  const __m128 t4 = _mm_castsi128_ps(block4);
926  const __m128 t5 = _mm_castsi128_ps(block5);
927  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
928  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
929 
930  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
931  {
932  const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
933  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
934  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
935  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);
936 
937  const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
938  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
939  x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
940  x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
941  }
942 
943  if (rounds & 1)
944  {
945  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
946  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
947  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
948  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
949  Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
950  }
951 
952  // This is roughly the SSE equivalent of ARM vzip32
953  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
954  block0 = _mm_unpacklo_epi32(y1, x1);
955  block1 = _mm_unpackhi_epi32(y1, x1);
956  block2 = _mm_unpacklo_epi32(y2, x2);
957  block3 = _mm_unpackhi_epi32(y2, x2);
958  block4 = _mm_unpacklo_epi32(y3, x3);
959  block5 = _mm_unpackhi_epi32(y3, x3);
960 }
961 
962 inline void GCC_NO_UBSAN SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
963  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
964  const word32 *subkeys, unsigned int rounds)
965 {
966  // Rearrange the data for vectorization. The incoming data was read into
967  // a little-endian word array. Depending on the number of blocks it needs to
968  // be permuted to the following. Thanks to Peter Cordes for help with the
969  // SSE permutes below.
970  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
971  const __m128 t0 = _mm_castsi128_ps(block0);
972  const __m128 t1 = _mm_castsi128_ps(block1);
973  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
974  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
975 
976  const __m128 t2 = _mm_castsi128_ps(block2);
977  const __m128 t3 = _mm_castsi128_ps(block3);
978  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
979  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
980 
981  const __m128 t4 = _mm_castsi128_ps(block4);
982  const __m128 t5 = _mm_castsi128_ps(block5);
983  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
984  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
985 
986  if (rounds & 1)
987  {
988  Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
989  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
990  y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
991  y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
992  y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
993  rounds--;
994  }
995 
996  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
997  {
998  const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
999  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
1000  x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
1001  x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);
1002 
1003  const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
1004  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
1005  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
1006  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
1007  }
1008 
1009  // This is roughly the SSE equivalent of ARM vzip32
1010  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
1011  block0 = _mm_unpacklo_epi32(y1, x1);
1012  block1 = _mm_unpackhi_epi32(y1, x1);
1013  block2 = _mm_unpacklo_epi32(y2, x2);
1014  block3 = _mm_unpackhi_epi32(y2, x2);
1015  block4 = _mm_unpacklo_epi32(y3, x3);
1016  block5 = _mm_unpackhi_epi32(y3, x3);
1017 }
1018 
1019 #endif // CRYPTOPP_SSE41_AVAILABLE
1020 
1021 ANONYMOUS_NAMESPACE_END
1022 
1023 ///////////////////////////////////////////////////////////////////////
1024 
1025 NAMESPACE_BEGIN(CryptoPP)
1026 
1027 // *************************** ARM NEON **************************** //
1028 
1029 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
1030 size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1031  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1032 {
1033  return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
1034  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1035 }
1036 
1037 size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1038  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1039 {
1040  return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
1041  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1042 }
1043 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
1044 
1045 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
1046 size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
1047  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1048 {
1049  return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
1050  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1051 }
1052 
1053 size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
1054  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1055 {
1056  return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
1057  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1058 }
1059 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
1060 
1061 // ***************************** IA-32 ***************************** //
1062 
1063 #if defined(CRYPTOPP_SSE41_AVAILABLE)
1064 size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
1065  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1066 {
1067  return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
1068  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1069 }
1070 
1071 size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
1072  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1073 {
1074  return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
1075  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1076 }
1077 #endif
1078 
1079 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
1080 size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
1081  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1082 {
1083  return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
1084  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1085 }
1086 
1087 size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
1088  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1089 {
1090  return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
1091  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1092 }
1093 #endif // CRYPTOPP_SSSE3_AVAILABLE
1094 
1095 NAMESPACE_END
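// A sketch of how a caller typically reaches these routines (the member names
// shown are illustrative, not taken from this file): simon.cpp's
// AdvancedProcessBlocks override checks the CPU feature at run time and
// forwards, e.g.
//
//   #if defined(CRYPTOPP_SSSE3_AVAILABLE)
//     if (HasSSSE3())
//       return SIMON128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, m_rounds,
//         inBlocks, xorBlocks, outBlocks, length, flags);
//   #endif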