Crypto++  6.1
Free C++ class library of cryptographic schemes
aria-simd.cpp
1 // aria-simd.cpp - written and placed in the public domain by
2 // Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3 //
4 // This source file uses intrinsics to gain access to ARMv7a and
5 // ARMv8a NEON instructions. A separate source file is needed
6 // because additional CXXFLAGS are required to enable the
7 // appropriate instruction sets in some build configurations.
8 
9 #include "pch.h"
10 #include "config.h"
11 #include "misc.h"
12 
13 #if (CRYPTOPP_SSSE3_AVAILABLE)
14 # include <tmmintrin.h>
15 #endif
16 
17 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
18 # include <arm_neon.h>
19 #endif
20 
21 // Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
22 // compilers don't follow ACLE conventions for the include.
23 #if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
24 # include <stdint.h>
25 # include <arm_acle.h>
26 #endif
27 
28 // Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
29 #define M128_CAST(x) ((__m128i *)(void *)(x))
30 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
31 
32 // GCC cast warning
33 #define UINT32_CAST(x) ((uint32_t *)(void *)(x))
34 #define CONST_UINT32_CAST(x) ((const uint32_t *)(const void *)(x))
35 
36 NAMESPACE_BEGIN(CryptoPP)
37 NAMESPACE_BEGIN(ARIATab)
38 
39 extern const word32 S1[256];
40 extern const word32 S2[256];
41 extern const word32 X1[256];
42 extern const word32 X2[256];
43 extern const word32 KRK[3][4];
44 
45 NAMESPACE_END
46 NAMESPACE_END
47 
48 NAMESPACE_BEGIN(CryptoPP)
49 
50 using CryptoPP::ARIATab::S1;
51 using CryptoPP::ARIATab::S2;
52 using CryptoPP::ARIATab::X1;
53 using CryptoPP::ARIATab::X2;
54 using CryptoPP::ARIATab::KRK;
55 
56 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
57 
58 template <unsigned int N>
59 inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16])
60 {
61  static const unsigned int Q1 = (4-(N/32)) % 4;
62  static const unsigned int Q2 = (3-(N/32)) % 4;
63  static const unsigned int R = N % 32;
64 
65  vst1q_u32(UINT32_CAST(RK),
66  veorq_u32(X, veorq_u32(
67  vshrq_n_u32(vextq_u32(Y, Y, Q1), R),
68  vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R))));
69 }
70 
71 void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen)
72 {
73  const uint32x4_t w0 = vld1q_u32(CONST_UINT32_CAST(ws+ 0));
74  const uint32x4_t w1 = vld1q_u32(CONST_UINT32_CAST(ws+ 8));
75  const uint32x4_t w2 = vld1q_u32(CONST_UINT32_CAST(ws+12));
76  const uint32x4_t w3 = vld1q_u32(CONST_UINT32_CAST(ws+16));
77 
78  ARIA_GSRK_NEON<19>(w0, w1, rk + 0);
79  ARIA_GSRK_NEON<19>(w1, w2, rk + 16);
80  ARIA_GSRK_NEON<19>(w2, w3, rk + 32);
81  ARIA_GSRK_NEON<19>(w3, w0, rk + 48);
82  ARIA_GSRK_NEON<31>(w0, w1, rk + 64);
83  ARIA_GSRK_NEON<31>(w1, w2, rk + 80);
84  ARIA_GSRK_NEON<31>(w2, w3, rk + 96);
85  ARIA_GSRK_NEON<31>(w3, w0, rk + 112);
86  ARIA_GSRK_NEON<67>(w0, w1, rk + 128);
87  ARIA_GSRK_NEON<67>(w1, w2, rk + 144);
88  ARIA_GSRK_NEON<67>(w2, w3, rk + 160);
89  ARIA_GSRK_NEON<67>(w3, w0, rk + 176);
90  ARIA_GSRK_NEON<97>(w0, w1, rk + 192);
91 
92  if (keylen > 16)
93  {
94  ARIA_GSRK_NEON<97>(w1, w2, rk + 208);
95  ARIA_GSRK_NEON<97>(w2, w3, rk + 224);
96 
97  if (keylen > 24)
98  {
99  ARIA_GSRK_NEON< 97>(w3, w0, rk + 240);
100  ARIA_GSRK_NEON<109>(w0, w1, rk + 256);
101  }
102  }
103 }
104 
105 void ARIA_ProcessAndXorBlock_Xor_NEON(const byte* xorBlock, byte* outBlock)
106 {
107  vst1q_u32(UINT32_CAST(outBlock), veorq_u32(
108  vld1q_u32(CONST_UINT32_CAST(outBlock)),
109  vld1q_u32(CONST_UINT32_CAST(xorBlock))));
110 }
111 
112 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
113 
114 #if (CRYPTOPP_SSSE3_AVAILABLE)
115 
116 inline byte ARIA_BRF(const word32 x, const int y) {
117  return GETBYTE(x, y);
118 }
119 
120 void ARIA_ProcessAndXorBlock_Xor_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t)
121 {
122  const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
123 
124  outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
125  outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
126  outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
127  outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
128  outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
129  outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
130  outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
131  outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
132  outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
133  outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
134  outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
135  outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
136  outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
137  outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
138  outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
139  outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
140 
141  // 'outBlock' may be unaligned.
142  _mm_storeu_si128(M128_CAST(outBlock),
143  _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(outBlock)),
144  _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK)));
145 
146  // 'outBlock' and 'xorBlock' may be unaligned.
147  if (xorBlock != NULLPTR)
148  {
149  _mm_storeu_si128(M128_CAST(outBlock),
150  _mm_xor_si128(
151  _mm_loadu_si128(CONST_M128_CAST(outBlock)),
152  _mm_loadu_si128(CONST_M128_CAST(xorBlock))));
153  }
154 }
155 
156 #endif // CRYPTOPP_SSSE3_AVAILABLE
157 
158 NAMESPACE_END
Utility functions for the Crypto++ library.
Library configuration file.
Precompiled header file.
Crypto++ library namespace.