Crypto++  8.6
Free C++ class library of cryptographic schemes
rijndael.cpp
// rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
// and Wei Dai from Paulo Barreto's Rijndael implementation
// The original code and all modifications are in the public domain.

// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code

/*
July 2018: Added support for ARMv7 AES instructions via Cryptogams ASM.
    See the head notes in aes_armv4.S for copyright and license.
*/

/*
September 2017: Added support for Power8 AES instructions via compiler intrinsics.
*/

/*
July 2017: Added support for ARMv8 AES instructions via compiler intrinsics.
*/

/*
July 2010: Added support for AES-NI instructions via compiler intrinsics.
*/

/*
Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
and Peter Schwabe in their paper "New AES software speed records". The round
function was also modified to include a trick similar to one in Brian Gladman's
x86 assembly code, doing an 8-bit register move to minimize the number of
register spills. Also switched to compressed tables and copying round keys to
the stack.

The C++ implementation uses compressed tables if
CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined.
It is defined on x86 platforms by default but not on others.
*/
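
/*
Editorial note, not part of the original sources: a minimal sketch of the
"compressed table" layout that FillEncTable() builds below when
CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined. Each word64 entry
packs the column bytes (3*S(x), S(x), S(x), 2*S(x)) twice:

    word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
    Te[i] = word64(y | f3(x))<<32 | y;

Because the 4-byte pattern repeats inside the 8-byte entry, all four
rotations of the classic T-table word are available as unaligned word32
reads at offsets 1..4 from ((byte *)Te + x*8); that is what the TL_F and
TL_M macros below exploit, replacing four 1 KB tables with one 2 KB table.
*/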

/*
July 2006: Defense against timing attacks was added by Wei Dai.

The code now uses smaller tables in the first and last rounds,
and preloads them into L1 cache before use (by loading at least
one element in each cache line).

We try to delay subsequent accesses to each table (used in the first
and last rounds) until the whole table has been preloaded. Hopefully
the compiler isn't smart enough to optimize that code away.

After preloading the table, we also try not to access any memory location
other than the table and the stack, in order to prevent table entries from
being unloaded from L1 cache, until that round is finished.
(Some popular CPUs have 2-way associative caches.)
*/
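
/*
Editorial sketch (not in the original file) of the preload idiom the note
above describes, exactly as it appears in ProcessAndXorBlock() below: read
one word from every cache line of the table and fold the loads into a value
the optimizer cannot discard, then make the cipher state depend on it:

    const int cacheLineSize = GetCacheLineSize();
    volatile word32 _u = 0;
    word32 u = _u;
    for (unsigned int i=0; i<sizeof(Te); i+=cacheLineSize)
        u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;  // u is always 0, but the loads
                                         // must complete before the rounds

The volatile read and the data dependency keep the compiler from proving
the loop dead, so the whole table is resident in L1 before key-dependent
indexing begins.
*/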

// This is the original introductory comment:

/**
 * version 3.0 (December 2000)
 *
 * Optimised ANSI C code for the Rijndael cipher (now AES)
 *
 * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
 * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
 * author Paulo Barreto <paulo.barreto@terra.com.br>
 *
 * This code is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "pch.h"
#include "config.h"

#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "rijndael.h"
#include "misc.h"
#include "cpu.h"

// VS2017 and a global optimization bug. TODO: figure out when
// we can re-enable full optimizations for VS2017. Also see
// https://github.com/weidai11/cryptopp/issues/649
#if (_MSC_VER >= 1910)
# ifndef CRYPTOPP_DEBUG
# pragma optimize("", off)
# pragma optimize("ts", on)
# endif
#endif

NAMESPACE_BEGIN(CryptoPP)

// Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE))
# define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
#endif

// Clang intrinsic casts
#define M128I_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))

#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
# if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
# else
static word64 Te[256];
# endif
static word64 Td[256];
#else // Not CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
# if defined(CRYPTOPP_X64_MASM_AVAILABLE)
// Unused; avoids linker error on Microsoft X64 non-AESNI platforms
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
# endif
CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
#endif // CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS

static volatile bool s_TeFilled = false, s_TdFilled = false;

ANONYMOUS_NAMESPACE_BEGIN

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86

// Determine whether the range between begin and end overlaps
// with the same 4k block offsets as the Te table. Logically,
// the code is trying to create the condition:
//
// Two separate memory pages:
//
// +-----+   +-----+
// |XXXXX|   |YYYYY|
// |XXXXX|   |YYYYY|
// |     |   |     |
// |     |   |     |
// +-----+   +-----+
// Te Table   Locals
//
// Have a logical cache view of (X and Y may be inverted):
//
// +-----+
// |XXXXX|
// |XXXXX|
// |YYYYY|
// |YYYYY|
// +-----+
//
static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
    ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
    ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
    if (t1 > t0)
        return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
    else
        return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}

struct Locals
{
    word32 subkeys[4*12], workspace[8];
    const byte *inBlocks, *inXorBlocks, *outXorBlocks;
    byte *outBlocks;
    size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
    size_t regSpill, lengthAndCounterFlag, keysBegin;
};

const size_t s_aliasPageSize = 4096;
const size_t s_aliasBlockSize = 256;
const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);

#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86

ANONYMOUS_NAMESPACE_END

// ************************* Portable Code ************************************

#define QUARTER_ROUND(L, T, t, a, b, c, d) \
    a ^= L(T, 3, byte(t)); t >>= 8;\
    b ^= L(T, 2, byte(t)); t >>= 8;\
    c ^= L(T, 1, byte(t)); t >>= 8;\
    d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[d] = ((byte *)(Te+t))[1];

#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
    #define QUARTER_ROUND_LD(t, a, b, c, d) \
        tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
    #define QUARTER_ROUND_LD(t, a, b, c, d) \
        tempBlock[a] = Sd[byte(t)]; t >>= 8;\
        tempBlock[b] = Sd[byte(t)]; t >>= 8;\
        tempBlock[c] = Sd[byte(t)]; t >>= 8;\
        tempBlock[d] = Sd[t];
#endif

#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#if (CRYPTOPP_LITTLE_ENDIAN)
    #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
    #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
    #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
        #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
        #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
    #else
        #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
        #define TL_M(T, i, x) T[i*256 + x]
    #endif
#else
    #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
    #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
    #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
        #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
        #define TL_M TL_F
    #else
        #define TL_F(T, i, x) rotrFixed(T[x], i*8)
        #define TL_M(T, i, x) T[i*256 + x]
    #endif
#endif


#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
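
// Editorial note: the f* macros above are GF(2^8) multiplications modulo the
// AES polynomial x^8+x^4+x^3+x+1 (0x11b). Worked example for x = 0x80:
//   f2(0x80) = (0x80<<1) ^ 0x11b = 0x100 ^ 0x11b = 0x1b   (the xtime() step)
//   f3(0x80) = f2(0x80) ^ 0x80   = 0x9b
// Masks like ((x>>6)&2)*0x11b fold the conditional reductions of the second
// and third shifted-out bits into branch-free arithmetic.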

unsigned int Rijndael::Base::OptimalDataAlignment() const
{
#if (CRYPTOPP_AESNI_AVAILABLE)
    if (HasAESNI())
        return 16;  // load __m128i
#endif
#if (CRYPTOPP_ARM_AES_AVAILABLE)
    if (HasAES())
        return 4;  // load uint32x4_t
#endif
#if (CRYPTOGAMS_ARM_AES)
    // Must use 1 here for Cryptogams AES. Also see
    // https://github.com/weidai11/cryptopp/issues/683
    if (HasARMv7())
        return 1;
#endif
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
    if (HasAES())
        return 16;  // load uint32x4_p
#endif
    return BlockTransformation::OptimalDataAlignment();
}
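
/*
Editorial usage sketch (hypothetical caller, not in the original file): the
alignment hint pairs naturally with AlignedSecByteBlock from secblock.h,
whose 16-byte alignment satisfies every value returned above:

    #include "aes.h"
    #include "secblock.h"
    using namespace CryptoPP;

    byte key[16] = {0};
    AES::Encryption enc(key, 16);
    AlignedSecByteBlock in(enc.OptimalDataAlignment() * 4), out(in.size());
    // buffers aligned at least to OptimalDataAlignment(); the AES-NI and
    // Power8 paths can then use aligned 128-bit loads and stores
*/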

void Rijndael::Base::FillEncTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Se[i];
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
        word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        Te[i] = word64(y | f3(x))<<32 | y;
#else
        word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        for (int j=0; j<4; j++)
        {
            Te[i+j*256] = y;
            y = rotrConstant<8>(y);
        }
#endif
    }
#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    Te[256] = Te[257] = 0;
#endif
    s_TeFilled = true;
}

void Rijndael::Base::FillDecTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Sd[i];
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
        word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        Td[i] = word64(y | fb(x))<<32 | y | x;
#else
        word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        for (int j=0; j<4; j++)
        {
            Td[i+j*256] = y;
            y = rotrConstant<8>(y);
        }
#endif
    }
    s_TdFilled = true;
}

#if (CRYPTOPP_AESNI_AVAILABLE)
extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);

extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif

#if (CRYPTOPP_ARM_AES_AVAILABLE)
extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif

#if (CRYPTOGAMS_ARM_AES)
extern "C" int cryptogams_AES_set_encrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
extern "C" int cryptogams_AES_set_decrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
extern "C" void cryptogams_AES_encrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
extern "C" void cryptogams_AES_decrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
#endif

#if (CRYPTOPP_POWER8_AES_AVAILABLE)
extern void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen,
    word32* rk, const byte* Se);

extern size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif

#if (CRYPTOGAMS_ARM_AES)
int CRYPTOGAMS_set_encrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
{
    return cryptogams_AES_set_encrypt_key(userKey, bitLen, rkey);
}
int CRYPTOGAMS_set_decrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
{
    return cryptogams_AES_set_decrypt_key(userKey, bitLen, rkey);
}
void CRYPTOGAMS_encrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
{
    cryptogams_AES_encrypt_block(inBlock, outBlock, rkey);
    if (xorBlock)
        xorbuf(outBlock, xorBlock, 16);
}
void CRYPTOGAMS_decrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
{
    cryptogams_AES_decrypt_block(inBlock, outBlock, rkey);
    if (xorBlock)
        xorbuf(outBlock, xorBlock, 16);
}
#endif

std::string Rijndael::Base::AlgorithmProvider() const
{
#if (CRYPTOPP_AESNI_AVAILABLE)
    if (HasAESNI())
        return "AESNI";
#endif
#if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    if (HasSSE2())
        return "SSE2";
#endif
#if (CRYPTOPP_ARM_AES_AVAILABLE)
    if (HasAES())
        return "ARMv8";
#endif
#if (CRYPTOGAMS_ARM_AES)
    if (HasARMv7())
        return "ARMv7";
#endif
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
    if (HasAES())
        return "Power8";
#endif
    return "C++";
}
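
/*
Editorial usage sketch (hypothetical caller, not in the original file):
AlgorithmProvider() makes the runtime dispatch above observable, which is
handy when benchmarking or verifying that the expected code path is taken:

    #include "aes.h"
    #include <iostream>

    CryptoPP::AES::Encryption aes;
    std::cout << aes.AlgorithmProvider() << std::endl;
    // prints "AESNI", "SSE2", "ARMv8", "ARMv7", "Power8" or "C++"
*/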

void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
{
    AssertValidKeyLength(keyLen);

#if (CRYPTOGAMS_ARM_AES)
    if (HasARMv7())
    {
        m_rounds = keyLen/4 + 6;
        m_key.New(4*(14+1)+4);

        if (IsForwardTransformation())
            CRYPTOGAMS_set_encrypt_key(userKey, keyLen*8, m_key.begin());
        else
            CRYPTOGAMS_set_decrypt_key(userKey, keyLen*8, m_key.begin());
        return;
    }
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
    m_aliasBlock.New(s_sizeToAllocate);
    // The alias block is only used on IA-32 when unaligned data access is in effect.
    // Setting the low water mark to 0 avoids zeroization when m_aliasBlock is unused.
    m_aliasBlock.SetMark(0);
#endif

    m_rounds = keyLen/4 + 6;
    m_key.New(4*(m_rounds+1));
    word32 *rk = m_key;

#if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
    // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
    if (HasAESNI() && HasSSE41())
    {
        // TODO: Add a non-SSE4.1 variant for low-end Atoms. The low-end
        // Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
        Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
        if (!IsForwardTransformation())
            Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);

        return;
    }
#endif

#if CRYPTOPP_POWER8_AES_AVAILABLE
    if (HasAES())
    {
        // We still need rcon and Se to fall back to C/C++ for AES-192 and AES-256.
        // The IBM docs on AES suck. Intel's docs on AESNI put IBM to shame.
        Rijndael_UncheckedSetKey_POWER8(userKey, keyLen, rk, Se);
        return;
    }
#endif

    GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
    const word32 *rc = rcon;
    word32 temp;

    while (true)
    {
        temp = rk[keyLen/4-1];
        word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
                   (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
        rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
        rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
        rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
        rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];

        if (rk + keyLen/4 + 4 == m_key.end())
            break;

        if (keyLen == 24)
        {
            rk[10] = rk[ 4] ^ rk[ 9];
            rk[11] = rk[ 5] ^ rk[10];
        }
        else if (keyLen == 32)
        {
            temp = rk[11];
            rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
            rk[13] = rk[ 5] ^ rk[12];
            rk[14] = rk[ 6] ^ rk[13];
            rk[15] = rk[ 7] ^ rk[14];
        }
        rk += keyLen/4;
    }
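
    // Editorial note: the loop above is the FIPS-197 key schedule. For
    // AES-128 (keyLen/4 == 4) the first pass computes, in FIPS notation:
    //   w[4] = w[0] ^ SubWord(RotWord(w[3])) ^ Rcon[1]
    //   w[5] = w[1] ^ w[4];  w[6] = w[2] ^ w[5];  w[7] = w[3] ^ w[6]
    // The Se[GETBYTE(temp, ...)] shuffle implements SubWord(RotWord()) on
    // the big-endian words produced by GetUserKey() above.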

    rk = m_key;

    if (IsForwardTransformation())
    {
        if (!s_TeFilled)
            FillEncTable();

        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
    }
    else
    {
        if (!s_TdFilled)
            FillDecTable();

        #define InverseMixColumn(x) \
            TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
            TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

        unsigned int i, j;
        for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
        {
            temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
            temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
            temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
            temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
        }

        rk[i+0] = InverseMixColumn(rk[i+0]);
        rk[i+1] = InverseMixColumn(rk[i+1]);
        rk[i+2] = InverseMixColumn(rk[i+2]);
        rk[i+3] = InverseMixColumn(rk[i+3]);

        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
    }

#if CRYPTOPP_AESNI_AVAILABLE
    if (HasAESNI())
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
    if (HasAES())
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
}

void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
# if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    if (HasSSE2())
# else
    if (HasAESNI())
# endif
    {
        (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

#if (CRYPTOPP_ARM_AES_AVAILABLE)
    if (HasAES())
    {
        (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

#if (CRYPTOGAMS_ARM_AES)
    if (HasARMv7())
    {
        CRYPTOGAMS_encrypt(inBlock, xorBlock, outBlock, m_key.begin());
        return;
    }
#endif

#if (CRYPTOPP_POWER8_AES_AVAILABLE)
    if (HasAES())
    {
        (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

    // timing attack countermeasure. see comments at top for more details.
    // also see http://github.com/weidai11/cryptopp/issues/146
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    volatile word32 _u = 0;
    word32 u = _u;
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
    u &= Te[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
    QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
    QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
    QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

    // Nr - 2 full rounds:
    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_E(t3, s0, s1, s2, s3)
        QUARTER_ROUND_E(t2, s3, s0, s1, s2)
        QUARTER_ROUND_E(t1, s2, s3, s0, s1)
        QUARTER_ROUND_E(t0, s1, s2, s3, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_E(s3, t0, t1, t2, t3)
        QUARTER_ROUND_E(s2, t3, t0, t1, t2)
        QUARTER_ROUND_E(s1, t2, t3, t0, t1)
        QUARTER_ROUND_E(s0, t1, t2, t3, t0)

        rk += 8;
    } while (--r);

    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
    QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
    QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
    QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_AESNI_AVAILABLE
    if (HasAESNI())
    {
        (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

#if (CRYPTOPP_ARM_AES_AVAILABLE)
    if (HasAES())
    {
        (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

#if (CRYPTOGAMS_ARM_AES)
    if (HasARMv7())
    {
        CRYPTOGAMS_decrypt(inBlock, xorBlock, outBlock, m_key.begin());
        return;
    }
#endif

#if (CRYPTOPP_POWER8_AES_AVAILABLE)
    if (HasAES())
    {
        (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

    // timing attack countermeasure. see comments at top for more details.
    // also see http://github.com/weidai11/cryptopp/issues/146
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    volatile word32 _u = 0;
    word32 u = _u;
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
    u &= Td[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
    QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
    QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
    QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

    // Nr - 2 full rounds:
    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_D(t3, s2, s1, s0, s3)
        QUARTER_ROUND_D(t2, s1, s0, s3, s2)
        QUARTER_ROUND_D(t1, s0, s3, s2, s1)
        QUARTER_ROUND_D(t0, s3, s2, s1, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_D(s3, t2, t1, t0, t3)
        QUARTER_ROUND_D(s2, t1, t0, t3, t2)
        QUARTER_ROUND_D(s1, t0, t3, t2, t1)
        QUARTER_ROUND_D(s0, t3, t2, t1, t0)

        rk += 8;
    } while (--r);

#if !(defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
    // timing attack countermeasure. see comments at top for more details.
    // If CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined,
    // QUARTER_ROUND_LD will use Td, which is already preloaded.
    u = _u;
    for (i=0; i<256; i+=cacheLineSize)
        u &= *(const word32 *)(const void *)(Sd+i);
    u &= *(const word32 *)(const void *)(Sd+252);
    t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
    QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
    QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
    QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

// ************************* Assembly Code ************************************

#if CRYPTOPP_MSC_VERSION
# pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
#endif

#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)

CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k)
{
    CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);

#if CRYPTOPP_BOOL_X86

#define L_REG esp
#define L_INDEX(i) (L_REG+768+i)
#define L_INXORBLOCKS L_INBLOCKS+4
#define L_OUTXORBLOCKS L_INBLOCKS+8
#define L_OUTBLOCKS L_INBLOCKS+12
#define L_INCREMENTS L_INDEX(16*15)
#define L_SP L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*16+4)
#define L_KEYS_BEGIN L_INDEX(16*16+8)

#define MOVD movd
#define MM(i) mm##i

#define MXOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
    AS2( pxor MM(a), mm7)\

#define MMOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_REG r8
#define L_INDEX(i) (L_REG+i)
#define L_INXORBLOCKS L_INBLOCKS+8
#define L_OUTXORBLOCKS L_INBLOCKS+16
#define L_OUTBLOCKS L_INBLOCKS+24
#define L_INCREMENTS L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*18+8)
#define L_KEYS_BEGIN L_INDEX(16*19)

#define MOVD mov
#define MM_0 r9d
#define MM_1 r12d
#ifdef __GNUC__
#define MM_2 r11d
#else
#define MM_2 r10d
#endif
#define MM(i) MM_##i

#define MXOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

#define L_SUBKEYS L_INDEX(0)
#define L_SAVED_X L_SUBKEYS
#define L_KEY12 L_INDEX(16*12)
#define L_LASTROUND L_INDEX(16*13)
#define L_INBLOCKS L_INDEX(16*14)
#define MAP0TO4(i) (ASM_MOD(i+3,4)+1)

#define XOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM
    ALIGN 8
    Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
    rex_push_reg rsi
    push_reg rdi
    push_reg rbx
    push_reg r12
    .endprolog
    mov L_REG, rcx
    mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
    mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
    __asm__ __volatile__
    (
    INTEL_NOPREFIX
    #if CRYPTOPP_BOOL_X64
    AS2( mov L_REG, rcx)
    #endif
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2( mov AS_REG_7, WORD_REG(si))
#else
    AS_PUSH_IF86(si)
    AS_PUSH_IF86(di)
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2( lea AS_REG_7, [Te])
    AS2( mov edi, [g_cacheLineSize])
#endif

#if CRYPTOPP_BOOL_X86
    AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
    AS2( lea esp, [ecx-768])
#endif

    // copy subkeys to stack
    AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
    AS2( mov WORD_REG(ax), 16)
    AS2( and WORD_REG(ax), WORD_REG(si))
    AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
    AS2( movdqa [L_KEY12], xmm3)
    AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
    AS2( sub WORD_REG(ax), WORD_REG(si))
    ASL(0)
    AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
    AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
    AS2( add WORD_REG(si), 16)
    AS2( cmp WORD_REG(si), 16*12)
    ATT_NOPREFIX
    ASJ( jl, 0, b)
    INTEL_NOPREFIX

    // read subkeys 0, 1 and last
    AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
    AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
    AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
    AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
    AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
    AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15

    // load table into cache
    AS2( xor WORD_REG(ax), WORD_REG(ax))
    ASL(9)
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( cmp WORD_REG(ax), 2048)
    ATT_NOPREFIX
    ASJ( jl, 9, b)
    INTEL_NOPREFIX
    AS1( lfence)

    AS2( test DWORD PTR [L_LENGTH], 1)
    ATT_NOPREFIX
    ASJ( jz, 8, f)
    INTEL_NOPREFIX

    // counter mode one-time setup
    AS2( mov WORD_REG(si), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(si)]) // counter
    AS2( pxor xmm2, xmm1)
    AS2( psrldq xmm1, 14)
    AS2( movd eax, xmm1)
    AS2( mov al, BYTE PTR [WORD_REG(si)+15])
    AS2( MOVD MM(2), eax)
#if CRYPTOPP_BOOL_X86
    AS2( mov eax, 1)
    AS2( movd mm3, eax)
#endif

    // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
    AS2( movd eax, xmm2)
    AS2( psrldq xmm2, 4)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    MXOR( 1, al, 0) // 0
    XOR( edx, ah, 1) // 1
    AS2( shr eax, 16)
    XOR( ecx, al, 2) // 2
    XOR( ebx, ah, 3) // 3
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    XOR( ebx, al, 0) // 4
    MXOR( 1, ah, 1) // 5
    AS2( shr eax, 16)
    XOR( edx, al, 2) // 6
    XOR( ecx, ah, 3) // 7
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    XOR( ecx, al, 0) // 8
    XOR( ebx, ah, 1) // 9
    AS2( shr eax, 16)
    MXOR( 1, al, 2) // 10
    XOR( edx, ah, 3) // 11
    AS2( mov eax, edi)
    XOR( edx, al, 0) // 12
    XOR( ecx, ah, 1) // 13
    AS2( shr eax, 16)
    XOR( ebx, al, 2) // 14
    AS2( psrldq xmm2, 3)

    // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
    AS2( mov eax, [L_KEY12+0*4])
    AS2( mov edi, [L_KEY12+2*4])
    AS2( MOVD MM(0), [L_KEY12+3*4])
    MXOR( 0, cl, 3) /* 11 */
    XOR( edi, bl, 3) /* 7 */
    MXOR( 0, bh, 2) /* 6 */
    AS2( shr ebx, 16) /* 4,5 */
    XOR( eax, bl, 1) /* 5 */
    MOV( ebx, bh, 0) /* 4 */
    AS2( xor ebx, [L_KEY12+1*4])
    XOR( eax, ch, 2) /* 10 */
    AS2( shr ecx, 16) /* 8,9 */
    XOR( eax, dl, 3) /* 15 */
    XOR( ebx, dh, 2) /* 14 */
    AS2( shr edx, 16) /* 12,13 */
    XOR( edi, ch, 0) /* 8 */
    XOR( ebx, cl, 1) /* 9 */
    XOR( edi, dl, 1) /* 13 */
    MXOR( 0, dh, 0) /* 12 */

    AS2( movd ecx, xmm2)
    AS2( MOVD edx, MM(1))
    AS2( MOVD [L_SAVED_X+3*4], MM(0))
    AS2( mov [L_SAVED_X+0*4], eax)
    AS2( mov [L_SAVED_X+1*4], ebx)
    AS2( mov [L_SAVED_X+2*4], edi)
    ATT_NOPREFIX
    ASJ( jmp, 5, f)
    INTEL_NOPREFIX
    ASL(3)
    // non-counter mode per-block setup
    AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
    AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
    AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
    AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
    ASL(8)
    AS2( mov WORD_REG(ax), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( mov WORD_REG(si), [L_INXORBLOCKS])
    AS2( movdqu xmm5, [WORD_REG(si)])
    AS2( pxor xmm2, xmm1)
    AS2( pxor xmm2, xmm5)

    // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
    AS2( movd eax, xmm2)
    AS2( psrldq xmm2, 4)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    MXOR( 1, al, 0) // 0
    XOR( edx, ah, 1) // 1
    AS2( shr eax, 16)
    XOR( ecx, al, 2) // 2
    XOR( ebx, ah, 3) // 3
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    XOR( ebx, al, 0) // 4
    MXOR( 1, ah, 1) // 5
    AS2( shr eax, 16)
    XOR( edx, al, 2) // 6
    XOR( ecx, ah, 3) // 7
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    XOR( ecx, al, 0) // 8
    XOR( ebx, ah, 1) // 9
    AS2( shr eax, 16)
    MXOR( 1, al, 2) // 10
    XOR( edx, ah, 3) // 11
    AS2( mov eax, edi)
    XOR( edx, al, 0) // 12
    XOR( ecx, ah, 1) // 13
    AS2( shr eax, 16)
    XOR( ebx, al, 2) // 14
    MXOR( 1, ah, 3) // 15
    AS2( MOVD eax, MM(1))

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 4*16)
    ATT_NOPREFIX
    ASJ( jmp, 2, f)
    INTEL_NOPREFIX
    ASL(1)
    // counter-mode per-block setup
    AS2( MOVD ecx, MM(2))
    AS2( MOVD edx, MM(1))
    AS2( mov eax, [L_SAVED_X+0*4])
    AS2( mov ebx, [L_SAVED_X+1*4])
    AS2( xor cl, ch)
    AS2( and WORD_REG(cx), 255)
    ASL(5)
#if CRYPTOPP_BOOL_X86
    AS2( paddb MM(2), mm3)
#else
    AS2( add MM(2), 1)
#endif
    // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
    AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
    XOR( ebx, dl, 3)
    MOV( ecx, dh, 2)
    AS2( shr edx, 16)
    AS2( xor ecx, [L_SAVED_X+2*4])
    XOR( eax, dh, 0)
    MOV( edx, dl, 1)
    AS2( xor edx, [L_SAVED_X+3*4])

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 3*16)
    ATT_NOPREFIX
    ASJ( jmp, 4, f)
    INTEL_NOPREFIX

// in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
// out: eax, ebx, edi, mm0
#define ROUND() \
    MXOR( 0, cl, 3) /* 11 */\
    AS2( mov cl, al) /* 8,9,10,3 */\
    XOR( edi, ah, 2) /* 2 */\
    AS2( shr eax, 16) /* 0,1 */\
    XOR( edi, bl, 3) /* 7 */\
    MXOR( 0, bh, 2) /* 6 */\
    AS2( shr ebx, 16) /* 4,5 */\
    MXOR( 0, al, 1) /* 1 */\
    MOV( eax, ah, 0) /* 0 */\
    XOR( eax, bl, 1) /* 5 */\
    MOV( ebx, bh, 0) /* 4 */\
    XOR( eax, ch, 2) /* 10 */\
    XOR( ebx, cl, 3) /* 3 */\
    AS2( shr ecx, 16) /* 8,9 */\
    XOR( eax, dl, 3) /* 15 */\
    XOR( ebx, dh, 2) /* 14 */\
    AS2( shr edx, 16) /* 12,13 */\
    XOR( edi, ch, 0) /* 8 */\
    XOR( ebx, cl, 1) /* 9 */\
    XOR( edi, dl, 1) /* 13 */\
    MXOR( 0, dh, 0) /* 12 */\

    ASL(2) // 2-round loop
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
    AS2( mov edi, [L_SUBKEYS-4*16+2*4])
    ROUND()
    AS2( mov ecx, edi)
    AS2( xor eax, [L_SUBKEYS-4*16+0*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
    AS2( MOVD edx, MM(0))

    ASL(4)
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
    AS2( mov edi, [L_SUBKEYS-4*16+6*4])
    ROUND()
    AS2( mov ecx, edi)
    AS2( xor eax, [L_SUBKEYS-4*16+4*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
    AS2( MOVD edx, MM(0))

    AS2( add L_REG, 32)
    AS2( test L_REG, 255)
    ATT_NOPREFIX
    ASJ( jnz, 2, b)
    INTEL_NOPREFIX
    AS2( sub L_REG, 16*16)

#define LAST(a, b, c) \
    AS2( movzx esi, a )\
    AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
    AS2( movzx esi, b )\
    AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
    AS2( mov WORD PTR [L_LASTROUND+c], di )\

    // last round
    LAST(ch, dl, 2)
    LAST(dh, al, 6)
    AS2( shr edx, 16)
    LAST(ah, bl, 10)
    AS2( shr eax, 16)
    LAST(bh, cl, 14)
    AS2( shr ebx, 16)
    LAST(dh, al, 12)
    AS2( shr ecx, 16)
    LAST(ah, bl, 0)
    LAST(bh, cl, 4)
    LAST(ch, dl, 8)

    AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
    AS2( mov WORD_REG(bx), [L_OUTBLOCKS])

    AS2( mov WORD_REG(cx), [L_LENGTH])
    AS2( sub WORD_REG(cx), 16)

    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( pxor xmm2, xmm4)

#if CRYPTOPP_BOOL_X86
    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddd xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
#else
    AS2( movdqa xmm0, [L_INCREMENTS+16])
    AS2( paddq xmm0, [L_INBLOCKS+16])
    AS2( movdqa [L_INBLOCKS+16], xmm0)
#endif

    AS2( pxor xmm2, [L_LASTROUND])
    AS2( movdqu [WORD_REG(bx)], xmm2)

    ATT_NOPREFIX
    ASJ( jle, 7, f)
    INTEL_NOPREFIX
    AS2( mov [L_LENGTH], WORD_REG(cx))
    AS2( test WORD_REG(cx), 1)
    ATT_NOPREFIX
    ASJ( jnz, 1, b)
    INTEL_NOPREFIX
#if CRYPTOPP_BOOL_X64
    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddq xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
#endif
    ATT_NOPREFIX
    ASJ( jmp, 3, b)
    INTEL_NOPREFIX

    ASL(7)
    // erase keys on stack
    AS2( xorps xmm0, xmm0)
    AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
    AS2( movaps [WORD_REG(ax)-7*16], xmm0)
    AS2( movaps [WORD_REG(ax)-6*16], xmm0)
    AS2( movaps [WORD_REG(ax)-5*16], xmm0)
    AS2( movaps [WORD_REG(ax)-4*16], xmm0)
    AS2( movaps [WORD_REG(ax)-3*16], xmm0)
    AS2( movaps [WORD_REG(ax)-2*16], xmm0)
    AS2( movaps [WORD_REG(ax)-1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+0*16], xmm0)
    AS2( movaps [WORD_REG(ax)+1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+2*16], xmm0)
    AS2( movaps [WORD_REG(ax)+3*16], xmm0)
    AS2( movaps [WORD_REG(ax)+4*16], xmm0)
    AS2( movaps [WORD_REG(ax)+5*16], xmm0)
    AS2( movaps [WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86
    AS2( mov esp, [L_SP])
    AS1( emms)
#endif
    AS_POP_IF86(bp)
    AS_POP_IF86(bx)
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
    AS_POP_IF86(di)
    AS_POP_IF86(si)
    AS1(ret)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
    pop r12
    pop rbx
    pop rdi
    pop rsi
    ret
    Rijndael_Enc_AdvancedProcessBlocks ENDP
#endif
#ifdef __GNUC__
    ATT_PREFIX
        :
        : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
        : "memory", "cc", "%eax"
    #if CRYPTOPP_BOOL_X64
        , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
    #endif
    );
#endif
}

#endif

#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k);
}
#endif

#if CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_AESNI_AVAILABLE
    if (HasAESNI())
        return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
    if (HasAES())
        return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_POWER8_AES_AVAILABLE
    if (HasAES())
        return Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    if (HasSSE2())
    {
        if (length < BLOCKSIZE)
            return length;

        static const byte *zeros = (const byte*)(Te+256);
        m_aliasBlock.SetMark(m_aliasBlock.size());
        byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data());

        // round up to the nearest 256-byte boundary
        space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
        while (AliasedWithTable(space, space + sizeof(Locals)))
        {
            space += 256;
            CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
        }

        size_t increment = BLOCKSIZE;
        if (flags & BT_ReverseDirection)
        {
            CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
            inBlocks += length - BLOCKSIZE;
            xorBlocks += length - BLOCKSIZE;
            outBlocks += length - BLOCKSIZE;
            increment = 0-increment;
        }

        Locals &locals = *(Locals *)(void *)space;

        locals.inBlocks = inBlocks;
        locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
        locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
        locals.outBlocks = outBlocks;

        locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
        locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
        locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
        locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

        locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
        int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
        locals.keysBegin = (12-keysToCopy)*16;

        Rijndael_Enc_AdvancedProcessBlocks_SSE2(&locals, m_key);

        return length % BLOCKSIZE;
    }
#endif

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_AESNI_AVAILABLE
    if (HasAESNI())
        return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
    if (HasAES())
        return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_POWER8_AES_AVAILABLE
    if (HasAES())
        return Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS

NAMESPACE_END

#endif
#endif