xref: /freebsd/crypto/krb5/src/lib/crypto/builtin/aes/aesopt.h (revision 7f2fe78b9dd5f51c821d771b63d2e096f6fd49e9)
1 /*
2 ---------------------------------------------------------------------------
3 Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
4 
5 The redistribution and use of this software (with or without changes)
6 is allowed without the payment of fees or royalties provided that:
7 
8   source code distributions include the above copyright notice, this
9   list of conditions and the following disclaimer;
10 
11   binary distributions include the above copyright notice, this list
12   of conditions and the following disclaimer in their documentation.
13 
14 This software is provided 'as is' with no explicit or implied warranties
15 in respect of its operation, including, but not limited to, correctness
16 and fitness for purpose.
17 ---------------------------------------------------------------------------
18 Issue Date: 20/12/2007
19 
20  This file contains the compilation options for AES (Rijndael) and code
21  that is common across encryption, key scheduling and table generation.
22 
23  OPERATION
24 
25  These source code files implement the AES algorithm Rijndael designed by
26  Joan Daemen and Vincent Rijmen. This version is designed for the standard
27  block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24
28  and 32 bytes).
29 
30  This version is designed for flexibility and speed using operations on
31  32-bit words rather than operations on bytes.  It can be compiled with
32  either big or little endian internal byte order but is faster when the
33  native byte order for the processor is used.
34 
35  THE CIPHER INTERFACE
36 
37  The cipher interface is implemented as an array of bytes in which lower
38  AES bit sequence indexes map to higher numeric significance within bytes.
39 
40   uint8_t                 (an unsigned  8-bit type)
41   uint32_t                (an unsigned 32-bit type)
42   struct aes_encrypt_ctx  (structure for the cipher encryption context)
43   struct aes_decrypt_ctx  (structure for the cipher decryption context)
44   AES_RETURN                the function return type
45 
46   C subroutine calls:
47 
48   AES_RETURN aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]);
49   AES_RETURN aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]);
50   AES_RETURN aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]);
51   AES_RETURN aes_encrypt(const unsigned char *in, unsigned char *out,
52                                                   const aes_encrypt_ctx cx[1]);
53 
54   AES_RETURN aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]);
55   AES_RETURN aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]);
56   AES_RETURN aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]);
57   AES_RETURN aes_decrypt(const unsigned char *in, unsigned char *out,
58                                                   const aes_decrypt_ctx cx[1]);
59 
60  IMPORTANT NOTE: If you are using this C interface with dynamic tables make sure that
61  you call aes_init() before AES is used so that the tables are initialised.
62 
63  C++ aes class subroutines:
64 
65      Class AESencrypt  for encryption
66 
67       Constructors:
68           AESencrypt(void)
69           AESencrypt(const unsigned char *key) - 128 bit key
70       Members:
71           AES_RETURN key128(const unsigned char *key)
72           AES_RETURN key192(const unsigned char *key)
73           AES_RETURN key256(const unsigned char *key)
74           AES_RETURN encrypt(const unsigned char *in, unsigned char *out) const
75 
      Class AESdecrypt  for decryption
77       Constructors:
78           AESdecrypt(void)
79           AESdecrypt(const unsigned char *key) - 128 bit key
80       Members:
81           AES_RETURN key128(const unsigned char *key)
82           AES_RETURN key192(const unsigned char *key)
83           AES_RETURN key256(const unsigned char *key)
84           AES_RETURN decrypt(const unsigned char *in, unsigned char *out) const
85 */
86 
87 #if !defined( _AESOPT_H )
88 #define _AESOPT_H
89 
90 #if defined( __cplusplus )
91 #include "aescpp.h"
92 #else
93 #include "aes.h"
94 #endif
95 
96 /*  PLATFORM SPECIFIC INCLUDES */
97 
98 #include "brg_endian.h"
99 
100 /*  CONFIGURATION - THE USE OF DEFINES
101 
102     Later in this section there are a number of defines that control the
103     operation of the code.  In each section, the purpose of each define is
104     explained so that the relevant form can be included or excluded by
105     setting either 1's or 0's respectively on the branches of the related
106     #if clauses.  The following local defines should not be changed.
107 */
108 
/* bit flags used in the masks that record which operations are built in C */
#define ENCRYPTION_IN_C     0x01
#define DECRYPTION_IN_C     0x02
#define ENC_KEYING_IN_C     0x04
#define DEC_KEYING_IN_C     0x08

/* table counts selectable for the round functions and the key schedule */
#define NO_TABLES           0
#define ONE_TABLE           1
#define FOUR_TABLES         4

/* loop unrolling levels selectable for encryption and decryption */
#define NONE                0
#define PARTIAL             1
#define FULL                2
120 
121 /*  --- START OF USER CONFIGURED OPTIONS --- */
122 
123 /*  1. BYTE ORDER WITHIN 32 BIT WORDS
124 
125     The fundamental data processing units in Rijndael are 8-bit bytes. The
126     input, output and key input are all enumerated arrays of bytes in which
127     bytes are numbered starting at zero and increasing to one less than the
128     number of bytes in the array in question. This enumeration is only used
129     for naming bytes and does not imply any adjacency or order relationship
130     from one byte to another. When these inputs and outputs are considered
131     as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to
132     byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte.
133     In this implementation bits are numbered from 0 to 7 starting at the
134     numerically least significant end of each byte (bit n represents 2^n).
135 
136     However, Rijndael can be implemented more efficiently using 32-bit
137     words by packing bytes into words so that bytes 4*n to 4*n+3 are placed
138     into word[n]. While in principle these bytes can be assembled into words
139     in any positions, this implementation only supports the two formats in
140     which bytes in adjacent positions within words also have adjacent byte
141     numbers. This order is called big-endian if the lowest numbered bytes
142     in words have the highest numeric significance and little-endian if the
143     opposite applies.
144 
145     This code can work in either order irrespective of the order used by the
146     machine on which it runs. Normally the internal byte order will be set
147     to the order of the processor on which the code is to be run but this
148     define can be used to reverse this in special situations
149 
150     WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set.
151     This define will hence be redefined later (in section 4) if necessary
152 */
153 
154 #if 1
155 #  define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
156 #elif 0
157 #  define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN
158 #elif 0
159 #  define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN
160 #else
161 #  error The algorithm byte order is not defined
162 #endif
163 
164 /*  2. Intel AES AND VIA ACE SUPPORT */
165 
/* VIA ACE is only a candidate on 32-bit x86: either GCC (but not on BeOS),
   or a Win32 build that is not Win64, not Windows CE and not a Microsoft
   compiler with _MSC_VER of 800 or lower */
#if defined( __GNUC__ ) && defined( __i386__ ) && !defined( __BEOS__ )
#  define VIA_ACE_POSSIBLE
#elif defined( _WIN32 ) && defined( _M_IX86 ) \
   && !defined( _WIN64 ) && !defined( _WIN32_WCE ) \
   && !( defined( _MSC_VER ) && ( _MSC_VER <= 800 ) )
#  define VIA_ACE_POSSIBLE
#endif

/* AESNI is supported by all Windows x64 compilers, but for Linux/GCC
   we have to test for SSE 2, SSE 3, and AES before enabling it */
#if !defined( INTEL_AES_POSSIBLE )
#  if defined( _WIN64 ) && defined( _MSC_VER )
#    define INTEL_AES_POSSIBLE
#  elif defined( __GNUC__ ) && defined( __x86_64__ ) \
     && defined( __SSE2__ ) && defined( __SSE3__ ) && defined( __AES__ )
#    define INTEL_AES_POSSIBLE
#  endif
#endif
182 
183 /*  Define this option if support for the Intel AESNI is required
184     If USE_INTEL_AES_IF_PRESENT is defined then AESNI will be used
185     if it is detected (both present and enabled).
186 
187 	AESNI uses a decryption key schedule with the first decryption
188 	round key at the high end of the key schedule with the following
189 	round keys at lower positions in memory.  So AES_REV_DKS must NOT
190 	be defined when AESNI will be used.  Although it is unlikely that
191 	assembler code will be used with an AESNI build, if it is then
192 	AES_REV_DKS must NOT be defined when the assembler files are
193 	built (the definition of USE_INTEL_AES_IF_PRESENT in the assembler
194 	code files must match that here if they are used).
195 */
196 
197 #if defined( INTEL_AES_POSSIBLE )
198 #  if 0 && !defined( USE_INTEL_AES_IF_PRESENT )
199 #    define USE_INTEL_AES_IF_PRESENT
200 #  endif
201 #elif defined( USE_INTEL_AES_IF_PRESENT )
202 #  error: AES_NI is not available on this platform
203 #endif
204 
205 /*  Define this option if support for the VIA ACE is required. This uses
206     inline assembler instructions and is only implemented for the Microsoft,
207     Intel and GCC compilers.  If VIA ACE is known to be present, then defining
208     ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption
209     code.  If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if
210     it is detected (both present and enabled) but the normal AES code will
211     also be present.
212 
213     When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte
214     aligned; other input/output buffers do not need to be 16 byte aligned
215     but there are very large performance gains if this can be arranged.
216     VIA ACE also requires the decryption key schedule to be in reverse
217     order (which later checks below ensure).
218 
219 	AES_REV_DKS must be set for assembler code used with a VIA ACE build
220 */
221 
222 #if 0 && defined( VIA_ACE_POSSIBLE ) && !defined( USE_VIA_ACE_IF_PRESENT )
223 #  define USE_VIA_ACE_IF_PRESENT
224 #endif
225 
226 #if 0 && defined( VIA_ACE_POSSIBLE ) && !defined( ASSUME_VIA_ACE_PRESENT )
227 #  define ASSUME_VIA_ACE_PRESENT
228 #  endif
229 
230 /*  3. ASSEMBLER SUPPORT
231 
232     This define (which can be on the command line) enables the use of the
233     assembler code routines for encryption, decryption and key scheduling
234     as follows:
235 
    ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for
                encryption and decryption but with key scheduling in C
    ASM_X86_V2  uses assembler (aes_x86_v2.asm) with compressed tables for
                encryption, decryption and key scheduling
    ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for
                encryption and decryption but with key scheduling in C
    ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for
                encryption and decryption but with key scheduling in C
244 
245     Change one 'if 0' below to 'if 1' to select the version or define
246     as a compilation option.
247 */
248 
249 #if 0 && !defined( ASM_X86_V1C )
250 #  define ASM_X86_V1C
251 #elif 0 && !defined( ASM_X86_V2  )
252 #  define ASM_X86_V2
253 #elif 0 && !defined( ASM_X86_V2C )
254 #  define ASM_X86_V2C
255 #elif 0 && !defined( ASM_AMD64_C )
256 #  define ASM_AMD64_C
257 #endif
258 
/* record the target architecture: A32_ for 32-bit x86, A64_ for x86-64 */
#if defined( __i386 ) || defined( _M_IX86 )
#  define A32_
#elif defined( __x86_64__ ) || defined( _M_X64 )
#  define A64_
#endif

/* the AMD64 assembler option needs A64_ and the x86 assembler options
   need A32_; reject any other pairing */
#if defined( ASM_AMD64_C ) && !defined( A64_ )
#  error Assembler code is only available for x86 and AMD64 systems
#elif !defined( A32_ ) && ( defined( ASM_X86_V1C ) \
      || defined( ASM_X86_V2 ) || defined( ASM_X86_V2C ) )
#  error Assembler code is only available for x86 and AMD64 systems
#endif
269 
270 /*  4. FAST INPUT/OUTPUT OPERATIONS.
271 
272     On some machines it is possible to improve speed by transferring the
273     bytes in the input and output arrays to and from the internal 32-bit
274     variables by addressing these arrays as if they are arrays of 32-bit
275     words.  On some machines this will always be possible but there may
276     be a large performance penalty if the byte arrays are not aligned on
277     the normal word boundaries. On other machines this technique will
278     lead to memory access errors when such 32-bit word accesses are not
279     properly aligned. The option SAFE_IO avoids such problems but will
280     often be slower on those machines that support misaligned access
281     (especially so if care is taken to align the input  and output byte
282     arrays on 32-bit word boundaries). If SAFE_IO is not defined it is
283     assumed that access to byte arrays as if they are arrays of 32-bit
284     words will not cause problems when such accesses are misaligned.
285 */
286 #if 1 && !defined( _MSC_VER )
287 #  define SAFE_IO
288 #endif
289 
290 /*  5. LOOP UNROLLING
291 
292     The code for encryption and decryption cycles through a number of rounds
293     that can be implemented either in a loop or by expanding the code into a
294     long sequence of instructions, the latter producing a larger program but
295     one that will often be much faster. The latter is called loop unrolling.
296     There are also potential speed advantages in expanding two iterations in
297     a loop with half the number of iterations, which is called partial loop
298     unrolling.  The following options allow partial or full loop unrolling
299     to be set independently for encryption and decryption
300 */
301 #if !defined(CONFIG_SMALL) || defined(CONFIG_SMALL_NO_CRYPTO)
302 #  define ENC_UNROLL  FULL
303 #elif 0
304 #  define ENC_UNROLL  PARTIAL
305 #else
306 #  define ENC_UNROLL  NONE
307 #endif
308 
309 #if !defined(CONFIG_SMALL) || defined(CONFIG_SMALL_NO_CRYPTO)
310 #  define DEC_UNROLL  FULL
311 #elif 0
312 #  define DEC_UNROLL  PARTIAL
313 #else
314 #  define DEC_UNROLL  NONE
315 #endif
316 
317 #if 1
318 #  define ENC_KS_UNROLL
319 #endif
320 
321 #if 1
322 #  define DEC_KS_UNROLL
323 #endif
324 
325 /*  6. FAST FINITE FIELD OPERATIONS
326 
327     If this section is included, tables are used to provide faster finite
328     field arithmetic (this has no effect if STATIC_TABLES is defined).
329 */
330 #if 1
331 #  define FF_TABLES
332 #endif
333 
334 /*  7. INTERNAL STATE VARIABLE FORMAT
335 
336     The internal state of Rijndael is stored in a number of local 32-bit
337     word variables which can be defined either as an array or as individual
    named variables. Include this section if you want to store these local
339     variables in arrays. Otherwise individual local variables will be used.
340 */
341 #if 1
342 #  define ARRAYS
343 #endif
344 
345 /*  8. FIXED OR DYNAMIC TABLES
346 
347     When this section is included the tables used by the code are compiled
348     statically into the binary file.  Otherwise the subroutine aes_init()
349     must be called to compute them before the code is first used.
350 */
351 #if 1 && !(defined( _MSC_VER ) && ( _MSC_VER <= 800 ))
352 #  define STATIC_TABLES
353 #endif
354 
355 /*  9. MASKING OR CASTING FROM LONGER VALUES TO BYTES
356 
357     In some systems it is better to mask longer values to extract bytes
358     rather than using a cast. This option allows this choice.
359 */
360 #if 0
361 #  define to_byte(x)  ((uint8_t)(x))
362 #else
363 #  define to_byte(x)  ((x) & 0xff)
364 #endif
365 
366 /*  10. TABLE ALIGNMENT
367 
368     On some systems speed will be improved by aligning the AES large lookup
369     tables on particular boundaries. This define should be set to a power of
370     two giving the desired alignment. It can be left undefined if alignment
371     is not needed.  This option is specific to the Microsoft VC++ compiler -
372     it seems to sometimes cause trouble for the VC++ version 6 compiler.
373 */
374 
375 #if 1 && defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
376 #  define TABLE_ALIGN 32
377 #endif
378 
379 /*  11.  REDUCE CODE AND TABLE SIZE
380 
    This replaces some expanded macros with function calls if ASM_X86_V2 or
    ASM_X86_V2C are defined
383 */
384 
385 #if 1 && (defined( ASM_X86_V2 ) || defined( ASM_X86_V2C ))
386 #  define REDUCE_CODE_SIZE
387 #endif
388 
389 /*  12. TABLE OPTIONS
390 
391     This cipher proceeds by repeating in a number of cycles known as 'rounds'
392     which are implemented by a round function which can optionally be speeded
393     up using tables.  The basic tables are each 256 32-bit words, with either
394     one or four tables being required for each round function depending on
395     how much speed is required. The encryption and decryption round functions
396     are different and the last encryption and decryption round functions are
397     different again making four different round functions in all.
398 
399     This means that:
400       1. Normal encryption and decryption rounds can each use either 0, 1
401          or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
402       2. The last encryption and decryption rounds can also use either 0, 1
403          or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
404 
405     Include or exclude the appropriate definitions below to set the number
406     of tables used by this implementation.
407 */
408 
409 #if !defined(CONFIG_SMALL) || defined(CONFIG_SMALL_NO_CRYPTO)   /* set tables for the normal encryption round */
410 #  define ENC_ROUND   FOUR_TABLES
411 #elif 0
412 #  define ENC_ROUND   ONE_TABLE
413 #else
414 #  define ENC_ROUND   NO_TABLES
415 #endif
416 
417 #if !defined(CONFIG_SMALL) || defined(CONFIG_SMALL_NO_CRYPTO)   /* set tables for the last encryption round */
418 #  define LAST_ENC_ROUND  FOUR_TABLES
419 #elif 0
420 #  define LAST_ENC_ROUND  ONE_TABLE
421 #else
422 #  define LAST_ENC_ROUND  NO_TABLES
423 #endif
424 
425 #if !defined(CONFIG_SMALL) || defined(CONFIG_SMALL_NO_CRYPTO)   /* set tables for the normal decryption round */
426 #  define DEC_ROUND   FOUR_TABLES
427 #elif 0
428 #  define DEC_ROUND   ONE_TABLE
429 #else
430 #  define DEC_ROUND   NO_TABLES
431 #endif
432 
433 #if !defined(CONFIG_SMALL) || defined(CONFIG_SMALL_NO_CRYPTO)   /* set tables for the last decryption round */
434 #  define LAST_DEC_ROUND  FOUR_TABLES
435 #elif 0
436 #  define LAST_DEC_ROUND  ONE_TABLE
437 #else
438 #  define LAST_DEC_ROUND  NO_TABLES
439 #endif
440 
441 /*  The decryption key schedule can be speeded up with tables in the same
442     way that the round functions can.  Include or exclude the following
443     defines to set this requirement.
444 */
445 #if !defined(CONFIG_SMALL) || defined(CONFIG_SMALL_NO_CRYPTO)
446 #  define KEY_SCHED   FOUR_TABLES
447 #elif 0
448 #  define KEY_SCHED   ONE_TABLE
449 #else
450 #  define KEY_SCHED   NO_TABLES
451 #endif
452 
453 /*  ---- END OF USER CONFIGURED OPTIONS ---- */
454 
/* VIA ACE support is only available for VC++ and GCC, so any VIA ACE
   request is dropped when neither compiler is in use */

#if !( defined( _MSC_VER ) || defined( __GNUC__ ) )
#  undef ASSUME_VIA_ACE_PRESENT
#  undef USE_VIA_ACE_IF_PRESENT
#endif

/* assuming that VIA ACE is present also means that it is used */
#if defined( ASSUME_VIA_ACE_PRESENT )
#  if !defined( USE_VIA_ACE_IF_PRESENT )
#    define USE_VIA_ACE_IF_PRESENT
#  endif
#endif
469 
/* Define to reverse the decryption key schedule.  The leading 1 enables
   this unconditionally; VIA ACE would require it in any case.  The
   parentheses make the !defined test apply to the whole condition, so an
   AES_REV_DKS already supplied by the build (for example -DAES_REV_DKS)
   is kept rather than being redefined with a different replacement list. */
#if ( 1 || defined( USE_VIA_ACE_IF_PRESENT ) ) && !defined( AES_REV_DKS )
#  define AES_REV_DKS
#endif

/* Intel AESNI uses a decryption key schedule in the encryption order */
#if defined( USE_INTEL_AES_IF_PRESENT ) && defined ( AES_REV_DKS )
#  undef AES_REV_DKS
#endif
479 
480 /* Assembler support requires the use of platform byte order */
481 
482 #if ( defined( ASM_X86_V1C ) || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) ) \
483     && (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER)
484 #  undef  ALGORITHM_BYTE_ORDER
485 #  define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
486 #endif
487 
488 /* In this implementation the columns of the state array are each held in
489    32-bit words. The state array can be held in various ways: in an array
490    of words, in a number of individual word variables or in a number of
491    processor registers. The following define maps a variable name x and
492    a column number c to the way the state array variable is to be held.
493    The first define below maps the state into an array x[c] whereas the
494    second form maps the state into a number of individual variables x0,
495    x1, etc.  Another form could map individual state columns to machine
496    register names.
497 */
498 
499 #if defined( ARRAYS )
500 #  define s(x,c) x[c]
501 #else
502 #  define s(x,c) x##c
503 #endif
504 
505 /*  This implementation provides subroutines for encryption, decryption
506     and for setting the three key lengths (separately) for encryption
507     and decryption. Since not all functions are needed, masks are set
508     up here to determine which will be implemented in C
509 */
510 
/* Encryption mask: key scheduling only when VIA ACE is assumed or one of the
   ASM_X86_V1C, ASM_X86_V2C or ASM_AMD64_C options is set; encryption and key
   scheduling when no such option and no ASM_X86_V2 is set; nothing when
   AES_ENCRYPT is not defined or ASM_X86_V2 is set. */
#if defined( AES_ENCRYPT ) && ( defined( ASSUME_VIA_ACE_PRESENT ) \
    || defined( ASM_X86_V1C ) || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) )
#  define EFUNCS_IN_C   ENC_KEYING_IN_C
#elif defined( AES_ENCRYPT ) && !defined( ASM_X86_V2 )
#  define EFUNCS_IN_C   ( ENCRYPTION_IN_C | ENC_KEYING_IN_C )
#else
#  define EFUNCS_IN_C   0
#endif

/* Decryption mask: selected in the same way, keyed on AES_DECRYPT. */
#if defined( AES_DECRYPT ) && ( defined( ASSUME_VIA_ACE_PRESENT ) \
    || defined( ASM_X86_V1C ) || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) )
#  define DFUNCS_IN_C   DEC_KEYING_IN_C
#elif defined( AES_DECRYPT ) && !defined( ASM_X86_V2 )
#  define DFUNCS_IN_C   ( DECRYPTION_IN_C | DEC_KEYING_IN_C )
#else
#  define DFUNCS_IN_C   0
#endif

/* combined mask of everything that is implemented in C */
#define FUNCS_IN_C  ( EFUNCS_IN_C | DFUNCS_IN_C )
534 
535 /* END OF CONFIGURATION OPTIONS */
536 
537 #define RC_LENGTH   (5 * (AES_BLOCK_SIZE / 4 - 2))
538 
/* Disable or report errors on some combinations of options */

/* Encryption: a last round setting that asks for more tables than the
   normal round is lowered to match it, and unrolling is switched off when
   the normal round uses no tables. */
#if ENC_ROUND == NO_TABLES
#  if LAST_ENC_ROUND != NO_TABLES
#    undef  LAST_ENC_ROUND
#    define LAST_ENC_ROUND  NO_TABLES
#  endif
#  if ENC_UNROLL != NONE
#    undef  ENC_UNROLL
#    define ENC_UNROLL  NONE
#  endif
#elif ENC_ROUND == ONE_TABLE
#  if LAST_ENC_ROUND == FOUR_TABLES
#    undef  LAST_ENC_ROUND
#    define LAST_ENC_ROUND  ONE_TABLE
#  endif
#endif

/* Decryption: the same adjustments for the decryption settings. */
#if DEC_ROUND == NO_TABLES
#  if LAST_DEC_ROUND != NO_TABLES
#    undef  LAST_DEC_ROUND
#    define LAST_DEC_ROUND  NO_TABLES
#  endif
#  if DEC_UNROLL != NONE
#    undef  DEC_UNROLL
#    define DEC_UNROLL  NONE
#  endif
#elif DEC_ROUND == ONE_TABLE
#  if LAST_DEC_ROUND == FOUR_TABLES
#    undef  LAST_DEC_ROUND
#    define LAST_DEC_ROUND  ONE_TABLE
#  endif
#endif
566 
/* aes_sw32(x) reverses the byte order of a 32-bit word.  A bswap32 or
   bswap_32 macro is used when one is defined; otherwise a rotate-and-mask
   fallback is built from brot().  The fallback expands x more than once,
   so x should have no side effects. */
#if defined( bswap32 )
#  define aes_sw32    bswap32
#elif defined( bswap_32 )
#  define aes_sw32    bswap_32
#else
/* brot(x,n): rotate the 32-bit value x left by n bits, for 0 < n < 32;
   n is parenthesised so that an expression argument expands correctly */
#  define brot(x,n)   (((uint32_t)(x) << (n)) | ((uint32_t)(x) >> (32 - (n))))
#  define aes_sw32(x) ((brot((x),8) & 0x00ff00ff) | (brot((x),24) & 0xff00ff00))
#endif
575 
576 /*  upr(x,n):  rotates bytes within words by n positions, moving bytes to
577                higher index positions with wrap around into low positions
578     ups(x,n):  moves bytes by n positions to higher index positions in
579                words but without wrap around
580     bval(x,n): extracts a byte from a word
581 
582     WARNING:   The definitions given here are intended only for use with
583                unsigned variables and with shift counts that are compile
584                time constants
585 */
586 
/* Select the byte handling macros for the algorithm's internal byte order.
   A single conditional chain is used so that exactly one set of definitions
   is chosen, and a byte order that is neither little nor big endian is
   reported here instead of leaving the macros undefined. */
#if ( ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN )
#  define upr(x,n)      (((uint32_t)(x) << (8 * (n))) | ((uint32_t)(x) >> (32 - 8 * (n))))
#  define ups(x,n)      ((uint32_t) (x) << (8 * (n)))
#  define bval(x,n)     to_byte((x) >> (8 * (n)))
#  define bytes2word(b0, b1, b2, b3)  \
        (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
#elif ( ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN )
#  define upr(x,n)      (((uint32_t)(x) >> (8 * (n))) | ((uint32_t)(x) << (32 - 8 * (n))))
#  define ups(x,n)      ((uint32_t) (x) >> (8 * (n)))
#  define bval(x,n)     to_byte((x) >> (24 - 8 * (n)))
#  define bytes2word(b0, b1, b2, b3)  \
        (((uint32_t)(b0) << 24) | ((uint32_t)(b1) << 16) | ((uint32_t)(b2) << 8) | (b3))
#else
#  error The algorithm byte order must be IS_LITTLE_ENDIAN or IS_BIG_ENDIAN
#endif
602 
/* word_in(x,c):    read 32-bit word number c from the byte array x
   word_out(x,c,v): store the value v as 32-bit word number c of x

   With SAFE_IO the array is accessed one byte at a time, so x needs no
   particular alignment (v is expanded four times there, so it should have
   no side effects).  Otherwise x is accessed through a uint32_t pointer,
   with aes_sw32 applied when the algorithm and platform byte orders differ.
   The word index c is parenthesised in every variant so that an expression
   argument expands correctly. */
#if defined( SAFE_IO )
#  define word_in(x,c)    bytes2word(((const uint8_t*)(x)+4*(c))[0], ((const uint8_t*)(x)+4*(c))[1], \
                                   ((const uint8_t*)(x)+4*(c))[2], ((const uint8_t*)(x)+4*(c))[3])
#  define word_out(x,c,v) { ((uint8_t*)(x)+4*(c))[0] = bval(v,0); ((uint8_t*)(x)+4*(c))[1] = bval(v,1); \
                          ((uint8_t*)(x)+4*(c))[2] = bval(v,2); ((uint8_t*)(x)+4*(c))[3] = bval(v,3); }
#elif ( ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER )
#  define word_in(x,c)    (*((uint32_t*)(x)+(c)))
#  define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v))
#else
#  define word_in(x,c)    aes_sw32(*((uint32_t*)(x)+(c)))
#  define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = aes_sw32(v))
#endif
615 
/* the finite field modular polynomial x^8 + x^4 + x^3 + x + 1, given in
   full (WPOLY) and as its low byte only (BPOLY) */

#define WPOLY   0x011b
#define BPOLY     0x1b

/* gf_mulx(x): multiply four bytes in GF(2^8) by 'x' {02} in parallel.
   gf_c1 selects the top bit of each byte and gf_c2 the remaining seven;
   each byte is shifted left within its own lane and BPOLY is folded into
   every lane whose top bit was set. */

#define gf_c1  0x80808080
#define gf_c2  0x7f7f7f7f
#define gf_mulx(x)  (((((x) & gf_c1) >> 7) * BPOLY) ^ (((x) & gf_c2) << 1))
626 
627 /* The following defines provide alternative definitions of gf_mulx that might
628    give improved performance if a fast 32-bit multiply is not available. Note
629    that a temporary variable u needs to be defined where gf_mulx is used.
630 
631 #define gf_mulx(x) (u = (x) & gf_c1, u |= (u >> 1), ((x) & gf_c2) << 1) ^ ((u >> 3) | (u >> 6))
632 #define gf_c4  (0x01010101 * BPOLY)
633 #define gf_mulx(x) (u = (x) & gf_c1, ((x) & gf_c2) << 1) ^ ((u - (u >> 7)) & gf_c4)
634 */
635 
/* Work out which tables are needed for the different options   */

/* ASM_X86_V1C uses the assembler code with large tables, so four tables are
   forced for every round function and for the key schedule regardless of
   the settings made earlier.  #undef has no effect on a name that is not
   currently a macro, so no defined() guards are needed, and KEY_SCHED is
   now forced in the same way as the four round options. */
#if defined( ASM_X86_V1C )
#  undef  ENC_ROUND
#  define ENC_ROUND   FOUR_TABLES
#  undef  LAST_ENC_ROUND
#  define LAST_ENC_ROUND  FOUR_TABLES
#  undef  DEC_ROUND
#  define DEC_ROUND   FOUR_TABLES
#  undef  LAST_DEC_ROUND
#  define LAST_DEC_ROUND  FOUR_TABLES
#  undef  KEY_SCHED
#  define KEY_SCHED   FOUR_TABLES
#endif
660 
/* Forward (encryption) table sets, needed when encryption is done in C or
   ASM_X86_V1C is in use: FT1_SET/FT4_SET follow ENC_ROUND and FL1_SET/FL4_SET
   follow LAST_ENC_ROUND.  SBX_SET is requested whenever either setting uses
   no tables. */
#if ( FUNCS_IN_C & ENCRYPTION_IN_C ) || defined( ASM_X86_V1C )
#  if ENC_ROUND == ONE_TABLE
#    define FT1_SET
#  elif ENC_ROUND == FOUR_TABLES
#    define FT4_SET
#  else
#    define SBX_SET
#  endif
#  if LAST_ENC_ROUND == ONE_TABLE
#    define FL1_SET
#  elif LAST_ENC_ROUND == FOUR_TABLES
#    define FL4_SET
#  elif !defined( SBX_SET )
#    define SBX_SET
#  endif
#endif

/* Inverse (decryption) table sets, selected in the same way from DEC_ROUND
   (IT1_SET/IT4_SET) and LAST_DEC_ROUND (IL1_SET/IL4_SET), with ISB_SET
   requested whenever either setting uses no tables. */
#if ( FUNCS_IN_C & DECRYPTION_IN_C ) || defined( ASM_X86_V1C )
#  if DEC_ROUND == ONE_TABLE
#    define IT1_SET
#  elif DEC_ROUND == FOUR_TABLES
#    define IT4_SET
#  else
#    define ISB_SET
#  endif
#  if LAST_DEC_ROUND == ONE_TABLE
#    define IL1_SET
#  elif LAST_DEC_ROUND == FOUR_TABLES
#    define IL4_SET
#  elif !defined(ISB_SET)
#    define ISB_SET
#  endif
#endif

/* Key schedule table sets, skipped when REDUCE_CODE_SIZE is combined with
   ASM_X86_V2 or ASM_X86_V2C.  LS1_SET/LS4_SET are only requested when no
   FL table set of at least that size has already been requested above;
   IM1_SET/IM4_SET are requested when the decryption key schedule is built
   in C.  A KEY_SCHED setting without tables requests SBX_SET instead. */
#if !(defined( REDUCE_CODE_SIZE ) && (defined( ASM_X86_V2 ) || defined( ASM_X86_V2C )))
#  if ((FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C))
#    if KEY_SCHED == ONE_TABLE
#      if !defined( FL1_SET )  && !defined( FL4_SET )
#        define LS1_SET
#      endif
#    elif KEY_SCHED == FOUR_TABLES
#      if !defined( FL4_SET )
#        define LS4_SET
#      endif
#    elif !defined( SBX_SET )
#      define SBX_SET
#    endif
#  endif
#  if (FUNCS_IN_C & DEC_KEYING_IN_C)
#    if KEY_SCHED == ONE_TABLE
#      define IM1_SET
#    elif KEY_SCHED == FOUR_TABLES
#      define IM4_SET
#    elif !defined( SBX_SET )
#      define SBX_SET
#    endif
#  endif
#endif
719 
/* generic definitions of Rijndael macros that use tables

   In each macro vf(x,r,c) supplies the word and rf(r,c) the byte position
   passed to bval() for lookup number r (0..3) of column c.

   no_table:    assembles a word from four byte lookups in box
   one_table:   XORs four lookups in the single table tab, with op(..,1),
                op(..,2) and op(..,3) applied to the second, third and
                fourth results
   four_tables: XORs one lookup from each of tab[0] to tab[3]
*/

#define no_table(x,box,vf,rf,c) bytes2word( \
    box[bval(vf(x,0,c),rf(0,c))], \
    box[bval(vf(x,1,c),rf(1,c))], \
    box[bval(vf(x,2,c),rf(2,c))], \
    box[bval(vf(x,3,c),rf(3,c))])

#define one_table(x,op,tab,vf,rf,c) \
 (     tab[bval(vf(x,0,c),rf(0,c))] \
  ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \
  ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \
  ^ op(tab[bval(vf(x,3,c),rf(3,c))],3))

#define four_tables(x,tab,vf,rf,c) \
 (  tab[0][bval(vf(x,0,c),rf(0,c))] \
  ^ tab[1][bval(vf(x,1,c),rf(1,c))] \
  ^ tab[2][bval(vf(x,2,c),rf(2,c))] \
  ^ tab[3][bval(vf(x,3,c),rf(3,c))])

/* vf1: use the word x itself whatever the row and column
   rf1: byte position equal to the row number r
   rf2: byte position (r - c) modulo 4; the 8 keeps the intermediate value
        non-negative and the arguments are parenthesised so that expression
        arguments expand correctly */
#define vf1(x,r,c)  (x)
#define rf1(r,c)    (r)
#define rf2(r,c)    ((8+(r)-(c))&3)
743 
/* perform forward and inverse column mix operation on four bytes in long word x in */
/* parallel. NOTE: x must be a simple variable, NOT an expression in these macros.  */

/* None of these macros are defined when REDUCE_CODE_SIZE is combined with
   ASM_X86_V2 or ASM_X86_V2C.  t_use() is not defined in this file; it is
   used here to name the table passed to the generic table macros. */
#if !(defined( REDUCE_CODE_SIZE ) && (defined( ASM_X86_V2 ) || defined( ASM_X86_V2C )))

/* fwd_mcol(x): table based if FM4_SET or FM1_SET is defined, otherwise
   computed with gf_mulx.  The computed form assigns to g2, which must be
   declared where the macro is used; dec_fmvars expands to a matching
   declaration. */
#if defined( FM4_SET )      /* not currently used */
#  define fwd_mcol(x)       four_tables(x,t_use(f,m),vf1,rf1,0)
#elif defined( FM1_SET )    /* not currently used */
#  define fwd_mcol(x)       one_table(x,upr,t_use(f,m),vf1,rf1,0)
#else
#  define dec_fmvars        uint32_t g2
#  define fwd_mcol(x)       (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ upr((x), 2) ^ upr((x), 1))
#endif

/* inv_mcol(x): table based if IM4_SET or IM1_SET is defined, otherwise
   computed with gf_mulx.  The computed form assigns to g2, g4 and g9, which
   must be declared where the macro is used; dec_imvars expands to a
   matching declaration. */
#if defined( IM4_SET )
#  define inv_mcol(x)       four_tables(x,t_use(i,m),vf1,rf1,0)
#elif defined( IM1_SET )
#  define inv_mcol(x)       one_table(x,upr,t_use(i,m),vf1,rf1,0)
#else
#  define dec_imvars        uint32_t g2, g4, g9
#  define inv_mcol(x)       (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = (x) ^ gf_mulx(g4), g4 ^= g9, \
                            (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ upr(g4, 2) ^ upr(g9, 1))
#endif

/* ls_box(x,c): uses the first available of the FL4, LS4, FL1 and LS1 table
   sets, in that order, and falls back to byte lookups through
   t_use(s,box) when none of them is defined.  Byte positions are chosen
   with rf2, so they depend on c. */
#if defined( FL4_SET )
#  define ls_box(x,c)       four_tables(x,t_use(f,l),vf1,rf2,c)
#elif defined( LS4_SET )
#  define ls_box(x,c)       four_tables(x,t_use(l,s),vf1,rf2,c)
#elif defined( FL1_SET )
#  define ls_box(x,c)       one_table(x,upr,t_use(f,l),vf1,rf2,c)
#elif defined( LS1_SET )
#  define ls_box(x,c)       one_table(x,upr,t_use(l,s),vf1,rf2,c)
#else
#  define ls_box(x,c)       no_table(x,t_use(s,box),vf1,rf2,c)
#endif

#endif
781 
782 #if defined( ASM_X86_V1C ) && defined( AES_DECRYPT ) && !defined( ISB_SET )
783 #  define ISB_SET
784 #endif
785 
786 #endif
787