xref: /freebsd/crypto/openssl/engines/e_padlock.c (revision 780fb4a2fa9a9aee5ac48a60b790f567c0dc13e9)
1 /*-
2  * Support for VIA PadLock Advanced Cryptography Engine (ACE)
3  * Written by Michal Ludvig <michal@logix.cz>
4  *            http://www.logix.cz/michal
5  *
6  * Big thanks to Andy Polyakov for a help with optimization,
7  * assembler fixes, port to MS Windows and a lot of other
8  * valuable work on this engine!
9  */
10 
11 /* ====================================================================
12  * Copyright (c) 1999-2001 The OpenSSL Project.  All rights reserved.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  *
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  *
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in
23  *    the documentation and/or other materials provided with the
24  *    distribution.
25  *
26  * 3. All advertising materials mentioning features or use of this
27  *    software must display the following acknowledgment:
28  *    "This product includes software developed by the OpenSSL Project
29  *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
30  *
31  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
32  *    endorse or promote products derived from this software without
33  *    prior written permission. For written permission, please contact
34  *    licensing@OpenSSL.org.
35  *
36  * 5. Products derived from this software may not be called "OpenSSL"
37  *    nor may "OpenSSL" appear in their names without prior written
38  *    permission of the OpenSSL Project.
39  *
40  * 6. Redistributions of any form whatsoever must retain the following
41  *    acknowledgment:
42  *    "This product includes software developed by the OpenSSL Project
43  *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
44  *
45  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
46  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
48  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
49  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
50  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
52  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
54  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
55  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
56  * OF THE POSSIBILITY OF SUCH DAMAGE.
57  * ====================================================================
58  *
59  * This product includes cryptographic software written by Eric Young
60  * (eay@cryptsoft.com).  This product includes software written by Tim
61  * Hudson (tjh@cryptsoft.com).
62  *
63  */
64 
65 #include <stdio.h>
66 #include <string.h>
67 
68 #include <openssl/opensslconf.h>
69 #include <openssl/crypto.h>
70 #include <openssl/dso.h>
71 #include <openssl/engine.h>
72 #include <openssl/evp.h>
73 #ifndef OPENSSL_NO_AES
74 # include <openssl/aes.h>
75 #endif
76 #include <openssl/rand.h>
77 #include <openssl/err.h>
78 
79 #ifndef OPENSSL_NO_HW
80 # ifndef OPENSSL_NO_HW_PADLOCK
81 
82 /* Attempt to have a single source for both 0.9.7 and 0.9.8 :-) */
83 #  if (OPENSSL_VERSION_NUMBER >= 0x00908000L)
84 #   ifndef OPENSSL_NO_DYNAMIC_ENGINE
85 #    define DYNAMIC_ENGINE
86 #   endif
87 #  elif (OPENSSL_VERSION_NUMBER >= 0x00907000L)
88 #   ifdef ENGINE_DYNAMIC_SUPPORT
89 #    define DYNAMIC_ENGINE
90 #   endif
91 #  else
92 #   error "Only OpenSSL >= 0.9.7 is supported"
93 #  endif
94 
/*
 * VIA PadLock AES is available *ONLY* on some x86 CPUs. Not only does it
 * not exist elsewhere, it cannot even be compiled on other platforms!
 *
 * In addition, because of the heavy use of inline assembler, compiler choice
 * is limited to GCC and Microsoft C.
 */
102 #  undef COMPILE_HW_PADLOCK
103 #  if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM)
104 #   if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \
105      (defined(_MSC_VER) && defined(_M_IX86))
106 #    define COMPILE_HW_PADLOCK
107 #   endif
108 #  endif
109 
110 #  ifdef OPENSSL_NO_DYNAMIC_ENGINE
111 #   ifdef COMPILE_HW_PADLOCK
112 static ENGINE *ENGINE_padlock(void);
113 #   endif
114 
/*
 * Create the PadLock engine and add it to OpenSSL's engine list.
 * When COMPILE_HW_PADLOCK is not defined the body is empty and the call
 * simply returns.
 */
void ENGINE_load_padlock(void)
{
#   ifdef COMPILE_HW_PADLOCK
    ENGINE *engine = ENGINE_padlock();

    if (engine != NULL) {
        ENGINE_add(engine);
        /* Drop the local reference obtained from ENGINE_padlock(). */
        ENGINE_free(engine);
        ERR_clear_error();
    }
#   endif
}
127 
128 #  endif
129 
130 #  ifdef COMPILE_HW_PADLOCK
131 /*
132  * We do these includes here to avoid header problems on platforms that do
133  * not have the VIA padlock anyway...
134  */
135 #   include <stdlib.h>
136 #   ifdef _WIN32
137 #    include <malloc.h>
138 #    ifndef alloca
139 #     define alloca _alloca
140 #    endif
141 #   elif defined(__GNUC__)
142 #    ifndef alloca
143 #     define alloca(s) __builtin_alloca(s)
144 #    endif
145 #   endif
146 
/* Functions for ENGINE detection and control (defined further below) */
static int padlock_available(void);
static int padlock_init(ENGINE *e);

/* RNG Stuff: registered via ENGINE_set_RAND() only if padlock_use_rng is set */
static RAND_METHOD padlock_rand;

/* Cipher Stuff: cipher lookup callback handed to ENGINE_set_ciphers() */
#   ifndef OPENSSL_NO_AES
static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher,
                           const int **nids, int nid);
#   endif

/*
 * Engine names. padlock_name is filled in by padlock_bind_helper() with a
 * description of the detected features.
 */
static const char *padlock_id = "padlock";
static char padlock_name[100];

/*
 * Available features. The two padlock_use_* flags are set by
 * padlock_available() from the CPU's extended feature flags.
 */
static int padlock_use_ace = 0; /* Advanced Cryptography Engine */
static int padlock_use_rng = 0; /* Random Number Generator */
#   ifndef OPENSSL_NO_AES
/* Non-zero: padlock_aes_cipher() must not take the "omnivorous" fast path */
static int padlock_aes_align_required = 1;
#   endif
170 
171 /* ===== Engine "management" functions ===== */
172 
173 /* Prepare the ENGINE structure for registration */
174 static int padlock_bind_helper(ENGINE *e)
175 {
176     /* Check available features */
177     padlock_available();
178 
179 #   if 1                        /* disable RNG for now, see commentary in
180                                  * vicinity of RNG code */
181     padlock_use_rng = 0;
182 #   endif
183 
184     /* Generate a nice engine name with available features */
185     BIO_snprintf(padlock_name, sizeof(padlock_name),
186                  "VIA PadLock (%s, %s)",
187                  padlock_use_rng ? "RNG" : "no-RNG",
188                  padlock_use_ace ? "ACE" : "no-ACE");
189 
190     /* Register everything or return with an error */
191     if (!ENGINE_set_id(e, padlock_id) ||
192         !ENGINE_set_name(e, padlock_name) ||
193         !ENGINE_set_init_function(e, padlock_init) ||
194 #   ifndef OPENSSL_NO_AES
195         (padlock_use_ace && !ENGINE_set_ciphers(e, padlock_ciphers)) ||
196 #   endif
197         (padlock_use_rng && !ENGINE_set_RAND(e, &padlock_rand))) {
198         return 0;
199     }
200 
201     /* Everything looks good */
202     return 1;
203 }
204 
205 #   ifdef OPENSSL_NO_DYNAMIC_ENGINE
206 
207 /* Constructor */
208 static ENGINE *ENGINE_padlock(void)
209 {
210     ENGINE *eng = ENGINE_new();
211 
212     if (!eng) {
213         return NULL;
214     }
215 
216     if (!padlock_bind_helper(eng)) {
217         ENGINE_free(eng);
218         return NULL;
219     }
220 
221     return eng;
222 }
223 
224 #   endif
225 
226 /* Check availability of the engine */
227 static int padlock_init(ENGINE *e)
228 {
229     return (padlock_use_rng || padlock_use_ace);
230 }
231 
232 /*
233  * This stuff is needed if this ENGINE is being compiled into a
234  * self-contained shared-library.
235  */
236 #   ifdef DYNAMIC_ENGINE
237 static int padlock_bind_fn(ENGINE *e, const char *id)
238 {
239     if (id && (strcmp(id, padlock_id) != 0)) {
240         return 0;
241     }
242 
243     if (!padlock_bind_helper(e)) {
244         return 0;
245     }
246 
247     return 1;
248 }
249 
/*
 * Dynamic-engine glue: padlock_bind_fn is passed to the bind macro as the
 * callback. NOTE(review): the symbols these macros emit are defined in
 * <openssl/engine.h>, not in this file - confirm there.
 */
IMPLEMENT_DYNAMIC_CHECK_FN()
    IMPLEMENT_DYNAMIC_BIND_FN(padlock_bind_fn)
252 #   endif                       /* DYNAMIC_ENGINE */
253 /* ===== Here comes the "real" engine ===== */
254 #   ifndef OPENSSL_NO_AES
255 /* Some AES-related constants */
256 #    define AES_BLOCK_SIZE          16
257 #    define AES_KEY_SIZE_128        16
258 #    define AES_KEY_SIZE_192        24
259 #    define AES_KEY_SIZE_256        32
    /*
     * Per-context state for one PadLock cipher operation: the working IV,
     * the control word and the key schedule.
     */
    /*
     * BIG FAT WARNING: Inline assembler in PADLOCK_XCRYPT_ASM() depends on
     * the order of items in this structure (it addresses the control word
     * at offset 16 and the key at offset 32).  Don't blindly modify,
     * reorder, etc!
     */
struct padlock_cipher_data {
    unsigned char iv[AES_BLOCK_SIZE]; /* Initialization vector */
    union {
        unsigned int pad[4];    /* sizes the control word at four words */
        struct {
            int rounds:4;       /* set to 10 + (key bits - 128) / 32 */
            int dgst:1;         /* n/a in C3 */
            int align:1;        /* n/a in C3 */
            int ciphr:1;        /* n/a in C3 */
            unsigned int keygen:1; /* 0 for 128-bit keys, 1 when the key
                                    * schedule was expanded in software */
            int interm:1;
            unsigned int encdec:1; /* 1 when decrypting, 0 otherwise */
            int ksize:2;        /* set to (key bits - 128) / 64 */
        } b;
    } cword;                    /* Control word */
    AES_KEY ks;                 /* Encryption key */
};
285 
/*
 * Essentially this variable belongs in thread-local storage.
 * Keeping it global, on the other hand, can only cause a few
 * bogus key reloads [if any at all on a single-CPU system],
 * so we accept the penalty...
 */
292 static volatile struct padlock_cipher_data *padlock_saved_context;
293 #   endif
294 
295 /*-
296  * =======================================================
297  * Inline assembler section(s).
298  * =======================================================
299  * Order of arguments is chosen to facilitate Windows port
300  * using __fastcall calling convention. If you wish to add
301  * more routines, keep in mind that first __fastcall
302  * argument is passed in %ecx and second - in %edx.
303  * =======================================================
304  */
305 #   if defined(__GNUC__) && __GNUC__>=2
306 /*
307  * As for excessive "push %ebx"/"pop %ebx" found all over.
308  * When generating position-independent code GCC won't let
309  * us use "b" in assembler templates nor even respect "ebx"
310  * in "clobber description." Therefore the trouble...
311  */
312 
/*
 * Helper function - check if a CPUID instruction is available on this CPU.
 * Returns 1 if EFLAGS bit 21 could be toggled, 0 otherwise.
 */
static int padlock_insn_cpuid_available(void)
{
    int result = -1;

    /*
     * We're checking if the bit #21 of EFLAGS can be toggled. If yes =
     * CPUID is available.
     *
     * The sequence reads EFLAGS, flips bit 21 (0x200000), remembers the
     * flipped bit in %ecx, writes the value back, reads EFLAGS again and
     * XORs the re-read bit with the remembered one. The result is 0 only
     * when the bit kept the value that was written.
     */
    asm volatile ("pushf\n"
                  "popl %%eax\n"
                  "xorl $0x200000, %%eax\n"
                  "movl %%eax, %%ecx\n"
                  "andl $0x200000, %%ecx\n"
                  "pushl %%eax\n"
                  "popf\n"
                  "pushf\n"
                  "popl %%eax\n"
                  "andl $0x200000, %%eax\n"
                  "xorl %%eax, %%ecx\n"
                  "movl %%ecx, %0\n":"=r" (result)::"eax", "ecx");

    return (result == 0);
}
339 
340 /*
341  * Load supported features of the CPU to see if the PadLock is available.
342  */
343 static int padlock_available(void)
344 {
345     char vendor_string[16];
346     unsigned int eax, edx;
347 
348     /* First check if the CPUID instruction is available at all... */
349     if (!padlock_insn_cpuid_available())
350         return 0;
351 
352     /* Are we running on the Centaur (VIA) CPU? */
353     eax = 0x00000000;
354     vendor_string[12] = 0;
355     asm volatile ("pushl  %%ebx\n"
356                   "cpuid\n"
357                   "movl   %%ebx,(%%edi)\n"
358                   "movl   %%edx,4(%%edi)\n"
359                   "movl   %%ecx,8(%%edi)\n"
360                   "popl   %%ebx":"+a" (eax):"D"(vendor_string):"ecx", "edx");
361     if (strcmp(vendor_string, "CentaurHauls") != 0)
362         return 0;
363 
364     /* Check for Centaur Extended Feature Flags presence */
365     eax = 0xC0000000;
366     asm volatile ("pushl %%ebx; cpuid; popl %%ebx":"+a" (eax)::"ecx", "edx");
367     if (eax < 0xC0000001)
368         return 0;
369 
370     /* Read the Centaur Extended Feature Flags */
371     eax = 0xC0000001;
372     asm volatile ("pushl %%ebx; cpuid; popl %%ebx":"+a" (eax),
373                   "=d"(edx)::"ecx");
374 
375     /* Fill up some flags */
376     padlock_use_ace = ((edx & (0x3 << 6)) == (0x3 << 6));
377     padlock_use_rng = ((edx & (0x3 << 2)) == (0x3 << 2));
378 
379     return padlock_use_ace + padlock_use_rng;
380 }
381 
382 #    ifndef OPENSSL_NO_AES
383 #     ifndef AES_ASM
384 /* Our own htonl()/ntohl() */
385 static inline void padlock_bswapl(AES_KEY *ks)
386 {
387     size_t i = sizeof(ks->rd_key) / sizeof(ks->rd_key[0]);
388     unsigned int *key = ks->rd_key;
389 
390     while (i--) {
391         asm volatile ("bswapl %0":"+r" (*key));
392         key++;
393     }
394 }
395 #     endif
396 #    endif
397 
/*
 * Force key reload from memory to the CPU microcode. Loading EFLAGS from the
 * stack clears EFLAGS[30] which does the trick.
 */
static inline void padlock_reload_key(void)
{
    /* Push EFLAGS and pop it straight back: the pop is the reload trigger */
    asm volatile ("pushfl; popfl");
}
406 
407 #    ifndef OPENSSL_NO_AES
408 /*
409  * This is heuristic key context tracing. At first one
410  * believes that one should use atomic swap instructions,
411  * but it's not actually necessary. Point is that if
412  * padlock_saved_context was changed by another thread
413  * after we've read it and before we compare it with cdata,
414  * our key *shall* be reloaded upon thread context switch
415  * and we are therefore set in either case...
416  */
417 static inline void padlock_verify_context(struct padlock_cipher_data *cdata)
418 {
419     asm volatile ("pushfl\n"
420                   "       btl     $30,(%%esp)\n"
421                   "       jnc     1f\n"
422                   "       cmpl    %2,%1\n"
423                   "       je      1f\n"
424                   "       popfl\n"
425                   "       subl    $4,%%esp\n"
426                   "1:     addl    $4,%%esp\n"
427                   "       movl    %2,%0":"+m" (padlock_saved_context)
428                   :"r"(padlock_saved_context), "r"(cdata):"cc");
429 }
430 
/* Template for padlock_xcrypt_* modes */
/*
 * BIG FAT WARNING: The offsets used with 'leal' instructions describe items
 * of the 'padlock_cipher_data' structure.
 *
 * Register set-up performed by the generated function:
 *   %eax = cdata (start of the structure, i.e. the iv member)
 *   %ecx = cnt, %edi = out, %esi = inp
 *   %edx = cdata + 16 (the cword member)
 *   %ebx = cdata + 32 (the ks member)
 * %ebx is saved and restored by hand (see the note on "push %ebx" at the
 * top of this section). Whatever the instruction leaves in %eax is returned
 * to the caller as the IV pointer.
 */
#     define PADLOCK_XCRYPT_ASM(name,rep_xcrypt)     \
static inline void *name(size_t cnt,            \
        struct padlock_cipher_data *cdata,      \
        void *out, const void *inp)             \
{       void *iv;                               \
        asm volatile ( "pushl   %%ebx\n"        \
                "       leal    16(%0),%%edx\n" \
                "       leal    32(%0),%%ebx\n" \
                        rep_xcrypt "\n"         \
                "       popl    %%ebx"          \
                : "=a"(iv), "=c"(cnt), "=D"(out), "=S"(inp) \
                : "0"(cdata), "1"(cnt), "2"(out), "3"(inp)  \
                : "edx", "cc", "memory");       \
        return iv;                              \
}
451 
/*
 * Generate all functions with appropriate opcodes. Each opcode is emitted
 * as raw bytes; the instruction it encodes is named in the comment above
 * each instantiation.
 */
/* rep xcryptecb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8")
/* rep xcryptcbc */
    PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0")
/* rep xcryptcfb */
    PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0")
/* rep xcryptofb */
    PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8")
461 #    endif
/*
 * The RNG call itself: issue the xstore instruction with %edi = addr and
 * %edx = edx_in. The memory at addr is declared as an output of the
 * instruction; the value left in %eax is returned.
 */
static inline unsigned int padlock_xstore(void *addr, unsigned int edx_in)
{
    unsigned int eax_out;

    asm volatile (".byte 0x0f,0xa7,0xc0" /* xstore */
                  :"=a" (eax_out), "=m"(*(unsigned *)addr)
                  :"D"(addr), "d"(edx_in)
        );

    return eax_out;
}
474 
475 /*
476  * Why not inline 'rep movsd'? I failed to find information on what value in
477  * Direction Flag one can expect and consequently have to apply
478  * "better-safe-than-sorry" approach and assume "undefined." I could
479  * explicitly clear it and restore the original value upon return from
480  * padlock_aes_cipher, but it's presumably too much trouble for too little
481  * gain... In case you wonder 'rep xcrypt*' instructions above are *not*
482  * affected by the Direction Flag and pointers advance toward larger
483  * addresses unconditionally.
484  */
/*
 * Word-wise copy of n bytes from src to dst, rounded down to a whole
 * number of 'long' words; any trailing partial word is not copied.
 * Both pointers are accessed as 'long', so callers are expected to pass
 * suitably aligned buffers.
 *
 * A count smaller than one word copies nothing. (A post-tested loop here
 * would copy one word anyway and then wrap the counter around.)
 *
 * Returns dst.
 */
static inline unsigned char *padlock_memcpy(void *dst, const void *src,
                                            size_t n)
{
    long *d = dst;
    const long *s = src;
    size_t words = n / sizeof(*d);

    while (words-- > 0)
        *d++ = *s++;

    return dst;
}
498 
499 #   elif defined(_MSC_VER)
500 /*
501  * Unlike GCC these are real functions. In order to minimize impact
502  * on performance we adhere to __fastcall calling convention in
503  * order to get two first arguments passed through %ecx and %edx.
504  * Which kind of suits very well, as instructions in question use
505  * both %ecx and %edx as input:-)
506  */
/*
 * Emit the bytes 0xf3, 0x0f, 0xa7 followed by 'code' - the same byte
 * sequence the GCC variant spells out with .byte for 'rep xcrypt*'.
 */
#    define REP_XCRYPT(code)                \
        _asm _emit 0xf3                 \
        _asm _emit 0x0f _asm _emit 0xa7 \
        _asm _emit code
511 
/*
 * BIG FAT WARNING: The offsets used with 'lea' instructions describe items
 * of the 'padlock_cipher_data' structure.
 *
 * The generated function is __fastcall, so cnt arrives in ecx and cdata in
 * edx (see the note at the top of this section). The body moves cdata to
 * eax, points edx at cdata+16 and ebx at cdata+32, loads edi/esi from the
 * outp/inp arguments and then emits the instruction selected by 'code'.
 * There is no explicit return statement.
 * NOTE(review): presumably the caller receives whatever the instruction
 * leaves in eax - confirm.
 */
#    define PADLOCK_XCRYPT_ASM(name,code)   \
static void * __fastcall                \
        name (size_t cnt, void *cdata,  \
        void *outp, const void *inp)    \
{       _asm    mov     eax,edx         \
        _asm    lea     edx,[eax+16]    \
        _asm    lea     ebx,[eax+32]    \
        _asm    mov     edi,outp        \
        _asm    mov     esi,inp         \
        REP_XCRYPT(code)                \
}
527 
/*
 * One function per mode; the last opcode byte matches the one used for the
 * same mode in the GCC variant (0xc8 ecb, 0xd0 cbc, 0xe0 cfb, 0xe8 ofb).
 */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb,0xc8)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc,0xd0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb,0xe0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb,0xe8)
532 
/*
 * The RNG call itself (Microsoft C variant). With __fastcall, outp arrives
 * in ecx and code in edx; outp is moved to edi and the same three bytes
 * as the GCC variant's xstore are emitted. There is no explicit return
 * statement.
 */
static int __fastcall padlock_xstore(void *outp, unsigned int code)
{
    _asm    mov edi,ecx
    _asm _emit 0x0f _asm _emit 0xa7 _asm _emit 0xc0
}
538 
/*
 * Microsoft C variant of the key-reload trigger: push EFLAGS and pop it
 * straight back, mirroring the "pushfl; popfl" of the GCC variant.
 */
static void __fastcall padlock_reload_key(void)
{
    _asm pushfd
    _asm popfd
}
544 
/*
 * Microsoft C variant of the key context check. cdata arrives in ecx
 * (__fastcall). EFLAGS is pushed and bit 30 of the pushed copy is tested;
 * if it is set and cdata differs from padlock_saved_context, EFLAGS is
 * popped back and the stack pointer re-adjusted so the shared exit at
 * 'skip' can drop the slot. padlock_saved_context is always updated to
 * cdata at the end.
 */
static void __fastcall padlock_verify_context(void *cdata)
{
    _asm    {
        pushfd
        bt  DWORD PTR[esp],30
        jnc skip
        cmp ecx,padlock_saved_context
        je  skip
        popfd
        sub esp,4
    skip:   add esp,4
        mov padlock_saved_context,ecx
    }
}
559 
/*
 * Microsoft C variant of the PadLock feature probe. The stages are:
 *   1. try to toggle EFLAGS bit 21; if it does not change, give up;
 *   2. CPUID leaf 0: compare ebx/edx/ecx against the three words of the
 *      vendor string "CentaurHauls";
 *   3. CPUID leaf 0xC0000000: require the reported value to be at least
 *      0xC0000001;
 *   4. CPUID leaf 0xC0000001: set padlock_use_ace when edx bits 6 and 7 are
 *      both set, padlock_use_rng when bits 2 and 3 are both set, counting
 *      each in eax.
 * There is no explicit return statement; the count is accumulated in eax.
 */
static int
padlock_available(void)
{
    _asm    {
        pushfd
        pop eax
        mov ecx,eax
        xor eax,1<<21
        push    eax
        popfd
        pushfd
        pop eax
        xor eax,ecx
        bt  eax,21
        jnc noluck
        mov eax,0
        cpuid
        xor eax,eax
        cmp ebx,'tneC'
        jne noluck
        cmp edx,'Hrua'
        jne noluck
        cmp ecx,'slua'
        jne noluck
        mov eax,0xC0000000
        cpuid
        mov edx,eax
        xor eax,eax
        cmp edx,0xC0000001
        jb  noluck
        mov eax,0xC0000001
        cpuid
        xor eax,eax
        bt  edx,6
        jnc skip_a
        bt  edx,7
        jnc skip_a
        mov padlock_use_ace,1
        inc eax
    skip_a: bt  edx,2
        jnc skip_r
        bt  edx,3
        jnc skip_r
        mov padlock_use_rng,1
        inc eax
    skip_r:
    noluck:
    }
}
609 
/*
 * Microsoft C variant of the in-place byte swap. key arrives in ecx
 * (__fastcall) and is used as both source (esi) and destination (edi).
 * EFLAGS is saved and the direction flag cleared so lodsd/stosd move
 * forward; exactly 60 32-bit words are swapped, then EFLAGS is restored.
 * NOTE(review): 60 is hard-coded - presumably the number of words in
 * AES_KEY's rd_key array; confirm against <openssl/aes.h>.
 */
static void __fastcall padlock_bswapl(void *key)
{
    _asm    {
        pushfd
        cld
        mov esi,ecx
        mov edi,ecx
        mov ecx,60
    up: lodsd
        bswap   eax
        stosd
        loop    up
        popfd
    }
}
625 
/*
 * MS actually specifies status of Direction Flag and compiler even manages
 * to compile following as 'rep movsd' all by itself...
 *
 * The byte count is rounded down to a multiple of 4 before being handed to
 * memcpy(), and the result is cast to unsigned char *.
 */
#    define padlock_memcpy(o,i,n) ((unsigned char *)memcpy((o),(i),(n)&~3U))
631 #   endif
632 /* ===== AES encryption/decryption ===== */
633 #   ifndef OPENSSL_NO_AES
/*
 * Compatibility aliases: when the headers provide only the *_cfb128 /
 * *_ofb128 NID names, define the shorter *_cfb / *_ofb names used below in
 * terms of them.
 */
#    if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb)
#     define NID_aes_128_cfb NID_aes_128_cfb128
#    endif
#    if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb)
#     define NID_aes_128_ofb NID_aes_128_ofb128
#    endif
#    if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb)
#     define NID_aes_192_cfb NID_aes_192_cfb128
#    endif
#    if defined(NID_aes_192_ofb128) && ! defined (NID_aes_192_ofb)
#     define NID_aes_192_ofb NID_aes_192_ofb128
#    endif
#    if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb)
#     define NID_aes_256_cfb NID_aes_256_cfb128
#    endif
#    if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb)
#     define NID_aes_256_ofb NID_aes_256_ofb128
#    endif
/*
 * List of supported ciphers: ECB, CBC, CFB and OFB for each of the three
 * AES key sizes. padlock_ciphers() returns this table when asked for the
 * list of supported NIDs.
 */
static int padlock_cipher_nids[] = {
    NID_aes_128_ecb,
    NID_aes_128_cbc,
    NID_aes_128_cfb,
    NID_aes_128_ofb,

    NID_aes_192_ecb,
    NID_aes_192_cbc,
    NID_aes_192_cfb,
    NID_aes_192_ofb,

    NID_aes_256_ecb,
    NID_aes_256_cbc,
    NID_aes_256_cfb,
    NID_aes_256_ofb,
};

/* Number of entries in padlock_cipher_nids */
static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids) /
                                      sizeof(padlock_cipher_nids[0]));
673 
/*
 * Function prototypes: both are referenced by the EVP_CIPHER tables that
 * DECLARE_AES_EVP generates below, before the functions are defined.
 */
static int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                                const unsigned char *iv, int enc);
static int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
                              const unsigned char *in, size_t nbytes);
679 
/*
 * NEAREST_ALIGNED(ptr): ptr rounded up to the next 16-byte boundary
 * (unchanged if already aligned), as an unsigned char *.
 *
 * ALIGNED_CIPHER_DATA(ctx): the 16-byte aligned padlock_cipher_data that
 * lives inside ctx->cipher_data. The macro argument is parenthesised so
 * that expression arguments (e.g. "ctxs + 1") expand correctly.
 */
#    define NEAREST_ALIGNED(ptr) ( (unsigned char *)(ptr) +         \
        ( (0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F )      )
#    define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)\
        NEAREST_ALIGNED((ctx)->cipher_data))

/*
 * Block sizes reported per mode: a full AES block for ECB/CBC, 1 for the
 * byte-oriented OFB/CFB modes.
 */
#    define EVP_CIPHER_block_size_ECB       AES_BLOCK_SIZE
#    define EVP_CIPHER_block_size_CBC       AES_BLOCK_SIZE
#    define EVP_CIPHER_block_size_OFB       1
#    define EVP_CIPHER_block_size_CFB       1
689 
/*
 * Declaring so many ciphers by hand would be a pain. Instead introduce a bit
 * of preprocessor magic :-)
 *
 * DECLARE_AES_EVP(ksize, lmode, umode) defines the static table
 * padlock_aes_<ksize>_<lmode>. The initialiser carries, in order: the NID,
 * the per-mode block size, the key size, AES_BLOCK_SIZE, the mode flag,
 * the init and cipher callbacks, and a context size of
 * sizeof(struct padlock_cipher_data) + 16. The extra 16 bytes leave room
 * for ALIGNED_CIPHER_DATA() to round the pointer up to a 16-byte boundary.
 * NOTE(review): which EVP_CIPHER member each position maps to is defined in
 * <openssl/evp.h> - confirm against the targeted OpenSSL version.
 */
#    define DECLARE_AES_EVP(ksize,lmode,umode)      \
static const EVP_CIPHER padlock_aes_##ksize##_##lmode = {       \
        NID_aes_##ksize##_##lmode,              \
        EVP_CIPHER_block_size_##umode,  \
        AES_KEY_SIZE_##ksize,           \
        AES_BLOCK_SIZE,                 \
        0 | EVP_CIPH_##umode##_MODE,    \
        padlock_aes_init_key,           \
        padlock_aes_cipher,             \
        NULL,                           \
        sizeof(struct padlock_cipher_data) + 16,        \
        EVP_CIPHER_set_asn1_iv,         \
        EVP_CIPHER_get_asn1_iv,         \
        NULL,                           \
        NULL                            \
}
710 
/* One EVP_CIPHER table per (key size, mode) pair listed in padlock_cipher_nids */
DECLARE_AES_EVP(128, ecb, ECB);
DECLARE_AES_EVP(128, cbc, CBC);
DECLARE_AES_EVP(128, cfb, CFB);
DECLARE_AES_EVP(128, ofb, OFB);

DECLARE_AES_EVP(192, ecb, ECB);
DECLARE_AES_EVP(192, cbc, CBC);
DECLARE_AES_EVP(192, cfb, CFB);
DECLARE_AES_EVP(192, ofb, OFB);

DECLARE_AES_EVP(256, ecb, ECB);
DECLARE_AES_EVP(256, cbc, CBC);
DECLARE_AES_EVP(256, cfb, CFB);
DECLARE_AES_EVP(256, ofb, OFB);
725 
726 static int
727 padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids,
728                 int nid)
729 {
730     /* No specific cipher => return a list of supported nids ... */
731     if (!cipher) {
732         *nids = padlock_cipher_nids;
733         return padlock_cipher_nids_num;
734     }
735 
736     /* ... or the requested "cipher" otherwise */
737     switch (nid) {
738     case NID_aes_128_ecb:
739         *cipher = &padlock_aes_128_ecb;
740         break;
741     case NID_aes_128_cbc:
742         *cipher = &padlock_aes_128_cbc;
743         break;
744     case NID_aes_128_cfb:
745         *cipher = &padlock_aes_128_cfb;
746         break;
747     case NID_aes_128_ofb:
748         *cipher = &padlock_aes_128_ofb;
749         break;
750 
751     case NID_aes_192_ecb:
752         *cipher = &padlock_aes_192_ecb;
753         break;
754     case NID_aes_192_cbc:
755         *cipher = &padlock_aes_192_cbc;
756         break;
757     case NID_aes_192_cfb:
758         *cipher = &padlock_aes_192_cfb;
759         break;
760     case NID_aes_192_ofb:
761         *cipher = &padlock_aes_192_ofb;
762         break;
763 
764     case NID_aes_256_ecb:
765         *cipher = &padlock_aes_256_ecb;
766         break;
767     case NID_aes_256_cbc:
768         *cipher = &padlock_aes_256_cbc;
769         break;
770     case NID_aes_256_cfb:
771         *cipher = &padlock_aes_256_cfb;
772         break;
773     case NID_aes_256_ofb:
774         *cipher = &padlock_aes_256_ofb;
775         break;
776 
777     default:
778         /* Sorry, we don't support this NID */
779         *cipher = NULL;
780         return 0;
781     }
782 
783     return 1;
784 }
785 
786 /* Prepare the encryption key for PadLock usage */
787 static int
788 padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
789                      const unsigned char *iv, int enc)
790 {
791     struct padlock_cipher_data *cdata;
792     int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8;
793 
794     if (key == NULL)
795         return 0;               /* ERROR */
796 
797     cdata = ALIGNED_CIPHER_DATA(ctx);
798     memset(cdata, 0, sizeof(struct padlock_cipher_data));
799 
800     /* Prepare Control word. */
801     if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE)
802         cdata->cword.b.encdec = 0;
803     else
804         cdata->cword.b.encdec = (ctx->encrypt == 0);
805     cdata->cword.b.rounds = 10 + (key_len - 128) / 32;
806     cdata->cword.b.ksize = (key_len - 128) / 64;
807 
808     switch (key_len) {
809     case 128:
810         /*
811          * PadLock can generate an extended key for AES128 in hardware
812          */
813         memcpy(cdata->ks.rd_key, key, AES_KEY_SIZE_128);
814         cdata->cword.b.keygen = 0;
815         break;
816 
817     case 192:
818     case 256:
819         /*
820          * Generate an extended AES key in software. Needed for AES192/AES256
821          */
822         /*
823          * Well, the above applies to Stepping 8 CPUs and is listed as
824          * hardware errata. They most likely will fix it at some point and
825          * then a check for stepping would be due here.
826          */
827         if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CFB_MODE ||
828             EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE || enc)
829             AES_set_encrypt_key(key, key_len, &cdata->ks);
830         else
831             AES_set_decrypt_key(key, key_len, &cdata->ks);
832 #    ifndef AES_ASM
833         /*
834          * OpenSSL C functions use byte-swapped extended key.
835          */
836         padlock_bswapl(&cdata->ks);
837 #    endif
838         cdata->cword.b.keygen = 1;
839         break;
840 
841     default:
842         /* ERROR */
843         return 0;
844     }
845 
846     /*
847      * This is done to cover for cases when user reuses the
848      * context for new key. The catch is that if we don't do
849      * this, padlock_eas_cipher might proceed with old key...
850      */
851     padlock_reload_key();
852 
853     return 1;
854 }
855 
856 /*-
857  * Simplified version of padlock_aes_cipher() used when
858  * 1) both input and output buffers are at aligned addresses.
859  * or when
860  * 2) running on a newer CPU that doesn't require aligned buffers.
861  */
862 static int
863 padlock_aes_cipher_omnivorous(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
864                               const unsigned char *in_arg, size_t nbytes)
865 {
866     struct padlock_cipher_data *cdata;
867     void *iv;
868 
869     cdata = ALIGNED_CIPHER_DATA(ctx);
870     padlock_verify_context(cdata);
871 
872     switch (EVP_CIPHER_CTX_mode(ctx)) {
873     case EVP_CIPH_ECB_MODE:
874         padlock_xcrypt_ecb(nbytes / AES_BLOCK_SIZE, cdata, out_arg, in_arg);
875         break;
876 
877     case EVP_CIPH_CBC_MODE:
878         memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
879         iv = padlock_xcrypt_cbc(nbytes / AES_BLOCK_SIZE, cdata, out_arg,
880                                 in_arg);
881         memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
882         break;
883 
884     case EVP_CIPH_CFB_MODE:
885         memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
886         iv = padlock_xcrypt_cfb(nbytes / AES_BLOCK_SIZE, cdata, out_arg,
887                                 in_arg);
888         memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
889         break;
890 
891     case EVP_CIPH_OFB_MODE:
892         memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
893         padlock_xcrypt_ofb(nbytes / AES_BLOCK_SIZE, cdata, out_arg, in_arg);
894         memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
895         break;
896 
897     default:
898         return 0;
899     }
900 
901     memset(cdata->iv, 0, AES_BLOCK_SIZE);
902 
903     return 1;
904 }
905 
/*
 * PADLOCK_CHUNK may be predefined by the build; otherwise it defaults to
 * 512. The check below rejects values under 16 and values that are not a
 * power of two.
 * NOTE(review): the check accepts 16 itself, although the inline comment
 * says "larger than 16" - confirm which is intended.
 */
#    ifndef  PADLOCK_CHUNK
#     define PADLOCK_CHUNK  512 /* Must be a power of 2 larger than 16 */
#    endif
#    if PADLOCK_CHUNK<16 || PADLOCK_CHUNK&(PADLOCK_CHUNK-1)
#     error "insane PADLOCK_CHUNK..."
#    endif
912 
913 /*
914  * Re-align the arguments to 16-Bytes boundaries and run the encryption
915  * function itself. This function is not AES-specific.
916  */
917 static int
918 padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
919                    const unsigned char *in_arg, size_t nbytes)
920 {
921     struct padlock_cipher_data *cdata;
922     const void *inp;
923     unsigned char *out;
924     void *iv;
925     int inp_misaligned, out_misaligned, realign_in_loop;
926     size_t chunk, allocated = 0;
927 
928     /*
929      * ctx->num is maintained in byte-oriented modes, such as CFB and OFB...
930      */
931     if ((chunk = ctx->num)) {   /* borrow chunk variable */
932         unsigned char *ivp = ctx->iv;
933 
934         switch (EVP_CIPHER_CTX_mode(ctx)) {
935         case EVP_CIPH_CFB_MODE:
936             if (chunk >= AES_BLOCK_SIZE)
937                 return 0;       /* bogus value */
938 
939             if (ctx->encrypt)
940                 while (chunk < AES_BLOCK_SIZE && nbytes != 0) {
941                     ivp[chunk] = *(out_arg++) = *(in_arg++) ^ ivp[chunk];
942                     chunk++, nbytes--;
943             } else
944                 while (chunk < AES_BLOCK_SIZE && nbytes != 0) {
945                     unsigned char c = *(in_arg++);
946                     *(out_arg++) = c ^ ivp[chunk];
947                     ivp[chunk++] = c, nbytes--;
948                 }
949 
950             ctx->num = chunk % AES_BLOCK_SIZE;
951             break;
952         case EVP_CIPH_OFB_MODE:
953             if (chunk >= AES_BLOCK_SIZE)
954                 return 0;       /* bogus value */
955 
956             while (chunk < AES_BLOCK_SIZE && nbytes != 0) {
957                 *(out_arg++) = *(in_arg++) ^ ivp[chunk];
958                 chunk++, nbytes--;
959             }
960 
961             ctx->num = chunk % AES_BLOCK_SIZE;
962             break;
963         }
964     }
965 
966     if (nbytes == 0)
967         return 1;
968 #    if 0
969     if (nbytes % AES_BLOCK_SIZE)
970         return 0;               /* are we expected to do tail processing? */
971 #    else
972     /*
973      * nbytes is always multiple of AES_BLOCK_SIZE in ECB and CBC modes and
974      * arbitrary value in byte-oriented modes, such as CFB and OFB...
975      */
976 #    endif
977 
978     /*
979      * VIA promises CPUs that won't require alignment in the future. For now
980      * padlock_aes_align_required is initialized to 1 and the condition is
981      * never met...
982      */
983     /*
984      * C7 core is capable to manage unaligned input in non-ECB[!] mode, but
985      * performance penalties appear to be approximately same as for software
986      * alignment below or ~3x. They promise to improve it in the future, but
987      * for now we can just as well pretend that it can only handle aligned
988      * input...
989      */
990     if (!padlock_aes_align_required && (nbytes % AES_BLOCK_SIZE) == 0)
991         return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);
992 
993     inp_misaligned = (((size_t)in_arg) & 0x0F);
994     out_misaligned = (((size_t)out_arg) & 0x0F);
995 
996     /*
997      * Note that even if output is aligned and input not, I still prefer to
998      * loop instead of copy the whole input and then encrypt in one stroke.
999      * This is done in order to improve L1 cache utilization...
1000      */
1001     realign_in_loop = out_misaligned | inp_misaligned;
1002 
1003     if (!realign_in_loop && (nbytes % AES_BLOCK_SIZE) == 0)
1004         return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);
1005 
1006     /* this takes one "if" out of the loops */
1007     chunk = nbytes;
1008     chunk %= PADLOCK_CHUNK;
1009     if (chunk == 0)
1010         chunk = PADLOCK_CHUNK;
1011 
1012     if (out_misaligned) {
1013         /* optmize for small input */
1014         allocated = (chunk < nbytes ? PADLOCK_CHUNK : nbytes);
1015         out = alloca(0x10 + allocated);
1016         out = NEAREST_ALIGNED(out);
1017     } else
1018         out = out_arg;
1019 
1020     cdata = ALIGNED_CIPHER_DATA(ctx);
1021     padlock_verify_context(cdata);
1022 
1023     switch (EVP_CIPHER_CTX_mode(ctx)) {
1024     case EVP_CIPH_ECB_MODE:
1025         do {
1026             if (inp_misaligned)
1027                 inp = padlock_memcpy(out, in_arg, chunk);
1028             else
1029                 inp = in_arg;
1030             in_arg += chunk;
1031 
1032             padlock_xcrypt_ecb(chunk / AES_BLOCK_SIZE, cdata, out, inp);
1033 
1034             if (out_misaligned)
1035                 out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
1036             else
1037                 out = out_arg += chunk;
1038 
1039             nbytes -= chunk;
1040             chunk = PADLOCK_CHUNK;
1041         } while (nbytes);
1042         break;
1043 
1044     case EVP_CIPH_CBC_MODE:
1045         memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
1046         goto cbc_shortcut;
1047         do {
1048             if (iv != cdata->iv)
1049                 memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
1050             chunk = PADLOCK_CHUNK;
1051  cbc_shortcut:                 /* optimize for small input */
1052             if (inp_misaligned)
1053                 inp = padlock_memcpy(out, in_arg, chunk);
1054             else
1055                 inp = in_arg;
1056             in_arg += chunk;
1057 
1058             iv = padlock_xcrypt_cbc(chunk / AES_BLOCK_SIZE, cdata, out, inp);
1059 
1060             if (out_misaligned)
1061                 out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
1062             else
1063                 out = out_arg += chunk;
1064 
1065         } while (nbytes -= chunk);
1066         memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
1067         break;
1068 
1069     case EVP_CIPH_CFB_MODE:
1070         memcpy(iv = cdata->iv, ctx->iv, AES_BLOCK_SIZE);
1071         chunk &= ~(AES_BLOCK_SIZE - 1);
1072         if (chunk)
1073             goto cfb_shortcut;
1074         else
1075             goto cfb_skiploop;
1076         do {
1077             if (iv != cdata->iv)
1078                 memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
1079             chunk = PADLOCK_CHUNK;
1080  cfb_shortcut:                 /* optimize for small input */
1081             if (inp_misaligned)
1082                 inp = padlock_memcpy(out, in_arg, chunk);
1083             else
1084                 inp = in_arg;
1085             in_arg += chunk;
1086 
1087             iv = padlock_xcrypt_cfb(chunk / AES_BLOCK_SIZE, cdata, out, inp);
1088 
1089             if (out_misaligned)
1090                 out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
1091             else
1092                 out = out_arg += chunk;
1093 
1094             nbytes -= chunk;
1095         } while (nbytes >= AES_BLOCK_SIZE);
1096 
1097  cfb_skiploop:
1098         if (nbytes) {
1099             unsigned char *ivp = cdata->iv;
1100 
1101             if (iv != ivp) {
1102                 memcpy(ivp, iv, AES_BLOCK_SIZE);
1103                 iv = ivp;
1104             }
1105             ctx->num = nbytes;
1106             if (cdata->cword.b.encdec) {
1107                 cdata->cword.b.encdec = 0;
1108                 padlock_reload_key();
1109                 padlock_xcrypt_ecb(1, cdata, ivp, ivp);
1110                 cdata->cword.b.encdec = 1;
1111                 padlock_reload_key();
1112                 while (nbytes) {
1113                     unsigned char c = *(in_arg++);
1114                     *(out_arg++) = c ^ *ivp;
1115                     *(ivp++) = c, nbytes--;
1116                 }
1117             } else {
1118                 padlock_reload_key();
1119                 padlock_xcrypt_ecb(1, cdata, ivp, ivp);
1120                 padlock_reload_key();
1121                 while (nbytes) {
1122                     *ivp = *(out_arg++) = *(in_arg++) ^ *ivp;
1123                     ivp++, nbytes--;
1124                 }
1125             }
1126         }
1127 
1128         memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
1129         break;
1130 
1131     case EVP_CIPH_OFB_MODE:
1132         memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
1133         chunk &= ~(AES_BLOCK_SIZE - 1);
1134         if (chunk)
1135             do {
1136                 if (inp_misaligned)
1137                     inp = padlock_memcpy(out, in_arg, chunk);
1138                 else
1139                     inp = in_arg;
1140                 in_arg += chunk;
1141 
1142                 padlock_xcrypt_ofb(chunk / AES_BLOCK_SIZE, cdata, out, inp);
1143 
1144                 if (out_misaligned)
1145                     out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
1146                 else
1147                     out = out_arg += chunk;
1148 
1149                 nbytes -= chunk;
1150                 chunk = PADLOCK_CHUNK;
1151             } while (nbytes >= AES_BLOCK_SIZE);
1152 
1153         if (nbytes) {
1154             unsigned char *ivp = cdata->iv;
1155 
1156             ctx->num = nbytes;
1157             padlock_reload_key(); /* empirically found */
1158             padlock_xcrypt_ecb(1, cdata, ivp, ivp);
1159             padlock_reload_key(); /* empirically found */
1160             while (nbytes) {
1161                 *(out_arg++) = *(in_arg++) ^ *ivp;
1162                 ivp++, nbytes--;
1163             }
1164         }
1165 
1166         memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
1167         break;
1168 
1169     default:
1170         return 0;
1171     }
1172 
1173     /* Clean the realign buffer if it was used */
1174     if (out_misaligned) {
1175         volatile unsigned long *p = (void *)out;
1176         size_t n = allocated / sizeof(*p);
1177         while (n--)
1178             *p++ = 0;
1179     }
1180 
1181     memset(cdata->iv, 0, AES_BLOCK_SIZE);
1182 
1183     return 1;
1184 }
1185 
1186 #   endif                       /* OPENSSL_NO_AES */
1187 
1188 /* ===== Random Number Generator ===== */
1189 /*
1190  * This code is not engaged. The reason is that it does not comply
1191  * with recommendations for VIA RNG usage for secure applications
1192  * (posted at http://www.via.com.tw/en/viac3/c3.jsp) nor does it
1193  * provide meaningful error control...
1194  */
/*
 * Adapter between the RAND_METHOD byte-generation callbacks and the raw
 * PadLock RNG accessed through padlock_xstore().
 *
 * Writes count bytes to output. Returns 1 once everything was delivered and
 * 0 as soon as a status word reports a problem.
 */
static int padlock_rand_bytes(unsigned char *output, int count)
{
    unsigned int status, scratch;

    /* Bulk phase: 8 bytes per store, written straight into the caller's buffer */
    while (count >= 8) {
        status = padlock_xstore(output, 0);

        if ((status & (1 << 6)) == 0)
            return 0;           /* RNG disabled */
        /* bits 10..14 cover DC bias, Raw Bits and String Filter */
        if ((status & (0x1F << 10)) != 0)
            return 0;

        switch (status & 0x1F) {
        case 0:
            continue;           /* no data, retry... */
        case 8:
            break;              /* got the full 8 bytes */
        default:
            return 0;           /* fatal failure...  */
        }

        output += 8;
        count -= 8;
    }

    /* Tail phase: one byte per store, staged through a local word */
    while (count > 0) {
        status = padlock_xstore(&scratch, 3);

        if ((status & (1 << 6)) == 0)
            return 0;           /* RNG disabled */
        /* bits 10..14 cover DC bias, Raw Bits and String Filter */
        if ((status & (0x1F << 10)) != 0)
            return 0;

        switch (status & 0x1F) {
        case 0:
            continue;           /* no data, retry... */
        case 1:
            break;              /* got the single byte */
        default:
            return 0;           /* fatal failure...  */
        }

        *output = (unsigned char)scratch;
        output++;
        count--;
    }

    /* wipe the staging word; the volatile access keeps the store in place */
    *(volatile unsigned int *)&scratch = 0;

    return 1;
}
1235 
/*
 * Status callback placed in the RAND_METHOD table. It performs no check of
 * its own and unconditionally answers 1.
 */
static int padlock_rand_status(void)
{
    enum { PADLOCK_RAND_STATUS = 1 };

    return PADLOCK_RAND_STATUS;
}
1241 
/*
 * RAND_METHOD table wiring the PadLock RNG wrappers into the slots named in
 * the per-entry comments. Entries left NULL have no PadLock handler here.
 */
1242 /* Prepare structure for registration */
1243 static RAND_METHOD padlock_rand = {
1244     NULL,                       /* seed */
1245     padlock_rand_bytes,         /* bytes */
1246     NULL,                       /* cleanup */
1247     NULL,                       /* add */
1248     padlock_rand_bytes,         /* pseudorand: same routine as bytes */
1249     padlock_rand_status,        /* rand status */
1250 };
1251 
1252 #  else                         /* !COMPILE_HW_PADLOCK */
1253 #   ifndef OPENSSL_NO_DYNAMIC_ENGINE
1254 OPENSSL_EXPORT
1255     int bind_engine(ENGINE *e, const char *id, const dynamic_fns *fns);
1256 OPENSSL_EXPORT
1257     int bind_engine(ENGINE *e, const char *id, const dynamic_fns *fns)
1258 {
1259     return 0;
1260 }
1261 
1262 IMPLEMENT_DYNAMIC_CHECK_FN()
1263 #   endif
1264 #  endif                        /* COMPILE_HW_PADLOCK */
1265 # endif                         /* !OPENSSL_NO_HW_PADLOCK */
1266 #endif                          /* !OPENSSL_NO_HW */
1267