/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last),
 * run through the C preprocessor.
 *
 * Context layout as used by the code in this file (byte offsets from the
 * ctx pointer held in KEYP):
 *	  0: encryption round keys, written by aesni_set_key
 *	240: decryption round keys, written by the .Ldec_key path
 *	480: key length in bytes (32-bit), written by aesni_set_key
 *
 * On 32-bit builds the public entry points load their arguments from the
 * stack and save/restore every register they borrow; on 64-bit builds the
 * arguments are already in the registers named by the macros below.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

/* Shares %xmm7 with IN2; only the XTS code uses this name. */
#define GF128MUL_MASK %xmm7

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

/*
 * _key_expansion_256a / _key_expansion_128:	internal ABI
 * input:
 *	%xmm0:		previous round key
 *	%xmm1:		aeskeygenassist result for this round
 *	%xmm4:		scratch, assumed 0 on the first call (see aesni_set_key)
 *	TKEYP:		where to store the new round key
 * output:
 *	%xmm0:		new round key, also stored at (TKEYP)
 *	TKEYP:		advanced by 16
 * changed:
 *	%xmm1, %xmm4
 */
SYM_FUNC_START_LOCAL(_key_expansion_256a)
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)

/*
 * _key_expansion_192a:	internal ABI
 * input:
 *	%xmm0, %xmm2:	current 192-bit key material
 *	%xmm1:		aeskeygenassist result for this round
 *	%xmm4:		scratch, assumed 0 on the first call
 *	TKEYP:		where to store the output
 * output:
 *	two 16-byte values stored at (TKEYP) and 0x10(TKEYP)
 *	TKEYP:		advanced by 32
 * changed:
 *	%xmm0 - %xmm6
 */
SYM_FUNC_START_LOCAL(_key_expansion_192a)
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192a)

/*
 * _key_expansion_192b:	internal ABI
 * Same inputs as _key_expansion_192a, but stores a single 16-byte value
 * (%xmm0) at (TKEYP) and advances TKEYP by 16.
 * changed:
 *	%xmm0 - %xmm5
 */
SYM_FUNC_START_LOCAL(_key_expansion_192b)
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192b)

/*
 * _key_expansion_256b:	internal ABI
 * Counterpart of _key_expansion_256a that updates %xmm2 instead of %xmm0:
 * the new value of %xmm2 is stored at (TKEYP) and TKEYP is advanced by 16.
 * changed:
 *	%xmm1, %xmm2, %xmm4
 */
SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256b)

/*
 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                    unsigned int key_len)
 *
 * Expands in_key into the encryption schedule at ctx+0, records key_len at
 * ctx+480, then derives the decryption schedule at ctx+240 (first and last
 * round keys swapped as-is, the ones in between passed through aesimc and
 * stored in reverse order).
 *
 * key_len is dispatched on its low byte: < 24 takes the 128-bit path,
 * == 24 the 192-bit path, anything else the 256-bit path.  No validation is
 * done here.
 * NOTE(review): ctx is accessed with movaps, so it presumably must be
 * 16-byte aligned - confirm against the caller.
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP		# TKEYP = last encryption round key
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)	# first enc key becomes last dec key
	movaps %xmm1, 240(KEYP)		# last enc key becomes first dec key
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP	# UKEYP walks the dec schedule downwards
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_set_key)

/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the single 16-byte block at src into dst.
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_enc)

/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased so that the ten rounds common to every key size use the
 * same displacements; longer keys run two or four extra rounds first.
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps (TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_enc1)

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps (TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_enc4)

/*
 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the single 16-byte block at src into dst, using the decryption
 * schedule at ctx+240.
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		#output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_dec)

/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		key struct pointer (already pointing at the
 *			decryption schedule)
 *	KLEN:		key length in bytes
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_dec1)

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer (already pointing at the
 *			decryption schedule)
 *	KLEN:		key length in bytes
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * Encrypts every whole 16-byte block of src into dst, four blocks at a time
 * while at least 64 bytes remain.  A trailing partial block (len % 16) is
 * left untouched.  len is unsigned, so all length comparisons use the
 * unsigned condition codes.
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * Decrypting counterpart of aesni_ecb_enc; uses the schedule at ctx+240.
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypts every whole 16-byte block of src into dst and writes the
 * last ciphertext block back to *iv.  Returns without touching *iv when
 * len < 16.
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypts every whole 16-byte block of src into dst, four blocks at a
 * time while at least 64 bytes remain, and writes the last ciphertext block
 * back to *iv.  Returns without touching *iv when len < 16.
 *
 * The 32-bit build has no IN3/IN4 (%xmm8/%xmm9), so it reuses IN1/IN2 for
 * blocks 2 and 3 and reloads blocks 0 and 1 from src after decryption.
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)

/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Encrypts the first 16 bytes of src and the 16 bytes ending at src+len,
 * using two shuffle masks taken from .Lcts_permute_table at an offset
 * derived from len-16.  *iv is read but not written back.
 * NOTE(review): no length check is done here; the table indexing looks like
 * it expects 16 <= len <= 32 - confirm against the caller.
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	movups (IVP), STATE
	sub $16, LEN			# LEN = bytes in the final block
	mov T1, IVP			# IVP is free again once the iv is loaded
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4
	movups (IVP), %xmm5

	movups (INP), IN1
	add LEN, INP
	movups (INP), IN2

	pxor IN1, STATE
	call _aesni_enc1

	pshufb %xmm5, IN2
	pxor STATE, IN2
	pshufb %xmm4, STATE
	add OUTP, LEN			# LEN = dst + (len - 16)
	movups STATE, (LEN)

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)

/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Decrypting counterpart of aesni_cts_cbc_enc; uses the schedule at ctx+240.
 * pblendvb takes its mask implicitly from %xmm0, which is why the second
 * permute mask is loaded into %xmm0 right before it.
 * NOTE(review): same unverified length expectation as aesni_cts_cbc_enc.
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	sub $16, LEN			# LEN = bytes in the final block
	mov T1, IVP			# IVP is free again once the iv is loaded
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4

	movups (INP), STATE
	add LEN, INP
	movups (INP), IN1

	call _aesni_dec1
	movaps STATE, IN2
	pshufb %xmm4, STATE
	pxor IN1, STATE

	add OUTP, LEN			# LEN = dst + (len - 16)
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)

.pushsection .rodata
.align 16
/*
 * 48-byte pshufb mask table: 16 x 0x80 (zeroing lanes), the identity
 * permutation 0..15, then 16 x 0x80 again.  The CTS code loads 16-byte
 * windows at byte offsets derived from the tail length.
 */
.Lcts_permute_table:
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
#ifdef __x86_64__
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
#endif
.popsection

#ifdef __x86_64__
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC
	movq CTR, TCTR_LOW
	RET
SYM_FUNC_END(_aesni_inc_init)

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *
 * paddq does not carry between qwords, so the scalar copy in TCTR_LOW is
 * incremented alongside and its carry flag decides whether the upper qword
 * of CTR needs a +1 as well.
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV
	RET
SYM_FUNC_END(_aesni_inc)

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * x86_64 only.  Encrypts successive counter values and XORs them with every
 * whole 16-byte block of src, four blocks at a time while at least 64 bytes
 * remain.  The next counter value is written back to *iv.  Returns without
 * touching *iv when len < 16.
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)

#endif

.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.previous

/*
 * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 */
.macro _aesni_gf128mul_x_ble
	pshufd $0x13, IV, KEY
	paddq IV, IV
	psrad $31, KEY
	pand GF128MUL_MASK, KEY
	pxor KEY, IV
.endm

/*
 * _aesni_xts_crypt: body shared by aesni_xts_enc (enc=1) and
 * aesni_xts_dec (enc=0).
 *
 * Processes four blocks per iteration while at least 64 bytes remain, then
 * single blocks, then a partial final block via .Lcts_permute_table.  In the
 * four-block loop the tweaks are parked in the output buffer and read back
 * after the cipher call.  The updated tweak is written back to *iv on the
 * paths that leave through .Lxts_ret_iv; the partial-block path leaves
 * through .Lxts_ret and does not write it.
 *
 * LEN goes negative by design here (sub followed by jl), so the signed
 * branches in this macro are intentional.
 * GF128MUL_MASK and IN2 are the same register; IN2 is only used after the
 * last tweak multiplication.
 */
.macro		_aesni_xts_crypt	enc
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN
.if !\enc
	add $240, KEYP

	test $15, LEN
	jz .Lxts_loop4\@
	sub $16, LEN
.endif

.Lxts_loop4\@:
	sub $64, LEN
	jl .Lxts_1x\@

	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

.if \enc
	call _aesni_enc4
.else
	call _aesni_dec4
.endif

	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_loop4\@

.Lxts_ret_iv\@:
	movups IV, (IVP)

.Lxts_ret\@:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_1x\@:
	add $64, LEN
	jz .Lxts_ret_iv\@
.if \enc
	sub $16, LEN
	jl .Lxts_cts4\@
.endif

.Lxts_loop1\@:
	movdqu (INP), STATE
.if \enc
	pxor IV, STATE
	call _aesni_enc1
.else
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@
	pxor IV, STATE
	call _aesni_dec1
.endif
	pxor IV, STATE
	_aesni_gf128mul_x_ble

	test LEN, LEN
	jz .Lxts_out\@

.if \enc
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@
.endif

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_loop1\@

.Lxts_out\@:
	movdqu STATE, (OUTP)
	jmp .Lxts_ret_iv\@

.if \enc
.Lxts_cts4\@:
	movdqa STATE4, STATE
	sub $16, OUTP
.Lxts_cts1\@:
.else
.Lxts_cts1\@:
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
.endif
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1

	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE

.if \enc
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
.else
	pxor STATE4, STATE
	call _aesni_dec1
	pxor STATE4, STATE
.endif

	movups STATE, (OUTP)
	jmp .Lxts_ret\@
.endm

/*
 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_enc)
	_aesni_xts_crypt	1
SYM_FUNC_END(aesni_xts_enc)

/*
 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_dec)
	_aesni_xts_crypt	0
SYM_FUNC_END(aesni_xts_dec)