/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single) and for integer use movdqa (move
 * double quad aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released.
 * However, movaps is a byte shorter, so that is the one we'll use for now
 * (same for the unaligned variant).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

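/*
 * Illustration only (not assembled): a minimal C sketch of the pshufb byte
 * permutation used with SHUF_MASK throughout this file.  Stored little
 * endian, the SHUF_MASK constant above has bytes 0x0f,0x0e,...,0x00, so the
 * shuffle simply reverses the 16 bytes of the source; that is how blocks
 * are byte-reflected for GHASH.  Names below are illustrative, not part of
 * this file.
 *
 *	#include <stdint.h>
 *
 *	static void pshufb_ref(uint8_t dst[16], const uint8_t src[16],
 *			       const uint8_t mask[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
 *	}
 */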
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0xffffffffffffffffffffffffffffff0C
	.octa	0xffffffffffffffffffffffffffff0D0C
	.octa	0xffffffffffffffffffffffffff0E0D0C
	.octa	0xffffffffffffffffffffffff0F0E0D0C
	.octa	0xffffffffffffffffffffff0C0B0A0908
	.octa	0xffffffffffffffffffff0D0C0B0A0908
	.octa	0xffffffffffffffffff0E0D0C0B0A0908
	.octa	0xffffffffffffffff0F0E0D0C0B0A0908
	.octa	0xffffffffffffff0C0B0A090807060504
	.octa	0xffffffffffff0D0C0B0A090807060504
	.octa	0xffffffffff0E0D0C0B0A090807060504
	.octa	0xffffffff0F0E0D0C0B0A090807060504
	.octa	0xffffff0C0B0A09080706050403020100
	.octa	0xffff0D0C0B0A09080706050403020100
	.octa	0xff0E0D0C0B0A09080706050403020100
	.octa	0x0F0E0D0C0B0A09080706050403020100


.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif


#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	\GH, \TMP1
	pshufd	$78, \GH, \TMP2
	pshufd	$78, \HK, \TMP3
	pxor	\GH, \TMP2		# TMP2 = a1+a0
	pxor	\HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	\GH, \TMP2
	pxor	\TMP1, \TMP2		# TMP2 = (a0*b0)+(a1*b0)
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP3, \GH
	pxor	\TMP2, \TMP1		# TMP2:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	\GH, \TMP2
	movdqa	\GH, \TMP3
	movdqa	\GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	$31, \TMP2		# packed right shift <<31
	pslld	$30, \TMP3		# packed right shift <<30
	pslld	$25, \TMP4		# packed right shift <<25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5		# right shift TMP5 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs
	pxor	\TMP2, \GH

	# second phase of the reduction

	movdqa	\GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	\GH,\TMP3
	movdqa	\GH,\TMP4
	psrld	$1,\TMP2		# packed left shift >>1
	psrld	$2,\TMP3		# packed left shift >>2
	psrld	$7,\TMP4		# packed left shift >>7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \GH
	pxor	\TMP1, \GH		# result is in GH
.endm

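/*
 * Illustration only (not assembled): a C sketch of the Karatsuba scheme that
 * GHASH_MUL implements with three PCLMULQDQ instructions -- a1*b1, a0*b0 and
 * (a1+a0)*(b1+b0), with the middle term recovered by XORing the other two
 * back in.  The polynomial reduction step is not shown.  Function names are
 * illustrative, not part of this file.
 *
 *	#include <stdint.h>
 *
 *	// bit-by-bit reference for a 64x64 -> 128 carry-less multiply
 *	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
 *	{
 *		uint64_t h = 0, l = 0;
 *
 *		for (int i = 0; i < 64; i++) {
 *			if ((b >> i) & 1) {
 *				l ^= a << i;
 *				if (i)
 *					h ^= a >> (64 - i);
 *			}
 *		}
 *		*hi = h;
 *		*lo = l;
 *	}
 *
 *	// 128x128 -> 256 carry-less multiply using three 64-bit multiplies
 *	static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
 *				       uint64_t r[4])
 *	{
 *		uint64_t hh[2], ll[2], mm[2];
 *
 *		clmul64(a[1], b[1], &hh[1], &hh[0]);			// a1*b1
 *		clmul64(a[0], b[0], &ll[1], &ll[0]);			// a0*b0
 *		clmul64(a[0] ^ a[1], b[0] ^ b[1], &mm[1], &mm[0]);
 *		mm[0] ^= hh[0] ^ ll[0];					// middle term
 *		mm[1] ^= hh[1] ^ ll[1];
 *		r[0] = ll[0];
 *		r[1] = ll[1] ^ mm[0];
 *		r[2] = hh[0] ^ mm[1];
 *		r[3] = hh[1];
 *	}
 */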
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14
	mov	arg7, %r10		# %r10 = AAD
	mov	arg8, %r12		# %r12 = aadLen
	mov	%r12, %r11
	pxor	%xmm\i, %xmm\i
	pxor	\XMM2, \XMM2

	cmp	$16, %r11
	jl	_get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	(%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	%xmm\i, \XMM2
	GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	$16, %r10
	sub	$16, %r12
	sub	$16, %r11
	cmp	$16, %r11
	jge	_get_AAD_blocks\num_initial_blocks\operation

	movdqu	\XMM2, %xmm\i
	cmp	$0, %r11
	je	_get_AAD_done\num_initial_blocks\operation

	pxor	%xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some CT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	$4, %r11
	jle	_get_AAD_rest4\num_initial_blocks\operation
	movq	(%r10), \TMP1
	add	$8, %r10
	sub	$8, %r11
	pslldq	$8, \TMP1
	psrldq	$8, %xmm\i
	pxor	\TMP1, %xmm\i
	jmp	_get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	$0, %r11
	jle	_get_AAD_rest0\num_initial_blocks\operation
	mov	(%r10), %eax
	movq	%rax, \TMP1
	add	$4, %r10
	sub	$4, %r11
	pslldq	$12, \TMP1
	psrldq	$4, %xmm\i
	pxor	\TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	%r12, %r11
	salq	$4, %r11
	movdqu	aad_shift_arr(%r11), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	\XMM2, %xmm\i
	GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	%r11, %r11	# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks

	mov	%arg5, %rax			# %rax = *Y0
	movdqu	(%rax), \XMM0			# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	ONE(%RIP),\TMP1
	MOVADQ	(%arg1),\TMP2
.irpc index, \i_seq
	paddd	\TMP1, \XMM0			# INCR Y0
	movdqa	\XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index		# perform a 16 byte swap
	pxor	\TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax				# 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_dec\num_initial_blocks

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index		# Last Round
.endr
.irpc index, \i_seq
	movdqu	(%arg3 , %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	$16, %r11

	movdqa	\TMP1, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	%xmm5, %xmm6
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	$64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	ONE(%rip), \TMP1
	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1		# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2		# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3		# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4		# perform a 16 byte swap

	MOVADQ	0(%arg1),\TMP1
	pxor	\TMP1, \XMM1
	pxor	\TMP1, \XMM2
	pxor	\TMP1, \XMM3
	pxor	\TMP1, \XMM4
	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqa	\TMP1, HashKey_k(%rsp)
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	\TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	\TMP1, \XMM1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
.endr
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	\TMP5, HashKey_3(%rsp)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	\TMP1, \XMM1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
.endr
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	\TMP5, HashKey_4(%rsp)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_4_k(%rsp)
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	(%r10),\TMP2
.irpc	index, 1234
	AESENC	\TMP2, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	(%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	16*0(%arg3 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM1
	movdqu	\XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa	\TMP1, \XMM1
	movdqu	16*1(%arg3 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM2
	movdqu	\XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa	\TMP1, \XMM2
	movdqu	16*2(%arg3 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM3
	movdqu	\XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa	\TMP1, \XMM3
	movdqu	16*3(%arg3 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM4
	movdqu	\XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa	\TMP1, \XMM4
	add	$64, %r11
	PSHUFB_XMM %xmm14, \XMM1		# perform a 16 byte swap
	pxor	\XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4		# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm

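/*
 * Illustration only (not assembled): how the callers below pick
 * num_initial_blocks for INITIAL_BLOCKS_DEC/INITIAL_BLOCKS_ENC.  With
 * a = total number of plaintext bytes and b = floor(a/16), the macros handle
 * num_initial_blocks = b mod 4 blocks up front so the main loop always works
 * on a multiple of four blocks.  The masking mirrors the "and $-16" /
 * "and $(3<<4)" sequence in aesni_gcm_dec/aesni_gcm_enc; the function name
 * is illustrative only.
 *
 *	#include <stdint.h>
 *
 *	static unsigned int initial_blocks(uint64_t plaintext_len)
 *	{
 *		uint64_t full_bytes = plaintext_len & ~15ULL;	// b * 16
 *
 *		return (full_bytes & (3 << 4)) >> 4;		// b mod 4
 *	}
 */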
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/


.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14
	mov	arg7, %r10		# %r10 = AAD
	mov	arg8, %r12		# %r12 = aadLen
	mov	%r12, %r11
	pxor	%xmm\i, %xmm\i
	pxor	\XMM2, \XMM2

	cmp	$16, %r11
	jl	_get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	(%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	%xmm\i, \XMM2
	GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	$16, %r10
	sub	$16, %r12
	sub	$16, %r11
	cmp	$16, %r11
	jge	_get_AAD_blocks\num_initial_blocks\operation

	movdqu	\XMM2, %xmm\i
	cmp	$0, %r11
	je	_get_AAD_done\num_initial_blocks\operation

	pxor	%xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some PT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	$4, %r11
	jle	_get_AAD_rest4\num_initial_blocks\operation
	movq	(%r10), \TMP1
	add	$8, %r10
	sub	$8, %r11
	pslldq	$8, \TMP1
	psrldq	$8, %xmm\i
	pxor	\TMP1, %xmm\i
	jmp	_get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	$0, %r11
	jle	_get_AAD_rest0\num_initial_blocks\operation
	mov	(%r10), %eax
	movq	%rax, \TMP1
	add	$4, %r10
	sub	$4, %r11
	pslldq	$12, \TMP1
	psrldq	$4, %xmm\i
	pxor	\TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	%r12, %r11
	salq	$4, %r11
	movdqu	aad_shift_arr(%r11), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	\XMM2, %xmm\i
	GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	%r11, %r11	# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks

	mov	%arg5, %rax			# %rax = *Y0
	movdqu	(%rax), \XMM0			# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	ONE(%RIP),\TMP1
	MOVADQ	0(%arg1),\TMP2
.irpc index, \i_seq
	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index		# perform a 16 byte swap
	pxor	\TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax				# 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_enc\num_initial_blocks

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index		# Last Round
.endr
.irpc index, \i_seq
	movdqu	(%arg3 , %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	$16, %r11
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	%xmm5, %xmm6
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	$64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	ONE(%RIP),\TMP1
	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1		# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2		# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3		# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4		# perform a 16 byte swap

	MOVADQ	0(%arg1),\TMP1
	pxor	\TMP1, \XMM1
	pxor	\TMP1, \XMM2
	pxor	\TMP1, \XMM3
	pxor	\TMP1, \XMM4
	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqa	\TMP1, HashKey_k(%rsp)
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	\TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	\TMP1, \XMM1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
.endr
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	\TMP5, HashKey_3(%rsp)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	\TMP1, \XMM1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
.endr
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	\TMP5, HashKey_4(%rsp)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_4_k(%rsp)
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
	MOVADQ	(%r10),\TMP2
.irpc	index, 1234
	AESENC	\TMP2, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	MOVADQ	(%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	16*0(%arg3 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM1
	movdqu	16*1(%arg3 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM2
	movdqu	16*2(%arg3 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM3
	movdqu	16*3(%arg3 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM4
	movdqu	\XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	\XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	\XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	\XMM4, 16*3(%arg2 , %r11 , 1)

	add	$64, %r11
	PSHUFB_XMM %xmm14, \XMM1		# perform a 16 byte swap
	pxor	\XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4		# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm

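/*
 * Illustration only (not assembled): the counter handling in the parallel
 * macros below keeps the counter byte-reflected, adds ONE with paddd and
 * swaps back before AESENC.  Seen on the wire, that is GCM's 32-bit counter
 * increment: a big-endian increment of the last 32 bits of the counter
 * block, with the salt/IV portion untouched.  Function name is illustrative
 * only.
 *
 *	#include <stdint.h>
 *
 *	static void gcm_inc32(uint8_t counter_block[16])
 *	{
 *		uint32_t c = ((uint32_t)counter_block[12] << 24) |
 *			     ((uint32_t)counter_block[13] << 16) |
 *			     ((uint32_t)counter_block[14] << 8)  |
 *			      (uint32_t)counter_block[15];
 *
 *		c++;				// wraps modulo 2^32
 *		counter_block[12] = c >> 24;
 *		counter_block[13] = c >> 16;
 *		counter_block[14] = c >> 8;
 *		counter_block[15] = c;
 *	}
 */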
/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	\XMM1, \XMM5
	movdqa	\XMM2, \XMM6
	movdqa	\XMM3, \XMM7
	movdqa	\XMM4, \XMM8

	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	\XMM5, \TMP4
	pshufd	$78, \XMM5, \TMP6
	pxor	\XMM5, \TMP6
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	\XMM0, \XMM1
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM2
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM3
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	(%arg1), \XMM1
	pxor	(%arg1), \XMM2
	pxor	(%arg1), \XMM3
	pxor	(%arg1), \XMM4
	movdqa	HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	\TMP1, \XMM1			# Round 1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	\TMP1, \XMM1			# Round 2
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
	movdqa	\XMM6, \TMP1
	pshufd	$78, \XMM6, \TMP2
	pxor	\XMM6, \TMP2
	movdqa	HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 3
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 4
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	movdqa	HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 5
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	pxor	\TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	\XMM6, \XMM5
	pxor	\TMP2, \TMP6
	movdqa	\XMM7, \TMP1
	pshufd	$78, \XMM7, \TMP2
	pxor	\XMM7, \TMP2
	movdqa	HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 6
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 7
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	movdqa	HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 8
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	pxor	\TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	\XMM7, \XMM5
	pxor	\TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	\XMM8, \TMP1
	pshufd	$78, \XMM8, \TMP2
	pxor	\XMM8, \TMP2
	movdqa	HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 9
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	aes_loop_par_enc_done

aes_loop_par_enc:
	MOVADQ	(%r10),\TMP3
.irpc	index, 1234
	AESENC	\TMP3, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	(%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	16(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	32(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	48(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg2,%r11,1)		# Write to the ciphertext buffer
	movdqu	\XMM2, 16(%arg2,%r11,1)		# Write to the ciphertext buffer
	movdqu	\XMM3, 32(%arg2,%r11,1)		# Write to the ciphertext buffer
	movdqu	\XMM4, 48(%arg2,%r11,1)		# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	\TMP4, \TMP1
	pxor	\XMM8, \XMM5
	pxor	\TMP6, \TMP2
	pxor	\TMP1, \TMP2
	pxor	\XMM5, \TMP2
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3			# left shift TMP3 2 DWs
	psrldq	$8, \TMP2			# right shift TMP2 2 DWs
	pxor	\TMP3, \XMM5
	pxor	\TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	\XMM5, \TMP2
	movdqa	\XMM5, \TMP3
	movdqa	\XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2			# packed right shift << 31
	pslld	$30, \TMP3			# packed right shift << 30
	pslld	$25, \TMP4			# packed right shift << 25
	pxor	\TMP3, \TMP2			# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5			# right shift T5 1 DW
	pslldq	$12, \TMP2			# left shift T2 3 DWs
	pxor	\TMP2, \XMM5

	# second phase of reduction

	movdqa	\XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	\XMM5,\TMP3
	movdqa	\XMM5,\TMP4
	psrld	$1, \TMP2			# packed left shift >>1
	psrld	$2, \TMP3			# packed left shift >>2
	psrld	$7, \TMP4			# packed left shift >>7
	pxor	\TMP3,\TMP2			# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \XMM5
	pxor	\TMP1, \XMM5			# result is in XMM5

	pxor	\XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	\XMM1, \XMM5
	movdqa	\XMM2, \XMM6
	movdqa	\XMM3, \XMM7
	movdqa	\XMM4, \XMM8

	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	\XMM5, \TMP4
	pshufd	$78, \XMM5, \TMP6
	pxor	\XMM5, \TMP6
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	\XMM0, \XMM1
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM2
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM3
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	(%arg1), \XMM1
	pxor	(%arg1), \XMM2
	pxor	(%arg1), \XMM3
	pxor	(%arg1), \XMM4
	movdqa	HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	\TMP1, \XMM1			# Round 1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	\TMP1, \XMM1			# Round 2
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
	movdqa	\XMM6, \TMP1
	pshufd	$78, \XMM6, \TMP2
	pxor	\XMM6, \TMP2
	movdqa	HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 3
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 4
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	movdqa	HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 5
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	pxor	\TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	\XMM6, \XMM5
	pxor	\TMP2, \TMP6
	movdqa	\XMM7, \TMP1
	pshufd	$78, \XMM7, \TMP2
	pxor	\XMM7, \TMP2
	movdqa	HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 6
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 7
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	movdqa	HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 8
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	pxor	\TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	\XMM7, \XMM5
	pxor	\TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	\XMM8, \TMP1
	pshufd	$78, \XMM8, \TMP2
	pxor	\XMM8, \TMP2
	movdqa	HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 9
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	aes_loop_par_dec_done

aes_loop_par_dec:
	MOVADQ	(%r10),\TMP3
.irpc	index, 1234
	AESENC	\TMP3, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	(%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	\TMP3, \XMM1
	movdqu	16(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM2, 16(%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	\TMP3, \XMM2
	movdqu	32(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM3, 32(%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	\TMP3, \XMM3
	movdqu	48(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM4, 48(%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	\TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	\TMP4, \TMP1
	pxor	\XMM8, \XMM5
	pxor	\TMP6, \TMP2
	pxor	\TMP1, \TMP2
	pxor	\XMM5, \TMP2
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3			# left shift TMP3 2 DWs
	psrldq	$8, \TMP2			# right shift TMP2 2 DWs
	pxor	\TMP3, \XMM5
	pxor	\TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	\XMM5, \TMP2
	movdqa	\XMM5, \TMP3
	movdqa	\XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2			# packed right shift << 31
	pslld	$30, \TMP3			# packed right shift << 30
	pslld	$25, \TMP4			# packed right shift << 25
	pxor	\TMP3, \TMP2			# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5			# right shift T5 1 DW
	pslldq	$12, \TMP2			# left shift T2 3 DWs
	pxor	\TMP2, \XMM5

	# second phase of reduction

	movdqa	\XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	\XMM5,\TMP3
	movdqa	\XMM5,\TMP4
	psrld	$1, \TMP2			# packed left shift >>1
	psrld	$2, \TMP3			# packed left shift >>2
	psrld	$7, \TMP4			# packed left shift >>7
	pxor	\TMP3,\TMP2			# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \XMM5
	pxor	\TMP1, \XMM5			# result is in XMM5

	pxor	\XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	\XMM1, \TMP6
	pshufd	$78, \XMM1, \TMP2
	pxor	\XMM1, \TMP2
	movdqa	HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqa	HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	\XMM1, \XMMDst
	movdqa	\TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	\XMM2, \TMP1
	pshufd	$78, \XMM2, \TMP2
	pxor	\XMM2, \TMP2
	movdqa	HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqa	HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM2, \XMMDst
	pxor	\TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	\XMM3, \TMP1
	pshufd	$78, \XMM3, \TMP2
	pxor	\XMM3, \TMP2
	movdqa	HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqa	HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM3, \XMMDst
	pxor	\TMP2, \XMM1	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	\XMM4, \TMP1
	pshufd	$78, \XMM4, \TMP2
	pxor	\XMM4, \TMP2
	movdqa	HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqa	HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM4, \XMMDst
	pxor	\XMM1, \TMP2
	pxor	\TMP6, \TMP2
	pxor	\XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	\TMP2, \TMP4
	pslldq	$8, \TMP4			# left shift TMP4 2 DWs
	psrldq	$8, \TMP2			# right shift TMP2 2 DWs
	pxor	\TMP4, \XMMDst
	pxor	\TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	\XMMDst, \TMP2
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	$31, \TMP2			# packed right shifting << 31
	pslld	$30, \TMP3			# packed right shifting << 30
	pslld	$25, \TMP4			# packed right shifting << 25
	pxor	\TMP3, \TMP2			# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP7
	psrldq	$4, \TMP7			# right shift TMP7 1 DW
	pslldq	$12, \TMP2			# left shift TMP2 3 DWs
	pxor	\TMP2, \XMMDst

	# second phase of the reduction
	movdqa	\XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	psrld	$1, \TMP2			# packed left shift >> 1
	psrld	$2, \TMP3			# packed left shift >> 2
	psrld	$7, \TMP4			# packed left shift >> 7
	pxor	\TMP3, \TMP2			# xor the shifted versions
	pxor	\TMP4, \TMP2
	pxor	\TMP7, \TMP2
	pxor	\TMP2, \XMMDst
	pxor	\TMP6, \XMMDst		# reduced result is in XMMDst
.endm


/* Encryption of a single block
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor	(%arg1), \XMM0
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax				# 128->9, 192->11, 256->13
	lea	16(%arg1), %r10			# get first expanded key address

_esb_loop_\@:
	MOVADQ	(%r10),\TMP1
	AESENC	\TMP1,\XMM0
	add	$16,%r10
	sub	$1,%eax
	jnz	_esb_loop_\@

	MOVADQ	(%r10),\TMP1
	AESENCLAST \TMP1,\XMM0
.endm
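/*
 * Illustration only (not assembled): the round-count arithmetic used by
 * ENCRYPT_SINGLE_BLOCK and the aes_loop_* helpers.  The comments in the code
 * give the mapping keysize>>2 + 5, i.e. 9/11/13 AESENC rounds for 16/24/32
 * byte keys, with the final round done by AESENCLAST (10/12/14 rounds
 * total).  Function name is illustrative only.
 *
 *	// number of AESENC iterations before the closing AESENCLAST
 *	static unsigned int aesenc_rounds(unsigned int key_bytes)
 *	{
 *		return (key_bytes >> 2) + 5;	// 16->9, 24->11, 32->13
 *	}
 */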
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8 *auth_tag,      // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                           AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                      AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
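/*
 * Illustration only (not assembled): how a caller could lay out the 16-byte
 * pre-counter block j0 described above for RFC4106 -- 4 bytes of salt from
 * the SA, the 8-byte IV carried in the ESP payload, then the big-endian
 * constant 0x00000001.  Names are illustrative, not part of this file.
 *
 *	#include <string.h>
 *	#include <stdint.h>
 *
 *	static void build_j0(uint8_t j0[16], const uint8_t salt[4],
 *			     const uint8_t iv[8])
 *	{
 *		memcpy(j0, salt, 4);		// bytes  0..3  : salt
 *		memcpy(j0 + 4, iv, 8);		// bytes  4..11 : IV
 *		j0[12] = 0;			// bytes 12..15 : 0x00000001
 *		j0[13] = 0;
 *		j0[14] = 0;
 *		j0[15] = 1;
 *	}
 */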
ENTRY(aesni_gcm_dec)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp			# align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13		# %xmm13 holds the HashKey<<1 (mod poly)


	# Decrypt first few blocks

	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
	mov	%arg4, %r13	# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13	# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_decrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd	ONE(%rip), %xmm0		# increment CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte block
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# right shift 16-%r13 bytes

	movdqa	%xmm1, %xmm2
	pxor	%xmm1, %xmm0		# Ciphertext XOR E(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11

	# output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al, (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*128)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_decrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$8, %r11
	jl	_T_4_decrypt
_T_8_decrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_4_decrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_123_decrypt:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_decrypt
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_decrypt
	add	$2, %r10
	sar	$16, %eax
_T_1_decrypt:
	mov	%al, (%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                           AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                      AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
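/*
 * Illustration only (not assembled): shape of a call to aesni_gcm_enc() as
 * documented above, from C.  Buffer names and sizes are hypothetical; in the
 * kernel the arguments come from the crypto layer and aes_ctx is the
 * pre-expanded key schedule.
 *
 *	// prototype as documented above
 *	void aesni_gcm_enc(void *aes_ctx, u8 *out, const u8 *in,
 *			   u64 plaintext_len, u8 *iv, u8 *hash_subkey,
 *			   const u8 *aad, u64 aad_len,
 *			   u8 *auth_tag, u64 auth_tag_len);
 *
 *	// hypothetical call site: 64 bytes of plaintext, 12 bytes of AAD,
 *	// 16-byte tag; j0 built as in the layout above
 *	aesni_gcm_enc(aes_ctx, ciphertext, plaintext, 64,
 *		      j0, hash_subkey, aad, 12, tag, 16);
 */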
ENTRY(aesni_gcm_enc)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13
	movdqa	%xmm13, HashKey(%rsp)
	mov	%arg4, %r13		# %xmm13 holds HashKey<<1 (mod poly)
	and	$-16, %r13
	mov	%r13, %r12

	# Encrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop - Encrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0


	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# Encrypt(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte blocks
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# shift right 16-r13 byte
	pxor	%xmm1, %xmm0		# Plaintext XOR Encrypt(K, Yn)
ALL_F-SHIFT_MASK(%r12), %xmm1 1780 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 1781 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 1782 movdqa SHUF_MASK(%rip), %xmm10 1783 PSHUFB_XMM %xmm10,%xmm0 1784 1785 pxor %xmm0, %xmm8 1786 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1787 # GHASH computation for the last <16 byte block 1788 sub %r13, %r11 1789 add $16, %r11 1790 1791 movdqa SHUF_MASK(%rip), %xmm10 1792 PSHUFB_XMM %xmm10, %xmm0 1793 1794 # shuffle xmm0 back to output as ciphertext 1795 1796 # Output %r13 bytes 1797 MOVQ_R64_XMM %xmm0, %rax 1798 cmp $8, %r13 1799 jle _less_than_8_bytes_left_encrypt 1800 mov %rax, (%arg2 , %r11, 1) 1801 add $8, %r11 1802 psrldq $8, %xmm0 1803 MOVQ_R64_XMM %xmm0, %rax 1804 sub $8, %r13 1805_less_than_8_bytes_left_encrypt: 1806 mov %al, (%arg2, %r11, 1) 1807 add $1, %r11 1808 shr $8, %rax 1809 sub $1, %r13 1810 jne _less_than_8_bytes_left_encrypt 1811_multiple_of_16_bytes_encrypt: 1812 mov arg8, %r12 # %r12 = addLen (number of bytes) 1813 shl $3, %r12 1814 movd %r12d, %xmm15 # len(A) in %xmm15 1815 shl $3, %arg4 # len(C) in bits (*128) 1816 MOVQ_R64_XMM %arg4, %xmm1 1817 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1818 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1819 pxor %xmm15, %xmm8 1820 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1821 # final GHASH computation 1822 movdqa SHUF_MASK(%rip), %xmm10 1823 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap 1824 1825 mov %arg5, %rax # %rax = *Y0 1826 movdqu (%rax), %xmm0 # %xmm0 = Y0 1827 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) 1828 pxor %xmm8, %xmm0 1829_return_T_encrypt: 1830 mov arg9, %r10 # %r10 = authTag 1831 mov arg10, %r11 # %r11 = auth_tag_len 1832 cmp $16, %r11 1833 je _T_16_encrypt 1834 cmp $8, %r11 1835 jl _T_4_encrypt 1836_T_8_encrypt: 1837 MOVQ_R64_XMM %xmm0, %rax 1838 mov %rax, (%r10) 1839 add $8, %r10 1840 sub $8, %r11 1841 psrldq $8, %xmm0 1842 cmp $0, %r11 1843 je _return_T_done_encrypt 1844_T_4_encrypt: 1845 movd %xmm0, %eax 1846 mov %eax, (%r10) 1847 add $4, %r10 1848 sub $4, %r11 1849 psrldq $4, %xmm0 1850 cmp $0, %r11 1851 je _return_T_done_encrypt 1852_T_123_encrypt: 1853 movd %xmm0, %eax 1854 cmp $2, %r11 1855 jl _T_1_encrypt 1856 mov %ax, (%r10) 1857 cmp $2, %r11 1858 je _return_T_done_encrypt 1859 add $2, %r10 1860 sar $16, %eax 1861_T_1_encrypt: 1862 mov %al, (%r10) 1863 jmp _return_T_done_encrypt 1864_T_16_encrypt: 1865 movdqu %xmm0, (%r10) 1866_return_T_done_encrypt: 1867 mov %r14, %rsp 1868 pop %r14 1869 pop %r13 1870 pop %r12 1871 ret 1872ENDPROC(aesni_gcm_enc) 1873 1874#endif 1875 1876 1877.align 4 1878_key_expansion_128: 1879_key_expansion_256a: 1880 pshufd $0b11111111, %xmm1, %xmm1 1881 shufps $0b00010000, %xmm0, %xmm4 1882 pxor %xmm4, %xmm0 1883 shufps $0b10001100, %xmm0, %xmm4 1884 pxor %xmm4, %xmm0 1885 pxor %xmm1, %xmm0 1886 movaps %xmm0, (TKEYP) 1887 add $0x10, TKEYP 1888 ret 1889ENDPROC(_key_expansion_128) 1890ENDPROC(_key_expansion_256a) 1891 1892.align 4 1893_key_expansion_192a: 1894 pshufd $0b01010101, %xmm1, %xmm1 1895 shufps $0b00010000, %xmm0, %xmm4 1896 pxor %xmm4, %xmm0 1897 shufps $0b10001100, %xmm0, %xmm4 1898 pxor %xmm4, %xmm0 1899 pxor %xmm1, %xmm0 1900 1901 movaps %xmm2, %xmm5 1902 movaps %xmm2, %xmm6 1903 pslldq $4, %xmm5 1904 pshufd $0b11111111, %xmm0, %xmm3 1905 pxor %xmm3, %xmm2 1906 pxor %xmm5, %xmm2 1907 1908 movaps %xmm0, %xmm1 1909 shufps $0b01000100, %xmm0, %xmm6 1910 movaps %xmm6, (TKEYP) 1911 shufps $0b01001110, %xmm2, %xmm1 1912 movaps %xmm1, 0x10(TKEYP) 1913 add 
$0x20, TKEYP 1914 ret 1915ENDPROC(_key_expansion_192a) 1916 1917.align 4 1918_key_expansion_192b: 1919 pshufd $0b01010101, %xmm1, %xmm1 1920 shufps $0b00010000, %xmm0, %xmm4 1921 pxor %xmm4, %xmm0 1922 shufps $0b10001100, %xmm0, %xmm4 1923 pxor %xmm4, %xmm0 1924 pxor %xmm1, %xmm0 1925 1926 movaps %xmm2, %xmm5 1927 pslldq $4, %xmm5 1928 pshufd $0b11111111, %xmm0, %xmm3 1929 pxor %xmm3, %xmm2 1930 pxor %xmm5, %xmm2 1931 1932 movaps %xmm0, (TKEYP) 1933 add $0x10, TKEYP 1934 ret 1935ENDPROC(_key_expansion_192b) 1936 1937.align 4 1938_key_expansion_256b: 1939 pshufd $0b10101010, %xmm1, %xmm1 1940 shufps $0b00010000, %xmm2, %xmm4 1941 pxor %xmm4, %xmm2 1942 shufps $0b10001100, %xmm2, %xmm4 1943 pxor %xmm4, %xmm2 1944 pxor %xmm1, %xmm2 1945 movaps %xmm2, (TKEYP) 1946 add $0x10, TKEYP 1947 ret 1948ENDPROC(_key_expansion_256b) 1949 1950/* 1951 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 1952 * unsigned int key_len) 1953 */ 1954ENTRY(aesni_set_key) 1955 FRAME_BEGIN 1956#ifndef __x86_64__ 1957 pushl KEYP 1958 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx 1959 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key 1960 movl (FRAME_OFFSET+16)(%esp), %edx # key_len 1961#endif 1962 movups (UKEYP), %xmm0 # user key (first 16 bytes) 1963 movaps %xmm0, (KEYP) 1964 lea 0x10(KEYP), TKEYP # key addr 1965 movl %edx, 480(KEYP) 1966 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1967 cmp $24, %dl 1968 jb .Lenc_key128 1969 je .Lenc_key192 1970 movups 0x10(UKEYP), %xmm2 # other user key 1971 movaps %xmm2, (TKEYP) 1972 add $0x10, TKEYP 1973 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1974 call _key_expansion_256a 1975 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1976 call _key_expansion_256b 1977 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1978 call _key_expansion_256a 1979 AESKEYGENASSIST 0x2 %xmm0 %xmm1 1980 call _key_expansion_256b 1981 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 1982 call _key_expansion_256a 1983 AESKEYGENASSIST 0x4 %xmm0 %xmm1 1984 call _key_expansion_256b 1985 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 1986 call _key_expansion_256a 1987 AESKEYGENASSIST 0x8 %xmm0 %xmm1 1988 call _key_expansion_256b 1989 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 1990 call _key_expansion_256a 1991 AESKEYGENASSIST 0x10 %xmm0 %xmm1 1992 call _key_expansion_256b 1993 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 1994 call _key_expansion_256a 1995 AESKEYGENASSIST 0x20 %xmm0 %xmm1 1996 call _key_expansion_256b 1997 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 1998 call _key_expansion_256a 1999 jmp .Ldec_key 2000.Lenc_key192: 2001 movq 0x10(UKEYP), %xmm2 # other user key 2002 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 2003 call _key_expansion_192a 2004 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 2005 call _key_expansion_192b 2006 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 2007 call _key_expansion_192a 2008 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 2009 call _key_expansion_192b 2010 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 2011 call _key_expansion_192a 2012 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 2013 call _key_expansion_192b 2014 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 2015 call _key_expansion_192a 2016 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 2017 call _key_expansion_192b 2018 jmp .Ldec_key 2019.Lenc_key128: 2020 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 2021 call _key_expansion_128 2022 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 2023 call _key_expansion_128 2024 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 2025 call _key_expansion_128 2026 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 2027 call _key_expansion_128 2028 
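	# The immediates passed to AESKEYGENASSIST in this sequence (0x1, 0x2, ...,
	# 0x1b, 0x36) are the AES round constants (Rcon); each call to
	# _key_expansion_128 turns the assist result into the next 16-byte round
	# key. A rough C sketch of one expansion step, with illustrative names:
	#	t = sub_word(rot_word(prev[3])) ^ rcon;
	#	next[0] = prev[0] ^ t;
	#	next[1] = prev[1] ^ next[0];
	#	next[2] = prev[2] ^ next[1];
	#	next[3] = prev[3] ^ next[2];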
AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 2029 call _key_expansion_128 2030 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 2031 call _key_expansion_128 2032 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 2033 call _key_expansion_128 2034 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 2035 call _key_expansion_128 2036 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 2037 call _key_expansion_128 2038 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 2039 call _key_expansion_128 2040.Ldec_key: 2041 sub $0x10, TKEYP 2042 movaps (KEYP), %xmm0 2043 movaps (TKEYP), %xmm1 2044 movaps %xmm0, 240(TKEYP) 2045 movaps %xmm1, 240(KEYP) 2046 add $0x10, KEYP 2047 lea 240-16(TKEYP), UKEYP 2048.align 4 2049.Ldec_key_loop: 2050 movaps (KEYP), %xmm0 2051 AESIMC %xmm0 %xmm1 2052 movaps %xmm1, (UKEYP) 2053 add $0x10, KEYP 2054 sub $0x10, UKEYP 2055 cmp TKEYP, KEYP 2056 jb .Ldec_key_loop 2057 xor AREG, AREG 2058#ifndef __x86_64__ 2059 popl KEYP 2060#endif 2061 FRAME_END 2062 ret 2063ENDPROC(aesni_set_key) 2064 2065/* 2066 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2067 */ 2068ENTRY(aesni_enc) 2069 FRAME_BEGIN 2070#ifndef __x86_64__ 2071 pushl KEYP 2072 pushl KLEN 2073 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 2074 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 2075 movl (FRAME_OFFSET+20)(%esp), INP # src 2076#endif 2077 movl 480(KEYP), KLEN # key length 2078 movups (INP), STATE # input 2079 call _aesni_enc1 2080 movups STATE, (OUTP) # output 2081#ifndef __x86_64__ 2082 popl KLEN 2083 popl KEYP 2084#endif 2085 FRAME_END 2086 ret 2087ENDPROC(aesni_enc) 2088 2089/* 2090 * _aesni_enc1: internal ABI 2091 * input: 2092 * KEYP: key struct pointer 2093 * KLEN: round count 2094 * STATE: initial state (input) 2095 * output: 2096 * STATE: finial state (output) 2097 * changed: 2098 * KEY 2099 * TKEYP (T1) 2100 */ 2101.align 4 2102_aesni_enc1: 2103 movaps (KEYP), KEY # key 2104 mov KEYP, TKEYP 2105 pxor KEY, STATE # round 0 2106 add $0x30, TKEYP 2107 cmp $24, KLEN 2108 jb .Lenc128 2109 lea 0x20(TKEYP), TKEYP 2110 je .Lenc192 2111 add $0x20, TKEYP 2112 movaps -0x60(TKEYP), KEY 2113 AESENC KEY STATE 2114 movaps -0x50(TKEYP), KEY 2115 AESENC KEY STATE 2116.align 4 2117.Lenc192: 2118 movaps -0x40(TKEYP), KEY 2119 AESENC KEY STATE 2120 movaps -0x30(TKEYP), KEY 2121 AESENC KEY STATE 2122.align 4 2123.Lenc128: 2124 movaps -0x20(TKEYP), KEY 2125 AESENC KEY STATE 2126 movaps -0x10(TKEYP), KEY 2127 AESENC KEY STATE 2128 movaps (TKEYP), KEY 2129 AESENC KEY STATE 2130 movaps 0x10(TKEYP), KEY 2131 AESENC KEY STATE 2132 movaps 0x20(TKEYP), KEY 2133 AESENC KEY STATE 2134 movaps 0x30(TKEYP), KEY 2135 AESENC KEY STATE 2136 movaps 0x40(TKEYP), KEY 2137 AESENC KEY STATE 2138 movaps 0x50(TKEYP), KEY 2139 AESENC KEY STATE 2140 movaps 0x60(TKEYP), KEY 2141 AESENC KEY STATE 2142 movaps 0x70(TKEYP), KEY 2143 AESENCLAST KEY STATE 2144 ret 2145ENDPROC(_aesni_enc1) 2146 2147/* 2148 * _aesni_enc4: internal ABI 2149 * input: 2150 * KEYP: key struct pointer 2151 * KLEN: round count 2152 * STATE1: initial state (input) 2153 * STATE2 2154 * STATE3 2155 * STATE4 2156 * output: 2157 * STATE1: finial state (output) 2158 * STATE2 2159 * STATE3 2160 * STATE4 2161 * changed: 2162 * KEY 2163 * TKEYP (T1) 2164 */ 2165.align 4 2166_aesni_enc4: 2167 movaps (KEYP), KEY # key 2168 mov KEYP, TKEYP 2169 pxor KEY, STATE1 # round 0 2170 pxor KEY, STATE2 2171 pxor KEY, STATE3 2172 pxor KEY, STATE4 2173 add $0x30, TKEYP 2174 cmp $24, KLEN 2175 jb .L4enc128 2176 lea 0x20(TKEYP), TKEYP 2177 je .L4enc192 2178 add $0x20, TKEYP 2179 movaps -0x60(TKEYP), KEY 2180 AESENC KEY STATE1 
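	# (AES-256 path) Each round key loaded above is applied to all four states
	# in turn; execution then falls through .L4enc192 and .L4enc128 for the
	# rounds shared with the shorter key sizes.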
2181 AESENC KEY STATE2 2182 AESENC KEY STATE3 2183 AESENC KEY STATE4 2184 movaps -0x50(TKEYP), KEY 2185 AESENC KEY STATE1 2186 AESENC KEY STATE2 2187 AESENC KEY STATE3 2188 AESENC KEY STATE4 2189#.align 4 2190.L4enc192: 2191 movaps -0x40(TKEYP), KEY 2192 AESENC KEY STATE1 2193 AESENC KEY STATE2 2194 AESENC KEY STATE3 2195 AESENC KEY STATE4 2196 movaps -0x30(TKEYP), KEY 2197 AESENC KEY STATE1 2198 AESENC KEY STATE2 2199 AESENC KEY STATE3 2200 AESENC KEY STATE4 2201#.align 4 2202.L4enc128: 2203 movaps -0x20(TKEYP), KEY 2204 AESENC KEY STATE1 2205 AESENC KEY STATE2 2206 AESENC KEY STATE3 2207 AESENC KEY STATE4 2208 movaps -0x10(TKEYP), KEY 2209 AESENC KEY STATE1 2210 AESENC KEY STATE2 2211 AESENC KEY STATE3 2212 AESENC KEY STATE4 2213 movaps (TKEYP), KEY 2214 AESENC KEY STATE1 2215 AESENC KEY STATE2 2216 AESENC KEY STATE3 2217 AESENC KEY STATE4 2218 movaps 0x10(TKEYP), KEY 2219 AESENC KEY STATE1 2220 AESENC KEY STATE2 2221 AESENC KEY STATE3 2222 AESENC KEY STATE4 2223 movaps 0x20(TKEYP), KEY 2224 AESENC KEY STATE1 2225 AESENC KEY STATE2 2226 AESENC KEY STATE3 2227 AESENC KEY STATE4 2228 movaps 0x30(TKEYP), KEY 2229 AESENC KEY STATE1 2230 AESENC KEY STATE2 2231 AESENC KEY STATE3 2232 AESENC KEY STATE4 2233 movaps 0x40(TKEYP), KEY 2234 AESENC KEY STATE1 2235 AESENC KEY STATE2 2236 AESENC KEY STATE3 2237 AESENC KEY STATE4 2238 movaps 0x50(TKEYP), KEY 2239 AESENC KEY STATE1 2240 AESENC KEY STATE2 2241 AESENC KEY STATE3 2242 AESENC KEY STATE4 2243 movaps 0x60(TKEYP), KEY 2244 AESENC KEY STATE1 2245 AESENC KEY STATE2 2246 AESENC KEY STATE3 2247 AESENC KEY STATE4 2248 movaps 0x70(TKEYP), KEY 2249 AESENCLAST KEY STATE1 # last round 2250 AESENCLAST KEY STATE2 2251 AESENCLAST KEY STATE3 2252 AESENCLAST KEY STATE4 2253 ret 2254ENDPROC(_aesni_enc4) 2255 2256/* 2257 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2258 */ 2259ENTRY(aesni_dec) 2260 FRAME_BEGIN 2261#ifndef __x86_64__ 2262 pushl KEYP 2263 pushl KLEN 2264 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 2265 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 2266 movl (FRAME_OFFSET+20)(%esp), INP # src 2267#endif 2268 mov 480(KEYP), KLEN # key length 2269 add $240, KEYP 2270 movups (INP), STATE # input 2271 call _aesni_dec1 2272 movups STATE, (OUTP) #output 2273#ifndef __x86_64__ 2274 popl KLEN 2275 popl KEYP 2276#endif 2277 FRAME_END 2278 ret 2279ENDPROC(aesni_dec) 2280 2281/* 2282 * _aesni_dec1: internal ABI 2283 * input: 2284 * KEYP: key struct pointer 2285 * KLEN: key length 2286 * STATE: initial state (input) 2287 * output: 2288 * STATE: finial state (output) 2289 * changed: 2290 * KEY 2291 * TKEYP (T1) 2292 */ 2293.align 4 2294_aesni_dec1: 2295 movaps (KEYP), KEY # key 2296 mov KEYP, TKEYP 2297 pxor KEY, STATE # round 0 2298 add $0x30, TKEYP 2299 cmp $24, KLEN 2300 jb .Ldec128 2301 lea 0x20(TKEYP), TKEYP 2302 je .Ldec192 2303 add $0x20, TKEYP 2304 movaps -0x60(TKEYP), KEY 2305 AESDEC KEY STATE 2306 movaps -0x50(TKEYP), KEY 2307 AESDEC KEY STATE 2308.align 4 2309.Ldec192: 2310 movaps -0x40(TKEYP), KEY 2311 AESDEC KEY STATE 2312 movaps -0x30(TKEYP), KEY 2313 AESDEC KEY STATE 2314.align 4 2315.Ldec128: 2316 movaps -0x20(TKEYP), KEY 2317 AESDEC KEY STATE 2318 movaps -0x10(TKEYP), KEY 2319 AESDEC KEY STATE 2320 movaps (TKEYP), KEY 2321 AESDEC KEY STATE 2322 movaps 0x10(TKEYP), KEY 2323 AESDEC KEY STATE 2324 movaps 0x20(TKEYP), KEY 2325 AESDEC KEY STATE 2326 movaps 0x30(TKEYP), KEY 2327 AESDEC KEY STATE 2328 movaps 0x40(TKEYP), KEY 2329 AESDEC KEY STATE 2330 movaps 0x50(TKEYP), KEY 2331 AESDEC KEY STATE 2332 movaps 0x60(TKEYP), KEY 2333 
AESDEC KEY STATE 2334 movaps 0x70(TKEYP), KEY 2335 AESDECLAST KEY STATE 2336 ret 2337ENDPROC(_aesni_dec1) 2338 2339/* 2340 * _aesni_dec4: internal ABI 2341 * input: 2342 * KEYP: key struct pointer 2343 * KLEN: key length 2344 * STATE1: initial state (input) 2345 * STATE2 2346 * STATE3 2347 * STATE4 2348 * output: 2349 * STATE1: finial state (output) 2350 * STATE2 2351 * STATE3 2352 * STATE4 2353 * changed: 2354 * KEY 2355 * TKEYP (T1) 2356 */ 2357.align 4 2358_aesni_dec4: 2359 movaps (KEYP), KEY # key 2360 mov KEYP, TKEYP 2361 pxor KEY, STATE1 # round 0 2362 pxor KEY, STATE2 2363 pxor KEY, STATE3 2364 pxor KEY, STATE4 2365 add $0x30, TKEYP 2366 cmp $24, KLEN 2367 jb .L4dec128 2368 lea 0x20(TKEYP), TKEYP 2369 je .L4dec192 2370 add $0x20, TKEYP 2371 movaps -0x60(TKEYP), KEY 2372 AESDEC KEY STATE1 2373 AESDEC KEY STATE2 2374 AESDEC KEY STATE3 2375 AESDEC KEY STATE4 2376 movaps -0x50(TKEYP), KEY 2377 AESDEC KEY STATE1 2378 AESDEC KEY STATE2 2379 AESDEC KEY STATE3 2380 AESDEC KEY STATE4 2381.align 4 2382.L4dec192: 2383 movaps -0x40(TKEYP), KEY 2384 AESDEC KEY STATE1 2385 AESDEC KEY STATE2 2386 AESDEC KEY STATE3 2387 AESDEC KEY STATE4 2388 movaps -0x30(TKEYP), KEY 2389 AESDEC KEY STATE1 2390 AESDEC KEY STATE2 2391 AESDEC KEY STATE3 2392 AESDEC KEY STATE4 2393.align 4 2394.L4dec128: 2395 movaps -0x20(TKEYP), KEY 2396 AESDEC KEY STATE1 2397 AESDEC KEY STATE2 2398 AESDEC KEY STATE3 2399 AESDEC KEY STATE4 2400 movaps -0x10(TKEYP), KEY 2401 AESDEC KEY STATE1 2402 AESDEC KEY STATE2 2403 AESDEC KEY STATE3 2404 AESDEC KEY STATE4 2405 movaps (TKEYP), KEY 2406 AESDEC KEY STATE1 2407 AESDEC KEY STATE2 2408 AESDEC KEY STATE3 2409 AESDEC KEY STATE4 2410 movaps 0x10(TKEYP), KEY 2411 AESDEC KEY STATE1 2412 AESDEC KEY STATE2 2413 AESDEC KEY STATE3 2414 AESDEC KEY STATE4 2415 movaps 0x20(TKEYP), KEY 2416 AESDEC KEY STATE1 2417 AESDEC KEY STATE2 2418 AESDEC KEY STATE3 2419 AESDEC KEY STATE4 2420 movaps 0x30(TKEYP), KEY 2421 AESDEC KEY STATE1 2422 AESDEC KEY STATE2 2423 AESDEC KEY STATE3 2424 AESDEC KEY STATE4 2425 movaps 0x40(TKEYP), KEY 2426 AESDEC KEY STATE1 2427 AESDEC KEY STATE2 2428 AESDEC KEY STATE3 2429 AESDEC KEY STATE4 2430 movaps 0x50(TKEYP), KEY 2431 AESDEC KEY STATE1 2432 AESDEC KEY STATE2 2433 AESDEC KEY STATE3 2434 AESDEC KEY STATE4 2435 movaps 0x60(TKEYP), KEY 2436 AESDEC KEY STATE1 2437 AESDEC KEY STATE2 2438 AESDEC KEY STATE3 2439 AESDEC KEY STATE4 2440 movaps 0x70(TKEYP), KEY 2441 AESDECLAST KEY STATE1 # last round 2442 AESDECLAST KEY STATE2 2443 AESDECLAST KEY STATE3 2444 AESDECLAST KEY STATE4 2445 ret 2446ENDPROC(_aesni_dec4) 2447 2448/* 2449 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2450 * size_t len) 2451 */ 2452ENTRY(aesni_ecb_enc) 2453 FRAME_BEGIN 2454#ifndef __x86_64__ 2455 pushl LEN 2456 pushl KEYP 2457 pushl KLEN 2458 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2459 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2460 movl (FRAME_OFFSET+24)(%esp), INP # src 2461 movl (FRAME_OFFSET+28)(%esp), LEN # len 2462#endif 2463 test LEN, LEN # check length 2464 jz .Lecb_enc_ret 2465 mov 480(KEYP), KLEN 2466 cmp $16, LEN 2467 jb .Lecb_enc_ret 2468 cmp $64, LEN 2469 jb .Lecb_enc_loop1 2470.align 4 2471.Lecb_enc_loop4: 2472 movups (INP), STATE1 2473 movups 0x10(INP), STATE2 2474 movups 0x20(INP), STATE3 2475 movups 0x30(INP), STATE4 2476 call _aesni_enc4 2477 movups STATE1, (OUTP) 2478 movups STATE2, 0x10(OUTP) 2479 movups STATE3, 0x20(OUTP) 2480 movups STATE4, 0x30(OUTP) 2481 sub $64, LEN 2482 add $64, INP 2483 add $64, OUTP 2484 cmp $64, LEN 2485 jge .Lecb_enc_loop4 2486 cmp 
$16, LEN 2487 jb .Lecb_enc_ret 2488.align 4 2489.Lecb_enc_loop1: 2490 movups (INP), STATE1 2491 call _aesni_enc1 2492 movups STATE1, (OUTP) 2493 sub $16, LEN 2494 add $16, INP 2495 add $16, OUTP 2496 cmp $16, LEN 2497 jge .Lecb_enc_loop1 2498.Lecb_enc_ret: 2499#ifndef __x86_64__ 2500 popl KLEN 2501 popl KEYP 2502 popl LEN 2503#endif 2504 FRAME_END 2505 ret 2506ENDPROC(aesni_ecb_enc) 2507 2508/* 2509 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2510 * size_t len); 2511 */ 2512ENTRY(aesni_ecb_dec) 2513 FRAME_BEGIN 2514#ifndef __x86_64__ 2515 pushl LEN 2516 pushl KEYP 2517 pushl KLEN 2518 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2519 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2520 movl (FRAME_OFFSET+24)(%esp), INP # src 2521 movl (FRAME_OFFSET+28)(%esp), LEN # len 2522#endif 2523 test LEN, LEN 2524 jz .Lecb_dec_ret 2525 mov 480(KEYP), KLEN 2526 add $240, KEYP 2527 cmp $16, LEN 2528 jb .Lecb_dec_ret 2529 cmp $64, LEN 2530 jb .Lecb_dec_loop1 2531.align 4 2532.Lecb_dec_loop4: 2533 movups (INP), STATE1 2534 movups 0x10(INP), STATE2 2535 movups 0x20(INP), STATE3 2536 movups 0x30(INP), STATE4 2537 call _aesni_dec4 2538 movups STATE1, (OUTP) 2539 movups STATE2, 0x10(OUTP) 2540 movups STATE3, 0x20(OUTP) 2541 movups STATE4, 0x30(OUTP) 2542 sub $64, LEN 2543 add $64, INP 2544 add $64, OUTP 2545 cmp $64, LEN 2546 jge .Lecb_dec_loop4 2547 cmp $16, LEN 2548 jb .Lecb_dec_ret 2549.align 4 2550.Lecb_dec_loop1: 2551 movups (INP), STATE1 2552 call _aesni_dec1 2553 movups STATE1, (OUTP) 2554 sub $16, LEN 2555 add $16, INP 2556 add $16, OUTP 2557 cmp $16, LEN 2558 jge .Lecb_dec_loop1 2559.Lecb_dec_ret: 2560#ifndef __x86_64__ 2561 popl KLEN 2562 popl KEYP 2563 popl LEN 2564#endif 2565 FRAME_END 2566 ret 2567ENDPROC(aesni_ecb_dec) 2568 2569/* 2570 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2571 * size_t len, u8 *iv) 2572 */ 2573ENTRY(aesni_cbc_enc) 2574 FRAME_BEGIN 2575#ifndef __x86_64__ 2576 pushl IVP 2577 pushl LEN 2578 pushl KEYP 2579 pushl KLEN 2580 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2581 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2582 movl (FRAME_OFFSET+28)(%esp), INP # src 2583 movl (FRAME_OFFSET+32)(%esp), LEN # len 2584 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2585#endif 2586 cmp $16, LEN 2587 jb .Lcbc_enc_ret 2588 mov 480(KEYP), KLEN 2589 movups (IVP), STATE # load iv as initial state 2590.align 4 2591.Lcbc_enc_loop: 2592 movups (INP), IN # load input 2593 pxor IN, STATE 2594 call _aesni_enc1 2595 movups STATE, (OUTP) # store output 2596 sub $16, LEN 2597 add $16, INP 2598 add $16, OUTP 2599 cmp $16, LEN 2600 jge .Lcbc_enc_loop 2601 movups STATE, (IVP) 2602.Lcbc_enc_ret: 2603#ifndef __x86_64__ 2604 popl KLEN 2605 popl KEYP 2606 popl LEN 2607 popl IVP 2608#endif 2609 FRAME_END 2610 ret 2611ENDPROC(aesni_cbc_enc) 2612 2613/* 2614 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2615 * size_t len, u8 *iv) 2616 */ 2617ENTRY(aesni_cbc_dec) 2618 FRAME_BEGIN 2619#ifndef __x86_64__ 2620 pushl IVP 2621 pushl LEN 2622 pushl KEYP 2623 pushl KLEN 2624 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2625 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2626 movl (FRAME_OFFSET+28)(%esp), INP # src 2627 movl (FRAME_OFFSET+32)(%esp), LEN # len 2628 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2629#endif 2630 cmp $16, LEN 2631 jb .Lcbc_dec_just_ret 2632 mov 480(KEYP), KLEN 2633 add $240, KEYP 2634 movups (IVP), IV 2635 cmp $64, LEN 2636 jb .Lcbc_dec_loop1 2637.align 4 2638.Lcbc_dec_loop4: 2639 movups (INP), IN1 2640 movaps IN1, STATE1 2641 movups 0x10(INP), IN2 
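	# CBC chaining: after _aesni_dec4, each decrypted block must be XORed with
	# the previous ciphertext block (the IV for the first one), so the
	# ciphertext is kept in the IN* registers (reloaded from memory on 32-bit)
	# while the STATE* copies are decrypted.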
2642 movaps IN2, STATE2 2643#ifdef __x86_64__ 2644 movups 0x20(INP), IN3 2645 movaps IN3, STATE3 2646 movups 0x30(INP), IN4 2647 movaps IN4, STATE4 2648#else 2649 movups 0x20(INP), IN1 2650 movaps IN1, STATE3 2651 movups 0x30(INP), IN2 2652 movaps IN2, STATE4 2653#endif 2654 call _aesni_dec4 2655 pxor IV, STATE1 2656#ifdef __x86_64__ 2657 pxor IN1, STATE2 2658 pxor IN2, STATE3 2659 pxor IN3, STATE4 2660 movaps IN4, IV 2661#else 2662 pxor IN1, STATE4 2663 movaps IN2, IV 2664 movups (INP), IN1 2665 pxor IN1, STATE2 2666 movups 0x10(INP), IN2 2667 pxor IN2, STATE3 2668#endif 2669 movups STATE1, (OUTP) 2670 movups STATE2, 0x10(OUTP) 2671 movups STATE3, 0x20(OUTP) 2672 movups STATE4, 0x30(OUTP) 2673 sub $64, LEN 2674 add $64, INP 2675 add $64, OUTP 2676 cmp $64, LEN 2677 jge .Lcbc_dec_loop4 2678 cmp $16, LEN 2679 jb .Lcbc_dec_ret 2680.align 4 2681.Lcbc_dec_loop1: 2682 movups (INP), IN 2683 movaps IN, STATE 2684 call _aesni_dec1 2685 pxor IV, STATE 2686 movups STATE, (OUTP) 2687 movaps IN, IV 2688 sub $16, LEN 2689 add $16, INP 2690 add $16, OUTP 2691 cmp $16, LEN 2692 jge .Lcbc_dec_loop1 2693.Lcbc_dec_ret: 2694 movups IV, (IVP) 2695.Lcbc_dec_just_ret: 2696#ifndef __x86_64__ 2697 popl KLEN 2698 popl KEYP 2699 popl LEN 2700 popl IVP 2701#endif 2702 FRAME_END 2703 ret 2704ENDPROC(aesni_cbc_dec) 2705 2706#ifdef __x86_64__ 2707.pushsection .rodata 2708.align 16 2709.Lbswap_mask: 2710 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2711.popsection 2712 2713/* 2714 * _aesni_inc_init: internal ABI 2715 * setup registers used by _aesni_inc 2716 * input: 2717 * IV 2718 * output: 2719 * CTR: == IV, in little endian 2720 * TCTR_LOW: == lower qword of CTR 2721 * INC: == 1, in little endian 2722 * BSWAP_MASK == endian swapping mask 2723 */ 2724.align 4 2725_aesni_inc_init: 2726 movaps .Lbswap_mask, BSWAP_MASK 2727 movaps IV, CTR 2728 PSHUFB_XMM BSWAP_MASK CTR 2729 mov $1, TCTR_LOW 2730 MOVQ_R64_XMM TCTR_LOW INC 2731 MOVQ_R64_XMM CTR TCTR_LOW 2732 ret 2733ENDPROC(_aesni_inc_init) 2734 2735/* 2736 * _aesni_inc: internal ABI 2737 * Increase IV by 1, IV is in big endian 2738 * input: 2739 * IV 2740 * CTR: == IV, in little endian 2741 * TCTR_LOW: == lower qword of CTR 2742 * INC: == 1, in little endian 2743 * BSWAP_MASK == endian swapping mask 2744 * output: 2745 * IV: Increase by 1 2746 * changed: 2747 * CTR: == output IV, in little endian 2748 * TCTR_LOW: == lower qword of CTR 2749 */ 2750.align 4 2751_aesni_inc: 2752 paddq INC, CTR 2753 add $1, TCTR_LOW 2754 jnc .Linc_low 2755 pslldq $8, INC 2756 paddq INC, CTR 2757 psrldq $8, INC 2758.Linc_low: 2759 movaps CTR, IV 2760 PSHUFB_XMM BSWAP_MASK IV 2761 ret 2762ENDPROC(_aesni_inc) 2763 2764/* 2765 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2766 * size_t len, u8 *iv) 2767 */ 2768ENTRY(aesni_ctr_enc) 2769 FRAME_BEGIN 2770 cmp $16, LEN 2771 jb .Lctr_enc_just_ret 2772 mov 480(KEYP), KLEN 2773 movups (IVP), IV 2774 call _aesni_inc_init 2775 cmp $64, LEN 2776 jb .Lctr_enc_loop1 2777.align 4 2778.Lctr_enc_loop4: 2779 movaps IV, STATE1 2780 call _aesni_inc 2781 movups (INP), IN1 2782 movaps IV, STATE2 2783 call _aesni_inc 2784 movups 0x10(INP), IN2 2785 movaps IV, STATE3 2786 call _aesni_inc 2787 movups 0x20(INP), IN3 2788 movaps IV, STATE4 2789 call _aesni_inc 2790 movups 0x30(INP), IN4 2791 call _aesni_enc4 2792 pxor IN1, STATE1 2793 movups STATE1, (OUTP) 2794 pxor IN2, STATE2 2795 movups STATE2, 0x10(OUTP) 2796 pxor IN3, STATE3 2797 movups STATE3, 0x20(OUTP) 2798 pxor IN4, STATE4 2799 movups STATE4, 0x30(OUTP) 2800 sub $64, LEN 
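	# The four counter blocks encrypted by _aesni_enc4 were XORed with the
	# plaintext above to produce ciphertext; LEN has been reduced by 64, the
	# pointers advance below, and the loop repeats while at least four more
	# 16-byte blocks remain.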
2801 add $64, INP 2802 add $64, OUTP 2803 cmp $64, LEN 2804 jge .Lctr_enc_loop4 2805 cmp $16, LEN 2806 jb .Lctr_enc_ret 2807.align 4 2808.Lctr_enc_loop1: 2809 movaps IV, STATE 2810 call _aesni_inc 2811 movups (INP), IN 2812 call _aesni_enc1 2813 pxor IN, STATE 2814 movups STATE, (OUTP) 2815 sub $16, LEN 2816 add $16, INP 2817 add $16, OUTP 2818 cmp $16, LEN 2819 jge .Lctr_enc_loop1 2820.Lctr_enc_ret: 2821 movups IV, (IVP) 2822.Lctr_enc_just_ret: 2823 FRAME_END 2824 ret 2825ENDPROC(aesni_ctr_enc) 2826 2827/* 2828 * _aesni_gf128mul_x_ble: internal ABI 2829 * Multiply in GF(2^128) for XTS IVs 2830 * input: 2831 * IV: current IV 2832 * GF128MUL_MASK == mask with 0x87 and 0x01 2833 * output: 2834 * IV: next IV 2835 * changed: 2836 * CTR: == temporary value 2837 */ 2838#define _aesni_gf128mul_x_ble() \ 2839 pshufd $0x13, IV, CTR; \ 2840 paddq IV, IV; \ 2841 psrad $31, CTR; \ 2842 pand GF128MUL_MASK, CTR; \ 2843 pxor CTR, IV; 2844 2845/* 2846 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2847 * bool enc, u8 *iv) 2848 */ 2849ENTRY(aesni_xts_crypt8) 2850 FRAME_BEGIN 2851 cmpb $0, %cl 2852 movl $0, %ecx 2853 movl $240, %r10d 2854 leaq _aesni_enc4, %r11 2855 leaq _aesni_dec4, %rax 2856 cmovel %r10d, %ecx 2857 cmoveq %rax, %r11 2858 2859 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2860 movups (IVP), IV 2861 2862 mov 480(KEYP), KLEN 2863 addq %rcx, KEYP 2864 2865 movdqa IV, STATE1 2866 movdqu 0x00(INP), INC 2867 pxor INC, STATE1 2868 movdqu IV, 0x00(OUTP) 2869 2870 _aesni_gf128mul_x_ble() 2871 movdqa IV, STATE2 2872 movdqu 0x10(INP), INC 2873 pxor INC, STATE2 2874 movdqu IV, 0x10(OUTP) 2875 2876 _aesni_gf128mul_x_ble() 2877 movdqa IV, STATE3 2878 movdqu 0x20(INP), INC 2879 pxor INC, STATE3 2880 movdqu IV, 0x20(OUTP) 2881 2882 _aesni_gf128mul_x_ble() 2883 movdqa IV, STATE4 2884 movdqu 0x30(INP), INC 2885 pxor INC, STATE4 2886 movdqu IV, 0x30(OUTP) 2887 2888 CALL_NOSPEC %r11 2889 2890 movdqu 0x00(OUTP), INC 2891 pxor INC, STATE1 2892 movdqu STATE1, 0x00(OUTP) 2893 2894 _aesni_gf128mul_x_ble() 2895 movdqa IV, STATE1 2896 movdqu 0x40(INP), INC 2897 pxor INC, STATE1 2898 movdqu IV, 0x40(OUTP) 2899 2900 movdqu 0x10(OUTP), INC 2901 pxor INC, STATE2 2902 movdqu STATE2, 0x10(OUTP) 2903 2904 _aesni_gf128mul_x_ble() 2905 movdqa IV, STATE2 2906 movdqu 0x50(INP), INC 2907 pxor INC, STATE2 2908 movdqu IV, 0x50(OUTP) 2909 2910 movdqu 0x20(OUTP), INC 2911 pxor INC, STATE3 2912 movdqu STATE3, 0x20(OUTP) 2913 2914 _aesni_gf128mul_x_ble() 2915 movdqa IV, STATE3 2916 movdqu 0x60(INP), INC 2917 pxor INC, STATE3 2918 movdqu IV, 0x60(OUTP) 2919 2920 movdqu 0x30(OUTP), INC 2921 pxor INC, STATE4 2922 movdqu STATE4, 0x30(OUTP) 2923 2924 _aesni_gf128mul_x_ble() 2925 movdqa IV, STATE4 2926 movdqu 0x70(INP), INC 2927 pxor INC, STATE4 2928 movdqu IV, 0x70(OUTP) 2929 2930 _aesni_gf128mul_x_ble() 2931 movups IV, (IVP) 2932 2933 CALL_NOSPEC %r11 2934 2935 movdqu 0x40(OUTP), INC 2936 pxor INC, STATE1 2937 movdqu STATE1, 0x40(OUTP) 2938 2939 movdqu 0x50(OUTP), INC 2940 pxor INC, STATE2 2941 movdqu STATE2, 0x50(OUTP) 2942 2943 movdqu 0x60(OUTP), INC 2944 pxor INC, STATE3 2945 movdqu STATE3, 0x60(OUTP) 2946 2947 movdqu 0x70(OUTP), INC 2948 pxor INC, STATE4 2949 movdqu STATE4, 0x70(OUTP) 2950 2951 FRAME_END 2952 ret 2953ENDPROC(aesni_xts_crypt8) 2954 2955#endif 2956
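/*
 * Illustrative C sketch (not part of this file) of what the
 * _aesni_gf128mul_x_ble() macro above computes: doubling a 128-bit XTS tweak
 * held as two little-endian 64-bit halves (t[0] = bytes 0..7, t[1] = bytes
 * 8..15), reduced by x^128 + x^7 + x^2 + x + 1 -- the 0x87 byte in
 * .Lgf128mul_x_ble_mask. Function and variable names are hypothetical:
 *
 *	void gf128mul_x_ble_sketch(unsigned long long t[2])
 *	{
 *		unsigned long long carry = t[1] >> 63;	// bit 127 shifted out
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);	// carry low half into high
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);// fold the reduction back in
 *	}
 */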