/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single); for integer use movdqa (move
 * double quad aligned). It hasn't made a performance difference which
 * instruction is used since Nehalem (the original Core i7) was released.
 * However, movaps is a byte shorter, so that is the one we'll use for now
 * (same for the unaligned variant).
 */
#define MOVADQ  movaps
#define MOVUDQ  movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section .rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section .rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section .rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section .rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2
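# SHUF_MASK is applied with pshufb to byte-reverse a 16-byte block; this is
# the byte-level half of the bit reflection GHASH operates in. A minimal C
# sketch of the same permutation (illustrative only, not kernel code):
#
#	void byte_reflect(unsigned char out[16], const unsigned char in[16])
#	{
#		for (int i = 0; i < 16; i++)
#			out[i] = in[15 - i];
#	}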
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section .rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text

#define AadHash      16*0
#define AadLen       16*1
#define InLen        (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV       16*3
#define CurCount     16*4
#define PBlockLen    16*5
#define HashKey      16*6   // store HashKey <<1 mod poly here
#define HashKey_2    16*7   // store HashKey^2 <<1 mod poly here
#define HashKey_3    16*8   // store HashKey^3 <<1 mod poly here
#define HashKey_4    16*9   // store HashKey^4 <<1 mod poly here
#define HashKey_k    16*10  // store XOR of High 64 bits and Low 64
                            // bits of HashKey <<1 mod poly here
                            // (for Karatsuba purposes)
#define HashKey_2_k  16*11  // store XOR of High 64 bits and Low 64
                            // bits of HashKey^2 <<1 mod poly here
                            // (for Karatsuba purposes)
#define HashKey_3_k  16*12  // store XOR of High 64 bits and Low 64
                            // bits of HashKey^3 <<1 mod poly here
                            // (for Karatsuba purposes)
#define HashKey_4_k  16*13  // store XOR of High 64 bits and Low 64
                            // bits of HashKey^4 <<1 mod poly here
                            // (for Karatsuba purposes)

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define keysize 2*15*16(%arg1)
#endif


#define STATE1  %xmm0
#define STATE2  %xmm4
#define STATE3  %xmm5
#define STATE4  %xmm6
#define STATE   STATE1
#define IN1     %xmm1
#define IN2     %xmm7
#define IN3     %xmm8
#define IN4     %xmm9
#define IN      IN1
#define KEY     %xmm2
#define IV      %xmm3

#define BSWAP_MASK %xmm10
#define CTR        %xmm11
#define INC        %xmm12

#define GF128MUL_MASK %xmm7

#ifdef __x86_64__
#define AREG  %rax
#define KEYP  %rdi
#define OUTP  %rsi
#define UKEYP OUTP
#define INP   %rdx
#define LEN   %rcx
#define IVP   %r8
#define KLEN  %r9d
#define T1    %r10
#define TKEYP T1
#define T2    %r11
#define TCTR_LOW T2
#else
#define AREG  %eax
#define KEYP  %edi
#define OUTP  AREG
#define UKEYP OUTP
#define INP   %edx
#define LEN   %esi
#define IVP   %ebp
#define KLEN  %ebx
#define T1    %ecx
#define TKEYP T1
#endif

.macro FUNC_SAVE
        push    %r12
        push    %r13
        push    %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm

.macro FUNC_RESTORE
        pop     %r14
        pop     %r13
        pop     %r12
.endm
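# For reference, a C-level view of the layout the offset macros above index
# into. This is a sketch assuming the gcm_context_data definition in the C
# glue code; field names are illustrative:
#
#	struct gcm_context_data {
#		u8 aad_hash[16];		/* AadHash,      16*0   */
#		u64 aad_length;			/* AadLen,       16*1   */
#		u64 in_length;			/* InLen,        16*1+8 */
#		u8 partial_block_enc_key[16];	/* PBlockEncKey, 16*2   */
#		u8 orig_IV[16];			/* OrigIV,       16*3   */
#		u8 current_counter[16];		/* CurCount,     16*4   */
#		u64 partial_block_len;		/* PBlockLen,    16*5   */
#		u64 unused;			/* pad to 16*6          */
#		u8 hash_keys[8][16];		/* HashKey..HashKey_4_k */
#	};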
# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
        mov     \SUBKEY, %r12
        movdqu  (%r12), \TMP3
        movdqa  SHUF_MASK(%rip), \TMP2
        pshufb  \TMP2, \TMP3

        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

        movdqa  \TMP3, \TMP2
        psllq   $1, \TMP3
        psrlq   $63, \TMP2
        movdqa  \TMP2, \TMP1
        pslldq  $8, \TMP2
        psrldq  $8, \TMP1
        por     \TMP2, \TMP3

        # reduce HashKey<<1

        pshufd  $0x24, \TMP1, \TMP2
        pcmpeqd TWOONE(%rip), \TMP2
        pand    POLY(%rip), \TMP2
        pxor    \TMP2, \TMP3
        movdqu  \TMP3, HashKey(%arg2)

        movdqa  \TMP3, \TMP5
        pshufd  $78, \TMP3, \TMP1
        pxor    \TMP3, \TMP1
        movdqu  \TMP1, HashKey_k(%arg2)

        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
        movdqu  \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
        pshufd  $78, \TMP5, \TMP1
        pxor    \TMP5, \TMP1
        movdqu  \TMP1, HashKey_2_k(%arg2)

        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
        movdqu  \TMP5, HashKey_3(%arg2)
        pshufd  $78, \TMP5, \TMP1
        pxor    \TMP5, \TMP1
        movdqu  \TMP1, HashKey_3_k(%arg2)

        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
        movdqu  \TMP5, HashKey_4(%arg2)
        pshufd  $78, \TMP5, \TMP1
        pxor    \TMP5, \TMP1
        movdqu  \TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
        mov     \AADLEN, %r11
        mov     %r11, AadLen(%arg2)       # ctx_data.aad_length = aad_length
        xor     %r11d, %r11d
        mov     %r11, InLen(%arg2)        # ctx_data.in_length = 0
        mov     %r11, PBlockLen(%arg2)    # ctx_data.partial_block_length = 0
        mov     %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
        mov     \Iv, %rax
        movdqu  (%rax), %xmm0
        movdqu  %xmm0, OrigIV(%arg2)      # ctx_data.orig_IV = iv

        movdqa  SHUF_MASK(%rip), %xmm2
        pshufb  %xmm2, %xmm0
        movdqu  %xmm0, CurCount(%arg2)    # ctx_data.current_counter = iv

        PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
        movdqu  HashKey(%arg2), %xmm13

        CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
        %xmm4, %xmm5, %xmm6
.endm
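# What PRECOMPUTE's shift/compare/mask sequence above computes, as scalar C:
# a 128-bit left shift of the byte-reflected subkey with a conditional XOR
# of POLY when a bit falls off the top (a sketch, not kernel code):
#
#	#include <stdint.h>
#
#	static void hashkey_shl1_modpoly(uint64_t *hi, uint64_t *lo)
#	{
#		uint64_t carry = *hi >> 63;
#
#		*hi = (*hi << 1) | (*lo >> 63);
#		*lo <<= 1;
#		if (carry) {			/* reduce mod the GCM poly */
#			*hi ^= 0xC200000000000000ULL;	/* high half of POLY */
#			*lo ^= 1;			/* low half of POLY  */
#		}
#	}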
# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
        movdqu  AadHash(%arg2), %xmm8
        movdqu  HashKey(%arg2), %xmm13
        add     %arg5, InLen(%arg2)

        xor     %r11d, %r11d            # initialise the data pointer offset as zero
        PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

        sub     %r11, %arg5             # sub partial block data used
        mov     %arg5, %r13             # save the number of bytes

        and     $-16, %r13              # %r13 = %r13 - (%r13 mod 16)
        mov     %r13, %r12
        # Encrypt/Decrypt first few blocks

        and     $(3<<4), %r12
        jz      .L_initial_num_blocks_is_0_\@
        cmp     $(2<<4), %r12
        jb      .L_initial_num_blocks_is_1_\@
        je      .L_initial_num_blocks_is_2_\@
.L_initial_num_blocks_is_3_\@:
        INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
        sub     $48, %r13
        jmp     .L_initial_blocks_\@
.L_initial_num_blocks_is_2_\@:
        INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
        sub     $32, %r13
        jmp     .L_initial_blocks_\@
.L_initial_num_blocks_is_1_\@:
        INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
        sub     $16, %r13
        jmp     .L_initial_blocks_\@
.L_initial_num_blocks_is_0_\@:
        INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
.L_initial_blocks_\@:

        # Main loop - Encrypt/Decrypt remaining blocks

        test    %r13, %r13
        je      .L_zero_cipher_left_\@
        sub     $64, %r13
        je      .L_four_cipher_left_\@
.L_crypt_by_4_\@:
        GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
        %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
        %xmm7, %xmm8, enc
        add     $64, %r11
        sub     $64, %r13
        jne     .L_crypt_by_4_\@
.L_four_cipher_left_\@:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
.L_zero_cipher_left_\@:
        movdqu  %xmm8, AadHash(%arg2)
        movdqu  %xmm0, CurCount(%arg2)

        mov     %arg5, %r13
        and     $15, %r13               # %r13 = arg5 (mod 16)
        je      .L_multiple_of_16_bytes_\@

        mov     %r13, PBlockLen(%arg2)

        # Handle the last <16 Byte block separately
        paddd   ONE(%rip), %xmm0        # INCR CNT to get Yn
        movdqu  %xmm0, CurCount(%arg2)
        movdqa  SHUF_MASK(%rip), %xmm10
        pshufb  %xmm10, %xmm0

        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
        movdqu  %xmm0, PBlockEncKey(%arg2)

        cmp     $16, %arg5
        jge     .L_large_enough_update_\@

        lea     (%arg4,%r11,1), %r10
        mov     %r13, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
        jmp     .L_data_read_\@

.L_large_enough_update_\@:
        sub     $16, %r11
        add     %r13, %r11

        # receive the last <16 Byte block
        movdqu  (%arg4, %r11, 1), %xmm1

        sub     %r13, %r11
        add     $16, %r11

        lea     SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        sub     %r13, %r12
        # get the appropriate shuffle mask
        movdqu  (%r12), %xmm2
        # shift right 16-r13 bytes
        pshufb  %xmm2, %xmm1

.L_data_read_\@:
        lea     ALL_F+16(%rip), %r12
        sub     %r13, %r12

.ifc \operation, dec
        movdqa  %xmm1, %xmm2
.endif
        pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
        movdqu  (%r12), %xmm1
        # get the appropriate mask to mask out top 16-r13 bytes of xmm0
        pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
        pand    %xmm1, %xmm2
        movdqa  SHUF_MASK(%rip), %xmm10
        pshufb  %xmm10, %xmm2

        pxor    %xmm2, %xmm8
.else
        movdqa  SHUF_MASK(%rip), %xmm10
        pshufb  %xmm10, %xmm0

        pxor    %xmm0, %xmm8
.endif

        movdqu  %xmm8, AadHash(%arg2)
.ifc \operation, enc
        # GHASH computation for the last <16 byte block
        movdqa  SHUF_MASK(%rip), %xmm10
        # shuffle xmm0 back to output as ciphertext
        pshufb  %xmm10, %xmm0
.endif

        # Output %r13 bytes
        movq    %xmm0, %rax
        cmp     $8, %r13
        jle     .L_less_than_8_bytes_left_\@
        mov     %rax, (%arg3 , %r11, 1)
        add     $8, %r11
        psrldq  $8, %xmm0
        movq    %xmm0, %rax
        sub     $8, %r13
.L_less_than_8_bytes_left_\@:
        mov     %al, (%arg3, %r11, 1)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     .L_less_than_8_bytes_left_\@
.L_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
        movdqu  AadHash(%arg2), %xmm8
        movdqu  HashKey(%arg2), %xmm13

        mov     PBlockLen(%arg2), %r12

        test    %r12, %r12
        je      .L_partial_done\@

        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

.L_partial_done\@:
        mov     AadLen(%arg2), %r12     # %r12 = aadLen (number of bytes)
        shl     $3, %r12                # convert into number of bits
        movd    %r12d, %xmm15           # len(A) in %xmm15
        mov     InLen(%arg2), %r12
        shl     $3, %r12                # len(C) in bits
        movq    %r12, %xmm1

        pslldq  $8, %xmm15              # %xmm15 = len(A)||0x0000000000000000
        pxor    %xmm1, %xmm15           # %xmm15 = len(A)||len(C)
        pxor    %xmm15, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        movdqa  SHUF_MASK(%rip), %xmm10
        pshufb  %xmm10, %xmm8

        movdqu  OrigIV(%arg2), %xmm0    # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
        pxor    %xmm8, %xmm0
.L_return_T_\@:
        mov     \AUTHTAG, %r10          # %r10 = authTag
        mov     \AUTHTAGLEN, %r11       # %r11 = auth_tag_len
        cmp     $16, %r11
        je      .L_T_16_\@
        cmp     $8, %r11
        jl      .L_T_4_\@
.L_T_8_\@:
        movq    %xmm0, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        psrldq  $8, %xmm0
        test    %r11, %r11
        je      .L_return_T_done_\@
.L_T_4_\@:
        movd    %xmm0, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        psrldq  $4, %xmm0
        test    %r11, %r11
        je      .L_return_T_done_\@
.L_T_123_\@:
        movd    %xmm0, %eax
        cmp     $2, %r11
        jl      .L_T_1_\@
        mov     %ax, (%r10)
        cmp     $2, %r11
        je      .L_return_T_done_\@
        add     $2, %r10
        sar     $16, %eax
.L_T_1_\@:
        mov     %al, (%r10)
        jmp     .L_return_T_done_\@
.L_T_16_\@:
        movdqu  %xmm0, (%r10)
.L_return_T_done_\@:
.endm

#ifdef __x86_64__
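# For reference, the Karatsuba multiply and two-phase shift reduction that
# the GHASH_MUL macro below implements, expressed with C intrinsics (a
# sketch, not kernel code; compile with -msse2 -mpclmul):
#
#	#include <emmintrin.h>
#	#include <wmmintrin.h>	/* _mm_clmulepi64_si128 */
#
#	static __m128i ghash_mul(__m128i gh, __m128i hk)
#	{
#		__m128i hi  = _mm_clmulepi64_si128(gh, hk, 0x11); /* a1*b1 */
#		__m128i lo  = _mm_clmulepi64_si128(gh, hk, 0x00); /* a0*b0 */
#		__m128i a   = _mm_xor_si128(gh, _mm_shuffle_epi32(gh, 78));
#		__m128i b   = _mm_xor_si128(hk, _mm_shuffle_epi32(hk, 78));
#		__m128i mid = _mm_clmulepi64_si128(a, b, 0x00);
#		__m128i t, u;
#
#		mid = _mm_xor_si128(mid, _mm_xor_si128(hi, lo));
#		lo  = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
#		hi  = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
#
#		/* first phase of the reduction */
#		t = _mm_xor_si128(_mm_slli_epi32(lo, 31),
#		    _mm_xor_si128(_mm_slli_epi32(lo, 30),
#				  _mm_slli_epi32(lo, 25)));
#		u  = _mm_srli_si128(t, 4);
#		lo = _mm_xor_si128(lo, _mm_slli_si128(t, 12));
#
#		/* second phase of the reduction */
#		t = _mm_xor_si128(_mm_srli_epi32(lo, 1),
#		    _mm_xor_si128(_mm_srli_epi32(lo, 2),
#				  _mm_srli_epi32(lo, 7)));
#		t  = _mm_xor_si128(t, u);
#		lo = _mm_xor_si128(lo, t);
#		return _mm_xor_si128(lo, hi);
#	}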
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
        movdqa    \GH, \TMP1
        pshufd    $78, \GH, \TMP2
        pshufd    $78, \HK, \TMP3
        pxor      \GH, \TMP2            # TMP2 = a1+a0
        pxor      \HK, \TMP3            # TMP3 = b1+b0
        pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
        pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
        pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
        pxor      \GH, \TMP2
        pxor      \TMP1, \TMP2          # TMP2 = a1*b0 + a0*b1 (middle term)
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP3, \GH
        pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

        # first phase of the reduction

        movdqa    \GH, \TMP2
        movdqa    \GH, \TMP3
        movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform
                                        # independent shifts
        pslld     $31, \TMP2            # packed left shift by 31
        pslld     $30, \TMP3            # packed left shift by 30
        pslld     $25, \TMP4            # packed left shift by 25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5             # right shift TMP5 1 DW
        pslldq    $12, \TMP2            # left shift TMP2 3 DWs
        pxor      \TMP2, \GH

        # second phase of the reduction

        movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform
                                        # independent shifts
        movdqa    \GH,\TMP3
        movdqa    \GH,\TMP4
        psrld     $1,\TMP2              # packed right shift by 1
        psrld     $2,\TMP3              # packed right shift by 2
        psrld     $7,\TMP4              # packed right shift by 7
        pxor      \TMP3,\TMP2           # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \GH
        pxor      \TMP1, \GH            # result is in GH
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
        cmp     $8, \DLEN
        jl      .L_read_lt8_\@
        mov     (\DPTR), %rax
        movq    %rax, \XMMDst
        sub     $8, \DLEN
        jz      .L_done_read_partial_block_\@
        xor     %eax, %eax
.L_read_next_byte_\@:
        shl     $8, %rax
        mov     7(\DPTR, \DLEN, 1), %al
        dec     \DLEN
        jnz     .L_read_next_byte_\@
        movq    %rax, \XMM1
        pslldq  $8, \XMM1
        por     \XMM1, \XMMDst
        jmp     .L_done_read_partial_block_\@
.L_read_lt8_\@:
        xor     %eax, %eax
.L_read_next_byte_lt8_\@:
        shl     $8, %rax
        mov     -1(\DPTR, \DLEN, 1), %al
        dec     \DLEN
        jnz     .L_read_next_byte_lt8_\@
        movq    %rax, \XMMDst
.L_done_read_partial_block_\@:
.endm
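# READ_PARTIAL_BLOCK in C terms: assemble the trailing bytes through a
# register so no byte past the buffer is ever touched. A little-endian
# sketch (illustrative, not kernel code):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	static void read_partial_block(const uint8_t *src, unsigned int len,
#				       uint8_t dst[16])
#	{
#		uint64_t lo = 0, hi = 0;
#		unsigned int i;
#
#		memset(dst, 0, 16);
#		if (len >= 8) {
#			memcpy(&lo, src, 8);	/* bytes 0..7 in one load */
#			for (i = len; i > 8; i--)
#				hi = (hi << 8) | src[i - 1];
#			memcpy(dst + 8, &hi, 8);
#		} else {
#			for (i = len; i > 0; i--)
#				lo = (lo << 8) | src[i - 1];
#		}
#		memcpy(dst, &lo, 8);
#	}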
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
        TMP6 TMP7
        MOVADQ  SHUF_MASK(%rip), %xmm14
        mov     \AAD, %r10              # %r10 = AAD
        mov     \AADLEN, %r11           # %r11 = aadLen
        pxor    \TMP7, \TMP7
        pxor    \TMP6, \TMP6

        cmp     $16, %r11
        jl      .L_get_AAD_rest\@
.L_get_AAD_blocks\@:
        movdqu  (%r10), \TMP7
        pshufb  %xmm14, \TMP7           # byte-reflect the AAD data
        pxor    \TMP7, \TMP6
        GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
        add     $16, %r10
        sub     $16, %r11
        cmp     $16, %r11
        jge     .L_get_AAD_blocks\@

        movdqu  \TMP6, \TMP7

        /* read the last <16B of AAD */
.L_get_AAD_rest\@:
        test    %r11, %r11
        je      .L_get_AAD_done\@

        READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
        pshufb  %xmm14, \TMP7           # byte-reflect the AAD data
        pxor    \TMP6, \TMP7
        GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
        movdqu  \TMP7, \TMP6

.L_get_AAD_done\@:
        movdqu  \TMP6, AadHash(%arg2)
.endm

# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
        AAD_HASH operation
        mov     PBlockLen(%arg2), %r13
        test    %r13, %r13
        je      .L_partial_block_done_\@ # Leave Macro if no partial blocks
        # Read in input data without over reading
        cmp     $16, \PLAIN_CYPH_LEN
        jl      .L_fewer_than_16_bytes_\@
        movups  (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
        jmp     .L_data_read_\@

.L_fewer_than_16_bytes_\@:
        lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
        mov     \PLAIN_CYPH_LEN, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

        mov     PBlockLen(%arg2), %r13

.L_data_read_\@:                        # Finished reading in data

        movdqu  PBlockEncKey(%arg2), %xmm9
        movdqu  HashKey(%arg2), %xmm13

        lea     SHIFT_MASK(%rip), %r12

        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (16-r13 is the number of bytes in plaintext mod 16)
        add     %r13, %r12
        movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
        pshufb  %xmm2, %xmm9            # shift right r13 bytes

.ifc \operation, dec
        movdqa  %xmm1, %xmm3
        pxor    %xmm1, %xmm9            # Ciphertext XOR E(K, Yn)

        mov     \PLAIN_CYPH_LEN, %r10
        add     %r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub     $16, %r10
        # Determine if partial block is not being filled and
        # shift mask accordingly
        jge     .L_no_extra_mask_1_\@
        sub     %r10, %r12
.L_no_extra_mask_1_\@:

        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        pand    %xmm1, %xmm9            # mask out bottom r13 bytes of xmm9

        pand    %xmm1, %xmm3
        movdqa  SHUF_MASK(%rip), %xmm10
        pshufb  %xmm10, %xmm3
        pshufb  %xmm2, %xmm3
        pxor    %xmm3, \AAD_HASH

        test    %r10, %r10
        jl      .L_partial_incomplete_1_\@

        # GHASH computation for the last <16 Byte block
        GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor     %eax, %eax

        mov     %rax, PBlockLen(%arg2)
        jmp     .L_dec_done_\@
.L_partial_incomplete_1_\@:
        add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
.L_dec_done_\@:
        movdqu  \AAD_HASH, AadHash(%arg2)
.else
        pxor    %xmm1, %xmm9            # Plaintext XOR E(K, Yn)

        mov     \PLAIN_CYPH_LEN, %r10
        add     %r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub     $16, %r10
        # Determine if partial block is not being filled and
        # shift mask accordingly
        jge     .L_no_extra_mask_2_\@
        sub     %r10, %r12
.L_no_extra_mask_2_\@:

        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        pand    %xmm1, %xmm9

        movdqa  SHUF_MASK(%rip), %xmm1
        pshufb  %xmm1, %xmm9
        pshufb  %xmm2, %xmm9
        pxor    %xmm9, \AAD_HASH

        test    %r10, %r10
        jl      .L_partial_incomplete_2_\@

        # GHASH computation for the last <16 Byte block
        GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor     %eax, %eax

        mov     %rax, PBlockLen(%arg2)
        jmp     .L_encode_done_\@
.L_partial_incomplete_2_\@:
        add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
.L_encode_done_\@:
        movdqu  \AAD_HASH, AadHash(%arg2)

        movdqa  SHUF_MASK(%rip), %xmm10
        # shuffle xmm9 back to output as ciphertext
        pshufb  %xmm10, %xmm9
        pshufb  %xmm2, %xmm9
.endif
        # output encrypted Bytes
        test    %r10, %r10
        jl      .L_partial_fill_\@
        mov     %r13, %r12
        mov     $16, %r13
        # Set r13 to be the number of bytes to write out
        sub     %r12, %r13
        jmp     .L_count_set_\@
.L_partial_fill_\@:
        mov     \PLAIN_CYPH_LEN, %r13
.L_count_set_\@:
        movdqa  %xmm9, %xmm0
        movq    %xmm0, %rax
        cmp     $8, %r13
        jle     .L_less_than_8_bytes_left_\@

        mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add     $8, \DATA_OFFSET
        psrldq  $8, %xmm0
        movq    %xmm0, %rax
        sub     $8, %r13
.L_less_than_8_bytes_left_\@:
        movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add     $1, \DATA_OFFSET
        shr     $8, %rax
        sub     $1, %r13
        jne     .L_less_than_8_bytes_left_\@
.L_partial_block_done_\@:
.endm # PARTIAL_BLOCK

/*
* if a = number of total plaintext bytes
*	b = floor(a/16)
*	num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3 are used as a pointer only, not modified
*/


.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
        XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        MOVADQ  SHUF_MASK(%rip), %xmm14

        movdqu  AadHash(%arg2), %xmm\i  # %xmm\i = AadHash

        # start AES for num_initial_blocks blocks

        movdqu  CurCount(%arg2), \XMM0  # XMM0 = Y0

.if (\i == 5) || (\i == 6) || (\i == 7)

        MOVADQ  ONE(%RIP),\TMP1
        MOVADQ  0(%arg1),\TMP2
.irpc index, \i_seq
        paddd   \TMP1, \XMM0            # INCR Y0
.ifc \operation, dec
        movdqa  \XMM0, %xmm\index
.else
        MOVADQ  \XMM0, %xmm\index
.endif
        pshufb  %xmm14, %xmm\index      # perform a 16 byte swap
        pxor    \TMP2, %xmm\index
.endr
        lea     0x10(%arg1),%r10
        mov     keysize,%eax
        shr     $2,%eax                 # 128->4, 192->6, 256->8
        add     $5,%eax                 # 128->9, 192->11, 256->13

.Laes_loop_initial_\@:
        MOVADQ  (%r10),\TMP1
.irpc index, \i_seq
        aesenc  \TMP1, %xmm\index
.endr
        add     $16,%r10
        sub     $1,%eax
        jnz     .Laes_loop_initial_\@

        MOVADQ  (%r10), \TMP1
.irpc index, \i_seq
        aesenclast \TMP1, %xmm\index    # Last Round
.endr
.irpc index, \i_seq
        movdqu  (%arg4 , %r11, 1), \TMP1
        pxor    \TMP1, %xmm\index
        movdqu  %xmm\index, (%arg3 , %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        add     $16, %r11

.ifc \operation, dec
        movdqa  \TMP1, %xmm\index
.endif
        pshufb  %xmm14, %xmm\index

        # prepare plaintext/ciphertext for GHASH computation
.endr
.endif

        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor    %xmm5, %xmm6
        GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor    %xmm6, %xmm7
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor    %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor    %xmm6, %xmm7
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor    %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor    %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
        cmp     $64, %r13
        jl      .L_initial_blocks_done\@
        # no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i.
*/
        MOVADQ  ONE(%RIP),\TMP1
        paddd   \TMP1, \XMM0            # INCR Y0
        MOVADQ  \XMM0, \XMM1
        pshufb  %xmm14, \XMM1           # perform a 16 byte swap

        paddd   \TMP1, \XMM0            # INCR Y0
        MOVADQ  \XMM0, \XMM2
        pshufb  %xmm14, \XMM2           # perform a 16 byte swap

        paddd   \TMP1, \XMM0            # INCR Y0
        MOVADQ  \XMM0, \XMM3
        pshufb  %xmm14, \XMM3           # perform a 16 byte swap

        paddd   \TMP1, \XMM0            # INCR Y0
        MOVADQ  \XMM0, \XMM4
        pshufb  %xmm14, \XMM4           # perform a 16 byte swap

        MOVADQ  0(%arg1),\TMP1
        pxor    \TMP1, \XMM1
        pxor    \TMP1, \XMM2
        pxor    \TMP1, \XMM3
        pxor    \TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
        movaps  0x10*\index(%arg1), \TMP1
        aesenc  \TMP1, \XMM1
        aesenc  \TMP1, \XMM2
        aesenc  \TMP1, \XMM3
        aesenc  \TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
        movaps  0x10*\index(%arg1), \TMP1
        aesenc  \TMP1, \XMM1
        aesenc  \TMP1, \XMM2
        aesenc  \TMP1, \XMM3
        aesenc  \TMP1, \XMM4
.endr
        lea     0xa0(%arg1),%r10
        mov     keysize,%eax
        shr     $2,%eax                 # 128->4, 192->6, 256->8
        sub     $4,%eax                 # 128->0, 192->2, 256->4
        jz      .Laes_loop_pre_done\@

.Laes_loop_pre_\@:
        MOVADQ  (%r10),\TMP2
.irpc index, 1234
        aesenc  \TMP2, %xmm\index
.endr
        add     $16,%r10
        sub     $1,%eax
        jnz     .Laes_loop_pre_\@

.Laes_loop_pre_done\@:
        MOVADQ  (%r10), \TMP2
        aesenclast \TMP2, \XMM1
        aesenclast \TMP2, \XMM2
        aesenclast \TMP2, \XMM3
        aesenclast \TMP2, \XMM4
        movdqu  16*0(%arg4 , %r11 , 1), \TMP1
        pxor    \TMP1, \XMM1
.ifc \operation, dec
        movdqu  \XMM1, 16*0(%arg3 , %r11 , 1)
        movdqa  \TMP1, \XMM1
.endif
        movdqu  16*1(%arg4 , %r11 , 1), \TMP1
        pxor    \TMP1, \XMM2
.ifc \operation, dec
        movdqu  \XMM2, 16*1(%arg3 , %r11 , 1)
        movdqa  \TMP1, \XMM2
.endif
        movdqu  16*2(%arg4 , %r11 , 1), \TMP1
        pxor    \TMP1, \XMM3
.ifc \operation, dec
        movdqu  \XMM3, 16*2(%arg3 , %r11 , 1)
        movdqa  \TMP1, \XMM3
.endif
        movdqu  16*3(%arg4 , %r11 , 1), \TMP1
        pxor    \TMP1, \XMM4
.ifc \operation, dec
        movdqu  \XMM4, 16*3(%arg3 , %r11 , 1)
        movdqa  \TMP1, \XMM4
.else
        movdqu  \XMM1, 16*0(%arg3 , %r11 , 1)
        movdqu  \XMM2, 16*1(%arg3 , %r11 , 1)
        movdqu  \XMM3, 16*2(%arg3 , %r11 , 1)
        movdqu  \XMM4, 16*3(%arg3 , %r11 , 1)
.endif

        add     $64, %r11
        pshufb  %xmm14, \XMM1           # perform a 16 byte swap
        pxor    \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
        pshufb  %xmm14, \XMM2           # perform a 16 byte swap
        pshufb  %xmm14, \XMM3           # perform a 16 byte swap
        pshufb  %xmm14, \XMM4           # perform a 16 byte swap
.L_initial_blocks_done\@:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

        movdqa    \XMM1, \XMM5
        movdqa    \XMM2, \XMM6
        movdqa    \XMM3, \XMM7
        movdqa    \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

        movdqa    \XMM5, \TMP4
        pshufd    $78, \XMM5, \TMP6
        pxor      \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqu    HashKey_4(%arg2), \TMP5
        pclmulqdq $0x11, \TMP5, \TMP4   # TMP4 = a1*b1
        movdqa    \XMM0, \XMM1
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM2
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM3
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM4
        pshufb    %xmm15, \XMM1         # perform a 16 byte swap
        pclmulqdq $0x00, \TMP5, \XMM5   # XMM5 = a0*b0
        pshufb    %xmm15, \XMM2         # perform a 16 byte swap
        pshufb    %xmm15, \XMM3         # perform a 16 byte swap
        pshufb    %xmm15, \XMM4         # perform a 16 byte swap

        pxor      (%arg1), \XMM1
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
        pxor      (%arg1), \XMM4
        movdqu    HashKey_4_k(%arg2), \TMP5
        pclmulqdq $0x00, \TMP5, \TMP6   # TMP6 = (a1+a0)*(b1+b0)
        movaps    0x10(%arg1), \TMP1
        aesenc    \TMP1, \XMM1          # Round 1
        aesenc    \TMP1, \XMM2
        aesenc    \TMP1, \XMM3
        aesenc    \TMP1, \XMM4
        movaps    0x20(%arg1), \TMP1
        aesenc    \TMP1, \XMM1          # Round 2
        aesenc    \TMP1, \XMM2
        aesenc    \TMP1, \XMM3
        aesenc    \TMP1, \XMM4
        movdqa    \XMM6, \TMP1
        pshufd    $78, \XMM6, \TMP2
        pxor      \XMM6, \TMP2
        movdqu    HashKey_3(%arg2), \TMP5
        pclmulqdq $0x11, \TMP5, \TMP1   # TMP1 = a1 * b1
        movaps    0x30(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 3
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        pclmulqdq $0x00, \TMP5, \XMM6   # XMM6 = a0*b0
        movaps    0x40(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 4
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        movdqu    HashKey_3_k(%arg2), \TMP5
        pclmulqdq $0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x50(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 5
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM6, \XMM5
        pxor      \TMP2, \TMP6
        movdqa    \XMM7, \TMP1
        pshufd    $78, \XMM7, \TMP2
        pxor      \XMM7, \TMP2
        movdqu    HashKey_2(%arg2), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

        pclmulqdq $0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps    0x60(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 6
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        pclmulqdq $0x00, \TMP5, \XMM7   # XMM7 = a0*b0
        movaps    0x70(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 7
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        movdqu    HashKey_2_k(%arg2), \TMP5
        pclmulqdq $0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x80(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 8
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM7, \XMM5
        pxor      \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa    \XMM8, \TMP1
        pshufd    $78, \XMM8, \TMP2
        pxor      \XMM8, \TMP2
        movdqu    HashKey(%arg2), \TMP5
        pclmulqdq $0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps    0x90(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 9
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        pclmulqdq $0x00, \TMP5, \XMM8   # XMM8 = a0*b0
        lea       0xa0(%arg1),%r10
        mov       keysize,%eax
        shr       $2,%eax               # 128->4, 192->6, 256->8
        sub       $4,%eax               # 128->0, 192->2, 256->4
        jz        .Laes_loop_par_enc_done\@

.Laes_loop_par_enc\@:
        MOVADQ    (%r10),\TMP3
.irpc index, 1234
        aesenc    \TMP3, %xmm\index
.endr
        add       $16,%r10
        sub       $1,%eax
        jnz       .Laes_loop_par_enc\@

.Laes_loop_par_enc_done\@:
        MOVADQ    (%r10), \TMP3
        aesenclast \TMP3, \XMM1         # Round 10
        aesenclast \TMP3, \XMM2
        aesenclast \TMP3, \XMM3
        aesenclast \TMP3, \XMM4
        movdqu    HashKey_k(%arg2), \TMP5
        pclmulqdq $0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqu    (%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM1          # Ciphertext/Plaintext XOR EK
        movdqu    16(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM2          # Ciphertext/Plaintext XOR EK
        movdqu    32(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM3          # Ciphertext/Plaintext XOR EK
        movdqu    48(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM4          # Ciphertext/Plaintext XOR EK
        movdqu    \XMM1, (%arg3,%r11,1)   # Write to the ciphertext buffer
        movdqu    \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
        movdqu    \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
        movdqu    \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
        pshufb    %xmm15, \XMM1         # perform a 16 byte swap
        pshufb    %xmm15, \XMM2         # perform a 16 byte swap
        pshufb    %xmm15, \XMM3         # perform a 16 byte swap
        pshufb    %xmm15, \XMM4         # perform a 16 byte swap

        pxor      \TMP4, \TMP1
        pxor      \XMM8, \XMM5
        pxor      \TMP6, \TMP2
        pxor      \TMP1, \TMP2
        pxor      \XMM5, \TMP2
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP3, \XMM5
        pxor      \TMP2, \TMP1          # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa    \XMM5, \TMP2
        movdqa    \XMM5, \TMP3
        movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld     $31, \TMP2            # packed left shift by 31
        pslld     $30, \TMP3            # packed left shift by 30
        pslld     $25, \TMP4            # packed left shift by 25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5             # right shift T5 1 DW
        pslldq    $12, \TMP2            # left shift T2 3 DWs
        pxor      \TMP2, \XMM5

        # second phase of reduction

        movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa    \XMM5,\TMP3
        movdqa    \XMM5,\TMP4
        psrld     $1, \TMP2             # packed right shift by 1
        psrld     $2, \TMP3             # packed right shift by 2
        psrld     $7, \TMP4             # packed right shift by 7
        pxor      \TMP3,\TMP2           # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \XMM5
        pxor      \TMP1, \XMM5          # result is in XMM5

        pxor      \XMM5, \XMM1
.endm
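# Note that the parallel path always runs the AES rounds in the *encrypt*
# direction: GCM is counter mode, so decryption XORs the same keystream.
# A conceptual per-block sketch of each iteration's work (helper names are
# illustrative, not kernel APIs; the macros actually fold all four GHASH
# multiplies through H^4..H with a single reduction):
#
#	for (int i = 0; i < 4; i++) {
#		ctr = byte_reflect(++counter);		/* INCR CNT + swap */
#		keystream = aes_encrypt(rk, ctr);	/* aesenc rounds   */
#		out[i] = in[i] ^ keystream;		/* CTR-mode XOR    */
#		ghash_acc = ghash_mul(ghash_acc ^ ct[i], H);
#	}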
/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

        movdqa    \XMM1, \XMM5
        movdqa    \XMM2, \XMM6
        movdqa    \XMM3, \XMM7
        movdqa    \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

        movdqa    \XMM5, \TMP4
        pshufd    $78, \XMM5, \TMP6
        pxor      \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqu    HashKey_4(%arg2), \TMP5
        pclmulqdq $0x11, \TMP5, \TMP4   # TMP4 = a1*b1
        movdqa    \XMM0, \XMM1
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM2
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM3
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM4
        pshufb    %xmm15, \XMM1         # perform a 16 byte swap
        pclmulqdq $0x00, \TMP5, \XMM5   # XMM5 = a0*b0
        pshufb    %xmm15, \XMM2         # perform a 16 byte swap
        pshufb    %xmm15, \XMM3         # perform a 16 byte swap
        pshufb    %xmm15, \XMM4         # perform a 16 byte swap

        pxor      (%arg1), \XMM1
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
        pxor      (%arg1), \XMM4
        movdqu    HashKey_4_k(%arg2), \TMP5
        pclmulqdq $0x00, \TMP5, \TMP6   # TMP6 = (a1+a0)*(b1+b0)
        movaps    0x10(%arg1), \TMP1
        aesenc    \TMP1, \XMM1          # Round 1
        aesenc    \TMP1, \XMM2
        aesenc    \TMP1, \XMM3
        aesenc    \TMP1, \XMM4
        movaps    0x20(%arg1), \TMP1
        aesenc    \TMP1, \XMM1          # Round 2
        aesenc    \TMP1, \XMM2
        aesenc    \TMP1, \XMM3
        aesenc    \TMP1, \XMM4
        movdqa    \XMM6, \TMP1
        pshufd    $78, \XMM6, \TMP2
        pxor      \XMM6, \TMP2
        movdqu    HashKey_3(%arg2), \TMP5
        pclmulqdq $0x11, \TMP5, \TMP1   # TMP1 = a1 * b1
        movaps    0x30(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 3
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        pclmulqdq $0x00, \TMP5, \XMM6   # XMM6 = a0*b0
        movaps    0x40(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 4
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        movdqu    HashKey_3_k(%arg2), \TMP5
        pclmulqdq $0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x50(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 5
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM6, \XMM5
        pxor      \TMP2, \TMP6
        movdqa    \XMM7, \TMP1
        pshufd    $78, \XMM7, \TMP2
        pxor      \XMM7, \TMP2
        movdqu    HashKey_2(%arg2), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

        pclmulqdq $0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps    0x60(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 6
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        pclmulqdq $0x00, \TMP5, \XMM7   # XMM7 = a0*b0
        movaps    0x70(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 7
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        movdqu    HashKey_2_k(%arg2), \TMP5
        pclmulqdq $0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x80(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 8
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM7, \XMM5
        pxor      \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa    \XMM8, \TMP1
        pshufd    $78, \XMM8, \TMP2
        pxor      \XMM8, \TMP2
        movdqu    HashKey(%arg2), \TMP5
        pclmulqdq $0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps    0x90(%arg1), \TMP3
        aesenc    \TMP3, \XMM1          # Round 9
        aesenc    \TMP3, \XMM2
        aesenc    \TMP3, \XMM3
        aesenc    \TMP3, \XMM4
        pclmulqdq $0x00, \TMP5, \XMM8   # XMM8 = a0*b0
        lea       0xa0(%arg1),%r10
        mov       keysize,%eax
        shr       $2,%eax               # 128->4, 192->6, 256->8
        sub       $4,%eax               # 128->0, 192->2, 256->4
        jz        .Laes_loop_par_dec_done\@

.Laes_loop_par_dec\@:
        MOVADQ    (%r10),\TMP3
.irpc index, 1234
        aesenc    \TMP3, %xmm\index
.endr
        add       $16,%r10
        sub       $1,%eax
        jnz       .Laes_loop_par_dec\@

.Laes_loop_par_dec_done\@:
        MOVADQ    (%r10), \TMP3
        aesenclast \TMP3, \XMM1         # last round
        aesenclast \TMP3, \XMM2
        aesenclast \TMP3, \XMM3
        aesenclast \TMP3, \XMM4
        movdqu    HashKey_k(%arg2), \TMP5
        pclmulqdq $0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqu    (%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM1          # Ciphertext/Plaintext XOR EK
        movdqu    \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
        movdqa    \TMP3, \XMM1
        movdqu    16(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM2          # Ciphertext/Plaintext XOR EK
        movdqu    \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
        movdqa    \TMP3, \XMM2
        movdqu    32(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM3          # Ciphertext/Plaintext XOR EK
        movdqu    \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
        movdqa    \TMP3, \XMM3
        movdqu    48(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM4          # Ciphertext/Plaintext XOR EK
        movdqu    \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
        movdqa    \TMP3, \XMM4
        pshufb    %xmm15, \XMM1         # perform a 16 byte swap
        pshufb    %xmm15, \XMM2         # perform a 16 byte swap
        pshufb    %xmm15, \XMM3         # perform a 16 byte swap
        pshufb    %xmm15, \XMM4         # perform a 16 byte swap

        pxor      \TMP4, \TMP1
        pxor      \XMM8, \XMM5
        pxor      \TMP6, \TMP2
        pxor      \TMP1, \TMP2
        pxor      \XMM5, \TMP2
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP3, \XMM5
        pxor      \TMP2, \TMP1          # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa    \XMM5, \TMP2
        movdqa    \XMM5, \TMP3
        movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld     $31, \TMP2            # packed left shift by 31
        pslld     $30, \TMP3            # packed left shift by 30
        pslld     $25, \TMP4            # packed left shift by 25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5             # right shift T5 1 DW
        pslldq    $12, \TMP2            # left shift T2 3 DWs
        pxor      \TMP2, \XMM5

        # second phase of reduction

        movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa    \XMM5,\TMP3
        movdqa    \XMM5,\TMP4
        psrld     $1, \TMP2             # packed right shift by 1
        psrld     $2, \TMP3             # packed right shift by 2
        psrld     $7, \TMP4             # packed right shift by 7
        pxor      \TMP3,\TMP2           # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \XMM5
        pxor      \TMP1, \XMM5          # result is in XMM5

        pxor      \XMM5, \XMM1
.endm
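# GHASH_LAST_4 below folds the four outstanding blocks with one reduction by
# pairing each block with a precomputed power of the hash key:
#
#	Y = C1*H^4 + C2*H^3 + C3*H^2 + C4*H   (in GF(2^128))
#
# which equals the serial ((((Y0+C1)*H + C2)*H + C3)*H + C4)*H form when C1
# already carries the running hash, but needs only one modular reduction.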
/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply TMP6 * HashKey (using Karatsuba)

        movdqa    \XMM1, \TMP6
        pshufd    $78, \XMM1, \TMP2
        pxor      \XMM1, \TMP2
        movdqu    HashKey_4(%arg2), \TMP5
        pclmulqdq $0x11, \TMP5, \TMP6   # TMP6 = a1*b1
        pclmulqdq $0x00, \TMP5, \XMM1   # XMM1 = a0*b0
        movdqu    HashKey_4_k(%arg2), \TMP4
        pclmulqdq $0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqa    \XMM1, \XMMDst
        movdqa    \TMP2, \XMM1          # result in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

        movdqa    \XMM2, \TMP1
        pshufd    $78, \XMM2, \TMP2
        pxor      \XMM2, \TMP2
        movdqu    HashKey_3(%arg2), \TMP5
        pclmulqdq $0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        pclmulqdq $0x00, \TMP5, \XMM2   # XMM2 = a0*b0
        movdqu    HashKey_3_k(%arg2), \TMP4
        pclmulqdq $0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM2, \XMMDst
        pxor      \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

        movdqa    \XMM3, \TMP1
        pshufd    $78, \XMM3, \TMP2
        pxor      \XMM3, \TMP2
        movdqu    HashKey_2(%arg2), \TMP5
        pclmulqdq $0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        pclmulqdq $0x00, \TMP5, \XMM3   # XMM3 = a0*b0
        movdqu    HashKey_2_k(%arg2), \TMP4
        pclmulqdq $0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM3, \XMMDst
        pxor      \TMP2, \XMM1          # results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)
        movdqa    \XMM4, \TMP1
        pshufd    $78, \XMM4, \TMP2
        pxor      \XMM4, \TMP2
        movdqu    HashKey(%arg2), \TMP5
        pclmulqdq $0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        pclmulqdq $0x00, \TMP5, \XMM4   # XMM4 = a0*b0
        movdqu    HashKey_k(%arg2), \TMP4
        pclmulqdq $0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM4, \XMMDst
        pxor      \XMM1, \TMP2
        pxor      \TMP6, \TMP2
        pxor      \XMMDst, \TMP2
        # middle section of the temp results combined as in karatsuba algorithm
        movdqa    \TMP2, \TMP4
        pslldq    $8, \TMP4             # left shift TMP4 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP4, \XMMDst
        pxor      \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
        # first phase of the reduction
        movdqa    \XMMDst, \TMP2
        movdqa    \XMMDst, \TMP3
        movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
        pslld     $31, \TMP2            # packed left shift by 31
        pslld     $30, \TMP3            # packed left shift by 30
        pslld     $25, \TMP4            # packed left shift by 25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP7
        psrldq    $4, \TMP7             # right shift TMP7 1 DW
        pslldq    $12, \TMP2            # left shift TMP2 3 DWs
        pxor      \TMP2, \XMMDst

        # second phase of the reduction
        movdqa    \XMMDst, \TMP2
        # make 3 copies of XMMDst for doing 3 shift operations
        movdqa    \XMMDst, \TMP3
        movdqa    \XMMDst, \TMP4
        psrld     $1, \TMP2             # packed right shift by 1
        psrld     $2, \TMP3             # packed right shift by 2
        psrld     $7, \TMP4             # packed right shift by 7
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        pxor      \TMP7, \TMP2
        pxor      \TMP2, \XMMDst
        pxor      \TMP6, \XMMDst        # reduced result is in XMMDst
.endm
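# The round-count arithmetic used throughout (keysize is the key length in
# bytes stored at offset 480 of the AES context): keysize/4 + 5 gives the
# number of aesenc rounds before the final aesenclast. In C (a sketch):
#
#	static int aesenc_rounds(unsigned int key_len_bytes)
#	{
#		/* 16 -> 9, 24 -> 11, 32 -> 13 (AES-128/192/256) */
#		return key_len_bytes / 4 + 5;
#	}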
/* Encryption of a single block
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

        pxor    (%arg1), \XMM0
        mov     keysize,%eax
        shr     $2,%eax                 # 128->4, 192->6, 256->8
        add     $5,%eax                 # 128->9, 192->11, 256->13
        lea     16(%arg1), %r10         # get first expanded key address

_esb_loop_\@:
        MOVADQ  (%r10),\TMP1
        aesenc  \TMP1,\XMM0
        add     $16,%r10
        sub     $1,%eax
        jnz     _esb_loop_\@

        MOVADQ  (%r10),\TMP1
        aesenclast \TMP1,\XMM0
.endm

/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                         // context data
*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                     const u8 *aad,      // Additional Authentication Data (AAD)
*                     u64 aad_len)        // Length of AAD in bytes.
*/
SYM_FUNC_START(aesni_gcm_init)
        FUNC_SAVE
        GCM_INIT %arg3, %arg4,%arg5, %arg6
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_init)

/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                           struct gcm_context_data *data,
*                                              // context data
*                           u8 *out,           // Ciphertext output. Encrypt in-place is allowed.
*                           const u8 *in,      // Plaintext input
*                           u64 plaintext_len) // Length of data in bytes for encryption.
*/
SYM_FUNC_START(aesni_gcm_enc_update)
        FUNC_SAVE
        GCM_ENC_DEC enc
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_enc_update)

/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                           struct gcm_context_data *data,
*                                               // context data
*                           u8 *out,            // Plaintext output. Decrypt in-place is allowed.
*                           const u8 *in,       // Ciphertext input
*                           u64 ciphertext_len) // Length of data in bytes for decryption.
*/
SYM_FUNC_START(aesni_gcm_dec_update)
        FUNC_SAVE
        GCM_ENC_DEC dec
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_dec_update)
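/* A typical call sequence from C, as the aesni-intel glue code drives these
 * entry points (a sketch; chunking and scatterlist handling omitted):
 *
 *	struct gcm_context_data data;
 *
 *	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
 *	while (bytes_left) {
 *		aesni_gcm_enc_update(aes_ctx, &data, out, in, chunk);
 *		in += chunk; out += chunk; bytes_left -= chunk;
 *	}
 *	aesni_gcm_finalize(aes_ctx, &data, auth_tag, 16);
 */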
/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                         struct gcm_context_data *data,
*                                            // context data
*                         u8 *auth_tag,      // Authenticated Tag output.
*                         u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                            // 12 or 8.
*/
SYM_FUNC_START(aesni_gcm_finalize)
        FUNC_SAVE
        GCM_COMPLETE %arg3 %arg4
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_finalize)

#endif

SYM_FUNC_START_LOCAL(_key_expansion_256a)
        pshufd  $0b11111111, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0
        movaps  %xmm0, (TKEYP)
        add     $0x10, TKEYP
        RET
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)

SYM_FUNC_START_LOCAL(_key_expansion_192a)
        pshufd  $0b01010101, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0

        movaps  %xmm2, %xmm5
        movaps  %xmm2, %xmm6
        pslldq  $4, %xmm5
        pshufd  $0b11111111, %xmm0, %xmm3
        pxor    %xmm3, %xmm2
        pxor    %xmm5, %xmm2

        movaps  %xmm0, %xmm1
        shufps  $0b01000100, %xmm0, %xmm6
        movaps  %xmm6, (TKEYP)
        shufps  $0b01001110, %xmm2, %xmm1
        movaps  %xmm1, 0x10(TKEYP)
        add     $0x20, TKEYP
        RET
SYM_FUNC_END(_key_expansion_192a)

SYM_FUNC_START_LOCAL(_key_expansion_192b)
        pshufd  $0b01010101, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0

        movaps  %xmm2, %xmm5
        pslldq  $4, %xmm5
        pshufd  $0b11111111, %xmm0, %xmm3
        pxor    %xmm3, %xmm2
        pxor    %xmm5, %xmm2

        movaps  %xmm0, (TKEYP)
        add     $0x10, TKEYP
        RET
SYM_FUNC_END(_key_expansion_192b)

SYM_FUNC_START_LOCAL(_key_expansion_256b)
        pshufd  $0b10101010, %xmm1, %xmm1
        shufps  $0b00010000, %xmm2, %xmm4
        pxor    %xmm4, %xmm2
        shufps  $0b10001100, %xmm2, %xmm4
        pxor    %xmm4, %xmm2
        pxor    %xmm1, %xmm2
        movaps  %xmm2, (TKEYP)
        add     $0x10, TKEYP
        RET
SYM_FUNC_END(_key_expansion_256b)

/*
 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                    unsigned int key_len)
 */
SYM_FUNC_START(aesni_set_key)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl   KEYP
        movl    (FRAME_OFFSET+8)(%esp), KEYP    # ctx
        movl    (FRAME_OFFSET+12)(%esp), UKEYP  # in_key
        movl    (FRAME_OFFSET+16)(%esp), %edx   # key_len
#endif
        movups  (UKEYP), %xmm0          # user key (first 16 bytes)
        movaps  %xmm0, (KEYP)
        lea     0x10(KEYP), TKEYP       # key addr
        movl    %edx, 480(KEYP)
        pxor    %xmm4, %xmm4            # xmm4 is assumed 0 in _key_expansion_x
        cmp     $24, %dl
        jb      .Lenc_key128
        je      .Lenc_key192
        movups  0x10(UKEYP), %xmm2      # other user key
        movaps  %xmm2, (TKEYP)
        add     $0x10, TKEYP
        aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
        call    _key_expansion_256a
        aeskeygenassist $0x1, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
        call    _key_expansion_256a
        aeskeygenassist $0x2, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
        call    _key_expansion_256a
        aeskeygenassist $0x4, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
        call    _key_expansion_256a
        aeskeygenassist $0x8, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
        call    _key_expansion_256a
        aeskeygenassist $0x10, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
        call    _key_expansion_256a
        aeskeygenassist $0x20, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
        call    _key_expansion_256a
        jmp     .Ldec_key
.Lenc_key192:
        movq    0x10(UKEYP), %xmm2      # other user key
        aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
        call    _key_expansion_192a
        aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
        call    _key_expansion_192b
        aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
        call    _key_expansion_192a
        aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
        call    _key_expansion_192b
        aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
        call    _key_expansion_192a
        aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
        call    _key_expansion_192b
        aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
        call    _key_expansion_192a
        aeskeygenassist $0x80, %xmm2, %xmm1     # round 8
        call    _key_expansion_192b
        jmp     .Ldec_key
.Lenc_key128:
        aeskeygenassist $0x1, %xmm0, %xmm1      # round 1
        call    _key_expansion_128
        aeskeygenassist $0x2, %xmm0, %xmm1      # round 2
        call    _key_expansion_128
        aeskeygenassist $0x4, %xmm0, %xmm1      # round 3
        call    _key_expansion_128
        aeskeygenassist $0x8, %xmm0, %xmm1      # round 4
        call    _key_expansion_128
        aeskeygenassist $0x10, %xmm0, %xmm1     # round 5
        call    _key_expansion_128
        aeskeygenassist $0x20, %xmm0, %xmm1     # round 6
        call    _key_expansion_128
        aeskeygenassist $0x40, %xmm0, %xmm1     # round 7
        call    _key_expansion_128
        aeskeygenassist $0x80, %xmm0, %xmm1     # round 8
        call    _key_expansion_128
        aeskeygenassist $0x1b, %xmm0, %xmm1     # round 9
        call    _key_expansion_128
        aeskeygenassist $0x36, %xmm0, %xmm1     # round 10
        call    _key_expansion_128
.Ldec_key:
        sub     $0x10, TKEYP
        movaps  (KEYP), %xmm0
        movaps  (TKEYP), %xmm1
        movaps  %xmm0, 240(TKEYP)
        movaps  %xmm1, 240(KEYP)
        add     $0x10, KEYP
        lea     240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
        movaps  (KEYP), %xmm0
        aesimc  %xmm0, %xmm1
        movaps  %xmm1, (UKEYP)
        add     $0x10, KEYP
        sub     $0x10, UKEYP
        cmp     TKEYP, KEYP
        jb      .Ldec_key_loop
#ifndef __x86_64__
        popl    KEYP
#endif
        FRAME_END
        RET
SYM_FUNC_END(aesni_set_key)

/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_enc)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl   KEYP
        pushl   KLEN
        movl    (FRAME_OFFSET+12)(%esp), KEYP   # ctx
        movl    (FRAME_OFFSET+16)(%esp), OUTP   # dst
        movl    (FRAME_OFFSET+20)(%esp), INP    # src
#endif
        movl    480(KEYP), KLEN         # key length
        movups  (INP), STATE            # input
        call    _aesni_enc1
        movups  STATE, (OUTP)           # output
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
#endif
        FRAME_END
        RET
SYM_FUNC_END(aesni_enc)

/*
 * _aesni_enc1: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE:          initial state (input)
 * output:
 *      STATE:          final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
        movaps  (KEYP), KEY             # key
        mov     KEYP, TKEYP
        pxor    KEY, STATE              # round 0
        add     $0x30, TKEYP
        cmp     $24, KLEN
        jb      .Lenc128
        lea     0x20(TKEYP), TKEYP
        je      .Lenc192
        add     $0x20, TKEYP
        movaps  -0x60(TKEYP), KEY
        aesenc  KEY, STATE
        movaps  -0x50(TKEYP), KEY
        aesenc  KEY, STATE
.align 4
.Lenc192:
        movaps  -0x40(TKEYP), KEY
        aesenc  KEY, STATE
        movaps  -0x30(TKEYP), KEY
        aesenc  KEY, STATE
.align 4
.Lenc128:
        movaps  -0x20(TKEYP), KEY
        aesenc  KEY, STATE
        movaps  -0x10(TKEYP), KEY
        aesenc  KEY, STATE
        movaps  (TKEYP), KEY
        aesenc  KEY, STATE
        movaps  0x10(TKEYP), KEY
        aesenc  KEY, STATE
        movaps  0x20(TKEYP), KEY
        aesenc  KEY, STATE
        movaps  0x30(TKEYP), KEY
        aesenc  KEY, STATE
        movaps  0x40(TKEYP), KEY
        aesenc  KEY, STATE
        movaps  0x50(TKEYP), KEY
        aesenc  KEY, STATE
        movaps  0x60(TKEYP), KEY
        aesenc  KEY, STATE
        movaps  0x70(TKEYP), KEY
        aesenclast KEY, STATE
        RET
SYM_FUNC_END(_aesni_enc1)

/*
 * _aesni_enc4: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE1:         initial state (input)
 *      STATE2
 *      STATE3
 *      STATE4
 * output:
 *      STATE1:         final state (output)
 *      STATE2
 *      STATE3
 *      STATE4
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
        movaps  (KEYP), KEY             # key
        mov     KEYP, TKEYP
        pxor    KEY, STATE1             # round 0
        pxor    KEY, STATE2
        pxor    KEY, STATE3
        pxor    KEY, STATE4
        add     $0x30, TKEYP
        cmp     $24, KLEN
        jb      .L4enc128
        lea     0x20(TKEYP), TKEYP
        je      .L4enc192
        add     $0x20, TKEYP
        movaps  -0x60(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  -0x50(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
#.align 4
.L4enc192:
        movaps  -0x40(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  -0x30(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
#.align 4
.L4enc128:
        movaps  -0x20(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  -0x10(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  (TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  0x10(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  0x20(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  0x30(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  0x40(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  0x50(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  0x60(TKEYP), KEY
        aesenc  KEY, STATE1
        aesenc  KEY, STATE2
        aesenc  KEY, STATE3
        aesenc  KEY, STATE4
        movaps  0x70(TKEYP), KEY
        aesenclast KEY, STATE1          # last round
        aesenclast KEY, STATE2
        aesenclast KEY, STATE3
        aesenclast KEY, STATE4
        RET
SYM_FUNC_END(_aesni_enc4)

/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_dec)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl   KEYP
        pushl   KLEN
        movl    (FRAME_OFFSET+12)(%esp), KEYP   # ctx
        movl    (FRAME_OFFSET+16)(%esp), OUTP   # dst
        movl    (FRAME_OFFSET+20)(%esp), INP    # src
#endif
        mov     480(KEYP), KLEN         # key length
        add     $240, KEYP
        movups  (INP), STATE            # input

/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# switch to the decryption key schedule
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_dec)

/*
 * _aesni_dec1:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_dec1)

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)
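
/*
 * The ECB entry points below consume the input in 64-byte chunks via
 * _aesni_enc4/_aesni_dec4 and fall back to one block at a time for the
 * tail.  The same control flow in C, as an illustrative sketch only
 * (block_enc1/block_enc4 stand in for the single- and four-block
 * primitives above):
 *
 *	while (len >= 64) {
 *		block_enc4(ctx, dst, src);	// four independent blocks
 *		src += 64; dst += 64; len -= 64;
 *	}
 *	while (len >= 16) {
 *		block_enc1(ctx, dst, src);	// one block at a time
 *		src += 16; dst += 16; len -= 16;
 *	}
 *
 * Four blocks are processed in parallel because aesenc/aesdec have a
 * multi-cycle latency but are pipelined; independent states keep the
 * execution unit busy.
 */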

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
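
/*
 * CBC chains each plaintext block into the next: C[i] = E(K, P[i] ^
 * C[i-1]) with C[-1] = IV.  A minimal C sketch of both directions
 * (illustrative only; xor_block/block_enc1/block_dec1 stand in for
 * the primitives above):
 *
 *	// encrypt: inherently serial, one block at a time
 *	for (i = 0; i < n; i++) {
 *		xor_block(buf, P[i], prev);	// prev starts as the IV
 *		block_enc1(ctx, C[i], buf);
 *		prev = C[i];
 *	}
 *	// decrypt: P[i] = D(K, C[i]) ^ C[i-1]; each D() is independent
 *	// of the others, so blocks can be decrypted four at a time
 *
 * This asymmetry is why only aesni_cbc_dec below has a 4-way loop.
 */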

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	# too few XMM registers on i386: reload the first two
	# ciphertext blocks for the chaining XORs
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
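
/*
 * The CTS (ciphertext stealing) entry points below process the final
 * partial chunk of a CBC message without padding: the next-to-last
 * ciphertext block is truncated to the size of the short tail and the
 * last two output blocks are swapped.  A C sketch of the encrypt side
 * (illustrative only; tail = len - 16, helpers as in the sketches
 * above):
 *
 *	u8 X[16], pad[16];
 *	xor_block(buf, P_last_full, iv);	// normal CBC step
 *	block_enc1(ctx, X, buf);		// X = would-be last block
 *	memcpy(out + 16, X, tail);		// truncated, sent last
 *	memcpy(pad, P_tail, tail);		// zero-pad the short tail
 *	memset(pad + tail, 0, 16 - tail);
 *	xor_block(buf, pad, X);
 *	block_enc1(ctx, out, buf);		// full block, sent first
 *
 * The asm below avoids the memcpy/memset by loading pshufb masks from
 * .Lcts_permute_table, indexed by the tail length.
 */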

/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	movups (IVP), STATE
	sub $16, LEN			# LEN = length of the short tail
	mov T1, IVP
	add $32, IVP
	add LEN, T1			# mask that right-aligns LEN bytes
	sub LEN, IVP			# mask that left-packs/zero-pads the tail
	movups (T1), %xmm4
	movups (IVP), %xmm5

	movups (INP), IN1
	add LEN, INP
	movups (INP), IN2

	pxor IN1, STATE
	call _aesni_enc1

	pshufb %xmm5, IN2
	pxor STATE, IN2
	pshufb %xmm4, STATE
	add OUTP, LEN
	movups STATE, (LEN)	# truncated block

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# full block, swapped in front

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)

/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	sub $16, LEN
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4

	movups (INP), STATE
	add LEN, INP
	movups (INP), IN1

	call _aesni_dec1
	movaps STATE, IN2
	pshufb %xmm4, STATE
	pxor IN1, STATE

	add OUTP, LEN
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1	# pblendvb uses %xmm0 as the implicit mask
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)

.pushsection .rodata
.align 16
.Lcts_permute_table:
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
#ifdef __x86_64__
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
#endif
.popsection

#ifdef __x86_64__
/*
 * _aesni_inc_init:	internal ABI
 *	set up the registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC
	movq CTR, TCTR_LOW
	RET
SYM_FUNC_END(_aesni_inc_init)

/*
 * _aesni_inc:	internal ABI
 *	Increment IV by 1; IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	incremented by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV
	RET
SYM_FUNC_END(_aesni_inc)
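
/*
 * _aesni_inc keeps the counter in little-endian form (CTR) so that it
 * can be bumped with paddq, and only byte-swaps when producing the IV.
 * The 128-bit big-endian increment it implements, as a C sketch
 * (illustrative only):
 *
 *	ctr_lo += 1;			// paddq INC, CTR (low qword)
 *	if (ctr_lo == 0)		// carry out of the low qword?
 *		ctr_hi += 1;		// paddq with INC shifted high
 *	iv = bswap128(ctr);		// pshufb BSWAP_MASK
 *
 * The asm tracks the low qword in TCTR_LOW and uses the carry flag of
 * "add $1, TCTR_LOW" instead of a compare.
 */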

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)

#endif

.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.previous

/*
 * _aesni_gf128mul_x_ble:	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 */
.macro _aesni_gf128mul_x_ble
	pshufd $0x13, IV, KEY
	paddq IV, IV
	psrad $31, KEY
	pand GF128MUL_MASK, KEY
	pxor KEY, IV
.endm
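
/*
 * _aesni_gf128mul_x_ble doubles the XTS tweak in GF(2^128) using the
 * blockwise little-endian convention: shift the 128-bit value left by
 * one bit and, if the top bit fell off, fold it back as 0x87.  A C
 * sketch over two 64-bit halves (illustrative only):
 *
 *	u64 carry = iv_hi >> 63;
 *	iv_hi = (iv_hi << 1) | (iv_lo >> 63);
 *	iv_lo = (iv_lo << 1) ^ (carry ? 0x87 : 0);
 *
 * The macro gets the same effect branchlessly: paddq doubles each
 * qword independently, psrad $31 broadcasts the sign bits, pand keeps
 * 0x87/0x01 where a carry occurred, and pshufd routes each carry to
 * the half that must absorb it.
 */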

.macro _aesni_xts_crypt	enc
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN
.if !\enc
	add $240, KEYP

	test $15, LEN
	jz .Lxts_loop4\@
	sub $16, LEN
.endif

.Lxts_loop4\@:
	sub $64, LEN
	jl .Lxts_1x\@

	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

.if \enc
	call _aesni_enc4
.else
	call _aesni_dec4
.endif

	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_loop4\@

.Lxts_ret_iv\@:
	movups IV, (IVP)

.Lxts_ret\@:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_1x\@:
	add $64, LEN
	jz .Lxts_ret_iv\@
.if \enc
	sub $16, LEN
	jl .Lxts_cts4\@
.endif

.Lxts_loop1\@:
	movdqu (INP), STATE
.if \enc
	pxor IV, STATE
	call _aesni_enc1
.else
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@
	pxor IV, STATE
	call _aesni_dec1
.endif
	pxor IV, STATE
	_aesni_gf128mul_x_ble

	test LEN, LEN
	jz .Lxts_out\@

.if \enc
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@
.endif

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_loop1\@

.Lxts_out\@:
	movdqu STATE, (OUTP)
	jmp .Lxts_ret_iv\@

.if \enc
.Lxts_cts4\@:
	movdqa STATE4, STATE
	sub $16, OUTP
.Lxts_cts1\@:
.else
.Lxts_cts1\@:
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
.endif
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1

	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE

.if \enc
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
.else
	pxor STATE4, STATE
	call _aesni_dec1
	pxor STATE4, STATE
.endif

	movups STATE, (OUTP)
	jmp .Lxts_ret\@
.endm

/*
 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_enc)
	_aesni_xts_crypt 1
SYM_FUNC_END(aesni_xts_enc)

/*
 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_dec)
	_aesni_xts_crypt 0
SYM_FUNC_END(aesni_xts_dec)
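
/*
 * Per-block XTS, for reference: with the initial tweak T supplied via
 * *iv (typically the sector number encrypted under the second XTS
 * key), each block is C = E(K1, P ^ T) ^ T, and T is multiplied by x
 * in GF(2^128) between blocks.  A C sketch of the whole-sector loop
 * the entry points above implement (illustrative only; helpers as in
 * the sketches above):
 *
 *	for (i = 0; i < nblocks; i++) {
 *		xor_block(buf, P[i], T);
 *		block_enc1(ctx, C[i], buf);	// or block_dec1
 *		xor_block(C[i], C[i], T);
 *		gf128mul_x_ble(&T);
 *	}
 *
 * A trailing partial block, when present, is handled with the same
 * ciphertext-stealing shuffle as the CBC-CTS routines above.
 */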