/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values: for FP,
 * use movaps (move aligned packed single); for integer, use movdqa (move
 * double quadword aligned). Which instruction is used makes no performance
 * difference on anything since Nehalem (the original Core i7), but movaps is
 * one byte shorter, so that is the one we use for now (same for the
 * unaligned variants).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2
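/*
 * A note on SHUF_MASK: pshufb with the mask above simply reverses the 16
 * bytes of a register, converting a block between its in-memory byte order
 * and the byte-reflected form that the GHASH arithmetic below works in.
 * A minimal C sketch of the same operation (illustrative only, operating on
 * a plain byte buffer rather than an XMM register):
 *
 *	static void byte_reflect(unsigned char out[16], const unsigned char in[16])
 *	{
 *		int i;
 *
 *		for (i = 0; i < 16; i++)
 *			out[i] = in[15 - i];	// same effect as pshufb SHUF_MASK
 *	}
 */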
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


#define	STACK_OFFSET    8*3

#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm7

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm
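/*
 * For reference, the AadHash/AadLen/.../HashKey_4_k offsets defined above
 * index into the gcm_context_data structure that is passed in %arg2.  A C
 * view consistent with those offsets is sketched below; it is illustrative
 * only -- the authoritative definition lives in the C glue code, and the
 * field names here are descriptive assumptions, not necessarily the exact
 * ones used there:
 *
 *	struct gcm_context_data {
 *		u8  aad_hash[16];		// AadHash:      16*0
 *		u64 aad_length;			// AadLen:       16*1
 *		u64 in_length;			// InLen:        16*1+8
 *		u8  partial_block_enc_key[16];	// PBlockEncKey: 16*2
 *		u8  orig_IV[16];		// OrigIV:       16*3
 *		u8  current_counter[16];	// CurCount:     16*4
 *		u64 partial_block_len;		// PBlockLen:    16*5
 *		u64 unused;
 *		u8  hash_keys[16 * 8];		// HashKey..HashKey_4_k: 16*6..16*13
 *	};
 */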
# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov	\AADLEN, %r11
	mov	%r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	xor	%r11d, %r11d
	mov	%r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)		# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	mov	\Iv, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)		# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	pshufb	%xmm2, %xmm0
	movdqu	%xmm0, CurCount(%arg2)		# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
273# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK 274# Clobbers rax, r10-r13, and xmm0-xmm15 275.macro GCM_ENC_DEC operation 276 movdqu AadHash(%arg2), %xmm8 277 movdqu HashKey(%arg2), %xmm13 278 add %arg5, InLen(%arg2) 279 280 xor %r11d, %r11d # initialise the data pointer offset as zero 281 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation 282 283 sub %r11, %arg5 # sub partial block data used 284 mov %arg5, %r13 # save the number of bytes 285 286 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) 287 mov %r13, %r12 288 # Encrypt/Decrypt first few blocks 289 290 and $(3<<4), %r12 291 jz .L_initial_num_blocks_is_0_\@ 292 cmp $(2<<4), %r12 293 jb .L_initial_num_blocks_is_1_\@ 294 je .L_initial_num_blocks_is_2_\@ 295.L_initial_num_blocks_is_3_\@: 296 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 297%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation 298 sub $48, %r13 299 jmp .L_initial_blocks_\@ 300.L_initial_num_blocks_is_2_\@: 301 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 302%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation 303 sub $32, %r13 304 jmp .L_initial_blocks_\@ 305.L_initial_num_blocks_is_1_\@: 306 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 307%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation 308 sub $16, %r13 309 jmp .L_initial_blocks_\@ 310.L_initial_num_blocks_is_0_\@: 311 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 312%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation 313.L_initial_blocks_\@: 314 315 # Main loop - Encrypt/Decrypt remaining blocks 316 317 test %r13, %r13 318 je .L_zero_cipher_left_\@ 319 sub $64, %r13 320 je .L_four_cipher_left_\@ 321.L_crypt_by_4_\@: 322 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ 323 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ 324 %xmm7, %xmm8, enc 325 add $64, %r11 326 sub $64, %r13 327 jne .L_crypt_by_4_\@ 328.L_four_cipher_left_\@: 329 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 330%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 331.L_zero_cipher_left_\@: 332 movdqu %xmm8, AadHash(%arg2) 333 movdqu %xmm0, CurCount(%arg2) 334 335 mov %arg5, %r13 336 and $15, %r13 # %r13 = arg5 (mod 16) 337 je .L_multiple_of_16_bytes_\@ 338 339 mov %r13, PBlockLen(%arg2) 340 341 # Handle the last <16 Byte block separately 342 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 343 movdqu %xmm0, CurCount(%arg2) 344 movdqa SHUF_MASK(%rip), %xmm10 345 pshufb %xmm10, %xmm0 346 347 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 348 movdqu %xmm0, PBlockEncKey(%arg2) 349 350 cmp $16, %arg5 351 jge .L_large_enough_update_\@ 352 353 lea (%arg4,%r11,1), %r10 354 mov %r13, %r12 355 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 356 jmp .L_data_read_\@ 357 358.L_large_enough_update_\@: 359 sub $16, %r11 360 add %r13, %r11 361 362 # receive the last <16 Byte block 363 movdqu (%arg4, %r11, 1), %xmm1 364 365 sub %r13, %r11 366 add $16, %r11 367 368 lea SHIFT_MASK+16(%rip), %r12 369 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 370 # (r13 is the number of bytes in plaintext mod 16) 371 sub %r13, %r12 372 # get the appropriate shuffle mask 373 movdqu (%r12), %xmm2 374 # shift right 16-r13 bytes 375 pshufb %xmm2, %xmm1 376 377.L_data_read_\@: 378 lea ALL_F+16(%rip), %r12 379 sub %r13, %r12 380 381.ifc \operation, dec 382 movdqa %xmm1, %xmm2 383.endif 384 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) 385 movdqu 
(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm2

	pxor	%xmm2, %xmm8
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	pshufb	%xmm10, %xmm0
.endif

	# Output %r13 bytes
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	.L_less_than_8_bytes_left_\@
	mov	%rax, (%arg3, %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
.L_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	.L_less_than_8_bytes_left_\@
.L_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE Finishes the tag update for the last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	test	%r12, %r12
	je	.L_partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

.L_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*8)
	movq	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm8

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
.L_return_T_\@:
	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	cmp	$16, %r11
	je	.L_T_16_\@
	cmp	$8, %r11
	jl	.L_T_4_\@
.L_T_8_\@:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	test	%r11, %r11
	je	.L_return_T_done_\@
.L_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	test	%r11, %r11
	je	.L_return_T_done_\@
.L_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	.L_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	.L_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
.L_T_1_\@:
	mov	%al, (%r10)
	jmp	.L_return_T_done_\@
.L_T_16_\@:
	movdqu	%xmm0, (%r10)
.L_return_T_done_\@:
.endm

#ifdef __x86_64__
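/*
 * For orientation, GCM_COMPLETE above is the standard GCM tag finalization.
 * A rough C sketch of the same flow is given below; it is illustrative only,
 * it glosses over the byte-reflection handled by SHUF_MASK, and
 * ghash_mul()/aes_encrypt_block()/put_be64() are assumed helpers, not
 * functions defined in this file or in the kernel crypto API:
 *
 *	// S: running GHASH state, H: hash subkey, Y0: original counter block
 *	void gcm_final_tag(u8 S[16], const u8 H[16], const u8 Y0[16],
 *			   u64 aad_len, u64 in_len, u8 *tag, unsigned int tag_len)
 *	{
 *		u8 len_block[16], ek_y0[16];
 *		int i;
 *
 *		put_be64(len_block,     aad_len * 8);	// len(A) in bits
 *		put_be64(len_block + 8, in_len * 8);	// len(C) in bits
 *		for (i = 0; i < 16; i++)
 *			S[i] ^= len_block[i];
 *		ghash_mul(S, H);			// final GHASH step
 *		aes_encrypt_block(ek_y0, Y0);		// E(K, Y0)
 *		for (i = 0; i < 16; i++)
 *			ek_y0[i] ^= S[i];
 *		memcpy(tag, ek_y0, tag_len);		// truncate to 16, 12 or 8
 *	}
 */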
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH	# GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = a1*b0 + a0*b1 (middle terms)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH, \TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	.L_read_lt8_\@
	mov	(\DPTR), %rax
	movq	%rax, \XMMDst
	sub	$8, \DLEN
	jz	.L_done_read_partial_block_\@
	xor	%eax, %eax
.L_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	.L_read_next_byte_\@
	movq	%rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	.L_done_read_partial_block_\@
.L_read_lt8_\@:
	xor	%eax, %eax
.L_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	.L_read_next_byte_lt8_\@
	movq	%rax, \XMMDst
.L_done_read_partial_block_\@:
.endm
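/*
 * A C model of READ_PARTIAL_BLOCK, for readers following the byte juggling
 * above.  This is an illustrative, standalone sketch (little-endian host
 * assumed, matching movq semantics); it is not part of this file.  The point
 * of the byte-at-a-time loops is to load 1..15 bytes into a zero-padded
 * 16-byte value without ever reading past the end of the source buffer:
 *
 *	static void read_partial_block(const uint8_t *src, unsigned int len,
 *				       uint8_t dst[16])
 *	{
 *		uint64_t lo = 0, hi = 0;
 *		unsigned int i;
 *
 *		if (len >= 8) {
 *			memcpy(&lo, src, 8);		// whole low quadword
 *			for (i = len; i > 8; i--)	// trailing bytes, one at a time
 *				hi = (hi << 8) | src[i - 1];
 *		} else {
 *			for (i = len; i > 0; i--)
 *				lo = (lo << 8) | src[i - 1];
 *		}
 *		memcpy(dst, &lo, 8);			// low half of XMMDst
 *		memcpy(dst + 8, &hi, 8);		// high half of XMMDst
 *	}
 */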
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   \AAD, %r10		# %r10 = AAD
	mov	   \AADLEN, %r11	# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6

	cmp	   $16, %r11
	jl	   .L_get_AAD_rest\@
.L_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	pshufb	   %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   .L_get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
.L_get_AAD_rest\@:
	test	   %r11, %r11
	je	   .L_get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	pshufb	   %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	   \TMP7, \TMP6

.L_get_AAD_done\@:
	movdqu	   \TMP6, AadHash(%arg2)
.endm

# PARTIAL_BLOCK: Handles encryption/decryption and the tag of partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	test	%r13, %r13
	je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	.L_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	.L_data_read_\@

.L_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13

.L_data_read_\@:			# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	pshufb	%xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3
	pxor	%xmm1, %xmm9		# Ciphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	.L_no_extra_mask_1_\@
	sub	%r10, %r12
.L_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm3
	pshufb	%xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH

	test	%r10, %r10
	jl	.L_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	.L_dec_done_\@
.L_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
.L_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
708 add %r13, %r10 709 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 710 sub $16, %r10 711 # Determine if partial block is not being filled and 712 # shift mask accordingly 713 jge .L_no_extra_mask_2_\@ 714 sub %r10, %r12 715.L_no_extra_mask_2_\@: 716 717 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 718 # get the appropriate mask to mask out bottom r13 bytes of xmm9 719 pand %xmm1, %xmm9 720 721 movdqa SHUF_MASK(%rip), %xmm1 722 pshufb %xmm1, %xmm9 723 pshufb %xmm2, %xmm9 724 pxor %xmm9, \AAD_HASH 725 726 test %r10, %r10 727 jl .L_partial_incomplete_2_\@ 728 729 # GHASH computation for the last <16 Byte block 730 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 731 xor %eax, %eax 732 733 mov %rax, PBlockLen(%arg2) 734 jmp .L_encode_done_\@ 735.L_partial_incomplete_2_\@: 736 add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 737.L_encode_done_\@: 738 movdqu \AAD_HASH, AadHash(%arg2) 739 740 movdqa SHUF_MASK(%rip), %xmm10 741 # shuffle xmm9 back to output as ciphertext 742 pshufb %xmm10, %xmm9 743 pshufb %xmm2, %xmm9 744.endif 745 # output encrypted Bytes 746 test %r10, %r10 747 jl .L_partial_fill_\@ 748 mov %r13, %r12 749 mov $16, %r13 750 # Set r13 to be the number of bytes to write out 751 sub %r12, %r13 752 jmp .L_count_set_\@ 753.L_partial_fill_\@: 754 mov \PLAIN_CYPH_LEN, %r13 755.L_count_set_\@: 756 movdqa %xmm9, %xmm0 757 movq %xmm0, %rax 758 cmp $8, %r13 759 jle .L_less_than_8_bytes_left_\@ 760 761 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 762 add $8, \DATA_OFFSET 763 psrldq $8, %xmm0 764 movq %xmm0, %rax 765 sub $8, %r13 766.L_less_than_8_bytes_left_\@: 767 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 768 add $1, \DATA_OFFSET 769 shr $8, %rax 770 sub $1, %r13 771 jne .L_less_than_8_bytes_left_\@ 772.L_partial_block_done_\@: 773.endm # PARTIAL_BLOCK 774 775/* 776* if a = number of total plaintext bytes 777* b = floor(a/16) 778* num_initial_blocks = b mod 4 779* encrypt the initial num_initial_blocks blocks and apply ghash on 780* the ciphertext 781* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 782* are clobbered 783* arg1, %arg2, %arg3 are used as a pointer only, not modified 784*/ 785 786 787.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 788 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 789 MOVADQ SHUF_MASK(%rip), %xmm14 790 791 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 792 793 # start AES for num_initial_blocks blocks 794 795 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 796 797.if (\i == 5) || (\i == 6) || (\i == 7) 798 799 MOVADQ ONE(%RIP),\TMP1 800 MOVADQ 0(%arg1),\TMP2 801.irpc index, \i_seq 802 paddd \TMP1, \XMM0 # INCR Y0 803.ifc \operation, dec 804 movdqa \XMM0, %xmm\index 805.else 806 MOVADQ \XMM0, %xmm\index 807.endif 808 pshufb %xmm14, %xmm\index # perform a 16 byte swap 809 pxor \TMP2, %xmm\index 810.endr 811 lea 0x10(%arg1),%r10 812 mov keysize,%eax 813 shr $2,%eax # 128->4, 192->6, 256->8 814 add $5,%eax # 128->9, 192->11, 256->13 815 816.Laes_loop_initial_\@: 817 MOVADQ (%r10),\TMP1 818.irpc index, \i_seq 819 aesenc \TMP1, %xmm\index 820.endr 821 add $16,%r10 822 sub $1,%eax 823 jnz .Laes_loop_initial_\@ 824 825 MOVADQ (%r10), \TMP1 826.irpc index, \i_seq 827 aesenclast \TMP1, %xmm\index # Last Round 828.endr 829.irpc index, \i_seq 830 movdqu (%arg4 , %r11, 1), \TMP1 831 pxor \TMP1, %xmm\index 832 movdqu %xmm\index, (%arg3 , %r11, 1) 833 # write back plaintext/ciphertext for num_initial_blocks 834 add $16, %r11 835 836.ifc \operation, dec 837 movdqa \TMP1, %xmm\index 838.endif 839 pshufb %xmm14, 
%xmm\index 840 841 # prepare plaintext/ciphertext for GHASH computation 842.endr 843.endif 844 845 # apply GHASH on num_initial_blocks blocks 846 847.if \i == 5 848 pxor %xmm5, %xmm6 849 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 850 pxor %xmm6, %xmm7 851 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 852 pxor %xmm7, %xmm8 853 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 854.elseif \i == 6 855 pxor %xmm6, %xmm7 856 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 857 pxor %xmm7, %xmm8 858 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 859.elseif \i == 7 860 pxor %xmm7, %xmm8 861 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 862.endif 863 cmp $64, %r13 864 jl .L_initial_blocks_done\@ 865 # no need for precomputed values 866/* 867* 868* Precomputations for HashKey parallel with encryption of first 4 blocks. 869* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 870*/ 871 MOVADQ ONE(%RIP),\TMP1 872 paddd \TMP1, \XMM0 # INCR Y0 873 MOVADQ \XMM0, \XMM1 874 pshufb %xmm14, \XMM1 # perform a 16 byte swap 875 876 paddd \TMP1, \XMM0 # INCR Y0 877 MOVADQ \XMM0, \XMM2 878 pshufb %xmm14, \XMM2 # perform a 16 byte swap 879 880 paddd \TMP1, \XMM0 # INCR Y0 881 MOVADQ \XMM0, \XMM3 882 pshufb %xmm14, \XMM3 # perform a 16 byte swap 883 884 paddd \TMP1, \XMM0 # INCR Y0 885 MOVADQ \XMM0, \XMM4 886 pshufb %xmm14, \XMM4 # perform a 16 byte swap 887 888 MOVADQ 0(%arg1),\TMP1 889 pxor \TMP1, \XMM1 890 pxor \TMP1, \XMM2 891 pxor \TMP1, \XMM3 892 pxor \TMP1, \XMM4 893.irpc index, 1234 # do 4 rounds 894 movaps 0x10*\index(%arg1), \TMP1 895 aesenc \TMP1, \XMM1 896 aesenc \TMP1, \XMM2 897 aesenc \TMP1, \XMM3 898 aesenc \TMP1, \XMM4 899.endr 900.irpc index, 56789 # do next 5 rounds 901 movaps 0x10*\index(%arg1), \TMP1 902 aesenc \TMP1, \XMM1 903 aesenc \TMP1, \XMM2 904 aesenc \TMP1, \XMM3 905 aesenc \TMP1, \XMM4 906.endr 907 lea 0xa0(%arg1),%r10 908 mov keysize,%eax 909 shr $2,%eax # 128->4, 192->6, 256->8 910 sub $4,%eax # 128->0, 192->2, 256->4 911 jz .Laes_loop_pre_done\@ 912 913.Laes_loop_pre_\@: 914 MOVADQ (%r10),\TMP2 915.irpc index, 1234 916 aesenc \TMP2, %xmm\index 917.endr 918 add $16,%r10 919 sub $1,%eax 920 jnz .Laes_loop_pre_\@ 921 922.Laes_loop_pre_done\@: 923 MOVADQ (%r10), \TMP2 924 aesenclast \TMP2, \XMM1 925 aesenclast \TMP2, \XMM2 926 aesenclast \TMP2, \XMM3 927 aesenclast \TMP2, \XMM4 928 movdqu 16*0(%arg4 , %r11 , 1), \TMP1 929 pxor \TMP1, \XMM1 930.ifc \operation, dec 931 movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 932 movdqa \TMP1, \XMM1 933.endif 934 movdqu 16*1(%arg4 , %r11 , 1), \TMP1 935 pxor \TMP1, \XMM2 936.ifc \operation, dec 937 movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 938 movdqa \TMP1, \XMM2 939.endif 940 movdqu 16*2(%arg4 , %r11 , 1), \TMP1 941 pxor \TMP1, \XMM3 942.ifc \operation, dec 943 movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 944 movdqa \TMP1, \XMM3 945.endif 946 movdqu 16*3(%arg4 , %r11 , 1), \TMP1 947 pxor \TMP1, \XMM4 948.ifc \operation, dec 949 movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 950 movdqa \TMP1, \XMM4 951.else 952 movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 953 movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 954 movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 955 movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 956.endif 957 958 add $64, %r11 959 pshufb %xmm14, \XMM1 # perform a 16 byte swap 960 pxor \XMMDst, \XMM1 961# combine GHASHed value with the corresponding ciphertext 962 pshufb %xmm14, \XMM2 # perform a 16 byte swap 963 pshufb %xmm14, \XMM3 # perform a 16 byte swap 964 pshufb %xmm14, \XMM4 # perform a 16 byte swap 965 
966.L_initial_blocks_done\@: 967 968.endm 969 970/* 971* encrypt 4 blocks at a time 972* ghash the 4 previously encrypted ciphertext blocks 973* arg1, %arg3, %arg4 are used as pointers only, not modified 974* %r11 is the data offset value 975*/ 976.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ 977TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 978 979 movdqa \XMM1, \XMM5 980 movdqa \XMM2, \XMM6 981 movdqa \XMM3, \XMM7 982 movdqa \XMM4, \XMM8 983 984 movdqa SHUF_MASK(%rip), %xmm15 985 # multiply TMP5 * HashKey using karatsuba 986 987 movdqa \XMM5, \TMP4 988 pshufd $78, \XMM5, \TMP6 989 pxor \XMM5, \TMP6 990 paddd ONE(%rip), \XMM0 # INCR CNT 991 movdqu HashKey_4(%arg2), \TMP5 992 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 993 movdqa \XMM0, \XMM1 994 paddd ONE(%rip), \XMM0 # INCR CNT 995 movdqa \XMM0, \XMM2 996 paddd ONE(%rip), \XMM0 # INCR CNT 997 movdqa \XMM0, \XMM3 998 paddd ONE(%rip), \XMM0 # INCR CNT 999 movdqa \XMM0, \XMM4 1000 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1001 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1002 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1003 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1004 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1005 1006 pxor (%arg1), \XMM1 1007 pxor (%arg1), \XMM2 1008 pxor (%arg1), \XMM3 1009 pxor (%arg1), \XMM4 1010 movdqu HashKey_4_k(%arg2), \TMP5 1011 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1012 movaps 0x10(%arg1), \TMP1 1013 aesenc \TMP1, \XMM1 # Round 1 1014 aesenc \TMP1, \XMM2 1015 aesenc \TMP1, \XMM3 1016 aesenc \TMP1, \XMM4 1017 movaps 0x20(%arg1), \TMP1 1018 aesenc \TMP1, \XMM1 # Round 2 1019 aesenc \TMP1, \XMM2 1020 aesenc \TMP1, \XMM3 1021 aesenc \TMP1, \XMM4 1022 movdqa \XMM6, \TMP1 1023 pshufd $78, \XMM6, \TMP2 1024 pxor \XMM6, \TMP2 1025 movdqu HashKey_3(%arg2), \TMP5 1026 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 1027 movaps 0x30(%arg1), \TMP3 1028 aesenc \TMP3, \XMM1 # Round 3 1029 aesenc \TMP3, \XMM2 1030 aesenc \TMP3, \XMM3 1031 aesenc \TMP3, \XMM4 1032 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1033 movaps 0x40(%arg1), \TMP3 1034 aesenc \TMP3, \XMM1 # Round 4 1035 aesenc \TMP3, \XMM2 1036 aesenc \TMP3, \XMM3 1037 aesenc \TMP3, \XMM4 1038 movdqu HashKey_3_k(%arg2), \TMP5 1039 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1040 movaps 0x50(%arg1), \TMP3 1041 aesenc \TMP3, \XMM1 # Round 5 1042 aesenc \TMP3, \XMM2 1043 aesenc \TMP3, \XMM3 1044 aesenc \TMP3, \XMM4 1045 pxor \TMP1, \TMP4 1046# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1047 pxor \XMM6, \XMM5 1048 pxor \TMP2, \TMP6 1049 movdqa \XMM7, \TMP1 1050 pshufd $78, \XMM7, \TMP2 1051 pxor \XMM7, \TMP2 1052 movdqu HashKey_2(%arg2), \TMP5 1053 1054 # Multiply TMP5 * HashKey using karatsuba 1055 1056 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1057 movaps 0x60(%arg1), \TMP3 1058 aesenc \TMP3, \XMM1 # Round 6 1059 aesenc \TMP3, \XMM2 1060 aesenc \TMP3, \XMM3 1061 aesenc \TMP3, \XMM4 1062 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1063 movaps 0x70(%arg1), \TMP3 1064 aesenc \TMP3, \XMM1 # Round 7 1065 aesenc \TMP3, \XMM2 1066 aesenc \TMP3, \XMM3 1067 aesenc \TMP3, \XMM4 1068 movdqu HashKey_2_k(%arg2), \TMP5 1069 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1070 movaps 0x80(%arg1), \TMP3 1071 aesenc \TMP3, \XMM1 # Round 8 1072 aesenc \TMP3, \XMM2 1073 aesenc \TMP3, \XMM3 1074 aesenc \TMP3, \XMM4 1075 pxor \TMP1, \TMP4 1076# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1077 pxor \XMM7, \XMM5 1078 pxor \TMP2, \TMP6 1079 1080 # Multiply XMM8 * HashKey 
1081 # XMM8 and TMP5 hold the values for the two operands 1082 1083 movdqa \XMM8, \TMP1 1084 pshufd $78, \XMM8, \TMP2 1085 pxor \XMM8, \TMP2 1086 movdqu HashKey(%arg2), \TMP5 1087 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1088 movaps 0x90(%arg1), \TMP3 1089 aesenc \TMP3, \XMM1 # Round 9 1090 aesenc \TMP3, \XMM2 1091 aesenc \TMP3, \XMM3 1092 aesenc \TMP3, \XMM4 1093 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1094 lea 0xa0(%arg1),%r10 1095 mov keysize,%eax 1096 shr $2,%eax # 128->4, 192->6, 256->8 1097 sub $4,%eax # 128->0, 192->2, 256->4 1098 jz .Laes_loop_par_enc_done\@ 1099 1100.Laes_loop_par_enc\@: 1101 MOVADQ (%r10),\TMP3 1102.irpc index, 1234 1103 aesenc \TMP3, %xmm\index 1104.endr 1105 add $16,%r10 1106 sub $1,%eax 1107 jnz .Laes_loop_par_enc\@ 1108 1109.Laes_loop_par_enc_done\@: 1110 MOVADQ (%r10), \TMP3 1111 aesenclast \TMP3, \XMM1 # Round 10 1112 aesenclast \TMP3, \XMM2 1113 aesenclast \TMP3, \XMM3 1114 aesenclast \TMP3, \XMM4 1115 movdqu HashKey_k(%arg2), \TMP5 1116 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1117 movdqu (%arg4,%r11,1), \TMP3 1118 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1119 movdqu 16(%arg4,%r11,1), \TMP3 1120 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1121 movdqu 32(%arg4,%r11,1), \TMP3 1122 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1123 movdqu 48(%arg4,%r11,1), \TMP3 1124 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1125 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer 1126 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer 1127 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer 1128 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer 1129 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1130 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1131 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1132 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1133 1134 pxor \TMP4, \TMP1 1135 pxor \XMM8, \XMM5 1136 pxor \TMP6, \TMP2 1137 pxor \TMP1, \TMP2 1138 pxor \XMM5, \TMP2 1139 movdqa \TMP2, \TMP3 1140 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1141 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1142 pxor \TMP3, \XMM5 1143 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1144 1145 # first phase of reduction 1146 1147 movdqa \XMM5, \TMP2 1148 movdqa \XMM5, \TMP3 1149 movdqa \XMM5, \TMP4 1150# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1151 pslld $31, \TMP2 # packed right shift << 31 1152 pslld $30, \TMP3 # packed right shift << 30 1153 pslld $25, \TMP4 # packed right shift << 25 1154 pxor \TMP3, \TMP2 # xor the shifted versions 1155 pxor \TMP4, \TMP2 1156 movdqa \TMP2, \TMP5 1157 psrldq $4, \TMP5 # right shift T5 1 DW 1158 pslldq $12, \TMP2 # left shift T2 3 DWs 1159 pxor \TMP2, \XMM5 1160 1161 # second phase of reduction 1162 1163 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1164 movdqa \XMM5,\TMP3 1165 movdqa \XMM5,\TMP4 1166 psrld $1, \TMP2 # packed left shift >>1 1167 psrld $2, \TMP3 # packed left shift >>2 1168 psrld $7, \TMP4 # packed left shift >>7 1169 pxor \TMP3,\TMP2 # xor the shifted versions 1170 pxor \TMP4,\TMP2 1171 pxor \TMP5, \TMP2 1172 pxor \TMP2, \XMM5 1173 pxor \TMP1, \XMM5 # result is in TMP1 1174 1175 pxor \XMM5, \XMM1 1176.endm 1177 1178/* 1179* decrypt 4 blocks at a time 1180* ghash the 4 previously decrypted ciphertext blocks 1181* arg1, %arg3, %arg4 are used as pointers only, not modified 1182* %r11 is the data offset value 1183*/ 1184.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ 1185TMP6 XMM0 XMM1 XMM2 
XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 1186 1187 movdqa \XMM1, \XMM5 1188 movdqa \XMM2, \XMM6 1189 movdqa \XMM3, \XMM7 1190 movdqa \XMM4, \XMM8 1191 1192 movdqa SHUF_MASK(%rip), %xmm15 1193 # multiply TMP5 * HashKey using karatsuba 1194 1195 movdqa \XMM5, \TMP4 1196 pshufd $78, \XMM5, \TMP6 1197 pxor \XMM5, \TMP6 1198 paddd ONE(%rip), \XMM0 # INCR CNT 1199 movdqu HashKey_4(%arg2), \TMP5 1200 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 1201 movdqa \XMM0, \XMM1 1202 paddd ONE(%rip), \XMM0 # INCR CNT 1203 movdqa \XMM0, \XMM2 1204 paddd ONE(%rip), \XMM0 # INCR CNT 1205 movdqa \XMM0, \XMM3 1206 paddd ONE(%rip), \XMM0 # INCR CNT 1207 movdqa \XMM0, \XMM4 1208 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1209 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1210 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1211 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1212 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1213 1214 pxor (%arg1), \XMM1 1215 pxor (%arg1), \XMM2 1216 pxor (%arg1), \XMM3 1217 pxor (%arg1), \XMM4 1218 movdqu HashKey_4_k(%arg2), \TMP5 1219 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1220 movaps 0x10(%arg1), \TMP1 1221 aesenc \TMP1, \XMM1 # Round 1 1222 aesenc \TMP1, \XMM2 1223 aesenc \TMP1, \XMM3 1224 aesenc \TMP1, \XMM4 1225 movaps 0x20(%arg1), \TMP1 1226 aesenc \TMP1, \XMM1 # Round 2 1227 aesenc \TMP1, \XMM2 1228 aesenc \TMP1, \XMM3 1229 aesenc \TMP1, \XMM4 1230 movdqa \XMM6, \TMP1 1231 pshufd $78, \XMM6, \TMP2 1232 pxor \XMM6, \TMP2 1233 movdqu HashKey_3(%arg2), \TMP5 1234 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 1235 movaps 0x30(%arg1), \TMP3 1236 aesenc \TMP3, \XMM1 # Round 3 1237 aesenc \TMP3, \XMM2 1238 aesenc \TMP3, \XMM3 1239 aesenc \TMP3, \XMM4 1240 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1241 movaps 0x40(%arg1), \TMP3 1242 aesenc \TMP3, \XMM1 # Round 4 1243 aesenc \TMP3, \XMM2 1244 aesenc \TMP3, \XMM3 1245 aesenc \TMP3, \XMM4 1246 movdqu HashKey_3_k(%arg2), \TMP5 1247 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1248 movaps 0x50(%arg1), \TMP3 1249 aesenc \TMP3, \XMM1 # Round 5 1250 aesenc \TMP3, \XMM2 1251 aesenc \TMP3, \XMM3 1252 aesenc \TMP3, \XMM4 1253 pxor \TMP1, \TMP4 1254# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1255 pxor \XMM6, \XMM5 1256 pxor \TMP2, \TMP6 1257 movdqa \XMM7, \TMP1 1258 pshufd $78, \XMM7, \TMP2 1259 pxor \XMM7, \TMP2 1260 movdqu HashKey_2(%arg2), \TMP5 1261 1262 # Multiply TMP5 * HashKey using karatsuba 1263 1264 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1265 movaps 0x60(%arg1), \TMP3 1266 aesenc \TMP3, \XMM1 # Round 6 1267 aesenc \TMP3, \XMM2 1268 aesenc \TMP3, \XMM3 1269 aesenc \TMP3, \XMM4 1270 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1271 movaps 0x70(%arg1), \TMP3 1272 aesenc \TMP3, \XMM1 # Round 7 1273 aesenc \TMP3, \XMM2 1274 aesenc \TMP3, \XMM3 1275 aesenc \TMP3, \XMM4 1276 movdqu HashKey_2_k(%arg2), \TMP5 1277 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1278 movaps 0x80(%arg1), \TMP3 1279 aesenc \TMP3, \XMM1 # Round 8 1280 aesenc \TMP3, \XMM2 1281 aesenc \TMP3, \XMM3 1282 aesenc \TMP3, \XMM4 1283 pxor \TMP1, \TMP4 1284# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1285 pxor \XMM7, \XMM5 1286 pxor \TMP2, \TMP6 1287 1288 # Multiply XMM8 * HashKey 1289 # XMM8 and TMP5 hold the values for the two operands 1290 1291 movdqa \XMM8, \TMP1 1292 pshufd $78, \XMM8, \TMP2 1293 pxor \XMM8, \TMP2 1294 movdqu HashKey(%arg2), \TMP5 1295 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1296 movaps 0x90(%arg1), \TMP3 1297 aesenc \TMP3, \XMM1 # Round 9 1298 aesenc \TMP3, 
\XMM2 1299 aesenc \TMP3, \XMM3 1300 aesenc \TMP3, \XMM4 1301 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1302 lea 0xa0(%arg1),%r10 1303 mov keysize,%eax 1304 shr $2,%eax # 128->4, 192->6, 256->8 1305 sub $4,%eax # 128->0, 192->2, 256->4 1306 jz .Laes_loop_par_dec_done\@ 1307 1308.Laes_loop_par_dec\@: 1309 MOVADQ (%r10),\TMP3 1310.irpc index, 1234 1311 aesenc \TMP3, %xmm\index 1312.endr 1313 add $16,%r10 1314 sub $1,%eax 1315 jnz .Laes_loop_par_dec\@ 1316 1317.Laes_loop_par_dec_done\@: 1318 MOVADQ (%r10), \TMP3 1319 aesenclast \TMP3, \XMM1 # last round 1320 aesenclast \TMP3, \XMM2 1321 aesenclast \TMP3, \XMM3 1322 aesenclast \TMP3, \XMM4 1323 movdqu HashKey_k(%arg2), \TMP5 1324 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1325 movdqu (%arg4,%r11,1), \TMP3 1326 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1327 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer 1328 movdqa \TMP3, \XMM1 1329 movdqu 16(%arg4,%r11,1), \TMP3 1330 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1331 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer 1332 movdqa \TMP3, \XMM2 1333 movdqu 32(%arg4,%r11,1), \TMP3 1334 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1335 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer 1336 movdqa \TMP3, \XMM3 1337 movdqu 48(%arg4,%r11,1), \TMP3 1338 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1339 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer 1340 movdqa \TMP3, \XMM4 1341 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1342 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1343 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1344 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1345 1346 pxor \TMP4, \TMP1 1347 pxor \XMM8, \XMM5 1348 pxor \TMP6, \TMP2 1349 pxor \TMP1, \TMP2 1350 pxor \XMM5, \TMP2 1351 movdqa \TMP2, \TMP3 1352 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1353 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1354 pxor \TMP3, \XMM5 1355 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1356 1357 # first phase of reduction 1358 1359 movdqa \XMM5, \TMP2 1360 movdqa \XMM5, \TMP3 1361 movdqa \XMM5, \TMP4 1362# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1363 pslld $31, \TMP2 # packed right shift << 31 1364 pslld $30, \TMP3 # packed right shift << 30 1365 pslld $25, \TMP4 # packed right shift << 25 1366 pxor \TMP3, \TMP2 # xor the shifted versions 1367 pxor \TMP4, \TMP2 1368 movdqa \TMP2, \TMP5 1369 psrldq $4, \TMP5 # right shift T5 1 DW 1370 pslldq $12, \TMP2 # left shift T2 3 DWs 1371 pxor \TMP2, \XMM5 1372 1373 # second phase of reduction 1374 1375 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1376 movdqa \XMM5,\TMP3 1377 movdqa \XMM5,\TMP4 1378 psrld $1, \TMP2 # packed left shift >>1 1379 psrld $2, \TMP3 # packed left shift >>2 1380 psrld $7, \TMP4 # packed left shift >>7 1381 pxor \TMP3,\TMP2 # xor the shifted versions 1382 pxor \TMP4,\TMP2 1383 pxor \TMP5, \TMP2 1384 pxor \TMP2, \XMM5 1385 pxor \TMP1, \XMM5 # result is in TMP1 1386 1387 pxor \XMM5, \XMM1 1388.endm 1389 1390/* GHASH the last 4 ciphertext blocks. 
*/ 1391.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1392TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1393 1394 # Multiply TMP6 * HashKey (using Karatsuba) 1395 1396 movdqa \XMM1, \TMP6 1397 pshufd $78, \XMM1, \TMP2 1398 pxor \XMM1, \TMP2 1399 movdqu HashKey_4(%arg2), \TMP5 1400 pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1401 pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1402 movdqu HashKey_4_k(%arg2), \TMP4 1403 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1404 movdqa \XMM1, \XMMDst 1405 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1406 1407 # Multiply TMP1 * HashKey (using Karatsuba) 1408 1409 movdqa \XMM2, \TMP1 1410 pshufd $78, \XMM2, \TMP2 1411 pxor \XMM2, \TMP2 1412 movdqu HashKey_3(%arg2), \TMP5 1413 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1414 pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1415 movdqu HashKey_3_k(%arg2), \TMP4 1416 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1417 pxor \TMP1, \TMP6 1418 pxor \XMM2, \XMMDst 1419 pxor \TMP2, \XMM1 1420# results accumulated in TMP6, XMMDst, XMM1 1421 1422 # Multiply TMP1 * HashKey (using Karatsuba) 1423 1424 movdqa \XMM3, \TMP1 1425 pshufd $78, \XMM3, \TMP2 1426 pxor \XMM3, \TMP2 1427 movdqu HashKey_2(%arg2), \TMP5 1428 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1429 pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1430 movdqu HashKey_2_k(%arg2), \TMP4 1431 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1432 pxor \TMP1, \TMP6 1433 pxor \XMM3, \XMMDst 1434 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1435 1436 # Multiply TMP1 * HashKey (using Karatsuba) 1437 movdqa \XMM4, \TMP1 1438 pshufd $78, \XMM4, \TMP2 1439 pxor \XMM4, \TMP2 1440 movdqu HashKey(%arg2), \TMP5 1441 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1442 pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1443 movdqu HashKey_k(%arg2), \TMP4 1444 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1445 pxor \TMP1, \TMP6 1446 pxor \XMM4, \XMMDst 1447 pxor \XMM1, \TMP2 1448 pxor \TMP6, \TMP2 1449 pxor \XMMDst, \TMP2 1450 # middle section of the temp results combined as in karatsuba algorithm 1451 movdqa \TMP2, \TMP4 1452 pslldq $8, \TMP4 # left shift TMP4 2 DWs 1453 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1454 pxor \TMP4, \XMMDst 1455 pxor \TMP2, \TMP6 1456# TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1457 # first phase of the reduction 1458 movdqa \XMMDst, \TMP2 1459 movdqa \XMMDst, \TMP3 1460 movdqa \XMMDst, \TMP4 1461# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1462 pslld $31, \TMP2 # packed right shifting << 31 1463 pslld $30, \TMP3 # packed right shifting << 30 1464 pslld $25, \TMP4 # packed right shifting << 25 1465 pxor \TMP3, \TMP2 # xor the shifted versions 1466 pxor \TMP4, \TMP2 1467 movdqa \TMP2, \TMP7 1468 psrldq $4, \TMP7 # right shift TMP7 1 DW 1469 pslldq $12, \TMP2 # left shift TMP2 3 DWs 1470 pxor \TMP2, \XMMDst 1471 1472 # second phase of the reduction 1473 movdqa \XMMDst, \TMP2 1474 # make 3 copies of XMMDst for doing 3 shift operations 1475 movdqa \XMMDst, \TMP3 1476 movdqa \XMMDst, \TMP4 1477 psrld $1, \TMP2 # packed left shift >> 1 1478 psrld $2, \TMP3 # packed left shift >> 2 1479 psrld $7, \TMP4 # packed left shift >> 7 1480 pxor \TMP3, \TMP2 # xor the shifted versions 1481 pxor \TMP4, \TMP2 1482 pxor \TMP7, \TMP2 1483 pxor \TMP2, \XMMDst 1484 pxor \TMP6, \XMMDst # reduced result is in XMMDst 1485.endm 1486 1487 1488/* Encryption of a single block 1489* uses eax & r10 1490*/ 1491 1492.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1493 1494 pxor (%arg1), 
\XMM0 1495 mov keysize,%eax 1496 shr $2,%eax # 128->4, 192->6, 256->8 1497 add $5,%eax # 128->9, 192->11, 256->13 1498 lea 16(%arg1), %r10 # get first expanded key address 1499 1500_esb_loop_\@: 1501 MOVADQ (%r10),\TMP1 1502 aesenc \TMP1,\XMM0 1503 add $16,%r10 1504 sub $1,%eax 1505 jnz _esb_loop_\@ 1506 1507 MOVADQ (%r10),\TMP1 1508 aesenclast \TMP1,\XMM0 1509.endm 1510/***************************************************************************** 1511* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1512* struct gcm_context_data *data 1513* // Context data 1514* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1515* const u8 *in, // Ciphertext input 1516* u64 plaintext_len, // Length of data in bytes for decryption. 1517* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1518* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1519* // concatenated with 0x00000001. 16-byte aligned pointer. 1520* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1521* const u8 *aad, // Additional Authentication Data (AAD) 1522* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1523* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1524* // given authentication tag and only return the plaintext if they match. 1525* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1526* // (most likely), 12 or 8. 1527* 1528* Assumptions: 1529* 1530* keys: 1531* keys are pre-expanded and aligned to 16 bytes. we are using the first 1532* set of 11 keys in the data structure void *aes_ctx 1533* 1534* iv: 1535* 0 1 2 3 1536* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1537* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1538* | Salt (From the SA) | 1539* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1540* | Initialization Vector | 1541* | (This is the sequence number from IPSec header) | 1542* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1543* | 0x1 | 1544* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1545* 1546* 1547* 1548* AAD: 1549* AAD padded to 128 bits with 0 1550* for example, assume AAD is a u32 vector 1551* 1552* if AAD is 8 bytes: 1553* AAD[3] = {A0, A1}; 1554* padded AAD in xmm register = {A1 A0 0 0} 1555* 1556* 0 1 2 3 1557* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1558* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1559* | SPI (A1) | 1560* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1561* | 32-bit Sequence Number (A0) | 1562* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1563* | 0x0 | 1564* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1565* 1566* AAD Format with 32-bit Sequence Number 1567* 1568* if AAD is 12 bytes: 1569* AAD[3] = {A0, A1, A2}; 1570* padded AAD in xmm register = {A2 A1 A0 0} 1571* 1572* 0 1 2 3 1573* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1574* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1575* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1576* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1577* | SPI (A2) | 1578* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1579* | 64-bit Extended Sequence Number {A1,A0} | 1580* | | 1581* 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1582* | 0x0 | 1583* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1584* 1585* AAD Format with 64-bit Extended Sequence Number 1586* 1587* poly = x^128 + x^127 + x^126 + x^121 + 1 1588* 1589*****************************************************************************/ 1590SYM_FUNC_START(aesni_gcm_dec) 1591 FUNC_SAVE 1592 1593 GCM_INIT %arg6, arg7, arg8, arg9 1594 GCM_ENC_DEC dec 1595 GCM_COMPLETE arg10, arg11 1596 FUNC_RESTORE 1597 RET 1598SYM_FUNC_END(aesni_gcm_dec) 1599 1600 1601/***************************************************************************** 1602* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1603* struct gcm_context_data *data 1604* // Context data 1605* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1606* const u8 *in, // Plaintext input 1607* u64 plaintext_len, // Length of data in bytes for encryption. 1608* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1609* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1610* // concatenated with 0x00000001. 16-byte aligned pointer. 1611* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1612* const u8 *aad, // Additional Authentication Data (AAD) 1613* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1614* u8 *auth_tag, // Authenticated Tag output. 1615* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1616* // 12 or 8. 1617* 1618* Assumptions: 1619* 1620* keys: 1621* keys are pre-expanded and aligned to 16 bytes. we are using the 1622* first set of 11 keys in the data structure void *aes_ctx 1623* 1624* 1625* iv: 1626* 0 1 2 3 1627* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1628* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1629* | Salt (From the SA) | 1630* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1631* | Initialization Vector | 1632* | (This is the sequence number from IPSec header) | 1633* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1634* | 0x1 | 1635* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1636* 1637* 1638* 1639* AAD: 1640* AAD padded to 128 bits with 0 1641* for example, assume AAD is a u32 vector 1642* 1643* if AAD is 8 bytes: 1644* AAD[3] = {A0, A1}; 1645* padded AAD in xmm register = {A1 A0 0 0} 1646* 1647* 0 1 2 3 1648* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1649* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1650* | SPI (A1) | 1651* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1652* | 32-bit Sequence Number (A0) | 1653* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1654* | 0x0 | 1655* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1656* 1657* AAD Format with 32-bit Sequence Number 1658* 1659* if AAD is 12 bytes: 1660* AAD[3] = {A0, A1, A2}; 1661* padded AAD in xmm register = {A2 A1 A0 0} 1662* 1663* 0 1 2 3 1664* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1665* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1666* | SPI (A2) | 1667* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1668* | 64-bit Extended Sequence Number {A1,A0} | 1669* | | 1670* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1671* | 0x0 | 1672* 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1673* 1674* AAD Format with 64-bit Extended Sequence Number 1675* 1676* poly = x^128 + x^127 + x^126 + x^121 + 1 1677***************************************************************************/ 1678SYM_FUNC_START(aesni_gcm_enc) 1679 FUNC_SAVE 1680 1681 GCM_INIT %arg6, arg7, arg8, arg9 1682 GCM_ENC_DEC enc 1683 1684 GCM_COMPLETE arg10, arg11 1685 FUNC_RESTORE 1686 RET 1687SYM_FUNC_END(aesni_gcm_enc) 1688 1689/***************************************************************************** 1690* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1691* struct gcm_context_data *data, 1692* // context data 1693* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1694* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1695* // concatenated with 0x00000001. 16-byte aligned pointer. 1696* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1697* const u8 *aad, // Additional Authentication Data (AAD) 1698* u64 aad_len) // Length of AAD in bytes. 1699*/ 1700SYM_FUNC_START(aesni_gcm_init) 1701 FUNC_SAVE 1702 GCM_INIT %arg3, %arg4,%arg5, %arg6 1703 FUNC_RESTORE 1704 RET 1705SYM_FUNC_END(aesni_gcm_init) 1706 1707/***************************************************************************** 1708* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1709* struct gcm_context_data *data, 1710* // context data 1711* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1712* const u8 *in, // Plaintext input 1713* u64 plaintext_len, // Length of data in bytes for encryption. 1714*/ 1715SYM_FUNC_START(aesni_gcm_enc_update) 1716 FUNC_SAVE 1717 GCM_ENC_DEC enc 1718 FUNC_RESTORE 1719 RET 1720SYM_FUNC_END(aesni_gcm_enc_update) 1721 1722/***************************************************************************** 1723* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1724* struct gcm_context_data *data, 1725* // context data 1726* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1727* const u8 *in, // Plaintext input 1728* u64 plaintext_len, // Length of data in bytes for encryption. 1729*/ 1730SYM_FUNC_START(aesni_gcm_dec_update) 1731 FUNC_SAVE 1732 GCM_ENC_DEC dec 1733 FUNC_RESTORE 1734 RET 1735SYM_FUNC_END(aesni_gcm_dec_update) 1736 1737/***************************************************************************** 1738* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1739* struct gcm_context_data *data, 1740* // context data 1741* u8 *auth_tag, // Authenticated Tag output. 1742* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1743* // 12 or 8. 
1744*/ 1745SYM_FUNC_START(aesni_gcm_finalize) 1746 FUNC_SAVE 1747 GCM_COMPLETE %arg3 %arg4 1748 FUNC_RESTORE 1749 RET 1750SYM_FUNC_END(aesni_gcm_finalize) 1751 1752#endif 1753 1754SYM_FUNC_START_LOCAL(_key_expansion_256a) 1755 pshufd $0b11111111, %xmm1, %xmm1 1756 shufps $0b00010000, %xmm0, %xmm4 1757 pxor %xmm4, %xmm0 1758 shufps $0b10001100, %xmm0, %xmm4 1759 pxor %xmm4, %xmm0 1760 pxor %xmm1, %xmm0 1761 movaps %xmm0, (TKEYP) 1762 add $0x10, TKEYP 1763 RET 1764SYM_FUNC_END(_key_expansion_256a) 1765SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a) 1766 1767SYM_FUNC_START_LOCAL(_key_expansion_192a) 1768 pshufd $0b01010101, %xmm1, %xmm1 1769 shufps $0b00010000, %xmm0, %xmm4 1770 pxor %xmm4, %xmm0 1771 shufps $0b10001100, %xmm0, %xmm4 1772 pxor %xmm4, %xmm0 1773 pxor %xmm1, %xmm0 1774 1775 movaps %xmm2, %xmm5 1776 movaps %xmm2, %xmm6 1777 pslldq $4, %xmm5 1778 pshufd $0b11111111, %xmm0, %xmm3 1779 pxor %xmm3, %xmm2 1780 pxor %xmm5, %xmm2 1781 1782 movaps %xmm0, %xmm1 1783 shufps $0b01000100, %xmm0, %xmm6 1784 movaps %xmm6, (TKEYP) 1785 shufps $0b01001110, %xmm2, %xmm1 1786 movaps %xmm1, 0x10(TKEYP) 1787 add $0x20, TKEYP 1788 RET 1789SYM_FUNC_END(_key_expansion_192a) 1790 1791SYM_FUNC_START_LOCAL(_key_expansion_192b) 1792 pshufd $0b01010101, %xmm1, %xmm1 1793 shufps $0b00010000, %xmm0, %xmm4 1794 pxor %xmm4, %xmm0 1795 shufps $0b10001100, %xmm0, %xmm4 1796 pxor %xmm4, %xmm0 1797 pxor %xmm1, %xmm0 1798 1799 movaps %xmm2, %xmm5 1800 pslldq $4, %xmm5 1801 pshufd $0b11111111, %xmm0, %xmm3 1802 pxor %xmm3, %xmm2 1803 pxor %xmm5, %xmm2 1804 1805 movaps %xmm0, (TKEYP) 1806 add $0x10, TKEYP 1807 RET 1808SYM_FUNC_END(_key_expansion_192b) 1809 1810SYM_FUNC_START_LOCAL(_key_expansion_256b) 1811 pshufd $0b10101010, %xmm1, %xmm1 1812 shufps $0b00010000, %xmm2, %xmm4 1813 pxor %xmm4, %xmm2 1814 shufps $0b10001100, %xmm2, %xmm4 1815 pxor %xmm4, %xmm2 1816 pxor %xmm1, %xmm2 1817 movaps %xmm2, (TKEYP) 1818 add $0x10, TKEYP 1819 RET 1820SYM_FUNC_END(_key_expansion_256b) 1821 1822/* 1823 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 1824 * unsigned int key_len) 1825 */ 1826SYM_FUNC_START(aesni_set_key) 1827 FRAME_BEGIN 1828#ifndef __x86_64__ 1829 pushl KEYP 1830 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx 1831 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key 1832 movl (FRAME_OFFSET+16)(%esp), %edx # key_len 1833#endif 1834 movups (UKEYP), %xmm0 # user key (first 16 bytes) 1835 movaps %xmm0, (KEYP) 1836 lea 0x10(KEYP), TKEYP # key addr 1837 movl %edx, 480(KEYP) 1838 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1839 cmp $24, %dl 1840 jb .Lenc_key128 1841 je .Lenc_key192 1842 movups 0x10(UKEYP), %xmm2 # other user key 1843 movaps %xmm2, (TKEYP) 1844 add $0x10, TKEYP 1845 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 1846 call _key_expansion_256a 1847 aeskeygenassist $0x1, %xmm0, %xmm1 1848 call _key_expansion_256b 1849 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 1850 call _key_expansion_256a 1851 aeskeygenassist $0x2, %xmm0, %xmm1 1852 call _key_expansion_256b 1853 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 1854 call _key_expansion_256a 1855 aeskeygenassist $0x4, %xmm0, %xmm1 1856 call _key_expansion_256b 1857 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 1858 call _key_expansion_256a 1859 aeskeygenassist $0x8, %xmm0, %xmm1 1860 call _key_expansion_256b 1861 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 1862 call _key_expansion_256a 1863 aeskeygenassist $0x10, %xmm0, %xmm1 1864 call _key_expansion_256b 1865 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 1866 call 

/*
 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                    unsigned int key_len)
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_set_key)
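
/*
 * What the .Ldec_key/.Ldec_key_loop code above computes, as an illustrative
 * C sketch (not built; enc[]/dec[] are hypothetical names for the encryption
 * and decryption key schedules, rounds is 10/12/14):
 *
 *	dec[0]      = enc[rounds];		// swap first and last round keys
 *	dec[rounds] = enc[0];
 *	for (i = 1; i < rounds; i++)
 *		dec[i] = InvMixColumns(enc[rounds - i]);	// aesimc
 *
 * This is the "Equivalent Inverse Cipher" schedule from FIPS-197; it lets
 * _aesni_dec1/_aesni_dec4 walk the decryption keys in the same forward order
 * that the encryption path uses.
 */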

/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_enc)

/*
 * _aesni_enc1: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps (TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_enc1)
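
/*
 * Control flow of _aesni_enc1 in rough C form (illustrative only, not built).
 * KLEN holds the key length in bytes, which determines how many expanded
 * round keys are applied:
 *
 *	int rounds = (klen == 16) ? 10 : (klen == 24) ? 12 : 14;
 *
 *	state ^= rk[0];				// round 0 (pxor)
 *	for (i = 1; i < rounds; i++)
 *		state = aesenc(state, rk[i]);
 *	state = aesenclast(state, rk[rounds]);	// final round, no MixColumns
 *
 * The jb/je after "cmp $24, KLEN" simply enters the unrolled sequence at the
 * right depth instead of looping.
 */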

/*
 * _aesni_enc4: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps (TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_enc4)

/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_dec)

/*
 * _aesni_dec1: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_dec1)

/*
 * _aesni_dec4: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)
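
/*
 * Why four parallel states (illustrative C, not built): aesenc/aesdec have a
 * latency of several cycles but can be issued back to back, so the 4-block
 * helpers keep four independent blocks in flight per round key:
 *
 *	for (i = 1; i < rounds; i++) {
 *		k  = rk[i];		// one round-key load, reused four times
 *		s1 = aesdec(s1, k);	// the four updates are independent,
 *		s2 = aesdec(s2, k);	// so they pipeline instead of
 *		s3 = aesdec(s3, k);	// serialising on instruction latency
 *		s4 = aesdec(s4, k);
 *	}
 *
 * The ECB, CBC decryption, CTR and XTS code below uses this to process
 * 64 bytes per iteration whenever enough data is available.
 */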

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN			# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
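
/*
 * Shape of the two ECB loops above, as illustrative C (not built):
 *
 *	while (len >= 64) {			// .Lecb_*_loop4
 *		crypt_4_blocks(out, in);
 *		in += 64; out += 64; len -= 64;
 *	}
 *	while (len >= 16) {			// .Lecb_*_loop1
 *		crypt_1_block(out, in);
 *		in += 16; out += 16; len -= 16;
 *	}
 *
 * Any trailing len % 16 bytes are not processed by this code.
 */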

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
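
/*
 * The CBC decryption chaining above, as illustrative C (not built).  Each
 * plaintext block is the decrypted ciphertext XORed with the previous
 * ciphertext block (the IV for the first block), which is why the 4-block
 * loop must keep copies of its input blocks:
 *
 *	for (i = 0; i < nblocks; i++) {
 *		tmp   = decrypt_block(ct[i]);
 *		pt[i] = tmp ^ (i ? ct[i - 1] : iv);
 *	}
 *	iv = ct[nblocks - 1];		// written back through (IVP) on exit
 *
 * On 32-bit there are not enough XMM registers to hold all four input blocks,
 * so that path reloads ct[0]/ct[1] from memory before the final XORs.
 */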

/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                        size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	movups (IVP), STATE
	sub $16, LEN
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4
	movups (IVP), %xmm5

	movups (INP), IN1
	add LEN, INP
	movups (INP), IN2

	pxor IN1, STATE
	call _aesni_enc1

	pshufb %xmm5, IN2
	pxor STATE, IN2
	pshufb %xmm4, STATE
	add OUTP, LEN
	movups STATE, (LEN)

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)

/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                        size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	sub $16, LEN
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4

	movups (INP), STATE
	add LEN, INP
	movups (INP), IN1

	call _aesni_dec1
	movaps STATE, IN2
	pshufb %xmm4, STATE
	pxor IN1, STATE

	add OUTP, LEN
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)

.pushsection .rodata
.align 16
.Lcts_permute_table:
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
#ifdef __x86_64__
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
#endif
.popsection
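
/*
 * Role of .Lcts_permute_table (illustrative, not part of the build).  A
 * pshufb index byte with the top bit set (0x80) produces zero, so a 16-byte
 * window taken from this table acts as a byte shift with zero fill.  For a
 * final partial block of n bytes (1..15), roughly:
 *
 *	lo = pshufb(src, table + n);		// bytes 0..n-1 of src move to
 *						// the top of the result
 *	hi = pshufb(src, table + 32 - n);	// bytes 16-n..15 of src move to
 *						// the bottom of the result
 *
 * The ciphertext-stealing code above derives the two windows from the final
 * block length, which lets it swap and pad the last two blocks with
 * straight-line SIMD code instead of byte loops.
 */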

#ifdef __x86_64__
/*
 * _aesni_inc_init: internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:		== IV, in little endian
 *	TCTR_LOW:	== lower qword of CTR
 *	INC:		== 1, in little endian
 *	BSWAP_MASK	== endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC
	movq CTR, TCTR_LOW
	RET
SYM_FUNC_END(_aesni_inc_init)

/*
 * _aesni_inc: internal ABI
 *	Increment IV by 1; IV is in big endian
 * input:
 *	IV
 *	CTR:		== IV, in little endian
 *	TCTR_LOW:	== lower qword of CTR
 *	INC:		== 1, in little endian
 *	BSWAP_MASK	== endian swapping mask
 * output:
 *	IV:		incremented by 1
 * changed:
 *	CTR:		== output IV, in little endian
 *	TCTR_LOW:	== lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV
	RET
SYM_FUNC_END(_aesni_inc)

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)

#endif
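
/*
 * CTR mode as implemented above, in illustrative C (not built).  The counter
 * block is kept big endian in IV (the form handed back to the caller) and
 * little endian in CTR so that it can be incremented with a 64-bit add plus
 * an explicit carry into the high qword (_aesni_inc):
 *
 *	for (i = 0; i < nblocks; i++) {
 *		ctr = ctr + 1;				// _aesni_inc
 *		keystream = encrypt_block(ctr);		// _aesni_enc1/_aesni_enc4
 *		out[i] = in[i] ^ keystream;
 *	}
 *
 * Only the forward cipher is used to generate the keystream, so the same
 * routine serves for both encryption and decryption; there is no
 * aesni_ctr_dec.
 */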

.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.previous

/*
 * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 */
.macro _aesni_gf128mul_x_ble
	pshufd $0x13, IV, KEY
	paddq IV, IV
	psrad $31, KEY
	pand GF128MUL_MASK, KEY
	pxor KEY, IV
.endm

.macro _aesni_xts_crypt enc
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN
.if !\enc
	add $240, KEYP

	test $15, LEN
	jz .Lxts_loop4\@
	sub $16, LEN
.endif

.Lxts_loop4\@:
	sub $64, LEN
	jl .Lxts_1x\@

	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

.if \enc
	call _aesni_enc4
.else
	call _aesni_dec4
.endif

	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_loop4\@

.Lxts_ret_iv\@:
	movups IV, (IVP)

.Lxts_ret\@:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_1x\@:
	add $64, LEN
	jz .Lxts_ret_iv\@
.if \enc
	sub $16, LEN
	jl .Lxts_cts4\@
.endif

.Lxts_loop1\@:
	movdqu (INP), STATE
.if \enc
	pxor IV, STATE
	call _aesni_enc1
.else
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@
	pxor IV, STATE
	call _aesni_dec1
.endif
	pxor IV, STATE
	_aesni_gf128mul_x_ble

	test LEN, LEN
	jz .Lxts_out\@

.if \enc
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@
.endif

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_loop1\@

.Lxts_out\@:
	movdqu STATE, (OUTP)
	jmp .Lxts_ret_iv\@

.if \enc
.Lxts_cts4\@:
	movdqa STATE4, STATE
	sub $16, OUTP
.Lxts_cts1\@:
.else
.Lxts_cts1\@:
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
.endif
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1

	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE

.if \enc
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
.else
	pxor STATE4, STATE
	call _aesni_dec1
	pxor STATE4, STATE
.endif

	movups STATE, (OUTP)
	jmp .Lxts_ret\@
.endm

/*
 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
 *                    const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_enc)
	_aesni_xts_crypt 1
SYM_FUNC_END(aesni_xts_enc)

/*
 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
 *                    const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_dec)
	_aesni_xts_crypt 0
SYM_FUNC_END(aesni_xts_dec)
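
/*
 * The tweak update performed by _aesni_gf128mul_x_ble, as an illustrative C
 * sketch (not built; the function name and types here are hypothetical).  The
 * 128-bit tweak is treated as a little-endian polynomial over GF(2) and
 * multiplied by x, reducing modulo x^128 + x^7 + x^2 + x + 1 (the 0x87 in
 * .Lgf128mul_x_ble_mask):
 *
 *	static void xts_next_tweak(u64 t[2])
 *	{
 *		u64 carry = t[1] >> 63;		// bit shifted out of the top
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 *
 * _aesni_xts_crypt applies this once per 16-byte block, and each block is
 * processed as out = E(in ^ tweak) ^ tweak (decryption uses D instead of E).
 */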