# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL GCM implementation is organized in such way that its
# performance is rather close to the sum of its streamed components,
# in the context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on combination of Intel submissions,
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
# pressure with notable relative improvement, achieving 1.0 cycle per
# byte processed with 128-bit key on Haswell processor, 0.74 - on
# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
# measurements for favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
# and modified for ICP. Modifications are kept at a bare minimum to ease later
# upstream merges.
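
/*
 * Note on the C-side interface: both bulk entry points below follow the
 * System V AMD64 calling convention, so the caller is assumed to declare
 * them roughly as
 *
 *	size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
 *	    const void *key_sched, uint64_t *cb, uint64_t *ghash);
 *	size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
 *	    const void *key_sched, uint64_t *cb, uint64_t *ghash);
 *
 * i.e. %rdi = source, %rsi = destination, %rdx = byte count, %rcx =
 * expanded AES key schedule, %r8 = counter block, %r9 = GHASH state.
 * Each function returns the number of bytes it actually processed in
 * %rax (0 if the input is below its minimum bulk size). The parameter
 * names above are illustrative only.
 */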

#if defined(__x86_64__) && defined(HAVE_AVX) && \
    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

#define _ASM
#include <sys/asm_linkage.h>

/* Windows userland links with OpenSSL */
#if !defined (_WIN32) || defined (_KERNEL)

/* Apple needs _ */
#if defined (__APPLE__)
#define gcm_avx_can_use_movbe _gcm_avx_can_use_movbe
#endif

.extern gcm_avx_can_use_movbe

.text

#ifdef HAVE_MOVBE
/*
 * _aesni_ctr32_ghash_6x: stitched inner loop. Each iteration encrypts six
 * counter blocks with AES-NI and folds six 16-byte blocks of ciphertext
 * into the GHASH state with PCLMULQDQ, loading the ciphertext for GHASH
 * with MOVBE.
 */
.balign 32
FUNCTION(_aesni_ctr32_ghash_6x)
.cfi_startproc
	ENDBR
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x

.balign 32
.Loop6x:
	addl	$100663296,%ebx	// 6<<24: bump the (byte-swapped) low counter byte by 6.
	jc	.Lhandle_ctr32	// The low byte wrapped, redo the increment in 32 bits.
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	88(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	80(%r14),%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	72(%r14),%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	64(%r14),%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	56(%r14),%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	48(%r14),%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	40(%r14),%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	32(%r14),%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movbeq	24(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	16(%r14),%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movbeq	8(%r14),%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movbeq	0(%r14),%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail

/*
 * The low byte of the counter overflowed while stepping by six: byte-swap
 * the counter and recompute the six counter blocks with 32-bit additions.
 */
.balign 32
.Lhandle_ctr32:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32

.balign 32
.Lenc_tail:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x

.L6x_done:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	RET
.cfi_endproc
SET_SIZE(_aesni_ctr32_ghash_6x)
#endif /* ifdef HAVE_MOVBE */

/*
 * _aesni_ctr32_ghash_no_movbe_6x: same stitched loop as
 * _aesni_ctr32_ghash_6x, but loads the ciphertext for GHASH with
 * movq+bswapq instead of movbe, for CPUs without MOVBE support.
 */
.balign 32
FUNCTION(_aesni_ctr32_ghash_no_movbe_6x)
.cfi_startproc
	ENDBR
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x_nmb

.balign 32
.Loop6x_nmb:
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_nmb
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32_nmb:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	88(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	80(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	72(%r14),%r13
	bswapq	%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	64(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	56(%r14),%r13
	bswapq	%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	48(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	40(%r14),%r13
	bswapq	%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	32(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movq	24(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	16(%r14),%r12
	bswapq	%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movq	8(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movq	0(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail_nmb

.balign 32
.Lhandle_ctr32_nmb:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32_nmb

.balign 32
.Lenc_tail_nmb:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done_nmb

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x_nmb

.L6x_done_nmb:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	RET
.cfi_endproc
SET_SIZE(_aesni_ctr32_ghash_no_movbe_6x)

/*
 * aesni_gcm_decrypt: bulk CTR decryption stitched with GHASH over the
 * input ciphertext. Requires at least 96 (0x60) bytes of input and
 * returns the number of bytes processed in %rax.
 */
ENTRY_ALIGN(aesni_gcm_decrypt, 32)
.cfi_startproc
	ENDBR
	xorq	%r10,%r10
	cmpq	$0x60,%rdx
	jb	.Lgcm_dec_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	pushq	%r9
.cfi_offset	%r9,-64
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	vmovdqu	(%r9),%xmm8
	andq	$-128,%rsp
	vmovdqu	(%r11),%xmm0
	leaq	128(%rcx),%rcx
	movq	32(%r9),%r9
	leaq	32(%r9),%r9
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
	vpshufb	%xmm0,%xmm8,%xmm8

	// If the stack scratch area falls within 768 bytes above the key
	// schedule's offset inside a 4 KiB page, lower %rsp so the two do
	// not alias in the cache.
	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Ldec_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Ldec_no_key_aliasing
	subq	%r15,%rsp
.Ldec_no_key_aliasing:

	vmovdqu	80(%rdi),%xmm7
	leaq	(%rdi),%r14
	vmovdqu	64(%rdi),%xmm4
	leaq	-192(%rdi,%rdx,1),%r15
	vmovdqu	48(%rdi),%xmm5
	shrq	$4,%rdx
	xorq	%r10,%r10
	vmovdqu	32(%rdi),%xmm6
	vpshufb	%xmm0,%xmm7,%xmm7
	vmovdqu	16(%rdi),%xmm2
	vpshufb	%xmm0,%xmm4,%xmm4
	vmovdqu	(%rdi),%xmm3
	vpshufb	%xmm0,%xmm5,%xmm5
	vmovdqu	%xmm4,48(%rsp)
	vpshufb	%xmm0,%xmm6,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm2,%xmm2
	vmovdqu	%xmm6,80(%rsp)
	vpshufb	%xmm0,%xmm3,%xmm3
	vmovdqu	%xmm2,96(%rsp)
	vmovdqu	%xmm3,112(%rsp)

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovups	%xmm9,-96(%rsi)
	vmovups	%xmm10,-80(%rsi)
	vmovups	%xmm11,-64(%rsi)
	vmovups	%xmm12,-48(%rsi)
	vmovups	%xmm13,-32(%rsi)
	vmovups	%xmm14,-16(%rsi)

	vpshufb	(%r11),%xmm8,%xmm8
	movq	-56(%rax),%r9
.cfi_restore	%r9
	vmovdqu	%xmm8,(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_dec_abort:
	movq	%r10,%rax
	RET
.cfi_endproc
SET_SIZE(aesni_gcm_decrypt)

/*
 * _aesni_ctr32_6x: plain 6-block AES-CTR encryption (no GHASH).
 * aesni_gcm_encrypt calls it twice to produce the first twelve ciphertext
 * blocks before entering the stitched loop.
 */
.balign 32
FUNCTION(_aesni_ctr32_6x)
.cfi_startproc
	ENDBR
	vmovdqu	0-128(%rcx),%xmm4
	vmovdqu	32(%r11),%xmm2
	leaq	-2(%rbp),%r13	// ICP uses 10,12,14 not 9,11,13 for rounds.
	vmovups	16-128(%rcx),%xmm15
	leaq	32-128(%rcx),%r12
	vpxor	%xmm4,%xmm1,%xmm9
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_2
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm11,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm12,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32

.balign 16
.Loop_ctr32:
	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14
	vmovups	(%r12),%xmm15
	leaq	16(%r12),%r12
	decl	%r13d
	jnz	.Loop_ctr32

	vmovdqu	(%r12),%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	0(%rdi),%xmm3,%xmm4
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	16(%rdi),%xmm3,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	32(%rdi),%xmm3,%xmm6
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	48(%rdi),%xmm3,%xmm8
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	64(%rdi),%xmm3,%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	80(%rdi),%xmm3,%xmm3
	leaq	96(%rdi),%rdi

	vaesenclast	%xmm4,%xmm9,%xmm9
	vaesenclast	%xmm5,%xmm10,%xmm10
	vaesenclast	%xmm6,%xmm11,%xmm11
	vaesenclast	%xmm8,%xmm12,%xmm12
	vaesenclast	%xmm2,%xmm13,%xmm13
	vaesenclast	%xmm3,%xmm14,%xmm14
	vmovups	%xmm9,0(%rsi)
	vmovups	%xmm10,16(%rsi)
	vmovups	%xmm11,32(%rsi)
	vmovups	%xmm12,48(%rsi)
	vmovups	%xmm13,64(%rsi)
	vmovups	%xmm14,80(%rsi)
	leaq	96(%rsi),%rsi

	RET
.balign 32
.Lhandle_ctr32_2:
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpshufb	%xmm0,%xmm14,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32
.cfi_endproc
SET_SIZE(_aesni_ctr32_6x)

/*
 * aesni_gcm_encrypt: bulk CTR encryption stitched with GHASH over the
 * ciphertext it produces. Requires at least 288 bytes of input and
 * returns the number of bytes processed in %rax.
 */
ENTRY_ALIGN(aesni_gcm_encrypt, 32)
.cfi_startproc
	ENDBR
	xorq	%r10,%r10
	cmpq	$288,%rdx
	jb	.Lgcm_enc_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	pushq	%r9
.cfi_offset	%r9,-64
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	leaq	128(%rcx),%rcx
	vmovdqu	(%r11),%xmm0
	andq	$-128,%rsp
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Lenc_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Lenc_no_key_aliasing
	subq	%r15,%rsp
.Lenc_no_key_aliasing:

	leaq	(%rsi),%r14
	leaq	-192(%rsi,%rdx,1),%r15
	shrq	$4,%rdx

	call	_aesni_ctr32_6x
	vpshufb	%xmm0,%xmm9,%xmm8
	vpshufb	%xmm0,%xmm10,%xmm2
	vmovdqu	%xmm8,112(%rsp)
	vpshufb	%xmm0,%xmm11,%xmm4
	vmovdqu	%xmm2,96(%rsp)
	vpshufb	%xmm0,%xmm12,%xmm5
	vmovdqu	%xmm4,80(%rsp)
	vpshufb	%xmm0,%xmm13,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm14,%xmm7
	vmovdqu	%xmm6,48(%rsp)

	call	_aesni_ctr32_6x

	vmovdqu	(%r9),%xmm8
	movq	32(%r9),%r9
	leaq	32(%r9),%r9
	subq	$12,%rdx
	movq	$192,%r10
	vpshufb	%xmm0,%xmm8,%xmm8

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovdqu	32(%rsp),%xmm7
	vmovdqu	(%r11),%xmm0
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm7,%xmm7,%xmm1
	vmovdqu	32-32(%r9),%xmm15
	vmovups	%xmm9,-96(%rsi)
	vpshufb	%xmm0,%xmm9,%xmm9
	vpxor	%xmm7,%xmm1,%xmm1
	vmovups	%xmm10,-80(%rsi)
	vpshufb	%xmm0,%xmm10,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vpshufb	%xmm0,%xmm11,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vpshufb	%xmm0,%xmm12,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vpshufb	%xmm0,%xmm13,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vpshufb	%xmm0,%xmm14,%xmm14
	vmovdqu	%xmm9,16(%rsp)
	vmovdqu	48(%rsp),%xmm6
	vmovdqu	16-32(%r9),%xmm0
	vpunpckhqdq	%xmm6,%xmm6,%xmm2
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
	vpxor	%xmm6,%xmm2,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1

	vmovdqu	64(%rsp),%xmm9
	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm9,%xmm9,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
	vpxor	%xmm9,%xmm5,%xmm5
	vpxor	%xmm7,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vmovdqu	80(%rsp),%xmm1
	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm4,%xmm7,%xmm7
	vpunpckhqdq	%xmm1,%xmm1,%xmm4
	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpxor	%xmm6,%xmm9,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	96(%rsp),%xmm2
	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm7,%xmm6,%xmm6
	vpunpckhqdq	%xmm2,%xmm2,%xmm7
	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
	vpxor	%xmm2,%xmm7,%xmm7
	vpxor	%xmm9,%xmm1,%xmm1
	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm5,%xmm4,%xmm4

	vpxor	112(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
	vmovdqu	112-32(%r9),%xmm0
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpxor	%xmm6,%xmm5,%xmm5
	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm1,%xmm2,%xmm2
	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
	vpxor	%xmm4,%xmm7,%xmm4

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm1
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
	vpxor	%xmm14,%xmm1,%xmm1
	vpxor	%xmm5,%xmm6,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
	vmovdqu	32-32(%r9),%xmm15
	vpxor	%xmm2,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm6

	vmovdqu	16-32(%r9),%xmm0
	vpxor	%xmm5,%xmm7,%xmm9
	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
	vpxor	%xmm9,%xmm6,%xmm6
	vpunpckhqdq	%xmm13,%xmm13,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
	vpxor	%xmm13,%xmm2,%xmm2
	vpslldq	$8,%xmm6,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
	vpxor	%xmm9,%xmm5,%xmm8
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm6,%xmm7,%xmm7

	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm12,%xmm12,%xmm9
	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
	vpxor	%xmm12,%xmm9,%xmm9
	vpxor	%xmm14,%xmm13,%xmm13
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm11,%xmm11,%xmm1
	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
	vpxor	%xmm11,%xmm1,%xmm1
	vpxor	%xmm13,%xmm12,%xmm12
	vxorps	16(%rsp),%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm9,%xmm9

	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm10,%xmm10,%xmm2
	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
	vpxor	%xmm10,%xmm2,%xmm2
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpxor	%xmm12,%xmm11,%xmm11
	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm9,%xmm1,%xmm1

	vxorps	%xmm7,%xmm14,%xmm14
	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
	vmovdqu	112-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm11,%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
	vpxor	%xmm4,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
	vpxor	%xmm10,%xmm7,%xmm7
	vpxor	%xmm2,%xmm6,%xmm6

	vpxor	%xmm5,%xmm7,%xmm4
	vpxor	%xmm4,%xmm6,%xmm6
	vpslldq	$8,%xmm6,%xmm1
	vmovdqu	16(%r11),%xmm3
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm1,%xmm5,%xmm8
	vpxor	%xmm6,%xmm7,%xmm7

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm2,%xmm8,%xmm8

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm7,%xmm2,%xmm2
	vpxor	%xmm2,%xmm8,%xmm8
	vpshufb	(%r11),%xmm8,%xmm8
	movq	-56(%rax),%r9
.cfi_restore	%r9
	vmovdqu	%xmm8,(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_enc_abort:
	movq	%r10,%rax
	RET
.cfi_endproc
SET_SIZE(aesni_gcm_encrypt)

#endif /* !_WIN32 || _KERNEL */

/* Some utility routines */

/*
 * clear all fpu registers
 * void clear_fpu_regs_avx(void);
 */
ENTRY_ALIGN(clear_fpu_regs_avx, 32)
	vzeroall
	RET
SET_SIZE(clear_fpu_regs_avx)

/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_ALIGN(gcm_xor_avx, 32)
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm1
	pxor	%xmm1, %xmm0
	movdqu	%xmm0, (%rsi)
	RET
SET_SIZE(gcm_xor_avx)

/*
 * Toggle a boolean_t value atomically and return the new value.
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 */
ENTRY_ALIGN(atomic_toggle_boolean_nv, 32)
	xorl	%eax, %eax
	lock
	xorl	$1, (%rdi)
	jz	1f
	movl	$1, %eax
1:
	RET
SET_SIZE(atomic_toggle_boolean_nv)

SECTION_STATIC

.balign 64
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.balign 64

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */