/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
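	# For orientation, a rough scalar sketch of the quarter-round that
	# the .Ldoubleround loop below applies to all four columns (and then
	# all four diagonals) of both blocks at once. This is illustrative
	# only; rol32() stands for an assumed 32-bit left-rotate helper and
	# is not defined in this file:
	#
	#	a += b; d = rol32(d ^ a, 16);
	#	c += d; b = rol32(b ^ c, 12);
	#	a += b; d = rol32(d ^ a,  8);
	#	c += d; b = rol32(b ^ c,  7);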

	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa	%ymm0,%ymm8
	vmovdqa	%ymm1,%ymm9
	vmovdqa	%ymm2,%ymm10
	vmovdqa	%ymm3,%ymm11

	vmovdqa	ROT8(%rip),%ymm4
	vmovdqa	ROT16(%rip),%ymm5

	mov	%rcx,%rax

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vmovdqa	%ymm1,%ymm6
	vpslld	$12,%ymm6,%ymm6
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vmovdqa	%ymm1,%ymm7
	vpslld	$7,%ymm7,%ymm7
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vmovdqa	%ymm1,%ymm6
	vpslld	$12,%ymm6,%ymm6
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vmovdqa	%ymm1,%ymm7
	vpslld	$7,%ymm7,%ymm7
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3

	sub	$2,%r8d
	jnz	.Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	vpaddd	%ymm8,%ymm0,%ymm7
	cmp	$0x10,%rax
	jl	.Lxorpart2
	vpxor	0x00(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd	%ymm9,%ymm1,%ymm7
	cmp	$0x20,%rax
	jl	.Lxorpart2
	vpxor	0x10(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd	%ymm10,%ymm2,%ymm7
	cmp	$0x30,%rax
	jl	.Lxorpart2
	vpxor	0x20(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd	%ymm11,%ymm3,%ymm7
	cmp	$0x40,%rax
	jl	.Lxorpart2
	vpxor	0x30(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm7
	cmp	$0x50,%rax
	jl	.Lxorpart2
	vpxor	0x40(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x40(%rsi)

	vmovdqa	%xmm1,%xmm7
	cmp	$0x60,%rax
	jl	.Lxorpart2
	vpxor	0x50(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x50(%rsi)

	vmovdqa	%xmm2,%xmm7
	cmp	$0x70,%rax
	jl	.Lxorpart2
	vpxor	0x60(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x60(%rsi)

	vmovdqa	%xmm3,%xmm7
	cmp	$0x80,%rax
	jl	.Lxorpart2
	vpxor	0x70(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov	%rax,%r9
	and	$0x0f,%r9
	jz	.Ldone2
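	# Round the length down to the last full 16-byte boundary, then
	# bounce the tail through a 32-byte-aligned stack slot: copy the
	# remaining input bytes in, XOR them with the keystream chunk still
	# held in %xmm7, and copy the result back out to the output buffer.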
	and	$~0x0f,%rax

	mov	%rsi,%r11

	lea	8(%rsp),%r10
	sub	$0x10,%rsp
	and	$~31,%rsp

	lea	(%rdx,%rax),%rsi
	mov	%rsp,%rdi
	mov	%r9,%rcx
	rep movsb

	vpxor	0x00(%rsp),%xmm7,%xmm7
	vmovdqa	%xmm7,0x00(%rsp)

	mov	%rsp,%rsi
	lea	(%r11,%rax),%rdi
	mov	%r9,%rcx
	rep movsb

	lea	-8(%r10),%rsp
	jmp	.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

SYM_FUNC_START(chacha_4block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, and
	# sequentially with respect to the operations on the four words of
	# the other two matrices. Because the required word shuffling has
	# rather high latency, we can do the arithmetic on two matrix-pairs
	# without much slowdown.
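	# A hedged sketch of the C-side prototype these helpers are
	# presumably declared with (the exact declaration lives in the glue
	# code, not in this file; the types shown are assumptions):
	#
	#	void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
	#				    unsigned int len, int nrounds);
	#
	# i.e. generate up to four 64-byte keystream blocks from *state and
	# XOR them over src into dst, stopping after len bytes.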

	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa	%ymm0,%ymm4
	vmovdqa	%ymm1,%ymm5
	vmovdqa	%ymm2,%ymm6
	vmovdqa	%ymm3,%ymm7

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3
	vpaddd	CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa	%ymm0,%ymm11
	vmovdqa	%ymm1,%ymm12
	vmovdqa	%ymm2,%ymm13
	vmovdqa	%ymm3,%ymm14
	vmovdqa	%ymm7,%ymm15

	vmovdqa	ROT8(%rip),%ymm8
	vmovdqa	ROT16(%rip),%ymm9

	mov	%rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm9,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxor	%ymm4,%ymm7,%ymm7
	vpshufb	%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vmovdqa	%ymm1,%ymm10
	vpslld	$12,%ymm10,%ymm10
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm10,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxor	%ymm6,%ymm5,%ymm5
	vmovdqa	%ymm5,%ymm10
	vpslld	$12,%ymm10,%ymm10
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxor	%ymm4,%ymm7,%ymm7
	vpshufb	%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vmovdqa	%ymm1,%ymm10
	vpslld	$7,%ymm10,%ymm10
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm10,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxor	%ymm6,%ymm5,%ymm5
	vmovdqa	%ymm5,%ymm10
	vpslld	$7,%ymm10,%ymm10
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm10,%ymm5,%ymm5

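	# Rotate the words of rows 1-3 in both matrix pairs so that the next
	# four quarter-rounds operate on the diagonals instead of the columns.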
	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	vpshufd	$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3
	vpshufd	$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm9,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxor	%ymm4,%ymm7,%ymm7
	vpshufb	%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vmovdqa	%ymm1,%ymm10
	vpslld	$12,%ymm10,%ymm10
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm10,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxor	%ymm6,%ymm5,%ymm5
	vmovdqa	%ymm5,%ymm10
	vpslld	$12,%ymm10,%ymm10
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxor	%ymm4,%ymm7,%ymm7
	vpshufb	%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vmovdqa	%ymm1,%ymm10
	vpslld	$7,%ymm10,%ymm10
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm10,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxor	%ymm6,%ymm5,%ymm5
	vmovdqa	%ymm5,%ymm10
	vpslld	$7,%ymm10,%ymm10
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	vpshufd	$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3
	vpshufd	$0x39,%ymm7,%ymm7

	sub	$2,%r8d
	jnz	.Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd	%ymm11,%ymm0,%ymm10
	cmp	$0x10,%rax
	jl	.Lxorpart4
	vpxor	0x00(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd	%ymm12,%ymm1,%ymm10
	cmp	$0x20,%rax
	jl	.Lxorpart4
	vpxor	0x10(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd	%ymm13,%ymm2,%ymm10
	cmp	$0x30,%rax
	jl	.Lxorpart4
	vpxor	0x20(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd	%ymm14,%ymm3,%ymm10
	cmp	$0x40,%rax
	jl	.Lxorpart4
	vpxor	0x30(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm10
	cmp	$0x50,%rax
	jl	.Lxorpart4
	vpxor	0x40(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x40(%rsi)

	vmovdqa	%xmm1,%xmm10
	cmp	$0x60,%rax
	jl	.Lxorpart4
	vpxor	0x50(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x50(%rsi)

	vmovdqa	%xmm2,%xmm10
	cmp	$0x70,%rax
	jl	.Lxorpart4
	vpxor	0x60(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x60(%rsi)

	vmovdqa	%xmm3,%xmm10
	cmp	$0x80,%rax
	jl	.Lxorpart4
	vpxor	0x70(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd	%ymm11,%ymm4,%ymm10
	cmp	$0x90,%rax
	jl	.Lxorpart4
	vpxor	0x80(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd	%ymm12,%ymm5,%ymm10
	cmp	$0xa0,%rax
	jl	.Lxorpart4
	vpxor	0x90(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd	%ymm13,%ymm6,%ymm10
	cmp	$0xb0,%rax
	jl	.Lxorpart4
	vpxor	0xa0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd	%ymm15,%ymm7,%ymm10
	cmp	$0xc0,%rax
	jl	.Lxorpart4
	vpxor	0xb0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa	%xmm4,%xmm10
	cmp	$0xd0,%rax
	jl	.Lxorpart4
	vpxor	0xc0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xc0(%rsi)

	vmovdqa	%xmm5,%xmm10
	cmp	$0xe0,%rax
	jl	.Lxorpart4
	vpxor	0xd0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xd0(%rsi)

	vmovdqa	%xmm6,%xmm10
	cmp	$0xf0,%rax
	jl	.Lxorpart4
	vpxor	0xe0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xe0(%rsi)

	vmovdqa	%xmm7,%xmm10
	cmp	$0x100,%rax
	jl	.Lxorpart4
	vpxor	0xf0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov	%rax,%r9
	and	$0x0f,%r9
	jz	.Ldone4
	and	$~0x0f,%rax

	mov	%rsi,%r11

	lea	8(%rsp),%r10
	sub	$0x10,%rsp
	and	$~31,%rsp

	lea	(%rdx,%rax),%rsi
	mov	%rsp,%rdi
	mov	%r9,%rcx
	rep movsb

	vpxor	0x00(%rsp),%xmm10,%xmm10
	vmovdqa	%xmm10,0x00(%rsp)

	mov	%rsp,%rsi
	lea	(%r11,%rax),%rdi
	mov	%r9,%rcx
	rep movsb

	lea	-8(%r10),%rsp
	jmp	.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, and hence requires no word shuffling. For the
	# final XOR step we transpose the matrix by interleaving 32-, 64- and
	# then 128-bit words, which allows us to do the XOR in AVX registers.
	# The 8- and 16-bit word rotations are done with the slightly better
	# performing byte shuffling; the 7- and 12-bit rotations use the
	# traditional shift+OR.

	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	lea	8(%rsp),%r10
	and	$~31, %rsp
	sub	$0x80, %rsp
	mov	%rcx,%rax

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa	%ymm0,0x00(%rsp)
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	%ymm3,0x60(%rsp)

	vmovdqa	CTRINC(%rip),%ymm1
	vmovdqa	ROT8(%rip),%ymm2
	vmovdqa	ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd	%ymm1,%ymm12,%ymm12

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd	0x00(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd	0x20(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd	0x40(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd	0x60(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxor	%ymm8,%ymm4,%ymm4
	vpslld	$12,%ymm4,%ymm0
	vpsrld	$20,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxor	%ymm9,%ymm5,%ymm5
	vpslld	$12,%ymm5,%ymm0
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxor	%ymm10,%ymm6,%ymm6
	vpslld	$12,%ymm6,%ymm0
	vpsrld	$20,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm11,%ymm7,%ymm7
	vpslld	$12,%ymm7,%ymm0
	vpsrld	$20,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd	0x00(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd	0x20(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd	0x40(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd	0x60(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxor	%ymm8,%ymm4,%ymm4
	vpslld	$7,%ymm4,%ymm0
	vpsrld	$25,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxor	%ymm9,%ymm5,%ymm5
	vpslld	$7,%ymm5,%ymm0
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxor	%ymm10,%ymm6,%ymm6
	vpslld	$7,%ymm6,%ymm0
	vpsrld	$25,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm11,%ymm7,%ymm7
	vpslld	$7,%ymm7,%ymm0
	vpsrld	$25,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd	0x00(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd	0x20(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd	0x40(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd	0x60(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxor	%ymm10,%ymm5,%ymm5
	vpslld	$12,%ymm5,%ymm0
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxor	%ymm11,%ymm6,%ymm6
	vpslld	$12,%ymm6,%ymm0
	vpsrld	$20,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxor	%ymm8,%ymm7,%ymm7
	vpslld	$12,%ymm7,%ymm0
	vpsrld	$20,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm9,%ymm4,%ymm4
	vpslld	$12,%ymm4,%ymm0
	vpsrld	$20,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd	0x00(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd	0x20(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd	0x40(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd	0x60(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxor	%ymm10,%ymm5,%ymm5
	vpslld	$7,%ymm5,%ymm0
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxor	%ymm11,%ymm6,%ymm6
	vpslld	$7,%ymm6,%ymm0
	vpsrld	$25,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxor	%ymm8,%ymm7,%ymm7
	vpslld	$7,%ymm7,%ymm0
	vpsrld	$25,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm9,%ymm4,%ymm4
	vpslld	$7,%ymm4,%ymm0
	vpsrld	$25,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4

	sub	$2,%r8d
	jnz	.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpaddd	0x00(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpbroadcastd	0x04(%rdi),%ymm0
	vpaddd	0x20(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpbroadcastd	0x08(%rdi),%ymm0
	vpaddd	0x40(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpbroadcastd	0x0c(%rdi),%ymm0
	vpaddd	0x60(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpbroadcastd	0x10(%rdi),%ymm0
	vpaddd	%ymm0,%ymm4,%ymm4
	vpbroadcastd	0x14(%rdi),%ymm0
	vpaddd	%ymm0,%ymm5,%ymm5
	vpbroadcastd	0x18(%rdi),%ymm0
	vpaddd	%ymm0,%ymm6,%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm7,%ymm7
	vpbroadcastd	0x20(%rdi),%ymm0
	vpaddd	%ymm0,%ymm8,%ymm8
	vpbroadcastd	0x24(%rdi),%ymm0
	vpaddd	%ymm0,%ymm9,%ymm9
	vpbroadcastd	0x28(%rdi),%ymm0
	vpaddd	%ymm0,%ymm10,%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm11,%ymm11
	vpbroadcastd	0x30(%rdi),%ymm0
	vpaddd	%ymm0,%ymm12,%ymm12
	vpbroadcastd	0x34(%rdi),%ymm0
	vpaddd	%ymm0,%ymm13,%ymm13
	vpbroadcastd	0x38(%rdi),%ymm0
	vpaddd	%ymm0,%ymm14,%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd	%ymm1,%ymm12,%ymm12

	# interleave 32-bit words in state n, n+1
	vmovdqa	0x00(%rsp),%ymm0
	vmovdqa	0x20(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa	%ymm2,0x00(%rsp)
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	0x40(%rsp),%ymm0
	vmovdqa	0x60(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	%ymm1,0x60(%rsp)
	vmovdqa	%ymm4,%ymm0
	vpunpckldq	%ymm5,%ymm0,%ymm4
	vpunpckhdq	%ymm5,%ymm0,%ymm5
	vmovdqa	%ymm6,%ymm0
	vpunpckldq	%ymm7,%ymm0,%ymm6
	vpunpckhdq	%ymm7,%ymm0,%ymm7
	vmovdqa	%ymm8,%ymm0
	vpunpckldq	%ymm9,%ymm0,%ymm8
	vpunpckhdq	%ymm9,%ymm0,%ymm9
	vmovdqa	%ymm10,%ymm0
	vpunpckldq	%ymm11,%ymm0,%ymm10
	vpunpckhdq	%ymm11,%ymm0,%ymm11
	vmovdqa	%ymm12,%ymm0
	vpunpckldq	%ymm13,%ymm0,%ymm12
	vpunpckhdq	%ymm13,%ymm0,%ymm13
	vmovdqa	%ymm14,%ymm0
	vpunpckldq	%ymm15,%ymm0,%ymm14
	vpunpckhdq	%ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa	0x00(%rsp),%ymm0
	vmovdqa	0x40(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa	%ymm1,0x00(%rsp)
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	0x20(%rsp),%ymm0
	vmovdqa	0x60(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	%ymm2,0x60(%rsp)
	vmovdqa	%ymm4,%ymm0
	vpunpcklqdq	%ymm6,%ymm0,%ymm4
	vpunpckhqdq	%ymm6,%ymm0,%ymm6
	vmovdqa	%ymm5,%ymm0
	vpunpcklqdq	%ymm7,%ymm0,%ymm5
	vpunpckhqdq	%ymm7,%ymm0,%ymm7
	vmovdqa	%ymm8,%ymm0
	vpunpcklqdq	%ymm10,%ymm0,%ymm8
	vpunpckhqdq	%ymm10,%ymm0,%ymm10
	vmovdqa	%ymm9,%ymm0
	vpunpcklqdq	%ymm11,%ymm0,%ymm9
	vpunpckhqdq	%ymm11,%ymm0,%ymm11
	vmovdqa	%ymm12,%ymm0
	vpunpcklqdq	%ymm14,%ymm0,%ymm12
	vpunpckhqdq	%ymm14,%ymm0,%ymm14
	vmovdqa	%ymm13,%ymm0
	vpunpcklqdq	%ymm15,%ymm0,%ymm13
	vpunpckhqdq	%ymm15,%ymm0,%ymm15
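	# Together with the 128-bit interleave below, this completes the
	# transpose: afterwards each ymm register (or stack slot) holds one
	# contiguous 32-byte half of a single output block, ready to be
	# XORed against the input 32 bytes at a time.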

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa	0x00(%rsp),%ymm1
	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
	cmp	$0x0020,%rax
	jl	.Lxorpart8
	vpxor	0x0000(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0000(%rsi)
	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp	$0x0040,%rax
	jl	.Lxorpart8
	vpxor	0x0020(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vmovdqa	0x40(%rsp),%ymm1
	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
	cmp	$0x0060,%rax
	jl	.Lxorpart8
	vpxor	0x0040(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp	$0x0080,%rax
	jl	.Lxorpart8
	vpxor	0x0060(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vmovdqa	0x20(%rsp),%ymm1
	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp	$0x00a0,%rax
	jl	.Lxorpart8
	vpxor	0x0080(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp	$0x00c0,%rax
	jl	.Lxorpart8
	vpxor	0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vmovdqa	0x60(%rsp),%ymm1
	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
	cmp	$0x00e0,%rax
	jl	.Lxorpart8
	vpxor	0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp	$0x0100,%rax
	jl	.Lxorpart8
	vpxor	0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa	%ymm4,%ymm0
	cmp	$0x0120,%rax
	jl	.Lxorpart8
	vpxor	0x0100(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0100(%rsi)

	vmovdqa	%ymm12,%ymm0
	cmp	$0x0140,%rax
	jl	.Lxorpart8
	vpxor	0x0120(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0120(%rsi)

	vmovdqa	%ymm6,%ymm0
	cmp	$0x0160,%rax
	jl	.Lxorpart8
	vpxor	0x0140(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0140(%rsi)

	vmovdqa	%ymm14,%ymm0
	cmp	$0x0180,%rax
	jl	.Lxorpart8
	vpxor	0x0160(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0160(%rsi)

	vmovdqa	%ymm5,%ymm0
	cmp	$0x01a0,%rax
	jl	.Lxorpart8
	vpxor	0x0180(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0180(%rsi)

	vmovdqa	%ymm13,%ymm0
	cmp	$0x01c0,%rax
	jl	.Lxorpart8
	vpxor	0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x01a0(%rsi)

	vmovdqa	%ymm7,%ymm0
	cmp	$0x01e0,%rax
	jl	.Lxorpart8
	vpxor	0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x01c0(%rsi)

	vmovdqa	%ymm15,%ymm0
	cmp	$0x0200,%rax
	jl	.Lxorpart8
	vpxor	0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	lea	-8(%r10),%rsp
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
	mov	%rax,%r9
	and	$0x1f,%r9
	jz	.Ldone8
	and	$~0x1f,%rax

	mov	%rsi,%r11

	lea	(%rdx,%rax),%rsi
	mov	%rsp,%rdi
	mov	%r9,%rcx
	rep movsb

	vpxor	0x00(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)

	mov	%rsp,%rsi
	lea	(%r11,%rax),%rdi
	mov	%r9,%rcx
	rep movsb

	jmp	.Ldone8

SYM_FUNC_END(chacha_8block_xor_avx2)
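
# A hedged sketch of how caller-side glue code might dispatch between the
# helpers above based on the remaining length; the names CHACHA_BLOCK_SIZE,
# state, dst, src and nrounds are illustrative assumptions, and counter and
# length bookkeeping between calls is omitted:
#
#	if (len > CHACHA_BLOCK_SIZE * 4)
#		chacha_8block_xor_avx2(state, dst, src, len, nrounds);
#	else if (len > CHACHA_BLOCK_SIZE * 2)
#		chacha_4block_xor_avx2(state, dst, src, len, nrounds);
#	else
#		chacha_2block_xor_avx2(state, dst, src, len, nrounds);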