/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round. 8/16-bit word rotation is
 * done with the slightly better-performing SSSE3 byte shuffling; 7/12-bit word
 * rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
SYM_FUNC_START_LOCAL(chacha_permute)

	movdqa ROT8(%rip),%xmm4
	movdqa ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm6
	pslld $12,%xmm6
	psrld $20,%xmm1
	por %xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm7
	pslld $7,%xmm7
	psrld $25,%xmm1
	por %xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd $0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd $0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd $0x93,%xmm3,%xmm3

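	# The four quarter-rounds above act on the matrix columns in parallel;
	# the pshufd word rotations just above line the diagonals up in the
	# same lanes, so the identical quarter-rounds below act on the
	# diagonals (the pshufds at the end of the loop rotate the words
	# back). As a rough C-style reference (a sketch, not taken from this
	# file; rol32() is the usual 32-bit rotate-left), each quarter-round
	# on words (a, b, c, d) is:
	#
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d, 8);
	#	c += d; b ^= c; b = rol32(b, 7);
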
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm6
	pslld $12,%xmm6
	psrld $20,%xmm1
	por %xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm7
	pslld $7,%xmm7
	psrld $25,%xmm1
	por %xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd $0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd $0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd $0x39,%xmm3,%xmm3

	sub $2,%r8d
	jnz .Ldoubleround

	RET
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu 0x00(%rdi),%xmm0
	movdqu 0x10(%rdi),%xmm1
	movdqu 0x20(%rdi),%xmm2
	movdqu 0x30(%rdi),%xmm3
	movdqa %xmm0,%xmm8
	movdqa %xmm1,%xmm9
	movdqa %xmm2,%xmm10
	movdqa %xmm3,%xmm11

	mov %rcx,%rax
	call chacha_permute

	# o0 = i0 ^ (x0 + s0)
	paddd %xmm8,%xmm0
	cmp $0x10,%rax
	jl .Lxorpart
	movdqu 0x00(%rdx),%xmm4
	pxor %xmm4,%xmm0
	movdqu %xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd %xmm9,%xmm1
	movdqa %xmm1,%xmm0
	cmp $0x20,%rax
	jl .Lxorpart
	movdqu 0x10(%rdx),%xmm0
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd %xmm10,%xmm2
	movdqa %xmm2,%xmm0
	cmp $0x30,%rax
	jl .Lxorpart
	movdqu 0x20(%rdx),%xmm0
	pxor %xmm2,%xmm0
	movdqu %xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd %xmm11,%xmm3
	movdqa %xmm3,%xmm0
	cmp $0x40,%rax
	jl .Lxorpart
	movdqu 0x30(%rdx),%xmm0
	pxor %xmm3,%xmm0
	movdqu %xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	RET

.Lxorpart:
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone
	and $~0x0f,%rax

	mov %rsi,%r11

	lea 8(%rsp),%r10
	sub $0x10,%rsp
	and $~31,%rsp

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	pxor 0x00(%rsp),%xmm0
	movdqa %xmm0,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	lea -8(%r10),%rsp
	jmp .Ldone

SYM_FUNC_END(chacha_block_xor_ssse3)

SYM_FUNC_START(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
	FRAME_BEGIN

	movdqu 0x00(%rdi),%xmm0
	movdqu 0x10(%rdi),%xmm1
	movdqu 0x20(%rdi),%xmm2
	movdqu 0x30(%rdi),%xmm3

	mov %edx,%r8d
	call chacha_permute

	movdqu %xmm0,0x00(%rsi)
	movdqu %xmm3,0x10(%rsi)

	FRAME_END
	RET
SYM_FUNC_END(hchacha_block_ssse3)

SYM_FUNC_START(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading the
	# state matrix in SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XOR
	# step we transpose the matrix by interleaving 32- and then 64-bit
	# words, which allows us to do XOR in SSE registers. 8/16-bit word
	# rotation is done with the slightly better-performing SSSE3 byte
	# shuffling; 7/12-bit word rotation uses traditional shift+OR.

	lea 8(%rsp),%r10
	sub $0x80,%rsp
	and $~63,%rsp
	mov %rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	movq 0x00(%rdi),%xmm1
	pshufd $0x00,%xmm1,%xmm0
	pshufd $0x55,%xmm1,%xmm1
	movq 0x08(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	movq 0x10(%rdi),%xmm5
	pshufd $0x00,%xmm5,%xmm4
	pshufd $0x55,%xmm5,%xmm5
	movq 0x18(%rdi),%xmm7
	pshufd $0x00,%xmm7,%xmm6
	pshufd $0x55,%xmm7,%xmm7
	movq 0x20(%rdi),%xmm9
	pshufd $0x00,%xmm9,%xmm8
	pshufd $0x55,%xmm9,%xmm9
	movq 0x28(%rdi),%xmm11
	pshufd $0x00,%xmm11,%xmm10
	pshufd $0x55,%xmm11,%xmm11
	movq 0x30(%rdi),%xmm13
	pshufd $0x00,%xmm13,%xmm12
	pshufd $0x55,%xmm13,%xmm13
	movq 0x38(%rdi),%xmm15
	pshufd $0x00,%xmm15,%xmm14
	pshufd $0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa %xmm0,0x00(%rsp)
	movdqa %xmm1,0x10(%rsp)
	movdqa %xmm2,0x20(%rsp)
	movdqa %xmm3,0x30(%rsp)

	movdqa CTRINC(%rip),%xmm1
	movdqa ROT8(%rip),%xmm2
	movdqa ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd %xmm1,%xmm12

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm4
	por %xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm5
	por %xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm6
	por %xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm7
	por %xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm4
	por %xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm5
	por %xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm6
	por %xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm7
	por %xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd %xmm15,%xmm10
	pxor %xmm10,%xmm5
	movdqa %xmm5,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm5
	por %xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd %xmm12,%xmm11
	pxor %xmm11,%xmm6
	movdqa %xmm6,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm6
	por %xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd %xmm13,%xmm8
	pxor %xmm8,%xmm7
	movdqa %xmm7,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm7
	por %xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd %xmm14,%xmm9
	pxor %xmm9,%xmm4
	movdqa %xmm4,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm4
	por %xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd %xmm15,%xmm10
	pxor %xmm10,%xmm5
	movdqa %xmm5,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm5
	por %xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd %xmm12,%xmm11
	pxor %xmm11,%xmm6
	movdqa %xmm6,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm6
	por %xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd %xmm13,%xmm8
	pxor %xmm8,%xmm7
	movdqa %xmm7,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm7
	por %xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd %xmm14,%xmm9
	pxor %xmm9,%xmm4
	movdqa %xmm4,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm4
	por %xmm0,%xmm4

	sub $2,%r8d
	jnz .Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq 0x00(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd 0x00(%rsp),%xmm2
	movdqa %xmm2,0x00(%rsp)
	paddd 0x10(%rsp),%xmm3
	movdqa %xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq 0x08(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd 0x20(%rsp),%xmm2
	movdqa %xmm2,0x20(%rsp)
	paddd 0x30(%rsp),%xmm3
	movdqa %xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq 0x10(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq 0x18(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm6
	paddd %xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq 0x20(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm8
	paddd %xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq 0x28(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm10
	paddd %xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq 0x30(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm12
	paddd %xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq 0x38(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm14
	paddd %xmm3,%xmm15

	# x12 += counter values 0-3
	paddd %xmm1,%xmm12

	# interleave 32-bit words in state n, n+1
	movdqa 0x00(%rsp),%xmm0
	movdqa 0x10(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpckldq %xmm1,%xmm2
	punpckhdq %xmm1,%xmm0
	movdqa %xmm2,0x00(%rsp)
	movdqa %xmm0,0x10(%rsp)
	movdqa 0x20(%rsp),%xmm0
	movdqa 0x30(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpckldq %xmm1,%xmm2
	punpckhdq %xmm1,%xmm0
	movdqa %xmm2,0x20(%rsp)
	movdqa %xmm0,0x30(%rsp)
	movdqa %xmm4,%xmm0
	punpckldq %xmm5,%xmm4
	punpckhdq %xmm5,%xmm0
	movdqa %xmm0,%xmm5
	movdqa %xmm6,%xmm0
	punpckldq %xmm7,%xmm6
	punpckhdq %xmm7,%xmm0
	movdqa %xmm0,%xmm7
	movdqa %xmm8,%xmm0
	punpckldq %xmm9,%xmm8
	punpckhdq %xmm9,%xmm0
	movdqa %xmm0,%xmm9
	movdqa %xmm10,%xmm0
	punpckldq %xmm11,%xmm10
	punpckhdq %xmm11,%xmm0
	movdqa %xmm0,%xmm11
	movdqa %xmm12,%xmm0
	punpckldq %xmm13,%xmm12
	punpckhdq %xmm13,%xmm0
	movdqa %xmm0,%xmm13
	movdqa %xmm14,%xmm0
	punpckldq %xmm15,%xmm14
	punpckhdq %xmm15,%xmm0
	movdqa %xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa 0x00(%rsp),%xmm0
	movdqa 0x20(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpcklqdq %xmm1,%xmm2
	punpckhqdq %xmm1,%xmm0
	movdqa %xmm2,0x00(%rsp)
	movdqa %xmm0,0x20(%rsp)
	movdqa 0x10(%rsp),%xmm0
	movdqa 0x30(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpcklqdq %xmm1,%xmm2
	punpckhqdq %xmm1,%xmm0
	movdqa %xmm2,0x10(%rsp)
	movdqa %xmm0,0x30(%rsp)
	movdqa %xmm4,%xmm0
	punpcklqdq %xmm6,%xmm4
	punpckhqdq %xmm6,%xmm0
	movdqa %xmm0,%xmm6
	movdqa %xmm5,%xmm0
	punpcklqdq %xmm7,%xmm5
	punpckhqdq %xmm7,%xmm0
	movdqa %xmm0,%xmm7
	movdqa %xmm8,%xmm0
	punpcklqdq %xmm10,%xmm8
	punpckhqdq %xmm10,%xmm0
	movdqa %xmm0,%xmm10
	movdqa %xmm9,%xmm0
	punpcklqdq %xmm11,%xmm9
	punpckhqdq %xmm11,%xmm0
	movdqa %xmm0,%xmm11
	movdqa %xmm12,%xmm0
	punpcklqdq %xmm14,%xmm12
	punpckhqdq %xmm14,%xmm0
	movdqa %xmm0,%xmm14
	movdqa %xmm13,%xmm0
	punpcklqdq %xmm15,%xmm13
	punpckhqdq %xmm15,%xmm0
	movdqa %xmm0,%xmm15

	# xor with corresponding input, write to output
	movdqa 0x00(%rsp),%xmm0
	cmp $0x10,%rax
	jl .Lxorpart4
	movdqu 0x00(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x00(%rsi)

	movdqu %xmm4,%xmm0
	cmp $0x20,%rax
	jl .Lxorpart4
	movdqu 0x10(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x10(%rsi)

	movdqu %xmm8,%xmm0
	cmp $0x30,%rax
	jl .Lxorpart4
	movdqu 0x20(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x20(%rsi)

	movdqu %xmm12,%xmm0
	cmp $0x40,%rax
	jl .Lxorpart4
	movdqu 0x30(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x30(%rsi)

	movdqa 0x20(%rsp),%xmm0
	cmp $0x50,%rax
	jl .Lxorpart4
	movdqu 0x40(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x40(%rsi)

	movdqu %xmm6,%xmm0
	cmp $0x60,%rax
	jl .Lxorpart4
	movdqu 0x50(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x50(%rsi)

	movdqu %xmm10,%xmm0
	cmp $0x70,%rax
	jl .Lxorpart4
	movdqu 0x60(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x60(%rsi)

	movdqu %xmm14,%xmm0
	cmp $0x80,%rax
	jl .Lxorpart4
	movdqu 0x70(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x70(%rsi)

	movdqa 0x10(%rsp),%xmm0
	cmp $0x90,%rax
	jl .Lxorpart4
	movdqu 0x80(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x80(%rsi)

	movdqu %xmm5,%xmm0
	cmp $0xa0,%rax
	jl .Lxorpart4
	movdqu 0x90(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x90(%rsi)

	movdqu %xmm9,%xmm0
	cmp $0xb0,%rax
	jl .Lxorpart4
	movdqu 0xa0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xa0(%rsi)

	movdqu %xmm13,%xmm0
	cmp $0xc0,%rax
	jl .Lxorpart4
	movdqu 0xb0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xb0(%rsi)

	movdqa 0x30(%rsp),%xmm0
	cmp $0xd0,%rax
	jl .Lxorpart4
	movdqu 0xc0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xc0(%rsi)

	movdqu %xmm7,%xmm0
	cmp $0xe0,%rax
	jl .Lxorpart4
	movdqu 0xd0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xd0(%rsi)

	movdqu %xmm11,%xmm0
	cmp $0xf0,%rax
	jl .Lxorpart4
	movdqu 0xe0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xe0(%rsi)

	movdqu %xmm15,%xmm0
	cmp $0x100,%rax
	jl .Lxorpart4
	movdqu 0xf0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xf0(%rsi)

.Ldone4:
	lea -8(%r10),%rsp
	RET

.Lxorpart4:
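	# Partial-block tail: on entry %rax holds the total length and %xmm0
	# already holds the keystream words covering the final, partial
	# 16-byte chunk (it was loaded by whichever cmp/jl pair above branched
	# here). The code below rounds %rax down to the last full chunk,
	# copies the remaining input bytes to the scratch area at (%rsp),
	# XORs the keystream into them there, and copies the result back to
	# the output.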
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone4
	and $~0x0f,%rax

	mov %rsi,%r11

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	pxor 0x00(%rsp),%xmm0
	movdqa %xmm0,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_ssse3)
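/*
 * Rough C prototypes implied by the register assignments documented in the
 * functions above, going by the x86-64 SysV calling convention (a sketch for
 * reference only; the authoritative declarations live in the kernel's ChaCha
 * glue code, and the exact parameter types there may differ):
 *
 *	void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 *				    unsigned int len, int nrounds);
 *	void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
 *	void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 *				     unsigned int len, int nrounds);
 */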