// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section	.rodata, "a"
.align 16
/* The 16 ASCII bytes "expand 32-byte k", stored as one little-endian 128-bit value. */
CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
.text

/*
 * One add-xor-rotate step, applied to all four 32-bit lanes at once:
 *
 *	sum += addend
 *	mix  = rotl32(mix ^ sum, bits)
 *
 * SSE2 has no packed rotate, so the rotate is built from a left shift of a
 * copy plus a right shift of the original, OR-ed together. The copy lives in
 * `temp`, a register alias that must already be defined where the macro is
 * expanded.
 */
.macro chacha_step sum, addend, mix, bits
	paddd		\addend,\sum
	pxor		\sum,\mix
	movdqa		\mix,temp
	pslld		$\bits,temp
	psrld		$(32 - \bits),\mix
	por		temp,\mix
.endm

/*
 * Four chacha_step invocations with rotate counts 16, 12, 8, 7. This is one
 * full quarter-round on each of the four lanes of state0..state3.
 */
.macro chacha_quarter_rounds
	chacha_step	state0, state1, state3, 16
	chacha_step	state2, state3, state1, 12
	chacha_step	state0, state1, state3, 8
	chacha_step	state2, state3, state1, 7
.endm

/*
 * Minimal SSE2 ChaCha20 block function. Writes the requested number of
 * 64-byte keystream blocks using an all-zero nonce, a caller-supplied key and
 * an 8-byte block counter. Everything is held in registers; nothing is
 * spilled to the stack.
 *
 * Arguments:
 *
 *	rdi: output bytes
 *	rsi: 32-byte key input
 *	rdx: 8-byte counter input/output
 *	rcx: number of 64-byte blocks to write to output (must be positive; the
 *	     block loop tests the count only after producing a block)
 *
 * Modifies rax, rcx, rdi, the flags and xmm0-xmm9. On return the counter at
 * (rdx) has been advanced by the number of blocks written.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* Symbolic names for the argument registers and the round counter. */
.set	output,		%rdi
.set	key,		%rsi
.set	counter,	%rdx
.set	nblocks,	%rcx
.set	i,		%al
/* xmm registers are *not* callee-save. */
.set	temp,		%xmm0
/* Working state: one 4x32-bit row of the ChaCha matrix per register. */
.set	state0,		%xmm1
.set	state1,		%xmm2
.set	state2,		%xmm3
.set	state3,		%xmm4
/* Input block for the current counter value, kept for the final addition. */
.set	copy0,		%xmm5
.set	copy1,		%xmm6
.set	copy2,		%xmm7
.set	copy3,		%xmm8
/* 64-bit constant 1 in the low half, used to bump the block counter. */
.set	one,		%xmm9

	/* Row 0: the "expand 32-byte k" constant. */
	movaps		CONSTANTS(%rip),copy0
	/* Rows 1 and 2: the 32-byte key (no alignment assumed). */
	movups		0x00(key),copy1
	movups		0x10(key),copy2
	/* Row 3: counter in the low 64 bits, zero nonce in the high 64 bits. */
	movq		0x00(counter),copy3
	/* one = 1 || 0 */
	movq		$1,%rax
	movq		%rax,one

.Lblock:
	/* Start each block from a fresh copy of the input rows. */
	movdqa		copy0,state0
	movdqa		copy1,state1
	movdqa		copy2,state2
	movdqa		copy3,state3

	/* 10 iterations of (column round + diagonal round) = 20 rounds. */
	movb		$10,i
.Lpermute:
	/* Column round. */
	chacha_quarter_rounds

	/* Rotate rows 1..3 so the diagonals line up as columns. */
	/* state1[0,1,2,3] = state1[1,2,3,0] */
	pshufd		$0x39,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	pshufd		$0x93,state3,state3

	/* Diagonal round. */
	chacha_quarter_rounds

	/* Undo the row rotation. */
	/* state1[0,1,2,3] = state1[3,0,1,2] */
	pshufd		$0x93,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	pshufd		$0x39,state3,state3

	decb		i
	jnz		.Lpermute

	/* Feed-forward: output row N = permuted row N + input row N. */
	paddd		copy0,state0
	movups		state0,0x00(output)
	paddd		copy1,state1
	movups		state1,0x10(output)
	paddd		copy2,state2
	movups		state2,0x20(output)
	paddd		copy3,state3
	movups		state3,0x30(output)

	/* ++copy3.counter (64-bit add; the nonce half gets += 0) */
	paddq		one,copy3

	/* output += 64, --nblocks */
	addq		$64,output
	decq		nblocks
	jnz		.Lblock

	/* Hand the advanced counter back to the caller. */
	movq		copy3,0x00(counter)

	/*
	 * Scrub the registers that held key material or keystream, in case
	 * nothing overwrites them afterwards.
	 */
.irp	reg, state0, state1, state2, state3, copy1, copy2, temp
	pxor		\reg,\reg
.endr

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)