// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section	.rodata, "a"
.align 16
/* Read as little-endian bytes this is the ASCII string "expand 32-byte k". */
CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
.text

/*
 * Helper macros for the permutation below. They operate on the register
 * aliases declared with .set inside __arch_chacha20_blocks_nostack and are
 * only expanded after those aliases exist.
 */

/* reg = rotl32(reg, bits) in every 32-bit lane. Clobbers tmp. */
.macro chacha_rotl32 reg, bits
	movdqa		\reg,tmp
	pslld		$\bits,tmp
	psrld		$(32-\bits),\reg
	por		tmp,\reg
.endm

/* acc += addend, then mixed = rotl32(mixed ^ acc, bits). Clobbers tmp. */
.macro chacha_mix acc, addend, mixed, bits
	paddd		\addend,\acc
	pxor		\acc,\mixed
	chacha_rotl32	\mixed, \bits
.endm

/*
 * One round applied to all four lanes at once, using the rotation
 * amounts 16, 12, 8 and 7 in that order. Clobbers tmp.
 */
.macro chacha_round
	chacha_mix	state0, state1, state3, 16
	chacha_mix	state2, state3, state1, 12
	chacha_mix	state0, state1, state3, 8
	chacha_mix	state2, state3, state1, 7
.endm

/*
 * Minimal SSE2 ChaCha20 block generator that keeps all of its working state
 * in registers and never touches the stack. The nonce is always zero: only
 * the key and a 64-bit block counter are taken as input.
 *
 * Arguments:
 *	rdi: destination for the generated bytes (64 bytes per block)
 *	rsi: 32-byte key (read with unaligned loads)
 *	rdx: 8-byte counter, read on entry and written back on exit
 *	rcx: number of 64-byte blocks to generate; must be non-zero, because
 *	     the block loop only tests the count after producing a block
 *
 * Clobbers: rax, rcx, rdi, xmm0-xmm9 and the flags. The registers that held
 * key-dependent values are cleared before returning.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

.set	output,		%rdi
.set	key,		%rsi
.set	counter,	%rdx
.set	nblocks,	%rcx
.set	rounds,		%al
/* None of the xmm registers are callee-saved, so all are free to use. */
.set	tmp,		%xmm0
.set	state0,		%xmm1
.set	state1,		%xmm2
.set	state2,		%xmm3
.set	state3,		%xmm4
.set	init0,		%xmm5
.set	init1,		%xmm6
.set	init2,		%xmm7
.set	init3,		%xmm8
.set	ctr_inc,	%xmm9

	/* Row 0 of the initial matrix: the "expand 32-byte k" constant. */
	movaps		CONSTANTS(%rip),init0
	/* Rows 1 and 2: the 256-bit key. */
	movups		0x00(key),init1
	movups		0x10(key),init2
	/* Row 3: 64-bit counter in the low half, zero nonce in the high half. */
	movq		0x00(counter),init3
	/* ctr_inc = 1 in the low 64 bits, 0 in the high 64 bits. */
	movq		$1,%rax
	movq		%rax,ctr_inc

.Lnext_block:
	/* Start each block from a fresh copy of the initial matrix. */
	movdqa		init0,state0
	movdqa		init1,state1
	movdqa		init2,state2
	movdqa		init3,state3

	/* Ten passes of two rounds each. */
	movb		$10,rounds
.Ldouble_round:
	/* First round of the pair, on the rows as they stand. */
	chacha_round

	/* state1[0,1,2,3] = state1[1,2,3,0] */
	pshufd		$0x39,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	pshufd		$0x93,state3,state3

	/* Second round of the pair, on the rotated rows. */
	chacha_round

	/* Undo the lane rotation: state1[0,1,2,3] = state1[3,0,1,2] */
	pshufd		$0x93,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	pshufd		$0x39,state3,state3

	decb		rounds
	jnz		.Ldouble_round

	/* Feed-forward: add the initial matrix back in and store each row. */
	paddd		init0,state0
	movups		state0,0x00(output)
	paddd		init1,state1
	movups		state1,0x10(output)
	paddd		init2,state2
	movups		state2,0x20(output)
	paddd		init3,state3
	movups		state3,0x30(output)

	/* Bump the 64-bit block counter held in the low half of init3. */
	paddq		ctr_inc,init3

	/* Advance to the next 64-byte output slot; loop while blocks remain. */
	addq		$64,output
	decq		nblocks
	jnz		.Lnext_block

	/* Hand the updated counter back to the caller. */
	movq		init3,0x00(counter)

	/*
	 * Scrub the registers that held the key or key-derived state, since
	 * nothing guarantees they will be overwritten later.
	 */
	pxor		state0,state0
	pxor		state1,state1
	pxor		state2,state2
	pxor		state3,state3
	pxor		init1,init1
	pxor		init2,init2
	pxor		tmp,tmp

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)