/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

// Two 128-bit constants.  The first is used as iv[0..3] (initial v[8..11]);
// the second is iv[4..7], which is XORed with [t, f] to form v[12..15].
.section .rodata.cst32.iv, "aM", @progbits, 32
.align 32
.Liv:
	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F

// pshufb control mask: within each 32-bit lane, destination bytes 0,1,2,3
// take source bytes 2,3,0,1, i.e. each lane is rotated right by 16 bits.
.section .rodata.cst16.ror16, "aM", @progbits, 16
.align 16
.Lror16:
	.octa 0x0D0C0F0E09080B0A0504070601000302

// pshufb control mask: within each 32-bit lane, destination bytes 0,1,2,3
// take source bytes 1,2,3,0, i.e. each lane is rotated right by 8 bits.
.section .rodata.cst16.ror8, "aM", @progbits, 16
.align 16
.Lror8:
	.octa 0x0C0F0E0D080B0A090407060500030201

// Message schedule for the SSSE3 code: 10 rows (one per round) of 16 bytes.
// Each byte is the index of a 32-bit word within the current 64-byte block.
// A round consumes its row four bytes at a time, in order: bytes 0..3 and
// 4..7 feed the two additions of the first half-round, bytes 8..11 and
// 12..15 feed the two additions of the second half-round.
.section .rodata.cst64.sigma, "aM", @progbits, 160
.align 64
.Lsigma:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12

// Message schedule for the AVX-512 code: 10 rows (one per round) of 16 bytes.
// Unlike .Lsigma, these rows are NOT indices into the original block: the
// AVX-512 round loop writes the permuted words back into the registers that
// hold the message, so each row is applied to the word order left behind by
// the previous round.  Indices 0..7 select from the first message register
// and 8..15 from the second.
.section .rodata.cst64.sigma2, "aM", @progbits, 160
.align 64
.Lsigma2:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.byte 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.byte 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.byte  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.byte  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.byte  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.byte  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.byte 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.byte  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9

// Registers holding the four function arguments, in prototype order.
// INC aliases the low half of %rcx, and both functions reuse %rcx/%cl as a
// scratch register once INC has been copied into a vector register.
#define CTX		%rdi
#define DATA		%rsi
#define NBLOCKS		%rdx
#define INC		%ecx

.text
//
// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
//			       const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// Processes nblocks consecutive 64-byte blocks starting at data, adding inc
// to the 64-bit counter t before each block.
//
// Register usage:
//	xmm0..xmm3	v[0..3], v[4..7], v[8..11], v[12..15]
//	xmm4..xmm7	message words gathered for the current step
//	xmm8		temporary for the shift-based rotates
//	xmm10, xmm11	copy of h[0..7] as it was at the start of the block
//	xmm12, xmm13	.Lror16 / .Lror8 byte-shuffle masks
//	xmm14		[t, f]
//	xmm15		inc, zero-extended
//	rcx		pointer to the current row of .Lsigma
//	r8		end of .Lsigma (loop bound for the round loop)
//	rax		message word index (scratch)
//
// NOTE(review): the block loop is bottom-tested (decq/jnz), so its body runs
// before nblocks is ever checked.  A call with nblocks == 0 is therefore not
// handled here; presumably callers always pass nblocks >= 1 -- confirm
// against the C glue code.
//
SYM_FUNC_START(blake2s_compress_ssse3)
	movdqu		(CTX),%xmm0		// Load h[0..3]
	movdqu		16(CTX),%xmm1		// Load h[4..7]
	movdqa		.Lror16(%rip),%xmm12
	movdqa		.Lror8(%rip),%xmm13
	movdqu		32(CTX),%xmm14		// Load t and f
	// movd zero-fills the rest of xmm15, so the 64-bit add in the main
	// loop changes only t and leaves f untouched.
	movd		INC,%xmm15		// Load inc
	leaq		.Lsigma+160(%rip),%r8	// One past the last sigma row
	jmp		.Lssse3_mainloop	// Skip the alignment padding

	.align		32
.Lssse3_mainloop:
	// Main loop: each iteration processes one 64-byte block.
	movdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
	movdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
	paddq		%xmm15,%xmm14		// t += inc (64-bit addition)
	movdqa		.Liv(%rip),%xmm2	// v[8..11] = iv[0..3]
	movdqa		%xmm14,%xmm3
	pxor		.Liv+16(%rip),%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
	// INC (%ecx) is no longer needed; %rcx now walks the sigma table.
	leaq		.Lsigma(%rip),%rcx

.Lssse3_roundloop:
	// Round loop: each iteration does 1 round (of 10 rounds total).

	// First half-round, first step: gather the message words selected
	// by sigma bytes 0..3 into xmm4, then mix.
	movzbl		(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		1(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		2(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		3(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3		// Rotate each lane right by 16
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$12,%xmm1		// Rotate each lane right by 12:
	pslld		$20,%xmm8		// (x >> 12) | (x << 20)
	por		%xmm8,%xmm1

	// First half-round, second step: message words from sigma bytes
	// 4..7, gathered into xmm5.
	movzbl		4(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		5(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		6(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		7(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3		// Rotate each lane right by 8
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$7,%xmm1		// Rotate each lane right by 7:
	pslld		$25,%xmm8		// (x >> 7) | (x << 25)
	por		%xmm8,%xmm1

	// Rotate the lanes of xmm0, xmm3 and xmm2 (xmm1 stays in place) so
	// that the second half-round combines a different set of lanes.
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2

	// Second half-round, first step: message words from sigma bytes
	// 8..11, gathered into xmm6.
	movzbl		8(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		9(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		10(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		11(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3		// Rotate each lane right by 16
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$12,%xmm1		// Rotate each lane right by 12
	pslld		$20,%xmm8
	por		%xmm8,%xmm1

	// Second half-round, second step: message words from sigma bytes
	// 12..15, gathered into xmm7.
	movzbl		12(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		13(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		14(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		15(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3		// Rotate each lane right by 8
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$7,%xmm1		// Rotate each lane right by 7
	pslld		$25,%xmm8
	por		%xmm8,%xmm1

	// Undo the lane rotation applied before the second half-round.
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2

	addq		$16,%rcx		// Advance to the next sigma row
	cmpq		%r8,%rcx
	jnz		.Lssse3_roundloop

	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$64,DATA		// Advance to the next block
	decq		NBLOCKS
	jnz		.Lssse3_mainloop

	movdqu		%xmm0,(CTX)		// Store new h[0..3]
	movdqu		%xmm1,16(CTX)		// Store new h[4..7]
	movq		%xmm14,32(CTX)		// Store new t (f is unchanged)
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

//
// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
//				const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// Same contract as blake2s_compress_ssse3.  The whole 64-byte block is kept
// in ymm6/ymm7 and re-permuted in registers every round using .Lsigma2,
// rather than being gathered word by word from memory.
//
// Register usage:
//	xmm0..xmm3	v[0..3], v[4..7], v[8..11], v[12..15]
//	xmm4		[t, f]
//	xmm5		inc, zero-extended
//	ymm6, ymm7	message words, in the order left by the previous round
//	ymm8, ymm9	sigma2 indices, then the permuted message words
//	xmm10, xmm11	copy of h[0..7] as it was at the start of the block
//	xmm14, xmm15	iv[0..3], iv[4..7]
//	rax		pointer to the current row of .Lsigma2
//	cl		rounds remaining
//
// NOTE(review): as in the SSSE3 version, the block loop is bottom-tested,
// so nblocks == 0 is not handled here; presumably callers always pass
// nblocks >= 1 -- confirm against the C glue code.
//
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(CTX),%xmm0		// Load h[0..3]
	vmovdqu		16(CTX),%xmm1		// Load h[4..7]
	vmovdqu		32(CTX),%xmm4		// Load t and f
	// vmovd zero-fills the rest of xmm5, so the 64-bit add in the main
	// loop changes only t and leaves f untouched.
	vmovd		INC,%xmm5		// Load inc
	vmovdqa		.Liv(%rip),%xmm14	// Load iv[0..3]
	vmovdqa		.Liv+16(%rip),%xmm15	// Load iv[4..7]
	jmp		.Lavx512_mainloop	// Skip the alignment padding

	.align		32
.Lavx512_mainloop:
	// Main loop: each iteration processes one 64-byte block.
	vmovdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
	vmovdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
	vpaddq		%xmm5,%xmm4,%xmm4	// t += inc (64-bit addition)
	vmovdqa		%xmm14,%xmm2		// v[8..11] = iv[0..3]
	vpxor		%xmm15,%xmm4,%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
	vmovdqu		(DATA),%ymm6		// Load first 8 data words
	vmovdqu		32(DATA),%ymm7		// Load second 8 data words
	addq		$64,DATA
	leaq		.Lsigma2(%rip),%rax
	// INC (%ecx) is no longer needed; %cl now counts rounds.
	movb		$10,%cl			// Set num rounds remaining

.Lavx512_roundloop:
	// Round loop: each iteration does 1 round (of 10 rounds total).

	// Zero-extend the 16 index bytes of this round's row into two
	// vectors of eight 32-bit indices.
	vpmovzxbd	(%rax),%ymm8
	vpmovzxbd	8(%rax),%ymm9
	addq		$16,%rax
	// Replace the indices with the message words they select from
	// ymm6 (indices 0..7) and ymm7 (indices 8..15).
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	// Keep the permuted order as the message for the next round; this
	// is why the rows of .Lsigma2 are relative to the previous round.
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7

	// First half-round, first step: message words in the low half of
	// ymm8.
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$16,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$12,%xmm1,%xmm1

	// First half-round, second step: message words from the high half
	// of ymm8.
	vextracti128	$1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$7,%xmm1,%xmm1

	// Rotate the lanes of xmm0, xmm3 and xmm2 (xmm1 stays in place) so
	// that the second half-round combines a different set of lanes.
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2

	// Second half-round, first step: message words in the low half of
	// ymm9.
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$16,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$12,%xmm1,%xmm1

	// Second half-round, second step: message words from the high half
	// of ymm9.
	vextracti128	$1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$7,%xmm1,%xmm1

	// Undo the lane rotation applied before the second half-round.
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lavx512_roundloop

	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
	// (immediate 0x96 selects the three-input XOR truth table)
	vpternlogd	$0x96,%xmm10,%xmm2,%xmm0
	vpternlogd	$0x96,%xmm11,%xmm3,%xmm1
	decq		NBLOCKS
	jne		.Lavx512_mainloop

	vmovdqu		%xmm0,(CTX)		// Store new h[0..3]
	vmovdqu		%xmm1,16(CTX)		// Store new h[4..7]
	vmovq		%xmm4,32(CTX)		// Store new t (f is unchanged)
	vzeroupper				// Clear the upper ymm halves written above
	RET
SYM_FUNC_END(blake2s_compress_avx512)