/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

/* BLAKE2s initialization vector IV[0..7], stored as two 128-bit lanes. */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
/* pshufb mask: rotate each 32-bit lane right by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
/* pshufb mask: rotate each 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
/*
 * BLAKE2s message schedule, one 16-byte row per round, pre-permuted to match
 * the order in which the vectorized G functions consume the message words.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
/*
 * Message schedule for the AVX-512 path: vpermi2d index rows, each defined
 * relative to the previous round's already-permuted message vectors.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 160
.align 64
SIGMA2:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.byte 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.byte 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.byte  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.byte  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.byte  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.byte  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.byte 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.byte  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9

.text
/*
 * blake2s_compress_ssse3: %rdi = struct blake2s_state *state,
 * %rsi = message block(s), %rdx = number of 64-byte blocks,
 * %ecx = counter increment per block.
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	movdqu (%rdi),%xmm0		# row a: h[0..3]
	movdqu 0x10(%rdi),%xmm1		# row b: h[4..7]
	movdqa ROT16(%rip),%xmm12
	movdqa ROR328(%rip),%xmm13
	movdqu 0x20(%rdi),%xmm14	# t[0..1], f[0..1]
	movd %ecx,%xmm15		# counter increment
	leaq SIGMA+0xa0(%rip),%r8	# end of SIGMA: 10 rounds * 16 bytes
	jmp .Lbeginofloop
	.align 32
.Lbeginofloop:
	movdqa %xmm0,%xmm10		# save h[0..3] for the feed-forward
	movdqa %xmm1,%xmm11		# save h[4..7]
	paddq %xmm15,%xmm14		# t += inc (64-bit counter in low qword)
	movdqa IV(%rip),%xmm2		# row c: IV[0..3]
	movdqa %xmm14,%xmm3
	pxor IV+0x10(%rip),%xmm3	# row d: (t, f) ^ IV[4..7]
	leaq SIGMA(%rip),%rcx
.Lroundloop:
	/* Gather four message words for the first half of the column step. */
	movzbl (%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0x1(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0x2(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x3(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	punpckldq %xmm5,%xmm4
	punpckldq %xmm7,%xmm6
	punpcklqdq %xmm6,%xmm4
	/* G, first half: a += m + b; d = ror16(d ^ a); c += d; b = ror12(b ^ c) */
	paddd %xmm4,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm12,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0xc,%xmm1
	pslld $0x14,%xmm8
	por %xmm8,%xmm1			# ror12 via shift/shift/or
	movzbl 0x4(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0x5(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x6(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0x7(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	punpckldq %xmm6,%xmm5
	punpckldq %xmm4,%xmm7
	punpcklqdq %xmm7,%xmm5
	/* G, second half: a += m + b; d = ror8(d ^ a); c += d; b = ror7(b ^ c) */
	paddd %xmm5,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm13,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0x7,%xmm1
	pslld $0x19,%xmm8
	por %xmm8,%xmm1
	/* Diagonalize for the diagonal step. */
	pshufd $0x93,%xmm0,%xmm0
	pshufd $0x4e,%xmm3,%xmm3
	pshufd $0x39,%xmm2,%xmm2
	/* Diagonal step: same two G halves on the rotated rows. */
	movzbl 0x8(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x9(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0xa(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0xb(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	punpckldq %xmm7,%xmm6
	punpckldq %xmm5,%xmm4
	punpcklqdq %xmm4,%xmm6
	paddd %xmm6,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm12,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0xc,%xmm1
	pslld $0x14,%xmm8
	por %xmm8,%xmm1
	movzbl 0xc(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0xd(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0xe(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0xf(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	punpckldq %xmm4,%xmm7
	punpckldq %xmm6,%xmm5
	punpcklqdq %xmm5,%xmm7
	paddd %xmm7,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm13,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0x7,%xmm1
	pslld $0x19,%xmm8
	por %xmm8,%xmm1
	/* Undo the diagonalization. */
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x4e,%xmm3,%xmm3
	pshufd $0x93,%xmm2,%xmm2
	addq $0x10,%rcx			# next SIGMA row
	cmpq %r8,%rcx
	jnz .Lroundloop
	/* Feed-forward: h ^= (a ^ c), (b ^ d). */
	pxor %xmm2,%xmm0
	pxor %xmm3,%xmm1
	pxor %xmm10,%xmm0
	pxor %xmm11,%xmm1
	addq $0x40,%rsi			# advance to the next 64-byte block
	decq %rdx
	jnz .Lbeginofloop
	movdqu %xmm0,(%rdi)
	movdqu %xmm1,0x10(%rdi)
	movdqu %xmm14,0x20(%rdi)	# store updated t and f
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

/*
 * blake2s_compress_avx512: same arguments as blake2s_compress_ssse3, using
 * AVX-512VL rotates (vprord) and vpermi2d message permutation instead of
 * pshufb masks and scalar word gathers.
 */
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu (%rdi),%xmm0		# row a: h[0..3]
	vmovdqu 0x10(%rdi),%xmm1	# row b: h[4..7]
	vmovdqu 0x20(%rdi),%xmm4	# t[0..1], f[0..1]
	vmovd %ecx,%xmm5		# counter increment
	vmovdqa IV(%rip),%xmm14
	vmovdqa IV+16(%rip),%xmm15
	jmp .Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa %xmm0,%xmm10		# save h[0..3] for the feed-forward
	vmovdqa %xmm1,%xmm11		# save h[4..7]
	vpaddq %xmm5,%xmm4,%xmm4	# t += inc
	vmovdqa %xmm14,%xmm2		# row c: IV[0..3]
	vpxor %xmm15,%xmm4,%xmm3	# row d: (t, f) ^ IV[4..7]
	vmovdqu (%rsi),%ymm6		# load the whole 64-byte block
	vmovdqu 0x20(%rsi),%ymm7
	addq $0x40,%rsi
	leaq SIGMA2(%rip),%rax
	movb $0xa,%cl			# 10 rounds
.Lblake2s_compress_avx512_roundloop:
	/* Permute the message vectors into this round's word order. */
	vpmovzxbd (%rax),%ymm8
	vpmovzxbd 0x8(%rax),%ymm9
	addq $0x10,%rax
	vpermi2d %ymm7,%ymm6,%ymm8
	vpermi2d %ymm7,%ymm6,%ymm9
	vmovdqa %ymm8,%ymm6
	vmovdqa %ymm9,%ymm7
	/* Column step: both G halves, with ror16/ror12 then ror8/ror7. */
	vpaddd %xmm8,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x10,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0xc,%xmm1,%xmm1
	vextracti128 $0x1,%ymm8,%xmm8	# high half: next four message words
	vpaddd %xmm8,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x8,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0x7,%xmm1,%xmm1
	/* Diagonalize. */
	vpshufd $0x93,%xmm0,%xmm0
	vpshufd $0x4e,%xmm3,%xmm3
	vpshufd $0x39,%xmm2,%xmm2
	/* Diagonal step. */
	vpaddd %xmm9,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x10,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0xc,%xmm1,%xmm1
	vextracti128 $0x1,%ymm9,%xmm9
	vpaddd %xmm9,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x8,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0x7,%xmm1,%xmm1
	/* Undo the diagonalization. */
	vpshufd $0x39,%xmm0,%xmm0
	vpshufd $0x4e,%xmm3,%xmm3
	vpshufd $0x93,%xmm2,%xmm2
	decb %cl
	jne .Lblake2s_compress_avx512_roundloop
	/* Feed-forward into the chaining value. */
	vpxor %xmm10,%xmm0,%xmm0
	vpxor %xmm11,%xmm1,%xmm1
	vpxor %xmm2,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	decq %rdx
	jne .Lblake2s_compress_avx512_mainloop
	vmovdqu %xmm0,(%rdi)
	vmovdqu %xmm1,0x10(%rdi)
	vmovdqu %xmm4,0x20(%rdi)
	vzeroupper
	RET
SYM_FUNC_END(blake2s_compress_avx512)
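
/*
 * Expected C-side declarations (a sketch, not part of this file): the glue
 * code calling these routines is assumed to declare them along the lines of
 * the following, matching the SysV AMD64 argument registers used above
 * (%rdi = state, %rsi = block, %rdx = nblocks, %ecx = inc):
 *
 *	asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
 *					       const u8 *block, size_t nblocks,
 *					       u32 inc);
 *	asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
 *						const u8 *block, size_t nblocks,
 *						u32 inc);
 */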