/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

/*
 * BLAKE2s compression function, x86-64 (AT&T syntax, GAS, Linux kernel).
 *
 * Two implementations of the same routine are provided:
 *   blake2s_compress_ssse3  - SSSE3 baseline
 *   blake2s_compress_avx512 - AVX-512VL (vprord / vpermi2d), faster message
 *                             scheduling via in-register permutes
 *
 * Both appear to implement (SysV AMD64 argument registers; confirm against
 * the C prototype in the caller):
 *   rdi = state: 48 bytes read/written — h[0..7] at 0x00, then the 16 bytes
 *         at 0x20 (t[2]/f[2] counter/finalization words, per BLAKE2s layout)
 *   rsi = message block(s), 64 bytes each, advanced by 0x40 per iteration
 *   rdx = number of blocks (loop counter; SSSE3 path returns early if zero)
 *   rcx = 64-bit value added into the counter lanes once per block
 *         (the "inc" byte-count increment)
 */

#include <linux/linkage.h>

/* BLAKE2s initialization vector: the 8 IV words as two 128-bit constants. */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
/* pshufb mask: rotate each 32-bit lane right by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
/* pshufb mask: rotate each 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
/*
 * BLAKE2s message schedule: 10 rounds x 16 byte-indices into the message
 * words. Row order within each round is pre-permuted for the vectorized
 * column/diagonal step ordering used below (not the textbook SIGMA layout).
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
/*
 * Same schedule widened to 32-bit indices for the AVX-512 path: each round
 * is two 32-byte rows consumed as vpermi2d selectors over the 16 message
 * words held in ymm6/ymm7. Indices differ from SIGMA because successive
 * rounds permute in-register rather than re-indexing the original message.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9

.text
/*
 * SSSE3 implementation.
 *
 * Register roles inside the loop:
 *   xmm0  = v[0..3]   (working row a, starts as h[0..3])
 *   xmm1  = v[4..7]   (row b, starts as h[4..7])
 *   xmm2  = v[8..11]  (row c, starts as IV[0..3])
 *   xmm3  = v[12..15] (row d, starts as counter/flags ^ IV[4..7])
 *   xmm4-xmm7 = gathered message words; xmm8 = rotate scratch
 *   xmm10/xmm11 = copy of h for the final feed-forward xor
 *   xmm12 = ROT16 mask, xmm13 = ROR328 mask (pshufb byte rotates)
 *   xmm14 = state bytes 0x20-0x2f (counter t / flags f), xmm15 = inc
 *   rcx = cursor into SIGMA, r8 = end of SIGMA (10 rounds * 16 bytes)
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop		# nothing to do for zero blocks
	movdqu		(%rdi),%xmm0		# load h[0..3]
	movdqu		0x10(%rdi),%xmm1	# load h[4..7]
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14	# load t[2]/f[2]
	movq		%rcx,%xmm15		# counter increment, low qword
	leaq		SIGMA+0xa0(%rip),%r8	# schedule end sentinel
	jmp		.Lbeginofloop
	.align 32
.Lbeginofloop:
	/* Per-block setup: save h, bump the 64-bit counter, build rows c/d. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14		# t += inc (64-bit add in low lane)
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3	# v[12..15] = IV[4..7] ^ {t,f}
	leaq		SIGMA(%rip),%rcx
.Lroundloop:
	/* Gather m[SIGMA[0..3]] into xmm4 (one message word per lane). */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	/* Column step, first half of G: a+=m+b; d=(d^a)>>>16; c+=d; b=(b^c)>>>12. */
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3		# rotate right 16 via byte shuffle
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1		# b >>>= 12 (shift/shift/or)
	/* Gather m[SIGMA[4..7]] into xmm5. */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	/* Column step, second half: a+=m+b; d=(d^a)>>>8; c+=d; b=(b^c)>>>7. */
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3		# rotate right 8 via byte shuffle
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1		# b >>>= 7
	/* Diagonalize: rotate rows so diagonals line up as columns. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Gather m[SIGMA[8..11]] into xmm6. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	/* Diagonal step, first half of G. */
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather m[SIGMA[12..15]] into xmm7. */
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	/* Diagonal step, second half of G. */
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Undiagonalize (inverse row rotations). */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx		# next round's 16 schedule bytes
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Feed-forward: h ^= v[0..7] ^ v[8..15]. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi		# advance to next 64-byte block
	decq		%rdx
	jnz		.Lbeginofloop
	/* Store updated h and counter back to the state. */
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

/*
 * AVX-512VL implementation. Same contract as the SSSE3 version.
 *
 * Differences: 32-bit rotates use vprord directly (no shuffle/shift tricks),
 * and the message is loaded once into ymm6/ymm7, then re-permuted each round
 * with vpermi2d using the SIGMA2 selector rows — no per-word gathers.
 * NOTE(review): unlike the SSSE3 path there is no explicit rdx==0 check;
 * presumably the caller guarantees at least one block — confirm at call site.
 *
 * Register roles: xmm0-xmm3 = rows a/b/c/d, xmm4 = t/f counter words,
 * xmm5 = inc, ymm6/ymm7 = 16 message words, ymm8/ymm9 = permuted schedule,
 * xmm10/xmm11 = saved h, xmm14/xmm15 = IV halves, rax = SIGMA2 cursor,
 * cl = round counter (10 rounds).
 */
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0		# h[0..3]
	vmovdqu		0x10(%rdi),%xmm1	# h[4..7]
	vmovdqu		0x20(%rdi),%xmm4	# t[2]/f[2]
	vmovq		%rcx,%xmm5		# counter increment
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Per-block setup: save h, bump counter, init rows c/d, load message. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4	# t += inc
	vmovdqa		%xmm14,%xmm2		# v[8..11] = IV[0..3]
	vpxor		%xmm15,%xmm4,%xmm3	# v[12..15] = IV[4..7] ^ {t,f}
	vmovdqu		(%rsi),%ymm6		# m[0..7]
	vmovdqu		0x20(%rsi),%ymm7	# m[8..15]
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		# 10 rounds
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax		# consume two 32-byte selector rows
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	/* Permute the 16 message words into this round's feed order. */
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6		# permuted words become next
	vmovdqa		%ymm9,%ymm7		#   round's permute input
	/* Column step: G first half (>>>16, >>>12) with low half of ymm8. */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8	# high half = next 4 words
	/* Column step: G second half (>>>8, >>>7). */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Diagonalize. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	/* Diagonal step: G first half with low half of ymm9. */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	/* Diagonal step: G second half. */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Undiagonalize. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Feed-forward: h ^= v[0..7] ^ v[8..15]. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	/* Store updated h and counter back to the state. */
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper				# clear upper YMM state before SSE/C code
	RET
SYM_FUNC_END(blake2s_compress_avx512)