xref: /freebsd/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S (revision 15f0b8c309dea1dcb14d3e374686576ff68ac43f)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2020 Samuel Neves
25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
26 */
27
28#if defined(HAVE_SSE4_1)
29
30#define _ASM
31#include <sys/asm_linkage.h>
32
33.intel_syntax noprefix
34
35.text
36
37ENTRY_ALIGN(zfs_blake3_hash_many_sse41, 64)
38        ENDBR
39        push    r15
40        push    r14
41        push    r13
42        push    r12
43        push    rbx
44        push    rbp
45        mov     rbp, rsp
46        sub     rsp, 360
47        and     rsp, 0xFFFFFFFFFFFFFFC0
48        neg     r9d
49        movd    xmm0, r9d
50        pshufd  xmm0, xmm0, 0x00
51        movdqa  xmmword ptr [rsp+0x130], xmm0
52        movdqa  xmm1, xmm0
53        pand    xmm1, xmmword ptr [ADD0+rip]
54        pand    xmm0, xmmword ptr [ADD1+rip]
55        movdqa  xmmword ptr [rsp+0x150], xmm0
56        movd    xmm0, r8d
57        pshufd  xmm0, xmm0, 0x00
58        paddd   xmm0, xmm1
59        movdqa  xmmword ptr [rsp+0x110], xmm0
60        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
61        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
62        pcmpgtd xmm1, xmm0
63        shr     r8, 32
64        movd    xmm2, r8d
65        pshufd  xmm2, xmm2, 0x00
66        psubd   xmm2, xmm1
67        movdqa  xmmword ptr [rsp+0x120], xmm2
68        mov     rbx, qword ptr [rbp+0x50]
69        mov     r15, rdx
70        shl     r15, 6
71        movzx   r13d, byte ptr [rbp+0x38]
72        movzx   r12d, byte ptr [rbp+0x48]
73        cmp     rsi, 4
74        jc      3f
752:
76        movdqu  xmm3, xmmword ptr [rcx]
77        pshufd  xmm0, xmm3, 0x00
78        pshufd  xmm1, xmm3, 0x55
79        pshufd  xmm2, xmm3, 0xAA
80        pshufd  xmm3, xmm3, 0xFF
81        movdqu  xmm7, xmmword ptr [rcx+0x10]
82        pshufd  xmm4, xmm7, 0x00
83        pshufd  xmm5, xmm7, 0x55
84        pshufd  xmm6, xmm7, 0xAA
85        pshufd  xmm7, xmm7, 0xFF
86        mov     r8, qword ptr [rdi]
87        mov     r9, qword ptr [rdi+0x8]
88        mov     r10, qword ptr [rdi+0x10]
89        mov     r11, qword ptr [rdi+0x18]
90        movzx   eax, byte ptr [rbp+0x40]
91        or      eax, r13d
92        xor     edx, edx
939:
94        mov     r14d, eax
95        or      eax, r12d
96        add     rdx, 64
97        cmp     rdx, r15
98        cmovne  eax, r14d
99        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
100        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
101        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
102        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
103        movdqa  xmm12, xmm8
104        punpckldq xmm8, xmm9
105        punpckhdq xmm12, xmm9
106        movdqa  xmm14, xmm10
107        punpckldq xmm10, xmm11
108        punpckhdq xmm14, xmm11
109        movdqa  xmm9, xmm8
110        punpcklqdq xmm8, xmm10
111        punpckhqdq xmm9, xmm10
112        movdqa  xmm13, xmm12
113        punpcklqdq xmm12, xmm14
114        punpckhqdq xmm13, xmm14
115        movdqa  xmmword ptr [rsp], xmm8
116        movdqa  xmmword ptr [rsp+0x10], xmm9
117        movdqa  xmmword ptr [rsp+0x20], xmm12
118        movdqa  xmmword ptr [rsp+0x30], xmm13
119        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
120        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
121        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
122        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
123        movdqa  xmm12, xmm8
124        punpckldq xmm8, xmm9
125        punpckhdq xmm12, xmm9
126        movdqa  xmm14, xmm10
127        punpckldq xmm10, xmm11
128        punpckhdq xmm14, xmm11
129        movdqa  xmm9, xmm8
130        punpcklqdq xmm8, xmm10
131        punpckhqdq xmm9, xmm10
132        movdqa  xmm13, xmm12
133        punpcklqdq xmm12, xmm14
134        punpckhqdq xmm13, xmm14
135        movdqa  xmmword ptr [rsp+0x40], xmm8
136        movdqa  xmmword ptr [rsp+0x50], xmm9
137        movdqa  xmmword ptr [rsp+0x60], xmm12
138        movdqa  xmmword ptr [rsp+0x70], xmm13
139        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
140        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
141        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
142        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
143        movdqa  xmm12, xmm8
144        punpckldq xmm8, xmm9
145        punpckhdq xmm12, xmm9
146        movdqa  xmm14, xmm10
147        punpckldq xmm10, xmm11
148        punpckhdq xmm14, xmm11
149        movdqa  xmm9, xmm8
150        punpcklqdq xmm8, xmm10
151        punpckhqdq xmm9, xmm10
152        movdqa  xmm13, xmm12
153        punpcklqdq xmm12, xmm14
154        punpckhqdq xmm13, xmm14
155        movdqa  xmmword ptr [rsp+0x80], xmm8
156        movdqa  xmmword ptr [rsp+0x90], xmm9
157        movdqa  xmmword ptr [rsp+0xA0], xmm12
158        movdqa  xmmword ptr [rsp+0xB0], xmm13
159        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
160        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
161        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
162        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
163        movdqa  xmm12, xmm8
164        punpckldq xmm8, xmm9
165        punpckhdq xmm12, xmm9
166        movdqa  xmm14, xmm10
167        punpckldq xmm10, xmm11
168        punpckhdq xmm14, xmm11
169        movdqa  xmm9, xmm8
170        punpcklqdq xmm8, xmm10
171        punpckhqdq xmm9, xmm10
172        movdqa  xmm13, xmm12
173        punpcklqdq xmm12, xmm14
174        punpckhqdq xmm13, xmm14
175        movdqa  xmmword ptr [rsp+0xC0], xmm8
176        movdqa  xmmword ptr [rsp+0xD0], xmm9
177        movdqa  xmmword ptr [rsp+0xE0], xmm12
178        movdqa  xmmword ptr [rsp+0xF0], xmm13
179        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
180        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
181        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
182        movdqa  xmm12, xmmword ptr [rsp+0x110]
183        movdqa  xmm13, xmmword ptr [rsp+0x120]
184        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
185        movd    xmm15, eax
186        pshufd  xmm15, xmm15, 0x00
187        prefetcht0 [r8+rdx+0x80]
188        prefetcht0 [r9+rdx+0x80]
189        prefetcht0 [r10+rdx+0x80]
190        prefetcht0 [r11+rdx+0x80]
191        paddd   xmm0, xmmword ptr [rsp]
192        paddd   xmm1, xmmword ptr [rsp+0x20]
193        paddd   xmm2, xmmword ptr [rsp+0x40]
194        paddd   xmm3, xmmword ptr [rsp+0x60]
195        paddd   xmm0, xmm4
196        paddd   xmm1, xmm5
197        paddd   xmm2, xmm6
198        paddd   xmm3, xmm7
199        pxor    xmm12, xmm0
200        pxor    xmm13, xmm1
201        pxor    xmm14, xmm2
202        pxor    xmm15, xmm3
203        movdqa  xmm8, xmmword ptr [ROT16+rip]
204        pshufb  xmm12, xmm8
205        pshufb  xmm13, xmm8
206        pshufb  xmm14, xmm8
207        pshufb  xmm15, xmm8
208        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
209        paddd   xmm8, xmm12
210        paddd   xmm9, xmm13
211        paddd   xmm10, xmm14
212        paddd   xmm11, xmm15
213        pxor    xmm4, xmm8
214        pxor    xmm5, xmm9
215        pxor    xmm6, xmm10
216        pxor    xmm7, xmm11
217        movdqa  xmmword ptr [rsp+0x100], xmm8
218        movdqa  xmm8, xmm4
219        psrld   xmm8, 12
220        pslld   xmm4, 20
221        por     xmm4, xmm8
222        movdqa  xmm8, xmm5
223        psrld   xmm8, 12
224        pslld   xmm5, 20
225        por     xmm5, xmm8
226        movdqa  xmm8, xmm6
227        psrld   xmm8, 12
228        pslld   xmm6, 20
229        por     xmm6, xmm8
230        movdqa  xmm8, xmm7
231        psrld   xmm8, 12
232        pslld   xmm7, 20
233        por     xmm7, xmm8
234        paddd   xmm0, xmmword ptr [rsp+0x10]
235        paddd   xmm1, xmmword ptr [rsp+0x30]
236        paddd   xmm2, xmmword ptr [rsp+0x50]
237        paddd   xmm3, xmmword ptr [rsp+0x70]
238        paddd   xmm0, xmm4
239        paddd   xmm1, xmm5
240        paddd   xmm2, xmm6
241        paddd   xmm3, xmm7
242        pxor    xmm12, xmm0
243        pxor    xmm13, xmm1
244        pxor    xmm14, xmm2
245        pxor    xmm15, xmm3
246        movdqa  xmm8, xmmword ptr [ROT8+rip]
247        pshufb  xmm12, xmm8
248        pshufb  xmm13, xmm8
249        pshufb  xmm14, xmm8
250        pshufb  xmm15, xmm8
251        movdqa  xmm8, xmmword ptr [rsp+0x100]
252        paddd   xmm8, xmm12
253        paddd   xmm9, xmm13
254        paddd   xmm10, xmm14
255        paddd   xmm11, xmm15
256        pxor    xmm4, xmm8
257        pxor    xmm5, xmm9
258        pxor    xmm6, xmm10
259        pxor    xmm7, xmm11
260        movdqa  xmmword ptr [rsp+0x100], xmm8
261        movdqa  xmm8, xmm4
262        psrld   xmm8, 7
263        pslld   xmm4, 25
264        por     xmm4, xmm8
265        movdqa  xmm8, xmm5
266        psrld   xmm8, 7
267        pslld   xmm5, 25
268        por     xmm5, xmm8
269        movdqa  xmm8, xmm6
270        psrld   xmm8, 7
271        pslld   xmm6, 25
272        por     xmm6, xmm8
273        movdqa  xmm8, xmm7
274        psrld   xmm8, 7
275        pslld   xmm7, 25
276        por     xmm7, xmm8
277        paddd   xmm0, xmmword ptr [rsp+0x80]
278        paddd   xmm1, xmmword ptr [rsp+0xA0]
279        paddd   xmm2, xmmword ptr [rsp+0xC0]
280        paddd   xmm3, xmmword ptr [rsp+0xE0]
281        paddd   xmm0, xmm5
282        paddd   xmm1, xmm6
283        paddd   xmm2, xmm7
284        paddd   xmm3, xmm4
285        pxor    xmm15, xmm0
286        pxor    xmm12, xmm1
287        pxor    xmm13, xmm2
288        pxor    xmm14, xmm3
289        movdqa  xmm8, xmmword ptr [ROT16+rip]
290        pshufb  xmm15, xmm8
291        pshufb  xmm12, xmm8
292        pshufb  xmm13, xmm8
293        pshufb  xmm14, xmm8
294        paddd   xmm10, xmm15
295        paddd   xmm11, xmm12
296        movdqa  xmm8, xmmword ptr [rsp+0x100]
297        paddd   xmm8, xmm13
298        paddd   xmm9, xmm14
299        pxor    xmm5, xmm10
300        pxor    xmm6, xmm11
301        pxor    xmm7, xmm8
302        pxor    xmm4, xmm9
303        movdqa  xmmword ptr [rsp+0x100], xmm8
304        movdqa  xmm8, xmm5
305        psrld   xmm8, 12
306        pslld   xmm5, 20
307        por     xmm5, xmm8
308        movdqa  xmm8, xmm6
309        psrld   xmm8, 12
310        pslld   xmm6, 20
311        por     xmm6, xmm8
312        movdqa  xmm8, xmm7
313        psrld   xmm8, 12
314        pslld   xmm7, 20
315        por     xmm7, xmm8
316        movdqa  xmm8, xmm4
317        psrld   xmm8, 12
318        pslld   xmm4, 20
319        por     xmm4, xmm8
320        paddd   xmm0, xmmword ptr [rsp+0x90]
321        paddd   xmm1, xmmword ptr [rsp+0xB0]
322        paddd   xmm2, xmmword ptr [rsp+0xD0]
323        paddd   xmm3, xmmword ptr [rsp+0xF0]
324        paddd   xmm0, xmm5
325        paddd   xmm1, xmm6
326        paddd   xmm2, xmm7
327        paddd   xmm3, xmm4
328        pxor    xmm15, xmm0
329        pxor    xmm12, xmm1
330        pxor    xmm13, xmm2
331        pxor    xmm14, xmm3
332        movdqa  xmm8, xmmword ptr [ROT8+rip]
333        pshufb  xmm15, xmm8
334        pshufb  xmm12, xmm8
335        pshufb  xmm13, xmm8
336        pshufb  xmm14, xmm8
337        paddd   xmm10, xmm15
338        paddd   xmm11, xmm12
339        movdqa  xmm8, xmmword ptr [rsp+0x100]
340        paddd   xmm8, xmm13
341        paddd   xmm9, xmm14
342        pxor    xmm5, xmm10
343        pxor    xmm6, xmm11
344        pxor    xmm7, xmm8
345        pxor    xmm4, xmm9
346        movdqa  xmmword ptr [rsp+0x100], xmm8
347        movdqa  xmm8, xmm5
348        psrld   xmm8, 7
349        pslld   xmm5, 25
350        por     xmm5, xmm8
351        movdqa  xmm8, xmm6
352        psrld   xmm8, 7
353        pslld   xmm6, 25
354        por     xmm6, xmm8
355        movdqa  xmm8, xmm7
356        psrld   xmm8, 7
357        pslld   xmm7, 25
358        por     xmm7, xmm8
359        movdqa  xmm8, xmm4
360        psrld   xmm8, 7
361        pslld   xmm4, 25
362        por     xmm4, xmm8
363        paddd   xmm0, xmmword ptr [rsp+0x20]
364        paddd   xmm1, xmmword ptr [rsp+0x30]
365        paddd   xmm2, xmmword ptr [rsp+0x70]
366        paddd   xmm3, xmmword ptr [rsp+0x40]
367        paddd   xmm0, xmm4
368        paddd   xmm1, xmm5
369        paddd   xmm2, xmm6
370        paddd   xmm3, xmm7
371        pxor    xmm12, xmm0
372        pxor    xmm13, xmm1
373        pxor    xmm14, xmm2
374        pxor    xmm15, xmm3
375        movdqa  xmm8, xmmword ptr [ROT16+rip]
376        pshufb  xmm12, xmm8
377        pshufb  xmm13, xmm8
378        pshufb  xmm14, xmm8
379        pshufb  xmm15, xmm8
380        movdqa  xmm8, xmmword ptr [rsp+0x100]
381        paddd   xmm8, xmm12
382        paddd   xmm9, xmm13
383        paddd   xmm10, xmm14
384        paddd   xmm11, xmm15
385        pxor    xmm4, xmm8
386        pxor    xmm5, xmm9
387        pxor    xmm6, xmm10
388        pxor    xmm7, xmm11
389        movdqa  xmmword ptr [rsp+0x100], xmm8
390        movdqa  xmm8, xmm4
391        psrld   xmm8, 12
392        pslld   xmm4, 20
393        por     xmm4, xmm8
394        movdqa  xmm8, xmm5
395        psrld   xmm8, 12
396        pslld   xmm5, 20
397        por     xmm5, xmm8
398        movdqa  xmm8, xmm6
399        psrld   xmm8, 12
400        pslld   xmm6, 20
401        por     xmm6, xmm8
402        movdqa  xmm8, xmm7
403        psrld   xmm8, 12
404        pslld   xmm7, 20
405        por     xmm7, xmm8
406        paddd   xmm0, xmmword ptr [rsp+0x60]
407        paddd   xmm1, xmmword ptr [rsp+0xA0]
408        paddd   xmm2, xmmword ptr [rsp]
409        paddd   xmm3, xmmword ptr [rsp+0xD0]
410        paddd   xmm0, xmm4
411        paddd   xmm1, xmm5
412        paddd   xmm2, xmm6
413        paddd   xmm3, xmm7
414        pxor    xmm12, xmm0
415        pxor    xmm13, xmm1
416        pxor    xmm14, xmm2
417        pxor    xmm15, xmm3
418        movdqa  xmm8, xmmword ptr [ROT8+rip]
419        pshufb  xmm12, xmm8
420        pshufb  xmm13, xmm8
421        pshufb  xmm14, xmm8
422        pshufb  xmm15, xmm8
423        movdqa  xmm8, xmmword ptr [rsp+0x100]
424        paddd   xmm8, xmm12
425        paddd   xmm9, xmm13
426        paddd   xmm10, xmm14
427        paddd   xmm11, xmm15
428        pxor    xmm4, xmm8
429        pxor    xmm5, xmm9
430        pxor    xmm6, xmm10
431        pxor    xmm7, xmm11
432        movdqa  xmmword ptr [rsp+0x100], xmm8
433        movdqa  xmm8, xmm4
434        psrld   xmm8, 7
435        pslld   xmm4, 25
436        por     xmm4, xmm8
437        movdqa  xmm8, xmm5
438        psrld   xmm8, 7
439        pslld   xmm5, 25
440        por     xmm5, xmm8
441        movdqa  xmm8, xmm6
442        psrld   xmm8, 7
443        pslld   xmm6, 25
444        por     xmm6, xmm8
445        movdqa  xmm8, xmm7
446        psrld   xmm8, 7
447        pslld   xmm7, 25
448        por     xmm7, xmm8
449        paddd   xmm0, xmmword ptr [rsp+0x10]
450        paddd   xmm1, xmmword ptr [rsp+0xC0]
451        paddd   xmm2, xmmword ptr [rsp+0x90]
452        paddd   xmm3, xmmword ptr [rsp+0xF0]
453        paddd   xmm0, xmm5
454        paddd   xmm1, xmm6
455        paddd   xmm2, xmm7
456        paddd   xmm3, xmm4
457        pxor    xmm15, xmm0
458        pxor    xmm12, xmm1
459        pxor    xmm13, xmm2
460        pxor    xmm14, xmm3
461        movdqa  xmm8, xmmword ptr [ROT16+rip]
462        pshufb  xmm15, xmm8
463        pshufb  xmm12, xmm8
464        pshufb  xmm13, xmm8
465        pshufb  xmm14, xmm8
466        paddd   xmm10, xmm15
467        paddd   xmm11, xmm12
468        movdqa  xmm8, xmmword ptr [rsp+0x100]
469        paddd   xmm8, xmm13
470        paddd   xmm9, xmm14
471        pxor    xmm5, xmm10
472        pxor    xmm6, xmm11
473        pxor    xmm7, xmm8
474        pxor    xmm4, xmm9
475        movdqa  xmmword ptr [rsp+0x100], xmm8
476        movdqa  xmm8, xmm5
477        psrld   xmm8, 12
478        pslld   xmm5, 20
479        por     xmm5, xmm8
480        movdqa  xmm8, xmm6
481        psrld   xmm8, 12
482        pslld   xmm6, 20
483        por     xmm6, xmm8
484        movdqa  xmm8, xmm7
485        psrld   xmm8, 12
486        pslld   xmm7, 20
487        por     xmm7, xmm8
488        movdqa  xmm8, xmm4
489        psrld   xmm8, 12
490        pslld   xmm4, 20
491        por     xmm4, xmm8
492        paddd   xmm0, xmmword ptr [rsp+0xB0]
493        paddd   xmm1, xmmword ptr [rsp+0x50]
494        paddd   xmm2, xmmword ptr [rsp+0xE0]
495        paddd   xmm3, xmmword ptr [rsp+0x80]
496        paddd   xmm0, xmm5
497        paddd   xmm1, xmm6
498        paddd   xmm2, xmm7
499        paddd   xmm3, xmm4
500        pxor    xmm15, xmm0
501        pxor    xmm12, xmm1
502        pxor    xmm13, xmm2
503        pxor    xmm14, xmm3
504        movdqa  xmm8, xmmword ptr [ROT8+rip]
505        pshufb  xmm15, xmm8
506        pshufb  xmm12, xmm8
507        pshufb  xmm13, xmm8
508        pshufb  xmm14, xmm8
509        paddd   xmm10, xmm15
510        paddd   xmm11, xmm12
511        movdqa  xmm8, xmmword ptr [rsp+0x100]
512        paddd   xmm8, xmm13
513        paddd   xmm9, xmm14
514        pxor    xmm5, xmm10
515        pxor    xmm6, xmm11
516        pxor    xmm7, xmm8
517        pxor    xmm4, xmm9
518        movdqa  xmmword ptr [rsp+0x100], xmm8
519        movdqa  xmm8, xmm5
520        psrld   xmm8, 7
521        pslld   xmm5, 25
522        por     xmm5, xmm8
523        movdqa  xmm8, xmm6
524        psrld   xmm8, 7
525        pslld   xmm6, 25
526        por     xmm6, xmm8
527        movdqa  xmm8, xmm7
528        psrld   xmm8, 7
529        pslld   xmm7, 25
530        por     xmm7, xmm8
531        movdqa  xmm8, xmm4
532        psrld   xmm8, 7
533        pslld   xmm4, 25
534        por     xmm4, xmm8
535        paddd   xmm0, xmmword ptr [rsp+0x30]
536        paddd   xmm1, xmmword ptr [rsp+0xA0]
537        paddd   xmm2, xmmword ptr [rsp+0xD0]
538        paddd   xmm3, xmmword ptr [rsp+0x70]
539        paddd   xmm0, xmm4
540        paddd   xmm1, xmm5
541        paddd   xmm2, xmm6
542        paddd   xmm3, xmm7
543        pxor    xmm12, xmm0
544        pxor    xmm13, xmm1
545        pxor    xmm14, xmm2
546        pxor    xmm15, xmm3
547        movdqa  xmm8, xmmword ptr [ROT16+rip]
548        pshufb  xmm12, xmm8
549        pshufb  xmm13, xmm8
550        pshufb  xmm14, xmm8
551        pshufb  xmm15, xmm8
552        movdqa  xmm8, xmmword ptr [rsp+0x100]
553        paddd   xmm8, xmm12
554        paddd   xmm9, xmm13
555        paddd   xmm10, xmm14
556        paddd   xmm11, xmm15
557        pxor    xmm4, xmm8
558        pxor    xmm5, xmm9
559        pxor    xmm6, xmm10
560        pxor    xmm7, xmm11
561        movdqa  xmmword ptr [rsp+0x100], xmm8
562        movdqa  xmm8, xmm4
563        psrld   xmm8, 12
564        pslld   xmm4, 20
565        por     xmm4, xmm8
566        movdqa  xmm8, xmm5
567        psrld   xmm8, 12
568        pslld   xmm5, 20
569        por     xmm5, xmm8
570        movdqa  xmm8, xmm6
571        psrld   xmm8, 12
572        pslld   xmm6, 20
573        por     xmm6, xmm8
574        movdqa  xmm8, xmm7
575        psrld   xmm8, 12
576        pslld   xmm7, 20
577        por     xmm7, xmm8
578        paddd   xmm0, xmmword ptr [rsp+0x40]
579        paddd   xmm1, xmmword ptr [rsp+0xC0]
580        paddd   xmm2, xmmword ptr [rsp+0x20]
581        paddd   xmm3, xmmword ptr [rsp+0xE0]
582        paddd   xmm0, xmm4
583        paddd   xmm1, xmm5
584        paddd   xmm2, xmm6
585        paddd   xmm3, xmm7
586        pxor    xmm12, xmm0
587        pxor    xmm13, xmm1
588        pxor    xmm14, xmm2
589        pxor    xmm15, xmm3
590        movdqa  xmm8, xmmword ptr [ROT8+rip]
591        pshufb  xmm12, xmm8
592        pshufb  xmm13, xmm8
593        pshufb  xmm14, xmm8
594        pshufb  xmm15, xmm8
595        movdqa  xmm8, xmmword ptr [rsp+0x100]
596        paddd   xmm8, xmm12
597        paddd   xmm9, xmm13
598        paddd   xmm10, xmm14
599        paddd   xmm11, xmm15
600        pxor    xmm4, xmm8
601        pxor    xmm5, xmm9
602        pxor    xmm6, xmm10
603        pxor    xmm7, xmm11
604        movdqa  xmmword ptr [rsp+0x100], xmm8
605        movdqa  xmm8, xmm4
606        psrld   xmm8, 7
607        pslld   xmm4, 25
608        por     xmm4, xmm8
609        movdqa  xmm8, xmm5
610        psrld   xmm8, 7
611        pslld   xmm5, 25
612        por     xmm5, xmm8
613        movdqa  xmm8, xmm6
614        psrld   xmm8, 7
615        pslld   xmm6, 25
616        por     xmm6, xmm8
617        movdqa  xmm8, xmm7
618        psrld   xmm8, 7
619        pslld   xmm7, 25
620        por     xmm7, xmm8
621        paddd   xmm0, xmmword ptr [rsp+0x60]
622        paddd   xmm1, xmmword ptr [rsp+0x90]
623        paddd   xmm2, xmmword ptr [rsp+0xB0]
624        paddd   xmm3, xmmword ptr [rsp+0x80]
625        paddd   xmm0, xmm5
626        paddd   xmm1, xmm6
627        paddd   xmm2, xmm7
628        paddd   xmm3, xmm4
629        pxor    xmm15, xmm0
630        pxor    xmm12, xmm1
631        pxor    xmm13, xmm2
632        pxor    xmm14, xmm3
633        movdqa  xmm8, xmmword ptr [ROT16+rip]
634        pshufb  xmm15, xmm8
635        pshufb  xmm12, xmm8
636        pshufb  xmm13, xmm8
637        pshufb  xmm14, xmm8
638        paddd   xmm10, xmm15
639        paddd   xmm11, xmm12
640        movdqa  xmm8, xmmword ptr [rsp+0x100]
641        paddd   xmm8, xmm13
642        paddd   xmm9, xmm14
643        pxor    xmm5, xmm10
644        pxor    xmm6, xmm11
645        pxor    xmm7, xmm8
646        pxor    xmm4, xmm9
647        movdqa  xmmword ptr [rsp+0x100], xmm8
648        movdqa  xmm8, xmm5
649        psrld   xmm8, 12
650        pslld   xmm5, 20
651        por     xmm5, xmm8
652        movdqa  xmm8, xmm6
653        psrld   xmm8, 12
654        pslld   xmm6, 20
655        por     xmm6, xmm8
656        movdqa  xmm8, xmm7
657        psrld   xmm8, 12
658        pslld   xmm7, 20
659        por     xmm7, xmm8
660        movdqa  xmm8, xmm4
661        psrld   xmm8, 12
662        pslld   xmm4, 20
663        por     xmm4, xmm8
664        paddd   xmm0, xmmword ptr [rsp+0x50]
665        paddd   xmm1, xmmword ptr [rsp]
666        paddd   xmm2, xmmword ptr [rsp+0xF0]
667        paddd   xmm3, xmmword ptr [rsp+0x10]
668        paddd   xmm0, xmm5
669        paddd   xmm1, xmm6
670        paddd   xmm2, xmm7
671        paddd   xmm3, xmm4
672        pxor    xmm15, xmm0
673        pxor    xmm12, xmm1
674        pxor    xmm13, xmm2
675        pxor    xmm14, xmm3
676        movdqa  xmm8, xmmword ptr [ROT8+rip]
677        pshufb  xmm15, xmm8
678        pshufb  xmm12, xmm8
679        pshufb  xmm13, xmm8
680        pshufb  xmm14, xmm8
681        paddd   xmm10, xmm15
682        paddd   xmm11, xmm12
683        movdqa  xmm8, xmmword ptr [rsp+0x100]
684        paddd   xmm8, xmm13
685        paddd   xmm9, xmm14
686        pxor    xmm5, xmm10
687        pxor    xmm6, xmm11
688        pxor    xmm7, xmm8
689        pxor    xmm4, xmm9
690        movdqa  xmmword ptr [rsp+0x100], xmm8
691        movdqa  xmm8, xmm5
692        psrld   xmm8, 7
693        pslld   xmm5, 25
694        por     xmm5, xmm8
695        movdqa  xmm8, xmm6
696        psrld   xmm8, 7
697        pslld   xmm6, 25
698        por     xmm6, xmm8
699        movdqa  xmm8, xmm7
700        psrld   xmm8, 7
701        pslld   xmm7, 25
702        por     xmm7, xmm8
703        movdqa  xmm8, xmm4
704        psrld   xmm8, 7
705        pslld   xmm4, 25
706        por     xmm4, xmm8
707        paddd   xmm0, xmmword ptr [rsp+0xA0]
708        paddd   xmm1, xmmword ptr [rsp+0xC0]
709        paddd   xmm2, xmmword ptr [rsp+0xE0]
710        paddd   xmm3, xmmword ptr [rsp+0xD0]
711        paddd   xmm0, xmm4
712        paddd   xmm1, xmm5
713        paddd   xmm2, xmm6
714        paddd   xmm3, xmm7
715        pxor    xmm12, xmm0
716        pxor    xmm13, xmm1
717        pxor    xmm14, xmm2
718        pxor    xmm15, xmm3
719        movdqa  xmm8, xmmword ptr [ROT16+rip]
720        pshufb  xmm12, xmm8
721        pshufb  xmm13, xmm8
722        pshufb  xmm14, xmm8
723        pshufb  xmm15, xmm8
724        movdqa  xmm8, xmmword ptr [rsp+0x100]
725        paddd   xmm8, xmm12
726        paddd   xmm9, xmm13
727        paddd   xmm10, xmm14
728        paddd   xmm11, xmm15
729        pxor    xmm4, xmm8
730        pxor    xmm5, xmm9
731        pxor    xmm6, xmm10
732        pxor    xmm7, xmm11
733        movdqa  xmmword ptr [rsp+0x100], xmm8
734        movdqa  xmm8, xmm4
735        psrld   xmm8, 12
736        pslld   xmm4, 20
737        por     xmm4, xmm8
738        movdqa  xmm8, xmm5
739        psrld   xmm8, 12
740        pslld   xmm5, 20
741        por     xmm5, xmm8
742        movdqa  xmm8, xmm6
743        psrld   xmm8, 12
744        pslld   xmm6, 20
745        por     xmm6, xmm8
746        movdqa  xmm8, xmm7
747        psrld   xmm8, 12
748        pslld   xmm7, 20
749        por     xmm7, xmm8
750        paddd   xmm0, xmmword ptr [rsp+0x70]
751        paddd   xmm1, xmmword ptr [rsp+0x90]
752        paddd   xmm2, xmmword ptr [rsp+0x30]
753        paddd   xmm3, xmmword ptr [rsp+0xF0]
754        paddd   xmm0, xmm4
755        paddd   xmm1, xmm5
756        paddd   xmm2, xmm6
757        paddd   xmm3, xmm7
758        pxor    xmm12, xmm0
759        pxor    xmm13, xmm1
760        pxor    xmm14, xmm2
761        pxor    xmm15, xmm3
762        movdqa  xmm8, xmmword ptr [ROT8+rip]
763        pshufb  xmm12, xmm8
764        pshufb  xmm13, xmm8
765        pshufb  xmm14, xmm8
766        pshufb  xmm15, xmm8
767        movdqa  xmm8, xmmword ptr [rsp+0x100]
768        paddd   xmm8, xmm12
769        paddd   xmm9, xmm13
770        paddd   xmm10, xmm14
771        paddd   xmm11, xmm15
772        pxor    xmm4, xmm8
773        pxor    xmm5, xmm9
774        pxor    xmm6, xmm10
775        pxor    xmm7, xmm11
776        movdqa  xmmword ptr [rsp+0x100], xmm8
777        movdqa  xmm8, xmm4
778        psrld   xmm8, 7
779        pslld   xmm4, 25
780        por     xmm4, xmm8
781        movdqa  xmm8, xmm5
782        psrld   xmm8, 7
783        pslld   xmm5, 25
784        por     xmm5, xmm8
785        movdqa  xmm8, xmm6
786        psrld   xmm8, 7
787        pslld   xmm6, 25
788        por     xmm6, xmm8
789        movdqa  xmm8, xmm7
790        psrld   xmm8, 7
791        pslld   xmm7, 25
792        por     xmm7, xmm8
793        paddd   xmm0, xmmword ptr [rsp+0x40]
794        paddd   xmm1, xmmword ptr [rsp+0xB0]
795        paddd   xmm2, xmmword ptr [rsp+0x50]
796        paddd   xmm3, xmmword ptr [rsp+0x10]
797        paddd   xmm0, xmm5
798        paddd   xmm1, xmm6
799        paddd   xmm2, xmm7
800        paddd   xmm3, xmm4
801        pxor    xmm15, xmm0
802        pxor    xmm12, xmm1
803        pxor    xmm13, xmm2
804        pxor    xmm14, xmm3
805        movdqa  xmm8, xmmword ptr [ROT16+rip]
806        pshufb  xmm15, xmm8
807        pshufb  xmm12, xmm8
808        pshufb  xmm13, xmm8
809        pshufb  xmm14, xmm8
810        paddd   xmm10, xmm15
811        paddd   xmm11, xmm12
812        movdqa  xmm8, xmmword ptr [rsp+0x100]
813        paddd   xmm8, xmm13
814        paddd   xmm9, xmm14
815        pxor    xmm5, xmm10
816        pxor    xmm6, xmm11
817        pxor    xmm7, xmm8
818        pxor    xmm4, xmm9
819        movdqa  xmmword ptr [rsp+0x100], xmm8
820        movdqa  xmm8, xmm5
821        psrld   xmm8, 12
822        pslld   xmm5, 20
823        por     xmm5, xmm8
824        movdqa  xmm8, xmm6
825        psrld   xmm8, 12
826        pslld   xmm6, 20
827        por     xmm6, xmm8
828        movdqa  xmm8, xmm7
829        psrld   xmm8, 12
830        pslld   xmm7, 20
831        por     xmm7, xmm8
832        movdqa  xmm8, xmm4
833        psrld   xmm8, 12
834        pslld   xmm4, 20
835        por     xmm4, xmm8
836        paddd   xmm0, xmmword ptr [rsp]
837        paddd   xmm1, xmmword ptr [rsp+0x20]
838        paddd   xmm2, xmmword ptr [rsp+0x80]
839        paddd   xmm3, xmmword ptr [rsp+0x60]
840        paddd   xmm0, xmm5
841        paddd   xmm1, xmm6
842        paddd   xmm2, xmm7
843        paddd   xmm3, xmm4
844        pxor    xmm15, xmm0
845        pxor    xmm12, xmm1
846        pxor    xmm13, xmm2
847        pxor    xmm14, xmm3
848        movdqa  xmm8, xmmword ptr [ROT8+rip]
849        pshufb  xmm15, xmm8
850        pshufb  xmm12, xmm8
851        pshufb  xmm13, xmm8
852        pshufb  xmm14, xmm8
853        paddd   xmm10, xmm15
854        paddd   xmm11, xmm12
855        movdqa  xmm8, xmmword ptr [rsp+0x100]
856        paddd   xmm8, xmm13
857        paddd   xmm9, xmm14
858        pxor    xmm5, xmm10
859        pxor    xmm6, xmm11
860        pxor    xmm7, xmm8
861        pxor    xmm4, xmm9
862        movdqa  xmmword ptr [rsp+0x100], xmm8
863        movdqa  xmm8, xmm5
864        psrld   xmm8, 7
865        pslld   xmm5, 25
866        por     xmm5, xmm8
867        movdqa  xmm8, xmm6
868        psrld   xmm8, 7
869        pslld   xmm6, 25
870        por     xmm6, xmm8
871        movdqa  xmm8, xmm7
872        psrld   xmm8, 7
873        pslld   xmm7, 25
874        por     xmm7, xmm8
875        movdqa  xmm8, xmm4
876        psrld   xmm8, 7
877        pslld   xmm4, 25
878        por     xmm4, xmm8
879        paddd   xmm0, xmmword ptr [rsp+0xC0]
880        paddd   xmm1, xmmword ptr [rsp+0x90]
881        paddd   xmm2, xmmword ptr [rsp+0xF0]
882        paddd   xmm3, xmmword ptr [rsp+0xE0]
883        paddd   xmm0, xmm4
884        paddd   xmm1, xmm5
885        paddd   xmm2, xmm6
886        paddd   xmm3, xmm7
887        pxor    xmm12, xmm0
888        pxor    xmm13, xmm1
889        pxor    xmm14, xmm2
890        pxor    xmm15, xmm3
891        movdqa  xmm8, xmmword ptr [ROT16+rip]
892        pshufb  xmm12, xmm8
893        pshufb  xmm13, xmm8
894        pshufb  xmm14, xmm8
895        pshufb  xmm15, xmm8
896        movdqa  xmm8, xmmword ptr [rsp+0x100]
897        paddd   xmm8, xmm12
898        paddd   xmm9, xmm13
899        paddd   xmm10, xmm14
900        paddd   xmm11, xmm15
901        pxor    xmm4, xmm8
902        pxor    xmm5, xmm9
903        pxor    xmm6, xmm10
904        pxor    xmm7, xmm11
905        movdqa  xmmword ptr [rsp+0x100], xmm8
906        movdqa  xmm8, xmm4
907        psrld   xmm8, 12
908        pslld   xmm4, 20
909        por     xmm4, xmm8
910        movdqa  xmm8, xmm5
911        psrld   xmm8, 12
912        pslld   xmm5, 20
913        por     xmm5, xmm8
914        movdqa  xmm8, xmm6
915        psrld   xmm8, 12
916        pslld   xmm6, 20
917        por     xmm6, xmm8
918        movdqa  xmm8, xmm7
919        psrld   xmm8, 12
920        pslld   xmm7, 20
921        por     xmm7, xmm8
922        paddd   xmm0, xmmword ptr [rsp+0xD0]
923        paddd   xmm1, xmmword ptr [rsp+0xB0]
924        paddd   xmm2, xmmword ptr [rsp+0xA0]
925        paddd   xmm3, xmmword ptr [rsp+0x80]
926        paddd   xmm0, xmm4
927        paddd   xmm1, xmm5
928        paddd   xmm2, xmm6
929        paddd   xmm3, xmm7
930        pxor    xmm12, xmm0
931        pxor    xmm13, xmm1
932        pxor    xmm14, xmm2
933        pxor    xmm15, xmm3
934        movdqa  xmm8, xmmword ptr [ROT8+rip]
935        pshufb  xmm12, xmm8
936        pshufb  xmm13, xmm8
937        pshufb  xmm14, xmm8
938        pshufb  xmm15, xmm8
939        movdqa  xmm8, xmmword ptr [rsp+0x100]
940        paddd   xmm8, xmm12
941        paddd   xmm9, xmm13
942        paddd   xmm10, xmm14
943        paddd   xmm11, xmm15
944        pxor    xmm4, xmm8
945        pxor    xmm5, xmm9
946        pxor    xmm6, xmm10
947        pxor    xmm7, xmm11
948        movdqa  xmmword ptr [rsp+0x100], xmm8
949        movdqa  xmm8, xmm4
950        psrld   xmm8, 7
951        pslld   xmm4, 25
952        por     xmm4, xmm8
953        movdqa  xmm8, xmm5
954        psrld   xmm8, 7
955        pslld   xmm5, 25
956        por     xmm5, xmm8
957        movdqa  xmm8, xmm6
958        psrld   xmm8, 7
959        pslld   xmm6, 25
960        por     xmm6, xmm8
961        movdqa  xmm8, xmm7
962        psrld   xmm8, 7
963        pslld   xmm7, 25
964        por     xmm7, xmm8
965        paddd   xmm0, xmmword ptr [rsp+0x70]
966        paddd   xmm1, xmmword ptr [rsp+0x50]
967        paddd   xmm2, xmmword ptr [rsp]
968        paddd   xmm3, xmmword ptr [rsp+0x60]
969        paddd   xmm0, xmm5
970        paddd   xmm1, xmm6
971        paddd   xmm2, xmm7
972        paddd   xmm3, xmm4
973        pxor    xmm15, xmm0
974        pxor    xmm12, xmm1
975        pxor    xmm13, xmm2
976        pxor    xmm14, xmm3
977        movdqa  xmm8, xmmword ptr [ROT16+rip]
978        pshufb  xmm15, xmm8
979        pshufb  xmm12, xmm8
980        pshufb  xmm13, xmm8
981        pshufb  xmm14, xmm8
982        paddd   xmm10, xmm15
983        paddd   xmm11, xmm12
984        movdqa  xmm8, xmmword ptr [rsp+0x100]
985        paddd   xmm8, xmm13
986        paddd   xmm9, xmm14
987        pxor    xmm5, xmm10
988        pxor    xmm6, xmm11
989        pxor    xmm7, xmm8
990        pxor    xmm4, xmm9
991        movdqa  xmmword ptr [rsp+0x100], xmm8
992        movdqa  xmm8, xmm5
993        psrld   xmm8, 12
994        pslld   xmm5, 20
995        por     xmm5, xmm8
996        movdqa  xmm8, xmm6
997        psrld   xmm8, 12
998        pslld   xmm6, 20
999        por     xmm6, xmm8
1000        movdqa  xmm8, xmm7
1001        psrld   xmm8, 12
1002        pslld   xmm7, 20
1003        por     xmm7, xmm8
1004        movdqa  xmm8, xmm4
1005        psrld   xmm8, 12
1006        pslld   xmm4, 20
1007        por     xmm4, xmm8
1008        paddd   xmm0, xmmword ptr [rsp+0x20]
1009        paddd   xmm1, xmmword ptr [rsp+0x30]
1010        paddd   xmm2, xmmword ptr [rsp+0x10]
1011        paddd   xmm3, xmmword ptr [rsp+0x40]
1012        paddd   xmm0, xmm5
1013        paddd   xmm1, xmm6
1014        paddd   xmm2, xmm7
1015        paddd   xmm3, xmm4
1016        pxor    xmm15, xmm0
1017        pxor    xmm12, xmm1
1018        pxor    xmm13, xmm2
1019        pxor    xmm14, xmm3
1020        movdqa  xmm8, xmmword ptr [ROT8+rip]
1021        pshufb  xmm15, xmm8
1022        pshufb  xmm12, xmm8
1023        pshufb  xmm13, xmm8
1024        pshufb  xmm14, xmm8
1025        paddd   xmm10, xmm15
1026        paddd   xmm11, xmm12
1027        movdqa  xmm8, xmmword ptr [rsp+0x100]
1028        paddd   xmm8, xmm13
1029        paddd   xmm9, xmm14
1030        pxor    xmm5, xmm10
1031        pxor    xmm6, xmm11
1032        pxor    xmm7, xmm8
1033        pxor    xmm4, xmm9
1034        movdqa  xmmword ptr [rsp+0x100], xmm8
1035        movdqa  xmm8, xmm5
1036        psrld   xmm8, 7
1037        pslld   xmm5, 25
1038        por     xmm5, xmm8
1039        movdqa  xmm8, xmm6
1040        psrld   xmm8, 7
1041        pslld   xmm6, 25
1042        por     xmm6, xmm8
1043        movdqa  xmm8, xmm7
1044        psrld   xmm8, 7
1045        pslld   xmm7, 25
1046        por     xmm7, xmm8
1047        movdqa  xmm8, xmm4
1048        psrld   xmm8, 7
1049        pslld   xmm4, 25
1050        por     xmm4, xmm8
1051        paddd   xmm0, xmmword ptr [rsp+0x90]
1052        paddd   xmm1, xmmword ptr [rsp+0xB0]
1053        paddd   xmm2, xmmword ptr [rsp+0x80]
1054        paddd   xmm3, xmmword ptr [rsp+0xF0]
1055        paddd   xmm0, xmm4
1056        paddd   xmm1, xmm5
1057        paddd   xmm2, xmm6
1058        paddd   xmm3, xmm7
1059        pxor    xmm12, xmm0
1060        pxor    xmm13, xmm1
1061        pxor    xmm14, xmm2
1062        pxor    xmm15, xmm3
1063        movdqa  xmm8, xmmword ptr [ROT16+rip]
1064        pshufb  xmm12, xmm8
1065        pshufb  xmm13, xmm8
1066        pshufb  xmm14, xmm8
1067        pshufb  xmm15, xmm8
1068        movdqa  xmm8, xmmword ptr [rsp+0x100]
1069        paddd   xmm8, xmm12
1070        paddd   xmm9, xmm13
1071        paddd   xmm10, xmm14
1072        paddd   xmm11, xmm15
1073        pxor    xmm4, xmm8
1074        pxor    xmm5, xmm9
1075        pxor    xmm6, xmm10
1076        pxor    xmm7, xmm11
1077        movdqa  xmmword ptr [rsp+0x100], xmm8
1078        movdqa  xmm8, xmm4
1079        psrld   xmm8, 12
1080        pslld   xmm4, 20
1081        por     xmm4, xmm8
1082        movdqa  xmm8, xmm5
1083        psrld   xmm8, 12
1084        pslld   xmm5, 20
1085        por     xmm5, xmm8
1086        movdqa  xmm8, xmm6
1087        psrld   xmm8, 12
1088        pslld   xmm6, 20
1089        por     xmm6, xmm8
1090        movdqa  xmm8, xmm7
1091        psrld   xmm8, 12
1092        pslld   xmm7, 20
1093        por     xmm7, xmm8
1094        paddd   xmm0, xmmword ptr [rsp+0xE0]
1095        paddd   xmm1, xmmword ptr [rsp+0x50]
1096        paddd   xmm2, xmmword ptr [rsp+0xC0]
1097        paddd   xmm3, xmmword ptr [rsp+0x10]
1098        paddd   xmm0, xmm4
1099        paddd   xmm1, xmm5
1100        paddd   xmm2, xmm6
1101        paddd   xmm3, xmm7
1102        pxor    xmm12, xmm0
1103        pxor    xmm13, xmm1
1104        pxor    xmm14, xmm2
1105        pxor    xmm15, xmm3
1106        movdqa  xmm8, xmmword ptr [ROT8+rip]
1107        pshufb  xmm12, xmm8
1108        pshufb  xmm13, xmm8
1109        pshufb  xmm14, xmm8
1110        pshufb  xmm15, xmm8
1111        movdqa  xmm8, xmmword ptr [rsp+0x100]
1112        paddd   xmm8, xmm12
1113        paddd   xmm9, xmm13
1114        paddd   xmm10, xmm14
1115        paddd   xmm11, xmm15
1116        pxor    xmm4, xmm8
1117        pxor    xmm5, xmm9
1118        pxor    xmm6, xmm10
1119        pxor    xmm7, xmm11
1120        movdqa  xmmword ptr [rsp+0x100], xmm8
1121        movdqa  xmm8, xmm4
1122        psrld   xmm8, 7
1123        pslld   xmm4, 25
1124        por     xmm4, xmm8
1125        movdqa  xmm8, xmm5
1126        psrld   xmm8, 7
1127        pslld   xmm5, 25
1128        por     xmm5, xmm8
1129        movdqa  xmm8, xmm6
1130        psrld   xmm8, 7
1131        pslld   xmm6, 25
1132        por     xmm6, xmm8
1133        movdqa  xmm8, xmm7
1134        psrld   xmm8, 7
1135        pslld   xmm7, 25
1136        por     xmm7, xmm8
1137        paddd   xmm0, xmmword ptr [rsp+0xD0]
1138        paddd   xmm1, xmmword ptr [rsp]
1139        paddd   xmm2, xmmword ptr [rsp+0x20]
1140        paddd   xmm3, xmmword ptr [rsp+0x40]
1141        paddd   xmm0, xmm5
1142        paddd   xmm1, xmm6
1143        paddd   xmm2, xmm7
1144        paddd   xmm3, xmm4
1145        pxor    xmm15, xmm0
1146        pxor    xmm12, xmm1
1147        pxor    xmm13, xmm2
1148        pxor    xmm14, xmm3
1149        movdqa  xmm8, xmmword ptr [ROT16+rip]
1150        pshufb  xmm15, xmm8
1151        pshufb  xmm12, xmm8
1152        pshufb  xmm13, xmm8
1153        pshufb  xmm14, xmm8
1154        paddd   xmm10, xmm15
1155        paddd   xmm11, xmm12
1156        movdqa  xmm8, xmmword ptr [rsp+0x100]
1157        paddd   xmm8, xmm13
1158        paddd   xmm9, xmm14
1159        pxor    xmm5, xmm10
1160        pxor    xmm6, xmm11
1161        pxor    xmm7, xmm8
1162        pxor    xmm4, xmm9
1163        movdqa  xmmword ptr [rsp+0x100], xmm8
1164        movdqa  xmm8, xmm5
1165        psrld   xmm8, 12
1166        pslld   xmm5, 20
1167        por     xmm5, xmm8
1168        movdqa  xmm8, xmm6
1169        psrld   xmm8, 12
1170        pslld   xmm6, 20
1171        por     xmm6, xmm8
1172        movdqa  xmm8, xmm7
1173        psrld   xmm8, 12
1174        pslld   xmm7, 20
1175        por     xmm7, xmm8
1176        movdqa  xmm8, xmm4
1177        psrld   xmm8, 12
1178        pslld   xmm4, 20
1179        por     xmm4, xmm8
1180        paddd   xmm0, xmmword ptr [rsp+0x30]
1181        paddd   xmm1, xmmword ptr [rsp+0xA0]
1182        paddd   xmm2, xmmword ptr [rsp+0x60]
1183        paddd   xmm3, xmmword ptr [rsp+0x70]
1184        paddd   xmm0, xmm5
1185        paddd   xmm1, xmm6
1186        paddd   xmm2, xmm7
1187        paddd   xmm3, xmm4
1188        pxor    xmm15, xmm0
1189        pxor    xmm12, xmm1
1190        pxor    xmm13, xmm2
1191        pxor    xmm14, xmm3
1192        movdqa  xmm8, xmmword ptr [ROT8+rip]
1193        pshufb  xmm15, xmm8
1194        pshufb  xmm12, xmm8
1195        pshufb  xmm13, xmm8
1196        pshufb  xmm14, xmm8
1197        paddd   xmm10, xmm15
1198        paddd   xmm11, xmm12
1199        movdqa  xmm8, xmmword ptr [rsp+0x100]
1200        paddd   xmm8, xmm13
1201        paddd   xmm9, xmm14
1202        pxor    xmm5, xmm10
1203        pxor    xmm6, xmm11
1204        pxor    xmm7, xmm8
1205        pxor    xmm4, xmm9
1206        movdqa  xmmword ptr [rsp+0x100], xmm8
1207        movdqa  xmm8, xmm5
1208        psrld   xmm8, 7
1209        pslld   xmm5, 25
1210        por     xmm5, xmm8
1211        movdqa  xmm8, xmm6
1212        psrld   xmm8, 7
1213        pslld   xmm6, 25
1214        por     xmm6, xmm8
1215        movdqa  xmm8, xmm7
1216        psrld   xmm8, 7
1217        pslld   xmm7, 25
1218        por     xmm7, xmm8
1219        movdqa  xmm8, xmm4
1220        psrld   xmm8, 7
1221        pslld   xmm4, 25
1222        por     xmm4, xmm8
1223        paddd   xmm0, xmmword ptr [rsp+0xB0]
1224        paddd   xmm1, xmmword ptr [rsp+0x50]
1225        paddd   xmm2, xmmword ptr [rsp+0x10]
1226        paddd   xmm3, xmmword ptr [rsp+0x80]
1227        paddd   xmm0, xmm4
1228        paddd   xmm1, xmm5
1229        paddd   xmm2, xmm6
1230        paddd   xmm3, xmm7
1231        pxor    xmm12, xmm0
1232        pxor    xmm13, xmm1
1233        pxor    xmm14, xmm2
1234        pxor    xmm15, xmm3
1235        movdqa  xmm8, xmmword ptr [ROT16+rip]
1236        pshufb  xmm12, xmm8
1237        pshufb  xmm13, xmm8
1238        pshufb  xmm14, xmm8
1239        pshufb  xmm15, xmm8
1240        movdqa  xmm8, xmmword ptr [rsp+0x100]
1241        paddd   xmm8, xmm12
1242        paddd   xmm9, xmm13
1243        paddd   xmm10, xmm14
1244        paddd   xmm11, xmm15
1245        pxor    xmm4, xmm8
1246        pxor    xmm5, xmm9
1247        pxor    xmm6, xmm10
1248        pxor    xmm7, xmm11
1249        movdqa  xmmword ptr [rsp+0x100], xmm8
1250        movdqa  xmm8, xmm4
1251        psrld   xmm8, 12
1252        pslld   xmm4, 20
1253        por     xmm4, xmm8
1254        movdqa  xmm8, xmm5
1255        psrld   xmm8, 12
1256        pslld   xmm5, 20
1257        por     xmm5, xmm8
1258        movdqa  xmm8, xmm6
1259        psrld   xmm8, 12
1260        pslld   xmm6, 20
1261        por     xmm6, xmm8
1262        movdqa  xmm8, xmm7
1263        psrld   xmm8, 12
1264        pslld   xmm7, 20
1265        por     xmm7, xmm8
1266        paddd   xmm0, xmmword ptr [rsp+0xF0]
1267        paddd   xmm1, xmmword ptr [rsp]
1268        paddd   xmm2, xmmword ptr [rsp+0x90]
1269        paddd   xmm3, xmmword ptr [rsp+0x60]
1270        paddd   xmm0, xmm4
1271        paddd   xmm1, xmm5
1272        paddd   xmm2, xmm6
1273        paddd   xmm3, xmm7
1274        pxor    xmm12, xmm0
1275        pxor    xmm13, xmm1
1276        pxor    xmm14, xmm2
1277        pxor    xmm15, xmm3
1278        movdqa  xmm8, xmmword ptr [ROT8+rip]
1279        pshufb  xmm12, xmm8
1280        pshufb  xmm13, xmm8
1281        pshufb  xmm14, xmm8
1282        pshufb  xmm15, xmm8
1283        movdqa  xmm8, xmmword ptr [rsp+0x100]
1284        paddd   xmm8, xmm12
1285        paddd   xmm9, xmm13
1286        paddd   xmm10, xmm14
1287        paddd   xmm11, xmm15
1288        pxor    xmm4, xmm8
1289        pxor    xmm5, xmm9
1290        pxor    xmm6, xmm10
1291        pxor    xmm7, xmm11
1292        movdqa  xmmword ptr [rsp+0x100], xmm8
1293        movdqa  xmm8, xmm4
1294        psrld   xmm8, 7
1295        pslld   xmm4, 25
1296        por     xmm4, xmm8
1297        movdqa  xmm8, xmm5
1298        psrld   xmm8, 7
1299        pslld   xmm5, 25
1300        por     xmm5, xmm8
1301        movdqa  xmm8, xmm6
1302        psrld   xmm8, 7
1303        pslld   xmm6, 25
1304        por     xmm6, xmm8
1305        movdqa  xmm8, xmm7
1306        psrld   xmm8, 7
1307        pslld   xmm7, 25
1308        por     xmm7, xmm8
1309        paddd   xmm0, xmmword ptr [rsp+0xE0]
1310        paddd   xmm1, xmmword ptr [rsp+0x20]
1311        paddd   xmm2, xmmword ptr [rsp+0x30]
1312        paddd   xmm3, xmmword ptr [rsp+0x70]
1313        paddd   xmm0, xmm5
1314        paddd   xmm1, xmm6
1315        paddd   xmm2, xmm7
1316        paddd   xmm3, xmm4
1317        pxor    xmm15, xmm0
1318        pxor    xmm12, xmm1
1319        pxor    xmm13, xmm2
1320        pxor    xmm14, xmm3
1321        movdqa  xmm8, xmmword ptr [ROT16+rip]
1322        pshufb  xmm15, xmm8
1323        pshufb  xmm12, xmm8
1324        pshufb  xmm13, xmm8
1325        pshufb  xmm14, xmm8
1326        paddd   xmm10, xmm15
1327        paddd   xmm11, xmm12
1328        movdqa  xmm8, xmmword ptr [rsp+0x100]
1329        paddd   xmm8, xmm13
1330        paddd   xmm9, xmm14
1331        pxor    xmm5, xmm10
1332        pxor    xmm6, xmm11
1333        pxor    xmm7, xmm8
1334        pxor    xmm4, xmm9
1335        movdqa  xmmword ptr [rsp+0x100], xmm8
1336        movdqa  xmm8, xmm5
1337        psrld   xmm8, 12
1338        pslld   xmm5, 20
1339        por     xmm5, xmm8
1340        movdqa  xmm8, xmm6
1341        psrld   xmm8, 12
1342        pslld   xmm6, 20
1343        por     xmm6, xmm8
1344        movdqa  xmm8, xmm7
1345        psrld   xmm8, 12
1346        pslld   xmm7, 20
1347        por     xmm7, xmm8
1348        movdqa  xmm8, xmm4
1349        psrld   xmm8, 12
1350        pslld   xmm4, 20
1351        por     xmm4, xmm8
1352        paddd   xmm0, xmmword ptr [rsp+0xA0]
1353        paddd   xmm1, xmmword ptr [rsp+0xC0]
1354        paddd   xmm2, xmmword ptr [rsp+0x40]
1355        paddd   xmm3, xmmword ptr [rsp+0xD0]
1356        paddd   xmm0, xmm5
1357        paddd   xmm1, xmm6
1358        paddd   xmm2, xmm7
1359        paddd   xmm3, xmm4
1360        pxor    xmm15, xmm0
1361        pxor    xmm12, xmm1
1362        pxor    xmm13, xmm2
1363        pxor    xmm14, xmm3
1364        movdqa  xmm8, xmmword ptr [ROT8+rip]
1365        pshufb  xmm15, xmm8
1366        pshufb  xmm12, xmm8
1367        pshufb  xmm13, xmm8
1368        pshufb  xmm14, xmm8
1369        paddd   xmm10, xmm15
1370        paddd   xmm11, xmm12
1371        movdqa  xmm8, xmmword ptr [rsp+0x100]
1372        paddd   xmm8, xmm13
1373        paddd   xmm9, xmm14
1374        pxor    xmm5, xmm10
1375        pxor    xmm6, xmm11
1376        pxor    xmm7, xmm8
1377        pxor    xmm4, xmm9
1378        pxor    xmm0, xmm8
1379        pxor    xmm1, xmm9
1380        pxor    xmm2, xmm10
1381        pxor    xmm3, xmm11
1382        movdqa  xmm8, xmm5
1383        psrld   xmm8, 7
1384        pslld   xmm5, 25
1385        por     xmm5, xmm8
1386        movdqa  xmm8, xmm6
1387        psrld   xmm8, 7
1388        pslld   xmm6, 25
1389        por     xmm6, xmm8
1390        movdqa  xmm8, xmm7
1391        psrld   xmm8, 7
1392        pslld   xmm7, 25
1393        por     xmm7, xmm8
1394        movdqa  xmm8, xmm4
1395        psrld   xmm8, 7
1396        pslld   xmm4, 25
1397        por     xmm4, xmm8
1398        pxor    xmm4, xmm12
1399        pxor    xmm5, xmm13
1400        pxor    xmm6, xmm14
1401        pxor    xmm7, xmm15
1402        mov     eax, r13d
1403        jne     9b
1404        movdqa  xmm9, xmm0
1405        punpckldq xmm0, xmm1
1406        punpckhdq xmm9, xmm1
1407        movdqa  xmm11, xmm2
1408        punpckldq xmm2, xmm3
1409        punpckhdq xmm11, xmm3
1410        movdqa  xmm1, xmm0
1411        punpcklqdq xmm0, xmm2
1412        punpckhqdq xmm1, xmm2
1413        movdqa  xmm3, xmm9
1414        punpcklqdq xmm9, xmm11
1415        punpckhqdq xmm3, xmm11
1416        movdqu  xmmword ptr [rbx], xmm0
1417        movdqu  xmmword ptr [rbx+0x20], xmm1
1418        movdqu  xmmword ptr [rbx+0x40], xmm9
1419        movdqu  xmmword ptr [rbx+0x60], xmm3
1420        movdqa  xmm9, xmm4
1421        punpckldq xmm4, xmm5
1422        punpckhdq xmm9, xmm5
1423        movdqa  xmm11, xmm6
1424        punpckldq xmm6, xmm7
1425        punpckhdq xmm11, xmm7
1426        movdqa  xmm5, xmm4
1427        punpcklqdq xmm4, xmm6
1428        punpckhqdq xmm5, xmm6
1429        movdqa  xmm7, xmm9
1430        punpcklqdq xmm9, xmm11
1431        punpckhqdq xmm7, xmm11
1432        movdqu  xmmword ptr [rbx+0x10], xmm4
1433        movdqu  xmmword ptr [rbx+0x30], xmm5
1434        movdqu  xmmword ptr [rbx+0x50], xmm9
1435        movdqu  xmmword ptr [rbx+0x70], xmm7
1436        movdqa  xmm1, xmmword ptr [rsp+0x110]
1437        movdqa  xmm0, xmm1
1438        paddd   xmm1, xmmword ptr [rsp+0x150]
1439        movdqa  xmmword ptr [rsp+0x110], xmm1
1440        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1441        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1442        pcmpgtd xmm0, xmm1
1443        movdqa  xmm1, xmmword ptr [rsp+0x120]
1444        psubd   xmm1, xmm0
1445        movdqa  xmmword ptr [rsp+0x120], xmm1
1446        add     rbx, 128
1447        add     rdi, 32
1448        sub     rsi, 4
1449        cmp     rsi, 4
1450        jnc     2b
1451        test    rsi, rsi
1452        jnz     3f
14534:
1454        mov     rsp, rbp
1455        pop     rbp
1456        pop     rbx
1457        pop     r12
1458        pop     r13
1459        pop     r14
1460        pop     r15
1461        RET
1462.p2align 5
14633:
1464        test    esi, 0x2
1465        je      3f
1466        movups  xmm0, xmmword ptr [rcx]
1467        movups  xmm1, xmmword ptr [rcx+0x10]
1468        movaps  xmm8, xmm0
1469        movaps  xmm9, xmm1
1470        movd    xmm13, dword ptr [rsp+0x110]
1471        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1472        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1473        movaps  xmmword ptr [rsp], xmm13
1474        movd    xmm14, dword ptr [rsp+0x114]
1475        pinsrd  xmm14, dword ptr [rsp+0x124], 1
1476        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1477        movaps  xmmword ptr [rsp+0x10], xmm14
1478        mov     r8, qword ptr [rdi]
1479        mov     r9, qword ptr [rdi+0x8]
1480        movzx   eax, byte ptr [rbp+0x40]
1481        or      eax, r13d
1482        xor     edx, edx
14832:
1484        mov     r14d, eax
1485        or      eax, r12d
1486        add     rdx, 64
1487        cmp     rdx, r15
1488        cmovne  eax, r14d
1489        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1490        movaps  xmm10, xmm2
1491        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1492        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1493        movaps  xmm3, xmm4
1494        shufps  xmm4, xmm5, 136
1495        shufps  xmm3, xmm5, 221
1496        movaps  xmm5, xmm3
1497        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1498        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1499        movaps  xmm3, xmm6
1500        shufps  xmm6, xmm7, 136
1501        pshufd  xmm6, xmm6, 0x93
1502        shufps  xmm3, xmm7, 221
1503        pshufd  xmm7, xmm3, 0x93
1504        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1505        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1506        movaps  xmm11, xmm12
1507        shufps  xmm12, xmm13, 136
1508        shufps  xmm11, xmm13, 221
1509        movaps  xmm13, xmm11
1510        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1511        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1512        movaps  xmm11, xmm14
1513        shufps  xmm14, xmm15, 136
1514        pshufd  xmm14, xmm14, 0x93
1515        shufps  xmm11, xmm15, 221
1516        pshufd  xmm15, xmm11, 0x93
1517        movaps  xmm3, xmmword ptr [rsp]
1518        movaps  xmm11, xmmword ptr [rsp+0x10]
1519        pinsrd  xmm3, eax, 3
1520        pinsrd  xmm11, eax, 3
1521        mov     al, 7
15229:
1523        paddd   xmm0, xmm4
1524        paddd   xmm8, xmm12
1525        movaps  xmmword ptr [rsp+0x20], xmm4
1526        movaps  xmmword ptr [rsp+0x30], xmm12
1527        paddd   xmm0, xmm1
1528        paddd   xmm8, xmm9
1529        pxor    xmm3, xmm0
1530        pxor    xmm11, xmm8
1531        movaps  xmm12, xmmword ptr [ROT16+rip]
1532        pshufb  xmm3, xmm12
1533        pshufb  xmm11, xmm12
1534        paddd   xmm2, xmm3
1535        paddd   xmm10, xmm11
1536        pxor    xmm1, xmm2
1537        pxor    xmm9, xmm10
1538        movdqa  xmm4, xmm1
1539        pslld   xmm1, 20
1540        psrld   xmm4, 12
1541        por     xmm1, xmm4
1542        movdqa  xmm4, xmm9
1543        pslld   xmm9, 20
1544        psrld   xmm4, 12
1545        por     xmm9, xmm4
1546        paddd   xmm0, xmm5
1547        paddd   xmm8, xmm13
1548        movaps  xmmword ptr [rsp+0x40], xmm5
1549        movaps  xmmword ptr [rsp+0x50], xmm13
1550        paddd   xmm0, xmm1
1551        paddd   xmm8, xmm9
1552        pxor    xmm3, xmm0
1553        pxor    xmm11, xmm8
1554        movaps  xmm13, xmmword ptr [ROT8+rip]
1555        pshufb  xmm3, xmm13
1556        pshufb  xmm11, xmm13
1557        paddd   xmm2, xmm3
1558        paddd   xmm10, xmm11
1559        pxor    xmm1, xmm2
1560        pxor    xmm9, xmm10
1561        movdqa  xmm4, xmm1
1562        pslld   xmm1, 25
1563        psrld   xmm4, 7
1564        por     xmm1, xmm4
1565        movdqa  xmm4, xmm9
1566        pslld   xmm9, 25
1567        psrld   xmm4, 7
1568        por     xmm9, xmm4
1569        pshufd  xmm0, xmm0, 0x93
1570        pshufd  xmm8, xmm8, 0x93
1571        pshufd  xmm3, xmm3, 0x4E
1572        pshufd  xmm11, xmm11, 0x4E
1573        pshufd  xmm2, xmm2, 0x39
1574        pshufd  xmm10, xmm10, 0x39
1575        paddd   xmm0, xmm6
1576        paddd   xmm8, xmm14
1577        paddd   xmm0, xmm1
1578        paddd   xmm8, xmm9
1579        pxor    xmm3, xmm0
1580        pxor    xmm11, xmm8
1581        pshufb  xmm3, xmm12
1582        pshufb  xmm11, xmm12
1583        paddd   xmm2, xmm3
1584        paddd   xmm10, xmm11
1585        pxor    xmm1, xmm2
1586        pxor    xmm9, xmm10
1587        movdqa  xmm4, xmm1
1588        pslld   xmm1, 20
1589        psrld   xmm4, 12
1590        por     xmm1, xmm4
1591        movdqa  xmm4, xmm9
1592        pslld   xmm9, 20
1593        psrld   xmm4, 12
1594        por     xmm9, xmm4
1595        paddd   xmm0, xmm7
1596        paddd   xmm8, xmm15
1597        paddd   xmm0, xmm1
1598        paddd   xmm8, xmm9
1599        pxor    xmm3, xmm0
1600        pxor    xmm11, xmm8
1601        pshufb  xmm3, xmm13
1602        pshufb  xmm11, xmm13
1603        paddd   xmm2, xmm3
1604        paddd   xmm10, xmm11
1605        pxor    xmm1, xmm2
1606        pxor    xmm9, xmm10
1607        movdqa  xmm4, xmm1
1608        pslld   xmm1, 25
1609        psrld   xmm4, 7
1610        por     xmm1, xmm4
1611        movdqa  xmm4, xmm9
1612        pslld   xmm9, 25
1613        psrld   xmm4, 7
1614        por     xmm9, xmm4
1615        pshufd  xmm0, xmm0, 0x39
1616        pshufd  xmm8, xmm8, 0x39
1617        pshufd  xmm3, xmm3, 0x4E
1618        pshufd  xmm11, xmm11, 0x4E
1619        pshufd  xmm2, xmm2, 0x93
1620        pshufd  xmm10, xmm10, 0x93
1621        dec     al
1622        je      9f
1623        movdqa  xmm12, xmmword ptr [rsp+0x20]
1624        movdqa  xmm5, xmmword ptr [rsp+0x40]
1625        pshufd  xmm13, xmm12, 0x0F
1626        shufps  xmm12, xmm5, 214
1627        pshufd  xmm4, xmm12, 0x39
1628        movdqa  xmm12, xmm6
1629        shufps  xmm12, xmm7, 250
1630        pblendw xmm13, xmm12, 0xCC
1631        movdqa  xmm12, xmm7
1632        punpcklqdq xmm12, xmm5
1633        pblendw xmm12, xmm6, 0xC0
1634        pshufd  xmm12, xmm12, 0x78
1635        punpckhdq xmm5, xmm7
1636        punpckldq xmm6, xmm5
1637        pshufd  xmm7, xmm6, 0x1E
1638        movdqa  xmmword ptr [rsp+0x20], xmm13
1639        movdqa  xmmword ptr [rsp+0x40], xmm12
1640        movdqa  xmm5, xmmword ptr [rsp+0x30]
1641        movdqa  xmm13, xmmword ptr [rsp+0x50]
1642        pshufd  xmm6, xmm5, 0x0F
1643        shufps  xmm5, xmm13, 214
1644        pshufd  xmm12, xmm5, 0x39
1645        movdqa  xmm5, xmm14
1646        shufps  xmm5, xmm15, 250
1647        pblendw xmm6, xmm5, 0xCC
1648        movdqa  xmm5, xmm15
1649        punpcklqdq xmm5, xmm13
1650        pblendw xmm5, xmm14, 0xC0
1651        pshufd  xmm5, xmm5, 0x78
1652        punpckhdq xmm13, xmm15
1653        punpckldq xmm14, xmm13
1654        pshufd  xmm15, xmm14, 0x1E
1655        movdqa  xmm13, xmm6
1656        movdqa  xmm14, xmm5
1657        movdqa  xmm5, xmmword ptr [rsp+0x20]
1658        movdqa  xmm6, xmmword ptr [rsp+0x40]
1659        jmp     9b
16609:
1661        pxor    xmm0, xmm2
1662        pxor    xmm1, xmm3
1663        pxor    xmm8, xmm10
1664        pxor    xmm9, xmm11
1665        mov     eax, r13d
1666        cmp     rdx, r15
1667        jne     2b
1668        movups  xmmword ptr [rbx], xmm0
1669        movups  xmmword ptr [rbx+0x10], xmm1
1670        movups  xmmword ptr [rbx+0x20], xmm8
1671        movups  xmmword ptr [rbx+0x30], xmm9
1672        movdqa  xmm0, xmmword ptr [rsp+0x130]
1673        movdqa  xmm1, xmmword ptr [rsp+0x110]
1674        movdqa  xmm2, xmmword ptr [rsp+0x120]
1675        movdqu  xmm3, xmmword ptr [rsp+0x118]
1676        movdqu  xmm4, xmmword ptr [rsp+0x128]
1677        blendvps xmm1, xmm3, xmm0
1678        blendvps xmm2, xmm4, xmm0
1679        movdqa  xmmword ptr [rsp+0x110], xmm1
1680        movdqa  xmmword ptr [rsp+0x120], xmm2
1681        add     rdi, 16
1682        add     rbx, 64
1683        sub     rsi, 2
16843:
1685        test    esi, 0x1
1686        je      4b
1687        movups  xmm0, xmmword ptr [rcx]
1688        movups  xmm1, xmmword ptr [rcx+0x10]
1689        movd    xmm13, dword ptr [rsp+0x110]
1690        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1691        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1692        movaps  xmm14, xmmword ptr [ROT8+rip]
1693        movaps  xmm15, xmmword ptr [ROT16+rip]
1694        mov     r8, qword ptr [rdi]
1695        movzx   eax, byte ptr [rbp+0x40]
1696        or      eax, r13d
1697        xor     edx, edx
16982:
1699        mov     r14d, eax
1700        or      eax, r12d
1701        add     rdx, 64
1702        cmp     rdx, r15
1703        cmovne  eax, r14d
1704        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1705        movaps  xmm3, xmm13
1706        pinsrd  xmm3, eax, 3
1707        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1708        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1709        movaps  xmm8, xmm4
1710        shufps  xmm4, xmm5, 136
1711        shufps  xmm8, xmm5, 221
1712        movaps  xmm5, xmm8
1713        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1714        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1715        movaps  xmm8, xmm6
1716        shufps  xmm6, xmm7, 136
1717        pshufd  xmm6, xmm6, 0x93
1718        shufps  xmm8, xmm7, 221
1719        pshufd  xmm7, xmm8, 0x93
1720        mov     al, 7
17219:
1722        paddd   xmm0, xmm4
1723        paddd   xmm0, xmm1
1724        pxor    xmm3, xmm0
1725        pshufb  xmm3, xmm15
1726        paddd   xmm2, xmm3
1727        pxor    xmm1, xmm2
1728        movdqa  xmm11, xmm1
1729        pslld   xmm1, 20
1730        psrld   xmm11, 12
1731        por     xmm1, xmm11
1732        paddd   xmm0, xmm5
1733        paddd   xmm0, xmm1
1734        pxor    xmm3, xmm0
1735        pshufb  xmm3, xmm14
1736        paddd   xmm2, xmm3
1737        pxor    xmm1, xmm2
1738        movdqa  xmm11, xmm1
1739        pslld   xmm1, 25
1740        psrld   xmm11, 7
1741        por     xmm1, xmm11
1742        pshufd  xmm0, xmm0, 0x93
1743        pshufd  xmm3, xmm3, 0x4E
1744        pshufd  xmm2, xmm2, 0x39
1745        paddd   xmm0, xmm6
1746        paddd   xmm0, xmm1
1747        pxor    xmm3, xmm0
1748        pshufb  xmm3, xmm15
1749        paddd   xmm2, xmm3
1750        pxor    xmm1, xmm2
1751        movdqa  xmm11, xmm1
1752        pslld   xmm1, 20
1753        psrld   xmm11, 12
1754        por     xmm1, xmm11
1755        paddd   xmm0, xmm7
1756        paddd   xmm0, xmm1
1757        pxor    xmm3, xmm0
1758        pshufb  xmm3, xmm14
1759        paddd   xmm2, xmm3
1760        pxor    xmm1, xmm2
1761        movdqa  xmm11, xmm1
1762        pslld   xmm1, 25
1763        psrld   xmm11, 7
1764        por     xmm1, xmm11
1765        pshufd  xmm0, xmm0, 0x39
1766        pshufd  xmm3, xmm3, 0x4E
1767        pshufd  xmm2, xmm2, 0x93
1768        dec     al
1769        jz      9f
1770        movdqa  xmm8, xmm4
1771        shufps  xmm8, xmm5, 214
1772        pshufd  xmm9, xmm4, 0x0F
1773        pshufd  xmm4, xmm8, 0x39
1774        movdqa  xmm8, xmm6
1775        shufps  xmm8, xmm7, 250
1776        pblendw xmm9, xmm8, 0xCC
1777        movdqa  xmm8, xmm7
1778        punpcklqdq xmm8, xmm5
1779        pblendw xmm8, xmm6, 0xC0
1780        pshufd  xmm8, xmm8, 0x78
1781        punpckhdq xmm5, xmm7
1782        punpckldq xmm6, xmm5
1783        pshufd  xmm7, xmm6, 0x1E
1784        movdqa  xmm5, xmm9
1785        movdqa  xmm6, xmm8
1786        jmp     9b
17879:
1788        pxor    xmm0, xmm2
1789        pxor    xmm1, xmm3
1790        mov     eax, r13d
1791        cmp     rdx, r15
1792        jne     2b
1793        movups  xmmword ptr [rbx], xmm0
1794        movups  xmmword ptr [rbx+0x10], xmm1
1795        jmp     4b
1796SET_SIZE(zfs_blake3_hash_many_sse41)
1797
/*
 * zfs_blake3_compress_in_place_sse41
 *
 * Runs the seven-iteration round loop below over one 64-byte block and
 * overwrites the 32-byte state at [rdi] with the result.
 *
 * Registers read on entry (as used by the code below):
 *   rdi - 32-byte state; loaded unaligned, written back in place
 *   rsi - 64-byte message block; loaded unaligned
 *   dl  - byte placed in dword 2 of state row 3
 *   rcx - 64-bit value placed in dwords 0-1 of state row 3
 *   r8b - byte placed in dword 3 of state row 3
 * NOTE(review): presumably these are cv, block, block_len, counter and
 * flags of upstream BLAKE3's compress_in_place - confirm against the C
 * prototype.
 *
 * Only the low byte of rdx and r8 is consumed.  Both are zero-extended
 * before being combined, exactly as zfs_blake3_compress_xof_sse41 does,
 * so stale upper register bits can never leak into row 3.
 *
 * Working registers:
 *   xmm0-xmm3   state rows 0-3
 *   xmm4-xmm7   message words, re-permuted after every iteration
 *   xmm8, xmm9, xmm11  scratch
 *   xmm14, xmm15       ROT8 / ROT16 pshufb masks
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 */
ENTRY_ALIGN(zfs_blake3_compress_in_place_sse41, 64)
        ENDBR
        /* Rows 0 and 1 come from the caller's state, row 2 from the IV. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Row 3 = { rcx[31:0], rcx[63:32], dl, r8b }. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * De-interleave the block (dword indices into [rsi]):
         *   xmm4 = { 0, 2, 4, 6 }      xmm5 = { 1, 3, 5, 7 }
         *   xmm6 = { 14, 8, 10, 12 }   xmm7 = { 15, 9, 11, 13 }
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                   /* iteration counter */
9:
        /* Step 1: add xmm4, shuffle row 3 with ROT16, rotate row 1 right 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Step 2: add xmm5, shuffle row 3 with ROT8, rotate row 1 right 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate the lanes of rows 0, 3 and 2; row 1 keeps its lane order. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Step 3: same shape as step 1, using xmm6. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Step 4: same shape as step 2, using xmm7. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation applied before step 3. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                      /* all seven iterations done */
        /* Permute the message words in xmm4-xmm7 for the next iteration. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold rows 2 and 3 into rows 0 and 1 and store the new state. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        RET
SET_SIZE(zfs_blake3_compress_in_place_sse41)
1897
/*
 * zfs_blake3_compress_xof_sse41
 *
 * Runs 7 rounds over a 4x4 state of 32-bit words kept in xmm0-xmm3 (one
 * row per register) and stores a 64-byte result.
 *
 * Register inputs, as consumed by the code below.  The matching C
 * prototype is not visible in this part of the file.
 * NOTE(review): confirm the argument meanings against the C declaration.
 *   rdi - 32 bytes read, become state rows 0 and 1 (unaligned loads)
 *   rsi - 64 bytes read, the 16 message words (unaligned loads)
 *   dl  - byte, zero-extended into lane 2 of state row 3
 *   rcx - 64-bit value, placed in lanes 0-1 of state row 3
 *   r8b - byte, zero-extended into lane 3 of state row 3
 *   r9  - 64 bytes written (unaligned stores)
 *
 * Modifies rax, rdx, the flags, xmm0-xmm9, xmm11, xmm14 and xmm15.
 * No stack locals are used.  ENTRY_ALIGN, ENDBR, RET and SET_SIZE are
 * macros defined outside this part of the file.
 */
1898ENTRY_ALIGN(zfs_blake3_compress_xof_sse41, 64)
1899        ENDBR
        /* State rows 0-1 from [rdi], row 2 from the BLAKE3_IV table. */
1900        movups  xmm0, xmmword ptr [rdi]
1901        movups  xmm1, xmmword ptr [rdi+0x10]
1902        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Row 3 = { rcx low, rcx high, dl, r8b } as four 32-bit lanes. */
1903        movzx   eax, r8b
1904        movzx   edx, dl
1905        shl     rax, 32
1906        add     rdx, rax
1907        movq    xmm3, rcx
1908        movq    xmm4, rdx
1909        punpcklqdq xmm3, xmm4
        /*
         * Load message words m0-m7 and split them by parity:
         * xmm4 = m0, m2, m4, m6 and xmm5 = m1, m3, m5, m7.
         */
1910        movups  xmm4, xmmword ptr [rsi]
1911        movups  xmm5, xmmword ptr [rsi+0x10]
1912        movaps  xmm8, xmm4
1913        shufps  xmm4, xmm5, 136
1914        shufps  xmm8, xmm5, 221
1915        movaps  xmm5, xmm8
        /*
         * Same split for m8-m15, followed by a lane rotation (0x93):
         * xmm6 = m14, m8, m10, m12 and xmm7 = m15, m9, m11, m13.
         */
1916        movups  xmm6, xmmword ptr [rsi+0x20]
1917        movups  xmm7, xmmword ptr [rsi+0x30]
1918        movaps  xmm8, xmm6
1919        shufps  xmm6, xmm7, 136
1920        pshufd  xmm6, xmm6, 0x93
1921        shufps  xmm8, xmm7, 221
1922        pshufd  xmm7, xmm8, 0x93
        /* pshufb masks: xmm14 rotates each lane by 8 bits, xmm15 by 16. */
1923        movaps  xmm14, xmmword ptr [ROT8+rip]
1924        movaps  xmm15, xmmword ptr [ROT16+rip]
        /* al = number of rounds still to run. */
1925        mov     al, 7
19269:
        /* First half-round, step 1: add xmm4; rotate by 16, then right by 12. */
1927        paddd   xmm0, xmm4
1928        paddd   xmm0, xmm1
1929        pxor    xmm3, xmm0
1930        pshufb  xmm3, xmm15
1931        paddd   xmm2, xmm3
1932        pxor    xmm1, xmm2
1933        movdqa  xmm11, xmm1
1934        pslld   xmm1, 20
1935        psrld   xmm11, 12
1936        por     xmm1, xmm11
        /* First half-round, step 2: add xmm5; rotate right by 8, then by 7. */
1937        paddd   xmm0, xmm5
1938        paddd   xmm0, xmm1
1939        pxor    xmm3, xmm0
1940        pshufb  xmm3, xmm14
1941        paddd   xmm2, xmm3
1942        pxor    xmm1, xmm2
1943        movdqa  xmm11, xmm1
1944        pslld   xmm1, 25
1945        psrld   xmm11, 7
1946        por     xmm1, xmm11
        /*
         * Rotate the lanes of rows 0, 3 and 2 by different amounts so the
         * second half-round mixes the diagonals of the 4x4 state.
         */
1947        pshufd  xmm0, xmm0, 0x93
1948        pshufd  xmm3, xmm3, 0x4E
1949        pshufd  xmm2, xmm2, 0x39
        /* Second half-round, step 1: add xmm6; rotate by 16, then right by 12. */
1950        paddd   xmm0, xmm6
1951        paddd   xmm0, xmm1
1952        pxor    xmm3, xmm0
1953        pshufb  xmm3, xmm15
1954        paddd   xmm2, xmm3
1955        pxor    xmm1, xmm2
1956        movdqa  xmm11, xmm1
1957        pslld   xmm1, 20
1958        psrld   xmm11, 12
1959        por     xmm1, xmm11
        /* Second half-round, step 2: add xmm7; rotate right by 8, then by 7. */
1960        paddd   xmm0, xmm7
1961        paddd   xmm0, xmm1
1962        pxor    xmm3, xmm0
1963        pshufb  xmm3, xmm14
1964        paddd   xmm2, xmm3
1965        pxor    xmm1, xmm2
1966        movdqa  xmm11, xmm1
1967        pslld   xmm1, 25
1968        psrld   xmm11, 7
1969        por     xmm1, xmm11
        /* Undo the lane rotation applied before the second half-round. */
1970        pshufd  xmm0, xmm0, 0x39
1971        pshufd  xmm3, xmm3, 0x4E
1972        pshufd  xmm2, xmm2, 0x93
        /* After the 7th round, skip the message rearrangement and finish. */
1973        dec     al
1974        jz      9f
        /*
         * Rearrange the message words held in xmm4-xmm7 for the next
         * round; xmm8 and xmm9 are scratch registers.
         */
1975        movdqa  xmm8, xmm4
1976        shufps  xmm8, xmm5, 214
1977        pshufd  xmm9, xmm4, 0x0F
1978        pshufd  xmm4, xmm8, 0x39
1979        movdqa  xmm8, xmm6
1980        shufps  xmm8, xmm7, 250
1981        pblendw xmm9, xmm8, 0xCC
1982        movdqa  xmm8, xmm7
1983        punpcklqdq xmm8, xmm5
1984        pblendw xmm8, xmm6, 0xC0
1985        pshufd  xmm8, xmm8, 0x78
1986        punpckhdq xmm5, xmm7
1987        punpckldq xmm6, xmm5
1988        pshufd  xmm7, xmm6, 0x1E
1989        movdqa  xmm5, xmm9
1990        movdqa  xmm6, xmm8
1991        jmp     9b
19929:
        /*
         * The message registers are no longer needed: reload the 32 input
         * bytes from [rdi] into xmm4/xmm5.
         * Output bytes  0-31 = rows 0-1 XOR rows 2-3.
         * Output bytes 32-63 = rows 2-3 XOR the words reloaded from [rdi].
         */
1993        movdqu  xmm4, xmmword ptr [rdi]
1994        movdqu  xmm5, xmmword ptr [rdi+0x10]
1995        pxor    xmm0, xmm2
1996        pxor    xmm1, xmm3
1997        pxor    xmm2, xmm4
1998        pxor    xmm3, xmm5
1999        movups  xmmword ptr [r9], xmm0
2000        movups  xmmword ptr [r9+0x10], xmm1
2001        movups  xmmword ptr [r9+0x20], xmm2
2002        movups  xmmword ptr [r9+0x30], xmm3
2003        RET
2004SET_SIZE(zfs_blake3_compress_xof_sse41)
2005
/*
 * Constant tables for the SSE4.1 BLAKE3 routines.
 *
 * The block starts on a 64-byte boundary (.p2align 6) and every table
 * below is exactly 16 bytes long, so each label stays 16-byte aligned.
 * Code in this file reads the tables as aligned memory operands (for
 * example movaps from BLAKE3_IV/ROT8/ROT16 and pand with ADD0/ADD1), so
 * keep each table at 16 bytes when editing.
 *
 * SECTION_STATIC is a macro defined outside this part of the file;
 * presumably it selects the section for static data.
 * NOTE(review): confirm against its definition.
 */
2006SECTION_STATIC
2007
2008.p2align  6
/* Four 32-bit words loaded as state row 2 by the compress routines. */
2009BLAKE3_IV:
2010        .long  0x6A09E667, 0xBB67AE85
2011        .long  0x3C6EF372, 0xA54FF53A
/* pshufb mask: rotates every 32-bit lane by 16 bits. */
2012ROT16:
2013        .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* pshufb mask: rotates every 32-bit lane right by 8 bits. */
2014ROT8:
2015        .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/*
 * Per-lane values 0..3 and 4; zfs_blake3_hash_many_sse41 ANDs both with
 * a broadcast mask derived from r9d during its setup.
 */
2016ADD0:
2017        .long  0, 1, 2, 3
2018ADD1:
2019	.long  4, 4, 4, 4
/*
 * The four BLAKE3_IV words, each repeated in all four lanes.  Their
 * users are not visible in this part of the file.
 */
2020BLAKE3_IV_0:
2021	.long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2022BLAKE3_IV_1:
2023	.long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2024BLAKE3_IV_2:
2025	.long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2026BLAKE3_IV_3:
2027	.long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane. */
2028BLAKE3_BLOCK_LEN:
2029	.long  64, 64, 64, 64
/*
 * Bit 31 set in every lane.  NOTE(review): presumably used to flip the
 * sign bit so signed SSE compares can act as unsigned ones; confirm at
 * the use site, which is outside this part of the file.
 */
2030CMP_MSB_MASK:
2031	.long  0x80000000, 0x80000000, 0x80000000, 0x80000000
2032
2033#endif	/* HAVE_SSE4_1 */
2034
/*
 * On ELF targets, emit an empty .note.GNU-stack section.  With GNU
 * toolchains this marks the object as not needing an executable stack.
 */
2035#ifdef __ELF__
2036.section .note.GNU-stack,"",%progbits
2037#endif
2038