xref: /freebsd/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1// SPDX-License-Identifier: CDDL-1.0
2/*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23/*
24 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
25 * Copyright (c) 2019-2020 Samuel Neves
26 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
27 */
28
29#if defined(HAVE_SSE4_1)
/*
 * Everything from here on is assembled only when HAVE_SSE4_1 is defined.
 * NOTE(review): presumably set by the build configuration when the
 * toolchain can assemble SSE4.1 instructions -- confirm where it is defined.
 */
30
/*
 * _ASM is defined before the include below.
 * NOTE(review): presumably marks this translation unit as assembly for the
 * shared headers -- confirm against <sys/asm_linkage.h>.
 */
31#define _ASM
/*
 * NOTE(review): presumably the source of the ENTRY_ALIGN and ENDBR macros
 * used at the function entry below; neither is defined in this file's
 * visible part.
 */
32#include <sys/asm_linkage.h>
33
/*
 * The code below is written in Intel operand order (destination first)
 * with unprefixed register names, e.g. "mov rbp, rsp".
 */
34.intel_syntax noprefix
35
/* Place the following instructions in the text (code) section. */
36.text
37
38ENTRY_ALIGN(zfs_blake3_hash_many_sse41, 64)
39        ENDBR
40        push    r15
41        push    r14
42        push    r13
43        push    r12
44        push    rbx
45        push    rbp
46        mov     rbp, rsp
47        sub     rsp, 360
48        and     rsp, 0xFFFFFFFFFFFFFFC0
49        neg     r9d
50        movd    xmm0, r9d
51        pshufd  xmm0, xmm0, 0x00
52        movdqa  xmmword ptr [rsp+0x130], xmm0
53        movdqa  xmm1, xmm0
54        pand    xmm1, xmmword ptr [ADD0+rip]
55        pand    xmm0, xmmword ptr [ADD1+rip]
56        movdqa  xmmword ptr [rsp+0x150], xmm0
57        movd    xmm0, r8d
58        pshufd  xmm0, xmm0, 0x00
59        paddd   xmm0, xmm1
60        movdqa  xmmword ptr [rsp+0x110], xmm0
61        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
62        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
63        pcmpgtd xmm1, xmm0
64        shr     r8, 32
65        movd    xmm2, r8d
66        pshufd  xmm2, xmm2, 0x00
67        psubd   xmm2, xmm1
68        movdqa  xmmword ptr [rsp+0x120], xmm2
69        mov     rbx, qword ptr [rbp+0x50]
70        mov     r15, rdx
71        shl     r15, 6
72        movzx   r13d, byte ptr [rbp+0x38]
73        movzx   r12d, byte ptr [rbp+0x48]
74        cmp     rsi, 4
75        jc      3f
762:
77        movdqu  xmm3, xmmword ptr [rcx]
78        pshufd  xmm0, xmm3, 0x00
79        pshufd  xmm1, xmm3, 0x55
80        pshufd  xmm2, xmm3, 0xAA
81        pshufd  xmm3, xmm3, 0xFF
82        movdqu  xmm7, xmmword ptr [rcx+0x10]
83        pshufd  xmm4, xmm7, 0x00
84        pshufd  xmm5, xmm7, 0x55
85        pshufd  xmm6, xmm7, 0xAA
86        pshufd  xmm7, xmm7, 0xFF
87        mov     r8, qword ptr [rdi]
88        mov     r9, qword ptr [rdi+0x8]
89        mov     r10, qword ptr [rdi+0x10]
90        mov     r11, qword ptr [rdi+0x18]
91        movzx   eax, byte ptr [rbp+0x40]
92        or      eax, r13d
93        xor     edx, edx
949:
95        mov     r14d, eax
96        or      eax, r12d
97        add     rdx, 64
98        cmp     rdx, r15
99        cmovne  eax, r14d
100        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
101        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
102        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
103        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
104        movdqa  xmm12, xmm8
105        punpckldq xmm8, xmm9
106        punpckhdq xmm12, xmm9
107        movdqa  xmm14, xmm10
108        punpckldq xmm10, xmm11
109        punpckhdq xmm14, xmm11
110        movdqa  xmm9, xmm8
111        punpcklqdq xmm8, xmm10
112        punpckhqdq xmm9, xmm10
113        movdqa  xmm13, xmm12
114        punpcklqdq xmm12, xmm14
115        punpckhqdq xmm13, xmm14
116        movdqa  xmmword ptr [rsp], xmm8
117        movdqa  xmmword ptr [rsp+0x10], xmm9
118        movdqa  xmmword ptr [rsp+0x20], xmm12
119        movdqa  xmmword ptr [rsp+0x30], xmm13
120        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
121        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
122        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
123        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
124        movdqa  xmm12, xmm8
125        punpckldq xmm8, xmm9
126        punpckhdq xmm12, xmm9
127        movdqa  xmm14, xmm10
128        punpckldq xmm10, xmm11
129        punpckhdq xmm14, xmm11
130        movdqa  xmm9, xmm8
131        punpcklqdq xmm8, xmm10
132        punpckhqdq xmm9, xmm10
133        movdqa  xmm13, xmm12
134        punpcklqdq xmm12, xmm14
135        punpckhqdq xmm13, xmm14
136        movdqa  xmmword ptr [rsp+0x40], xmm8
137        movdqa  xmmword ptr [rsp+0x50], xmm9
138        movdqa  xmmword ptr [rsp+0x60], xmm12
139        movdqa  xmmword ptr [rsp+0x70], xmm13
140        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
141        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
142        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
143        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
144        movdqa  xmm12, xmm8
145        punpckldq xmm8, xmm9
146        punpckhdq xmm12, xmm9
147        movdqa  xmm14, xmm10
148        punpckldq xmm10, xmm11
149        punpckhdq xmm14, xmm11
150        movdqa  xmm9, xmm8
151        punpcklqdq xmm8, xmm10
152        punpckhqdq xmm9, xmm10
153        movdqa  xmm13, xmm12
154        punpcklqdq xmm12, xmm14
155        punpckhqdq xmm13, xmm14
156        movdqa  xmmword ptr [rsp+0x80], xmm8
157        movdqa  xmmword ptr [rsp+0x90], xmm9
158        movdqa  xmmword ptr [rsp+0xA0], xmm12
159        movdqa  xmmword ptr [rsp+0xB0], xmm13
160        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
161        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
162        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
163        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
164        movdqa  xmm12, xmm8
165        punpckldq xmm8, xmm9
166        punpckhdq xmm12, xmm9
167        movdqa  xmm14, xmm10
168        punpckldq xmm10, xmm11
169        punpckhdq xmm14, xmm11
170        movdqa  xmm9, xmm8
171        punpcklqdq xmm8, xmm10
172        punpckhqdq xmm9, xmm10
173        movdqa  xmm13, xmm12
174        punpcklqdq xmm12, xmm14
175        punpckhqdq xmm13, xmm14
176        movdqa  xmmword ptr [rsp+0xC0], xmm8
177        movdqa  xmmword ptr [rsp+0xD0], xmm9
178        movdqa  xmmword ptr [rsp+0xE0], xmm12
179        movdqa  xmmword ptr [rsp+0xF0], xmm13
180        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
181        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
182        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
183        movdqa  xmm12, xmmword ptr [rsp+0x110]
184        movdqa  xmm13, xmmword ptr [rsp+0x120]
185        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
186        movd    xmm15, eax
187        pshufd  xmm15, xmm15, 0x00
188        prefetcht0 [r8+rdx+0x80]
189        prefetcht0 [r9+rdx+0x80]
190        prefetcht0 [r10+rdx+0x80]
191        prefetcht0 [r11+rdx+0x80]
192        paddd   xmm0, xmmword ptr [rsp]
193        paddd   xmm1, xmmword ptr [rsp+0x20]
194        paddd   xmm2, xmmword ptr [rsp+0x40]
195        paddd   xmm3, xmmword ptr [rsp+0x60]
196        paddd   xmm0, xmm4
197        paddd   xmm1, xmm5
198        paddd   xmm2, xmm6
199        paddd   xmm3, xmm7
200        pxor    xmm12, xmm0
201        pxor    xmm13, xmm1
202        pxor    xmm14, xmm2
203        pxor    xmm15, xmm3
204        movdqa  xmm8, xmmword ptr [ROT16+rip]
205        pshufb  xmm12, xmm8
206        pshufb  xmm13, xmm8
207        pshufb  xmm14, xmm8
208        pshufb  xmm15, xmm8
209        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
210        paddd   xmm8, xmm12
211        paddd   xmm9, xmm13
212        paddd   xmm10, xmm14
213        paddd   xmm11, xmm15
214        pxor    xmm4, xmm8
215        pxor    xmm5, xmm9
216        pxor    xmm6, xmm10
217        pxor    xmm7, xmm11
218        movdqa  xmmword ptr [rsp+0x100], xmm8
219        movdqa  xmm8, xmm4
220        psrld   xmm8, 12
221        pslld   xmm4, 20
222        por     xmm4, xmm8
223        movdqa  xmm8, xmm5
224        psrld   xmm8, 12
225        pslld   xmm5, 20
226        por     xmm5, xmm8
227        movdqa  xmm8, xmm6
228        psrld   xmm8, 12
229        pslld   xmm6, 20
230        por     xmm6, xmm8
231        movdqa  xmm8, xmm7
232        psrld   xmm8, 12
233        pslld   xmm7, 20
234        por     xmm7, xmm8
235        paddd   xmm0, xmmword ptr [rsp+0x10]
236        paddd   xmm1, xmmword ptr [rsp+0x30]
237        paddd   xmm2, xmmword ptr [rsp+0x50]
238        paddd   xmm3, xmmword ptr [rsp+0x70]
239        paddd   xmm0, xmm4
240        paddd   xmm1, xmm5
241        paddd   xmm2, xmm6
242        paddd   xmm3, xmm7
243        pxor    xmm12, xmm0
244        pxor    xmm13, xmm1
245        pxor    xmm14, xmm2
246        pxor    xmm15, xmm3
247        movdqa  xmm8, xmmword ptr [ROT8+rip]
248        pshufb  xmm12, xmm8
249        pshufb  xmm13, xmm8
250        pshufb  xmm14, xmm8
251        pshufb  xmm15, xmm8
252        movdqa  xmm8, xmmword ptr [rsp+0x100]
253        paddd   xmm8, xmm12
254        paddd   xmm9, xmm13
255        paddd   xmm10, xmm14
256        paddd   xmm11, xmm15
257        pxor    xmm4, xmm8
258        pxor    xmm5, xmm9
259        pxor    xmm6, xmm10
260        pxor    xmm7, xmm11
261        movdqa  xmmword ptr [rsp+0x100], xmm8
262        movdqa  xmm8, xmm4
263        psrld   xmm8, 7
264        pslld   xmm4, 25
265        por     xmm4, xmm8
266        movdqa  xmm8, xmm5
267        psrld   xmm8, 7
268        pslld   xmm5, 25
269        por     xmm5, xmm8
270        movdqa  xmm8, xmm6
271        psrld   xmm8, 7
272        pslld   xmm6, 25
273        por     xmm6, xmm8
274        movdqa  xmm8, xmm7
275        psrld   xmm8, 7
276        pslld   xmm7, 25
277        por     xmm7, xmm8
278        paddd   xmm0, xmmword ptr [rsp+0x80]
279        paddd   xmm1, xmmword ptr [rsp+0xA0]
280        paddd   xmm2, xmmword ptr [rsp+0xC0]
281        paddd   xmm3, xmmword ptr [rsp+0xE0]
282        paddd   xmm0, xmm5
283        paddd   xmm1, xmm6
284        paddd   xmm2, xmm7
285        paddd   xmm3, xmm4
286        pxor    xmm15, xmm0
287        pxor    xmm12, xmm1
288        pxor    xmm13, xmm2
289        pxor    xmm14, xmm3
290        movdqa  xmm8, xmmword ptr [ROT16+rip]
291        pshufb  xmm15, xmm8
292        pshufb  xmm12, xmm8
293        pshufb  xmm13, xmm8
294        pshufb  xmm14, xmm8
295        paddd   xmm10, xmm15
296        paddd   xmm11, xmm12
297        movdqa  xmm8, xmmword ptr [rsp+0x100]
298        paddd   xmm8, xmm13
299        paddd   xmm9, xmm14
300        pxor    xmm5, xmm10
301        pxor    xmm6, xmm11
302        pxor    xmm7, xmm8
303        pxor    xmm4, xmm9
304        movdqa  xmmword ptr [rsp+0x100], xmm8
305        movdqa  xmm8, xmm5
306        psrld   xmm8, 12
307        pslld   xmm5, 20
308        por     xmm5, xmm8
309        movdqa  xmm8, xmm6
310        psrld   xmm8, 12
311        pslld   xmm6, 20
312        por     xmm6, xmm8
313        movdqa  xmm8, xmm7
314        psrld   xmm8, 12
315        pslld   xmm7, 20
316        por     xmm7, xmm8
317        movdqa  xmm8, xmm4
318        psrld   xmm8, 12
319        pslld   xmm4, 20
320        por     xmm4, xmm8
321        paddd   xmm0, xmmword ptr [rsp+0x90]
322        paddd   xmm1, xmmword ptr [rsp+0xB0]
323        paddd   xmm2, xmmword ptr [rsp+0xD0]
324        paddd   xmm3, xmmword ptr [rsp+0xF0]
325        paddd   xmm0, xmm5
326        paddd   xmm1, xmm6
327        paddd   xmm2, xmm7
328        paddd   xmm3, xmm4
329        pxor    xmm15, xmm0
330        pxor    xmm12, xmm1
331        pxor    xmm13, xmm2
332        pxor    xmm14, xmm3
333        movdqa  xmm8, xmmword ptr [ROT8+rip]
334        pshufb  xmm15, xmm8
335        pshufb  xmm12, xmm8
336        pshufb  xmm13, xmm8
337        pshufb  xmm14, xmm8
338        paddd   xmm10, xmm15
339        paddd   xmm11, xmm12
340        movdqa  xmm8, xmmword ptr [rsp+0x100]
341        paddd   xmm8, xmm13
342        paddd   xmm9, xmm14
343        pxor    xmm5, xmm10
344        pxor    xmm6, xmm11
345        pxor    xmm7, xmm8
346        pxor    xmm4, xmm9
347        movdqa  xmmword ptr [rsp+0x100], xmm8
348        movdqa  xmm8, xmm5
349        psrld   xmm8, 7
350        pslld   xmm5, 25
351        por     xmm5, xmm8
352        movdqa  xmm8, xmm6
353        psrld   xmm8, 7
354        pslld   xmm6, 25
355        por     xmm6, xmm8
356        movdqa  xmm8, xmm7
357        psrld   xmm8, 7
358        pslld   xmm7, 25
359        por     xmm7, xmm8
360        movdqa  xmm8, xmm4
361        psrld   xmm8, 7
362        pslld   xmm4, 25
363        por     xmm4, xmm8
364        paddd   xmm0, xmmword ptr [rsp+0x20]
365        paddd   xmm1, xmmword ptr [rsp+0x30]
366        paddd   xmm2, xmmword ptr [rsp+0x70]
367        paddd   xmm3, xmmword ptr [rsp+0x40]
368        paddd   xmm0, xmm4
369        paddd   xmm1, xmm5
370        paddd   xmm2, xmm6
371        paddd   xmm3, xmm7
372        pxor    xmm12, xmm0
373        pxor    xmm13, xmm1
374        pxor    xmm14, xmm2
375        pxor    xmm15, xmm3
376        movdqa  xmm8, xmmword ptr [ROT16+rip]
377        pshufb  xmm12, xmm8
378        pshufb  xmm13, xmm8
379        pshufb  xmm14, xmm8
380        pshufb  xmm15, xmm8
381        movdqa  xmm8, xmmword ptr [rsp+0x100]
382        paddd   xmm8, xmm12
383        paddd   xmm9, xmm13
384        paddd   xmm10, xmm14
385        paddd   xmm11, xmm15
386        pxor    xmm4, xmm8
387        pxor    xmm5, xmm9
388        pxor    xmm6, xmm10
389        pxor    xmm7, xmm11
390        movdqa  xmmword ptr [rsp+0x100], xmm8
391        movdqa  xmm8, xmm4
392        psrld   xmm8, 12
393        pslld   xmm4, 20
394        por     xmm4, xmm8
395        movdqa  xmm8, xmm5
396        psrld   xmm8, 12
397        pslld   xmm5, 20
398        por     xmm5, xmm8
399        movdqa  xmm8, xmm6
400        psrld   xmm8, 12
401        pslld   xmm6, 20
402        por     xmm6, xmm8
403        movdqa  xmm8, xmm7
404        psrld   xmm8, 12
405        pslld   xmm7, 20
406        por     xmm7, xmm8
407        paddd   xmm0, xmmword ptr [rsp+0x60]
408        paddd   xmm1, xmmword ptr [rsp+0xA0]
409        paddd   xmm2, xmmword ptr [rsp]
410        paddd   xmm3, xmmword ptr [rsp+0xD0]
411        paddd   xmm0, xmm4
412        paddd   xmm1, xmm5
413        paddd   xmm2, xmm6
414        paddd   xmm3, xmm7
415        pxor    xmm12, xmm0
416        pxor    xmm13, xmm1
417        pxor    xmm14, xmm2
418        pxor    xmm15, xmm3
419        movdqa  xmm8, xmmword ptr [ROT8+rip]
420        pshufb  xmm12, xmm8
421        pshufb  xmm13, xmm8
422        pshufb  xmm14, xmm8
423        pshufb  xmm15, xmm8
424        movdqa  xmm8, xmmword ptr [rsp+0x100]
425        paddd   xmm8, xmm12
426        paddd   xmm9, xmm13
427        paddd   xmm10, xmm14
428        paddd   xmm11, xmm15
429        pxor    xmm4, xmm8
430        pxor    xmm5, xmm9
431        pxor    xmm6, xmm10
432        pxor    xmm7, xmm11
433        movdqa  xmmword ptr [rsp+0x100], xmm8
434        movdqa  xmm8, xmm4
435        psrld   xmm8, 7
436        pslld   xmm4, 25
437        por     xmm4, xmm8
438        movdqa  xmm8, xmm5
439        psrld   xmm8, 7
440        pslld   xmm5, 25
441        por     xmm5, xmm8
442        movdqa  xmm8, xmm6
443        psrld   xmm8, 7
444        pslld   xmm6, 25
445        por     xmm6, xmm8
446        movdqa  xmm8, xmm7
447        psrld   xmm8, 7
448        pslld   xmm7, 25
449        por     xmm7, xmm8
450        paddd   xmm0, xmmword ptr [rsp+0x10]
451        paddd   xmm1, xmmword ptr [rsp+0xC0]
452        paddd   xmm2, xmmword ptr [rsp+0x90]
453        paddd   xmm3, xmmword ptr [rsp+0xF0]
454        paddd   xmm0, xmm5
455        paddd   xmm1, xmm6
456        paddd   xmm2, xmm7
457        paddd   xmm3, xmm4
458        pxor    xmm15, xmm0
459        pxor    xmm12, xmm1
460        pxor    xmm13, xmm2
461        pxor    xmm14, xmm3
462        movdqa  xmm8, xmmword ptr [ROT16+rip]
463        pshufb  xmm15, xmm8
464        pshufb  xmm12, xmm8
465        pshufb  xmm13, xmm8
466        pshufb  xmm14, xmm8
467        paddd   xmm10, xmm15
468        paddd   xmm11, xmm12
469        movdqa  xmm8, xmmword ptr [rsp+0x100]
470        paddd   xmm8, xmm13
471        paddd   xmm9, xmm14
472        pxor    xmm5, xmm10
473        pxor    xmm6, xmm11
474        pxor    xmm7, xmm8
475        pxor    xmm4, xmm9
476        movdqa  xmmword ptr [rsp+0x100], xmm8
477        movdqa  xmm8, xmm5
478        psrld   xmm8, 12
479        pslld   xmm5, 20
480        por     xmm5, xmm8
481        movdqa  xmm8, xmm6
482        psrld   xmm8, 12
483        pslld   xmm6, 20
484        por     xmm6, xmm8
485        movdqa  xmm8, xmm7
486        psrld   xmm8, 12
487        pslld   xmm7, 20
488        por     xmm7, xmm8
489        movdqa  xmm8, xmm4
490        psrld   xmm8, 12
491        pslld   xmm4, 20
492        por     xmm4, xmm8
493        paddd   xmm0, xmmword ptr [rsp+0xB0]
494        paddd   xmm1, xmmword ptr [rsp+0x50]
495        paddd   xmm2, xmmword ptr [rsp+0xE0]
496        paddd   xmm3, xmmword ptr [rsp+0x80]
497        paddd   xmm0, xmm5
498        paddd   xmm1, xmm6
499        paddd   xmm2, xmm7
500        paddd   xmm3, xmm4
501        pxor    xmm15, xmm0
502        pxor    xmm12, xmm1
503        pxor    xmm13, xmm2
504        pxor    xmm14, xmm3
505        movdqa  xmm8, xmmword ptr [ROT8+rip]
506        pshufb  xmm15, xmm8
507        pshufb  xmm12, xmm8
508        pshufb  xmm13, xmm8
509        pshufb  xmm14, xmm8
510        paddd   xmm10, xmm15
511        paddd   xmm11, xmm12
512        movdqa  xmm8, xmmword ptr [rsp+0x100]
513        paddd   xmm8, xmm13
514        paddd   xmm9, xmm14
515        pxor    xmm5, xmm10
516        pxor    xmm6, xmm11
517        pxor    xmm7, xmm8
518        pxor    xmm4, xmm9
519        movdqa  xmmword ptr [rsp+0x100], xmm8
520        movdqa  xmm8, xmm5
521        psrld   xmm8, 7
522        pslld   xmm5, 25
523        por     xmm5, xmm8
524        movdqa  xmm8, xmm6
525        psrld   xmm8, 7
526        pslld   xmm6, 25
527        por     xmm6, xmm8
528        movdqa  xmm8, xmm7
529        psrld   xmm8, 7
530        pslld   xmm7, 25
531        por     xmm7, xmm8
532        movdqa  xmm8, xmm4
533        psrld   xmm8, 7
534        pslld   xmm4, 25
535        por     xmm4, xmm8
536        paddd   xmm0, xmmword ptr [rsp+0x30]
537        paddd   xmm1, xmmword ptr [rsp+0xA0]
538        paddd   xmm2, xmmword ptr [rsp+0xD0]
539        paddd   xmm3, xmmword ptr [rsp+0x70]
540        paddd   xmm0, xmm4
541        paddd   xmm1, xmm5
542        paddd   xmm2, xmm6
543        paddd   xmm3, xmm7
544        pxor    xmm12, xmm0
545        pxor    xmm13, xmm1
546        pxor    xmm14, xmm2
547        pxor    xmm15, xmm3
548        movdqa  xmm8, xmmword ptr [ROT16+rip]
549        pshufb  xmm12, xmm8
550        pshufb  xmm13, xmm8
551        pshufb  xmm14, xmm8
552        pshufb  xmm15, xmm8
553        movdqa  xmm8, xmmword ptr [rsp+0x100]
554        paddd   xmm8, xmm12
555        paddd   xmm9, xmm13
556        paddd   xmm10, xmm14
557        paddd   xmm11, xmm15
558        pxor    xmm4, xmm8
559        pxor    xmm5, xmm9
560        pxor    xmm6, xmm10
561        pxor    xmm7, xmm11
562        movdqa  xmmword ptr [rsp+0x100], xmm8
563        movdqa  xmm8, xmm4
564        psrld   xmm8, 12
565        pslld   xmm4, 20
566        por     xmm4, xmm8
567        movdqa  xmm8, xmm5
568        psrld   xmm8, 12
569        pslld   xmm5, 20
570        por     xmm5, xmm8
571        movdqa  xmm8, xmm6
572        psrld   xmm8, 12
573        pslld   xmm6, 20
574        por     xmm6, xmm8
575        movdqa  xmm8, xmm7
576        psrld   xmm8, 12
577        pslld   xmm7, 20
578        por     xmm7, xmm8
579        paddd   xmm0, xmmword ptr [rsp+0x40]
580        paddd   xmm1, xmmword ptr [rsp+0xC0]
581        paddd   xmm2, xmmword ptr [rsp+0x20]
582        paddd   xmm3, xmmword ptr [rsp+0xE0]
583        paddd   xmm0, xmm4
584        paddd   xmm1, xmm5
585        paddd   xmm2, xmm6
586        paddd   xmm3, xmm7
587        pxor    xmm12, xmm0
588        pxor    xmm13, xmm1
589        pxor    xmm14, xmm2
590        pxor    xmm15, xmm3
591        movdqa  xmm8, xmmword ptr [ROT8+rip]
592        pshufb  xmm12, xmm8
593        pshufb  xmm13, xmm8
594        pshufb  xmm14, xmm8
595        pshufb  xmm15, xmm8
596        movdqa  xmm8, xmmword ptr [rsp+0x100]
597        paddd   xmm8, xmm12
598        paddd   xmm9, xmm13
599        paddd   xmm10, xmm14
600        paddd   xmm11, xmm15
601        pxor    xmm4, xmm8
602        pxor    xmm5, xmm9
603        pxor    xmm6, xmm10
604        pxor    xmm7, xmm11
605        movdqa  xmmword ptr [rsp+0x100], xmm8
606        movdqa  xmm8, xmm4
607        psrld   xmm8, 7
608        pslld   xmm4, 25
609        por     xmm4, xmm8
610        movdqa  xmm8, xmm5
611        psrld   xmm8, 7
612        pslld   xmm5, 25
613        por     xmm5, xmm8
614        movdqa  xmm8, xmm6
615        psrld   xmm8, 7
616        pslld   xmm6, 25
617        por     xmm6, xmm8
618        movdqa  xmm8, xmm7
619        psrld   xmm8, 7
620        pslld   xmm7, 25
621        por     xmm7, xmm8
622        paddd   xmm0, xmmword ptr [rsp+0x60]
623        paddd   xmm1, xmmword ptr [rsp+0x90]
624        paddd   xmm2, xmmword ptr [rsp+0xB0]
625        paddd   xmm3, xmmword ptr [rsp+0x80]
626        paddd   xmm0, xmm5
627        paddd   xmm1, xmm6
628        paddd   xmm2, xmm7
629        paddd   xmm3, xmm4
630        pxor    xmm15, xmm0
631        pxor    xmm12, xmm1
632        pxor    xmm13, xmm2
633        pxor    xmm14, xmm3
634        movdqa  xmm8, xmmword ptr [ROT16+rip]
635        pshufb  xmm15, xmm8
636        pshufb  xmm12, xmm8
637        pshufb  xmm13, xmm8
638        pshufb  xmm14, xmm8
639        paddd   xmm10, xmm15
640        paddd   xmm11, xmm12
641        movdqa  xmm8, xmmword ptr [rsp+0x100]
642        paddd   xmm8, xmm13
643        paddd   xmm9, xmm14
644        pxor    xmm5, xmm10
645        pxor    xmm6, xmm11
646        pxor    xmm7, xmm8
647        pxor    xmm4, xmm9
648        movdqa  xmmword ptr [rsp+0x100], xmm8
649        movdqa  xmm8, xmm5
650        psrld   xmm8, 12
651        pslld   xmm5, 20
652        por     xmm5, xmm8
653        movdqa  xmm8, xmm6
654        psrld   xmm8, 12
655        pslld   xmm6, 20
656        por     xmm6, xmm8
657        movdqa  xmm8, xmm7
658        psrld   xmm8, 12
659        pslld   xmm7, 20
660        por     xmm7, xmm8
661        movdqa  xmm8, xmm4
662        psrld   xmm8, 12
663        pslld   xmm4, 20
664        por     xmm4, xmm8
665        paddd   xmm0, xmmword ptr [rsp+0x50]
666        paddd   xmm1, xmmword ptr [rsp]
667        paddd   xmm2, xmmword ptr [rsp+0xF0]
668        paddd   xmm3, xmmword ptr [rsp+0x10]
669        paddd   xmm0, xmm5
670        paddd   xmm1, xmm6
671        paddd   xmm2, xmm7
672        paddd   xmm3, xmm4
673        pxor    xmm15, xmm0
674        pxor    xmm12, xmm1
675        pxor    xmm13, xmm2
676        pxor    xmm14, xmm3
677        movdqa  xmm8, xmmword ptr [ROT8+rip]
678        pshufb  xmm15, xmm8
679        pshufb  xmm12, xmm8
680        pshufb  xmm13, xmm8
681        pshufb  xmm14, xmm8
682        paddd   xmm10, xmm15
683        paddd   xmm11, xmm12
684        movdqa  xmm8, xmmword ptr [rsp+0x100]
685        paddd   xmm8, xmm13
686        paddd   xmm9, xmm14
687        pxor    xmm5, xmm10
688        pxor    xmm6, xmm11
689        pxor    xmm7, xmm8
690        pxor    xmm4, xmm9
691        movdqa  xmmword ptr [rsp+0x100], xmm8
692        movdqa  xmm8, xmm5
693        psrld   xmm8, 7
694        pslld   xmm5, 25
695        por     xmm5, xmm8
696        movdqa  xmm8, xmm6
697        psrld   xmm8, 7
698        pslld   xmm6, 25
699        por     xmm6, xmm8
700        movdqa  xmm8, xmm7
701        psrld   xmm8, 7
702        pslld   xmm7, 25
703        por     xmm7, xmm8
704        movdqa  xmm8, xmm4
705        psrld   xmm8, 7
706        pslld   xmm4, 25
707        por     xmm4, xmm8
708        paddd   xmm0, xmmword ptr [rsp+0xA0]
709        paddd   xmm1, xmmword ptr [rsp+0xC0]
710        paddd   xmm2, xmmword ptr [rsp+0xE0]
711        paddd   xmm3, xmmword ptr [rsp+0xD0]
712        paddd   xmm0, xmm4
713        paddd   xmm1, xmm5
714        paddd   xmm2, xmm6
715        paddd   xmm3, xmm7
716        pxor    xmm12, xmm0
717        pxor    xmm13, xmm1
718        pxor    xmm14, xmm2
719        pxor    xmm15, xmm3
720        movdqa  xmm8, xmmword ptr [ROT16+rip]
721        pshufb  xmm12, xmm8
722        pshufb  xmm13, xmm8
723        pshufb  xmm14, xmm8
724        pshufb  xmm15, xmm8
725        movdqa  xmm8, xmmword ptr [rsp+0x100]
726        paddd   xmm8, xmm12
727        paddd   xmm9, xmm13
728        paddd   xmm10, xmm14
729        paddd   xmm11, xmm15
730        pxor    xmm4, xmm8
731        pxor    xmm5, xmm9
732        pxor    xmm6, xmm10
733        pxor    xmm7, xmm11
734        movdqa  xmmword ptr [rsp+0x100], xmm8
735        movdqa  xmm8, xmm4
736        psrld   xmm8, 12
737        pslld   xmm4, 20
738        por     xmm4, xmm8
739        movdqa  xmm8, xmm5
740        psrld   xmm8, 12
741        pslld   xmm5, 20
742        por     xmm5, xmm8
743        movdqa  xmm8, xmm6
744        psrld   xmm8, 12
745        pslld   xmm6, 20
746        por     xmm6, xmm8
747        movdqa  xmm8, xmm7
748        psrld   xmm8, 12
749        pslld   xmm7, 20
750        por     xmm7, xmm8
751        paddd   xmm0, xmmword ptr [rsp+0x70]
752        paddd   xmm1, xmmword ptr [rsp+0x90]
753        paddd   xmm2, xmmword ptr [rsp+0x30]
754        paddd   xmm3, xmmword ptr [rsp+0xF0]
755        paddd   xmm0, xmm4
756        paddd   xmm1, xmm5
757        paddd   xmm2, xmm6
758        paddd   xmm3, xmm7
759        pxor    xmm12, xmm0
760        pxor    xmm13, xmm1
761        pxor    xmm14, xmm2
762        pxor    xmm15, xmm3
763        movdqa  xmm8, xmmword ptr [ROT8+rip]
764        pshufb  xmm12, xmm8
765        pshufb  xmm13, xmm8
766        pshufb  xmm14, xmm8
767        pshufb  xmm15, xmm8
768        movdqa  xmm8, xmmword ptr [rsp+0x100]
769        paddd   xmm8, xmm12
770        paddd   xmm9, xmm13
771        paddd   xmm10, xmm14
772        paddd   xmm11, xmm15
773        pxor    xmm4, xmm8
774        pxor    xmm5, xmm9
775        pxor    xmm6, xmm10
776        pxor    xmm7, xmm11
777        movdqa  xmmword ptr [rsp+0x100], xmm8
778        movdqa  xmm8, xmm4
779        psrld   xmm8, 7
780        pslld   xmm4, 25
781        por     xmm4, xmm8
782        movdqa  xmm8, xmm5
783        psrld   xmm8, 7
784        pslld   xmm5, 25
785        por     xmm5, xmm8
786        movdqa  xmm8, xmm6
787        psrld   xmm8, 7
788        pslld   xmm6, 25
789        por     xmm6, xmm8
790        movdqa  xmm8, xmm7
791        psrld   xmm8, 7
792        pslld   xmm7, 25
793        por     xmm7, xmm8
794        paddd   xmm0, xmmword ptr [rsp+0x40]
795        paddd   xmm1, xmmword ptr [rsp+0xB0]
796        paddd   xmm2, xmmword ptr [rsp+0x50]
797        paddd   xmm3, xmmword ptr [rsp+0x10]
798        paddd   xmm0, xmm5
799        paddd   xmm1, xmm6
800        paddd   xmm2, xmm7
801        paddd   xmm3, xmm4
802        pxor    xmm15, xmm0
803        pxor    xmm12, xmm1
804        pxor    xmm13, xmm2
805        pxor    xmm14, xmm3
806        movdqa  xmm8, xmmword ptr [ROT16+rip]
807        pshufb  xmm15, xmm8
808        pshufb  xmm12, xmm8
809        pshufb  xmm13, xmm8
810        pshufb  xmm14, xmm8
811        paddd   xmm10, xmm15
812        paddd   xmm11, xmm12
813        movdqa  xmm8, xmmword ptr [rsp+0x100]
814        paddd   xmm8, xmm13
815        paddd   xmm9, xmm14
816        pxor    xmm5, xmm10
817        pxor    xmm6, xmm11
818        pxor    xmm7, xmm8
819        pxor    xmm4, xmm9
820        movdqa  xmmword ptr [rsp+0x100], xmm8
821        movdqa  xmm8, xmm5
822        psrld   xmm8, 12
823        pslld   xmm5, 20
824        por     xmm5, xmm8
825        movdqa  xmm8, xmm6
826        psrld   xmm8, 12
827        pslld   xmm6, 20
828        por     xmm6, xmm8
829        movdqa  xmm8, xmm7
830        psrld   xmm8, 12
831        pslld   xmm7, 20
832        por     xmm7, xmm8
833        movdqa  xmm8, xmm4
834        psrld   xmm8, 12
835        pslld   xmm4, 20
836        por     xmm4, xmm8
837        paddd   xmm0, xmmword ptr [rsp]
838        paddd   xmm1, xmmword ptr [rsp+0x20]
839        paddd   xmm2, xmmword ptr [rsp+0x80]
840        paddd   xmm3, xmmword ptr [rsp+0x60]
841        paddd   xmm0, xmm5
842        paddd   xmm1, xmm6
843        paddd   xmm2, xmm7
844        paddd   xmm3, xmm4
845        pxor    xmm15, xmm0
846        pxor    xmm12, xmm1
847        pxor    xmm13, xmm2
848        pxor    xmm14, xmm3
849        movdqa  xmm8, xmmword ptr [ROT8+rip]
850        pshufb  xmm15, xmm8
851        pshufb  xmm12, xmm8
852        pshufb  xmm13, xmm8
853        pshufb  xmm14, xmm8
854        paddd   xmm10, xmm15
855        paddd   xmm11, xmm12
856        movdqa  xmm8, xmmword ptr [rsp+0x100]
857        paddd   xmm8, xmm13
858        paddd   xmm9, xmm14
859        pxor    xmm5, xmm10
860        pxor    xmm6, xmm11
861        pxor    xmm7, xmm8
862        pxor    xmm4, xmm9
863        movdqa  xmmword ptr [rsp+0x100], xmm8
864        movdqa  xmm8, xmm5
865        psrld   xmm8, 7
866        pslld   xmm5, 25
867        por     xmm5, xmm8
868        movdqa  xmm8, xmm6
869        psrld   xmm8, 7
870        pslld   xmm6, 25
871        por     xmm6, xmm8
872        movdqa  xmm8, xmm7
873        psrld   xmm8, 7
874        pslld   xmm7, 25
875        por     xmm7, xmm8
876        movdqa  xmm8, xmm4
877        psrld   xmm8, 7
878        pslld   xmm4, 25
879        por     xmm4, xmm8
880        paddd   xmm0, xmmword ptr [rsp+0xC0]
881        paddd   xmm1, xmmword ptr [rsp+0x90]
882        paddd   xmm2, xmmword ptr [rsp+0xF0]
883        paddd   xmm3, xmmword ptr [rsp+0xE0]
884        paddd   xmm0, xmm4
885        paddd   xmm1, xmm5
886        paddd   xmm2, xmm6
887        paddd   xmm3, xmm7
888        pxor    xmm12, xmm0
889        pxor    xmm13, xmm1
890        pxor    xmm14, xmm2
891        pxor    xmm15, xmm3
892        movdqa  xmm8, xmmword ptr [ROT16+rip]
893        pshufb  xmm12, xmm8
894        pshufb  xmm13, xmm8
895        pshufb  xmm14, xmm8
896        pshufb  xmm15, xmm8
897        movdqa  xmm8, xmmword ptr [rsp+0x100]
898        paddd   xmm8, xmm12
899        paddd   xmm9, xmm13
900        paddd   xmm10, xmm14
901        paddd   xmm11, xmm15
902        pxor    xmm4, xmm8
903        pxor    xmm5, xmm9
904        pxor    xmm6, xmm10
905        pxor    xmm7, xmm11
906        movdqa  xmmword ptr [rsp+0x100], xmm8
907        movdqa  xmm8, xmm4
908        psrld   xmm8, 12
909        pslld   xmm4, 20
910        por     xmm4, xmm8
911        movdqa  xmm8, xmm5
912        psrld   xmm8, 12
913        pslld   xmm5, 20
914        por     xmm5, xmm8
915        movdqa  xmm8, xmm6
916        psrld   xmm8, 12
917        pslld   xmm6, 20
918        por     xmm6, xmm8
919        movdqa  xmm8, xmm7
920        psrld   xmm8, 12
921        pslld   xmm7, 20
922        por     xmm7, xmm8
923        paddd   xmm0, xmmword ptr [rsp+0xD0]
924        paddd   xmm1, xmmword ptr [rsp+0xB0]
925        paddd   xmm2, xmmword ptr [rsp+0xA0]
926        paddd   xmm3, xmmword ptr [rsp+0x80]
927        paddd   xmm0, xmm4
928        paddd   xmm1, xmm5
929        paddd   xmm2, xmm6
930        paddd   xmm3, xmm7
931        pxor    xmm12, xmm0
932        pxor    xmm13, xmm1
933        pxor    xmm14, xmm2
934        pxor    xmm15, xmm3
935        movdqa  xmm8, xmmword ptr [ROT8+rip]
936        pshufb  xmm12, xmm8
937        pshufb  xmm13, xmm8
938        pshufb  xmm14, xmm8
939        pshufb  xmm15, xmm8
940        movdqa  xmm8, xmmword ptr [rsp+0x100]
941        paddd   xmm8, xmm12
942        paddd   xmm9, xmm13
943        paddd   xmm10, xmm14
944        paddd   xmm11, xmm15
945        pxor    xmm4, xmm8
946        pxor    xmm5, xmm9
947        pxor    xmm6, xmm10
948        pxor    xmm7, xmm11
949        movdqa  xmmword ptr [rsp+0x100], xmm8
950        movdqa  xmm8, xmm4
951        psrld   xmm8, 7
952        pslld   xmm4, 25
953        por     xmm4, xmm8
954        movdqa  xmm8, xmm5
955        psrld   xmm8, 7
956        pslld   xmm5, 25
957        por     xmm5, xmm8
958        movdqa  xmm8, xmm6
959        psrld   xmm8, 7
960        pslld   xmm6, 25
961        por     xmm6, xmm8
962        movdqa  xmm8, xmm7
963        psrld   xmm8, 7
964        pslld   xmm7, 25
965        por     xmm7, xmm8
966        paddd   xmm0, xmmword ptr [rsp+0x70]
967        paddd   xmm1, xmmword ptr [rsp+0x50]
968        paddd   xmm2, xmmword ptr [rsp]
969        paddd   xmm3, xmmword ptr [rsp+0x60]
970        paddd   xmm0, xmm5
971        paddd   xmm1, xmm6
972        paddd   xmm2, xmm7
973        paddd   xmm3, xmm4
974        pxor    xmm15, xmm0
975        pxor    xmm12, xmm1
976        pxor    xmm13, xmm2
977        pxor    xmm14, xmm3
978        movdqa  xmm8, xmmword ptr [ROT16+rip]
979        pshufb  xmm15, xmm8
980        pshufb  xmm12, xmm8
981        pshufb  xmm13, xmm8
982        pshufb  xmm14, xmm8
983        paddd   xmm10, xmm15
984        paddd   xmm11, xmm12
985        movdqa  xmm8, xmmword ptr [rsp+0x100]
986        paddd   xmm8, xmm13
987        paddd   xmm9, xmm14
988        pxor    xmm5, xmm10
989        pxor    xmm6, xmm11
990        pxor    xmm7, xmm8
991        pxor    xmm4, xmm9
992        movdqa  xmmword ptr [rsp+0x100], xmm8
993        movdqa  xmm8, xmm5
994        psrld   xmm8, 12
995        pslld   xmm5, 20
996        por     xmm5, xmm8
997        movdqa  xmm8, xmm6
998        psrld   xmm8, 12
999        pslld   xmm6, 20
1000        por     xmm6, xmm8
1001        movdqa  xmm8, xmm7
1002        psrld   xmm8, 12
1003        pslld   xmm7, 20
1004        por     xmm7, xmm8
1005        movdqa  xmm8, xmm4
1006        psrld   xmm8, 12
1007        pslld   xmm4, 20
1008        por     xmm4, xmm8
1009        paddd   xmm0, xmmword ptr [rsp+0x20]
1010        paddd   xmm1, xmmword ptr [rsp+0x30]
1011        paddd   xmm2, xmmword ptr [rsp+0x10]
1012        paddd   xmm3, xmmword ptr [rsp+0x40]
1013        paddd   xmm0, xmm5
1014        paddd   xmm1, xmm6
1015        paddd   xmm2, xmm7
1016        paddd   xmm3, xmm4
1017        pxor    xmm15, xmm0
1018        pxor    xmm12, xmm1
1019        pxor    xmm13, xmm2
1020        pxor    xmm14, xmm3
1021        movdqa  xmm8, xmmword ptr [ROT8+rip]
1022        pshufb  xmm15, xmm8
1023        pshufb  xmm12, xmm8
1024        pshufb  xmm13, xmm8
1025        pshufb  xmm14, xmm8
1026        paddd   xmm10, xmm15
1027        paddd   xmm11, xmm12
1028        movdqa  xmm8, xmmword ptr [rsp+0x100]
1029        paddd   xmm8, xmm13
1030        paddd   xmm9, xmm14
1031        pxor    xmm5, xmm10
1032        pxor    xmm6, xmm11
1033        pxor    xmm7, xmm8
1034        pxor    xmm4, xmm9
1035        movdqa  xmmword ptr [rsp+0x100], xmm8
1036        movdqa  xmm8, xmm5
1037        psrld   xmm8, 7
1038        pslld   xmm5, 25
1039        por     xmm5, xmm8
1040        movdqa  xmm8, xmm6
1041        psrld   xmm8, 7
1042        pslld   xmm6, 25
1043        por     xmm6, xmm8
1044        movdqa  xmm8, xmm7
1045        psrld   xmm8, 7
1046        pslld   xmm7, 25
1047        por     xmm7, xmm8
1048        movdqa  xmm8, xmm4
1049        psrld   xmm8, 7
1050        pslld   xmm4, 25
1051        por     xmm4, xmm8
1052        paddd   xmm0, xmmword ptr [rsp+0x90]
1053        paddd   xmm1, xmmword ptr [rsp+0xB0]
1054        paddd   xmm2, xmmword ptr [rsp+0x80]
1055        paddd   xmm3, xmmword ptr [rsp+0xF0]
1056        paddd   xmm0, xmm4
1057        paddd   xmm1, xmm5
1058        paddd   xmm2, xmm6
1059        paddd   xmm3, xmm7
1060        pxor    xmm12, xmm0
1061        pxor    xmm13, xmm1
1062        pxor    xmm14, xmm2
1063        pxor    xmm15, xmm3
1064        movdqa  xmm8, xmmword ptr [ROT16+rip]
1065        pshufb  xmm12, xmm8
1066        pshufb  xmm13, xmm8
1067        pshufb  xmm14, xmm8
1068        pshufb  xmm15, xmm8
1069        movdqa  xmm8, xmmword ptr [rsp+0x100]
1070        paddd   xmm8, xmm12
1071        paddd   xmm9, xmm13
1072        paddd   xmm10, xmm14
1073        paddd   xmm11, xmm15
1074        pxor    xmm4, xmm8
1075        pxor    xmm5, xmm9
1076        pxor    xmm6, xmm10
1077        pxor    xmm7, xmm11
1078        movdqa  xmmword ptr [rsp+0x100], xmm8
1079        movdqa  xmm8, xmm4
1080        psrld   xmm8, 12
1081        pslld   xmm4, 20
1082        por     xmm4, xmm8
1083        movdqa  xmm8, xmm5
1084        psrld   xmm8, 12
1085        pslld   xmm5, 20
1086        por     xmm5, xmm8
1087        movdqa  xmm8, xmm6
1088        psrld   xmm8, 12
1089        pslld   xmm6, 20
1090        por     xmm6, xmm8
1091        movdqa  xmm8, xmm7
1092        psrld   xmm8, 12
1093        pslld   xmm7, 20
1094        por     xmm7, xmm8
1095        paddd   xmm0, xmmword ptr [rsp+0xE0]
1096        paddd   xmm1, xmmword ptr [rsp+0x50]
1097        paddd   xmm2, xmmword ptr [rsp+0xC0]
1098        paddd   xmm3, xmmword ptr [rsp+0x10]
1099        paddd   xmm0, xmm4
1100        paddd   xmm1, xmm5
1101        paddd   xmm2, xmm6
1102        paddd   xmm3, xmm7
1103        pxor    xmm12, xmm0
1104        pxor    xmm13, xmm1
1105        pxor    xmm14, xmm2
1106        pxor    xmm15, xmm3
1107        movdqa  xmm8, xmmword ptr [ROT8+rip]
1108        pshufb  xmm12, xmm8
1109        pshufb  xmm13, xmm8
1110        pshufb  xmm14, xmm8
1111        pshufb  xmm15, xmm8
1112        movdqa  xmm8, xmmword ptr [rsp+0x100]
1113        paddd   xmm8, xmm12
1114        paddd   xmm9, xmm13
1115        paddd   xmm10, xmm14
1116        paddd   xmm11, xmm15
1117        pxor    xmm4, xmm8
1118        pxor    xmm5, xmm9
1119        pxor    xmm6, xmm10
1120        pxor    xmm7, xmm11
1121        movdqa  xmmword ptr [rsp+0x100], xmm8
1122        movdqa  xmm8, xmm4
1123        psrld   xmm8, 7
1124        pslld   xmm4, 25
1125        por     xmm4, xmm8
1126        movdqa  xmm8, xmm5
1127        psrld   xmm8, 7
1128        pslld   xmm5, 25
1129        por     xmm5, xmm8
1130        movdqa  xmm8, xmm6
1131        psrld   xmm8, 7
1132        pslld   xmm6, 25
1133        por     xmm6, xmm8
1134        movdqa  xmm8, xmm7
1135        psrld   xmm8, 7
1136        pslld   xmm7, 25
1137        por     xmm7, xmm8
1138        paddd   xmm0, xmmword ptr [rsp+0xD0]
1139        paddd   xmm1, xmmword ptr [rsp]
1140        paddd   xmm2, xmmword ptr [rsp+0x20]
1141        paddd   xmm3, xmmword ptr [rsp+0x40]
1142        paddd   xmm0, xmm5
1143        paddd   xmm1, xmm6
1144        paddd   xmm2, xmm7
1145        paddd   xmm3, xmm4
1146        pxor    xmm15, xmm0
1147        pxor    xmm12, xmm1
1148        pxor    xmm13, xmm2
1149        pxor    xmm14, xmm3
1150        movdqa  xmm8, xmmword ptr [ROT16+rip]
1151        pshufb  xmm15, xmm8
1152        pshufb  xmm12, xmm8
1153        pshufb  xmm13, xmm8
1154        pshufb  xmm14, xmm8
1155        paddd   xmm10, xmm15
1156        paddd   xmm11, xmm12
1157        movdqa  xmm8, xmmword ptr [rsp+0x100]
1158        paddd   xmm8, xmm13
1159        paddd   xmm9, xmm14
1160        pxor    xmm5, xmm10
1161        pxor    xmm6, xmm11
1162        pxor    xmm7, xmm8
1163        pxor    xmm4, xmm9
1164        movdqa  xmmword ptr [rsp+0x100], xmm8
1165        movdqa  xmm8, xmm5
1166        psrld   xmm8, 12
1167        pslld   xmm5, 20
1168        por     xmm5, xmm8
1169        movdqa  xmm8, xmm6
1170        psrld   xmm8, 12
1171        pslld   xmm6, 20
1172        por     xmm6, xmm8
1173        movdqa  xmm8, xmm7
1174        psrld   xmm8, 12
1175        pslld   xmm7, 20
1176        por     xmm7, xmm8
1177        movdqa  xmm8, xmm4
1178        psrld   xmm8, 12
1179        pslld   xmm4, 20
1180        por     xmm4, xmm8
1181        paddd   xmm0, xmmword ptr [rsp+0x30]
1182        paddd   xmm1, xmmword ptr [rsp+0xA0]
1183        paddd   xmm2, xmmword ptr [rsp+0x60]
1184        paddd   xmm3, xmmword ptr [rsp+0x70]
1185        paddd   xmm0, xmm5
1186        paddd   xmm1, xmm6
1187        paddd   xmm2, xmm7
1188        paddd   xmm3, xmm4
1189        pxor    xmm15, xmm0
1190        pxor    xmm12, xmm1
1191        pxor    xmm13, xmm2
1192        pxor    xmm14, xmm3
1193        movdqa  xmm8, xmmword ptr [ROT8+rip]
1194        pshufb  xmm15, xmm8
1195        pshufb  xmm12, xmm8
1196        pshufb  xmm13, xmm8
1197        pshufb  xmm14, xmm8
1198        paddd   xmm10, xmm15
1199        paddd   xmm11, xmm12
1200        movdqa  xmm8, xmmword ptr [rsp+0x100]
1201        paddd   xmm8, xmm13
1202        paddd   xmm9, xmm14
1203        pxor    xmm5, xmm10
1204        pxor    xmm6, xmm11
1205        pxor    xmm7, xmm8
1206        pxor    xmm4, xmm9
1207        movdqa  xmmword ptr [rsp+0x100], xmm8
1208        movdqa  xmm8, xmm5
1209        psrld   xmm8, 7
1210        pslld   xmm5, 25
1211        por     xmm5, xmm8
1212        movdqa  xmm8, xmm6
1213        psrld   xmm8, 7
1214        pslld   xmm6, 25
1215        por     xmm6, xmm8
1216        movdqa  xmm8, xmm7
1217        psrld   xmm8, 7
1218        pslld   xmm7, 25
1219        por     xmm7, xmm8
1220        movdqa  xmm8, xmm4
1221        psrld   xmm8, 7
1222        pslld   xmm4, 25
1223        por     xmm4, xmm8
1224        paddd   xmm0, xmmword ptr [rsp+0xB0]
1225        paddd   xmm1, xmmword ptr [rsp+0x50]
1226        paddd   xmm2, xmmword ptr [rsp+0x10]
1227        paddd   xmm3, xmmword ptr [rsp+0x80]
1228        paddd   xmm0, xmm4
1229        paddd   xmm1, xmm5
1230        paddd   xmm2, xmm6
1231        paddd   xmm3, xmm7
1232        pxor    xmm12, xmm0
1233        pxor    xmm13, xmm1
1234        pxor    xmm14, xmm2
1235        pxor    xmm15, xmm3
1236        movdqa  xmm8, xmmword ptr [ROT16+rip]
1237        pshufb  xmm12, xmm8
1238        pshufb  xmm13, xmm8
1239        pshufb  xmm14, xmm8
1240        pshufb  xmm15, xmm8
1241        movdqa  xmm8, xmmword ptr [rsp+0x100]
1242        paddd   xmm8, xmm12
1243        paddd   xmm9, xmm13
1244        paddd   xmm10, xmm14
1245        paddd   xmm11, xmm15
1246        pxor    xmm4, xmm8
1247        pxor    xmm5, xmm9
1248        pxor    xmm6, xmm10
1249        pxor    xmm7, xmm11
1250        movdqa  xmmword ptr [rsp+0x100], xmm8
1251        movdqa  xmm8, xmm4
1252        psrld   xmm8, 12
1253        pslld   xmm4, 20
1254        por     xmm4, xmm8
1255        movdqa  xmm8, xmm5
1256        psrld   xmm8, 12
1257        pslld   xmm5, 20
1258        por     xmm5, xmm8
1259        movdqa  xmm8, xmm6
1260        psrld   xmm8, 12
1261        pslld   xmm6, 20
1262        por     xmm6, xmm8
1263        movdqa  xmm8, xmm7
1264        psrld   xmm8, 12
1265        pslld   xmm7, 20
1266        por     xmm7, xmm8
1267        paddd   xmm0, xmmword ptr [rsp+0xF0]
1268        paddd   xmm1, xmmword ptr [rsp]
1269        paddd   xmm2, xmmword ptr [rsp+0x90]
1270        paddd   xmm3, xmmword ptr [rsp+0x60]
1271        paddd   xmm0, xmm4
1272        paddd   xmm1, xmm5
1273        paddd   xmm2, xmm6
1274        paddd   xmm3, xmm7
1275        pxor    xmm12, xmm0
1276        pxor    xmm13, xmm1
1277        pxor    xmm14, xmm2
1278        pxor    xmm15, xmm3
1279        movdqa  xmm8, xmmword ptr [ROT8+rip]
1280        pshufb  xmm12, xmm8
1281        pshufb  xmm13, xmm8
1282        pshufb  xmm14, xmm8
1283        pshufb  xmm15, xmm8
1284        movdqa  xmm8, xmmword ptr [rsp+0x100]
1285        paddd   xmm8, xmm12
1286        paddd   xmm9, xmm13
1287        paddd   xmm10, xmm14
1288        paddd   xmm11, xmm15
1289        pxor    xmm4, xmm8
1290        pxor    xmm5, xmm9
1291        pxor    xmm6, xmm10
1292        pxor    xmm7, xmm11
1293        movdqa  xmmword ptr [rsp+0x100], xmm8
1294        movdqa  xmm8, xmm4
1295        psrld   xmm8, 7
1296        pslld   xmm4, 25
1297        por     xmm4, xmm8
1298        movdqa  xmm8, xmm5
1299        psrld   xmm8, 7
1300        pslld   xmm5, 25
1301        por     xmm5, xmm8
1302        movdqa  xmm8, xmm6
1303        psrld   xmm8, 7
1304        pslld   xmm6, 25
1305        por     xmm6, xmm8
1306        movdqa  xmm8, xmm7
1307        psrld   xmm8, 7
1308        pslld   xmm7, 25
1309        por     xmm7, xmm8
1310        paddd   xmm0, xmmword ptr [rsp+0xE0]
1311        paddd   xmm1, xmmword ptr [rsp+0x20]
1312        paddd   xmm2, xmmword ptr [rsp+0x30]
1313        paddd   xmm3, xmmword ptr [rsp+0x70]
1314        paddd   xmm0, xmm5
1315        paddd   xmm1, xmm6
1316        paddd   xmm2, xmm7
1317        paddd   xmm3, xmm4
1318        pxor    xmm15, xmm0
1319        pxor    xmm12, xmm1
1320        pxor    xmm13, xmm2
1321        pxor    xmm14, xmm3
1322        movdqa  xmm8, xmmword ptr [ROT16+rip]
1323        pshufb  xmm15, xmm8
1324        pshufb  xmm12, xmm8
1325        pshufb  xmm13, xmm8
1326        pshufb  xmm14, xmm8
1327        paddd   xmm10, xmm15
1328        paddd   xmm11, xmm12
1329        movdqa  xmm8, xmmword ptr [rsp+0x100]
1330        paddd   xmm8, xmm13
1331        paddd   xmm9, xmm14
1332        pxor    xmm5, xmm10
1333        pxor    xmm6, xmm11
1334        pxor    xmm7, xmm8
1335        pxor    xmm4, xmm9
1336        movdqa  xmmword ptr [rsp+0x100], xmm8
1337        movdqa  xmm8, xmm5
1338        psrld   xmm8, 12
1339        pslld   xmm5, 20
1340        por     xmm5, xmm8
1341        movdqa  xmm8, xmm6
1342        psrld   xmm8, 12
1343        pslld   xmm6, 20
1344        por     xmm6, xmm8
1345        movdqa  xmm8, xmm7
1346        psrld   xmm8, 12
1347        pslld   xmm7, 20
1348        por     xmm7, xmm8
1349        movdqa  xmm8, xmm4
1350        psrld   xmm8, 12
1351        pslld   xmm4, 20
1352        por     xmm4, xmm8
1353        paddd   xmm0, xmmword ptr [rsp+0xA0]
1354        paddd   xmm1, xmmword ptr [rsp+0xC0]
1355        paddd   xmm2, xmmword ptr [rsp+0x40]
1356        paddd   xmm3, xmmword ptr [rsp+0xD0]
1357        paddd   xmm0, xmm5
1358        paddd   xmm1, xmm6
1359        paddd   xmm2, xmm7
1360        paddd   xmm3, xmm4
1361        pxor    xmm15, xmm0
1362        pxor    xmm12, xmm1
1363        pxor    xmm13, xmm2
1364        pxor    xmm14, xmm3
1365        movdqa  xmm8, xmmword ptr [ROT8+rip]
1366        pshufb  xmm15, xmm8
1367        pshufb  xmm12, xmm8
1368        pshufb  xmm13, xmm8
1369        pshufb  xmm14, xmm8
1370        paddd   xmm10, xmm15
1371        paddd   xmm11, xmm12
1372        movdqa  xmm8, xmmword ptr [rsp+0x100]
1373        paddd   xmm8, xmm13
1374        paddd   xmm9, xmm14
1375        pxor    xmm5, xmm10
1376        pxor    xmm6, xmm11
1377        pxor    xmm7, xmm8
1378        pxor    xmm4, xmm9
1379        pxor    xmm0, xmm8
1380        pxor    xmm1, xmm9
1381        pxor    xmm2, xmm10
1382        pxor    xmm3, xmm11
1383        movdqa  xmm8, xmm5
1384        psrld   xmm8, 7
1385        pslld   xmm5, 25
1386        por     xmm5, xmm8
1387        movdqa  xmm8, xmm6
1388        psrld   xmm8, 7
1389        pslld   xmm6, 25
1390        por     xmm6, xmm8
1391        movdqa  xmm8, xmm7
1392        psrld   xmm8, 7
1393        pslld   xmm7, 25
1394        por     xmm7, xmm8
1395        movdqa  xmm8, xmm4
1396        psrld   xmm8, 7
1397        pslld   xmm4, 25
1398        por     xmm4, xmm8
1399        pxor    xmm4, xmm12
1400        pxor    xmm5, xmm13
1401        pxor    xmm6, xmm14
1402        pxor    xmm7, xmm15
1403        mov     eax, r13d
1404        jne     9b
1405        movdqa  xmm9, xmm0
1406        punpckldq xmm0, xmm1
1407        punpckhdq xmm9, xmm1
1408        movdqa  xmm11, xmm2
1409        punpckldq xmm2, xmm3
1410        punpckhdq xmm11, xmm3
1411        movdqa  xmm1, xmm0
1412        punpcklqdq xmm0, xmm2
1413        punpckhqdq xmm1, xmm2
1414        movdqa  xmm3, xmm9
1415        punpcklqdq xmm9, xmm11
1416        punpckhqdq xmm3, xmm11
1417        movdqu  xmmword ptr [rbx], xmm0
1418        movdqu  xmmword ptr [rbx+0x20], xmm1
1419        movdqu  xmmword ptr [rbx+0x40], xmm9
1420        movdqu  xmmword ptr [rbx+0x60], xmm3
1421        movdqa  xmm9, xmm4
1422        punpckldq xmm4, xmm5
1423        punpckhdq xmm9, xmm5
1424        movdqa  xmm11, xmm6
1425        punpckldq xmm6, xmm7
1426        punpckhdq xmm11, xmm7
1427        movdqa  xmm5, xmm4
1428        punpcklqdq xmm4, xmm6
1429        punpckhqdq xmm5, xmm6
1430        movdqa  xmm7, xmm9
1431        punpcklqdq xmm9, xmm11
1432        punpckhqdq xmm7, xmm11
1433        movdqu  xmmword ptr [rbx+0x10], xmm4
1434        movdqu  xmmword ptr [rbx+0x30], xmm5
1435        movdqu  xmmword ptr [rbx+0x50], xmm9
1436        movdqu  xmmword ptr [rbx+0x70], xmm7
1437        movdqa  xmm1, xmmword ptr [rsp+0x110]
1438        movdqa  xmm0, xmm1
1439        paddd   xmm1, xmmword ptr [rsp+0x150]
1440        movdqa  xmmword ptr [rsp+0x110], xmm1
1441        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1442        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1443        pcmpgtd xmm0, xmm1
1444        movdqa  xmm1, xmmword ptr [rsp+0x120]
1445        psubd   xmm1, xmm0
1446        movdqa  xmmword ptr [rsp+0x120], xmm1
1447        add     rbx, 128
1448        add     rdi, 32
1449        sub     rsi, 4
1450        cmp     rsi, 4
1451        jnc     2b
1452        test    rsi, rsi
1453        jnz     3f
14544:
1455        mov     rsp, rbp
1456        pop     rbp
1457        pop     rbx
1458        pop     r12
1459        pop     r13
1460        pop     r14
1461        pop     r15
1462        RET
1463.p2align 5
14643:
1465        test    esi, 0x2
1466        je      3f
1467        movups  xmm0, xmmword ptr [rcx]
1468        movups  xmm1, xmmword ptr [rcx+0x10]
1469        movaps  xmm8, xmm0
1470        movaps  xmm9, xmm1
1471        movd    xmm13, dword ptr [rsp+0x110]
1472        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1473        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1474        movaps  xmmword ptr [rsp], xmm13
1475        movd    xmm14, dword ptr [rsp+0x114]
1476        pinsrd  xmm14, dword ptr [rsp+0x124], 1
1477        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1478        movaps  xmmword ptr [rsp+0x10], xmm14
1479        mov     r8, qword ptr [rdi]
1480        mov     r9, qword ptr [rdi+0x8]
1481        movzx   eax, byte ptr [rbp+0x40]
1482        or      eax, r13d
1483        xor     edx, edx
14842:
1485        mov     r14d, eax
1486        or      eax, r12d
1487        add     rdx, 64
1488        cmp     rdx, r15
1489        cmovne  eax, r14d
1490        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1491        movaps  xmm10, xmm2
1492        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1493        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1494        movaps  xmm3, xmm4
1495        shufps  xmm4, xmm5, 136
1496        shufps  xmm3, xmm5, 221
1497        movaps  xmm5, xmm3
1498        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1499        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1500        movaps  xmm3, xmm6
1501        shufps  xmm6, xmm7, 136
1502        pshufd  xmm6, xmm6, 0x93
1503        shufps  xmm3, xmm7, 221
1504        pshufd  xmm7, xmm3, 0x93
1505        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1506        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1507        movaps  xmm11, xmm12
1508        shufps  xmm12, xmm13, 136
1509        shufps  xmm11, xmm13, 221
1510        movaps  xmm13, xmm11
1511        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1512        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1513        movaps  xmm11, xmm14
1514        shufps  xmm14, xmm15, 136
1515        pshufd  xmm14, xmm14, 0x93
1516        shufps  xmm11, xmm15, 221
1517        pshufd  xmm15, xmm11, 0x93
1518        movaps  xmm3, xmmword ptr [rsp]
1519        movaps  xmm11, xmmword ptr [rsp+0x10]
1520        pinsrd  xmm3, eax, 3
1521        pinsrd  xmm11, eax, 3
1522        mov     al, 7
15239:
1524        paddd   xmm0, xmm4
1525        paddd   xmm8, xmm12
1526        movaps  xmmword ptr [rsp+0x20], xmm4
1527        movaps  xmmword ptr [rsp+0x30], xmm12
1528        paddd   xmm0, xmm1
1529        paddd   xmm8, xmm9
1530        pxor    xmm3, xmm0
1531        pxor    xmm11, xmm8
1532        movaps  xmm12, xmmword ptr [ROT16+rip]
1533        pshufb  xmm3, xmm12
1534        pshufb  xmm11, xmm12
1535        paddd   xmm2, xmm3
1536        paddd   xmm10, xmm11
1537        pxor    xmm1, xmm2
1538        pxor    xmm9, xmm10
1539        movdqa  xmm4, xmm1
1540        pslld   xmm1, 20
1541        psrld   xmm4, 12
1542        por     xmm1, xmm4
1543        movdqa  xmm4, xmm9
1544        pslld   xmm9, 20
1545        psrld   xmm4, 12
1546        por     xmm9, xmm4
1547        paddd   xmm0, xmm5
1548        paddd   xmm8, xmm13
1549        movaps  xmmword ptr [rsp+0x40], xmm5
1550        movaps  xmmword ptr [rsp+0x50], xmm13
1551        paddd   xmm0, xmm1
1552        paddd   xmm8, xmm9
1553        pxor    xmm3, xmm0
1554        pxor    xmm11, xmm8
1555        movaps  xmm13, xmmword ptr [ROT8+rip]
1556        pshufb  xmm3, xmm13
1557        pshufb  xmm11, xmm13
1558        paddd   xmm2, xmm3
1559        paddd   xmm10, xmm11
1560        pxor    xmm1, xmm2
1561        pxor    xmm9, xmm10
1562        movdqa  xmm4, xmm1
1563        pslld   xmm1, 25
1564        psrld   xmm4, 7
1565        por     xmm1, xmm4
1566        movdqa  xmm4, xmm9
1567        pslld   xmm9, 25
1568        psrld   xmm4, 7
1569        por     xmm9, xmm4
1570        pshufd  xmm0, xmm0, 0x93
1571        pshufd  xmm8, xmm8, 0x93
1572        pshufd  xmm3, xmm3, 0x4E
1573        pshufd  xmm11, xmm11, 0x4E
1574        pshufd  xmm2, xmm2, 0x39
1575        pshufd  xmm10, xmm10, 0x39
1576        paddd   xmm0, xmm6
1577        paddd   xmm8, xmm14
1578        paddd   xmm0, xmm1
1579        paddd   xmm8, xmm9
1580        pxor    xmm3, xmm0
1581        pxor    xmm11, xmm8
1582        pshufb  xmm3, xmm12
1583        pshufb  xmm11, xmm12
1584        paddd   xmm2, xmm3
1585        paddd   xmm10, xmm11
1586        pxor    xmm1, xmm2
1587        pxor    xmm9, xmm10
1588        movdqa  xmm4, xmm1
1589        pslld   xmm1, 20
1590        psrld   xmm4, 12
1591        por     xmm1, xmm4
1592        movdqa  xmm4, xmm9
1593        pslld   xmm9, 20
1594        psrld   xmm4, 12
1595        por     xmm9, xmm4
1596        paddd   xmm0, xmm7
1597        paddd   xmm8, xmm15
1598        paddd   xmm0, xmm1
1599        paddd   xmm8, xmm9
1600        pxor    xmm3, xmm0
1601        pxor    xmm11, xmm8
1602        pshufb  xmm3, xmm13
1603        pshufb  xmm11, xmm13
1604        paddd   xmm2, xmm3
1605        paddd   xmm10, xmm11
1606        pxor    xmm1, xmm2
1607        pxor    xmm9, xmm10
1608        movdqa  xmm4, xmm1
1609        pslld   xmm1, 25
1610        psrld   xmm4, 7
1611        por     xmm1, xmm4
1612        movdqa  xmm4, xmm9
1613        pslld   xmm9, 25
1614        psrld   xmm4, 7
1615        por     xmm9, xmm4
1616        pshufd  xmm0, xmm0, 0x39
1617        pshufd  xmm8, xmm8, 0x39
1618        pshufd  xmm3, xmm3, 0x4E
1619        pshufd  xmm11, xmm11, 0x4E
1620        pshufd  xmm2, xmm2, 0x93
1621        pshufd  xmm10, xmm10, 0x93
1622        dec     al
1623        je      9f
1624        movdqa  xmm12, xmmword ptr [rsp+0x20]
1625        movdqa  xmm5, xmmword ptr [rsp+0x40]
1626        pshufd  xmm13, xmm12, 0x0F
1627        shufps  xmm12, xmm5, 214
1628        pshufd  xmm4, xmm12, 0x39
1629        movdqa  xmm12, xmm6
1630        shufps  xmm12, xmm7, 250
1631        pblendw xmm13, xmm12, 0xCC
1632        movdqa  xmm12, xmm7
1633        punpcklqdq xmm12, xmm5
1634        pblendw xmm12, xmm6, 0xC0
1635        pshufd  xmm12, xmm12, 0x78
1636        punpckhdq xmm5, xmm7
1637        punpckldq xmm6, xmm5
1638        pshufd  xmm7, xmm6, 0x1E
1639        movdqa  xmmword ptr [rsp+0x20], xmm13
1640        movdqa  xmmword ptr [rsp+0x40], xmm12
1641        movdqa  xmm5, xmmword ptr [rsp+0x30]
1642        movdqa  xmm13, xmmword ptr [rsp+0x50]
1643        pshufd  xmm6, xmm5, 0x0F
1644        shufps  xmm5, xmm13, 214
1645        pshufd  xmm12, xmm5, 0x39
1646        movdqa  xmm5, xmm14
1647        shufps  xmm5, xmm15, 250
1648        pblendw xmm6, xmm5, 0xCC
1649        movdqa  xmm5, xmm15
1650        punpcklqdq xmm5, xmm13
1651        pblendw xmm5, xmm14, 0xC0
1652        pshufd  xmm5, xmm5, 0x78
1653        punpckhdq xmm13, xmm15
1654        punpckldq xmm14, xmm13
1655        pshufd  xmm15, xmm14, 0x1E
1656        movdqa  xmm13, xmm6
1657        movdqa  xmm14, xmm5
1658        movdqa  xmm5, xmmword ptr [rsp+0x20]
1659        movdqa  xmm6, xmmword ptr [rsp+0x40]
1660        jmp     9b
16619:
1662        pxor    xmm0, xmm2
1663        pxor    xmm1, xmm3
1664        pxor    xmm8, xmm10
1665        pxor    xmm9, xmm11
1666        mov     eax, r13d
1667        cmp     rdx, r15
1668        jne     2b
1669        movups  xmmword ptr [rbx], xmm0
1670        movups  xmmword ptr [rbx+0x10], xmm1
1671        movups  xmmword ptr [rbx+0x20], xmm8
1672        movups  xmmword ptr [rbx+0x30], xmm9
1673        movdqa  xmm0, xmmword ptr [rsp+0x130]
1674        movdqa  xmm1, xmmword ptr [rsp+0x110]
1675        movdqa  xmm2, xmmword ptr [rsp+0x120]
1676        movdqu  xmm3, xmmword ptr [rsp+0x118]
1677        movdqu  xmm4, xmmword ptr [rsp+0x128]
1678        blendvps xmm1, xmm3, xmm0
1679        blendvps xmm2, xmm4, xmm0
1680        movdqa  xmmword ptr [rsp+0x110], xmm1
1681        movdqa  xmmword ptr [rsp+0x120], xmm2
1682        add     rdi, 16
1683        add     rbx, 64
1684        sub     rsi, 2
16853:
1686        test    esi, 0x1
1687        je      4b
1688        movups  xmm0, xmmword ptr [rcx]
1689        movups  xmm1, xmmword ptr [rcx+0x10]
1690        movd    xmm13, dword ptr [rsp+0x110]
1691        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1692        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1693        movaps  xmm14, xmmword ptr [ROT8+rip]
1694        movaps  xmm15, xmmword ptr [ROT16+rip]
1695        mov     r8, qword ptr [rdi]
1696        movzx   eax, byte ptr [rbp+0x40]
1697        or      eax, r13d
1698        xor     edx, edx
16992:
1700        mov     r14d, eax
1701        or      eax, r12d
1702        add     rdx, 64
1703        cmp     rdx, r15
1704        cmovne  eax, r14d
1705        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1706        movaps  xmm3, xmm13
1707        pinsrd  xmm3, eax, 3
1708        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1709        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1710        movaps  xmm8, xmm4
1711        shufps  xmm4, xmm5, 136
1712        shufps  xmm8, xmm5, 221
1713        movaps  xmm5, xmm8
1714        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1715        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1716        movaps  xmm8, xmm6
1717        shufps  xmm6, xmm7, 136
1718        pshufd  xmm6, xmm6, 0x93
1719        shufps  xmm8, xmm7, 221
1720        pshufd  xmm7, xmm8, 0x93
1721        mov     al, 7
17229:
1723        paddd   xmm0, xmm4
1724        paddd   xmm0, xmm1
1725        pxor    xmm3, xmm0
1726        pshufb  xmm3, xmm15
1727        paddd   xmm2, xmm3
1728        pxor    xmm1, xmm2
1729        movdqa  xmm11, xmm1
1730        pslld   xmm1, 20
1731        psrld   xmm11, 12
1732        por     xmm1, xmm11
1733        paddd   xmm0, xmm5
1734        paddd   xmm0, xmm1
1735        pxor    xmm3, xmm0
1736        pshufb  xmm3, xmm14
1737        paddd   xmm2, xmm3
1738        pxor    xmm1, xmm2
1739        movdqa  xmm11, xmm1
1740        pslld   xmm1, 25
1741        psrld   xmm11, 7
1742        por     xmm1, xmm11
1743        pshufd  xmm0, xmm0, 0x93
1744        pshufd  xmm3, xmm3, 0x4E
1745        pshufd  xmm2, xmm2, 0x39
1746        paddd   xmm0, xmm6
1747        paddd   xmm0, xmm1
1748        pxor    xmm3, xmm0
1749        pshufb  xmm3, xmm15
1750        paddd   xmm2, xmm3
1751        pxor    xmm1, xmm2
1752        movdqa  xmm11, xmm1
1753        pslld   xmm1, 20
1754        psrld   xmm11, 12
1755        por     xmm1, xmm11
1756        paddd   xmm0, xmm7
1757        paddd   xmm0, xmm1
1758        pxor    xmm3, xmm0
1759        pshufb  xmm3, xmm14
1760        paddd   xmm2, xmm3
1761        pxor    xmm1, xmm2
1762        movdqa  xmm11, xmm1
1763        pslld   xmm1, 25
1764        psrld   xmm11, 7
1765        por     xmm1, xmm11
1766        pshufd  xmm0, xmm0, 0x39
1767        pshufd  xmm3, xmm3, 0x4E
1768        pshufd  xmm2, xmm2, 0x93
1769        dec     al
1770        jz      9f
1771        movdqa  xmm8, xmm4
1772        shufps  xmm8, xmm5, 214
1773        pshufd  xmm9, xmm4, 0x0F
1774        pshufd  xmm4, xmm8, 0x39
1775        movdqa  xmm8, xmm6
1776        shufps  xmm8, xmm7, 250
1777        pblendw xmm9, xmm8, 0xCC
1778        movdqa  xmm8, xmm7
1779        punpcklqdq xmm8, xmm5
1780        pblendw xmm8, xmm6, 0xC0
1781        pshufd  xmm8, xmm8, 0x78
1782        punpckhdq xmm5, xmm7
1783        punpckldq xmm6, xmm5
1784        pshufd  xmm7, xmm6, 0x1E
1785        movdqa  xmm5, xmm9
1786        movdqa  xmm6, xmm8
1787        jmp     9b
17889:
1789        pxor    xmm0, xmm2
1790        pxor    xmm1, xmm3
1791        mov     eax, r13d
1792        cmp     rdx, r15
1793        jne     2b
1794        movups  xmmword ptr [rbx], xmm0
1795        movups  xmmword ptr [rbx+0x10], xmm1
1796        jmp     4b
1797SET_SIZE(zfs_blake3_hash_many_sse41)
1798
/*
 * zfs_blake3_compress_in_place_sse41
 *
 * Runs seven rounds of the BLAKE3 compression function over one 64-byte
 * message block and overwrites the 32-byte state at [rdi] with the result.
 *
 * Presumed C prototype (declared outside this file -- confirm against the
 * C header before relying on the parameter names):
 *   void zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
 *       const uint8_t block[64], uint8_t block_len, uint64_t counter,
 *       uint8_t flags);
 *
 * Registers are consumed in System V AMD64 argument order:
 *   rdi  in/out  32-byte state, read and written with unaligned moves
 *   rsi  in      64-byte message block, read with unaligned loads
 *   dl   in      byte placed in dword 2 of state row 3 (block_len)
 *   rcx  in      64-bit value placed in dwords 0-1 of state row 3 (counter)
 *   r8b  in      byte placed in dword 3 of state row 3 (flags)
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 * Stack:    not touched by the instructions below (ENDBR and RET are
 *           macros defined elsewhere).
 *
 * Only the low 8 bits of rdx and r8 are used.  Both are zero-extended
 * explicitly, exactly as zfs_blake3_compress_xof_sse41 does, because the
 * psABI leaves the upper bits of sub-32-bit integer arguments unspecified
 * and any stray bit would otherwise be mixed into the state.
 */
ENTRY_ALIGN(zfs_blake3_compress_in_place_sse41, 64)
        ENDBR
        /* State rows: xmm0 = row 0, xmm1 = row 1, xmm2 = row 2, xmm3 = row 3. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* rdx = zext(dl) | (zext(r8b) << 32); becomes the high qword of row 3. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4           /* xmm3 = { rcx, rdx } */
        /*
         * Load the sixteen message dwords m0..m15 and split them into
         * even/odd vectors:
         *   xmm4 = m0  m2  m4  m6      xmm5 = m1  m3  m5  m7
         *   xmm6 = m14 m8  m10 m12     xmm7 = m15 m9  m11 m13
         * (xmm6/xmm7 are pre-rotated by one lane with pshufd 0x93 so they
         * line up with the rotated rows of the diagonal step.)
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        /* pshufb masks kept resident for the whole loop (tables defined elsewhere). */
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                   /* round counter */
9:
        /* Column step, first half: uses xmm4, ROT16 shuffle, rotate right 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Column step, second half: uses xmm5, ROT8 shuffle, rotate right 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate rows 0, 3 and 2 by 1, 2 and 3 lanes so diagonals sit in columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonal step, first half: uses xmm6, ROT16 shuffle, rotate right 12. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonal step, second half: uses xmm7, ROT8 shuffle, rotate right 7. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the row rotation (inverse shuffles of the ones above). */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                      /* seventh round done: skip the reshuffle */
        /*
         * Re-arrange the message vectors xmm4..xmm7 for the next round
         * (presumably the BLAKE3 message permutation expressed on this lane
         * layout).  xmm8 and xmm9 are scratch.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Feed-forward: fold rows 2 and 3 into rows 0 and 1, then store in place. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        RET
SET_SIZE(zfs_blake3_compress_in_place_sse41)
1898
/*
 * zfs_blake3_compress_xof_sse41
 *
 * Runs seven rounds of the compression loop below on a single 64-byte
 * message block and writes a 64-byte result to [r9].  The 4x4 state of
 * 32-bit words is kept as four rows in xmm0..xmm3.
 *
 * Register inputs, as consumed by the code:
 *   rdi - 32 bytes are read (unaligned) as the initial xmm0/xmm1 rows and
 *         read again at the end for the upper half of the output
 *   rsi - 64 bytes are read (unaligned) as sixteen 32-bit message words
 *   dl  - zero-extended, becomes lane 2 of xmm3
 *   rcx - 64-bit value, becomes lanes 0-1 of xmm3
 *   r8b - zero-extended, becomes lane 3 of xmm3
 *   r9  - 64 bytes are written (unaligned stores)
 * NOTE(review): in System V argument order these presumably correspond to
 * (cv, block, block_len, counter, flags, out) of the upstream BLAKE3
 * prototype - confirm against the C declaration.
 *
 * Modifies rax, rdx, flags, xmm0-xmm9, xmm11, xmm14 and xmm15 (plus
 * whatever the ENDBR/RET macros expand to).  The stack is not touched.
 * BLAKE3_IV, ROT8 and ROT16 are loaded with movaps and therefore must be
 * 16-byte aligned.
 */
1899ENTRY_ALIGN(zfs_blake3_compress_xof_sse41, 64)
1900        ENDBR
        /* Rows 0 and 1 of the state come from the 32 bytes at [rdi]. */
1901        movups  xmm0, xmmword ptr [rdi]
1902        movups  xmm1, xmmword ptr [rdi+0x10]
        /* Row 2 is the 16-byte BLAKE3_IV table. */
1903        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Build row 3: rdx = (r8b << 32) | dl, then
         * xmm3 = { rcx[31:0], rcx[63:32], dl, r8b }.
         */
1904        movzx   eax, r8b
1905        movzx   edx, dl
1906        shl     rax, 32
1907        add     rdx, rax
1908        movq    xmm3, rcx
1909        movq    xmm4, rdx
1910        punpcklqdq xmm3, xmm4
        /*
         * Load message words w0..w7 and de-interleave them:
         * shufps 136 (0x88) keeps the even-indexed words, 221 (0xDD) the
         * odd-indexed ones, so xmm4 = {w0,w2,w4,w6}, xmm5 = {w1,w3,w5,w7}.
         */
1911        movups  xmm4, xmmword ptr [rsi]
1912        movups  xmm5, xmmword ptr [rsi+0x10]
1913        movaps  xmm8, xmm4
1914        shufps  xmm4, xmm5, 136
1915        shufps  xmm8, xmm5, 221
1916        movaps  xmm5, xmm8
        /*
         * Same split for w8..w15, followed by a one-lane rotation
         * (pshufd 0x93): xmm6 = {w14,w8,w10,w12}, xmm7 = {w15,w9,w11,w13}.
         */
1917        movups  xmm6, xmmword ptr [rsi+0x20]
1918        movups  xmm7, xmmword ptr [rsi+0x30]
1919        movaps  xmm8, xmm6
1920        shufps  xmm6, xmm7, 136
1921        pshufd  xmm6, xmm6, 0x93
1922        shufps  xmm8, xmm7, 221
1923        pshufd  xmm7, xmm8, 0x93
        /* pshufb masks used for the 8-bit and 16-bit rotations in the loop. */
1924        movaps  xmm14, xmmword ptr [ROT8+rip]
1925        movaps  xmm15, xmmword ptr [ROT16+rip]
        /*
         * al = iteration counter (7 passes).  The earlier contents of rax
         * were already folded into rdx/xmm3 and are no longer needed.
         */
1926        mov     al, 7
        /* Top of the loop; each pass consumes xmm4..xmm7 as message input. */
19279:
        /*
         * First half, step 1: row0 += xmm4 + row1; row3 ^= row0, rotated by
         * 16 bits (ROT16 mask); row2 += row3; row1 ^= row2, rotated right by
         * 12 bits (shift left 20 | shift right 12, xmm11 is scratch).
         */
1928        paddd   xmm0, xmm4
1929        paddd   xmm0, xmm1
1930        pxor    xmm3, xmm0
1931        pshufb  xmm3, xmm15
1932        paddd   xmm2, xmm3
1933        pxor    xmm1, xmm2
1934        movdqa  xmm11, xmm1
1935        pslld   xmm1, 20
1936        psrld   xmm11, 12
1937        por     xmm1, xmm11
        /*
         * First half, step 2: same pattern with xmm5 as message input,
         * row3 rotated right by 8 bits (ROT8 mask) and row1 rotated right
         * by 7 bits (shift left 25 | shift right 7).
         */
1938        paddd   xmm0, xmm5
1939        paddd   xmm0, xmm1
1940        pxor    xmm3, xmm0
1941        pshufb  xmm3, xmm14
1942        paddd   xmm2, xmm3
1943        pxor    xmm1, xmm2
1944        movdqa  xmm11, xmm1
1945        pslld   xmm1, 25
1946        psrld   xmm11, 7
1947        por     xmm1, xmm11
        /*
         * Rotate the lanes of rows 0, 3 and 2 (by 0x93, 0x4E and 0x39
         * respectively) so the second half mixes a different lane
         * alignment; row 1 stays in place.
         */
1948        pshufd  xmm0, xmm0, 0x93
1949        pshufd  xmm3, xmm3, 0x4E
1950        pshufd  xmm2, xmm2, 0x39
        /* Second half, step 1: as above with xmm6 (rotations 16 and 12). */
1951        paddd   xmm0, xmm6
1952        paddd   xmm0, xmm1
1953        pxor    xmm3, xmm0
1954        pshufb  xmm3, xmm15
1955        paddd   xmm2, xmm3
1956        pxor    xmm1, xmm2
1957        movdqa  xmm11, xmm1
1958        pslld   xmm1, 20
1959        psrld   xmm11, 12
1960        por     xmm1, xmm11
        /* Second half, step 2: as above with xmm7 (rotations 8 and 7). */
1961        paddd   xmm0, xmm7
1962        paddd   xmm0, xmm1
1963        pxor    xmm3, xmm0
1964        pshufb  xmm3, xmm14
1965        paddd   xmm2, xmm3
1966        pxor    xmm1, xmm2
1967        movdqa  xmm11, xmm1
1968        pslld   xmm1, 25
1969        psrld   xmm11, 7
1970        por     xmm1, xmm11
        /* Undo the lane rotation (inverse shuffles: 0x39, 0x4E, 0x93). */
1971        pshufd  xmm0, xmm0, 0x39
1972        pshufd  xmm3, xmm3, 0x4E
1973        pshufd  xmm2, xmm2, 0x93
        /* Leave after the seventh pass; otherwise prepare the next one. */
1974        dec     al
1975        jz      9f
        /*
         * Reorder the message words held in xmm4..xmm7 for the next pass.
         * xmm8 and xmm9 are temporaries; the new xmm5/xmm6 values are built
         * there first because the old xmm5/xmm6 are still needed as inputs.
         */
1976        movdqa  xmm8, xmm4
1977        shufps  xmm8, xmm5, 214
1978        pshufd  xmm9, xmm4, 0x0F
1979        pshufd  xmm4, xmm8, 0x39
1980        movdqa  xmm8, xmm6
1981        shufps  xmm8, xmm7, 250
1982        pblendw xmm9, xmm8, 0xCC
1983        movdqa  xmm8, xmm7
1984        punpcklqdq xmm8, xmm5
1985        pblendw xmm8, xmm6, 0xC0
1986        pshufd  xmm8, xmm8, 0x78
1987        punpckhdq xmm5, xmm7
1988        punpckldq xmm6, xmm5
1989        pshufd  xmm7, xmm6, 0x1E
1990        movdqa  xmm5, xmm9
1991        movdqa  xmm6, xmm8
1992        jmp     9b
        /* Loop exit: produce the 64-byte output. */
19939:
        /* Reload the original 32 bytes at [rdi] (xmm4/xmm5 are free now). */
1994        movdqu  xmm4, xmmword ptr [rdi]
1995        movdqu  xmm5, xmmword ptr [rdi+0x10]
        /*
         * Lower 32 bytes: row0 ^ row2 and row1 ^ row3.
         * Upper 32 bytes: row2 and row3 each XORed with the reloaded input.
         * The order matters: xmm0/xmm1 must consume xmm2/xmm3 before those
         * are overwritten.
         */
1996        pxor    xmm0, xmm2
1997        pxor    xmm1, xmm3
1998        pxor    xmm2, xmm4
1999        pxor    xmm3, xmm5
        /* Unaligned stores: no alignment requirement on the r9 buffer. */
2000        movups  xmmword ptr [r9], xmm0
2001        movups  xmmword ptr [r9+0x10], xmm1
2002        movups  xmmword ptr [r9+0x20], xmm2
2003        movups  xmmword ptr [r9+0x30], xmm3
2004        RET
2005SET_SIZE(zfs_blake3_compress_xof_sse41)
2006
/*
 * Constant tables.  The block starts on a 64-byte boundary (.p2align 6)
 * and every table is exactly 16 bytes long, so each label is 16-byte
 * aligned; the code loads several of them with movaps / aligned memory
 * operands and depends on that.  Do not insert or resize entries without
 * preserving the 16-byte alignment of every label that follows.
 * NOTE(review): SECTION_STATIC is a macro defined outside this file
 * (presumably in sys/asm_linkage.h) - confirm which section it selects.
 */
2007SECTION_STATIC
2008
2009.p2align  6
/* Four 32-bit words, loaded as one 16-byte row by the compress routines. */
2010BLAKE3_IV:
2011        .long  0x6A09E667, 0xBB67AE85
2012        .long  0x3C6EF372, 0xA54FF53A
/* pshufb mask: swaps the 16-bit halves of each dword (rotate by 16). */
2013ROT16:
2014        .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* pshufb mask: rotates each dword right by 8 bits. */
2015ROT8:
2016        .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/* Per-lane values 0..3; ANDed with a mask in zfs_blake3_hash_many_sse41. */
2017ADD0:
2018        .long  0, 1, 2, 3
/* The value 4 in every lane; ANDed with a mask in zfs_blake3_hash_many_sse41. */
2019ADD1:
2020	.long  4, 4, 4, 4
/* Each of the four BLAKE3_IV words replicated across all four lanes. */
2021BLAKE3_IV_0:
2022	.long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2023BLAKE3_IV_1:
2024	.long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2025BLAKE3_IV_2:
2026	.long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2027BLAKE3_IV_3:
2028	.long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane. */
2029BLAKE3_BLOCK_LEN:
2030	.long  64, 64, 64, 64
/* Only the most significant bit of each dword set. */
2031CMP_MSB_MASK:
2032	.long  0x80000000, 0x80000000, 0x80000000, 0x80000000
2033
2034#endif	/* HAVE_SSE4_1 */
2035
2036#ifdef __ELF__
2037.section .note.GNU-stack,"",%progbits
2038#endif
2039