xref: /freebsd/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1// SPDX-License-Identifier: CDDL-1.0
2/*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23/*
24 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
25 * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
26 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
27 */
28
29#if defined(HAVE_SSE2)
30
31#define _ASM
32#include <sys/asm_linkage.h>
33
34.intel_syntax noprefix
35
36SECTION_TEXT
37
38ENTRY_ALIGN(zfs_blake3_hash_many_sse2, 64)
39        ENDBR
40        push    r15
41        push    r14
42        push    r13
43        push    r12
44        push    rbx
45        push    rbp
46        mov     rbp, rsp
47        sub     rsp, 360
48        and     rsp, 0xFFFFFFFFFFFFFFC0
49        neg     r9d
50        movd    xmm0, r9d
51        pshufd  xmm0, xmm0, 0x00
52        movdqa  xmmword ptr [rsp+0x130], xmm0
53        movdqa  xmm1, xmm0
54        pand    xmm1, xmmword ptr [ADD0+rip]
55        pand    xmm0, xmmword ptr [ADD1+rip]
56        movdqa  xmmword ptr [rsp+0x150], xmm0
57        movd    xmm0, r8d
58        pshufd  xmm0, xmm0, 0x00
59        paddd   xmm0, xmm1
60        movdqa  xmmword ptr [rsp+0x110], xmm0
61        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
62        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
63        pcmpgtd xmm1, xmm0
64        shr     r8, 32
65        movd    xmm2, r8d
66        pshufd  xmm2, xmm2, 0x00
67        psubd   xmm2, xmm1
68        movdqa  xmmword ptr [rsp+0x120], xmm2
69        mov     rbx, qword ptr [rbp+0x50]
70        mov     r15, rdx
71        shl     r15, 6
72        movzx   r13d, byte ptr [rbp+0x38]
73        movzx   r12d, byte ptr [rbp+0x48]
74        cmp     rsi, 4
75        jc      3f
762:
77        movdqu  xmm3, xmmword ptr [rcx]
78        pshufd  xmm0, xmm3, 0x00
79        pshufd  xmm1, xmm3, 0x55
80        pshufd  xmm2, xmm3, 0xAA
81        pshufd  xmm3, xmm3, 0xFF
82        movdqu  xmm7, xmmword ptr [rcx+0x10]
83        pshufd  xmm4, xmm7, 0x00
84        pshufd  xmm5, xmm7, 0x55
85        pshufd  xmm6, xmm7, 0xAA
86        pshufd  xmm7, xmm7, 0xFF
87        mov     r8, qword ptr [rdi]
88        mov     r9, qword ptr [rdi+0x8]
89        mov     r10, qword ptr [rdi+0x10]
90        mov     r11, qword ptr [rdi+0x18]
91        movzx   eax, byte ptr [rbp+0x40]
92        or      eax, r13d
93        xor     edx, edx
949:
95        mov     r14d, eax
96        or      eax, r12d
97        add     rdx, 64
98        cmp     rdx, r15
99        cmovne  eax, r14d
100        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
101        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
102        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
103        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
104        movdqa  xmm12, xmm8
105        punpckldq xmm8, xmm9
106        punpckhdq xmm12, xmm9
107        movdqa  xmm14, xmm10
108        punpckldq xmm10, xmm11
109        punpckhdq xmm14, xmm11
110        movdqa  xmm9, xmm8
111        punpcklqdq xmm8, xmm10
112        punpckhqdq xmm9, xmm10
113        movdqa  xmm13, xmm12
114        punpcklqdq xmm12, xmm14
115        punpckhqdq xmm13, xmm14
116        movdqa  xmmword ptr [rsp], xmm8
117        movdqa  xmmword ptr [rsp+0x10], xmm9
118        movdqa  xmmword ptr [rsp+0x20], xmm12
119        movdqa  xmmword ptr [rsp+0x30], xmm13
120        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
121        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
122        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
123        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
124        movdqa  xmm12, xmm8
125        punpckldq xmm8, xmm9
126        punpckhdq xmm12, xmm9
127        movdqa  xmm14, xmm10
128        punpckldq xmm10, xmm11
129        punpckhdq xmm14, xmm11
130        movdqa  xmm9, xmm8
131        punpcklqdq xmm8, xmm10
132        punpckhqdq xmm9, xmm10
133        movdqa  xmm13, xmm12
134        punpcklqdq xmm12, xmm14
135        punpckhqdq xmm13, xmm14
136        movdqa  xmmword ptr [rsp+0x40], xmm8
137        movdqa  xmmword ptr [rsp+0x50], xmm9
138        movdqa  xmmword ptr [rsp+0x60], xmm12
139        movdqa  xmmword ptr [rsp+0x70], xmm13
140        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
141        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
142        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
143        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
144        movdqa  xmm12, xmm8
145        punpckldq xmm8, xmm9
146        punpckhdq xmm12, xmm9
147        movdqa  xmm14, xmm10
148        punpckldq xmm10, xmm11
149        punpckhdq xmm14, xmm11
150        movdqa  xmm9, xmm8
151        punpcklqdq xmm8, xmm10
152        punpckhqdq xmm9, xmm10
153        movdqa  xmm13, xmm12
154        punpcklqdq xmm12, xmm14
155        punpckhqdq xmm13, xmm14
156        movdqa  xmmword ptr [rsp+0x80], xmm8
157        movdqa  xmmword ptr [rsp+0x90], xmm9
158        movdqa  xmmword ptr [rsp+0xA0], xmm12
159        movdqa  xmmword ptr [rsp+0xB0], xmm13
160        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
161        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
162        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
163        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
164        movdqa  xmm12, xmm8
165        punpckldq xmm8, xmm9
166        punpckhdq xmm12, xmm9
167        movdqa  xmm14, xmm10
168        punpckldq xmm10, xmm11
169        punpckhdq xmm14, xmm11
170        movdqa  xmm9, xmm8
171        punpcklqdq xmm8, xmm10
172        punpckhqdq xmm9, xmm10
173        movdqa  xmm13, xmm12
174        punpcklqdq xmm12, xmm14
175        punpckhqdq xmm13, xmm14
176        movdqa  xmmword ptr [rsp+0xC0], xmm8
177        movdqa  xmmword ptr [rsp+0xD0], xmm9
178        movdqa  xmmword ptr [rsp+0xE0], xmm12
179        movdqa  xmmword ptr [rsp+0xF0], xmm13
180        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
181        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
182        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
183        movdqa  xmm12, xmmword ptr [rsp+0x110]
184        movdqa  xmm13, xmmword ptr [rsp+0x120]
185        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
186        movd    xmm15, eax
187        pshufd  xmm15, xmm15, 0x00
188        prefetcht0 [r8+rdx+0x80]
189        prefetcht0 [r9+rdx+0x80]
190        prefetcht0 [r10+rdx+0x80]
191        prefetcht0 [r11+rdx+0x80]
192        paddd   xmm0, xmmword ptr [rsp]
193        paddd   xmm1, xmmword ptr [rsp+0x20]
194        paddd   xmm2, xmmword ptr [rsp+0x40]
195        paddd   xmm3, xmmword ptr [rsp+0x60]
196        paddd   xmm0, xmm4
197        paddd   xmm1, xmm5
198        paddd   xmm2, xmm6
199        paddd   xmm3, xmm7
200        pxor    xmm12, xmm0
201        pxor    xmm13, xmm1
202        pxor    xmm14, xmm2
203        pxor    xmm15, xmm3
204        pshuflw xmm12, xmm12, 0xB1
205        pshufhw xmm12, xmm12, 0xB1
206        pshuflw xmm13, xmm13, 0xB1
207        pshufhw xmm13, xmm13, 0xB1
208        pshuflw xmm14, xmm14, 0xB1
209        pshufhw xmm14, xmm14, 0xB1
210        pshuflw xmm15, xmm15, 0xB1
211        pshufhw xmm15, xmm15, 0xB1
212        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
213        paddd   xmm8, xmm12
214        paddd   xmm9, xmm13
215        paddd   xmm10, xmm14
216        paddd   xmm11, xmm15
217        pxor    xmm4, xmm8
218        pxor    xmm5, xmm9
219        pxor    xmm6, xmm10
220        pxor    xmm7, xmm11
221        movdqa  xmmword ptr [rsp+0x100], xmm8
222        movdqa  xmm8, xmm4
223        psrld   xmm8, 12
224        pslld   xmm4, 20
225        por     xmm4, xmm8
226        movdqa  xmm8, xmm5
227        psrld   xmm8, 12
228        pslld   xmm5, 20
229        por     xmm5, xmm8
230        movdqa  xmm8, xmm6
231        psrld   xmm8, 12
232        pslld   xmm6, 20
233        por     xmm6, xmm8
234        movdqa  xmm8, xmm7
235        psrld   xmm8, 12
236        pslld   xmm7, 20
237        por     xmm7, xmm8
238        paddd   xmm0, xmmword ptr [rsp+0x10]
239        paddd   xmm1, xmmword ptr [rsp+0x30]
240        paddd   xmm2, xmmword ptr [rsp+0x50]
241        paddd   xmm3, xmmword ptr [rsp+0x70]
242        paddd   xmm0, xmm4
243        paddd   xmm1, xmm5
244        paddd   xmm2, xmm6
245        paddd   xmm3, xmm7
246        pxor    xmm12, xmm0
247        pxor    xmm13, xmm1
248        pxor    xmm14, xmm2
249        pxor    xmm15, xmm3
250        movdqa  xmm8, xmm12
251        psrld   xmm12, 8
252        pslld   xmm8, 24
253        pxor    xmm12, xmm8
254        movdqa  xmm8, xmm13
255        psrld   xmm13, 8
256        pslld   xmm8, 24
257        pxor    xmm13, xmm8
258        movdqa  xmm8, xmm14
259        psrld   xmm14, 8
260        pslld   xmm8, 24
261        pxor    xmm14, xmm8
262        movdqa  xmm8, xmm15
263        psrld   xmm15, 8
264        pslld   xmm8, 24
265        pxor    xmm15, xmm8
266        movdqa  xmm8, xmmword ptr [rsp+0x100]
267        paddd   xmm8, xmm12
268        paddd   xmm9, xmm13
269        paddd   xmm10, xmm14
270        paddd   xmm11, xmm15
271        pxor    xmm4, xmm8
272        pxor    xmm5, xmm9
273        pxor    xmm6, xmm10
274        pxor    xmm7, xmm11
275        movdqa  xmmword ptr [rsp+0x100], xmm8
276        movdqa  xmm8, xmm4
277        psrld   xmm8, 7
278        pslld   xmm4, 25
279        por     xmm4, xmm8
280        movdqa  xmm8, xmm5
281        psrld   xmm8, 7
282        pslld   xmm5, 25
283        por     xmm5, xmm8
284        movdqa  xmm8, xmm6
285        psrld   xmm8, 7
286        pslld   xmm6, 25
287        por     xmm6, xmm8
288        movdqa  xmm8, xmm7
289        psrld   xmm8, 7
290        pslld   xmm7, 25
291        por     xmm7, xmm8
292        paddd   xmm0, xmmword ptr [rsp+0x80]
293        paddd   xmm1, xmmword ptr [rsp+0xA0]
294        paddd   xmm2, xmmword ptr [rsp+0xC0]
295        paddd   xmm3, xmmword ptr [rsp+0xE0]
296        paddd   xmm0, xmm5
297        paddd   xmm1, xmm6
298        paddd   xmm2, xmm7
299        paddd   xmm3, xmm4
300        pxor    xmm15, xmm0
301        pxor    xmm12, xmm1
302        pxor    xmm13, xmm2
303        pxor    xmm14, xmm3
304        pshuflw xmm15, xmm15, 0xB1
305        pshufhw xmm15, xmm15, 0xB1
306        pshuflw xmm12, xmm12, 0xB1
307        pshufhw xmm12, xmm12, 0xB1
308        pshuflw xmm13, xmm13, 0xB1
309        pshufhw xmm13, xmm13, 0xB1
310        pshuflw xmm14, xmm14, 0xB1
311        pshufhw xmm14, xmm14, 0xB1
312        paddd   xmm10, xmm15
313        paddd   xmm11, xmm12
314        movdqa  xmm8, xmmword ptr [rsp+0x100]
315        paddd   xmm8, xmm13
316        paddd   xmm9, xmm14
317        pxor    xmm5, xmm10
318        pxor    xmm6, xmm11
319        pxor    xmm7, xmm8
320        pxor    xmm4, xmm9
321        movdqa  xmmword ptr [rsp+0x100], xmm8
322        movdqa  xmm8, xmm5
323        psrld   xmm8, 12
324        pslld   xmm5, 20
325        por     xmm5, xmm8
326        movdqa  xmm8, xmm6
327        psrld   xmm8, 12
328        pslld   xmm6, 20
329        por     xmm6, xmm8
330        movdqa  xmm8, xmm7
331        psrld   xmm8, 12
332        pslld   xmm7, 20
333        por     xmm7, xmm8
334        movdqa  xmm8, xmm4
335        psrld   xmm8, 12
336        pslld   xmm4, 20
337        por     xmm4, xmm8
338        paddd   xmm0, xmmword ptr [rsp+0x90]
339        paddd   xmm1, xmmword ptr [rsp+0xB0]
340        paddd   xmm2, xmmword ptr [rsp+0xD0]
341        paddd   xmm3, xmmword ptr [rsp+0xF0]
342        paddd   xmm0, xmm5
343        paddd   xmm1, xmm6
344        paddd   xmm2, xmm7
345        paddd   xmm3, xmm4
346        pxor    xmm15, xmm0
347        pxor    xmm12, xmm1
348        pxor    xmm13, xmm2
349        pxor    xmm14, xmm3
350        movdqa  xmm8, xmm15
351        psrld   xmm15, 8
352        pslld   xmm8, 24
353        pxor    xmm15, xmm8
354        movdqa  xmm8, xmm12
355        psrld   xmm12, 8
356        pslld   xmm8, 24
357        pxor    xmm12, xmm8
358        movdqa  xmm8, xmm13
359        psrld   xmm13, 8
360        pslld   xmm8, 24
361        pxor    xmm13, xmm8
362        movdqa  xmm8, xmm14
363        psrld   xmm14, 8
364        pslld   xmm8, 24
365        pxor    xmm14, xmm8
366        paddd   xmm10, xmm15
367        paddd   xmm11, xmm12
368        movdqa  xmm8, xmmword ptr [rsp+0x100]
369        paddd   xmm8, xmm13
370        paddd   xmm9, xmm14
371        pxor    xmm5, xmm10
372        pxor    xmm6, xmm11
373        pxor    xmm7, xmm8
374        pxor    xmm4, xmm9
375        movdqa  xmmword ptr [rsp+0x100], xmm8
376        movdqa  xmm8, xmm5
377        psrld   xmm8, 7
378        pslld   xmm5, 25
379        por     xmm5, xmm8
380        movdqa  xmm8, xmm6
381        psrld   xmm8, 7
382        pslld   xmm6, 25
383        por     xmm6, xmm8
384        movdqa  xmm8, xmm7
385        psrld   xmm8, 7
386        pslld   xmm7, 25
387        por     xmm7, xmm8
388        movdqa  xmm8, xmm4
389        psrld   xmm8, 7
390        pslld   xmm4, 25
391        por     xmm4, xmm8
392        paddd   xmm0, xmmword ptr [rsp+0x20]
393        paddd   xmm1, xmmword ptr [rsp+0x30]
394        paddd   xmm2, xmmword ptr [rsp+0x70]
395        paddd   xmm3, xmmword ptr [rsp+0x40]
396        paddd   xmm0, xmm4
397        paddd   xmm1, xmm5
398        paddd   xmm2, xmm6
399        paddd   xmm3, xmm7
400        pxor    xmm12, xmm0
401        pxor    xmm13, xmm1
402        pxor    xmm14, xmm2
403        pxor    xmm15, xmm3
404        pshuflw xmm12, xmm12, 0xB1
405        pshufhw xmm12, xmm12, 0xB1
406        pshuflw xmm13, xmm13, 0xB1
407        pshufhw xmm13, xmm13, 0xB1
408        pshuflw xmm14, xmm14, 0xB1
409        pshufhw xmm14, xmm14, 0xB1
410        pshuflw xmm15, xmm15, 0xB1
411        pshufhw xmm15, xmm15, 0xB1
412        movdqa  xmm8, xmmword ptr [rsp+0x100]
413        paddd   xmm8, xmm12
414        paddd   xmm9, xmm13
415        paddd   xmm10, xmm14
416        paddd   xmm11, xmm15
417        pxor    xmm4, xmm8
418        pxor    xmm5, xmm9
419        pxor    xmm6, xmm10
420        pxor    xmm7, xmm11
421        movdqa  xmmword ptr [rsp+0x100], xmm8
422        movdqa  xmm8, xmm4
423        psrld   xmm8, 12
424        pslld   xmm4, 20
425        por     xmm4, xmm8
426        movdqa  xmm8, xmm5
427        psrld   xmm8, 12
428        pslld   xmm5, 20
429        por     xmm5, xmm8
430        movdqa  xmm8, xmm6
431        psrld   xmm8, 12
432        pslld   xmm6, 20
433        por     xmm6, xmm8
434        movdqa  xmm8, xmm7
435        psrld   xmm8, 12
436        pslld   xmm7, 20
437        por     xmm7, xmm8
438        paddd   xmm0, xmmword ptr [rsp+0x60]
439        paddd   xmm1, xmmword ptr [rsp+0xA0]
440        paddd   xmm2, xmmword ptr [rsp]
441        paddd   xmm3, xmmword ptr [rsp+0xD0]
442        paddd   xmm0, xmm4
443        paddd   xmm1, xmm5
444        paddd   xmm2, xmm6
445        paddd   xmm3, xmm7
446        pxor    xmm12, xmm0
447        pxor    xmm13, xmm1
448        pxor    xmm14, xmm2
449        pxor    xmm15, xmm3
450        movdqa  xmm8, xmm12
451        psrld   xmm12, 8
452        pslld   xmm8, 24
453        pxor    xmm12, xmm8
454        movdqa  xmm8, xmm13
455        psrld   xmm13, 8
456        pslld   xmm8, 24
457        pxor    xmm13, xmm8
458        movdqa  xmm8, xmm14
459        psrld   xmm14, 8
460        pslld   xmm8, 24
461        pxor    xmm14, xmm8
462        movdqa  xmm8, xmm15
463        psrld   xmm15, 8
464        pslld   xmm8, 24
465        pxor    xmm15, xmm8
466        movdqa  xmm8, xmmword ptr [rsp+0x100]
467        paddd   xmm8, xmm12
468        paddd   xmm9, xmm13
469        paddd   xmm10, xmm14
470        paddd   xmm11, xmm15
471        pxor    xmm4, xmm8
472        pxor    xmm5, xmm9
473        pxor    xmm6, xmm10
474        pxor    xmm7, xmm11
475        movdqa  xmmword ptr [rsp+0x100], xmm8
476        movdqa  xmm8, xmm4
477        psrld   xmm8, 7
478        pslld   xmm4, 25
479        por     xmm4, xmm8
480        movdqa  xmm8, xmm5
481        psrld   xmm8, 7
482        pslld   xmm5, 25
483        por     xmm5, xmm8
484        movdqa  xmm8, xmm6
485        psrld   xmm8, 7
486        pslld   xmm6, 25
487        por     xmm6, xmm8
488        movdqa  xmm8, xmm7
489        psrld   xmm8, 7
490        pslld   xmm7, 25
491        por     xmm7, xmm8
492        paddd   xmm0, xmmword ptr [rsp+0x10]
493        paddd   xmm1, xmmword ptr [rsp+0xC0]
494        paddd   xmm2, xmmword ptr [rsp+0x90]
495        paddd   xmm3, xmmword ptr [rsp+0xF0]
496        paddd   xmm0, xmm5
497        paddd   xmm1, xmm6
498        paddd   xmm2, xmm7
499        paddd   xmm3, xmm4
500        pxor    xmm15, xmm0
501        pxor    xmm12, xmm1
502        pxor    xmm13, xmm2
503        pxor    xmm14, xmm3
504        pshuflw xmm15, xmm15, 0xB1
505        pshufhw xmm15, xmm15, 0xB1
506        pshuflw xmm12, xmm12, 0xB1
507        pshufhw xmm12, xmm12, 0xB1
508        pshuflw xmm13, xmm13, 0xB1
509        pshufhw xmm13, xmm13, 0xB1
510        pshuflw xmm14, xmm14, 0xB1
511        pshufhw xmm14, xmm14, 0xB1
512        paddd   xmm10, xmm15
513        paddd   xmm11, xmm12
514        movdqa  xmm8, xmmword ptr [rsp+0x100]
515        paddd   xmm8, xmm13
516        paddd   xmm9, xmm14
517        pxor    xmm5, xmm10
518        pxor    xmm6, xmm11
519        pxor    xmm7, xmm8
520        pxor    xmm4, xmm9
521        movdqa  xmmword ptr [rsp+0x100], xmm8
522        movdqa  xmm8, xmm5
523        psrld   xmm8, 12
524        pslld   xmm5, 20
525        por     xmm5, xmm8
526        movdqa  xmm8, xmm6
527        psrld   xmm8, 12
528        pslld   xmm6, 20
529        por     xmm6, xmm8
530        movdqa  xmm8, xmm7
531        psrld   xmm8, 12
532        pslld   xmm7, 20
533        por     xmm7, xmm8
534        movdqa  xmm8, xmm4
535        psrld   xmm8, 12
536        pslld   xmm4, 20
537        por     xmm4, xmm8
538        paddd   xmm0, xmmword ptr [rsp+0xB0]
539        paddd   xmm1, xmmword ptr [rsp+0x50]
540        paddd   xmm2, xmmword ptr [rsp+0xE0]
541        paddd   xmm3, xmmword ptr [rsp+0x80]
542        paddd   xmm0, xmm5
543        paddd   xmm1, xmm6
544        paddd   xmm2, xmm7
545        paddd   xmm3, xmm4
546        pxor    xmm15, xmm0
547        pxor    xmm12, xmm1
548        pxor    xmm13, xmm2
549        pxor    xmm14, xmm3
550        movdqa  xmm8, xmm15
551        psrld   xmm15, 8
552        pslld   xmm8, 24
553        pxor    xmm15, xmm8
554        movdqa  xmm8, xmm12
555        psrld   xmm12, 8
556        pslld   xmm8, 24
557        pxor    xmm12, xmm8
558        movdqa  xmm8, xmm13
559        psrld   xmm13, 8
560        pslld   xmm8, 24
561        pxor    xmm13, xmm8
562        movdqa  xmm8, xmm14
563        psrld   xmm14, 8
564        pslld   xmm8, 24
565        pxor    xmm14, xmm8
566        paddd   xmm10, xmm15
567        paddd   xmm11, xmm12
568        movdqa  xmm8, xmmword ptr [rsp+0x100]
569        paddd   xmm8, xmm13
570        paddd   xmm9, xmm14
571        pxor    xmm5, xmm10
572        pxor    xmm6, xmm11
573        pxor    xmm7, xmm8
574        pxor    xmm4, xmm9
575        movdqa  xmmword ptr [rsp+0x100], xmm8
576        movdqa  xmm8, xmm5
577        psrld   xmm8, 7
578        pslld   xmm5, 25
579        por     xmm5, xmm8
580        movdqa  xmm8, xmm6
581        psrld   xmm8, 7
582        pslld   xmm6, 25
583        por     xmm6, xmm8
584        movdqa  xmm8, xmm7
585        psrld   xmm8, 7
586        pslld   xmm7, 25
587        por     xmm7, xmm8
588        movdqa  xmm8, xmm4
589        psrld   xmm8, 7
590        pslld   xmm4, 25
591        por     xmm4, xmm8
592        paddd   xmm0, xmmword ptr [rsp+0x30]
593        paddd   xmm1, xmmword ptr [rsp+0xA0]
594        paddd   xmm2, xmmword ptr [rsp+0xD0]
595        paddd   xmm3, xmmword ptr [rsp+0x70]
596        paddd   xmm0, xmm4
597        paddd   xmm1, xmm5
598        paddd   xmm2, xmm6
599        paddd   xmm3, xmm7
600        pxor    xmm12, xmm0
601        pxor    xmm13, xmm1
602        pxor    xmm14, xmm2
603        pxor    xmm15, xmm3
604        pshuflw xmm12, xmm12, 0xB1
605        pshufhw xmm12, xmm12, 0xB1
606        pshuflw xmm13, xmm13, 0xB1
607        pshufhw xmm13, xmm13, 0xB1
608        pshuflw xmm14, xmm14, 0xB1
609        pshufhw xmm14, xmm14, 0xB1
610        pshuflw xmm15, xmm15, 0xB1
611        pshufhw xmm15, xmm15, 0xB1
612        movdqa  xmm8, xmmword ptr [rsp+0x100]
613        paddd   xmm8, xmm12
614        paddd   xmm9, xmm13
615        paddd   xmm10, xmm14
616        paddd   xmm11, xmm15
617        pxor    xmm4, xmm8
618        pxor    xmm5, xmm9
619        pxor    xmm6, xmm10
620        pxor    xmm7, xmm11
621        movdqa  xmmword ptr [rsp+0x100], xmm8
622        movdqa  xmm8, xmm4
623        psrld   xmm8, 12
624        pslld   xmm4, 20
625        por     xmm4, xmm8
626        movdqa  xmm8, xmm5
627        psrld   xmm8, 12
628        pslld   xmm5, 20
629        por     xmm5, xmm8
630        movdqa  xmm8, xmm6
631        psrld   xmm8, 12
632        pslld   xmm6, 20
633        por     xmm6, xmm8
634        movdqa  xmm8, xmm7
635        psrld   xmm8, 12
636        pslld   xmm7, 20
637        por     xmm7, xmm8
638        paddd   xmm0, xmmword ptr [rsp+0x40]
639        paddd   xmm1, xmmword ptr [rsp+0xC0]
640        paddd   xmm2, xmmword ptr [rsp+0x20]
641        paddd   xmm3, xmmword ptr [rsp+0xE0]
642        paddd   xmm0, xmm4
643        paddd   xmm1, xmm5
644        paddd   xmm2, xmm6
645        paddd   xmm3, xmm7
646        pxor    xmm12, xmm0
647        pxor    xmm13, xmm1
648        pxor    xmm14, xmm2
649        pxor    xmm15, xmm3
650        movdqa  xmm8, xmm12
651        psrld   xmm12, 8
652        pslld   xmm8, 24
653        pxor    xmm12, xmm8
654        movdqa  xmm8, xmm13
655        psrld   xmm13, 8
656        pslld   xmm8, 24
657        pxor    xmm13, xmm8
658        movdqa  xmm8, xmm14
659        psrld   xmm14, 8
660        pslld   xmm8, 24
661        pxor    xmm14, xmm8
662        movdqa  xmm8, xmm15
663        psrld   xmm15, 8
664        pslld   xmm8, 24
665        pxor    xmm15, xmm8
666        movdqa  xmm8, xmmword ptr [rsp+0x100]
667        paddd   xmm8, xmm12
668        paddd   xmm9, xmm13
669        paddd   xmm10, xmm14
670        paddd   xmm11, xmm15
671        pxor    xmm4, xmm8
672        pxor    xmm5, xmm9
673        pxor    xmm6, xmm10
674        pxor    xmm7, xmm11
675        movdqa  xmmword ptr [rsp+0x100], xmm8
676        movdqa  xmm8, xmm4
677        psrld   xmm8, 7
678        pslld   xmm4, 25
679        por     xmm4, xmm8
680        movdqa  xmm8, xmm5
681        psrld   xmm8, 7
682        pslld   xmm5, 25
683        por     xmm5, xmm8
684        movdqa  xmm8, xmm6
685        psrld   xmm8, 7
686        pslld   xmm6, 25
687        por     xmm6, xmm8
688        movdqa  xmm8, xmm7
689        psrld   xmm8, 7
690        pslld   xmm7, 25
691        por     xmm7, xmm8
692        paddd   xmm0, xmmword ptr [rsp+0x60]
693        paddd   xmm1, xmmword ptr [rsp+0x90]
694        paddd   xmm2, xmmword ptr [rsp+0xB0]
695        paddd   xmm3, xmmword ptr [rsp+0x80]
696        paddd   xmm0, xmm5
697        paddd   xmm1, xmm6
698        paddd   xmm2, xmm7
699        paddd   xmm3, xmm4
700        pxor    xmm15, xmm0
701        pxor    xmm12, xmm1
702        pxor    xmm13, xmm2
703        pxor    xmm14, xmm3
704        pshuflw xmm15, xmm15, 0xB1
705        pshufhw xmm15, xmm15, 0xB1
706        pshuflw xmm12, xmm12, 0xB1
707        pshufhw xmm12, xmm12, 0xB1
708        pshuflw xmm13, xmm13, 0xB1
709        pshufhw xmm13, xmm13, 0xB1
710        pshuflw xmm14, xmm14, 0xB1
711        pshufhw xmm14, xmm14, 0xB1
712        paddd   xmm10, xmm15
713        paddd   xmm11, xmm12
714        movdqa  xmm8, xmmword ptr [rsp+0x100]
715        paddd   xmm8, xmm13
716        paddd   xmm9, xmm14
717        pxor    xmm5, xmm10
718        pxor    xmm6, xmm11
719        pxor    xmm7, xmm8
720        pxor    xmm4, xmm9
721        movdqa  xmmword ptr [rsp+0x100], xmm8
722        movdqa  xmm8, xmm5
723        psrld   xmm8, 12
724        pslld   xmm5, 20
725        por     xmm5, xmm8
726        movdqa  xmm8, xmm6
727        psrld   xmm8, 12
728        pslld   xmm6, 20
729        por     xmm6, xmm8
730        movdqa  xmm8, xmm7
731        psrld   xmm8, 12
732        pslld   xmm7, 20
733        por     xmm7, xmm8
734        movdqa  xmm8, xmm4
735        psrld   xmm8, 12
736        pslld   xmm4, 20
737        por     xmm4, xmm8
738        paddd   xmm0, xmmword ptr [rsp+0x50]
739        paddd   xmm1, xmmword ptr [rsp]
740        paddd   xmm2, xmmword ptr [rsp+0xF0]
741        paddd   xmm3, xmmword ptr [rsp+0x10]
742        paddd   xmm0, xmm5
743        paddd   xmm1, xmm6
744        paddd   xmm2, xmm7
745        paddd   xmm3, xmm4
746        pxor    xmm15, xmm0
747        pxor    xmm12, xmm1
748        pxor    xmm13, xmm2
749        pxor    xmm14, xmm3
750        movdqa  xmm8, xmm15
751        psrld   xmm15, 8
752        pslld   xmm8, 24
753        pxor    xmm15, xmm8
754        movdqa  xmm8, xmm12
755        psrld   xmm12, 8
756        pslld   xmm8, 24
757        pxor    xmm12, xmm8
758        movdqa  xmm8, xmm13
759        psrld   xmm13, 8
760        pslld   xmm8, 24
761        pxor    xmm13, xmm8
762        movdqa  xmm8, xmm14
763        psrld   xmm14, 8
764        pslld   xmm8, 24
765        pxor    xmm14, xmm8
766        paddd   xmm10, xmm15
767        paddd   xmm11, xmm12
768        movdqa  xmm8, xmmword ptr [rsp+0x100]
769        paddd   xmm8, xmm13
770        paddd   xmm9, xmm14
771        pxor    xmm5, xmm10
772        pxor    xmm6, xmm11
773        pxor    xmm7, xmm8
774        pxor    xmm4, xmm9
775        movdqa  xmmword ptr [rsp+0x100], xmm8
776        movdqa  xmm8, xmm5
777        psrld   xmm8, 7
778        pslld   xmm5, 25
779        por     xmm5, xmm8
780        movdqa  xmm8, xmm6
781        psrld   xmm8, 7
782        pslld   xmm6, 25
783        por     xmm6, xmm8
784        movdqa  xmm8, xmm7
785        psrld   xmm8, 7
786        pslld   xmm7, 25
787        por     xmm7, xmm8
788        movdqa  xmm8, xmm4
789        psrld   xmm8, 7
790        pslld   xmm4, 25
791        por     xmm4, xmm8
792        paddd   xmm0, xmmword ptr [rsp+0xA0]
793        paddd   xmm1, xmmword ptr [rsp+0xC0]
794        paddd   xmm2, xmmword ptr [rsp+0xE0]
795        paddd   xmm3, xmmword ptr [rsp+0xD0]
796        paddd   xmm0, xmm4
797        paddd   xmm1, xmm5
798        paddd   xmm2, xmm6
799        paddd   xmm3, xmm7
800        pxor    xmm12, xmm0
801        pxor    xmm13, xmm1
802        pxor    xmm14, xmm2
803        pxor    xmm15, xmm3
804        pshuflw xmm12, xmm12, 0xB1
805        pshufhw xmm12, xmm12, 0xB1
806        pshuflw xmm13, xmm13, 0xB1
807        pshufhw xmm13, xmm13, 0xB1
808        pshuflw xmm14, xmm14, 0xB1
809        pshufhw xmm14, xmm14, 0xB1
810        pshuflw xmm15, xmm15, 0xB1
811        pshufhw xmm15, xmm15, 0xB1
812        movdqa  xmm8, xmmword ptr [rsp+0x100]
813        paddd   xmm8, xmm12
814        paddd   xmm9, xmm13
815        paddd   xmm10, xmm14
816        paddd   xmm11, xmm15
817        pxor    xmm4, xmm8
818        pxor    xmm5, xmm9
819        pxor    xmm6, xmm10
820        pxor    xmm7, xmm11
821        movdqa  xmmword ptr [rsp+0x100], xmm8
822        movdqa  xmm8, xmm4
823        psrld   xmm8, 12
824        pslld   xmm4, 20
825        por     xmm4, xmm8
826        movdqa  xmm8, xmm5
827        psrld   xmm8, 12
828        pslld   xmm5, 20
829        por     xmm5, xmm8
830        movdqa  xmm8, xmm6
831        psrld   xmm8, 12
832        pslld   xmm6, 20
833        por     xmm6, xmm8
834        movdqa  xmm8, xmm7
835        psrld   xmm8, 12
836        pslld   xmm7, 20
837        por     xmm7, xmm8
838        paddd   xmm0, xmmword ptr [rsp+0x70]
839        paddd   xmm1, xmmword ptr [rsp+0x90]
840        paddd   xmm2, xmmword ptr [rsp+0x30]
841        paddd   xmm3, xmmword ptr [rsp+0xF0]
842        paddd   xmm0, xmm4
843        paddd   xmm1, xmm5
844        paddd   xmm2, xmm6
845        paddd   xmm3, xmm7
846        pxor    xmm12, xmm0
847        pxor    xmm13, xmm1
848        pxor    xmm14, xmm2
849        pxor    xmm15, xmm3
850        movdqa  xmm8, xmm12
851        psrld   xmm12, 8
852        pslld   xmm8, 24
853        pxor    xmm12, xmm8
854        movdqa  xmm8, xmm13
855        psrld   xmm13, 8
856        pslld   xmm8, 24
857        pxor    xmm13, xmm8
858        movdqa  xmm8, xmm14
859        psrld   xmm14, 8
860        pslld   xmm8, 24
861        pxor    xmm14, xmm8
862        movdqa  xmm8, xmm15
863        psrld   xmm15, 8
864        pslld   xmm8, 24
865        pxor    xmm15, xmm8
866        movdqa  xmm8, xmmword ptr [rsp+0x100]
867        paddd   xmm8, xmm12
868        paddd   xmm9, xmm13
869        paddd   xmm10, xmm14
870        paddd   xmm11, xmm15
871        pxor    xmm4, xmm8
872        pxor    xmm5, xmm9
873        pxor    xmm6, xmm10
874        pxor    xmm7, xmm11
875        movdqa  xmmword ptr [rsp+0x100], xmm8
876        movdqa  xmm8, xmm4
877        psrld   xmm8, 7
878        pslld   xmm4, 25
879        por     xmm4, xmm8
880        movdqa  xmm8, xmm5
881        psrld   xmm8, 7
882        pslld   xmm5, 25
883        por     xmm5, xmm8
884        movdqa  xmm8, xmm6
885        psrld   xmm8, 7
886        pslld   xmm6, 25
887        por     xmm6, xmm8
888        movdqa  xmm8, xmm7
889        psrld   xmm8, 7
890        pslld   xmm7, 25
891        por     xmm7, xmm8
892        paddd   xmm0, xmmword ptr [rsp+0x40]
893        paddd   xmm1, xmmword ptr [rsp+0xB0]
894        paddd   xmm2, xmmword ptr [rsp+0x50]
895        paddd   xmm3, xmmword ptr [rsp+0x10]
896        paddd   xmm0, xmm5
897        paddd   xmm1, xmm6
898        paddd   xmm2, xmm7
899        paddd   xmm3, xmm4
900        pxor    xmm15, xmm0
901        pxor    xmm12, xmm1
902        pxor    xmm13, xmm2
903        pxor    xmm14, xmm3
904        pshuflw xmm15, xmm15, 0xB1
905        pshufhw xmm15, xmm15, 0xB1
906        pshuflw xmm12, xmm12, 0xB1
907        pshufhw xmm12, xmm12, 0xB1
908        pshuflw xmm13, xmm13, 0xB1
909        pshufhw xmm13, xmm13, 0xB1
910        pshuflw xmm14, xmm14, 0xB1
911        pshufhw xmm14, xmm14, 0xB1
912        paddd   xmm10, xmm15
913        paddd   xmm11, xmm12
914        movdqa  xmm8, xmmword ptr [rsp+0x100]
915        paddd   xmm8, xmm13
916        paddd   xmm9, xmm14
917        pxor    xmm5, xmm10
918        pxor    xmm6, xmm11
919        pxor    xmm7, xmm8
920        pxor    xmm4, xmm9
921        movdqa  xmmword ptr [rsp+0x100], xmm8
922        movdqa  xmm8, xmm5
923        psrld   xmm8, 12
924        pslld   xmm5, 20
925        por     xmm5, xmm8
926        movdqa  xmm8, xmm6
927        psrld   xmm8, 12
928        pslld   xmm6, 20
929        por     xmm6, xmm8
930        movdqa  xmm8, xmm7
931        psrld   xmm8, 12
932        pslld   xmm7, 20
933        por     xmm7, xmm8
934        movdqa  xmm8, xmm4
935        psrld   xmm8, 12
936        pslld   xmm4, 20
937        por     xmm4, xmm8
938        paddd   xmm0, xmmword ptr [rsp]
939        paddd   xmm1, xmmword ptr [rsp+0x20]
940        paddd   xmm2, xmmword ptr [rsp+0x80]
941        paddd   xmm3, xmmword ptr [rsp+0x60]
942        paddd   xmm0, xmm5
943        paddd   xmm1, xmm6
944        paddd   xmm2, xmm7
945        paddd   xmm3, xmm4
946        pxor    xmm15, xmm0
947        pxor    xmm12, xmm1
948        pxor    xmm13, xmm2
949        pxor    xmm14, xmm3
950        movdqa  xmm8, xmm15
951        psrld   xmm15, 8
952        pslld   xmm8, 24
953        pxor    xmm15, xmm8
954        movdqa  xmm8, xmm12
955        psrld   xmm12, 8
956        pslld   xmm8, 24
957        pxor    xmm12, xmm8
958        movdqa  xmm8, xmm13
959        psrld   xmm13, 8
960        pslld   xmm8, 24
961        pxor    xmm13, xmm8
962        movdqa  xmm8, xmm14
963        psrld   xmm14, 8
964        pslld   xmm8, 24
965        pxor    xmm14, xmm8
966        paddd   xmm10, xmm15
967        paddd   xmm11, xmm12
968        movdqa  xmm8, xmmword ptr [rsp+0x100]
969        paddd   xmm8, xmm13
970        paddd   xmm9, xmm14
971        pxor    xmm5, xmm10
972        pxor    xmm6, xmm11
973        pxor    xmm7, xmm8
974        pxor    xmm4, xmm9
975        movdqa  xmmword ptr [rsp+0x100], xmm8
976        movdqa  xmm8, xmm5
977        psrld   xmm8, 7
978        pslld   xmm5, 25
979        por     xmm5, xmm8
980        movdqa  xmm8, xmm6
981        psrld   xmm8, 7
982        pslld   xmm6, 25
983        por     xmm6, xmm8
984        movdqa  xmm8, xmm7
985        psrld   xmm8, 7
986        pslld   xmm7, 25
987        por     xmm7, xmm8
988        movdqa  xmm8, xmm4
989        psrld   xmm8, 7
990        pslld   xmm4, 25
991        por     xmm4, xmm8
992        paddd   xmm0, xmmword ptr [rsp+0xC0]
993        paddd   xmm1, xmmword ptr [rsp+0x90]
994        paddd   xmm2, xmmword ptr [rsp+0xF0]
995        paddd   xmm3, xmmword ptr [rsp+0xE0]
996        paddd   xmm0, xmm4
997        paddd   xmm1, xmm5
998        paddd   xmm2, xmm6
999        paddd   xmm3, xmm7
1000        pxor    xmm12, xmm0
1001        pxor    xmm13, xmm1
1002        pxor    xmm14, xmm2
1003        pxor    xmm15, xmm3
1004        pshuflw xmm12, xmm12, 0xB1
1005        pshufhw xmm12, xmm12, 0xB1
1006        pshuflw xmm13, xmm13, 0xB1
1007        pshufhw xmm13, xmm13, 0xB1
1008        pshuflw xmm14, xmm14, 0xB1
1009        pshufhw xmm14, xmm14, 0xB1
1010        pshuflw xmm15, xmm15, 0xB1
1011        pshufhw xmm15, xmm15, 0xB1
1012        movdqa  xmm8, xmmword ptr [rsp+0x100]
1013        paddd   xmm8, xmm12
1014        paddd   xmm9, xmm13
1015        paddd   xmm10, xmm14
1016        paddd   xmm11, xmm15
1017        pxor    xmm4, xmm8
1018        pxor    xmm5, xmm9
1019        pxor    xmm6, xmm10
1020        pxor    xmm7, xmm11
1021        movdqa  xmmword ptr [rsp+0x100], xmm8
1022        movdqa  xmm8, xmm4
1023        psrld   xmm8, 12
1024        pslld   xmm4, 20
1025        por     xmm4, xmm8
1026        movdqa  xmm8, xmm5
1027        psrld   xmm8, 12
1028        pslld   xmm5, 20
1029        por     xmm5, xmm8
1030        movdqa  xmm8, xmm6
1031        psrld   xmm8, 12
1032        pslld   xmm6, 20
1033        por     xmm6, xmm8
1034        movdqa  xmm8, xmm7
1035        psrld   xmm8, 12
1036        pslld   xmm7, 20
1037        por     xmm7, xmm8
1038        paddd   xmm0, xmmword ptr [rsp+0xD0]
1039        paddd   xmm1, xmmword ptr [rsp+0xB0]
1040        paddd   xmm2, xmmword ptr [rsp+0xA0]
1041        paddd   xmm3, xmmword ptr [rsp+0x80]
1042        paddd   xmm0, xmm4
1043        paddd   xmm1, xmm5
1044        paddd   xmm2, xmm6
1045        paddd   xmm3, xmm7
1046        pxor    xmm12, xmm0
1047        pxor    xmm13, xmm1
1048        pxor    xmm14, xmm2
1049        pxor    xmm15, xmm3
1050        movdqa  xmm8, xmm12
1051        psrld   xmm12, 8
1052        pslld   xmm8, 24
1053        pxor    xmm12, xmm8
1054        movdqa  xmm8, xmm13
1055        psrld   xmm13, 8
1056        pslld   xmm8, 24
1057        pxor    xmm13, xmm8
1058        movdqa  xmm8, xmm14
1059        psrld   xmm14, 8
1060        pslld   xmm8, 24
1061        pxor    xmm14, xmm8
1062        movdqa  xmm8, xmm15
1063        psrld   xmm15, 8
1064        pslld   xmm8, 24
1065        pxor    xmm15, xmm8
1066        movdqa  xmm8, xmmword ptr [rsp+0x100]
1067        paddd   xmm8, xmm12
1068        paddd   xmm9, xmm13
1069        paddd   xmm10, xmm14
1070        paddd   xmm11, xmm15
1071        pxor    xmm4, xmm8
1072        pxor    xmm5, xmm9
1073        pxor    xmm6, xmm10
1074        pxor    xmm7, xmm11
1075        movdqa  xmmword ptr [rsp+0x100], xmm8
1076        movdqa  xmm8, xmm4
1077        psrld   xmm8, 7
1078        pslld   xmm4, 25
1079        por     xmm4, xmm8
1080        movdqa  xmm8, xmm5
1081        psrld   xmm8, 7
1082        pslld   xmm5, 25
1083        por     xmm5, xmm8
1084        movdqa  xmm8, xmm6
1085        psrld   xmm8, 7
1086        pslld   xmm6, 25
1087        por     xmm6, xmm8
1088        movdqa  xmm8, xmm7
1089        psrld   xmm8, 7
1090        pslld   xmm7, 25
1091        por     xmm7, xmm8
1092        paddd   xmm0, xmmword ptr [rsp+0x70]
1093        paddd   xmm1, xmmword ptr [rsp+0x50]
1094        paddd   xmm2, xmmword ptr [rsp]
1095        paddd   xmm3, xmmword ptr [rsp+0x60]
1096        paddd   xmm0, xmm5
1097        paddd   xmm1, xmm6
1098        paddd   xmm2, xmm7
1099        paddd   xmm3, xmm4
1100        pxor    xmm15, xmm0
1101        pxor    xmm12, xmm1
1102        pxor    xmm13, xmm2
1103        pxor    xmm14, xmm3
1104        pshuflw xmm15, xmm15, 0xB1
1105        pshufhw xmm15, xmm15, 0xB1
1106        pshuflw xmm12, xmm12, 0xB1
1107        pshufhw xmm12, xmm12, 0xB1
1108        pshuflw xmm13, xmm13, 0xB1
1109        pshufhw xmm13, xmm13, 0xB1
1110        pshuflw xmm14, xmm14, 0xB1
1111        pshufhw xmm14, xmm14, 0xB1
1112        paddd   xmm10, xmm15
1113        paddd   xmm11, xmm12
1114        movdqa  xmm8, xmmword ptr [rsp+0x100]
1115        paddd   xmm8, xmm13
1116        paddd   xmm9, xmm14
1117        pxor    xmm5, xmm10
1118        pxor    xmm6, xmm11
1119        pxor    xmm7, xmm8
1120        pxor    xmm4, xmm9
1121        movdqa  xmmword ptr [rsp+0x100], xmm8
1122        movdqa  xmm8, xmm5
1123        psrld   xmm8, 12
1124        pslld   xmm5, 20
1125        por     xmm5, xmm8
1126        movdqa  xmm8, xmm6
1127        psrld   xmm8, 12
1128        pslld   xmm6, 20
1129        por     xmm6, xmm8
1130        movdqa  xmm8, xmm7
1131        psrld   xmm8, 12
1132        pslld   xmm7, 20
1133        por     xmm7, xmm8
1134        movdqa  xmm8, xmm4
1135        psrld   xmm8, 12
1136        pslld   xmm4, 20
1137        por     xmm4, xmm8
1138        paddd   xmm0, xmmword ptr [rsp+0x20]
1139        paddd   xmm1, xmmword ptr [rsp+0x30]
1140        paddd   xmm2, xmmword ptr [rsp+0x10]
1141        paddd   xmm3, xmmword ptr [rsp+0x40]
1142        paddd   xmm0, xmm5
1143        paddd   xmm1, xmm6
1144        paddd   xmm2, xmm7
1145        paddd   xmm3, xmm4
1146        pxor    xmm15, xmm0
1147        pxor    xmm12, xmm1
1148        pxor    xmm13, xmm2
1149        pxor    xmm14, xmm3
1150        movdqa  xmm8, xmm15
1151        psrld   xmm15, 8
1152        pslld   xmm8, 24
1153        pxor    xmm15, xmm8
1154        movdqa  xmm8, xmm12
1155        psrld   xmm12, 8
1156        pslld   xmm8, 24
1157        pxor    xmm12, xmm8
1158        movdqa  xmm8, xmm13
1159        psrld   xmm13, 8
1160        pslld   xmm8, 24
1161        pxor    xmm13, xmm8
1162        movdqa  xmm8, xmm14
1163        psrld   xmm14, 8
1164        pslld   xmm8, 24
1165        pxor    xmm14, xmm8
1166        paddd   xmm10, xmm15
1167        paddd   xmm11, xmm12
1168        movdqa  xmm8, xmmword ptr [rsp+0x100]
1169        paddd   xmm8, xmm13
1170        paddd   xmm9, xmm14
1171        pxor    xmm5, xmm10
1172        pxor    xmm6, xmm11
1173        pxor    xmm7, xmm8
1174        pxor    xmm4, xmm9
1175        movdqa  xmmword ptr [rsp+0x100], xmm8
1176        movdqa  xmm8, xmm5
1177        psrld   xmm8, 7
1178        pslld   xmm5, 25
1179        por     xmm5, xmm8
1180        movdqa  xmm8, xmm6
1181        psrld   xmm8, 7
1182        pslld   xmm6, 25
1183        por     xmm6, xmm8
1184        movdqa  xmm8, xmm7
1185        psrld   xmm8, 7
1186        pslld   xmm7, 25
1187        por     xmm7, xmm8
1188        movdqa  xmm8, xmm4
1189        psrld   xmm8, 7
1190        pslld   xmm4, 25
1191        por     xmm4, xmm8
1192        paddd   xmm0, xmmword ptr [rsp+0x90]
1193        paddd   xmm1, xmmword ptr [rsp+0xB0]
1194        paddd   xmm2, xmmword ptr [rsp+0x80]
1195        paddd   xmm3, xmmword ptr [rsp+0xF0]
1196        paddd   xmm0, xmm4
1197        paddd   xmm1, xmm5
1198        paddd   xmm2, xmm6
1199        paddd   xmm3, xmm7
1200        pxor    xmm12, xmm0
1201        pxor    xmm13, xmm1
1202        pxor    xmm14, xmm2
1203        pxor    xmm15, xmm3
1204        pshuflw xmm12, xmm12, 0xB1
1205        pshufhw xmm12, xmm12, 0xB1
1206        pshuflw xmm13, xmm13, 0xB1
1207        pshufhw xmm13, xmm13, 0xB1
1208        pshuflw xmm14, xmm14, 0xB1
1209        pshufhw xmm14, xmm14, 0xB1
1210        pshuflw xmm15, xmm15, 0xB1
1211        pshufhw xmm15, xmm15, 0xB1
1212        movdqa  xmm8, xmmword ptr [rsp+0x100]
1213        paddd   xmm8, xmm12
1214        paddd   xmm9, xmm13
1215        paddd   xmm10, xmm14
1216        paddd   xmm11, xmm15
1217        pxor    xmm4, xmm8
1218        pxor    xmm5, xmm9
1219        pxor    xmm6, xmm10
1220        pxor    xmm7, xmm11
1221        movdqa  xmmword ptr [rsp+0x100], xmm8
1222        movdqa  xmm8, xmm4
1223        psrld   xmm8, 12
1224        pslld   xmm4, 20
1225        por     xmm4, xmm8
1226        movdqa  xmm8, xmm5
1227        psrld   xmm8, 12
1228        pslld   xmm5, 20
1229        por     xmm5, xmm8
1230        movdqa  xmm8, xmm6
1231        psrld   xmm8, 12
1232        pslld   xmm6, 20
1233        por     xmm6, xmm8
1234        movdqa  xmm8, xmm7
1235        psrld   xmm8, 12
1236        pslld   xmm7, 20
1237        por     xmm7, xmm8
1238        paddd   xmm0, xmmword ptr [rsp+0xE0]
1239        paddd   xmm1, xmmword ptr [rsp+0x50]
1240        paddd   xmm2, xmmword ptr [rsp+0xC0]
1241        paddd   xmm3, xmmword ptr [rsp+0x10]
1242        paddd   xmm0, xmm4
1243        paddd   xmm1, xmm5
1244        paddd   xmm2, xmm6
1245        paddd   xmm3, xmm7
1246        pxor    xmm12, xmm0
1247        pxor    xmm13, xmm1
1248        pxor    xmm14, xmm2
1249        pxor    xmm15, xmm3
1250        movdqa  xmm8, xmm12
1251        psrld   xmm12, 8
1252        pslld   xmm8, 24
1253        pxor    xmm12, xmm8
1254        movdqa  xmm8, xmm13
1255        psrld   xmm13, 8
1256        pslld   xmm8, 24
1257        pxor    xmm13, xmm8
1258        movdqa  xmm8, xmm14
1259        psrld   xmm14, 8
1260        pslld   xmm8, 24
1261        pxor    xmm14, xmm8
1262        movdqa  xmm8, xmm15
1263        psrld   xmm15, 8
1264        pslld   xmm8, 24
1265        pxor    xmm15, xmm8
1266        movdqa  xmm8, xmmword ptr [rsp+0x100]
1267        paddd   xmm8, xmm12
1268        paddd   xmm9, xmm13
1269        paddd   xmm10, xmm14
1270        paddd   xmm11, xmm15
1271        pxor    xmm4, xmm8
1272        pxor    xmm5, xmm9
1273        pxor    xmm6, xmm10
1274        pxor    xmm7, xmm11
1275        movdqa  xmmword ptr [rsp+0x100], xmm8
1276        movdqa  xmm8, xmm4
1277        psrld   xmm8, 7
1278        pslld   xmm4, 25
1279        por     xmm4, xmm8
1280        movdqa  xmm8, xmm5
1281        psrld   xmm8, 7
1282        pslld   xmm5, 25
1283        por     xmm5, xmm8
1284        movdqa  xmm8, xmm6
1285        psrld   xmm8, 7
1286        pslld   xmm6, 25
1287        por     xmm6, xmm8
1288        movdqa  xmm8, xmm7
1289        psrld   xmm8, 7
1290        pslld   xmm7, 25
1291        por     xmm7, xmm8
1292        paddd   xmm0, xmmword ptr [rsp+0xD0]
1293        paddd   xmm1, xmmword ptr [rsp]
1294        paddd   xmm2, xmmword ptr [rsp+0x20]
1295        paddd   xmm3, xmmword ptr [rsp+0x40]
1296        paddd   xmm0, xmm5
1297        paddd   xmm1, xmm6
1298        paddd   xmm2, xmm7
1299        paddd   xmm3, xmm4
1300        pxor    xmm15, xmm0
1301        pxor    xmm12, xmm1
1302        pxor    xmm13, xmm2
1303        pxor    xmm14, xmm3
1304        pshuflw xmm15, xmm15, 0xB1
1305        pshufhw xmm15, xmm15, 0xB1
1306        pshuflw xmm12, xmm12, 0xB1
1307        pshufhw xmm12, xmm12, 0xB1
1308        pshuflw xmm13, xmm13, 0xB1
1309        pshufhw xmm13, xmm13, 0xB1
1310        pshuflw xmm14, xmm14, 0xB1
1311        pshufhw xmm14, xmm14, 0xB1
1312        paddd   xmm10, xmm15
1313        paddd   xmm11, xmm12
1314        movdqa  xmm8, xmmword ptr [rsp+0x100]
1315        paddd   xmm8, xmm13
1316        paddd   xmm9, xmm14
1317        pxor    xmm5, xmm10
1318        pxor    xmm6, xmm11
1319        pxor    xmm7, xmm8
1320        pxor    xmm4, xmm9
1321        movdqa  xmmword ptr [rsp+0x100], xmm8
1322        movdqa  xmm8, xmm5
1323        psrld   xmm8, 12
1324        pslld   xmm5, 20
1325        por     xmm5, xmm8
1326        movdqa  xmm8, xmm6
1327        psrld   xmm8, 12
1328        pslld   xmm6, 20
1329        por     xmm6, xmm8
1330        movdqa  xmm8, xmm7
1331        psrld   xmm8, 12
1332        pslld   xmm7, 20
1333        por     xmm7, xmm8
1334        movdqa  xmm8, xmm4
1335        psrld   xmm8, 12
1336        pslld   xmm4, 20
1337        por     xmm4, xmm8
1338        paddd   xmm0, xmmword ptr [rsp+0x30]
1339        paddd   xmm1, xmmword ptr [rsp+0xA0]
1340        paddd   xmm2, xmmword ptr [rsp+0x60]
1341        paddd   xmm3, xmmword ptr [rsp+0x70]
1342        paddd   xmm0, xmm5
1343        paddd   xmm1, xmm6
1344        paddd   xmm2, xmm7
1345        paddd   xmm3, xmm4
1346        pxor    xmm15, xmm0
1347        pxor    xmm12, xmm1
1348        pxor    xmm13, xmm2
1349        pxor    xmm14, xmm3
1350        movdqa  xmm8, xmm15
1351        psrld   xmm15, 8
1352        pslld   xmm8, 24
1353        pxor    xmm15, xmm8
1354        movdqa  xmm8, xmm12
1355        psrld   xmm12, 8
1356        pslld   xmm8, 24
1357        pxor    xmm12, xmm8
1358        movdqa  xmm8, xmm13
1359        psrld   xmm13, 8
1360        pslld   xmm8, 24
1361        pxor    xmm13, xmm8
1362        movdqa  xmm8, xmm14
1363        psrld   xmm14, 8
1364        pslld   xmm8, 24
1365        pxor    xmm14, xmm8
1366        paddd   xmm10, xmm15
1367        paddd   xmm11, xmm12
1368        movdqa  xmm8, xmmword ptr [rsp+0x100]
1369        paddd   xmm8, xmm13
1370        paddd   xmm9, xmm14
1371        pxor    xmm5, xmm10
1372        pxor    xmm6, xmm11
1373        pxor    xmm7, xmm8
1374        pxor    xmm4, xmm9
1375        movdqa  xmmword ptr [rsp+0x100], xmm8
1376        movdqa  xmm8, xmm5
1377        psrld   xmm8, 7
1378        pslld   xmm5, 25
1379        por     xmm5, xmm8
1380        movdqa  xmm8, xmm6
1381        psrld   xmm8, 7
1382        pslld   xmm6, 25
1383        por     xmm6, xmm8
1384        movdqa  xmm8, xmm7
1385        psrld   xmm8, 7
1386        pslld   xmm7, 25
1387        por     xmm7, xmm8
1388        movdqa  xmm8, xmm4
1389        psrld   xmm8, 7
1390        pslld   xmm4, 25
1391        por     xmm4, xmm8
1392        paddd   xmm0, xmmword ptr [rsp+0xB0]
1393        paddd   xmm1, xmmword ptr [rsp+0x50]
1394        paddd   xmm2, xmmword ptr [rsp+0x10]
1395        paddd   xmm3, xmmword ptr [rsp+0x80]
1396        paddd   xmm0, xmm4
1397        paddd   xmm1, xmm5
1398        paddd   xmm2, xmm6
1399        paddd   xmm3, xmm7
1400        pxor    xmm12, xmm0
1401        pxor    xmm13, xmm1
1402        pxor    xmm14, xmm2
1403        pxor    xmm15, xmm3
1404        pshuflw xmm12, xmm12, 0xB1
1405        pshufhw xmm12, xmm12, 0xB1
1406        pshuflw xmm13, xmm13, 0xB1
1407        pshufhw xmm13, xmm13, 0xB1
1408        pshuflw xmm14, xmm14, 0xB1
1409        pshufhw xmm14, xmm14, 0xB1
1410        pshuflw xmm15, xmm15, 0xB1
1411        pshufhw xmm15, xmm15, 0xB1
1412        movdqa  xmm8, xmmword ptr [rsp+0x100]
1413        paddd   xmm8, xmm12
1414        paddd   xmm9, xmm13
1415        paddd   xmm10, xmm14
1416        paddd   xmm11, xmm15
1417        pxor    xmm4, xmm8
1418        pxor    xmm5, xmm9
1419        pxor    xmm6, xmm10
1420        pxor    xmm7, xmm11
1421        movdqa  xmmword ptr [rsp+0x100], xmm8
1422        movdqa  xmm8, xmm4
1423        psrld   xmm8, 12
1424        pslld   xmm4, 20
1425        por     xmm4, xmm8
1426        movdqa  xmm8, xmm5
1427        psrld   xmm8, 12
1428        pslld   xmm5, 20
1429        por     xmm5, xmm8
1430        movdqa  xmm8, xmm6
1431        psrld   xmm8, 12
1432        pslld   xmm6, 20
1433        por     xmm6, xmm8
1434        movdqa  xmm8, xmm7
1435        psrld   xmm8, 12
1436        pslld   xmm7, 20
1437        por     xmm7, xmm8
1438        paddd   xmm0, xmmword ptr [rsp+0xF0]
1439        paddd   xmm1, xmmword ptr [rsp]
1440        paddd   xmm2, xmmword ptr [rsp+0x90]
1441        paddd   xmm3, xmmword ptr [rsp+0x60]
1442        paddd   xmm0, xmm4
1443        paddd   xmm1, xmm5
1444        paddd   xmm2, xmm6
1445        paddd   xmm3, xmm7
1446        pxor    xmm12, xmm0
1447        pxor    xmm13, xmm1
1448        pxor    xmm14, xmm2
1449        pxor    xmm15, xmm3
1450        movdqa  xmm8, xmm12
1451        psrld   xmm12, 8
1452        pslld   xmm8, 24
1453        pxor    xmm12, xmm8
1454        movdqa  xmm8, xmm13
1455        psrld   xmm13, 8
1456        pslld   xmm8, 24
1457        pxor    xmm13, xmm8
1458        movdqa  xmm8, xmm14
1459        psrld   xmm14, 8
1460        pslld   xmm8, 24
1461        pxor    xmm14, xmm8
1462        movdqa  xmm8, xmm15
1463        psrld   xmm15, 8
1464        pslld   xmm8, 24
1465        pxor    xmm15, xmm8
1466        movdqa  xmm8, xmmword ptr [rsp+0x100]
1467        paddd   xmm8, xmm12
1468        paddd   xmm9, xmm13
1469        paddd   xmm10, xmm14
1470        paddd   xmm11, xmm15
1471        pxor    xmm4, xmm8
1472        pxor    xmm5, xmm9
1473        pxor    xmm6, xmm10
1474        pxor    xmm7, xmm11
1475        movdqa  xmmword ptr [rsp+0x100], xmm8
1476        movdqa  xmm8, xmm4
1477        psrld   xmm8, 7
1478        pslld   xmm4, 25
1479        por     xmm4, xmm8
1480        movdqa  xmm8, xmm5
1481        psrld   xmm8, 7
1482        pslld   xmm5, 25
1483        por     xmm5, xmm8
1484        movdqa  xmm8, xmm6
1485        psrld   xmm8, 7
1486        pslld   xmm6, 25
1487        por     xmm6, xmm8
1488        movdqa  xmm8, xmm7
1489        psrld   xmm8, 7
1490        pslld   xmm7, 25
1491        por     xmm7, xmm8
1492        paddd   xmm0, xmmword ptr [rsp+0xE0]
1493        paddd   xmm1, xmmword ptr [rsp+0x20]
1494        paddd   xmm2, xmmword ptr [rsp+0x30]
1495        paddd   xmm3, xmmword ptr [rsp+0x70]
1496        paddd   xmm0, xmm5
1497        paddd   xmm1, xmm6
1498        paddd   xmm2, xmm7
1499        paddd   xmm3, xmm4
1500        pxor    xmm15, xmm0
1501        pxor    xmm12, xmm1
1502        pxor    xmm13, xmm2
1503        pxor    xmm14, xmm3
1504        pshuflw xmm15, xmm15, 0xB1
1505        pshufhw xmm15, xmm15, 0xB1
1506        pshuflw xmm12, xmm12, 0xB1
1507        pshufhw xmm12, xmm12, 0xB1
1508        pshuflw xmm13, xmm13, 0xB1
1509        pshufhw xmm13, xmm13, 0xB1
1510        pshuflw xmm14, xmm14, 0xB1
1511        pshufhw xmm14, xmm14, 0xB1
1512        paddd   xmm10, xmm15
1513        paddd   xmm11, xmm12
1514        movdqa  xmm8, xmmword ptr [rsp+0x100]
1515        paddd   xmm8, xmm13
1516        paddd   xmm9, xmm14
1517        pxor    xmm5, xmm10
1518        pxor    xmm6, xmm11
1519        pxor    xmm7, xmm8
1520        pxor    xmm4, xmm9
1521        movdqa  xmmword ptr [rsp+0x100], xmm8
1522        movdqa  xmm8, xmm5
1523        psrld   xmm8, 12
1524        pslld   xmm5, 20
1525        por     xmm5, xmm8
1526        movdqa  xmm8, xmm6
1527        psrld   xmm8, 12
1528        pslld   xmm6, 20
1529        por     xmm6, xmm8
1530        movdqa  xmm8, xmm7
1531        psrld   xmm8, 12
1532        pslld   xmm7, 20
1533        por     xmm7, xmm8
1534        movdqa  xmm8, xmm4
1535        psrld   xmm8, 12
1536        pslld   xmm4, 20
1537        por     xmm4, xmm8
1538        paddd   xmm0, xmmword ptr [rsp+0xA0]
1539        paddd   xmm1, xmmword ptr [rsp+0xC0]
1540        paddd   xmm2, xmmword ptr [rsp+0x40]
1541        paddd   xmm3, xmmword ptr [rsp+0xD0]
1542        paddd   xmm0, xmm5
1543        paddd   xmm1, xmm6
1544        paddd   xmm2, xmm7
1545        paddd   xmm3, xmm4
1546        pxor    xmm15, xmm0
1547        pxor    xmm12, xmm1
1548        pxor    xmm13, xmm2
1549        pxor    xmm14, xmm3
1550        movdqa  xmm8, xmm15
1551        psrld   xmm15, 8
1552        pslld   xmm8, 24
1553        pxor    xmm15, xmm8
1554        movdqa  xmm8, xmm12
1555        psrld   xmm12, 8
1556        pslld   xmm8, 24
1557        pxor    xmm12, xmm8
1558        movdqa  xmm8, xmm13
1559        psrld   xmm13, 8
1560        pslld   xmm8, 24
1561        pxor    xmm13, xmm8
1562        movdqa  xmm8, xmm14
1563        psrld   xmm14, 8
1564        pslld   xmm8, 24
1565        pxor    xmm14, xmm8
1566        paddd   xmm10, xmm15
1567        paddd   xmm11, xmm12
1568        movdqa  xmm8, xmmword ptr [rsp+0x100]
1569        paddd   xmm8, xmm13
1570        paddd   xmm9, xmm14
1571        pxor    xmm5, xmm10
1572        pxor    xmm6, xmm11
1573        pxor    xmm7, xmm8
1574        pxor    xmm4, xmm9
1575        pxor    xmm0, xmm8
1576        pxor    xmm1, xmm9
1577        pxor    xmm2, xmm10
1578        pxor    xmm3, xmm11
1579        movdqa  xmm8, xmm5
1580        psrld   xmm8, 7
1581        pslld   xmm5, 25
1582        por     xmm5, xmm8
1583        movdqa  xmm8, xmm6
1584        psrld   xmm8, 7
1585        pslld   xmm6, 25
1586        por     xmm6, xmm8
1587        movdqa  xmm8, xmm7
1588        psrld   xmm8, 7
1589        pslld   xmm7, 25
1590        por     xmm7, xmm8
1591        movdqa  xmm8, xmm4
1592        psrld   xmm8, 7
1593        pslld   xmm4, 25
1594        por     xmm4, xmm8
1595        pxor    xmm4, xmm12
1596        pxor    xmm5, xmm13
1597        pxor    xmm6, xmm14
1598        pxor    xmm7, xmm15
1599        mov     eax, r13d
1600        jne     9b
1601        movdqa  xmm9, xmm0
1602        punpckldq xmm0, xmm1
1603        punpckhdq xmm9, xmm1
1604        movdqa  xmm11, xmm2
1605        punpckldq xmm2, xmm3
1606        punpckhdq xmm11, xmm3
1607        movdqa  xmm1, xmm0
1608        punpcklqdq xmm0, xmm2
1609        punpckhqdq xmm1, xmm2
1610        movdqa  xmm3, xmm9
1611        punpcklqdq xmm9, xmm11
1612        punpckhqdq xmm3, xmm11
1613        movdqu  xmmword ptr [rbx], xmm0
1614        movdqu  xmmword ptr [rbx+0x20], xmm1
1615        movdqu  xmmword ptr [rbx+0x40], xmm9
1616        movdqu  xmmword ptr [rbx+0x60], xmm3
1617        movdqa  xmm9, xmm4
1618        punpckldq xmm4, xmm5
1619        punpckhdq xmm9, xmm5
1620        movdqa  xmm11, xmm6
1621        punpckldq xmm6, xmm7
1622        punpckhdq xmm11, xmm7
1623        movdqa  xmm5, xmm4
1624        punpcklqdq xmm4, xmm6
1625        punpckhqdq xmm5, xmm6
1626        movdqa  xmm7, xmm9
1627        punpcklqdq xmm9, xmm11
1628        punpckhqdq xmm7, xmm11
1629        movdqu  xmmword ptr [rbx+0x10], xmm4
1630        movdqu  xmmword ptr [rbx+0x30], xmm5
1631        movdqu  xmmword ptr [rbx+0x50], xmm9
1632        movdqu  xmmword ptr [rbx+0x70], xmm7
1633        movdqa  xmm1, xmmword ptr [rsp+0x110]
1634        movdqa  xmm0, xmm1
1635        paddd   xmm1, xmmword ptr [rsp+0x150]
1636        movdqa  xmmword ptr [rsp+0x110], xmm1
1637        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1638        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1639        pcmpgtd xmm0, xmm1
1640        movdqa  xmm1, xmmword ptr [rsp+0x120]
1641        psubd   xmm1, xmm0
1642        movdqa  xmmword ptr [rsp+0x120], xmm1
1643        add     rbx, 128
1644        add     rdi, 32
1645        sub     rsi, 4
1646        cmp     rsi, 4
1647        jnc     2b
1648        test    rsi, rsi
1649        jnz     3f
16504:
1651        mov     rsp, rbp
1652        pop     rbp
1653        pop     rbx
1654        pop     r12
1655        pop     r13
1656        pop     r14
1657        pop     r15
1658        RET
1659.p2align 5
16603:
1661        test    esi, 0x2
1662        je      3f
1663        movups  xmm0, xmmword ptr [rcx]
1664        movups  xmm1, xmmword ptr [rcx+0x10]
1665        movaps  xmm8, xmm0
1666        movaps  xmm9, xmm1
1667        movd    xmm13, dword ptr [rsp+0x110]
1668        movd    xmm14, dword ptr [rsp+0x120]
1669        punpckldq xmm13, xmm14
1670        movaps  xmmword ptr [rsp], xmm13
1671        movd    xmm14, dword ptr [rsp+0x114]
1672        movd    xmm13, dword ptr [rsp+0x124]
1673        punpckldq xmm14, xmm13
1674        movaps  xmmword ptr [rsp+0x10], xmm14
1675        mov     r8, qword ptr [rdi]
1676        mov     r9, qword ptr [rdi+0x8]
1677        movzx   eax, byte ptr [rbp+0x40]
1678        or      eax, r13d
1679        xor     edx, edx
16802:
1681        mov     r14d, eax
1682        or      eax, r12d
1683        add     rdx, 64
1684        cmp     rdx, r15
1685        cmovne  eax, r14d
1686        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1687        movaps  xmm10, xmm2
1688        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1689        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1690        movaps  xmm3, xmm4
1691        shufps  xmm4, xmm5, 136
1692        shufps  xmm3, xmm5, 221
1693        movaps  xmm5, xmm3
1694        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1695        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1696        movaps  xmm3, xmm6
1697        shufps  xmm6, xmm7, 136
1698        pshufd  xmm6, xmm6, 0x93
1699        shufps  xmm3, xmm7, 221
1700        pshufd  xmm7, xmm3, 0x93
1701        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1702        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1703        movaps  xmm11, xmm12
1704        shufps  xmm12, xmm13, 136
1705        shufps  xmm11, xmm13, 221
1706        movaps  xmm13, xmm11
1707        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1708        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1709        movaps  xmm11, xmm14
1710        shufps  xmm14, xmm15, 136
1711        pshufd  xmm14, xmm14, 0x93
1712        shufps  xmm11, xmm15, 221
1713        pshufd  xmm15, xmm11, 0x93
1714        shl     rax, 0x20
1715        or      rax, 0x40
1716        movq    xmm3, rax
1717        movdqa  xmmword ptr [rsp+0x20], xmm3
1718        movaps  xmm3, xmmword ptr [rsp]
1719        movaps  xmm11, xmmword ptr [rsp+0x10]
1720        punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1721        punpcklqdq xmm11, xmmword ptr [rsp+0x20]
1722        mov     al, 7
17239:
1724        paddd   xmm0, xmm4
1725        paddd   xmm8, xmm12
1726        movaps  xmmword ptr [rsp+0x20], xmm4
1727        movaps  xmmword ptr [rsp+0x30], xmm12
1728        paddd   xmm0, xmm1
1729        paddd   xmm8, xmm9
1730        pxor    xmm3, xmm0
1731        pxor    xmm11, xmm8
1732        pshuflw xmm3, xmm3, 0xB1
1733        pshufhw xmm3, xmm3, 0xB1
1734        pshuflw xmm11, xmm11, 0xB1
1735        pshufhw xmm11, xmm11, 0xB1
1736        paddd   xmm2, xmm3
1737        paddd   xmm10, xmm11
1738        pxor    xmm1, xmm2
1739        pxor    xmm9, xmm10
1740        movdqa  xmm4, xmm1
1741        pslld   xmm1, 20
1742        psrld   xmm4, 12
1743        por     xmm1, xmm4
1744        movdqa  xmm4, xmm9
1745        pslld   xmm9, 20
1746        psrld   xmm4, 12
1747        por     xmm9, xmm4
1748        paddd   xmm0, xmm5
1749        paddd   xmm8, xmm13
1750        movaps  xmmword ptr [rsp+0x40], xmm5
1751        movaps  xmmword ptr [rsp+0x50], xmm13
1752        paddd   xmm0, xmm1
1753        paddd   xmm8, xmm9
1754        pxor    xmm3, xmm0
1755        pxor    xmm11, xmm8
1756        movdqa  xmm13, xmm3
1757        psrld   xmm3, 8
1758        pslld   xmm13, 24
1759        pxor    xmm3, xmm13
1760        movdqa  xmm13, xmm11
1761        psrld   xmm11, 8
1762        pslld   xmm13, 24
1763        pxor    xmm11, xmm13
1764        paddd   xmm2, xmm3
1765        paddd   xmm10, xmm11
1766        pxor    xmm1, xmm2
1767        pxor    xmm9, xmm10
1768        movdqa  xmm4, xmm1
1769        pslld   xmm1, 25
1770        psrld   xmm4, 7
1771        por     xmm1, xmm4
1772        movdqa  xmm4, xmm9
1773        pslld   xmm9, 25
1774        psrld   xmm4, 7
1775        por     xmm9, xmm4
1776        pshufd  xmm0, xmm0, 0x93
1777        pshufd  xmm8, xmm8, 0x93
1778        pshufd  xmm3, xmm3, 0x4E
1779        pshufd  xmm11, xmm11, 0x4E
1780        pshufd  xmm2, xmm2, 0x39
1781        pshufd  xmm10, xmm10, 0x39
1782        paddd   xmm0, xmm6
1783        paddd   xmm8, xmm14
1784        paddd   xmm0, xmm1
1785        paddd   xmm8, xmm9
1786        pxor    xmm3, xmm0
1787        pxor    xmm11, xmm8
1788        pshuflw xmm3, xmm3, 0xB1
1789        pshufhw xmm3, xmm3, 0xB1
1790        pshuflw xmm11, xmm11, 0xB1
1791        pshufhw xmm11, xmm11, 0xB1
1792        paddd   xmm2, xmm3
1793        paddd   xmm10, xmm11
1794        pxor    xmm1, xmm2
1795        pxor    xmm9, xmm10
1796        movdqa  xmm4, xmm1
1797        pslld   xmm1, 20
1798        psrld   xmm4, 12
1799        por     xmm1, xmm4
1800        movdqa  xmm4, xmm9
1801        pslld   xmm9, 20
1802        psrld   xmm4, 12
1803        por     xmm9, xmm4
1804        paddd   xmm0, xmm7
1805        paddd   xmm8, xmm15
1806        paddd   xmm0, xmm1
1807        paddd   xmm8, xmm9
1808        pxor    xmm3, xmm0
1809        pxor    xmm11, xmm8
1810        movdqa  xmm13, xmm3
1811        psrld   xmm3, 8
1812        pslld   xmm13, 24
1813        pxor    xmm3, xmm13
1814        movdqa  xmm13, xmm11
1815        psrld   xmm11, 8
1816        pslld   xmm13, 24
1817        pxor    xmm11, xmm13
1818        paddd   xmm2, xmm3
1819        paddd   xmm10, xmm11
1820        pxor    xmm1, xmm2
1821        pxor    xmm9, xmm10
1822        movdqa  xmm4, xmm1
1823        pslld   xmm1, 25
1824        psrld   xmm4, 7
1825        por     xmm1, xmm4
1826        movdqa  xmm4, xmm9
1827        pslld   xmm9, 25
1828        psrld   xmm4, 7
1829        por     xmm9, xmm4
1830        pshufd  xmm0, xmm0, 0x39
1831        pshufd  xmm8, xmm8, 0x39
1832        pshufd  xmm3, xmm3, 0x4E
1833        pshufd  xmm11, xmm11, 0x4E
1834        pshufd  xmm2, xmm2, 0x93
1835        pshufd  xmm10, xmm10, 0x93
1836        dec     al
1837        je      9f
1838        movdqa  xmm12, xmmword ptr [rsp+0x20]
1839        movdqa  xmm5, xmmword ptr [rsp+0x40]
1840        pshufd  xmm13, xmm12, 0x0F
1841        shufps  xmm12, xmm5, 214
1842        pshufd  xmm4, xmm12, 0x39
1843        movdqa  xmm12, xmm6
1844        shufps  xmm12, xmm7, 250
1845        pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1846        pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1847        por     xmm13, xmm12
1848        movdqa  xmmword ptr [rsp+0x20], xmm13
1849        movdqa  xmm12, xmm7
1850        punpcklqdq xmm12, xmm5
1851        movdqa  xmm13, xmm6
1852        pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1853        pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1854        por     xmm12, xmm13
1855        pshufd  xmm12, xmm12, 0x78
1856        punpckhdq xmm5, xmm7
1857        punpckldq xmm6, xmm5
1858        pshufd  xmm7, xmm6, 0x1E
1859        movdqa  xmmword ptr [rsp+0x40], xmm12
1860        movdqa  xmm5, xmmword ptr [rsp+0x30]
1861        movdqa  xmm13, xmmword ptr [rsp+0x50]
1862        pshufd  xmm6, xmm5, 0x0F
1863        shufps  xmm5, xmm13, 214
1864        pshufd  xmm12, xmm5, 0x39
1865        movdqa  xmm5, xmm14
1866        shufps  xmm5, xmm15, 250
1867        pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1868        pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1869        por     xmm6, xmm5
1870        movdqa  xmm5, xmm15
1871        punpcklqdq xmm5, xmm13
1872        movdqa  xmmword ptr [rsp+0x30], xmm2
1873        movdqa  xmm2, xmm14
1874        pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1875        pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1876        por     xmm5, xmm2
1877        movdqa  xmm2, xmmword ptr [rsp+0x30]
1878        pshufd  xmm5, xmm5, 0x78
1879        punpckhdq xmm13, xmm15
1880        punpckldq xmm14, xmm13
1881        pshufd  xmm15, xmm14, 0x1E
1882        movdqa  xmm13, xmm6
1883        movdqa  xmm14, xmm5
1884        movdqa  xmm5, xmmword ptr [rsp+0x20]
1885        movdqa  xmm6, xmmword ptr [rsp+0x40]
1886        jmp     9b
18879:
1888        pxor    xmm0, xmm2
1889        pxor    xmm1, xmm3
1890        pxor    xmm8, xmm10
1891        pxor    xmm9, xmm11
1892        mov     eax, r13d
1893        cmp     rdx, r15
1894        jne     2b
1895        movups  xmmword ptr [rbx], xmm0
1896        movups  xmmword ptr [rbx+0x10], xmm1
1897        movups  xmmword ptr [rbx+0x20], xmm8
1898        movups  xmmword ptr [rbx+0x30], xmm9
1899        mov     eax, dword ptr [rsp+0x130]
1900        neg     eax
1901        mov    r10d, dword ptr [rsp+0x110+8*rax]
1902        mov    r11d, dword ptr [rsp+0x120+8*rax]
1903        mov dword ptr [rsp+0x110], r10d
1904        mov dword ptr [rsp+0x120], r11d
1905        add     rdi, 16
1906        add     rbx, 64
1907        sub     rsi, 2
19083:
1909        test    esi, 0x1
1910        je      4b
1911        movups  xmm0, xmmword ptr [rcx]
1912        movups  xmm1, xmmword ptr [rcx+0x10]
1913        movd    xmm13, dword ptr [rsp+0x110]
1914        movd    xmm14, dword ptr [rsp+0x120]
1915        punpckldq xmm13, xmm14
1916        mov     r8, qword ptr [rdi]
1917        movzx   eax, byte ptr [rbp+0x40]
1918        or      eax, r13d
1919        xor     edx, edx
19202:
1921        mov     r14d, eax
1922        or      eax, r12d
1923        add     rdx, 64
1924        cmp     rdx, r15
1925        cmovne  eax, r14d
1926        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1927        shl     rax, 32
1928        or      rax, 64
1929        movq    xmm12, rax
1930        movdqa  xmm3, xmm13
1931        punpcklqdq xmm3, xmm12
1932        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1933        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1934        movaps  xmm8, xmm4
1935        shufps  xmm4, xmm5, 136
1936        shufps  xmm8, xmm5, 221
1937        movaps  xmm5, xmm8
1938        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1939        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1940        movaps  xmm8, xmm6
1941        shufps  xmm6, xmm7, 136
1942        pshufd  xmm6, xmm6, 0x93
1943        shufps  xmm8, xmm7, 221
1944        pshufd  xmm7, xmm8, 0x93
1945        mov     al, 7
19469:
1947        paddd   xmm0, xmm4
1948        paddd   xmm0, xmm1
1949        pxor    xmm3, xmm0
1950        pshuflw xmm3, xmm3, 0xB1
1951        pshufhw xmm3, xmm3, 0xB1
1952        paddd   xmm2, xmm3
1953        pxor    xmm1, xmm2
1954        movdqa  xmm11, xmm1
1955        pslld   xmm1, 20
1956        psrld   xmm11, 12
1957        por     xmm1, xmm11
1958        paddd   xmm0, xmm5
1959        paddd   xmm0, xmm1
1960        pxor    xmm3, xmm0
1961        movdqa  xmm14, xmm3
1962        psrld   xmm3, 8
1963        pslld   xmm14, 24
1964        pxor    xmm3, xmm14
1965        paddd   xmm2, xmm3
1966        pxor    xmm1, xmm2
1967        movdqa  xmm11, xmm1
1968        pslld   xmm1, 25
1969        psrld   xmm11, 7
1970        por     xmm1, xmm11
1971        pshufd  xmm0, xmm0, 0x93
1972        pshufd  xmm3, xmm3, 0x4E
1973        pshufd  xmm2, xmm2, 0x39
1974        paddd   xmm0, xmm6
1975        paddd   xmm0, xmm1
1976        pxor    xmm3, xmm0
1977        pshuflw xmm3, xmm3, 0xB1
1978        pshufhw xmm3, xmm3, 0xB1
1979        paddd   xmm2, xmm3
1980        pxor    xmm1, xmm2
1981        movdqa  xmm11, xmm1
1982        pslld   xmm1, 20
1983        psrld   xmm11, 12
1984        por     xmm1, xmm11
1985        paddd   xmm0, xmm7
1986        paddd   xmm0, xmm1
1987        pxor    xmm3, xmm0
1988        movdqa  xmm14, xmm3
1989        psrld   xmm3, 8
1990        pslld   xmm14, 24
1991        pxor    xmm3, xmm14
1992        paddd   xmm2, xmm3
1993        pxor    xmm1, xmm2
1994        movdqa  xmm11, xmm1
1995        pslld   xmm1, 25
1996        psrld   xmm11, 7
1997        por     xmm1, xmm11
1998        pshufd  xmm0, xmm0, 0x39
1999        pshufd  xmm3, xmm3, 0x4E
2000        pshufd  xmm2, xmm2, 0x93
2001        dec     al
2002        jz      9f
2003        movdqa  xmm8, xmm4
2004        shufps  xmm8, xmm5, 214
2005        pshufd  xmm9, xmm4, 0x0F
2006        pshufd  xmm4, xmm8, 0x39
2007        movdqa  xmm8, xmm6
2008        shufps  xmm8, xmm7, 250
2009        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2010        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2011        por     xmm9, xmm8
2012        movdqa  xmm8, xmm7
2013        punpcklqdq xmm8, xmm5
2014        movdqa  xmm10, xmm6
2015        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2016        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2017        por     xmm8, xmm10
2018        pshufd  xmm8, xmm8, 0x78
2019        punpckhdq xmm5, xmm7
2020        punpckldq xmm6, xmm5
2021        pshufd  xmm7, xmm6, 0x1E
2022        movdqa  xmm5, xmm9
2023        movdqa  xmm6, xmm8
2024        jmp     9b
20259:
2026        pxor    xmm0, xmm2
2027        pxor    xmm1, xmm3
2028        mov     eax, r13d
2029        cmp     rdx, r15
2030        jne     2b
2031        movups  xmmword ptr [rbx], xmm0
2032        movups  xmmword ptr [rbx+0x10], xmm1
2033        jmp     4b
2034SET_SIZE(zfs_blake3_hash_many_sse2)
2035
/*
 * zfs_blake3_compress_in_place_sse2
 *
 * Runs the 7-round compression loop over one 64-byte message block and
 * overwrites the 32-byte state at [rdi] with the result.
 *
 * Registers read on entry:
 *   rdi - 32-byte state, read at entry and overwritten at exit
 *   rsi - 64-byte message block (unaligned loads are used)
 *   dl  - byte placed in lane 2 of state row 3
 *   rcx - 64-bit value placed in lanes 0-1 of state row 3
 *   r8b - byte placed in lane 3 of state row 3
 * NOTE(review): these presumably map to cv, block, block_len, counter and
 * flags of the upstream BLAKE3 compress_in_place prototype - confirm
 * against the C declaration.
 *
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.
 */
ENTRY_ALIGN(zfs_blake3_compress_in_place_sse2, 64)
        ENDBR
        /* State rows 0 and 1 come from the caller, row 2 from the IV. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Row 3 = { rcx (64 bit), dl, r8b }.  Only the low byte of rdx and
         * r8 is meaningful, so zero-extend both before combining them
         * instead of trusting the caller to have cleared bits 8..63.  This
         * is the same sequence zfs_blake3_compress_xof_sse2 uses.
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * Load the 16 message words and de-interleave them:
         *   xmm4 = even words of the first 32 bytes  (shufps 136 = 0x88)
         *   xmm5 = odd words of the first 32 bytes   (shufps 221 = 0xDD)
         *   xmm6 = even words of the last 32 bytes, lanes rotated (0x93)
         *   xmm7 = odd words of the last 32 bytes, lanes rotated (0x93)
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        /* al = number of rounds still to run. */
        mov     al, 7
9:
        /* First half of the round, operating on the rows as loaded. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        /* Swapping the 16-bit halves of every dword rotates it by 16. */
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        /* Rotate each dword of row 1 right by 12. */
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        /* Rotate each dword of row 3 right by 8. */
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        /* Rotate each dword of row 1 right by 7. */
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /*
         * Rotate the lanes of rows 0, 3 and 2 so that the second half of
         * the round mixes a different combination of state words.
         */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Second half of the round: same mixing with xmm6 / xmm7. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation applied before the second half. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        /* Leave after the last round; no message permutation is needed. */
        dec     al
        jz      9f
        /*
         * Permute the message words held in xmm4-xmm7 for the next round.
         * Each pand/pand/por triple with a PBLENDW_*_MASK pair merges
         * selected dwords of two registers, standing in for the SSE4.1
         * pblendw instruction that SSE2 lacks.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* New state = row0 ^ row2 followed by row1 ^ row3. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        RET
SET_SIZE(zfs_blake3_compress_in_place_sse2)
2146
/*
 * zfs_blake3_compress_xof_sse2
 *
 * Runs the 7-round compression loop over one 64-byte message block and
 * stores a 64-byte result at [r9].  The 32-byte input state at [rdi] is
 * only read, never written.
 *
 * Registers read on entry:
 *   rdi - 32-byte input state (loaded at entry and again for the final XOR)
 *   rsi - 64-byte message block (unaligned loads are used)
 *   dl  - byte placed in lane 2 of state row 3 (zero-extended here)
 *   rcx - 64-bit value placed in lanes 0-1 of state row 3
 *   r8b - byte placed in lane 3 of state row 3 (zero-extended here)
 *   r9  - 64-byte output buffer (unaligned stores are used)
 * NOTE(review): these presumably map to cv, block, block_len, counter,
 * flags and out of the upstream BLAKE3 compress_xof prototype - confirm
 * against the C declaration.
 *
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.
 */
2147ENTRY_ALIGN(zfs_blake3_compress_xof_sse2, 64)
2148        ENDBR
        /* State rows 0 and 1 come from the caller, row 2 from the IV. */
2149        movups  xmm0, xmmword ptr [rdi]
2150        movups  xmm1, xmmword ptr [rdi+0x10]
2151        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Row 3 = { rcx (64 bit), dl, r8b }.  Both byte arguments are
         * zero-extended first so stale upper register bits cannot leak in.
         */
2152        movzx   eax, r8b
2153        movzx   edx, dl
2154        shl     rax, 32
2155        add     rdx, rax
2156        movq    xmm3, rcx
2157        movq    xmm4, rdx
2158        punpcklqdq xmm3, xmm4
        /*
         * Load the 16 message words and de-interleave them:
         *   xmm4 = even words of the first 32 bytes  (shufps 136 = 0x88)
         *   xmm5 = odd words of the first 32 bytes   (shufps 221 = 0xDD)
         *   xmm6 = even words of the last 32 bytes, lanes rotated (0x93)
         *   xmm7 = odd words of the last 32 bytes, lanes rotated (0x93)
         */
2159        movups  xmm4, xmmword ptr [rsi]
2160        movups  xmm5, xmmword ptr [rsi+0x10]
2161        movaps  xmm8, xmm4
2162        shufps  xmm4, xmm5, 136
2163        shufps  xmm8, xmm5, 221
2164        movaps  xmm5, xmm8
2165        movups  xmm6, xmmword ptr [rsi+0x20]
2166        movups  xmm7, xmmword ptr [rsi+0x30]
2167        movaps  xmm8, xmm6
2168        shufps  xmm6, xmm7, 136
2169        pshufd  xmm6, xmm6, 0x93
2170        shufps  xmm8, xmm7, 221
2171        pshufd  xmm7, xmm8, 0x93
        /* al = number of rounds still to run. */
2172        mov     al, 7
21739:
        /* First half of the round, operating on the rows as loaded. */
2174        paddd   xmm0, xmm4
2175        paddd   xmm0, xmm1
2176        pxor    xmm3, xmm0
        /* Swapping the 16-bit halves of every dword rotates it by 16. */
2177        pshuflw xmm3, xmm3, 0xB1
2178        pshufhw xmm3, xmm3, 0xB1
2179        paddd   xmm2, xmm3
2180        pxor    xmm1, xmm2
        /* Rotate each dword of row 1 right by 12. */
2181        movdqa  xmm11, xmm1
2182        pslld   xmm1, 20
2183        psrld   xmm11, 12
2184        por     xmm1, xmm11
2185        paddd   xmm0, xmm5
2186        paddd   xmm0, xmm1
2187        pxor    xmm3, xmm0
        /* Rotate each dword of row 3 right by 8. */
2188        movdqa  xmm14, xmm3
2189        psrld   xmm3, 8
2190        pslld   xmm14, 24
2191        pxor    xmm3, xmm14
2192        paddd   xmm2, xmm3
2193        pxor    xmm1, xmm2
        /* Rotate each dword of row 1 right by 7. */
2194        movdqa  xmm11, xmm1
2195        pslld   xmm1, 25
2196        psrld   xmm11, 7
2197        por     xmm1, xmm11
        /*
         * Rotate the lanes of rows 0, 3 and 2 so that the second half of
         * the round mixes a different combination of state words.
         */
2198        pshufd  xmm0, xmm0, 0x93
2199        pshufd  xmm3, xmm3, 0x4E
2200        pshufd  xmm2, xmm2, 0x39
        /* Second half of the round: same mixing with xmm6 / xmm7. */
2201        paddd   xmm0, xmm6
2202        paddd   xmm0, xmm1
2203        pxor    xmm3, xmm0
2204        pshuflw xmm3, xmm3, 0xB1
2205        pshufhw xmm3, xmm3, 0xB1
2206        paddd   xmm2, xmm3
2207        pxor    xmm1, xmm2
2208        movdqa  xmm11, xmm1
2209        pslld   xmm1, 20
2210        psrld   xmm11, 12
2211        por     xmm1, xmm11
2212        paddd   xmm0, xmm7
2213        paddd   xmm0, xmm1
2214        pxor    xmm3, xmm0
2215        movdqa  xmm14, xmm3
2216        psrld   xmm3, 8
2217        pslld   xmm14, 24
2218        pxor    xmm3, xmm14
2219        paddd   xmm2, xmm3
2220        pxor    xmm1, xmm2
2221        movdqa  xmm11, xmm1
2222        pslld   xmm1, 25
2223        psrld   xmm11, 7
2224        por     xmm1, xmm11
        /* Undo the lane rotation applied before the second half. */
2225        pshufd  xmm0, xmm0, 0x39
2226        pshufd  xmm3, xmm3, 0x4E
2227        pshufd  xmm2, xmm2, 0x93
        /* Leave after the last round; no message permutation is needed. */
2228        dec     al
2229        jz      9f
        /*
         * Permute the message words held in xmm4-xmm7 for the next round.
         * Each pand/pand/por triple with a PBLENDW_*_MASK pair merges
         * selected dwords of two registers, standing in for the SSE4.1
         * pblendw instruction that SSE2 lacks.
         */
2230        movdqa  xmm8, xmm4
2231        shufps  xmm8, xmm5, 214
2232        pshufd  xmm9, xmm4, 0x0F
2233        pshufd  xmm4, xmm8, 0x39
2234        movdqa  xmm8, xmm6
2235        shufps  xmm8, xmm7, 250
2236        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2237        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2238        por     xmm9, xmm8
2239        movdqa  xmm8, xmm7
2240        punpcklqdq xmm8, xmm5
2241        movdqa  xmm10, xmm6
2242        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2243        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2244        por     xmm8, xmm10
2245        pshufd  xmm8, xmm8, 0x78
2246        punpckhdq xmm5, xmm7
2247        punpckldq xmm6, xmm5
2248        pshufd  xmm7, xmm6, 0x1E
2249        movdqa  xmm5, xmm9
2250        movdqa  xmm6, xmm8
2251        jmp     9b
22529:
        /*
         * Reload the untouched input state into xmm4/xmm5 (the message
         * registers are no longer needed) and build the 64-byte output:
         *   out[ 0..15] = row0 ^ row2      out[32..47] = row2 ^ input[ 0..15]
         *   out[16..31] = row1 ^ row3      out[48..63] = row3 ^ input[16..31]
         * The first two XORs must come first because they consume the
         * pre-XOR values of rows 2 and 3.
         */
2253        movdqu  xmm4, xmmword ptr [rdi]
2254        movdqu  xmm5, xmmword ptr [rdi+0x10]
2255        pxor    xmm0, xmm2
2256        pxor    xmm1, xmm3
2257        pxor    xmm2, xmm4
2258        pxor    xmm3, xmm5
2259        movups  xmmword ptr [r9], xmm0
2260        movups  xmmword ptr [r9+0x10], xmm1
2261        movups  xmmword ptr [r9+0x20], xmm2
2262        movups  xmmword ptr [r9+0x30], xmm3
2263        RET
2264SET_SIZE(zfs_blake3_compress_xof_sse2)
2265
/*
 * Constant tables shared by the SSE2 routines in this file.
 *
 * The block is aligned to 64 bytes and every table is exactly 16 bytes
 * long, so each label stays 16-byte aligned.  That alignment is required:
 * the tables are referenced as movaps/pand memory operands, which fault
 * on unaligned addresses.
 */
2266SECTION_STATIC
2267.p2align  6
/* The four 32-bit IV words; loaded as state row 2 by the compress routines. */
2268BLAKE3_IV:
2269        .long  0x6A09E667, 0xBB67AE85
2270        .long  0x3C6EF372, 0xA54FF53A
/*
 * Per-lane values 0..3 and a constant 4 in every lane.  Both are referenced
 * from the prologue of zfs_blake3_hash_many_sse2, where they are ANDed with
 * a broadcast of the negated r9d argument.
 */
2271ADD0:
2272        .long  0, 1, 2, 3
2273ADD1:
2274	.long  4, 4, 4, 4
/*
 * Each IV word replicated across all four lanes.
 * NOTE(review): presumably for the 4-way path of
 * zfs_blake3_hash_many_sse2 - the uses are outside this section.
 */
2275BLAKE3_IV_0:
2276	.long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2277BLAKE3_IV_1:
2278	.long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2279BLAKE3_IV_2:
2280	.long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2281BLAKE3_IV_3:
2282	.long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane (the same 64 that is ORed into row 3 as the block length). */
2283BLAKE3_BLOCK_LEN:
2284	.long  64, 64, 64, 64
/*
 * Only the most significant bit of every lane set.
 * NOTE(review): presumably used to emulate an unsigned dword compare with
 * the signed pcmpgtd - the uses are outside this section; confirm.
 */
2285CMP_MSB_MASK:
2286	.long  0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * AND masks that let a pand/pand/por triple stand in for the SSE4.1
 * pblendw instruction.  The suffix is the pblendw immediate being
 * emulated (one bit per 16-bit word):
 *   0x33 keeps dwords 0 and 2      0xCC keeps dwords 1 and 3
 *   0x3F keeps dwords 0, 1 and 2   0xC0 keeps dword 3
 * The masks within each pair are exact complements of one another.
 */
2287PBLENDW_0x33_MASK:
2288	.long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
2289PBLENDW_0xCC_MASK:
2290	.long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
2291PBLENDW_0x3F_MASK:
2292	.long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
2293PBLENDW_0xC0_MASK:
2294	.long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
2295
2296#endif	/* HAVE_SSE2 */
2297
2298#ifdef __ELF__
2299.section .note.GNU-stack,"",%progbits
2300#endif
2301