xref: /freebsd/sys/crypto/openssl/aarch64/chacha-armv8-sve.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from chacha-armv8-sve.pl. */
2// Copyright 2022-2025  The OpenSSL Project Authors. All Rights Reserved.
3//
4// Licensed under the Apache License 2.0 (the "License").  You may not use
5// this file except in compliance with the License.  You can obtain a copy
6// in the file LICENSE in the source distribution or at
7// https://www.openssl.org/source/license.html
8//
9//
10// ChaCha20 for ARMv8 via SVE
11//
12// $output is the last argument if it looks like a file (it has an extension)
13// $flavour is the first argument if it doesn't look like a file
14#include "arm_arch.h"
15
16.arch	armv8-a
17
18
19.hidden	OPENSSL_armcap_P
20
21.text
22
// Read-only constant pool used by ChaCha20_ctr32_sve.  It is emitted into
// .rodata; the trailing .previous switches back to the section that was
// active before this block.
23.section	.rodata
// .align takes a power of two on AArch64 GAS, so this requests 2^5 = 32-byte
// alignment for the pool.
24.align	5
25.type	_chacha_sve_consts,%object
26_chacha_sve_consts:
// ChaCha20 constant row: stored little-endian, these two quadwords are the
// ASCII bytes "expand 32-byte k".  ChaCha20_ctr32_sve fetches them with
// `ldp x23,x24`.
27.Lchacha20_consts:
28.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Seed values for an SVE INDEX instruction: the function entry loads the
// first two words into w9/w10 and executes `index z31.s,w9,w10` on the path
// taken when ARMV8_SVE2 is not set in OPENSSL_armcap_P.  With base 0x02010003
// and step 0x04040404, 32-bit element i becomes the byte indices
// {3,0,1,2} + 4*i (lowest byte first).
// NOTE(review): presumably a per-word byte-permute mask implementing a
// rotate-left-by-8; the consumer of z31 is outside the visible part of the
// file, as is any reader of the third and fourth words -- confirm.
29.Lrot8:
30.word	0x02010003,0x04040404,0x02010003,0x04040404
// Object size = distance from the start label to the current location.
31.size	_chacha_sve_consts,.-_chacha_sve_consts
32
33.previous
34
35.globl	ChaCha20_ctr32_sve
36.type	ChaCha20_ctr32_sve,%function
37.align	5
38ChaCha20_ctr32_sve:
39	AARCH64_VALID_CALL_TARGET
40.inst	0x04a0e3e5	//cntw x5, ALL, MUL #1
41	cmp	x2,x5,lsl #6
42	b.lt	.Lreturn
43	mov	x7,0
44	adrp	x6,OPENSSL_armcap_P
45	ldr	w6,[x6,#:lo12:OPENSSL_armcap_P]
46	tst	w6,#ARMV8_SVE2
47	b.eq	1f
48	mov	x7,1
49	b	2f
501:
51	cmp	x5,4
52	b.le	.Lreturn
53	adrp	x6,.Lrot8
54	add	x6,x6,#:lo12:.Lrot8
55	ldp	w9,w10,[x6]
56.inst	0x04aa4d3f	//index z31.s,w9,w10
572:
58	AARCH64_SIGN_LINK_REGISTER
59	stp	d8,d9,[sp,-192]!
60	stp	d10,d11,[sp,16]
61	stp	d12,d13,[sp,32]
62	stp	d14,d15,[sp,48]
63	stp	x16,x17,[sp,64]
64	stp	x18,x19,[sp,80]
65	stp	x20,x21,[sp,96]
66	stp	x22,x23,[sp,112]
67	stp	x24,x25,[sp,128]
68	stp	x26,x27,[sp,144]
69	stp	x28,x29,[sp,160]
70	str	x30,[sp,176]
71
72	adrp	x6,.Lchacha20_consts
73	add	x6,x6,#:lo12:.Lchacha20_consts
74	ldp	x23,x24,[x6]
75	ldp	x25,x26,[x3]
76	ldp	x27,x28,[x3, 16]
77	ldp	x29,x30,[x4]
78.inst	0x2599e3e0	//ptrues p0.s,ALL
79#ifdef	__AARCH64EB__
80	ror	x25,x25,#32
81	ror	x26,x26,#32
82	ror	x27,x27,#32
83	ror	x28,x28,#32
84	ror	x29,x29,#32
85	ror	x30,x30,#32
86#endif
87	cbz	x7, 1f
88.align	5
89100:
90	subs	x7,x2,x5,lsl #6
91	b.lt	110f
92	mov	x2,x7
93	b.eq	101f
94	cmp	x2,64
95	b.lt	101f
96	mixin=1
97	lsr	x8,x23,#32
98.inst	0x05a03ae0	//dup z0.s,w23
99.inst	0x05a03af9	//dup z25.s,w23
100.if	mixin == 1
101	mov	w7,w23
102.endif
103.inst	0x05a03904	//dup z4.s,w8
104.inst	0x05a0391a	//dup z26.s,w8
105	lsr	x10,x24,#32
106.inst	0x05a03b08	//dup z8.s,w24
107.inst	0x05a03b1b	//dup z27.s,w24
108.if	mixin == 1
109	mov	w9,w24
110.endif
111.inst	0x05a0394c	//dup z12.s,w10
112.inst	0x05a0395c	//dup z28.s,w10
113	lsr	x12,x25,#32
114.inst	0x05a03b21	//dup z1.s,w25
115.inst	0x05a03b3d	//dup z29.s,w25
116.if	mixin == 1
117	mov	w11,w25
118.endif
119.inst	0x05a03985	//dup z5.s,w12
120.inst	0x05a0399e	//dup z30.s,w12
121	lsr	x14,x26,#32
122.inst	0x05a03b49	//dup z9.s,w26
123.inst	0x05a03b55	//dup z21.s,w26
124.if	mixin == 1
125	mov	w13,w26
126.endif
127.inst	0x05a039cd	//dup z13.s,w14
128.inst	0x05a039d6	//dup z22.s,w14
129	lsr	x16,x27,#32
130.inst	0x05a03b62	//dup z2.s,w27
131.inst	0x05a03b77	//dup z23.s,w27
132.if	mixin == 1
133	mov	w15,w27
134.endif
135.inst	0x05a03a06	//dup z6.s,w16
136.inst	0x05a03a18	//dup z24.s,w16
137	lsr	x18,x28,#32
138.inst	0x05a03b8a	//dup z10.s,w28
139.inst	0x05a03b91	//dup z17.s,w28
140.if	mixin == 1
141	mov	w17,w28
142.endif
143.inst	0x05a03a4e	//dup z14.s,w18
144.inst	0x05a03a52	//dup z18.s,w18
145	lsr	x22,x30,#32
146.inst	0x05a03bcb	//dup z11.s,w30
147.inst	0x05a03bd4	//dup z20.s,w30
148.if	mixin == 1
149	mov	w21,w30
150.endif
151.inst	0x05a03acf	//dup z15.s,w22
152.inst	0x05a03adf	//dup z31.s,w22
153.if	mixin == 1
154	add	w20,w29,#1
155	mov	w19,w29
156.inst	0x04a14690	//index z16.s,w20,1
157.inst	0x04a14683	//index z3.s,w20,1
158.else
159.inst	0x04a147b0	//index z16.s,w29,1
160.inst	0x04a147a3	//index z3.s,w29,1
161.endif
162	lsr	x20,x29,#32
163.inst	0x05a03a87	//dup z7.s,w20
164.inst	0x05a03a93	//dup z19.s,w20
165	mov	x6,#10
16610:
167.align	5
168.inst	0x04a10000	//add z0.s,z0.s,z1.s
169.if	mixin == 1
170	add	w7,w7,w11
171.endif
172.inst	0x04a50084	//add z4.s,z4.s,z5.s
173.if	mixin == 1
174	add	w8,w8,w12
175.endif
176.inst	0x04a90108	//add z8.s,z8.s,z9.s
177.if	mixin == 1
178	add	w9,w9,w13
179.endif
180.inst	0x04ad018c	//add z12.s,z12.s,z13.s
181.if	mixin == 1
182	add	w10,w10,w14
183.endif
184.if	mixin == 1
185	eor	w19,w19,w7
186.endif
187.inst	0x04703403	//xar z3.s,z3.s,z0.s,16
188.if	mixin == 1
189	ror	w19,w19,16
190.endif
191.if	mixin == 1
192	eor	w20,w20,w8
193.endif
194.inst	0x04703487	//xar z7.s,z7.s,z4.s,16
195.if	mixin == 1
196	ror	w20,w20,16
197.endif
198.if	mixin == 1
199	eor	w21,w21,w9
200.endif
201.inst	0x0470350b	//xar z11.s,z11.s,z8.s,16
202.if	mixin == 1
203	ror	w21,w21,16
204.endif
205.if	mixin == 1
206	eor	w22,w22,w10
207.endif
208.inst	0x0470358f	//xar z15.s,z15.s,z12.s,16
209.if	mixin == 1
210	ror	w22,w22,16
211.endif
212.inst	0x04a30042	//add z2.s,z2.s,z3.s
213.if	mixin == 1
214	add	w15,w15,w19
215.endif
216.inst	0x04a700c6	//add z6.s,z6.s,z7.s
217.if	mixin == 1
218	add	w16,w16,w20
219.endif
220.inst	0x04ab014a	//add z10.s,z10.s,z11.s
221.if	mixin == 1
222	add	w17,w17,w21
223.endif
224.inst	0x04af01ce	//add z14.s,z14.s,z15.s
225.if	mixin == 1
226	add	w18,w18,w22
227.endif
228.if	mixin == 1
229	eor	w11,w11,w15
230.endif
231.inst	0x046c3441	//xar z1.s,z1.s,z2.s,20
232.if	mixin == 1
233	ror	w11,w11,20
234.endif
235.if	mixin == 1
236	eor	w12,w12,w16
237.endif
238.inst	0x046c34c5	//xar z5.s,z5.s,z6.s,20
239.if	mixin == 1
240	ror	w12,w12,20
241.endif
242.if	mixin == 1
243	eor	w13,w13,w17
244.endif
245.inst	0x046c3549	//xar z9.s,z9.s,z10.s,20
246.if	mixin == 1
247	ror	w13,w13,20
248.endif
249.if	mixin == 1
250	eor	w14,w14,w18
251.endif
252.inst	0x046c35cd	//xar z13.s,z13.s,z14.s,20
253.if	mixin == 1
254	ror	w14,w14,20
255.endif
256.inst	0x04a10000	//add z0.s,z0.s,z1.s
257.if	mixin == 1
258	add	w7,w7,w11
259.endif
260.inst	0x04a50084	//add z4.s,z4.s,z5.s
261.if	mixin == 1
262	add	w8,w8,w12
263.endif
264.inst	0x04a90108	//add z8.s,z8.s,z9.s
265.if	mixin == 1
266	add	w9,w9,w13
267.endif
268.inst	0x04ad018c	//add z12.s,z12.s,z13.s
269.if	mixin == 1
270	add	w10,w10,w14
271.endif
272.if	mixin == 1
273	eor	w19,w19,w7
274.endif
275.inst	0x04683403	//xar z3.s,z3.s,z0.s,24
276.if	mixin == 1
277	ror	w19,w19,24
278.endif
279.if	mixin == 1
280	eor	w20,w20,w8
281.endif
282.inst	0x04683487	//xar z7.s,z7.s,z4.s,24
283.if	mixin == 1
284	ror	w20,w20,24
285.endif
286.if	mixin == 1
287	eor	w21,w21,w9
288.endif
289.inst	0x0468350b	//xar z11.s,z11.s,z8.s,24
290.if	mixin == 1
291	ror	w21,w21,24
292.endif
293.if	mixin == 1
294	eor	w22,w22,w10
295.endif
296.inst	0x0468358f	//xar z15.s,z15.s,z12.s,24
297.if	mixin == 1
298	ror	w22,w22,24
299.endif
300.inst	0x04a30042	//add z2.s,z2.s,z3.s
301.if	mixin == 1
302	add	w15,w15,w19
303.endif
304.inst	0x04a700c6	//add z6.s,z6.s,z7.s
305.if	mixin == 1
306	add	w16,w16,w20
307.endif
308.inst	0x04ab014a	//add z10.s,z10.s,z11.s
309.if	mixin == 1
310	add	w17,w17,w21
311.endif
312.inst	0x04af01ce	//add z14.s,z14.s,z15.s
313.if	mixin == 1
314	add	w18,w18,w22
315.endif
316.if	mixin == 1
317	eor	w11,w11,w15
318.endif
319.inst	0x04673441	//xar z1.s,z1.s,z2.s,25
320.if	mixin == 1
321	ror	w11,w11,25
322.endif
323.if	mixin == 1
324	eor	w12,w12,w16
325.endif
326.inst	0x046734c5	//xar z5.s,z5.s,z6.s,25
327.if	mixin == 1
328	ror	w12,w12,25
329.endif
330.if	mixin == 1
331	eor	w13,w13,w17
332.endif
333.inst	0x04673549	//xar z9.s,z9.s,z10.s,25
334.if	mixin == 1
335	ror	w13,w13,25
336.endif
337.if	mixin == 1
338	eor	w14,w14,w18
339.endif
340.inst	0x046735cd	//xar z13.s,z13.s,z14.s,25
341.if	mixin == 1
342	ror	w14,w14,25
343.endif
344.inst	0x04a50000	//add z0.s,z0.s,z5.s
345.if	mixin == 1
346	add	w7,w7,w12
347.endif
348.inst	0x04a90084	//add z4.s,z4.s,z9.s
349.if	mixin == 1
350	add	w8,w8,w13
351.endif
352.inst	0x04ad0108	//add z8.s,z8.s,z13.s
353.if	mixin == 1
354	add	w9,w9,w14
355.endif
356.inst	0x04a1018c	//add z12.s,z12.s,z1.s
357.if	mixin == 1
358	add	w10,w10,w11
359.endif
360.if	mixin == 1
361	eor	w22,w22,w7
362.endif
363.inst	0x0470340f	//xar z15.s,z15.s,z0.s,16
364.if	mixin == 1
365	ror	w22,w22,16
366.endif
367.if	mixin == 1
368	eor	w19,w19,w8
369.endif
370.inst	0x04703483	//xar z3.s,z3.s,z4.s,16
371.if	mixin == 1
372	ror	w19,w19,16
373.endif
374.if	mixin == 1
375	eor	w20,w20,w9
376.endif
377.inst	0x04703507	//xar z7.s,z7.s,z8.s,16
378.if	mixin == 1
379	ror	w20,w20,16
380.endif
381.if	mixin == 1
382	eor	w21,w21,w10
383.endif
384.inst	0x0470358b	//xar z11.s,z11.s,z12.s,16
385.if	mixin == 1
386	ror	w21,w21,16
387.endif
388.inst	0x04af014a	//add z10.s,z10.s,z15.s
389.if	mixin == 1
390	add	w17,w17,w22
391.endif
392.inst	0x04a301ce	//add z14.s,z14.s,z3.s
393.if	mixin == 1
394	add	w18,w18,w19
395.endif
396.inst	0x04a70042	//add z2.s,z2.s,z7.s
397.if	mixin == 1
398	add	w15,w15,w20
399.endif
400.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
401.if	mixin == 1
402	add	w16,w16,w21
403.endif
404.if	mixin == 1
405	eor	w12,w12,w17
406.endif
407.inst	0x046c3545	//xar z5.s,z5.s,z10.s,20
408.if	mixin == 1
409	ror	w12,w12,20
410.endif
411.if	mixin == 1
412	eor	w13,w13,w18
413.endif
414.inst	0x046c35c9	//xar z9.s,z9.s,z14.s,20
415.if	mixin == 1
416	ror	w13,w13,20
417.endif
418.if	mixin == 1
419	eor	w14,w14,w15
420.endif
421.inst	0x046c344d	//xar z13.s,z13.s,z2.s,20
422.if	mixin == 1
423	ror	w14,w14,20
424.endif
425.if	mixin == 1
426	eor	w11,w11,w16
427.endif
428.inst	0x046c34c1	//xar z1.s,z1.s,z6.s,20
429.if	mixin == 1
430	ror	w11,w11,20
431.endif
432.inst	0x04a50000	//add z0.s,z0.s,z5.s
433.if	mixin == 1
434	add	w7,w7,w12
435.endif
436.inst	0x04a90084	//add z4.s,z4.s,z9.s
437.if	mixin == 1
438	add	w8,w8,w13
439.endif
440.inst	0x04ad0108	//add z8.s,z8.s,z13.s
441.if	mixin == 1
442	add	w9,w9,w14
443.endif
444.inst	0x04a1018c	//add z12.s,z12.s,z1.s
445.if	mixin == 1
446	add	w10,w10,w11
447.endif
448.if	mixin == 1
449	eor	w22,w22,w7
450.endif
451.inst	0x0468340f	//xar z15.s,z15.s,z0.s,24
452.if	mixin == 1
453	ror	w22,w22,24
454.endif
455.if	mixin == 1
456	eor	w19,w19,w8
457.endif
458.inst	0x04683483	//xar z3.s,z3.s,z4.s,24
459.if	mixin == 1
460	ror	w19,w19,24
461.endif
462.if	mixin == 1
463	eor	w20,w20,w9
464.endif
465.inst	0x04683507	//xar z7.s,z7.s,z8.s,24
466.if	mixin == 1
467	ror	w20,w20,24
468.endif
469.if	mixin == 1
470	eor	w21,w21,w10
471.endif
472.inst	0x0468358b	//xar z11.s,z11.s,z12.s,24
473.if	mixin == 1
474	ror	w21,w21,24
475.endif
476.inst	0x04af014a	//add z10.s,z10.s,z15.s
477.if	mixin == 1
478	add	w17,w17,w22
479.endif
480.inst	0x04a301ce	//add z14.s,z14.s,z3.s
481.if	mixin == 1
482	add	w18,w18,w19
483.endif
484.inst	0x04a70042	//add z2.s,z2.s,z7.s
485.if	mixin == 1
486	add	w15,w15,w20
487.endif
488.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
489.if	mixin == 1
490	add	w16,w16,w21
491.endif
492.if	mixin == 1
493	eor	w12,w12,w17
494.endif
495.inst	0x04673545	//xar z5.s,z5.s,z10.s,25
496.if	mixin == 1
497	ror	w12,w12,25
498.endif
499.if	mixin == 1
500	eor	w13,w13,w18
501.endif
502.inst	0x046735c9	//xar z9.s,z9.s,z14.s,25
503.if	mixin == 1
504	ror	w13,w13,25
505.endif
506.if	mixin == 1
507	eor	w14,w14,w15
508.endif
509.inst	0x0467344d	//xar z13.s,z13.s,z2.s,25
510.if	mixin == 1
511	ror	w14,w14,25
512.endif
513.if	mixin == 1
514	eor	w11,w11,w16
515.endif
516.inst	0x046734c1	//xar z1.s,z1.s,z6.s,25
517.if	mixin == 1
518	ror	w11,w11,25
519.endif
520	sub	x6,x6,1
521	cbnz	x6,10b
522.if	mixin == 1
523	add	w7,w7,w23
524.endif
525.inst	0x04b90000	//add z0.s,z0.s,z25.s
526.if	mixin == 1
527	add	x8,x8,x23,lsr #32
528.endif
529.inst	0x04ba0084	//add z4.s,z4.s,z26.s
530.if	mixin == 1
531	add	x7,x7,x8,lsl #32  // pack
532.endif
533.if	mixin == 1
534	add	w9,w9,w24
535.endif
536.inst	0x04bb0108	//add z8.s,z8.s,z27.s
537.if	mixin == 1
538	add	x10,x10,x24,lsr #32
539.endif
540.inst	0x04bc018c	//add z12.s,z12.s,z28.s
541.if	mixin == 1
542	add	x9,x9,x10,lsl #32  // pack
543.endif
544.if	mixin == 1
545	ldp	x8,x10,[x1],#16
546.endif
547.if	mixin == 1
548	add	w11,w11,w25
549.endif
550.inst	0x04bd0021	//add z1.s,z1.s,z29.s
551.if	mixin == 1
552	add	x12,x12,x25,lsr #32
553.endif
554.inst	0x04be00a5	//add z5.s,z5.s,z30.s
555.if	mixin == 1
556	add	x11,x11,x12,lsl #32  // pack
557.endif
558.if	mixin == 1
559	add	w13,w13,w26
560.endif
561.inst	0x04b50129	//add z9.s,z9.s,z21.s
562.if	mixin == 1
563	add	x14,x14,x26,lsr #32
564.endif
565.inst	0x04b601ad	//add z13.s,z13.s,z22.s
566.if	mixin == 1
567	add	x13,x13,x14,lsl #32  // pack
568.endif
569.if	mixin == 1
570	ldp	x12,x14,[x1],#16
571.endif
572.if	mixin == 1
573	add	w15,w15,w27
574.endif
575.inst	0x04b70042	//add z2.s,z2.s,z23.s
576.if	mixin == 1
577	add	x16,x16,x27,lsr #32
578.endif
579.inst	0x04b800c6	//add z6.s,z6.s,z24.s
580.if	mixin == 1
581	add	x15,x15,x16,lsl #32  // pack
582.endif
583.if	mixin == 1
584	add	w17,w17,w28
585.endif
586.inst	0x04b1014a	//add z10.s,z10.s,z17.s
587.if	mixin == 1
588	add	x18,x18,x28,lsr #32
589.endif
590.inst	0x04b201ce	//add z14.s,z14.s,z18.s
591.if	mixin == 1
592	add	x17,x17,x18,lsl #32  // pack
593.endif
594.if	mixin == 1
595	ldp	x16,x18,[x1],#16
596.endif
597.if	mixin == 1
598	add	w19,w19,w29
599.endif
600.inst	0x04b00063	//add z3.s,z3.s,z16.s
601.if	mixin == 1
602	add	x20,x20,x29,lsr #32
603.endif
604.inst	0x04b300e7	//add z7.s,z7.s,z19.s
605.if	mixin == 1
606	add	x19,x19,x20,lsl #32  // pack
607.endif
608.if	mixin == 1
609	add	w21,w21,w30
610.endif
611.inst	0x04b4016b	//add z11.s,z11.s,z20.s
612.if	mixin == 1
613	add	x22,x22,x30,lsr #32
614.endif
615.inst	0x04bf01ef	//add z15.s,z15.s,z31.s
616.if	mixin == 1
617	add	x21,x21,x22,lsl #32  // pack
618.endif
619.if	mixin == 1
620	ldp	x20,x22,[x1],#16
621.endif
622#ifdef	__AARCH64EB__
623	rev	x7,x7
624.inst	0x05a48000	//revb z0.s,p0/m,z0.s
625.inst	0x05a48084	//revb z4.s,p0/m,z4.s
626	rev	x9,x9
627.inst	0x05a48108	//revb z8.s,p0/m,z8.s
628.inst	0x05a4818c	//revb z12.s,p0/m,z12.s
629	rev	x11,x11
630.inst	0x05a48021	//revb z1.s,p0/m,z1.s
631.inst	0x05a480a5	//revb z5.s,p0/m,z5.s
632	rev	x13,x13
633.inst	0x05a48129	//revb z9.s,p0/m,z9.s
634.inst	0x05a481ad	//revb z13.s,p0/m,z13.s
635	rev	x15,x15
636.inst	0x05a48042	//revb z2.s,p0/m,z2.s
637.inst	0x05a480c6	//revb z6.s,p0/m,z6.s
638	rev	x17,x17
639.inst	0x05a4814a	//revb z10.s,p0/m,z10.s
640.inst	0x05a481ce	//revb z14.s,p0/m,z14.s
641	rev	x19,x19
642.inst	0x05a48063	//revb z3.s,p0/m,z3.s
643.inst	0x05a480e7	//revb z7.s,p0/m,z7.s
644	rev	x21,x21
645.inst	0x05a4816b	//revb z11.s,p0/m,z11.s
646.inst	0x05a481ef	//revb z15.s,p0/m,z15.s
647#endif
648.if	mixin == 1
649	add	x29,x29,#1
650.endif
651	cmp	x5,4
652	b.ne	200f
653.if	mixin == 1
654	eor	x7,x7,x8
655.endif
656.if	mixin == 1
657	eor	x9,x9,x10
658.endif
659.if	mixin == 1
660	eor	x11,x11,x12
661.endif
662.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
663.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
664.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
665.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s
666
667.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
668.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
669.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
670.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s
671
672.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
673.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
674.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
675.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d
676
677.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
678.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
679.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
680.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
681.if	mixin == 1
682	eor	x13,x13,x14
683.endif
684.if	mixin == 1
685	eor	x15,x15,x16
686.endif
687.if	mixin == 1
688	eor	x17,x17,x18
689.endif
690.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
691.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
692.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
693.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s
694
695.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
696.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
697.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
698.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s
699
700.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
701.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
702.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
703.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d
704
705.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
706.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
707.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
708.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
709.if	mixin == 1
710	eor	x19,x19,x20
711.endif
712.if	mixin == 1
713	eor	x21,x21,x22
714.endif
715	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
716	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
717.inst	0x04b13000	//eor z0.d,z0.d,z17.d
718.inst	0x04b23021	//eor z1.d,z1.d,z18.d
719.inst	0x04b33042	//eor z2.d,z2.d,z19.d
720.inst	0x04b43063	//eor z3.d,z3.d,z20.d
721.inst	0x04b53084	//eor z4.d,z4.d,z21.d
722.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
723.inst	0x04b730c6	//eor z6.d,z6.d,z23.d
724.inst	0x04b830e7	//eor z7.d,z7.d,z24.d
725	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
726	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
727.if	mixin == 1
728	stp	x7,x9,[x0],#16
729.endif
730.inst	0x04b13108	//eor z8.d,z8.d,z17.d
731.inst	0x04b23129	//eor z9.d,z9.d,z18.d
732.if	mixin == 1
733	stp	x11,x13,[x0],#16
734.endif
735.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
736.inst	0x04b4316b	//eor z11.d,z11.d,z20.d
737.if	mixin == 1
738	stp	x15,x17,[x0],#16
739.endif
740.inst	0x04b5318c	//eor z12.d,z12.d,z21.d
741.inst	0x04b631ad	//eor z13.d,z13.d,z22.d
742.if	mixin == 1
743	stp	x19,x21,[x0],#16
744.endif
745.inst	0x04b731ce	//eor z14.d,z14.d,z23.d
746.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
747	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
748	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
749	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
750	st1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
751	b	210f
752200:
753.inst	0x05a16011	//zip1 z17.s,z0.s,z1.s
754.inst	0x05a16412	//zip2 z18.s,z0.s,z1.s
755.inst	0x05a36053	//zip1 z19.s,z2.s,z3.s
756.inst	0x05a36454	//zip2 z20.s,z2.s,z3.s
757
758.inst	0x05a56095	//zip1 z21.s,z4.s,z5.s
759.inst	0x05a56496	//zip2 z22.s,z4.s,z5.s
760.inst	0x05a760d7	//zip1 z23.s,z6.s,z7.s
761.inst	0x05a764d8	//zip2 z24.s,z6.s,z7.s
762
763.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
764.inst	0x05f36621	//zip2 z1.d,z17.d,z19.d
765.inst	0x05f46242	//zip1 z2.d,z18.d,z20.d
766.inst	0x05f46643	//zip2 z3.d,z18.d,z20.d
767
768.inst	0x05f762a4	//zip1 z4.d,z21.d,z23.d
769.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
770.inst	0x05f862c6	//zip1 z6.d,z22.d,z24.d
771.inst	0x05f866c7	//zip2 z7.d,z22.d,z24.d
772.if	mixin == 1
773	eor	x7,x7,x8
774.endif
775.if	mixin == 1
776	eor	x9,x9,x10
777.endif
778.inst	0x05a96111	//zip1 z17.s,z8.s,z9.s
779.inst	0x05a96512	//zip2 z18.s,z8.s,z9.s
780.inst	0x05ab6153	//zip1 z19.s,z10.s,z11.s
781.inst	0x05ab6554	//zip2 z20.s,z10.s,z11.s
782
783.inst	0x05ad6195	//zip1 z21.s,z12.s,z13.s
784.inst	0x05ad6596	//zip2 z22.s,z12.s,z13.s
785.inst	0x05af61d7	//zip1 z23.s,z14.s,z15.s
786.inst	0x05af65d8	//zip2 z24.s,z14.s,z15.s
787
788.inst	0x05f36228	//zip1 z8.d,z17.d,z19.d
789.inst	0x05f36629	//zip2 z9.d,z17.d,z19.d
790.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
791.inst	0x05f4664b	//zip2 z11.d,z18.d,z20.d
792
793.inst	0x05f762ac	//zip1 z12.d,z21.d,z23.d
794.inst	0x05f766ad	//zip2 z13.d,z21.d,z23.d
795.inst	0x05f862ce	//zip1 z14.d,z22.d,z24.d
796.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
797.if	mixin == 1
798	eor	x11,x11,x12
799.endif
800.if	mixin == 1
801	eor	x13,x13,x14
802.endif
803.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
804.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
805.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
806.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s
807
808.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
809.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
810.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
811.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s
812
813.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
814.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
815.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
816.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d
817
818.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
819.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
820.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
821.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
822.if	mixin == 1
823	eor	x15,x15,x16
824.endif
825.if	mixin == 1
826	eor	x17,x17,x18
827.endif
828.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
829.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
830.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
831.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s
832
833.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
834.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
835.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
836.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s
837
838.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
839.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
840.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
841.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d
842
843.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
844.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
845.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
846.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
847.if	mixin == 1
848	eor	x19,x19,x20
849.endif
850.if	mixin == 1
851	eor	x21,x21,x22
852.endif
853.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
854.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
855.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
856.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
857.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
858.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
859.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
860.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
861.inst	0x04215101	//addvl x1,x1,8
862.inst	0x04b13000	//eor z0.d,z0.d,z17.d
863.inst	0x04b23084	//eor z4.d,z4.d,z18.d
864.inst	0x04b33108	//eor z8.d,z8.d,z19.d
865.inst	0x04b4318c	//eor z12.d,z12.d,z20.d
866.inst	0x04b53021	//eor z1.d,z1.d,z21.d
867.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
868.inst	0x04b73129	//eor z9.d,z9.d,z23.d
869.inst	0x04b831ad	//eor z13.d,z13.d,z24.d
870.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
871.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
872.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
873.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
874.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
875.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
876.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
877.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
878.inst	0x04215101	//addvl x1,x1,8
879.if	mixin == 1
880	stp	x7,x9,[x0],#16
881.endif
882.inst	0x04b13042	//eor z2.d,z2.d,z17.d
883.inst	0x04b230c6	//eor z6.d,z6.d,z18.d
884.if	mixin == 1
885	stp	x11,x13,[x0],#16
886.endif
887.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
888.inst	0x04b431ce	//eor z14.d,z14.d,z20.d
889.if	mixin == 1
890	stp	x15,x17,[x0],#16
891.endif
892.inst	0x04b53063	//eor z3.d,z3.d,z21.d
893.inst	0x04b630e7	//eor z7.d,z7.d,z22.d
894.if	mixin == 1
895	stp	x19,x21,[x0],#16
896.endif
897.inst	0x04b7316b	//eor z11.d,z11.d,z23.d
898.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
899.inst	0xe540e000	//st1w {z0.s},p0,[x0,#0,MUL VL]
900.inst	0xe541e004	//st1w {z4.s},p0,[x0,#1,MUL VL]
901.inst	0xe542e008	//st1w {z8.s},p0,[x0,#2,MUL VL]
902.inst	0xe543e00c	//st1w {z12.s},p0,[x0,#3,MUL VL]
903.inst	0xe544e001	//st1w {z1.s},p0,[x0,#4,MUL VL]
904.inst	0xe545e005	//st1w {z5.s},p0,[x0,#5,MUL VL]
905.inst	0xe546e009	//st1w {z9.s},p0,[x0,#6,MUL VL]
906.inst	0xe547e00d	//st1w {z13.s},p0,[x0,#7,MUL VL]
907.inst	0x04205100	//addvl x0,x0,8
908.inst	0xe540e002	//st1w {z2.s},p0,[x0,#0,MUL VL]
909.inst	0xe541e006	//st1w {z6.s},p0,[x0,#1,MUL VL]
910.inst	0xe542e00a	//st1w {z10.s},p0,[x0,#2,MUL VL]
911.inst	0xe543e00e	//st1w {z14.s},p0,[x0,#3,MUL VL]
912.inst	0xe544e003	//st1w {z3.s},p0,[x0,#4,MUL VL]
913.inst	0xe545e007	//st1w {z7.s},p0,[x0,#5,MUL VL]
914.inst	0xe546e00b	//st1w {z11.s},p0,[x0,#6,MUL VL]
915.inst	0xe547e00f	//st1w {z15.s},p0,[x0,#7,MUL VL]
916.inst	0x04205100	//addvl x0,x0,8
917210:
918.inst	0x04b0e3fd	//incw x29, ALL, MUL #1
919	subs	x2,x2,64
920	b.gt	100b
921	b	110f
922101:
923	mixin=0
924	lsr	x8,x23,#32
925.inst	0x05a03ae0	//dup z0.s,w23
926.inst	0x05a03af9	//dup z25.s,w23
927.if	mixin == 1
928	mov	w7,w23
929.endif
930.inst	0x05a03904	//dup z4.s,w8
931.inst	0x05a0391a	//dup z26.s,w8
932	lsr	x10,x24,#32
933.inst	0x05a03b08	//dup z8.s,w24
934.inst	0x05a03b1b	//dup z27.s,w24
935.if	mixin == 1
936	mov	w9,w24
937.endif
938.inst	0x05a0394c	//dup z12.s,w10
939.inst	0x05a0395c	//dup z28.s,w10
940	lsr	x12,x25,#32
941.inst	0x05a03b21	//dup z1.s,w25
942.inst	0x05a03b3d	//dup z29.s,w25
943.if	mixin == 1
944	mov	w11,w25
945.endif
946.inst	0x05a03985	//dup z5.s,w12
947.inst	0x05a0399e	//dup z30.s,w12
948	lsr	x14,x26,#32
949.inst	0x05a03b49	//dup z9.s,w26
950.inst	0x05a03b55	//dup z21.s,w26
951.if	mixin == 1
952	mov	w13,w26
953.endif
954.inst	0x05a039cd	//dup z13.s,w14
955.inst	0x05a039d6	//dup z22.s,w14
956	lsr	x16,x27,#32
957.inst	0x05a03b62	//dup z2.s,w27
958.inst	0x05a03b77	//dup z23.s,w27
959.if	mixin == 1
960	mov	w15,w27
961.endif
962.inst	0x05a03a06	//dup z6.s,w16
963.inst	0x05a03a18	//dup z24.s,w16
964	lsr	x18,x28,#32
965.inst	0x05a03b8a	//dup z10.s,w28
966.inst	0x05a03b91	//dup z17.s,w28
967.if	mixin == 1
968	mov	w17,w28
969.endif
970.inst	0x05a03a4e	//dup z14.s,w18
971.inst	0x05a03a52	//dup z18.s,w18
972	lsr	x22,x30,#32
973.inst	0x05a03bcb	//dup z11.s,w30
974.inst	0x05a03bd4	//dup z20.s,w30
975.if	mixin == 1
976	mov	w21,w30
977.endif
978.inst	0x05a03acf	//dup z15.s,w22
979.inst	0x05a03adf	//dup z31.s,w22
980.if	mixin == 1
981	add	w20,w29,#1
982	mov	w19,w29
983.inst	0x04a14690	//index z16.s,w20,1
984.inst	0x04a14683	//index z3.s,w20,1
985.else
986.inst	0x04a147b0	//index z16.s,w29,1
987.inst	0x04a147a3	//index z3.s,w29,1
988.endif
989	lsr	x20,x29,#32
990.inst	0x05a03a87	//dup z7.s,w20
991.inst	0x05a03a93	//dup z19.s,w20
992	mov	x6,#10
99310:
994.align	5
995.inst	0x04a10000	//add z0.s,z0.s,z1.s
996.if	mixin == 1
997	add	w7,w7,w11
998.endif
999.inst	0x04a50084	//add z4.s,z4.s,z5.s
1000.if	mixin == 1
1001	add	w8,w8,w12
1002.endif
1003.inst	0x04a90108	//add z8.s,z8.s,z9.s
1004.if	mixin == 1
1005	add	w9,w9,w13
1006.endif
1007.inst	0x04ad018c	//add z12.s,z12.s,z13.s
1008.if	mixin == 1
1009	add	w10,w10,w14
1010.endif
1011.if	mixin == 1
1012	eor	w19,w19,w7
1013.endif
1014.inst	0x04703403	//xar z3.s,z3.s,z0.s,16
1015.if	mixin == 1
1016	ror	w19,w19,16
1017.endif
1018.if	mixin == 1
1019	eor	w20,w20,w8
1020.endif
1021.inst	0x04703487	//xar z7.s,z7.s,z4.s,16
1022.if	mixin == 1
1023	ror	w20,w20,16
1024.endif
1025.if	mixin == 1
1026	eor	w21,w21,w9
1027.endif
1028.inst	0x0470350b	//xar z11.s,z11.s,z8.s,16
1029.if	mixin == 1
1030	ror	w21,w21,16
1031.endif
1032.if	mixin == 1
1033	eor	w22,w22,w10
1034.endif
1035.inst	0x0470358f	//xar z15.s,z15.s,z12.s,16
1036.if	mixin == 1
1037	ror	w22,w22,16
1038.endif
1039.inst	0x04a30042	//add z2.s,z2.s,z3.s
1040.if	mixin == 1
1041	add	w15,w15,w19
1042.endif
1043.inst	0x04a700c6	//add z6.s,z6.s,z7.s
1044.if	mixin == 1
1045	add	w16,w16,w20
1046.endif
1047.inst	0x04ab014a	//add z10.s,z10.s,z11.s
1048.if	mixin == 1
1049	add	w17,w17,w21
1050.endif
1051.inst	0x04af01ce	//add z14.s,z14.s,z15.s
1052.if	mixin == 1
1053	add	w18,w18,w22
1054.endif
1055.if	mixin == 1
1056	eor	w11,w11,w15
1057.endif
1058.inst	0x046c3441	//xar z1.s,z1.s,z2.s,20
1059.if	mixin == 1
1060	ror	w11,w11,20
1061.endif
1062.if	mixin == 1
1063	eor	w12,w12,w16
1064.endif
1065.inst	0x046c34c5	//xar z5.s,z5.s,z6.s,20
1066.if	mixin == 1
1067	ror	w12,w12,20
1068.endif
1069.if	mixin == 1
1070	eor	w13,w13,w17
1071.endif
1072.inst	0x046c3549	//xar z9.s,z9.s,z10.s,20
1073.if	mixin == 1
1074	ror	w13,w13,20
1075.endif
1076.if	mixin == 1
1077	eor	w14,w14,w18
1078.endif
1079.inst	0x046c35cd	//xar z13.s,z13.s,z14.s,20
1080.if	mixin == 1
1081	ror	w14,w14,20
1082.endif
1083.inst	0x04a10000	//add z0.s,z0.s,z1.s
1084.if	mixin == 1
1085	add	w7,w7,w11
1086.endif
1087.inst	0x04a50084	//add z4.s,z4.s,z5.s
1088.if	mixin == 1
1089	add	w8,w8,w12
1090.endif
1091.inst	0x04a90108	//add z8.s,z8.s,z9.s
1092.if	mixin == 1
1093	add	w9,w9,w13
1094.endif
1095.inst	0x04ad018c	//add z12.s,z12.s,z13.s
1096.if	mixin == 1
1097	add	w10,w10,w14
1098.endif
1099.if	mixin == 1
1100	eor	w19,w19,w7
1101.endif
1102.inst	0x04683403	//xar z3.s,z3.s,z0.s,24
1103.if	mixin == 1
1104	ror	w19,w19,24
1105.endif
1106.if	mixin == 1
1107	eor	w20,w20,w8
1108.endif
1109.inst	0x04683487	//xar z7.s,z7.s,z4.s,24
1110.if	mixin == 1
1111	ror	w20,w20,24
1112.endif
1113.if	mixin == 1
1114	eor	w21,w21,w9
1115.endif
1116.inst	0x0468350b	//xar z11.s,z11.s,z8.s,24
1117.if	mixin == 1
1118	ror	w21,w21,24
1119.endif
1120.if	mixin == 1
1121	eor	w22,w22,w10
1122.endif
1123.inst	0x0468358f	//xar z15.s,z15.s,z12.s,24
1124.if	mixin == 1
1125	ror	w22,w22,24
1126.endif
1127.inst	0x04a30042	//add z2.s,z2.s,z3.s
1128.if	mixin == 1
1129	add	w15,w15,w19
1130.endif
1131.inst	0x04a700c6	//add z6.s,z6.s,z7.s
1132.if	mixin == 1
1133	add	w16,w16,w20
1134.endif
1135.inst	0x04ab014a	//add z10.s,z10.s,z11.s
1136.if	mixin == 1
1137	add	w17,w17,w21
1138.endif
1139.inst	0x04af01ce	//add z14.s,z14.s,z15.s
1140.if	mixin == 1
1141	add	w18,w18,w22
1142.endif
1143.if	mixin == 1
1144	eor	w11,w11,w15
1145.endif
1146.inst	0x04673441	//xar z1.s,z1.s,z2.s,25
1147.if	mixin == 1
1148	ror	w11,w11,25
1149.endif
1150.if	mixin == 1
1151	eor	w12,w12,w16
1152.endif
1153.inst	0x046734c5	//xar z5.s,z5.s,z6.s,25
1154.if	mixin == 1
1155	ror	w12,w12,25
1156.endif
1157.if	mixin == 1
1158	eor	w13,w13,w17
1159.endif
1160.inst	0x04673549	//xar z9.s,z9.s,z10.s,25
1161.if	mixin == 1
1162	ror	w13,w13,25
1163.endif
1164.if	mixin == 1
1165	eor	w14,w14,w18
1166.endif
1167.inst	0x046735cd	//xar z13.s,z13.s,z14.s,25
1168.if	mixin == 1
1169	ror	w14,w14,25
1170.endif
1171.inst	0x04a50000	//add z0.s,z0.s,z5.s
1172.if	mixin == 1
1173	add	w7,w7,w12
1174.endif
1175.inst	0x04a90084	//add z4.s,z4.s,z9.s
1176.if	mixin == 1
1177	add	w8,w8,w13
1178.endif
1179.inst	0x04ad0108	//add z8.s,z8.s,z13.s
1180.if	mixin == 1
1181	add	w9,w9,w14
1182.endif
1183.inst	0x04a1018c	//add z12.s,z12.s,z1.s
1184.if	mixin == 1
1185	add	w10,w10,w11
1186.endif
1187.if	mixin == 1
1188	eor	w22,w22,w7
1189.endif
1190.inst	0x0470340f	//xar z15.s,z15.s,z0.s,16
1191.if	mixin == 1
1192	ror	w22,w22,16
1193.endif
1194.if	mixin == 1
1195	eor	w19,w19,w8
1196.endif
1197.inst	0x04703483	//xar z3.s,z3.s,z4.s,16
1198.if	mixin == 1
1199	ror	w19,w19,16
1200.endif
1201.if	mixin == 1
1202	eor	w20,w20,w9
1203.endif
1204.inst	0x04703507	//xar z7.s,z7.s,z8.s,16
1205.if	mixin == 1
1206	ror	w20,w20,16
1207.endif
1208.if	mixin == 1
1209	eor	w21,w21,w10
1210.endif
1211.inst	0x0470358b	//xar z11.s,z11.s,z12.s,16
1212.if	mixin == 1
1213	ror	w21,w21,16
1214.endif
1215.inst	0x04af014a	//add z10.s,z10.s,z15.s
1216.if	mixin == 1
1217	add	w17,w17,w22
1218.endif
1219.inst	0x04a301ce	//add z14.s,z14.s,z3.s
1220.if	mixin == 1
1221	add	w18,w18,w19
1222.endif
1223.inst	0x04a70042	//add z2.s,z2.s,z7.s
1224.if	mixin == 1
1225	add	w15,w15,w20
1226.endif
1227.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
1228.if	mixin == 1
1229	add	w16,w16,w21
1230.endif
1231.if	mixin == 1
1232	eor	w12,w12,w17
1233.endif
1234.inst	0x046c3545	//xar z5.s,z5.s,z10.s,20
1235.if	mixin == 1
1236	ror	w12,w12,20
1237.endif
1238.if	mixin == 1
1239	eor	w13,w13,w18
1240.endif
1241.inst	0x046c35c9	//xar z9.s,z9.s,z14.s,20
1242.if	mixin == 1
1243	ror	w13,w13,20
1244.endif
1245.if	mixin == 1
1246	eor	w14,w14,w15
1247.endif
1248.inst	0x046c344d	//xar z13.s,z13.s,z2.s,20
1249.if	mixin == 1
1250	ror	w14,w14,20
1251.endif
1252.if	mixin == 1
1253	eor	w11,w11,w16
1254.endif
1255.inst	0x046c34c1	//xar z1.s,z1.s,z6.s,20
1256.if	mixin == 1
1257	ror	w11,w11,20
1258.endif
1259.inst	0x04a50000	//add z0.s,z0.s,z5.s
1260.if	mixin == 1
1261	add	w7,w7,w12
1262.endif
1263.inst	0x04a90084	//add z4.s,z4.s,z9.s
1264.if	mixin == 1
1265	add	w8,w8,w13
1266.endif
1267.inst	0x04ad0108	//add z8.s,z8.s,z13.s
1268.if	mixin == 1
1269	add	w9,w9,w14
1270.endif
1271.inst	0x04a1018c	//add z12.s,z12.s,z1.s
1272.if	mixin == 1
1273	add	w10,w10,w11
1274.endif
1275.if	mixin == 1
1276	eor	w22,w22,w7
1277.endif
1278.inst	0x0468340f	//xar z15.s,z15.s,z0.s,24
1279.if	mixin == 1
1280	ror	w22,w22,24
1281.endif
1282.if	mixin == 1
1283	eor	w19,w19,w8
1284.endif
1285.inst	0x04683483	//xar z3.s,z3.s,z4.s,24
1286.if	mixin == 1
1287	ror	w19,w19,24
1288.endif
1289.if	mixin == 1
1290	eor	w20,w20,w9
1291.endif
1292.inst	0x04683507	//xar z7.s,z7.s,z8.s,24
1293.if	mixin == 1
1294	ror	w20,w20,24
1295.endif
1296.if	mixin == 1
1297	eor	w21,w21,w10
1298.endif
1299.inst	0x0468358b	//xar z11.s,z11.s,z12.s,24
1300.if	mixin == 1
1301	ror	w21,w21,24
1302.endif
1303.inst	0x04af014a	//add z10.s,z10.s,z15.s
1304.if	mixin == 1
1305	add	w17,w17,w22
1306.endif
1307.inst	0x04a301ce	//add z14.s,z14.s,z3.s
1308.if	mixin == 1
1309	add	w18,w18,w19
1310.endif
1311.inst	0x04a70042	//add z2.s,z2.s,z7.s
1312.if	mixin == 1
1313	add	w15,w15,w20
1314.endif
1315.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
1316.if	mixin == 1
1317	add	w16,w16,w21
1318.endif
1319.if	mixin == 1
1320	eor	w12,w12,w17
1321.endif
1322.inst	0x04673545	//xar z5.s,z5.s,z10.s,25
1323.if	mixin == 1
1324	ror	w12,w12,25
1325.endif
1326.if	mixin == 1
1327	eor	w13,w13,w18
1328.endif
1329.inst	0x046735c9	//xar z9.s,z9.s,z14.s,25
1330.if	mixin == 1
1331	ror	w13,w13,25
1332.endif
1333.if	mixin == 1
1334	eor	w14,w14,w15
1335.endif
1336.inst	0x0467344d	//xar z13.s,z13.s,z2.s,25
1337.if	mixin == 1
1338	ror	w14,w14,25
1339.endif
1340.if	mixin == 1
1341	eor	w11,w11,w16
1342.endif
1343.inst	0x046734c1	//xar z1.s,z1.s,z6.s,25
1344.if	mixin == 1
1345	ror	w11,w11,25
1346.endif
1347	sub	x6,x6,1
1348	cbnz	x6,10b
1349.if	mixin == 1
1350	add	w7,w7,w23
1351.endif
1352.inst	0x04b90000	//add z0.s,z0.s,z25.s
1353.if	mixin == 1
1354	add	x8,x8,x23,lsr #32
1355.endif
1356.inst	0x04ba0084	//add z4.s,z4.s,z26.s
1357.if	mixin == 1
1358	add	x7,x7,x8,lsl #32  // pack
1359.endif
1360.if	mixin == 1
1361	add	w9,w9,w24
1362.endif
1363.inst	0x04bb0108	//add z8.s,z8.s,z27.s
1364.if	mixin == 1
1365	add	x10,x10,x24,lsr #32
1366.endif
1367.inst	0x04bc018c	//add z12.s,z12.s,z28.s
1368.if	mixin == 1
1369	add	x9,x9,x10,lsl #32  // pack
1370.endif
1371.if	mixin == 1
1372	ldp	x8,x10,[x1],#16
1373.endif
1374.if	mixin == 1
1375	add	w11,w11,w25
1376.endif
1377.inst	0x04bd0021	//add z1.s,z1.s,z29.s
1378.if	mixin == 1
1379	add	x12,x12,x25,lsr #32
1380.endif
1381.inst	0x04be00a5	//add z5.s,z5.s,z30.s
1382.if	mixin == 1
1383	add	x11,x11,x12,lsl #32  // pack
1384.endif
1385.if	mixin == 1
1386	add	w13,w13,w26
1387.endif
1388.inst	0x04b50129	//add z9.s,z9.s,z21.s
1389.if	mixin == 1
1390	add	x14,x14,x26,lsr #32
1391.endif
1392.inst	0x04b601ad	//add z13.s,z13.s,z22.s
1393.if	mixin == 1
1394	add	x13,x13,x14,lsl #32  // pack
1395.endif
1396.if	mixin == 1
1397	ldp	x12,x14,[x1],#16
1398.endif
1399.if	mixin == 1
1400	add	w15,w15,w27
1401.endif
1402.inst	0x04b70042	//add z2.s,z2.s,z23.s
1403.if	mixin == 1
1404	add	x16,x16,x27,lsr #32
1405.endif
1406.inst	0x04b800c6	//add z6.s,z6.s,z24.s
1407.if	mixin == 1
1408	add	x15,x15,x16,lsl #32  // pack
1409.endif
1410.if	mixin == 1
1411	add	w17,w17,w28
1412.endif
1413.inst	0x04b1014a	//add z10.s,z10.s,z17.s
1414.if	mixin == 1
1415	add	x18,x18,x28,lsr #32
1416.endif
1417.inst	0x04b201ce	//add z14.s,z14.s,z18.s
1418.if	mixin == 1
1419	add	x17,x17,x18,lsl #32  // pack
1420.endif
1421.if	mixin == 1
1422	ldp	x16,x18,[x1],#16
1423.endif
1424.if	mixin == 1
1425	add	w19,w19,w29
1426.endif
1427.inst	0x04b00063	//add z3.s,z3.s,z16.s
1428.if	mixin == 1
1429	add	x20,x20,x29,lsr #32
1430.endif
1431.inst	0x04b300e7	//add z7.s,z7.s,z19.s
1432.if	mixin == 1
1433	add	x19,x19,x20,lsl #32  // pack
1434.endif
1435.if	mixin == 1
1436	add	w21,w21,w30
1437.endif
1438.inst	0x04b4016b	//add z11.s,z11.s,z20.s
1439.if	mixin == 1
1440	add	x22,x22,x30,lsr #32
1441.endif
1442.inst	0x04bf01ef	//add z15.s,z15.s,z31.s
1443.if	mixin == 1
1444	add	x21,x21,x22,lsl #32  // pack
1445.endif
1446.if	mixin == 1
1447	ldp	x20,x22,[x1],#16
1448.endif
1449#ifdef	__AARCH64EB__
1450	rev	x7,x7
1451.inst	0x05a48000	//revb z0.s,p0/m,z0.s
1452.inst	0x05a48084	//revb z4.s,p0/m,z4.s
1453	rev	x9,x9
1454.inst	0x05a48108	//revb z8.s,p0/m,z8.s
1455.inst	0x05a4818c	//revb z12.s,p0/m,z12.s
1456	rev	x11,x11
1457.inst	0x05a48021	//revb z1.s,p0/m,z1.s
1458.inst	0x05a480a5	//revb z5.s,p0/m,z5.s
1459	rev	x13,x13
1460.inst	0x05a48129	//revb z9.s,p0/m,z9.s
1461.inst	0x05a481ad	//revb z13.s,p0/m,z13.s
1462	rev	x15,x15
1463.inst	0x05a48042	//revb z2.s,p0/m,z2.s
1464.inst	0x05a480c6	//revb z6.s,p0/m,z6.s
1465	rev	x17,x17
1466.inst	0x05a4814a	//revb z10.s,p0/m,z10.s
1467.inst	0x05a481ce	//revb z14.s,p0/m,z14.s
1468	rev	x19,x19
1469.inst	0x05a48063	//revb z3.s,p0/m,z3.s
1470.inst	0x05a480e7	//revb z7.s,p0/m,z7.s
1471	rev	x21,x21
1472.inst	0x05a4816b	//revb z11.s,p0/m,z11.s
1473.inst	0x05a481ef	//revb z15.s,p0/m,z15.s
1474#endif
1475.if	mixin == 1
1476	add	x29,x29,#1
1477.endif
1478	cmp	x5,4
1479	b.ne	200f
1480.if	mixin == 1
1481	eor	x7,x7,x8
1482.endif
1483.if	mixin == 1
1484	eor	x9,x9,x10
1485.endif
1486.if	mixin == 1
1487	eor	x11,x11,x12
1488.endif
1489.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
1490.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
1491.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
1492.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s
1493
1494.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
1495.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
1496.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
1497.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s
1498
1499.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
1500.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
1501.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
1502.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d
1503
1504.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
1505.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
1506.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
1507.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
1508.if	mixin == 1
1509	eor	x13,x13,x14
1510.endif
1511.if	mixin == 1
1512	eor	x15,x15,x16
1513.endif
1514.if	mixin == 1
1515	eor	x17,x17,x18
1516.endif
1517.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
1518.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
1519.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
1520.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s
1521
1522.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
1523.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
1524.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
1525.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s
1526
1527.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
1528.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
1529.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
1530.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d
1531
1532.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
1533.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
1534.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
1535.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
1536.if	mixin == 1
1537	eor	x19,x19,x20
1538.endif
1539.if	mixin == 1
1540	eor	x21,x21,x22
1541.endif
1542	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
1543	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
1544.inst	0x04b13000	//eor z0.d,z0.d,z17.d
1545.inst	0x04b23021	//eor z1.d,z1.d,z18.d
1546.inst	0x04b33042	//eor z2.d,z2.d,z19.d
1547.inst	0x04b43063	//eor z3.d,z3.d,z20.d
1548.inst	0x04b53084	//eor z4.d,z4.d,z21.d
1549.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
1550.inst	0x04b730c6	//eor z6.d,z6.d,z23.d
1551.inst	0x04b830e7	//eor z7.d,z7.d,z24.d
1552	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
1553	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
1554.if	mixin == 1
1555	stp	x7,x9,[x0],#16
1556.endif
1557.inst	0x04b13108	//eor z8.d,z8.d,z17.d
1558.inst	0x04b23129	//eor z9.d,z9.d,z18.d
1559.if	mixin == 1
1560	stp	x11,x13,[x0],#16
1561.endif
1562.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
1563.inst	0x04b4316b	//eor z11.d,z11.d,z20.d
1564.if	mixin == 1
1565	stp	x15,x17,[x0],#16
1566.endif
1567.inst	0x04b5318c	//eor z12.d,z12.d,z21.d
1568.inst	0x04b631ad	//eor z13.d,z13.d,z22.d
1569.if	mixin == 1
1570	stp	x19,x21,[x0],#16
1571.endif
1572.inst	0x04b731ce	//eor z14.d,z14.d,z23.d
1573.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
1574	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
1575	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
1576	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
1577	st1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
1578	b	210f
1579200:
1580.inst	0x05a16011	//zip1 z17.s,z0.s,z1.s
1581.inst	0x05a16412	//zip2 z18.s,z0.s,z1.s
1582.inst	0x05a36053	//zip1 z19.s,z2.s,z3.s
1583.inst	0x05a36454	//zip2 z20.s,z2.s,z3.s
1584
1585.inst	0x05a56095	//zip1 z21.s,z4.s,z5.s
1586.inst	0x05a56496	//zip2 z22.s,z4.s,z5.s
1587.inst	0x05a760d7	//zip1 z23.s,z6.s,z7.s
1588.inst	0x05a764d8	//zip2 z24.s,z6.s,z7.s
1589
1590.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
1591.inst	0x05f36621	//zip2 z1.d,z17.d,z19.d
1592.inst	0x05f46242	//zip1 z2.d,z18.d,z20.d
1593.inst	0x05f46643	//zip2 z3.d,z18.d,z20.d
1594
1595.inst	0x05f762a4	//zip1 z4.d,z21.d,z23.d
1596.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
1597.inst	0x05f862c6	//zip1 z6.d,z22.d,z24.d
1598.inst	0x05f866c7	//zip2 z7.d,z22.d,z24.d
1599.if	mixin == 1
1600	eor	x7,x7,x8
1601.endif
1602.if	mixin == 1
1603	eor	x9,x9,x10
1604.endif
1605.inst	0x05a96111	//zip1 z17.s,z8.s,z9.s
1606.inst	0x05a96512	//zip2 z18.s,z8.s,z9.s
1607.inst	0x05ab6153	//zip1 z19.s,z10.s,z11.s
1608.inst	0x05ab6554	//zip2 z20.s,z10.s,z11.s
1609
1610.inst	0x05ad6195	//zip1 z21.s,z12.s,z13.s
1611.inst	0x05ad6596	//zip2 z22.s,z12.s,z13.s
1612.inst	0x05af61d7	//zip1 z23.s,z14.s,z15.s
1613.inst	0x05af65d8	//zip2 z24.s,z14.s,z15.s
1614
1615.inst	0x05f36228	//zip1 z8.d,z17.d,z19.d
1616.inst	0x05f36629	//zip2 z9.d,z17.d,z19.d
1617.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
1618.inst	0x05f4664b	//zip2 z11.d,z18.d,z20.d
1619
1620.inst	0x05f762ac	//zip1 z12.d,z21.d,z23.d
1621.inst	0x05f766ad	//zip2 z13.d,z21.d,z23.d
1622.inst	0x05f862ce	//zip1 z14.d,z22.d,z24.d
1623.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
1624.if	mixin == 1
1625	eor	x11,x11,x12
1626.endif
1627.if	mixin == 1
1628	eor	x13,x13,x14
1629.endif
1630.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
1631.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
1632.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
1633.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s
1634
1635.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
1636.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
1637.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
1638.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s
1639
1640.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
1641.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
1642.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
1643.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d
1644
1645.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
1646.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
1647.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
1648.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
1649.if	mixin == 1
1650	eor	x15,x15,x16
1651.endif
1652.if	mixin == 1
1653	eor	x17,x17,x18
1654.endif
1655.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
1656.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
1657.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
1658.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s
1659
1660.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
1661.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
1662.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
1663.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s
1664
1665.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
1666.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
1667.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
1668.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d
1669
1670.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
1671.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
1672.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
1673.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
1674.if	mixin == 1
1675	eor	x19,x19,x20
1676.endif
1677.if	mixin == 1
1678	eor	x21,x21,x22
1679.endif
1680.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
1681.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
1682.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
1683.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
1684.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
1685.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
1686.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
1687.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
1688.inst	0x04215101	//addvl x1,x1,8
1689.inst	0x04b13000	//eor z0.d,z0.d,z17.d
1690.inst	0x04b23084	//eor z4.d,z4.d,z18.d
1691.inst	0x04b33108	//eor z8.d,z8.d,z19.d
1692.inst	0x04b4318c	//eor z12.d,z12.d,z20.d
1693.inst	0x04b53021	//eor z1.d,z1.d,z21.d
1694.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
1695.inst	0x04b73129	//eor z9.d,z9.d,z23.d
1696.inst	0x04b831ad	//eor z13.d,z13.d,z24.d
1697.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
1698.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
1699.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
1700.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
1701.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
1702.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
1703.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
1704.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
1705.inst	0x04215101	//addvl x1,x1,8
1706.if	mixin == 1
1707	stp	x7,x9,[x0],#16
1708.endif
1709.inst	0x04b13042	//eor z2.d,z2.d,z17.d
1710.inst	0x04b230c6	//eor z6.d,z6.d,z18.d
1711.if	mixin == 1
1712	stp	x11,x13,[x0],#16
1713.endif
1714.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
1715.inst	0x04b431ce	//eor z14.d,z14.d,z20.d
1716.if	mixin == 1
1717	stp	x15,x17,[x0],#16
1718.endif
1719.inst	0x04b53063	//eor z3.d,z3.d,z21.d
1720.inst	0x04b630e7	//eor z7.d,z7.d,z22.d
1721.if	mixin == 1
1722	stp	x19,x21,[x0],#16
1723.endif
1724.inst	0x04b7316b	//eor z11.d,z11.d,z23.d
1725.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
1726.inst	0xe540e000	//st1w {z0.s},p0,[x0,#0,MUL VL]
1727.inst	0xe541e004	//st1w {z4.s},p0,[x0,#1,MUL VL]
1728.inst	0xe542e008	//st1w {z8.s},p0,[x0,#2,MUL VL]
1729.inst	0xe543e00c	//st1w {z12.s},p0,[x0,#3,MUL VL]
1730.inst	0xe544e001	//st1w {z1.s},p0,[x0,#4,MUL VL]
1731.inst	0xe545e005	//st1w {z5.s},p0,[x0,#5,MUL VL]
1732.inst	0xe546e009	//st1w {z9.s},p0,[x0,#6,MUL VL]
1733.inst	0xe547e00d	//st1w {z13.s},p0,[x0,#7,MUL VL]
1734.inst	0x04205100	//addvl x0,x0,8
1735.inst	0xe540e002	//st1w {z2.s},p0,[x0,#0,MUL VL]
1736.inst	0xe541e006	//st1w {z6.s},p0,[x0,#1,MUL VL]
1737.inst	0xe542e00a	//st1w {z10.s},p0,[x0,#2,MUL VL]
1738.inst	0xe543e00e	//st1w {z14.s},p0,[x0,#3,MUL VL]
1739.inst	0xe544e003	//st1w {z3.s},p0,[x0,#4,MUL VL]
1740.inst	0xe545e007	//st1w {z7.s},p0,[x0,#5,MUL VL]
1741.inst	0xe546e00b	//st1w {z11.s},p0,[x0,#6,MUL VL]
1742.inst	0xe547e00f	//st1w {z15.s},p0,[x0,#7,MUL VL]
1743.inst	0x04205100	//addvl x0,x0,8
1744210:
1745.inst	0x04b0e3fd	//incw x29, ALL, MUL #1
1746110:
1747	b	2f
17481:
1749.align	5
1750100:
1751	subs	x7,x2,x5,lsl #6
1752	b.lt	110f
1753	mov	x2,x7
1754	b.eq	101f
1755	cmp	x2,64
1756	b.lt	101f
1757	mixin=1
1758	lsr	x8,x23,#32
1759.inst	0x05a03ae0	//dup z0.s,w23
1760.inst	0x05a03af9	//dup z25.s,w23
1761.if	mixin == 1
1762	mov	w7,w23
1763.endif
1764.inst	0x05a03904	//dup z4.s,w8
1765.inst	0x05a0391a	//dup z26.s,w8
1766	lsr	x10,x24,#32
1767.inst	0x05a03b08	//dup z8.s,w24
1768.inst	0x05a03b1b	//dup z27.s,w24
1769.if	mixin == 1
1770	mov	w9,w24
1771.endif
1772.inst	0x05a0394c	//dup z12.s,w10
1773.inst	0x05a0395c	//dup z28.s,w10
1774	lsr	x12,x25,#32
1775.inst	0x05a03b21	//dup z1.s,w25
1776.inst	0x05a03b3d	//dup z29.s,w25
1777.if	mixin == 1
1778	mov	w11,w25
1779.endif
1780.inst	0x05a03985	//dup z5.s,w12
1781.inst	0x05a0399e	//dup z30.s,w12
1782	lsr	x14,x26,#32
1783.inst	0x05a03b49	//dup z9.s,w26
1784.inst	0x05a03b55	//dup z21.s,w26
1785.if	mixin == 1
1786	mov	w13,w26
1787.endif
1788.inst	0x05a039cd	//dup z13.s,w14
1789.inst	0x05a039d6	//dup z22.s,w14
1790	lsr	x16,x27,#32
1791.inst	0x05a03b62	//dup z2.s,w27
1792.inst	0x05a03b77	//dup z23.s,w27
1793.if	mixin == 1
1794	mov	w15,w27
1795.endif
1796.inst	0x05a03a06	//dup z6.s,w16
1797.inst	0x05a03a18	//dup z24.s,w16
1798	lsr	x18,x28,#32
1799.inst	0x05a03b8a	//dup z10.s,w28
1800.if	mixin == 1
1801	mov	w17,w28
1802.endif
1803.inst	0x05a03a4e	//dup z14.s,w18
1804	lsr	x22,x30,#32
1805.inst	0x05a03bcb	//dup z11.s,w30
1806.if	mixin == 1
1807	mov	w21,w30
1808.endif
1809.inst	0x05a03acf	//dup z15.s,w22
1810.if	mixin == 1
1811	add	w20,w29,#1
1812	mov	w19,w29
1813.inst	0x04a14690	//index z16.s,w20,1
1814.inst	0x04a14683	//index z3.s,w20,1
1815.else
1816.inst	0x04a147b0	//index z16.s,w29,1
1817.inst	0x04a147a3	//index z3.s,w29,1
1818.endif
1819	lsr	x20,x29,#32
1820.inst	0x05a03a87	//dup z7.s,w20
1821	mov	x6,#10
182210:
1823.align	5
1824.inst	0x04a10000	//add z0.s,z0.s,z1.s
1825.if	mixin == 1
1826	add	w7,w7,w11
1827.endif
1828.inst	0x04a50084	//add z4.s,z4.s,z5.s
1829.if	mixin == 1
1830	add	w8,w8,w12
1831.endif
1832.inst	0x04a90108	//add z8.s,z8.s,z9.s
1833.if	mixin == 1
1834	add	w9,w9,w13
1835.endif
1836.inst	0x04ad018c	//add z12.s,z12.s,z13.s
1837.if	mixin == 1
1838	add	w10,w10,w14
1839.endif
1840.inst	0x04a03063	//eor z3.d,z3.d,z0.d
1841.if	mixin == 1
1842	eor	w19,w19,w7
1843.endif
1844.inst	0x04a430e7	//eor z7.d,z7.d,z4.d
1845.if	mixin == 1
1846	eor	w20,w20,w8
1847.endif
1848.inst	0x04a8316b	//eor z11.d,z11.d,z8.d
1849.if	mixin == 1
1850	eor	w21,w21,w9
1851.endif
1852.inst	0x04ac31ef	//eor z15.d,z15.d,z12.d
1853.if	mixin == 1
1854	eor	w22,w22,w10
1855.endif
1856.inst	0x05a58063	//revh z3.s,p0/m,z3.s
1857.if	mixin == 1
1858	ror	w19,w19,#16
1859.endif
1860.inst	0x05a580e7	//revh z7.s,p0/m,z7.s
1861.if	mixin == 1
1862	ror	w20,w20,#16
1863.endif
1864.inst	0x05a5816b	//revh z11.s,p0/m,z11.s
1865.if	mixin == 1
1866	ror	w21,w21,#16
1867.endif
1868.inst	0x05a581ef	//revh z15.s,p0/m,z15.s
1869.if	mixin == 1
1870	ror	w22,w22,#16
1871.endif
1872.inst	0x04a30042	//add z2.s,z2.s,z3.s
1873.if	mixin == 1
1874	add	w15,w15,w19
1875.endif
1876.inst	0x04a700c6	//add z6.s,z6.s,z7.s
1877.if	mixin == 1
1878	add	w16,w16,w20
1879.endif
1880.inst	0x04ab014a	//add z10.s,z10.s,z11.s
1881.if	mixin == 1
1882	add	w17,w17,w21
1883.endif
1884.inst	0x04af01ce	//add z14.s,z14.s,z15.s
1885.if	mixin == 1
1886	add	w18,w18,w22
1887.endif
1888.inst	0x04a23021	//eor z1.d,z1.d,z2.d
1889.if	mixin == 1
1890	eor	w11,w11,w15
1891.endif
1892.inst	0x04a630a5	//eor z5.d,z5.d,z6.d
1893.if	mixin == 1
1894	eor	w12,w12,w16
1895.endif
1896.inst	0x04aa3129	//eor z9.d,z9.d,z10.d
1897.if	mixin == 1
1898	eor	w13,w13,w17
1899.endif
1900.inst	0x04ae31ad	//eor z13.d,z13.d,z14.d
1901.if	mixin == 1
1902	eor	w14,w14,w18
1903.endif
1904.inst	0x046c9c31	//lsl z17.s,z1.s,12
1905.inst	0x046c9cb2	//lsl z18.s,z5.s,12
1906.inst	0x046c9d33	//lsl z19.s,z9.s,12
1907.inst	0x046c9db4	//lsl z20.s,z13.s,12
1908.inst	0x046c9421	//lsr z1.s,z1.s,20
1909.if	mixin == 1
1910	ror	w11,w11,20
1911.endif
1912.inst	0x046c94a5	//lsr z5.s,z5.s,20
1913.if	mixin == 1
1914	ror	w12,w12,20
1915.endif
1916.inst	0x046c9529	//lsr z9.s,z9.s,20
1917.if	mixin == 1
1918	ror	w13,w13,20
1919.endif
1920.inst	0x046c95ad	//lsr z13.s,z13.s,20
1921.if	mixin == 1
1922	ror	w14,w14,20
1923.endif
1924.inst	0x04713021	//orr z1.d,z1.d,z17.d
1925.inst	0x047230a5	//orr z5.d,z5.d,z18.d
1926.inst	0x04733129	//orr z9.d,z9.d,z19.d
1927.inst	0x047431ad	//orr z13.d,z13.d,z20.d
1928.inst	0x04a10000	//add z0.s,z0.s,z1.s
1929.if	mixin == 1
1930	add	w7,w7,w11
1931.endif
1932.inst	0x04a50084	//add z4.s,z4.s,z5.s
1933.if	mixin == 1
1934	add	w8,w8,w12
1935.endif
1936.inst	0x04a90108	//add z8.s,z8.s,z9.s
1937.if	mixin == 1
1938	add	w9,w9,w13
1939.endif
1940.inst	0x04ad018c	//add z12.s,z12.s,z13.s
1941.if	mixin == 1
1942	add	w10,w10,w14
1943.endif
1944.inst	0x04a03063	//eor z3.d,z3.d,z0.d
1945.if	mixin == 1
1946	eor	w19,w19,w7
1947.endif
1948.inst	0x04a430e7	//eor z7.d,z7.d,z4.d
1949.if	mixin == 1
1950	eor	w20,w20,w8
1951.endif
1952.inst	0x04a8316b	//eor z11.d,z11.d,z8.d
1953.if	mixin == 1
1954	eor	w21,w21,w9
1955.endif
1956.inst	0x04ac31ef	//eor z15.d,z15.d,z12.d
1957.if	mixin == 1
1958	eor	w22,w22,w10
1959.endif
1960.inst	0x053f3063	//tbl z3.b,{z3.b},z31.b
1961.if	mixin == 1
1962	ror	w19,w19,#24
1963.endif
1964.inst	0x053f30e7	//tbl z7.b,{z7.b},z31.b
1965.if	mixin == 1
1966	ror	w20,w20,#24
1967.endif
1968.inst	0x053f316b	//tbl z11.b,{z11.b},z31.b
1969.if	mixin == 1
1970	ror	w21,w21,#24
1971.endif
1972.inst	0x053f31ef	//tbl z15.b,{z15.b},z31.b
1973.if	mixin == 1
1974	ror	w22,w22,#24
1975.endif
1976.inst	0x04a30042	//add z2.s,z2.s,z3.s
1977.if	mixin == 1
1978	add	w15,w15,w19
1979.endif
1980.inst	0x04a700c6	//add z6.s,z6.s,z7.s
1981.if	mixin == 1
1982	add	w16,w16,w20
1983.endif
1984.inst	0x04ab014a	//add z10.s,z10.s,z11.s
1985.if	mixin == 1
1986	add	w17,w17,w21
1987.endif
1988.inst	0x04af01ce	//add z14.s,z14.s,z15.s
1989.if	mixin == 1
1990	add	w18,w18,w22
1991.endif
1992.inst	0x04a23021	//eor z1.d,z1.d,z2.d
1993.if	mixin == 1
1994	eor	w11,w11,w15
1995.endif
1996.inst	0x04a630a5	//eor z5.d,z5.d,z6.d
1997.if	mixin == 1
1998	eor	w12,w12,w16
1999.endif
2000.inst	0x04aa3129	//eor z9.d,z9.d,z10.d
2001.if	mixin == 1
2002	eor	w13,w13,w17
2003.endif
2004.inst	0x04ae31ad	//eor z13.d,z13.d,z14.d
2005.if	mixin == 1
2006	eor	w14,w14,w18
2007.endif
2008.inst	0x04679c31	//lsl z17.s,z1.s,7
2009.inst	0x04679cb2	//lsl z18.s,z5.s,7
2010.inst	0x04679d33	//lsl z19.s,z9.s,7
2011.inst	0x04679db4	//lsl z20.s,z13.s,7
2012.inst	0x04679421	//lsr z1.s,z1.s,25
2013.if	mixin == 1
2014	ror	w11,w11,25
2015.endif
2016.inst	0x046794a5	//lsr z5.s,z5.s,25
2017.if	mixin == 1
2018	ror	w12,w12,25
2019.endif
2020.inst	0x04679529	//lsr z9.s,z9.s,25
2021.if	mixin == 1
2022	ror	w13,w13,25
2023.endif
2024.inst	0x046795ad	//lsr z13.s,z13.s,25
2025.if	mixin == 1
2026	ror	w14,w14,25
2027.endif
2028.inst	0x04713021	//orr z1.d,z1.d,z17.d
2029.inst	0x047230a5	//orr z5.d,z5.d,z18.d
2030.inst	0x04733129	//orr z9.d,z9.d,z19.d
2031.inst	0x047431ad	//orr z13.d,z13.d,z20.d
2032.inst	0x04a50000	//add z0.s,z0.s,z5.s
2033.if	mixin == 1
2034	add	w7,w7,w12
2035.endif
2036.inst	0x04a90084	//add z4.s,z4.s,z9.s
2037.if	mixin == 1
2038	add	w8,w8,w13
2039.endif
2040.inst	0x04ad0108	//add z8.s,z8.s,z13.s
2041.if	mixin == 1
2042	add	w9,w9,w14
2043.endif
2044.inst	0x04a1018c	//add z12.s,z12.s,z1.s
2045.if	mixin == 1
2046	add	w10,w10,w11
2047.endif
2048.inst	0x04a031ef	//eor z15.d,z15.d,z0.d
2049.if	mixin == 1
2050	eor	w22,w22,w7
2051.endif
2052.inst	0x04a43063	//eor z3.d,z3.d,z4.d
2053.if	mixin == 1
2054	eor	w19,w19,w8
2055.endif
2056.inst	0x04a830e7	//eor z7.d,z7.d,z8.d
2057.if	mixin == 1
2058	eor	w20,w20,w9
2059.endif
2060.inst	0x04ac316b	//eor z11.d,z11.d,z12.d
2061.if	mixin == 1
2062	eor	w21,w21,w10
2063.endif
2064.inst	0x05a581ef	//revh z15.s,p0/m,z15.s
2065.if	mixin == 1
2066	ror	w22,w22,#16
2067.endif
2068.inst	0x05a58063	//revh z3.s,p0/m,z3.s
2069.if	mixin == 1
2070	ror	w19,w19,#16
2071.endif
2072.inst	0x05a580e7	//revh z7.s,p0/m,z7.s
2073.if	mixin == 1
2074	ror	w20,w20,#16
2075.endif
2076.inst	0x05a5816b	//revh z11.s,p0/m,z11.s
2077.if	mixin == 1
2078	ror	w21,w21,#16
2079.endif
2080.inst	0x04af014a	//add z10.s,z10.s,z15.s
2081.if	mixin == 1
2082	add	w17,w17,w22
2083.endif
2084.inst	0x04a301ce	//add z14.s,z14.s,z3.s
2085.if	mixin == 1
2086	add	w18,w18,w19
2087.endif
2088.inst	0x04a70042	//add z2.s,z2.s,z7.s
2089.if	mixin == 1
2090	add	w15,w15,w20
2091.endif
2092.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
2093.if	mixin == 1
2094	add	w16,w16,w21
2095.endif
2096.inst	0x04aa30a5	//eor z5.d,z5.d,z10.d
2097.if	mixin == 1
2098	eor	w12,w12,w17
2099.endif
2100.inst	0x04ae3129	//eor z9.d,z9.d,z14.d
2101.if	mixin == 1
2102	eor	w13,w13,w18
2103.endif
2104.inst	0x04a231ad	//eor z13.d,z13.d,z2.d
2105.if	mixin == 1
2106	eor	w14,w14,w15
2107.endif
2108.inst	0x04a63021	//eor z1.d,z1.d,z6.d
2109.if	mixin == 1
2110	eor	w11,w11,w16
2111.endif
2112.inst	0x046c9cb1	//lsl z17.s,z5.s,12
2113.inst	0x046c9d32	//lsl z18.s,z9.s,12
2114.inst	0x046c9db3	//lsl z19.s,z13.s,12
2115.inst	0x046c9c34	//lsl z20.s,z1.s,12
2116.inst	0x046c94a5	//lsr z5.s,z5.s,20
2117.if	mixin == 1
2118	ror	w12,w12,20
2119.endif
2120.inst	0x046c9529	//lsr z9.s,z9.s,20
2121.if	mixin == 1
2122	ror	w13,w13,20
2123.endif
2124.inst	0x046c95ad	//lsr z13.s,z13.s,20
2125.if	mixin == 1
2126	ror	w14,w14,20
2127.endif
2128.inst	0x046c9421	//lsr z1.s,z1.s,20
2129.if	mixin == 1
2130	ror	w11,w11,20
2131.endif
2132.inst	0x047130a5	//orr z5.d,z5.d,z17.d
2133.inst	0x04723129	//orr z9.d,z9.d,z18.d
2134.inst	0x047331ad	//orr z13.d,z13.d,z19.d
2135.inst	0x04743021	//orr z1.d,z1.d,z20.d
2136.inst	0x04a50000	//add z0.s,z0.s,z5.s
2137.if	mixin == 1
2138	add	w7,w7,w12
2139.endif
2140.inst	0x04a90084	//add z4.s,z4.s,z9.s
2141.if	mixin == 1
2142	add	w8,w8,w13
2143.endif
2144.inst	0x04ad0108	//add z8.s,z8.s,z13.s
2145.if	mixin == 1
2146	add	w9,w9,w14
2147.endif
2148.inst	0x04a1018c	//add z12.s,z12.s,z1.s
2149.if	mixin == 1
2150	add	w10,w10,w11
2151.endif
2152.inst	0x04a031ef	//eor z15.d,z15.d,z0.d
2153.if	mixin == 1
2154	eor	w22,w22,w7
2155.endif
2156.inst	0x04a43063	//eor z3.d,z3.d,z4.d
2157.if	mixin == 1
2158	eor	w19,w19,w8
2159.endif
2160.inst	0x04a830e7	//eor z7.d,z7.d,z8.d
2161.if	mixin == 1
2162	eor	w20,w20,w9
2163.endif
2164.inst	0x04ac316b	//eor z11.d,z11.d,z12.d
2165.if	mixin == 1
2166	eor	w21,w21,w10
2167.endif
2168.inst	0x053f31ef	//tbl z15.b,{z15.b},z31.b
2169.if	mixin == 1
2170	ror	w22,w22,#24
2171.endif
2172.inst	0x053f3063	//tbl z3.b,{z3.b},z31.b
2173.if	mixin == 1
2174	ror	w19,w19,#24
2175.endif
2176.inst	0x053f30e7	//tbl z7.b,{z7.b},z31.b
2177.if	mixin == 1
2178	ror	w20,w20,#24
2179.endif
2180.inst	0x053f316b	//tbl z11.b,{z11.b},z31.b
2181.if	mixin == 1
2182	ror	w21,w21,#24
2183.endif
2184.inst	0x04af014a	//add z10.s,z10.s,z15.s
2185.if	mixin == 1
2186	add	w17,w17,w22
2187.endif
2188.inst	0x04a301ce	//add z14.s,z14.s,z3.s
2189.if	mixin == 1
2190	add	w18,w18,w19
2191.endif
2192.inst	0x04a70042	//add z2.s,z2.s,z7.s
2193.if	mixin == 1
2194	add	w15,w15,w20
2195.endif
2196.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
2197.if	mixin == 1
2198	add	w16,w16,w21
2199.endif
2200.inst	0x04aa30a5	//eor z5.d,z5.d,z10.d
2201.if	mixin == 1
2202	eor	w12,w12,w17
2203.endif
2204.inst	0x04ae3129	//eor z9.d,z9.d,z14.d
2205.if	mixin == 1
2206	eor	w13,w13,w18
2207.endif
2208.inst	0x04a231ad	//eor z13.d,z13.d,z2.d
2209.if	mixin == 1
2210	eor	w14,w14,w15
2211.endif
2212.inst	0x04a63021	//eor z1.d,z1.d,z6.d
2213.if	mixin == 1
2214	eor	w11,w11,w16
2215.endif
2216.inst	0x04679cb1	//lsl z17.s,z5.s,7
2217.inst	0x04679d32	//lsl z18.s,z9.s,7
2218.inst	0x04679db3	//lsl z19.s,z13.s,7
2219.inst	0x04679c34	//lsl z20.s,z1.s,7
2220.inst	0x046794a5	//lsr z5.s,z5.s,25
2221.if	mixin == 1
2222	ror	w12,w12,25
2223.endif
2224.inst	0x04679529	//lsr z9.s,z9.s,25
2225.if	mixin == 1
2226	ror	w13,w13,25
2227.endif
2228.inst	0x046795ad	//lsr z13.s,z13.s,25
2229.if	mixin == 1
2230	ror	w14,w14,25
2231.endif
2232.inst	0x04679421	//lsr z1.s,z1.s,25
2233.if	mixin == 1
2234	ror	w11,w11,25
2235.endif
2236.inst	0x047130a5	//orr z5.d,z5.d,z17.d
2237.inst	0x04723129	//orr z9.d,z9.d,z18.d
2238.inst	0x047331ad	//orr z13.d,z13.d,z19.d
2239.inst	0x04743021	//orr z1.d,z1.d,z20.d
2240	sub	x6,x6,1
2241	cbnz	x6,10b
2242	lsr	x6,x28,#32
2243.inst	0x05a03b91	//dup z17.s,w28
2244.inst	0x05a038d2	//dup z18.s,w6
2245	lsr	x6,x29,#32
2246.inst	0x05a038d3	//dup z19.s,w6
2247	lsr	x6,x30,#32
2248.if	mixin == 1
2249	add	w7,w7,w23
2250.endif
2251.inst	0x04b90000	//add z0.s,z0.s,z25.s
2252.if	mixin == 1
2253	add	x8,x8,x23,lsr #32
2254.endif
2255.inst	0x04ba0084	//add z4.s,z4.s,z26.s
2256.if	mixin == 1
2257	add	x7,x7,x8,lsl #32  // pack
2258.endif
2259.if	mixin == 1
2260	add	w9,w9,w24
2261.endif
2262.inst	0x04bb0108	//add z8.s,z8.s,z27.s
2263.if	mixin == 1
2264	add	x10,x10,x24,lsr #32
2265.endif
2266.inst	0x04bc018c	//add z12.s,z12.s,z28.s
2267.if	mixin == 1
2268	add	x9,x9,x10,lsl #32  // pack
2269.endif
2270.if	mixin == 1
2271	ldp	x8,x10,[x1],#16
2272.endif
2273.if	mixin == 1
2274	add	w11,w11,w25
2275.endif
2276.inst	0x04bd0021	//add z1.s,z1.s,z29.s
2277.if	mixin == 1
2278	add	x12,x12,x25,lsr #32
2279.endif
2280.inst	0x04be00a5	//add z5.s,z5.s,z30.s
2281.if	mixin == 1
2282	add	x11,x11,x12,lsl #32  // pack
2283.endif
2284.if	mixin == 1
2285	add	w13,w13,w26
2286.endif
2287.inst	0x04b50129	//add z9.s,z9.s,z21.s
2288.if	mixin == 1
2289	add	x14,x14,x26,lsr #32
2290.endif
2291.inst	0x04b601ad	//add z13.s,z13.s,z22.s
2292.if	mixin == 1
2293	add	x13,x13,x14,lsl #32  // pack
2294.endif
2295.if	mixin == 1
2296	ldp	x12,x14,[x1],#16
2297.endif
2298.if	mixin == 1
2299	add	w15,w15,w27
2300.endif
2301.inst	0x04b70042	//add z2.s,z2.s,z23.s
2302.if	mixin == 1
2303	add	x16,x16,x27,lsr #32
2304.endif
2305.inst	0x04b800c6	//add z6.s,z6.s,z24.s
2306.if	mixin == 1
2307	add	x15,x15,x16,lsl #32  // pack
2308.endif
2309.if	mixin == 1
2310	add	w17,w17,w28
2311.endif
2312.inst	0x04b1014a	//add z10.s,z10.s,z17.s
2313.if	mixin == 1
2314	add	x18,x18,x28,lsr #32
2315.endif
2316.inst	0x04b201ce	//add z14.s,z14.s,z18.s
2317.if	mixin == 1
2318	add	x17,x17,x18,lsl #32  // pack
2319.endif
2320.if	mixin == 1
2321	ldp	x16,x18,[x1],#16
2322.endif
2323.inst	0x05a03bd4	//dup z20.s,w30
2324.inst	0x05a038d9	//dup z25.s,w6	// bak[15] not available for SVE
2325.if	mixin == 1
2326	add	w19,w19,w29
2327.endif
2328.inst	0x04b00063	//add z3.s,z3.s,z16.s
2329.if	mixin == 1
2330	add	x20,x20,x29,lsr #32
2331.endif
2332.inst	0x04b300e7	//add z7.s,z7.s,z19.s
2333.if	mixin == 1
2334	add	x19,x19,x20,lsl #32  // pack
2335.endif
2336.if	mixin == 1
2337	add	w21,w21,w30
2338.endif
2339.inst	0x04b4016b	//add z11.s,z11.s,z20.s
2340.if	mixin == 1
2341	add	x22,x22,x30,lsr #32
2342.endif
2343.inst	0x04b901ef	//add z15.s,z15.s,z25.s
2344.if	mixin == 1
2345	add	x21,x21,x22,lsl #32  // pack
2346.endif
2347.if	mixin == 1
2348	ldp	x20,x22,[x1],#16
2349.endif
2350#ifdef	__AARCH64EB__
2351	rev	x7,x7
2352.inst	0x05a48000	//revb z0.s,p0/m,z0.s
2353.inst	0x05a48084	//revb z4.s,p0/m,z4.s
2354	rev	x9,x9
2355.inst	0x05a48108	//revb z8.s,p0/m,z8.s
2356.inst	0x05a4818c	//revb z12.s,p0/m,z12.s
2357	rev	x11,x11
2358.inst	0x05a48021	//revb z1.s,p0/m,z1.s
2359.inst	0x05a480a5	//revb z5.s,p0/m,z5.s
2360	rev	x13,x13
2361.inst	0x05a48129	//revb z9.s,p0/m,z9.s
2362.inst	0x05a481ad	//revb z13.s,p0/m,z13.s
2363	rev	x15,x15
2364.inst	0x05a48042	//revb z2.s,p0/m,z2.s
2365.inst	0x05a480c6	//revb z6.s,p0/m,z6.s
2366	rev	x17,x17
2367.inst	0x05a4814a	//revb z10.s,p0/m,z10.s
2368.inst	0x05a481ce	//revb z14.s,p0/m,z14.s
2369	rev	x19,x19
2370.inst	0x05a48063	//revb z3.s,p0/m,z3.s
2371.inst	0x05a480e7	//revb z7.s,p0/m,z7.s
2372	rev	x21,x21
2373.inst	0x05a4816b	//revb z11.s,p0/m,z11.s
2374.inst	0x05a481ef	//revb z15.s,p0/m,z15.s
2375#endif
2376.if	mixin == 1
2377	add	x29,x29,#1
2378.endif
2379	cmp	x5,4
2380	b.ne	200f
2381.if	mixin == 1
2382	eor	x7,x7,x8
2383.endif
2384.if	mixin == 1
2385	eor	x9,x9,x10
2386.endif
2387.if	mixin == 1
2388	eor	x11,x11,x12
2389.endif
2390.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
2391.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
2392.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
2393.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s
2394
2395.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
2396.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
2397.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
2398.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s
2399
2400.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
2401.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
2402.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
2403.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d
2404
2405.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
2406.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
2407.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
2408.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
2409.if	mixin == 1
2410	eor	x13,x13,x14
2411.endif
2412.if	mixin == 1
2413	eor	x15,x15,x16
2414.endif
2415.if	mixin == 1
2416	eor	x17,x17,x18
2417.endif
2418.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
2419.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
2420.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
2421.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s
2422
2423.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
2424.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
2425.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
2426.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s
2427
2428.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
2429.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
2430.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
2431.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d
2432
2433.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
2434.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
2435.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
2436.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
2437.if	mixin == 1
2438	eor	x19,x19,x20
2439.endif
2440.if	mixin == 1
2441	eor	x21,x21,x22
2442.endif
2443	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
2444	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
2445.inst	0x04b13000	//eor z0.d,z0.d,z17.d
2446.inst	0x04b23021	//eor z1.d,z1.d,z18.d
2447.inst	0x04b33042	//eor z2.d,z2.d,z19.d
2448.inst	0x04b43063	//eor z3.d,z3.d,z20.d
2449.inst	0x04b53084	//eor z4.d,z4.d,z21.d
2450.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
2451.inst	0x04b730c6	//eor z6.d,z6.d,z23.d
2452.inst	0x04b830e7	//eor z7.d,z7.d,z24.d
2453	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
2454	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
2455.if	mixin == 1
2456	stp	x7,x9,[x0],#16
2457.endif
2458.inst	0x04b13108	//eor z8.d,z8.d,z17.d
2459.inst	0x04b23129	//eor z9.d,z9.d,z18.d
2460.if	mixin == 1
2461	stp	x11,x13,[x0],#16
2462.endif
2463.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
2464.inst	0x04b4316b	//eor z11.d,z11.d,z20.d
2465.if	mixin == 1
2466	stp	x15,x17,[x0],#16
2467.endif
2468.inst	0x04b5318c	//eor z12.d,z12.d,z21.d
2469.inst	0x04b631ad	//eor z13.d,z13.d,z22.d
2470.if	mixin == 1
2471	stp	x19,x21,[x0],#16
2472.endif
2473.inst	0x04b731ce	//eor z14.d,z14.d,z23.d
2474.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
2475	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
2476	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
2477	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
2478	st1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2479	b	210f
2480200:
2481.inst	0x05a16011	//zip1 z17.s,z0.s,z1.s
2482.inst	0x05a16412	//zip2 z18.s,z0.s,z1.s
2483.inst	0x05a36053	//zip1 z19.s,z2.s,z3.s
2484.inst	0x05a36454	//zip2 z20.s,z2.s,z3.s
2485
2486.inst	0x05a56095	//zip1 z21.s,z4.s,z5.s
2487.inst	0x05a56496	//zip2 z22.s,z4.s,z5.s
2488.inst	0x05a760d7	//zip1 z23.s,z6.s,z7.s
2489.inst	0x05a764d8	//zip2 z24.s,z6.s,z7.s
2490
2491.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
2492.inst	0x05f36621	//zip2 z1.d,z17.d,z19.d
2493.inst	0x05f46242	//zip1 z2.d,z18.d,z20.d
2494.inst	0x05f46643	//zip2 z3.d,z18.d,z20.d
2495
2496.inst	0x05f762a4	//zip1 z4.d,z21.d,z23.d
2497.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
2498.inst	0x05f862c6	//zip1 z6.d,z22.d,z24.d
2499.inst	0x05f866c7	//zip2 z7.d,z22.d,z24.d
2500.if	mixin == 1
2501	eor	x7,x7,x8
2502.endif
2503.if	mixin == 1
2504	eor	x9,x9,x10
2505.endif
2506.inst	0x05a96111	//zip1 z17.s,z8.s,z9.s
2507.inst	0x05a96512	//zip2 z18.s,z8.s,z9.s
2508.inst	0x05ab6153	//zip1 z19.s,z10.s,z11.s
2509.inst	0x05ab6554	//zip2 z20.s,z10.s,z11.s
2510
2511.inst	0x05ad6195	//zip1 z21.s,z12.s,z13.s
2512.inst	0x05ad6596	//zip2 z22.s,z12.s,z13.s
2513.inst	0x05af61d7	//zip1 z23.s,z14.s,z15.s
2514.inst	0x05af65d8	//zip2 z24.s,z14.s,z15.s
2515
2516.inst	0x05f36228	//zip1 z8.d,z17.d,z19.d
2517.inst	0x05f36629	//zip2 z9.d,z17.d,z19.d
2518.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
2519.inst	0x05f4664b	//zip2 z11.d,z18.d,z20.d
2520
2521.inst	0x05f762ac	//zip1 z12.d,z21.d,z23.d
2522.inst	0x05f766ad	//zip2 z13.d,z21.d,z23.d
2523.inst	0x05f862ce	//zip1 z14.d,z22.d,z24.d
2524.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
2525.if	mixin == 1
2526	eor	x11,x11,x12
2527.endif
2528.if	mixin == 1
2529	eor	x13,x13,x14
2530.endif
2531.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
2532.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
2533.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
2534.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s
2535
2536.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
2537.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
2538.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
2539.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s
2540
2541.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
2542.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
2543.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
2544.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d
2545
2546.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
2547.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
2548.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
2549.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
2550.if	mixin == 1
2551	eor	x15,x15,x16
2552.endif
2553.if	mixin == 1
2554	eor	x17,x17,x18
2555.endif
2556.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
2557.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
2558.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
2559.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s
2560
2561.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
2562.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
2563.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
2564.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s
2565
2566.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
2567.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
2568.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
2569.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d
2570
2571.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
2572.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
2573.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
2574.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
2575.if	mixin == 1
2576	eor	x19,x19,x20
2577.endif
2578.if	mixin == 1
2579	eor	x21,x21,x22
2580.endif
2581.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
2582.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
2583.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
2584.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
2585.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
2586.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
2587.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
2588.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
2589.inst	0x04215101	//addvl x1,x1,8
2590.inst	0x04b13000	//eor z0.d,z0.d,z17.d
2591.inst	0x04b23084	//eor z4.d,z4.d,z18.d
2592.inst	0x04b33108	//eor z8.d,z8.d,z19.d
2593.inst	0x04b4318c	//eor z12.d,z12.d,z20.d
2594.inst	0x04b53021	//eor z1.d,z1.d,z21.d
2595.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
2596.inst	0x04b73129	//eor z9.d,z9.d,z23.d
2597.inst	0x04b831ad	//eor z13.d,z13.d,z24.d
2598.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
2599.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
2600.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
2601.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
2602.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
2603.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
2604.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
2605.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
2606.inst	0x04215101	//addvl x1,x1,8
2607.if	mixin == 1
2608	stp	x7,x9,[x0],#16
2609.endif
2610.inst	0x04b13042	//eor z2.d,z2.d,z17.d
2611.inst	0x04b230c6	//eor z6.d,z6.d,z18.d
2612.if	mixin == 1
2613	stp	x11,x13,[x0],#16
2614.endif
2615.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
2616.inst	0x04b431ce	//eor z14.d,z14.d,z20.d
2617.if	mixin == 1
2618	stp	x15,x17,[x0],#16
2619.endif
2620.inst	0x04b53063	//eor z3.d,z3.d,z21.d
2621.inst	0x04b630e7	//eor z7.d,z7.d,z22.d
2622.if	mixin == 1
2623	stp	x19,x21,[x0],#16
2624.endif
2625.inst	0x04b7316b	//eor z11.d,z11.d,z23.d
2626.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
2627.inst	0xe540e000	//st1w {z0.s},p0,[x0,#0,MUL VL]
2628.inst	0xe541e004	//st1w {z4.s},p0,[x0,#1,MUL VL]
2629.inst	0xe542e008	//st1w {z8.s},p0,[x0,#2,MUL VL]
2630.inst	0xe543e00c	//st1w {z12.s},p0,[x0,#3,MUL VL]
2631.inst	0xe544e001	//st1w {z1.s},p0,[x0,#4,MUL VL]
2632.inst	0xe545e005	//st1w {z5.s},p0,[x0,#5,MUL VL]
2633.inst	0xe546e009	//st1w {z9.s},p0,[x0,#6,MUL VL]
2634.inst	0xe547e00d	//st1w {z13.s},p0,[x0,#7,MUL VL]
2635.inst	0x04205100	//addvl x0,x0,8
2636.inst	0xe540e002	//st1w {z2.s},p0,[x0,#0,MUL VL]
2637.inst	0xe541e006	//st1w {z6.s},p0,[x0,#1,MUL VL]
2638.inst	0xe542e00a	//st1w {z10.s},p0,[x0,#2,MUL VL]
2639.inst	0xe543e00e	//st1w {z14.s},p0,[x0,#3,MUL VL]
2640.inst	0xe544e003	//st1w {z3.s},p0,[x0,#4,MUL VL]
2641.inst	0xe545e007	//st1w {z7.s},p0,[x0,#5,MUL VL]
2642.inst	0xe546e00b	//st1w {z11.s},p0,[x0,#6,MUL VL]
2643.inst	0xe547e00f	//st1w {z15.s},p0,[x0,#7,MUL VL]
2644.inst	0x04205100	//addvl x0,x0,8
2645210:
2646.inst	0x04b0e3fd	//incw x29, ALL, MUL #1
2647	subs	x2,x2,64
2648	b.gt	100b
2649	b	110f
2650101:
2651	mixin=0
2652	lsr	x8,x23,#32
2653.inst	0x05a03ae0	//dup z0.s,w23
2654.inst	0x05a03af9	//dup z25.s,w23
2655.if	mixin == 1
2656	mov	w7,w23
2657.endif
2658.inst	0x05a03904	//dup z4.s,w8
2659.inst	0x05a0391a	//dup z26.s,w8
2660	lsr	x10,x24,#32
2661.inst	0x05a03b08	//dup z8.s,w24
2662.inst	0x05a03b1b	//dup z27.s,w24
2663.if	mixin == 1
2664	mov	w9,w24
2665.endif
2666.inst	0x05a0394c	//dup z12.s,w10
2667.inst	0x05a0395c	//dup z28.s,w10
2668	lsr	x12,x25,#32
2669.inst	0x05a03b21	//dup z1.s,w25
2670.inst	0x05a03b3d	//dup z29.s,w25
2671.if	mixin == 1
2672	mov	w11,w25
2673.endif
2674.inst	0x05a03985	//dup z5.s,w12
2675.inst	0x05a0399e	//dup z30.s,w12
2676	lsr	x14,x26,#32
2677.inst	0x05a03b49	//dup z9.s,w26
2678.inst	0x05a03b55	//dup z21.s,w26
2679.if	mixin == 1
2680	mov	w13,w26
2681.endif
2682.inst	0x05a039cd	//dup z13.s,w14
2683.inst	0x05a039d6	//dup z22.s,w14
2684	lsr	x16,x27,#32
2685.inst	0x05a03b62	//dup z2.s,w27
2686.inst	0x05a03b77	//dup z23.s,w27
2687.if	mixin == 1
2688	mov	w15,w27
2689.endif
2690.inst	0x05a03a06	//dup z6.s,w16
2691.inst	0x05a03a18	//dup z24.s,w16
2692	lsr	x18,x28,#32
2693.inst	0x05a03b8a	//dup z10.s,w28
2694.if	mixin == 1
2695	mov	w17,w28
2696.endif
2697.inst	0x05a03a4e	//dup z14.s,w18
2698	lsr	x22,x30,#32
2699.inst	0x05a03bcb	//dup z11.s,w30
2700.if	mixin == 1
2701	mov	w21,w30
2702.endif
2703.inst	0x05a03acf	//dup z15.s,w22
2704.if	mixin == 1
2705	add	w20,w29,#1
2706	mov	w19,w29
2707.inst	0x04a14690	//index z16.s,w20,1
2708.inst	0x04a14683	//index z3.s,w20,1
2709.else
2710.inst	0x04a147b0	//index z16.s,w29,1
2711.inst	0x04a147a3	//index z3.s,w29,1
2712.endif
2713	lsr	x20,x29,#32
2714.inst	0x05a03a87	//dup z7.s,w20
2715	mov	x6,#10
271610:
2717.align	5
2718.inst	0x04a10000	//add z0.s,z0.s,z1.s
2719.if	mixin == 1
2720	add	w7,w7,w11
2721.endif
2722.inst	0x04a50084	//add z4.s,z4.s,z5.s
2723.if	mixin == 1
2724	add	w8,w8,w12
2725.endif
2726.inst	0x04a90108	//add z8.s,z8.s,z9.s
2727.if	mixin == 1
2728	add	w9,w9,w13
2729.endif
2730.inst	0x04ad018c	//add z12.s,z12.s,z13.s
2731.if	mixin == 1
2732	add	w10,w10,w14
2733.endif
2734.inst	0x04a03063	//eor z3.d,z3.d,z0.d
2735.if	mixin == 1
2736	eor	w19,w19,w7
2737.endif
2738.inst	0x04a430e7	//eor z7.d,z7.d,z4.d
2739.if	mixin == 1
2740	eor	w20,w20,w8
2741.endif
2742.inst	0x04a8316b	//eor z11.d,z11.d,z8.d
2743.if	mixin == 1
2744	eor	w21,w21,w9
2745.endif
2746.inst	0x04ac31ef	//eor z15.d,z15.d,z12.d
2747.if	mixin == 1
2748	eor	w22,w22,w10
2749.endif
2750.inst	0x05a58063	//revh z3.s,p0/m,z3.s
2751.if	mixin == 1
2752	ror	w19,w19,#16
2753.endif
2754.inst	0x05a580e7	//revh z7.s,p0/m,z7.s
2755.if	mixin == 1
2756	ror	w20,w20,#16
2757.endif
2758.inst	0x05a5816b	//revh z11.s,p0/m,z11.s
2759.if	mixin == 1
2760	ror	w21,w21,#16
2761.endif
2762.inst	0x05a581ef	//revh z15.s,p0/m,z15.s
2763.if	mixin == 1
2764	ror	w22,w22,#16
2765.endif
2766.inst	0x04a30042	//add z2.s,z2.s,z3.s
2767.if	mixin == 1
2768	add	w15,w15,w19
2769.endif
2770.inst	0x04a700c6	//add z6.s,z6.s,z7.s
2771.if	mixin == 1
2772	add	w16,w16,w20
2773.endif
2774.inst	0x04ab014a	//add z10.s,z10.s,z11.s
2775.if	mixin == 1
2776	add	w17,w17,w21
2777.endif
2778.inst	0x04af01ce	//add z14.s,z14.s,z15.s
2779.if	mixin == 1
2780	add	w18,w18,w22
2781.endif
2782.inst	0x04a23021	//eor z1.d,z1.d,z2.d
2783.if	mixin == 1
2784	eor	w11,w11,w15
2785.endif
2786.inst	0x04a630a5	//eor z5.d,z5.d,z6.d
2787.if	mixin == 1
2788	eor	w12,w12,w16
2789.endif
2790.inst	0x04aa3129	//eor z9.d,z9.d,z10.d
2791.if	mixin == 1
2792	eor	w13,w13,w17
2793.endif
2794.inst	0x04ae31ad	//eor z13.d,z13.d,z14.d
2795.if	mixin == 1
2796	eor	w14,w14,w18
2797.endif
2798.inst	0x046c9c31	//lsl z17.s,z1.s,12
2799.inst	0x046c9cb2	//lsl z18.s,z5.s,12
2800.inst	0x046c9d33	//lsl z19.s,z9.s,12
2801.inst	0x046c9db4	//lsl z20.s,z13.s,12
2802.inst	0x046c9421	//lsr z1.s,z1.s,20
2803.if	mixin == 1
2804	ror	w11,w11,20
2805.endif
2806.inst	0x046c94a5	//lsr z5.s,z5.s,20
2807.if	mixin == 1
2808	ror	w12,w12,20
2809.endif
2810.inst	0x046c9529	//lsr z9.s,z9.s,20
2811.if	mixin == 1
2812	ror	w13,w13,20
2813.endif
2814.inst	0x046c95ad	//lsr z13.s,z13.s,20
2815.if	mixin == 1
2816	ror	w14,w14,20
2817.endif
2818.inst	0x04713021	//orr z1.d,z1.d,z17.d
2819.inst	0x047230a5	//orr z5.d,z5.d,z18.d
2820.inst	0x04733129	//orr z9.d,z9.d,z19.d
2821.inst	0x047431ad	//orr z13.d,z13.d,z20.d
2822.inst	0x04a10000	//add z0.s,z0.s,z1.s
2823.if	mixin == 1
2824	add	w7,w7,w11
2825.endif
2826.inst	0x04a50084	//add z4.s,z4.s,z5.s
2827.if	mixin == 1
2828	add	w8,w8,w12
2829.endif
2830.inst	0x04a90108	//add z8.s,z8.s,z9.s
2831.if	mixin == 1
2832	add	w9,w9,w13
2833.endif
2834.inst	0x04ad018c	//add z12.s,z12.s,z13.s
2835.if	mixin == 1
2836	add	w10,w10,w14
2837.endif
2838.inst	0x04a03063	//eor z3.d,z3.d,z0.d
2839.if	mixin == 1
2840	eor	w19,w19,w7
2841.endif
2842.inst	0x04a430e7	//eor z7.d,z7.d,z4.d
2843.if	mixin == 1
2844	eor	w20,w20,w8
2845.endif
2846.inst	0x04a8316b	//eor z11.d,z11.d,z8.d
2847.if	mixin == 1
2848	eor	w21,w21,w9
2849.endif
2850.inst	0x04ac31ef	//eor z15.d,z15.d,z12.d
2851.if	mixin == 1
2852	eor	w22,w22,w10
2853.endif
2854.inst	0x053f3063	//tbl z3.b,{z3.b},z31.b
2855.if	mixin == 1
2856	ror	w19,w19,#24
2857.endif
2858.inst	0x053f30e7	//tbl z7.b,{z7.b},z31.b
2859.if	mixin == 1
2860	ror	w20,w20,#24
2861.endif
2862.inst	0x053f316b	//tbl z11.b,{z11.b},z31.b
2863.if	mixin == 1
2864	ror	w21,w21,#24
2865.endif
2866.inst	0x053f31ef	//tbl z15.b,{z15.b},z31.b
2867.if	mixin == 1
2868	ror	w22,w22,#24
2869.endif
2870.inst	0x04a30042	//add z2.s,z2.s,z3.s
2871.if	mixin == 1
2872	add	w15,w15,w19
2873.endif
2874.inst	0x04a700c6	//add z6.s,z6.s,z7.s
2875.if	mixin == 1
2876	add	w16,w16,w20
2877.endif
2878.inst	0x04ab014a	//add z10.s,z10.s,z11.s
2879.if	mixin == 1
2880	add	w17,w17,w21
2881.endif
2882.inst	0x04af01ce	//add z14.s,z14.s,z15.s
2883.if	mixin == 1
2884	add	w18,w18,w22
2885.endif
2886.inst	0x04a23021	//eor z1.d,z1.d,z2.d
2887.if	mixin == 1
2888	eor	w11,w11,w15
2889.endif
2890.inst	0x04a630a5	//eor z5.d,z5.d,z6.d
2891.if	mixin == 1
2892	eor	w12,w12,w16
2893.endif
2894.inst	0x04aa3129	//eor z9.d,z9.d,z10.d
2895.if	mixin == 1
2896	eor	w13,w13,w17
2897.endif
2898.inst	0x04ae31ad	//eor z13.d,z13.d,z14.d
2899.if	mixin == 1
2900	eor	w14,w14,w18
2901.endif
2902.inst	0x04679c31	//lsl z17.s,z1.s,7
2903.inst	0x04679cb2	//lsl z18.s,z5.s,7
2904.inst	0x04679d33	//lsl z19.s,z9.s,7
2905.inst	0x04679db4	//lsl z20.s,z13.s,7
2906.inst	0x04679421	//lsr z1.s,z1.s,25
2907.if	mixin == 1
2908	ror	w11,w11,25
2909.endif
2910.inst	0x046794a5	//lsr z5.s,z5.s,25
2911.if	mixin == 1
2912	ror	w12,w12,25
2913.endif
2914.inst	0x04679529	//lsr z9.s,z9.s,25
2915.if	mixin == 1
2916	ror	w13,w13,25
2917.endif
2918.inst	0x046795ad	//lsr z13.s,z13.s,25
2919.if	mixin == 1
2920	ror	w14,w14,25
2921.endif
2922.inst	0x04713021	//orr z1.d,z1.d,z17.d
2923.inst	0x047230a5	//orr z5.d,z5.d,z18.d
2924.inst	0x04733129	//orr z9.d,z9.d,z19.d
2925.inst	0x047431ad	//orr z13.d,z13.d,z20.d
2926.inst	0x04a50000	//add z0.s,z0.s,z5.s
2927.if	mixin == 1
2928	add	w7,w7,w12
2929.endif
2930.inst	0x04a90084	//add z4.s,z4.s,z9.s
2931.if	mixin == 1
2932	add	w8,w8,w13
2933.endif
2934.inst	0x04ad0108	//add z8.s,z8.s,z13.s
2935.if	mixin == 1
2936	add	w9,w9,w14
2937.endif
2938.inst	0x04a1018c	//add z12.s,z12.s,z1.s
2939.if	mixin == 1
2940	add	w10,w10,w11
2941.endif
2942.inst	0x04a031ef	//eor z15.d,z15.d,z0.d
2943.if	mixin == 1
2944	eor	w22,w22,w7
2945.endif
2946.inst	0x04a43063	//eor z3.d,z3.d,z4.d
2947.if	mixin == 1
2948	eor	w19,w19,w8
2949.endif
2950.inst	0x04a830e7	//eor z7.d,z7.d,z8.d
2951.if	mixin == 1
2952	eor	w20,w20,w9
2953.endif
2954.inst	0x04ac316b	//eor z11.d,z11.d,z12.d
2955.if	mixin == 1
2956	eor	w21,w21,w10
2957.endif
2958.inst	0x05a581ef	//revh z15.s,p0/m,z15.s
2959.if	mixin == 1
2960	ror	w22,w22,#16
2961.endif
2962.inst	0x05a58063	//revh z3.s,p0/m,z3.s
2963.if	mixin == 1
2964	ror	w19,w19,#16
2965.endif
2966.inst	0x05a580e7	//revh z7.s,p0/m,z7.s
2967.if	mixin == 1
2968	ror	w20,w20,#16
2969.endif
2970.inst	0x05a5816b	//revh z11.s,p0/m,z11.s
2971.if	mixin == 1
2972	ror	w21,w21,#16
2973.endif
2974.inst	0x04af014a	//add z10.s,z10.s,z15.s
2975.if	mixin == 1
2976	add	w17,w17,w22
2977.endif
2978.inst	0x04a301ce	//add z14.s,z14.s,z3.s
2979.if	mixin == 1
2980	add	w18,w18,w19
2981.endif
2982.inst	0x04a70042	//add z2.s,z2.s,z7.s
2983.if	mixin == 1
2984	add	w15,w15,w20
2985.endif
2986.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
2987.if	mixin == 1
2988	add	w16,w16,w21
2989.endif
2990.inst	0x04aa30a5	//eor z5.d,z5.d,z10.d
2991.if	mixin == 1
2992	eor	w12,w12,w17
2993.endif
2994.inst	0x04ae3129	//eor z9.d,z9.d,z14.d
2995.if	mixin == 1
2996	eor	w13,w13,w18
2997.endif
2998.inst	0x04a231ad	//eor z13.d,z13.d,z2.d
2999.if	mixin == 1
3000	eor	w14,w14,w15
3001.endif
3002.inst	0x04a63021	//eor z1.d,z1.d,z6.d
3003.if	mixin == 1
3004	eor	w11,w11,w16
3005.endif
3006.inst	0x046c9cb1	//lsl z17.s,z5.s,12
3007.inst	0x046c9d32	//lsl z18.s,z9.s,12
3008.inst	0x046c9db3	//lsl z19.s,z13.s,12
3009.inst	0x046c9c34	//lsl z20.s,z1.s,12
3010.inst	0x046c94a5	//lsr z5.s,z5.s,20
3011.if	mixin == 1
3012	ror	w12,w12,20
3013.endif
3014.inst	0x046c9529	//lsr z9.s,z9.s,20
3015.if	mixin == 1
3016	ror	w13,w13,20
3017.endif
3018.inst	0x046c95ad	//lsr z13.s,z13.s,20
3019.if	mixin == 1
3020	ror	w14,w14,20
3021.endif
3022.inst	0x046c9421	//lsr z1.s,z1.s,20
3023.if	mixin == 1
3024	ror	w11,w11,20
3025.endif
3026.inst	0x047130a5	//orr z5.d,z5.d,z17.d
3027.inst	0x04723129	//orr z9.d,z9.d,z18.d
3028.inst	0x047331ad	//orr z13.d,z13.d,z19.d
3029.inst	0x04743021	//orr z1.d,z1.d,z20.d
3030.inst	0x04a50000	//add z0.s,z0.s,z5.s
3031.if	mixin == 1
3032	add	w7,w7,w12
3033.endif
3034.inst	0x04a90084	//add z4.s,z4.s,z9.s
3035.if	mixin == 1
3036	add	w8,w8,w13
3037.endif
3038.inst	0x04ad0108	//add z8.s,z8.s,z13.s
3039.if	mixin == 1
3040	add	w9,w9,w14
3041.endif
3042.inst	0x04a1018c	//add z12.s,z12.s,z1.s
3043.if	mixin == 1
3044	add	w10,w10,w11
3045.endif
3046.inst	0x04a031ef	//eor z15.d,z15.d,z0.d
3047.if	mixin == 1
3048	eor	w22,w22,w7
3049.endif
3050.inst	0x04a43063	//eor z3.d,z3.d,z4.d
3051.if	mixin == 1
3052	eor	w19,w19,w8
3053.endif
3054.inst	0x04a830e7	//eor z7.d,z7.d,z8.d
3055.if	mixin == 1
3056	eor	w20,w20,w9
3057.endif
3058.inst	0x04ac316b	//eor z11.d,z11.d,z12.d
3059.if	mixin == 1
3060	eor	w21,w21,w10
3061.endif
3062.inst	0x053f31ef	//tbl z15.b,{z15.b},z31.b
3063.if	mixin == 1
3064	ror	w22,w22,#24
3065.endif
3066.inst	0x053f3063	//tbl z3.b,{z3.b},z31.b
3067.if	mixin == 1
3068	ror	w19,w19,#24
3069.endif
3070.inst	0x053f30e7	//tbl z7.b,{z7.b},z31.b
3071.if	mixin == 1
3072	ror	w20,w20,#24
3073.endif
3074.inst	0x053f316b	//tbl z11.b,{z11.b},z31.b
3075.if	mixin == 1
3076	ror	w21,w21,#24
3077.endif
3078.inst	0x04af014a	//add z10.s,z10.s,z15.s
3079.if	mixin == 1
3080	add	w17,w17,w22
3081.endif
3082.inst	0x04a301ce	//add z14.s,z14.s,z3.s
3083.if	mixin == 1
3084	add	w18,w18,w19
3085.endif
3086.inst	0x04a70042	//add z2.s,z2.s,z7.s
3087.if	mixin == 1
3088	add	w15,w15,w20
3089.endif
3090.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
3091.if	mixin == 1
3092	add	w16,w16,w21
3093.endif
3094.inst	0x04aa30a5	//eor z5.d,z5.d,z10.d
3095.if	mixin == 1
3096	eor	w12,w12,w17
3097.endif
3098.inst	0x04ae3129	//eor z9.d,z9.d,z14.d
3099.if	mixin == 1
3100	eor	w13,w13,w18
3101.endif
3102.inst	0x04a231ad	//eor z13.d,z13.d,z2.d
3103.if	mixin == 1
3104	eor	w14,w14,w15
3105.endif
3106.inst	0x04a63021	//eor z1.d,z1.d,z6.d
3107.if	mixin == 1
3108	eor	w11,w11,w16
3109.endif
3110.inst	0x04679cb1	//lsl z17.s,z5.s,7
3111.inst	0x04679d32	//lsl z18.s,z9.s,7
3112.inst	0x04679db3	//lsl z19.s,z13.s,7
3113.inst	0x04679c34	//lsl z20.s,z1.s,7
3114.inst	0x046794a5	//lsr z5.s,z5.s,25
3115.if	mixin == 1
3116	ror	w12,w12,25
3117.endif
3118.inst	0x04679529	//lsr z9.s,z9.s,25
3119.if	mixin == 1
3120	ror	w13,w13,25
3121.endif
3122.inst	0x046795ad	//lsr z13.s,z13.s,25
3123.if	mixin == 1
3124	ror	w14,w14,25
3125.endif
3126.inst	0x04679421	//lsr z1.s,z1.s,25
3127.if	mixin == 1
3128	ror	w11,w11,25
3129.endif
3130.inst	0x047130a5	//orr z5.d,z5.d,z17.d
3131.inst	0x04723129	//orr z9.d,z9.d,z18.d
3132.inst	0x047331ad	//orr z13.d,z13.d,z19.d
3133.inst	0x04743021	//orr z1.d,z1.d,z20.d
3134	sub	x6,x6,1
3135	cbnz	x6,10b
3136	lsr	x6,x28,#32
3137.inst	0x05a03b91	//dup z17.s,w28
3138.inst	0x05a038d2	//dup z18.s,w6
3139	lsr	x6,x29,#32
3140.inst	0x05a038d3	//dup z19.s,w6
3141	lsr	x6,x30,#32
3142.if	mixin == 1
3143	add	w7,w7,w23
3144.endif
3145.inst	0x04b90000	//add z0.s,z0.s,z25.s
3146.if	mixin == 1
3147	add	x8,x8,x23,lsr #32
3148.endif
3149.inst	0x04ba0084	//add z4.s,z4.s,z26.s
3150.if	mixin == 1
3151	add	x7,x7,x8,lsl #32  // pack
3152.endif
3153.if	mixin == 1
3154	add	w9,w9,w24
3155.endif
3156.inst	0x04bb0108	//add z8.s,z8.s,z27.s
3157.if	mixin == 1
3158	add	x10,x10,x24,lsr #32
3159.endif
3160.inst	0x04bc018c	//add z12.s,z12.s,z28.s
3161.if	mixin == 1
3162	add	x9,x9,x10,lsl #32  // pack
3163.endif
3164.if	mixin == 1
3165	ldp	x8,x10,[x1],#16
3166.endif
3167.if	mixin == 1
3168	add	w11,w11,w25
3169.endif
3170.inst	0x04bd0021	//add z1.s,z1.s,z29.s
3171.if	mixin == 1
3172	add	x12,x12,x25,lsr #32
3173.endif
3174.inst	0x04be00a5	//add z5.s,z5.s,z30.s
3175.if	mixin == 1
3176	add	x11,x11,x12,lsl #32  // pack
3177.endif
3178.if	mixin == 1
3179	add	w13,w13,w26
3180.endif
3181.inst	0x04b50129	//add z9.s,z9.s,z21.s
3182.if	mixin == 1
3183	add	x14,x14,x26,lsr #32
3184.endif
3185.inst	0x04b601ad	//add z13.s,z13.s,z22.s
3186.if	mixin == 1
3187	add	x13,x13,x14,lsl #32  // pack
3188.endif
3189.if	mixin == 1
3190	ldp	x12,x14,[x1],#16
3191.endif
3192.if	mixin == 1
3193	add	w15,w15,w27
3194.endif
3195.inst	0x04b70042	//add z2.s,z2.s,z23.s
3196.if	mixin == 1
3197	add	x16,x16,x27,lsr #32
3198.endif
3199.inst	0x04b800c6	//add z6.s,z6.s,z24.s
3200.if	mixin == 1
3201	add	x15,x15,x16,lsl #32  // pack
3202.endif
3203.if	mixin == 1
3204	add	w17,w17,w28
3205.endif
3206.inst	0x04b1014a	//add z10.s,z10.s,z17.s
3207.if	mixin == 1
3208	add	x18,x18,x28,lsr #32
3209.endif
3210.inst	0x04b201ce	//add z14.s,z14.s,z18.s
3211.if	mixin == 1
3212	add	x17,x17,x18,lsl #32  // pack
3213.endif
3214.if	mixin == 1
3215	ldp	x16,x18,[x1],#16
3216.endif
3217.inst	0x05a03bd4	//dup z20.s,w30
3218.inst	0x05a038d9	//dup z25.s,w6	// bak[15] not available for SVE
3219.if	mixin == 1
3220	add	w19,w19,w29
3221.endif
3222.inst	0x04b00063	//add z3.s,z3.s,z16.s
3223.if	mixin == 1
3224	add	x20,x20,x29,lsr #32
3225.endif
3226.inst	0x04b300e7	//add z7.s,z7.s,z19.s
3227.if	mixin == 1
3228	add	x19,x19,x20,lsl #32  // pack
3229.endif
3230.if	mixin == 1
3231	add	w21,w21,w30
3232.endif
3233.inst	0x04b4016b	//add z11.s,z11.s,z20.s
3234.if	mixin == 1
3235	add	x22,x22,x30,lsr #32
3236.endif
3237.inst	0x04b901ef	//add z15.s,z15.s,z25.s
3238.if	mixin == 1
3239	add	x21,x21,x22,lsl #32  // pack
3240.endif
3241.if	mixin == 1
3242	ldp	x20,x22,[x1],#16
3243.endif
3244#ifdef	__AARCH64EB__
3245	rev	x7,x7
3246.inst	0x05a48000	//revb z0.s,p0/m,z0.s
3247.inst	0x05a48084	//revb z4.s,p0/m,z4.s
3248	rev	x9,x9
3249.inst	0x05a48108	//revb z8.s,p0/m,z8.s
3250.inst	0x05a4818c	//revb z12.s,p0/m,z12.s
3251	rev	x11,x11
3252.inst	0x05a48021	//revb z1.s,p0/m,z1.s
3253.inst	0x05a480a5	//revb z5.s,p0/m,z5.s
3254	rev	x13,x13
3255.inst	0x05a48129	//revb z9.s,p0/m,z9.s
3256.inst	0x05a481ad	//revb z13.s,p0/m,z13.s
3257	rev	x15,x15
3258.inst	0x05a48042	//revb z2.s,p0/m,z2.s
3259.inst	0x05a480c6	//revb z6.s,p0/m,z6.s
3260	rev	x17,x17
3261.inst	0x05a4814a	//revb z10.s,p0/m,z10.s
3262.inst	0x05a481ce	//revb z14.s,p0/m,z14.s
3263	rev	x19,x19
3264.inst	0x05a48063	//revb z3.s,p0/m,z3.s
3265.inst	0x05a480e7	//revb z7.s,p0/m,z7.s
3266	rev	x21,x21
3267.inst	0x05a4816b	//revb z11.s,p0/m,z11.s
3268.inst	0x05a481ef	//revb z15.s,p0/m,z15.s
3269#endif
3270.if	mixin == 1
3271	add	x29,x29,#1
3272.endif
3273	cmp	x5,4
3274	b.ne	200f
3275.if	mixin == 1
3276	eor	x7,x7,x8
3277.endif
3278.if	mixin == 1
3279	eor	x9,x9,x10
3280.endif
3281.if	mixin == 1
3282	eor	x11,x11,x12
3283.endif
3284.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
3285.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
3286.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
3287.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s
3288
3289.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
3290.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
3291.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
3292.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s
3293
3294.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
3295.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
3296.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
3297.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d
3298
3299.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
3300.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
3301.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
3302.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
3303.if	mixin == 1
3304	eor	x13,x13,x14
3305.endif
3306.if	mixin == 1
3307	eor	x15,x15,x16
3308.endif
3309.if	mixin == 1
3310	eor	x17,x17,x18
3311.endif
3312.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
3313.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
3314.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
3315.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s
3316
3317.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
3318.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
3319.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
3320.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s
3321
3322.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
3323.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
3324.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
3325.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d
3326
3327.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
3328.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
3329.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
3330.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
3331.if	mixin == 1
3332	eor	x19,x19,x20
3333.endif
3334.if	mixin == 1
3335	eor	x21,x21,x22
3336.endif
3337	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
3338	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
3339.inst	0x04b13000	//eor z0.d,z0.d,z17.d
3340.inst	0x04b23021	//eor z1.d,z1.d,z18.d
3341.inst	0x04b33042	//eor z2.d,z2.d,z19.d
3342.inst	0x04b43063	//eor z3.d,z3.d,z20.d
3343.inst	0x04b53084	//eor z4.d,z4.d,z21.d
3344.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
3345.inst	0x04b730c6	//eor z6.d,z6.d,z23.d
3346.inst	0x04b830e7	//eor z7.d,z7.d,z24.d
3347	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
3348	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
3349.if	mixin == 1
3350	stp	x7,x9,[x0],#16
3351.endif
3352.inst	0x04b13108	//eor z8.d,z8.d,z17.d
3353.inst	0x04b23129	//eor z9.d,z9.d,z18.d
3354.if	mixin == 1
3355	stp	x11,x13,[x0],#16
3356.endif
3357.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
3358.inst	0x04b4316b	//eor z11.d,z11.d,z20.d
3359.if	mixin == 1
3360	stp	x15,x17,[x0],#16
3361.endif
3362.inst	0x04b5318c	//eor z12.d,z12.d,z21.d
3363.inst	0x04b631ad	//eor z13.d,z13.d,z22.d
3364.if	mixin == 1
3365	stp	x19,x21,[x0],#16
3366.endif
3367.inst	0x04b731ce	//eor z14.d,z14.d,z23.d
3368.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
3369	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
3370	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
3371	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
3372	st1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
3373	b	210f
3374200:
3375.inst	0x05a16011	//zip1 z17.s,z0.s,z1.s
3376.inst	0x05a16412	//zip2 z18.s,z0.s,z1.s
3377.inst	0x05a36053	//zip1 z19.s,z2.s,z3.s
3378.inst	0x05a36454	//zip2 z20.s,z2.s,z3.s
3379
3380.inst	0x05a56095	//zip1 z21.s,z4.s,z5.s
3381.inst	0x05a56496	//zip2 z22.s,z4.s,z5.s
3382.inst	0x05a760d7	//zip1 z23.s,z6.s,z7.s
3383.inst	0x05a764d8	//zip2 z24.s,z6.s,z7.s
3384
3385.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
3386.inst	0x05f36621	//zip2 z1.d,z17.d,z19.d
3387.inst	0x05f46242	//zip1 z2.d,z18.d,z20.d
3388.inst	0x05f46643	//zip2 z3.d,z18.d,z20.d
3389
3390.inst	0x05f762a4	//zip1 z4.d,z21.d,z23.d
3391.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
3392.inst	0x05f862c6	//zip1 z6.d,z22.d,z24.d
3393.inst	0x05f866c7	//zip2 z7.d,z22.d,z24.d
3394.if	mixin == 1
3395	eor	x7,x7,x8
3396.endif
3397.if	mixin == 1
3398	eor	x9,x9,x10
3399.endif
3400.inst	0x05a96111	//zip1 z17.s,z8.s,z9.s
3401.inst	0x05a96512	//zip2 z18.s,z8.s,z9.s
3402.inst	0x05ab6153	//zip1 z19.s,z10.s,z11.s
3403.inst	0x05ab6554	//zip2 z20.s,z10.s,z11.s
3404
3405.inst	0x05ad6195	//zip1 z21.s,z12.s,z13.s
3406.inst	0x05ad6596	//zip2 z22.s,z12.s,z13.s
3407.inst	0x05af61d7	//zip1 z23.s,z14.s,z15.s
3408.inst	0x05af65d8	//zip2 z24.s,z14.s,z15.s
3409
3410.inst	0x05f36228	//zip1 z8.d,z17.d,z19.d
3411.inst	0x05f36629	//zip2 z9.d,z17.d,z19.d
3412.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
3413.inst	0x05f4664b	//zip2 z11.d,z18.d,z20.d
3414
3415.inst	0x05f762ac	//zip1 z12.d,z21.d,z23.d
3416.inst	0x05f766ad	//zip2 z13.d,z21.d,z23.d
3417.inst	0x05f862ce	//zip1 z14.d,z22.d,z24.d
3418.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
3419.if	mixin == 1
3420	eor	x11,x11,x12
3421.endif
3422.if	mixin == 1
3423	eor	x13,x13,x14
3424.endif
3425.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
3426.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
3427.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
3428.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s
3429
3430.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
3431.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
3432.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
3433.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s
3434
3435.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
3436.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
3437.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
3438.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d
3439
3440.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
3441.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
3442.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
3443.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
3444.if	mixin == 1
3445	eor	x15,x15,x16
3446.endif
3447.if	mixin == 1
3448	eor	x17,x17,x18
3449.endif
3450.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
3451.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
3452.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
3453.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s
3454
3455.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
3456.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
3457.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
3458.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s
3459
3460.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
3461.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
3462.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
3463.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d
3464
3465.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
3466.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
3467.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
3468.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
3469.if	mixin == 1
3470	eor	x19,x19,x20
3471.endif
3472.if	mixin == 1
3473	eor	x21,x21,x22
3474.endif
3475.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
3476.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
3477.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
3478.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
3479.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
3480.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
3481.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
3482.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
3483.inst	0x04215101	//addvl x1,x1,8
3484.inst	0x04b13000	//eor z0.d,z0.d,z17.d
3485.inst	0x04b23084	//eor z4.d,z4.d,z18.d
3486.inst	0x04b33108	//eor z8.d,z8.d,z19.d
3487.inst	0x04b4318c	//eor z12.d,z12.d,z20.d
3488.inst	0x04b53021	//eor z1.d,z1.d,z21.d
3489.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
3490.inst	0x04b73129	//eor z9.d,z9.d,z23.d
3491.inst	0x04b831ad	//eor z13.d,z13.d,z24.d
3492.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
3493.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
3494.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
3495.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
3496.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
3497.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
3498.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
3499.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
3500.inst	0x04215101	//addvl x1,x1,8
3501.if	mixin == 1
3502	stp	x7,x9,[x0],#16
3503.endif
3504.inst	0x04b13042	//eor z2.d,z2.d,z17.d
3505.inst	0x04b230c6	//eor z6.d,z6.d,z18.d
3506.if	mixin == 1
3507	stp	x11,x13,[x0],#16
3508.endif
3509.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
3510.inst	0x04b431ce	//eor z14.d,z14.d,z20.d
3511.if	mixin == 1
3512	stp	x15,x17,[x0],#16
3513.endif
3514.inst	0x04b53063	//eor z3.d,z3.d,z21.d
3515.inst	0x04b630e7	//eor z7.d,z7.d,z22.d
3516.if	mixin == 1
3517	stp	x19,x21,[x0],#16
3518.endif
3519.inst	0x04b7316b	//eor z11.d,z11.d,z23.d
3520.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
3521.inst	0xe540e000	//st1w {z0.s},p0,[x0,#0,MUL VL]
3522.inst	0xe541e004	//st1w {z4.s},p0,[x0,#1,MUL VL]
3523.inst	0xe542e008	//st1w {z8.s},p0,[x0,#2,MUL VL]
3524.inst	0xe543e00c	//st1w {z12.s},p0,[x0,#3,MUL VL]
3525.inst	0xe544e001	//st1w {z1.s},p0,[x0,#4,MUL VL]
3526.inst	0xe545e005	//st1w {z5.s},p0,[x0,#5,MUL VL]
3527.inst	0xe546e009	//st1w {z9.s},p0,[x0,#6,MUL VL]
3528.inst	0xe547e00d	//st1w {z13.s},p0,[x0,#7,MUL VL]
3529.inst	0x04205100	//addvl x0,x0,8
3530.inst	0xe540e002	//st1w {z2.s},p0,[x0,#0,MUL VL]
3531.inst	0xe541e006	//st1w {z6.s},p0,[x0,#1,MUL VL]
3532.inst	0xe542e00a	//st1w {z10.s},p0,[x0,#2,MUL VL]
3533.inst	0xe543e00e	//st1w {z14.s},p0,[x0,#3,MUL VL]
3534.inst	0xe544e003	//st1w {z3.s},p0,[x0,#4,MUL VL]
3535.inst	0xe545e007	//st1w {z7.s},p0,[x0,#5,MUL VL]
3536.inst	0xe546e00b	//st1w {z11.s},p0,[x0,#6,MUL VL]
3537.inst	0xe547e00f	//st1w {z15.s},p0,[x0,#7,MUL VL]
3538.inst	0x04205100	//addvl x0,x0,8
3539210:
3540.inst	0x04b0e3fd	//incw x29, ALL, MUL #1
3541110:
35422:
3543	str	w29,[x4]
3544	ldp	d10,d11,[sp,16]
3545	ldp	d12,d13,[sp,32]
3546	ldp	d14,d15,[sp,48]
3547	ldp	x16,x17,[sp,64]
3548	ldp	x18,x19,[sp,80]
3549	ldp	x20,x21,[sp,96]
3550	ldp	x22,x23,[sp,112]
3551	ldp	x24,x25,[sp,128]
3552	ldp	x26,x27,[sp,144]
3553	ldp	x28,x29,[sp,160]
3554	ldr	x30,[sp,176]
3555	ldp	d8,d9,[sp],192
3556	AARCH64_VALIDATE_LINK_REGISTER
3557.Lreturn:
3558	ret
3559.size	ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
3560