xref: /linux/arch/s390/crypto/chacha-s390.S (revision da51bbcdbace8f43adf6066934c3926b656376e5)
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Original implementation written by Andy Polyakov, @dot-asm.
4 * This is an adaptation of the original code for kernel use.
5 *
6 * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
7 */
8
9#include <linux/linkage.h>
10#include <asm/nospec-insn.h>
11#include <asm/fpu-insn.h>
12
/* %r15 is used as the stack pointer by both functions in this file. */
13#define SP	%r15
/*
 * Size of the stack frame chacha20_vx allocates: 16*8 + 4*8 = 160 bytes.
 * NOTE(review): presumably 16 GPR slots plus 4 FPR slots of the standard
 * register save area - confirm against the s390 ABI in use.
 */
14#define FRAME	(16 * 8 + 4 * 8)
15
16	.data
17	.balign	32
18
/*
 * Constant table shared by chacha20_vx_4x and chacha20_vx; both reach it
 * through larl into %r7.  Byte offsets as used by the code below:
 *   0x00       the four ChaCha constant words (row 0 of the state)
 *   0x10-0x3f  block counter increments +1, +2, +3 (applied to word 0 only)
 *   0x40       VPERM selector that reverses the bytes of every 32-bit word
 *   0x50       lane offsets 0,1,2,3 for the replicated counter word (4x path)
 *   0x60-0x9f  each constant word replicated into all four lanes (4x path)
 * chacha20_vx loads 0x00-0x4f with a single VLM, so the order of the first
 * five vectors must not change.
 */
19SYM_DATA_START_LOCAL(sigma)
20	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral
21	.long	1,0,0,0						# counter increment +1
22	.long	2,0,0,0						# counter increment +2
23	.long	3,0,0,0						# counter increment +3
24	.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c	# byte swap
25
26	.long	0,1,2,3						# per-lane counter offsets
27	.long	0x61707865,0x61707865,0x61707865,0x61707865	# smashed sigma
28	.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
29	.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
30	.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574
31SYM_DATA_END(sigma)
32
33	.previous
34
/*
 * NOTE(review): presumably emits the indirect-branch thunk that the
 * BR_EX %r14 returns below rely on (see asm/nospec-insn.h) - confirm.
 */
35	GEN_BR_THUNK %r14
36
37	.text
38
39#############################################################################
40# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
41#		      const u32 *key, const u32 *counter)
42
/* Arguments of chacha20_vx_4x, in the registers the code reads them from. */
43#define	OUT		%r2
44#define	INP		%r3
45#define	LEN		%r4
46#define	KEY		%r5
47#define	COUNTER		%r6
48
/* BEPERM: byte-swap selector (sigma+0x40); CTR: lane offsets 0..3 (sigma+0x50). */
49#define BEPERM		%v31
50#define CTR		%v26
51
/* K0..K3: input state rows - constants, key[0..15], key[16..31], counter row. */
52#define K0		%v16
53#define K1		%v17
54#define K2		%v18
55#define K3		%v19
56
/*
 * Working state in word-sliced form: XAn/XBn/XCn/XDn holds word n of state
 * row A/B/C/D, with one 32-bit lane per block (four blocks in flight).
 */
57#define XA0		%v0
58#define XA1		%v1
59#define XA2		%v2
60#define XA3		%v3
61
62#define XB0		%v4
63#define XB1		%v5
64#define XB2		%v6
65#define XB3		%v7
66
67#define XC0		%v8
68#define XC1		%v9
69#define XC2		%v10
70#define XC3		%v11
71
72#define XD0		%v12
73#define XD1		%v13
74#define XD2		%v14
75#define XD3		%v15
76
/* Scratch vectors: transpose temporaries, then the 64 bytes of input data. */
77#define XT0		%v27
78#define XT1		%v28
79#define XT2		%v29
80#define XT3		%v30
81
/*
 * chacha20_vx_4x - XOR up to 256 bytes with the ChaCha20 keystream, computing
 * four consecutive 64-byte blocks in parallel.
 *
 * In:	%r2 = out, %r3 = inp, %r4 = len in bytes,
 *	%r5 = key (32 bytes are read), %r6 = counter (16 bytes are read).
 *
 * The four blocks use counter word 0 plus 0, 1, 2 and 3 (32-bit lane adds,
 * no carry into the following words).  The counter in memory is not updated.
 *
 * Only four blocks of keystream are produced: if len exceeds 256, just the
 * first 256 bytes are processed.  Any len from 0 to 256 is handled; a
 * trailing partial block is finished byte by byte from a stack copy of the
 * keystream.
 *
 * Stack: no frame is allocated.  %r6/%r7 are saved at 6*8(SP) and restored,
 * and the tail path writes 64 bytes at 8*8(SP).
 * NOTE(review): this assumes the caller provides a register save area of at
 * least 128 bytes at SP - confirm against the ABI in use.
 *
 * Modifies: %r0, %r1, %r2-%r5, %v0-%v19, %v26-%v31 and the condition code.
 * NOTE(review): vector register state is presumably owned by the caller
 * (kernel FPU section) - confirm at the call site.
 */
82SYM_FUNC_START(chacha20_vx_4x)
83	stmg	%r6,%r7,6*8(SP)
	ltgr	LEN,LEN			# empty request: restore and return
	jz	.Ldone_4x
84
85	larl	%r7,sigma
86	lhi	%r0,10			# 10 double rounds = 20 rounds
88
89	VL	K0,0,,%r7		# load sigma
90	VL	K1,0,,KEY		# load key
91	VL	K2,16,,KEY
92	VL	K3,0,,COUNTER		# load counter
93
94	VL	BEPERM,0x40,,%r7
95	VL	CTR,0x50,,%r7
96
97	VLM	XA0,XA3,0x60,%r7,4	# load [smashed] sigma
98
99	VREPF	XB0,K1,0		# smash the key
100	VREPF	XB1,K1,1
101	VREPF	XB2,K1,2
102	VREPF	XB3,K1,3
103
104	VREPF	XD0,K3,0
105	VREPF	XD1,K3,1
106	VREPF	XD2,K3,2
107	VREPF	XD3,K3,3
108	VAF	XD0,XD0,CTR		# lane i works on block counter + i
109
110	VREPF	XC0,K2,0
111	VREPF	XC1,K2,1
112	VREPF	XC2,K2,2
113	VREPF	XC3,K2,3
114
	/*
	 * One iteration is a double round.  First the column round on
	 * (XAn,XBn,XCn,XDn): a += b; d ^= a; d <<<= 16; c += d; b ^= c;
	 * b <<<= 12; a += b; d ^= a; d <<<= 8; c += d; b ^= c; b <<<= 7.
	 */
115.Loop_4x:
116	VAF	XA0,XA0,XB0
117	VX	XD0,XD0,XA0
118	VERLLF	XD0,XD0,16
119
120	VAF	XA1,XA1,XB1
121	VX	XD1,XD1,XA1
122	VERLLF	XD1,XD1,16
123
124	VAF	XA2,XA2,XB2
125	VX	XD2,XD2,XA2
126	VERLLF	XD2,XD2,16
127
128	VAF	XA3,XA3,XB3
129	VX	XD3,XD3,XA3
130	VERLLF	XD3,XD3,16
131
132	VAF	XC0,XC0,XD0
133	VX	XB0,XB0,XC0
134	VERLLF	XB0,XB0,12
135
136	VAF	XC1,XC1,XD1
137	VX	XB1,XB1,XC1
138	VERLLF	XB1,XB1,12
139
140	VAF	XC2,XC2,XD2
141	VX	XB2,XB2,XC2
142	VERLLF	XB2,XB2,12
143
144	VAF	XC3,XC3,XD3
145	VX	XB3,XB3,XC3
146	VERLLF	XB3,XB3,12
147
148	VAF	XA0,XA0,XB0
149	VX	XD0,XD0,XA0
150	VERLLF	XD0,XD0,8
151
152	VAF	XA1,XA1,XB1
153	VX	XD1,XD1,XA1
154	VERLLF	XD1,XD1,8
155
156	VAF	XA2,XA2,XB2
157	VX	XD2,XD2,XA2
158	VERLLF	XD2,XD2,8
159
160	VAF	XA3,XA3,XB3
161	VX	XD3,XD3,XA3
162	VERLLF	XD3,XD3,8
163
164	VAF	XC0,XC0,XD0
165	VX	XB0,XB0,XC0
166	VERLLF	XB0,XB0,7
167
168	VAF	XC1,XC1,XD1
169	VX	XB1,XB1,XC1
170	VERLLF	XB1,XB1,7
171
172	VAF	XC2,XC2,XD2
173	VX	XB2,XB2,XC2
174	VERLLF	XB2,XB2,7
175
176	VAF	XC3,XC3,XD3
177	VX	XB3,XB3,XC3
178	VERLLF	XB3,XB3,7
179
	/*
	 * Diagonal round: same quarter round on (XA0,XB1,XC2,XD3),
	 * (XA1,XB2,XC3,XD0), (XA2,XB3,XC0,XD1) and (XA3,XB0,XC1,XD2).
	 * Registers are picked by index, so no data has to be rotated.
	 */
180	VAF	XA0,XA0,XB1
181	VX	XD3,XD3,XA0
182	VERLLF	XD3,XD3,16
183
184	VAF	XA1,XA1,XB2
185	VX	XD0,XD0,XA1
186	VERLLF	XD0,XD0,16
187
188	VAF	XA2,XA2,XB3
189	VX	XD1,XD1,XA2
190	VERLLF	XD1,XD1,16
191
192	VAF	XA3,XA3,XB0
193	VX	XD2,XD2,XA3
194	VERLLF	XD2,XD2,16
195
196	VAF	XC2,XC2,XD3
197	VX	XB1,XB1,XC2
198	VERLLF	XB1,XB1,12
199
200	VAF	XC3,XC3,XD0
201	VX	XB2,XB2,XC3
202	VERLLF	XB2,XB2,12
203
204	VAF	XC0,XC0,XD1
205	VX	XB3,XB3,XC0
206	VERLLF	XB3,XB3,12
207
208	VAF	XC1,XC1,XD2
209	VX	XB0,XB0,XC1
210	VERLLF	XB0,XB0,12
211
212	VAF	XA0,XA0,XB1
213	VX	XD3,XD3,XA0
214	VERLLF	XD3,XD3,8
215
216	VAF	XA1,XA1,XB2
217	VX	XD0,XD0,XA1
218	VERLLF	XD0,XD0,8
219
220	VAF	XA2,XA2,XB3
221	VX	XD1,XD1,XA2
222	VERLLF	XD1,XD1,8
223
224	VAF	XA3,XA3,XB0
225	VX	XD2,XD2,XA3
226	VERLLF	XD2,XD2,8
227
228	VAF	XC2,XC2,XD3
229	VX	XB1,XB1,XC2
230	VERLLF	XB1,XB1,7
231
232	VAF	XC3,XC3,XD0
233	VX	XB2,XB2,XC3
234	VERLLF	XB2,XB2,7
235
236	VAF	XC0,XC0,XD1
237	VX	XB3,XB3,XC0
238	VERLLF	XB3,XB3,7
239
240	VAF	XC1,XC1,XD2
241	VX	XB0,XB0,XC1
242	VERLLF	XB0,XB0,7
243	brct	%r0,.Loop_4x
244
	/*
	 * K3 added after the transpose carries only the base counter, so the
	 * per-lane offsets are added to the counter word here.
	 */
245	VAF	XD0,XD0,CTR
246
	/*
	 * 4x4 word transpose of each row group: afterwards XAn/XBn/XCn/XDn
	 * are the four state rows of block n instead of word n of all blocks.
	 */
247	VMRHF	XT0,XA0,XA1		# transpose data
248	VMRHF	XT1,XA2,XA3
249	VMRLF	XT2,XA0,XA1
250	VMRLF	XT3,XA2,XA3
251	VPDI	XA0,XT0,XT1,0b0000
252	VPDI	XA1,XT0,XT1,0b0101
253	VPDI	XA2,XT2,XT3,0b0000
254	VPDI	XA3,XT2,XT3,0b0101
255
256	VMRHF	XT0,XB0,XB1
257	VMRHF	XT1,XB2,XB3
258	VMRLF	XT2,XB0,XB1
259	VMRLF	XT3,XB2,XB3
260	VPDI	XB0,XT0,XT1,0b0000
261	VPDI	XB1,XT0,XT1,0b0101
262	VPDI	XB2,XT2,XT3,0b0000
263	VPDI	XB3,XT2,XT3,0b0101
264
265	VMRHF	XT0,XC0,XC1
266	VMRHF	XT1,XC2,XC3
267	VMRLF	XT2,XC0,XC1
268	VMRLF	XT3,XC2,XC3
269	VPDI	XC0,XT0,XT1,0b0000
270	VPDI	XC1,XT0,XT1,0b0101
271	VPDI	XC2,XT2,XT3,0b0000
272	VPDI	XC3,XT2,XT3,0b0101
273
274	VMRHF	XT0,XD0,XD1
275	VMRHF	XT1,XD2,XD3
276	VMRLF	XT2,XD0,XD1
277	VMRLF	XT3,XD2,XD3
278	VPDI	XD0,XT0,XT1,0b0000
279	VPDI	XD1,XT0,XT1,0b0101
280	VPDI	XD2,XT2,XT3,0b0000
281	VPDI	XD3,XT2,XT3,0b0101
282
	/* Block 0: add the input state, byte-swap each word into stream order. */
283	VAF	XA0,XA0,K0
284	VAF	XB0,XB0,K1
285	VAF	XC0,XC0,K2
286	VAF	XD0,XD0,K3
287
288	VPERM	XA0,XA0,XA0,BEPERM
289	VPERM	XB0,XB0,XB0,BEPERM
290	VPERM	XC0,XC0,XC0,BEPERM
291	VPERM	XD0,XD0,XD0,BEPERM
292
	/*
	 * Fewer than 64 bytes in total: finish in the byte-wise tail, which
	 * consumes the keystream now held in XA0/XB0/XC0/XD0.
	 */
	clgfi	LEN,0x40
	jl	.Ltail_4x

293	VLM	XT0,XT3,0,INP,0
294
295	VX	XT0,XT0,XA0
296	VX	XT1,XT1,XB0
297	VX	XT2,XT2,XC0
298	VX	XT3,XT3,XD0
299
300	VSTM	XT0,XT3,0,OUT,0
301
302	la	INP,0x40(INP)
303	la	OUT,0x40(OUT)
304	aghi	LEN,-0x40
	je	.Ldone_4x		# exactly one block: LEN must not reach the tail as 0
305
	/* Block 1: its keystream is staged in XA0..XD0 for either path below. */
306	VAF	XA0,XA1,K0
307	VAF	XB0,XB1,K1
308	VAF	XC0,XC1,K2
309	VAF	XD0,XD1,K3
310
311	VPERM	XA0,XA0,XA0,BEPERM
312	VPERM	XB0,XB0,XB0,BEPERM
313	VPERM	XC0,XC0,XC0,BEPERM
314	VPERM	XD0,XD0,XD0,BEPERM
315
316	clgfi	LEN,0x40
317	jl	.Ltail_4x
318
319	VLM	XT0,XT3,0,INP,0
320
321	VX	XT0,XT0,XA0
322	VX	XT1,XT1,XB0
323	VX	XT2,XT2,XC0
324	VX	XT3,XT3,XD0
325
326	VSTM	XT0,XT3,0,OUT,0
327
328	la	INP,0x40(INP)
329	la	OUT,0x40(OUT)
330	aghi	LEN,-0x40
331	je	.Ldone_4x
332
	/* Block 2. */
333	VAF	XA0,XA2,K0
334	VAF	XB0,XB2,K1
335	VAF	XC0,XC2,K2
336	VAF	XD0,XD2,K3
337
338	VPERM	XA0,XA0,XA0,BEPERM
339	VPERM	XB0,XB0,XB0,BEPERM
340	VPERM	XC0,XC0,XC0,BEPERM
341	VPERM	XD0,XD0,XD0,BEPERM
342
343	clgfi	LEN,0x40
344	jl	.Ltail_4x
345
346	VLM	XT0,XT3,0,INP,0
347
348	VX	XT0,XT0,XA0
349	VX	XT1,XT1,XB0
350	VX	XT2,XT2,XC0
351	VX	XT3,XT3,XD0
352
353	VSTM	XT0,XT3,0,OUT,0
354
355	la	INP,0x40(INP)
356	la	OUT,0x40(OUT)
357	aghi	LEN,-0x40
358	je	.Ldone_4x
359
	/* Block 3, the last one available. */
360	VAF	XA0,XA3,K0
361	VAF	XB0,XB3,K1
362	VAF	XC0,XC3,K2
363	VAF	XD0,XD3,K3
364
365	VPERM	XA0,XA0,XA0,BEPERM
366	VPERM	XB0,XB0,XB0,BEPERM
367	VPERM	XC0,XC0,XC0,BEPERM
368	VPERM	XD0,XD0,XD0,BEPERM
369
370	clgfi	LEN,0x40
371	jl	.Ltail_4x
372
373	VLM	XT0,XT3,0,INP,0
374
375	VX	XT0,XT0,XA0
376	VX	XT1,XT1,XB0
377	VX	XT2,XT2,XC0
378	VX	XT3,XT3,XD0
379
380	VSTM	XT0,XT3,0,OUT,0
381
382.Ldone_4x:
383	lmg	%r6,%r7,6*8(SP)
384	BR_EX	%r14
385
	/*
	 * Partial last block, 1 <= LEN <= 63 on every path that gets here.
	 * The keystream block is spilled to 8*8(SP) and XORed byte by byte.
	 */
386.Ltail_4x:
387	VLR	XT0,XC0
388	VLR	XT1,XD0
389
390	VST	XA0,8*8+0x00,,SP
391	VST	XB0,8*8+0x10,,SP
392	VST	XT0,8*8+0x20,,SP
393	VST	XT1,8*8+0x30,,SP
394
395	lghi	%r1,0			# byte index
396
397.Loop_tail_4x:
398	llgc	%r5,0(%r1,INP)		# input byte (KEY pointer is dead by now)
399	llgc	%r6,8*8(%r1,SP)		# keystream byte
400	xr	%r6,%r5
401	stc	%r6,0(%r1,OUT)
402	la	%r1,1(%r1)
403	brct	LEN,.Loop_tail_4x	# LEN counts the remaining bytes
404
405	lmg	%r6,%r7,6*8(SP)
406	BR_EX	%r14
407SYM_FUNC_END(chacha20_vx_4x)
408
/*
 * Release the chacha20_vx_4x aliases that are defined again further down.
 * CTR and the XA/XB/XC/XD/XT names are left defined.
 */
409#undef	OUT
410#undef	INP
411#undef	LEN
412#undef	KEY
413#undef	COUNTER
414
415#undef BEPERM
416
417#undef K0
418#undef K1
419#undef K2
420#undef K3
421
422
423#############################################################################
424# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
425#		   const u32 *key, const u32 *counter)
426
/* Arguments of chacha20_vx, in the registers the code reads them from. */
427#define	OUT		%r2
428#define	INP		%r3
429#define	LEN		%r4
430#define	KEY		%r5
431#define	COUNTER		%r6
432
/* Byte-swap selector, loaded from sigma+0x40. */
433#define BEPERM		%v31
434
/*
 * Input state rows.  K0 shares %v27 with T0: K0..BEPERM (%v27-%v31) are
 * filled by one VLM from the sigma table, and K0 is loaded again after
 * T0..T3 have been used for input data.
 */
435#define K0		%v27
436#define K1		%v24
437#define K2		%v25
438#define K3		%v26
439
/* An..Dn: the four state rows of block n; six blocks (n = 0..5) in flight. */
440#define A0		%v0
441#define B0		%v1
442#define C0		%v2
443#define D0		%v3
444
445#define A1		%v4
446#define B1		%v5
447#define C1		%v6
448#define D1		%v7
449
450#define A2		%v8
451#define B2		%v9
452#define C2		%v10
453#define D2		%v11
454
455#define A3		%v12
456#define B3		%v13
457#define C3		%v14
458#define D3		%v15
459
460#define A4		%v16
461#define B4		%v17
462#define C4		%v18
463#define D4		%v19
464
465#define A5		%v20
466#define B5		%v21
467#define C5		%v22
468#define D5		%v23
469
/*
 * T1..T3 hold the counter increments +1..+3 from the sigma table, the saved
 * counter rows of blocks 1..3 during the rounds, and (with T0) input data.
 */
470#define T0		%v27
471#define T1		%v28
472#define T2		%v29
473#define T3		%v30
474
/*
 * chacha20_vx - XOR len bytes with the ChaCha20 keystream, six 64-byte
 * blocks per outer iteration, one vector register per state row.
 *
 * In:	%r2 = out, %r3 = inp, %r4 = len in bytes,
 *	%r5 = key (32 bytes are read), %r6 = counter (16 bytes are read).
 *
 * Requests of 256 bytes or less are passed to chacha20_vx_4x with a plain
 * branch before anything is saved, so that function returns to our caller.
 *
 * The block counter is word 0 of the counter row; it advances through 32-bit
 * lane adds (no carry into the following words) and is kept in K3 only - the
 * counter in memory is not written back.
 *
 * Stack: a FRAME byte frame with back chain is allocated; %r6/%r7 are saved
 * in the caller's frame at 6*8 and restored from FRAME+6*8 on exit.
 *
 * Modifies: %r0, %r1, %r2-%r5, %v0-%v31 and the condition code.
 * NOTE(review): vector register state is presumably owned by the caller
 * (kernel FPU section) - confirm at the call site.
 */
475SYM_FUNC_START(chacha20_vx)
476	clgfi	LEN,256
477	jle	chacha20_vx_4x
478	stmg	%r6,%r7,6*8(SP)
479
480	lghi	%r1,-FRAME
481	lgr	%r0,SP
482	la	SP,0(%r1,SP)
483	stg	%r0,0(SP)		# back-chain
484
485	larl	%r7,sigma
486	lhi	%r0,10			# 10 double rounds = 20 rounds
487
488	VLM	K1,K2,0,KEY,0		# load key
489	VL	K3,0,,COUNTER		# load counter
490
491	VLM	K0,BEPERM,0,%r7,4	# load sigma, increments, ...
492
	/* Set up six copies of the state; block n uses counter row K3 + n. */
493.Loop_outer_vx:
494	VLR	A0,K0
495	VLR	B0,K1
496	VLR	A1,K0
497	VLR	B1,K1
498	VLR	A2,K0
499	VLR	B2,K1
500	VLR	A3,K0
501	VLR	B3,K1
502	VLR	A4,K0
503	VLR	B4,K1
504	VLR	A5,K0
505	VLR	B5,K1
506
507	VLR	D0,K3
508	VAF	D1,K3,T1		# K[3]+1
509	VAF	D2,K3,T2		# K[3]+2
510	VAF	D3,K3,T3		# K[3]+3
511	VAF	D4,D2,T2		# K[3]+4
512	VAF	D5,D2,T3		# K[3]+5
513
514	VLR	C0,K2
515	VLR	C1,K2
516	VLR	C2,K2
517	VLR	C3,K2
518	VLR	C4,K2
519	VLR	C5,K2
520
	/* Keep the initial counter rows of blocks 1..3 for the final add. */
521	VLR	T1,D1
522	VLR	T2,D2
523	VLR	T3,D3
524
	/*
	 * One iteration is a double round over all six blocks.  Column round:
	 * a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12;
	 * a += b; d ^= a; d <<<= 8;  c += d; b ^= c; b <<<= 7.
	 */
525.Loop_vx:
526	VAF	A0,A0,B0
527	VAF	A1,A1,B1
528	VAF	A2,A2,B2
529	VAF	A3,A3,B3
530	VAF	A4,A4,B4
531	VAF	A5,A5,B5
532	VX	D0,D0,A0
533	VX	D1,D1,A1
534	VX	D2,D2,A2
535	VX	D3,D3,A3
536	VX	D4,D4,A4
537	VX	D5,D5,A5
538	VERLLF	D0,D0,16
539	VERLLF	D1,D1,16
540	VERLLF	D2,D2,16
541	VERLLF	D3,D3,16
542	VERLLF	D4,D4,16
543	VERLLF	D5,D5,16
544
545	VAF	C0,C0,D0
546	VAF	C1,C1,D1
547	VAF	C2,C2,D2
548	VAF	C3,C3,D3
549	VAF	C4,C4,D4
550	VAF	C5,C5,D5
551	VX	B0,B0,C0
552	VX	B1,B1,C1
553	VX	B2,B2,C2
554	VX	B3,B3,C3
555	VX	B4,B4,C4
556	VX	B5,B5,C5
557	VERLLF	B0,B0,12
558	VERLLF	B1,B1,12
559	VERLLF	B2,B2,12
560	VERLLF	B3,B3,12
561	VERLLF	B4,B4,12
562	VERLLF	B5,B5,12
563
564	VAF	A0,A0,B0
565	VAF	A1,A1,B1
566	VAF	A2,A2,B2
567	VAF	A3,A3,B3
568	VAF	A4,A4,B4
569	VAF	A5,A5,B5
570	VX	D0,D0,A0
571	VX	D1,D1,A1
572	VX	D2,D2,A2
573	VX	D3,D3,A3
574	VX	D4,D4,A4
575	VX	D5,D5,A5
576	VERLLF	D0,D0,8
577	VERLLF	D1,D1,8
578	VERLLF	D2,D2,8
579	VERLLF	D3,D3,8
580	VERLLF	D4,D4,8
581	VERLLF	D5,D5,8
582
583	VAF	C0,C0,D0
584	VAF	C1,C1,D1
585	VAF	C2,C2,D2
586	VAF	C3,C3,D3
587	VAF	C4,C4,D4
588	VAF	C5,C5,D5
589	VX	B0,B0,C0
590	VX	B1,B1,C1
591	VX	B2,B2,C2
592	VX	B3,B3,C3
593	VX	B4,B4,C4
594	VX	B5,B5,C5
595	VERLLF	B0,B0,7
596	VERLLF	B1,B1,7
597	VERLLF	B2,B2,7
598	VERLLF	B3,B3,7
599	VERLLF	B4,B4,7
600	VERLLF	B5,B5,7
601
	/*
	 * Rotate rows C, B and D by 8, 4 and 12 bytes so that the diagonals
	 * line up as columns for the second half of the double round.
	 */
602	VSLDB	C0,C0,C0,8
603	VSLDB	C1,C1,C1,8
604	VSLDB	C2,C2,C2,8
605	VSLDB	C3,C3,C3,8
606	VSLDB	C4,C4,C4,8
607	VSLDB	C5,C5,C5,8
608	VSLDB	B0,B0,B0,4
609	VSLDB	B1,B1,B1,4
610	VSLDB	B2,B2,B2,4
611	VSLDB	B3,B3,B3,4
612	VSLDB	B4,B4,B4,4
613	VSLDB	B5,B5,B5,4
614	VSLDB	D0,D0,D0,12
615	VSLDB	D1,D1,D1,12
616	VSLDB	D2,D2,D2,12
617	VSLDB	D3,D3,D3,12
618	VSLDB	D4,D4,D4,12
619	VSLDB	D5,D5,D5,12
620
	/* Diagonal round: the same quarter round on the rotated rows. */
621	VAF	A0,A0,B0
622	VAF	A1,A1,B1
623	VAF	A2,A2,B2
624	VAF	A3,A3,B3
625	VAF	A4,A4,B4
626	VAF	A5,A5,B5
627	VX	D0,D0,A0
628	VX	D1,D1,A1
629	VX	D2,D2,A2
630	VX	D3,D3,A3
631	VX	D4,D4,A4
632	VX	D5,D5,A5
633	VERLLF	D0,D0,16
634	VERLLF	D1,D1,16
635	VERLLF	D2,D2,16
636	VERLLF	D3,D3,16
637	VERLLF	D4,D4,16
638	VERLLF	D5,D5,16
639
640	VAF	C0,C0,D0
641	VAF	C1,C1,D1
642	VAF	C2,C2,D2
643	VAF	C3,C3,D3
644	VAF	C4,C4,D4
645	VAF	C5,C5,D5
646	VX	B0,B0,C0
647	VX	B1,B1,C1
648	VX	B2,B2,C2
649	VX	B3,B3,C3
650	VX	B4,B4,C4
651	VX	B5,B5,C5
652	VERLLF	B0,B0,12
653	VERLLF	B1,B1,12
654	VERLLF	B2,B2,12
655	VERLLF	B3,B3,12
656	VERLLF	B4,B4,12
657	VERLLF	B5,B5,12
658
659	VAF	A0,A0,B0
660	VAF	A1,A1,B1
661	VAF	A2,A2,B2
662	VAF	A3,A3,B3
663	VAF	A4,A4,B4
664	VAF	A5,A5,B5
665	VX	D0,D0,A0
666	VX	D1,D1,A1
667	VX	D2,D2,A2
668	VX	D3,D3,A3
669	VX	D4,D4,A4
670	VX	D5,D5,A5
671	VERLLF	D0,D0,8
672	VERLLF	D1,D1,8
673	VERLLF	D2,D2,8
674	VERLLF	D3,D3,8
675	VERLLF	D4,D4,8
676	VERLLF	D5,D5,8
677
678	VAF	C0,C0,D0
679	VAF	C1,C1,D1
680	VAF	C2,C2,D2
681	VAF	C3,C3,D3
682	VAF	C4,C4,D4
683	VAF	C5,C5,D5
684	VX	B0,B0,C0
685	VX	B1,B1,C1
686	VX	B2,B2,C2
687	VX	B3,B3,C3
688	VX	B4,B4,C4
689	VX	B5,B5,C5
690	VERLLF	B0,B0,7
691	VERLLF	B1,B1,7
692	VERLLF	B2,B2,7
693	VERLLF	B3,B3,7
694	VERLLF	B4,B4,7
695	VERLLF	B5,B5,7
696
	/* Undo the rotation: C by 8, B by 12 and D by 4 bytes. */
697	VSLDB	C0,C0,C0,8
698	VSLDB	C1,C1,C1,8
699	VSLDB	C2,C2,C2,8
700	VSLDB	C3,C3,C3,8
701	VSLDB	C4,C4,C4,8
702	VSLDB	C5,C5,C5,8
703	VSLDB	B0,B0,B0,12
704	VSLDB	B1,B1,B1,12
705	VSLDB	B2,B2,B2,12
706	VSLDB	B3,B3,B3,12
707	VSLDB	B4,B4,B4,12
708	VSLDB	B5,B5,B5,12
709	VSLDB	D0,D0,D0,4
710	VSLDB	D1,D1,D1,4
711	VSLDB	D2,D2,D2,4
712	VSLDB	D3,D3,D3,4
713	VSLDB	D4,D4,D4,4
714	VSLDB	D5,D5,D5,4
715	brct	%r0,.Loop_vx
716
	/*
	 * Block 0: add the input state.  D1 gets its saved counter row now,
	 * while T1 still holds it; the input load below overwrites T0..T3.
	 */
717	VAF	A0,A0,K0
718	VAF	B0,B0,K1
719	VAF	C0,C0,K2
720	VAF	D0,D0,K3
721	VAF	A1,A1,K0
722	VAF	D1,D1,T1		# +K[3]+1
723
	/* Each keystream block is staged byte-swapped in A0..D0. */
724	VPERM	A0,A0,A0,BEPERM
725	VPERM	B0,B0,B0,BEPERM
726	VPERM	C0,C0,C0,BEPERM
727	VPERM	D0,D0,D0,BEPERM
728
729	clgfi	LEN,0x40
730	jl	.Ltail_vx
731
732	VAF	D2,D2,T2		# +K[3]+2
733	VAF	D3,D3,T3		# +K[3]+3
734	VLM	T0,T3,0,INP,0
735
736	VX	A0,A0,T0
737	VX	B0,B0,T1
738	VX	C0,C0,T2
739	VX	D0,D0,T3
740
741	VLM	K0,T3,0,%r7,4		# re-load sigma and increments
742
743	VSTM	A0,D0,0,OUT,0
744
745	la	INP,0x40(INP)
746	la	OUT,0x40(OUT)
747	aghi	LEN,-0x40
748	je	.Ldone_vx
749
	/* Block 1: A1 and D1 were completed above. */
750	VAF	B1,B1,K1
751	VAF	C1,C1,K2
752
753	VPERM	A0,A1,A1,BEPERM
754	VPERM	B0,B1,B1,BEPERM
755	VPERM	C0,C1,C1,BEPERM
756	VPERM	D0,D1,D1,BEPERM
757
758	clgfi	LEN,0x40
759	jl	.Ltail_vx
760
	/* A1..D1 are free from here on and serve as the input data buffer. */
761	VLM	A1,D1,0,INP,0
762
763	VX	A0,A0,A1
764	VX	B0,B0,B1
765	VX	C0,C0,C1
766	VX	D0,D0,D1
767
768	VSTM	A0,D0,0,OUT,0
769
770	la	INP,0x40(INP)
771	la	OUT,0x40(OUT)
772	aghi	LEN,-0x40
773	je	.Ldone_vx
774
	/* Block 2: D2 was completed above. */
775	VAF	A2,A2,K0
776	VAF	B2,B2,K1
777	VAF	C2,C2,K2
778
779	VPERM	A0,A2,A2,BEPERM
780	VPERM	B0,B2,B2,BEPERM
781	VPERM	C0,C2,C2,BEPERM
782	VPERM	D0,D2,D2,BEPERM
783
784	clgfi	LEN,0x40
785	jl	.Ltail_vx
786
787	VLM	A1,D1,0,INP,0
788
789	VX	A0,A0,A1
790	VX	B0,B0,B1
791	VX	C0,C0,C1
792	VX	D0,D0,D1
793
794	VSTM	A0,D0,0,OUT,0
795
796	la	INP,0x40(INP)
797	la	OUT,0x40(OUT)
798	aghi	LEN,-0x40
799	je	.Ldone_vx
800
	/*
	 * Block 3: D3 was completed above.  D2 is no longer needed as data
	 * and is reused to rebuild the counter rows for blocks 4 and 5.
	 */
801	VAF	A3,A3,K0
802	VAF	B3,B3,K1
803	VAF	C3,C3,K2
804	VAF	D2,K3,T3		# K[3]+3
805
806	VPERM	A0,A3,A3,BEPERM
807	VPERM	B0,B3,B3,BEPERM
808	VPERM	C0,C3,C3,BEPERM
809	VPERM	D0,D3,D3,BEPERM
810
811	clgfi	LEN,0x40
812	jl	.Ltail_vx
813
814	VAF	D3,D2,T1		# K[3]+4
815	VLM	A1,D1,0,INP,0
816
817	VX	A0,A0,A1
818	VX	B0,B0,B1
819	VX	C0,C0,C1
820	VX	D0,D0,D1
821
822	VSTM	A0,D0,0,OUT,0
823
824	la	INP,0x40(INP)
825	la	OUT,0x40(OUT)
826	aghi	LEN,-0x40
827	je	.Ldone_vx
828
	/* Block 4; K3 is also advanced by 6 for the next outer iteration. */
829	VAF	A4,A4,K0
830	VAF	B4,B4,K1
831	VAF	C4,C4,K2
832	VAF	D4,D4,D3		# +K[3]+4
833	VAF	D3,D3,T1		# K[3]+5
834	VAF	K3,D2,T3		# K[3]+=6
835
836	VPERM	A0,A4,A4,BEPERM
837	VPERM	B0,B4,B4,BEPERM
838	VPERM	C0,C4,C4,BEPERM
839	VPERM	D0,D4,D4,BEPERM
840
841	clgfi	LEN,0x40
842	jl	.Ltail_vx
843
844	VLM	A1,D1,0,INP,0
845
846	VX	A0,A0,A1
847	VX	B0,B0,B1
848	VX	C0,C0,C1
849	VX	D0,D0,D1
850
851	VSTM	A0,D0,0,OUT,0
852
853	la	INP,0x40(INP)
854	la	OUT,0x40(OUT)
855	aghi	LEN,-0x40
856	je	.Ldone_vx
857
	/* Block 5. */
858	VAF	A5,A5,K0
859	VAF	B5,B5,K1
860	VAF	C5,C5,K2
861	VAF	D5,D5,D3		# +K[3]+5
862
863	VPERM	A0,A5,A5,BEPERM
864	VPERM	B0,B5,B5,BEPERM
865	VPERM	C0,C5,C5,BEPERM
866	VPERM	D0,D5,D5,BEPERM
867
868	clgfi	LEN,0x40
869	jl	.Ltail_vx
870
871	VLM	A1,D1,0,INP,0
872
873	VX	A0,A0,A1
874	VX	B0,B0,B1
875	VX	C0,C0,C1
876	VX	D0,D0,D1
877
878	VSTM	A0,D0,0,OUT,0
879
880	la	INP,0x40(INP)
881	la	OUT,0x40(OUT)
882	lhi	%r0,10			# round counter for the next six blocks
883	aghi	LEN,-0x40
884	jne	.Loop_outer_vx
885
886.Ldone_vx:
887	lmg	%r6,%r7,FRAME+6*8(SP)
888	la	SP,FRAME(SP)
889	BR_EX	%r14
890
	/*
	 * Partial last block.  LEN is 1..63 here: every full block above
	 * leaves through je/jne when LEN reaches zero.  The staged keystream
	 * is spilled to 8*8(SP) inside our frame and XORed byte by byte.
	 */
891.Ltail_vx:
892	VSTM	A0,D0,8*8,SP,3
893	lghi	%r1,0			# byte index
894
895.Loop_tail_vx:
896	llgc	%r5,0(%r1,INP)		# input byte (KEY pointer is dead by now)
897	llgc	%r6,8*8(%r1,SP)		# keystream byte
898	xr	%r6,%r5
899	stc	%r6,0(%r1,OUT)
900	la	%r1,1(%r1)
901	brct	LEN,.Loop_tail_vx	# LEN counts the remaining bytes
902
903	lmg	%r6,%r7,FRAME+6*8(SP)
904	la	SP,FRAME(SP)
905	BR_EX	%r14
906SYM_FUNC_END(chacha20_vx)
907
/*
 * Switch back to the section that was active before the most recent section
 * change.  NOTE(review): no section switch follows the .text directive above,
 * so this looks redundant - confirm before removing.
 */
908.previous
909