xref: /linux/arch/powerpc/crypto/chacha-p10le-8x.S (revision 02091cbe9cc4f18167208eec1d6de636cc731817)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2#
3# Accelerated chacha20 implementation for ppc64le.
4#
5# Copyright 2023- IBM Corp. All rights reserved
6#
7#===================================================================================
8# Written by Danny Tsen <dtsen@us.ibm.com>
9#
10# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
11#				 size_t len, int nrounds);
12#
13# do rounds,  8 quarter rounds
14# 1.  a += b; d ^= a; d <<<= 16;
15# 2.  c += d; b ^= c; b <<<= 12;
16# 3.  a += b; d ^= a; d <<<= 8;
17# 4.  c += d; b ^= c; b <<<= 7
18#
19# row1 = (row1 + row2),  row4 = row1 xor row4,  row4 rotate each word by 16
20# row3 = (row3 + row4),  row2 = row3 xor row2,  row2 rotate each word by 12
21# row1 = (row1 + row2), row4 = row1 xor row4,  row4 rotate each word by 8
22# row3 = (row3 + row4), row2 = row3 xor row2,  row2 rotate each word by 7
23#
24# 4 blocks (a b c d)
25#
26# a0 b0 c0 d0
27# a1 b1 c1 d1
28# ...
29# a4 b4 c4 d4
30# ...
31# a8 b8 c8 d8
32# ...
33# a12 b12 c12 d12
34# a13 ...
35# a14 ...
36# a15 b15 c15 d15
37#
38# Column round (v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
40#
41
42#include <asm/ppc_asm.h>
43#include <asm/asm-offsets.h>
44#include <asm/asm-compat.h>
45#include <linux/linkage.h>
46
47.machine	"any"
48.text
49
# SAVE_GPR: store general-purpose register GPR as a doubleword at
# OFFSET bytes from the address held in register FRAME.
.macro	SAVE_GPR GPR OFFSET FRAME
	std	\GPR, \OFFSET(\FRAME)
.endm
53
# SAVE_VRS: store vector register VRS at FRAME + OFFSET using the
# indexed form stvx.  GPR 16 is overwritten with OFFSET.
.macro	SAVE_VRS VRS OFFSET FRAME
	li	16, \OFFSET		# r16 = byte offset used as index
	stvx	\VRS, 16, \FRAME
.endm
58
# SAVE_VSX: store vector-scalar register VSX at FRAME + OFFSET using the
# indexed form stxvx.  GPR 16 is overwritten with OFFSET.
.macro	SAVE_VSX VSX OFFSET FRAME
	li	16, \OFFSET		# r16 = byte offset used as index
	stxvx	\VSX, 16, \FRAME
.endm
63
# RESTORE_GPR: reload general-purpose register GPR from the doubleword at
# OFFSET bytes from the address held in register FRAME.
.macro	RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR, \OFFSET(\FRAME)
.endm
67
# RESTORE_VRS: reload vector register VRS from FRAME + OFFSET using the
# indexed form lvx.  GPR 16 is overwritten with OFFSET.
.macro	RESTORE_VRS VRS OFFSET FRAME
	li	16, \OFFSET		# r16 = byte offset used as index
	lvx	\VRS, 16, \FRAME
.endm
72
# RESTORE_VSX: reload vector-scalar register VSX from FRAME + OFFSET using
# the indexed form lxvx.  GPR 16 is overwritten with OFFSET.
.macro	RESTORE_VSX VSX OFFSET FRAME
	li	16, \OFFSET		# r16 = byte offset used as index
	lxvx	\VSX, 16, \FRAME
.endm
77
# SAVE_REGS: function prologue.
#   - LR is stored at 16(r1) of the caller frame
#   - a 752-byte frame is allocated with stdu
#   - GPR 14..31 are stored at 112..248(r1)
#   - VR 20..31 are stored at 256..432(r1)   (base r9 = r1 + 256)
#   - VSR 14..31 are stored at 448..720(r1)  (same base r9)
# Overwrites r0, r9 and r16 (r16 only after its own value has been stored).
.macro SAVE_REGS
	mflr	0
	std	0, 16(1)
	stdu	1, -752(1)

	std	14, 112(1)
	std	15, 120(1)
	std	16, 128(1)
	std	17, 136(1)
	std	18, 144(1)
	std	19, 152(1)
	std	20, 160(1)
	std	21, 168(1)
	std	22, 176(1)
	std	23, 184(1)
	std	24, 192(1)
	std	25, 200(1)
	std	26, 208(1)
	std	27, 216(1)
	std	28, 224(1)
	std	29, 232(1)
	std	30, 240(1)
	std	31, 248(1)

	addi	9, 1, 256		# r9 = base of the vector save area
	SAVE_VRS 20,   0, 9
	SAVE_VRS 21,  16, 9
	SAVE_VRS 22,  32, 9
	SAVE_VRS 23,  48, 9
	SAVE_VRS 24,  64, 9
	SAVE_VRS 25,  80, 9
	SAVE_VRS 26,  96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS
135
# RESTORE_REGS: function epilogue, the mirror image of SAVE_REGS.
#   - VR 20..31 are reloaded from 256..432(r1)   (base r9 = r1 + 256)
#   - VSR 14..31 are reloaded from 448..720(r1)  (same base r9)
#   - GPR 14..31 are reloaded from 112..248(r1)
#   - the 752-byte frame is released and LR is reloaded from 16(r1)
# Overwrites r0 and r9.  r16 is used as an index register by the vector
# reloads and receives its saved value afterwards.
.macro RESTORE_REGS
	addi	9, 1, 256		# r9 = base of the vector save area
	RESTORE_VRS 20,   0, 9
	RESTORE_VRS 21,  16, 9
	RESTORE_VRS 22,  32, 9
	RESTORE_VRS 23,  48, 9
	RESTORE_VRS 24,  64, 9
	RESTORE_VRS 25,  80, 9
	RESTORE_VRS 26,  96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	ld	14, 112(1)
	ld	15, 120(1)
	ld	16, 128(1)
	ld	17, 136(1)
	ld	18, 144(1)
	ld	19, 152(1)
	ld	20, 160(1)
	ld	21, 168(1)
	ld	22, 176(1)
	ld	23, 184(1)
	ld	24, 192(1)
	ld	25, 200(1)
	ld	26, 208(1)
	ld	27, 216(1)
	ld	28, 224(1)
	ld	29, 232(1)
	ld	30, 240(1)
	ld	31, 248(1)

	addi	1, 1, 752
	ld	0, 16(1)
	mtlr	0
.endm # RESTORE_REGS
193
# Helpers for QT_loop_8x.  Every helper applies one vector operation to four
# registers of the low bank (VR 0-15) and then to the matching four registers
# of the high bank (same register number + 16).

# x += y
.macro QR8_ADD x0 x1 x2 x3 y0 y1 y2 y3
	vadduwm	\x0, \x0, \y0
	vadduwm	\x1, \x1, \y1
	vadduwm	\x2, \x2, \y2
	vadduwm	\x3, \x3, \y3
	vadduwm	\x0+16, \x0+16, \y0+16
	vadduwm	\x1+16, \x1+16, \y1+16
	vadduwm	\x2+16, \x2+16, \y2+16
	vadduwm	\x3+16, \x3+16, \y3+16
.endm

# x ^= y
.macro QR8_XOR x0 x1 x2 x3 y0 y1 y2 y3
	vxor	\x0, \x0, \y0
	vxor	\x1, \x1, \y1
	vxor	\x2, \x2, \y2
	vxor	\x3, \x3, \y3
	vxor	\x0+16, \x0+16, \y0+16
	vxor	\x1+16, \x1+16, \y1+16
	vxor	\x2+16, \x2+16, \y2+16
	vxor	\x3+16, \x3+16, \y3+16
.endm

# x = vpermxor(x, y) with the control vector held in VR ctl
.macro QR8_PERMXOR x0 x1 x2 x3 y0 y1 y2 y3 ctl
	vpermxor \x0, \x0, \y0, \ctl
	vpermxor \x1, \x1, \y1, \ctl
	vpermxor \x2, \x2, \y2, \ctl
	vpermxor \x3, \x3, \y3, \ctl
	vpermxor \x0+16, \x0+16, \y0+16, \ctl
	vpermxor \x1+16, \x1+16, \y1+16, \ctl
	vpermxor \x2+16, \x2+16, \y2+16, \ctl
	vpermxor \x3+16, \x3+16, \y3+16, \ctl
.endm

# rotate each word of x left by the amounts held in VR amt
.macro QR8_ROTL x0 x1 x2 x3 amt
	vrlw	\x0, \x0, \amt
	vrlw	\x1, \x1, \amt
	vrlw	\x2, \x2, \amt
	vrlw	\x3, \x3, \amt
	vrlw	\x0+16, \x0+16, \amt
	vrlw	\x1+16, \x1+16, \amt
	vrlw	\x2+16, \x2+16, \amt
	vrlw	\x3+16, \x3+16, \amt
.endm

# Park VR tmp in VSR 0, then load VR tmp with the constant kept in VSR src.
.macro QR8_BORROW tmp src
	xxlor	0, 32+\tmp, 32+\tmp
	xxlor	32+\tmp, \src, \src
.endm

# Give VR tmp back the value parked in VSR 0.
.macro QR8_RETURN tmp
	xxlor	32+\tmp, 0, 0
.endm

# One round (four quarter rounds) over both banks.  All 32 vector registers
# hold working state, so VR 25 and VR 28 are borrowed for the constants and
# handed back before they are needed as state again.
.macro QR8_ROUND a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3
	# a += b; d ^= a; d <<<= 16
	QR8_BORROW 25, 20
	QR8_ADD	\a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3
	QR8_PERMXOR \d0, \d1, \d2, \d3, \a0, \a1, \a2, \a3, 25
	QR8_RETURN 25

	# c += d; b ^= c; b <<<= 12
	QR8_ADD	\c0, \c1, \c2, \c3, \d0, \d1, \d2, \d3
	QR8_XOR	\b0, \b1, \b2, \b3, \c0, \c1, \c2, \c3
	QR8_BORROW 25, 21
	QR8_ROTL \b0, \b1, \b2, \b3, 25
	QR8_RETURN 25

	# a += b; d ^= a; d <<<= 8
	QR8_ADD	\a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3
	QR8_BORROW 25, 22
	QR8_PERMXOR \d0, \d1, \d2, \d3, \a0, \a1, \a2, \a3, 25
	QR8_RETURN 25

	# c += d; b ^= c; b <<<= 7
	QR8_ADD	\c0, \c1, \c2, \c3, \d0, \d1, \d2, \d3
	QR8_BORROW 28, 23
	QR8_XOR	\b0, \b1, \b2, \b3, \c0, \c1, \c2, \c3
	QR8_ROTL \b0, \b1, \b2, \b3, 28
	QR8_RETURN 28
.endm

# QT_loop_8x: one column round followed by one diagonal round on two
# interleaved register banks (VR 0-15 and VR 16-31).
# Reads VSR 20, 21, 22, 23 as the constants for the 16, 12, 8 and 7 rotate
# steps (chacha_p10le_8x stores them there).  Overwrites VSR 0.
.macro QT_loop_8x
	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	QR8_ROUND 0, 1, 2, 3,  4, 5, 6, 7,  8, 9, 10, 11,  12, 13, 14, 15

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
	QR8_ROUND 0, 1, 2, 3,  5, 6, 7, 4,  10, 11, 8, 9,  15, 12, 13, 14
.endm
393
# Helpers for QT_loop_4x.  Every helper applies one vector operation to four
# register pairs, in the order given.

# x += y
.macro QR4_ADD x0 x1 x2 x3 y0 y1 y2 y3
	vadduwm	\x0, \x0, \y0
	vadduwm	\x1, \x1, \y1
	vadduwm	\x2, \x2, \y2
	vadduwm	\x3, \x3, \y3
.endm

# x ^= y
.macro QR4_XOR x0 x1 x2 x3 y0 y1 y2 y3
	vxor	\x0, \x0, \y0
	vxor	\x1, \x1, \y1
	vxor	\x2, \x2, \y2
	vxor	\x3, \x3, \y3
.endm

# x = vpermxor(x, y) with the control vector held in VR ctl
.macro QR4_PERMXOR x0 x1 x2 x3 y0 y1 y2 y3 ctl
	vpermxor \x0, \x0, \y0, \ctl
	vpermxor \x1, \x1, \y1, \ctl
	vpermxor \x2, \x2, \y2, \ctl
	vpermxor \x3, \x3, \y3, \ctl
.endm

# rotate each word of x left by the amounts held in VR amt
.macro QR4_ROTL x0 x1 x2 x3 amt
	vrlw	\x0, \x0, \amt
	vrlw	\x1, \x1, \amt
	vrlw	\x2, \x2, \amt
	vrlw	\x3, \x3, \amt
.endm

# One round (four quarter rounds).  The constants are read from VR 20
# (step 1), VR 21 (step 2), VR 22 (step 3) and VR 23 (step 4).
.macro QR4_ROUND a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3
	# a += b; d ^= a; d <<<= 16
	QR4_ADD	\a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3
	QR4_PERMXOR \d0, \d1, \d2, \d3, \a0, \a1, \a2, \a3, 20

	# c += d; b ^= c; b <<<= 12
	QR4_ADD	\c0, \c1, \c2, \c3, \d0, \d1, \d2, \d3
	QR4_XOR	\b0, \b1, \b2, \b3, \c0, \c1, \c2, \c3
	QR4_ROTL \b0, \b1, \b2, \b3, 21

	# a += b; d ^= a; d <<<= 8
	QR4_ADD	\a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3
	QR4_PERMXOR \d0, \d1, \d2, \d3, \a0, \a1, \a2, \a3, 22

	# c += d; b ^= c; b <<<= 7
	QR4_ADD	\c0, \c1, \c2, \c3, \d0, \d1, \d2, \d3
	QR4_XOR	\b0, \b1, \b2, \b3, \c0, \c1, \c2, \c3
	QR4_ROTL \b0, \b1, \b2, \b3, 23
.endm

# QT_loop_4x: one column round followed by one diagonal round on the
# working state held in VR 0-15.  VR 20-23 must hold the step constants.
.macro QT_loop_4x
	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	QR4_ROUND 0, 1, 2, 3,  4, 5, 6, 7,  8, 9, 10, 11,  12, 13, 14, 15

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
	QR4_ROUND 0, 1, 2, 3,  5, 6, 7, 4,  10, 11, 8, 9,  15, 12, 13, 14
.endm
479
# TP_4x: transpose, in place, the 4x4 matrix of 32-bit words held in vector
# registers a0..a3 (addressed here as VSR 32+a0 .. 32+a3).
# Overwrites VSR 10-13, which hold the intermediate merge results.
.macro TP_4x a0 a1 a2 a3
	# merge word pairs of two rows at a time
	xxmrghw	10, 32+\a0, 32+\a1	# a0, a1, b0, b1
	xxmrghw	11, 32+\a2, 32+\a3	# a2, a3, b2, b3
	xxmrglw	12, 32+\a0, 32+\a1	# c0, c1, d0, d1
	xxmrglw	13, 32+\a2, 32+\a3	# c2, c3, d2, d3
	# combine doubleword halves into the final rows
	xxpermdi 32+\a0, 10, 11, 0	# a0, a1, a2, a3
	xxpermdi 32+\a1, 10, 11, 3	# b0, b1, b2, b3
	xxpermdi 32+\a2, 12, 13, 0	# c0, c1, c2, c3
	xxpermdi 32+\a3, 12, 13, 3	# d0, d1, d2, d3
.endm
491
# Add_state: key stream = working state + state.
# S selects the working-state bank, VR S+0 .. S+15.  The four state rows are
# read from VR 16-S .. 19-S, i.e. VR 16-19 when S is 0 and VR 0-3 when S is 16.
# Register S+4k+i receives row k, for i = 0..3.
.macro Add_state S
.irp i, 0, 1, 2, 3
	vadduwm	\S+\i, \S+\i, 16-\S
	vadduwm	\S+4+\i, \S+4+\i, 17-\S
	vadduwm	\S+8+\i, \S+8+\i, 18-\S
	vadduwm	\S+12+\i, \S+12+\i, 19-\S

.endr
.endm
514
#
# Write_256: XOR 256 bytes of input with the key stream and store the result.
#
# S selects the key-stream bank, VR S+0 .. S+15 (addressed as VSR S+32 ..).
# Input is read from r5 + r14 and output is written to r4 + r14.
# r17 .. r31 must hold 16, 32, ... 240 (set up by chacha_p10le_8x).
# Overwrites r9, r16 and VSR 0-15; the key-stream registers end up holding
# the output data.
#
.macro Write_256 S
	add	9, 14, 5		# r9  = input address
	add	16, 14, 4		# r16 = output address

	# load 16 x 16 bytes of input
	lxvw4x	0, 0, 9
	lxvw4x	1, 17, 9
	lxvw4x	2, 18, 9
	lxvw4x	3, 19, 9
	lxvw4x	4, 20, 9
	lxvw4x	5, 21, 9
	lxvw4x	6, 22, 9
	lxvw4x	7, 23, 9
	lxvw4x	8, 24, 9
	lxvw4x	9, 25, 9
	lxvw4x	10, 26, 9
	lxvw4x	11, 27, 9
	lxvw4x	12, 28, 9
	lxvw4x	13, 29, 9
	lxvw4x	14, 30, 9
	lxvw4x	15, 31, 9

	# 64-byte block j uses key-stream registers S+j, S+4+j, S+8+j, S+12+j
.irp j, 0, 1, 2, 3
	xxlxor	\S+32+\j, \S+32+\j, 4*\j+0
	xxlxor	\S+36+\j, \S+36+\j, 4*\j+1
	xxlxor	\S+40+\j, \S+40+\j, 4*\j+2
	xxlxor	\S+44+\j, \S+44+\j, 4*\j+3
.endr

	# store block 0
	stxvw4x	\S+32, 0, 16
	stxvw4x	\S+36, 17, 16
	stxvw4x	\S+40, 18, 16
	stxvw4x	\S+44, 19, 16

	# store block 1
	stxvw4x	\S+33, 20, 16
	stxvw4x	\S+37, 21, 16
	stxvw4x	\S+41, 22, 16
	stxvw4x	\S+45, 23, 16

	# store block 2
	stxvw4x	\S+34, 24, 16
	stxvw4x	\S+38, 25, 16
	stxvw4x	\S+42, 26, 16
	stxvw4x	\S+46, 27, 16

	# store block 3
	stxvw4x	\S+35, 28, 16
	stxvw4x	\S+39, 29, 16
	stxvw4x	\S+43, 30, 16
	stxvw4x	\S+47, 31, 16

.endm
576
#
# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
#
# Register use:
#   r3  = state, r4 = dst, r5 = src, r6 = len, r7 = nrounds
#   r8  = nrounds / 2, copied into CTR before every chunk
#   r14 = running byte offset into src and dst
#   r15 = bytes still to be processed
#   r17 .. r31 = 16, 32, ... 240 (index registers for Write_256)
#
# Data is handled in whole 256-byte chunks only: two chunks per pass of
# Loop_8x while at least 512 bytes remain, then one chunk per pass of
# Loop_4x.  A trailing remainder shorter than 256 bytes is left untouched,
# and so is the whole buffer when len is below 256.
# The state array is only read here; nothing is stored back to it.
#
SYM_FUNC_START(chacha_p10le_8x)
.align 5
	cmpdi	6, 0
	ble	Out_no_chacha

	SAVE_REGS

	# r17 - r31 mainly for Write_256 macro.
	li	17, 16
	li	18, 32
	li	19, 48
	li	20, 64
	li	21, 80
	li	22, 96
	li	23, 112
	li	24, 128
	li	25, 144
	li	26, 160
	li	27, 176
	li	28, 192
	li	29, 208
	li	30, 224
	li	31, 240

	mr 15, 6			# len
	li 14, 0			# offset to inp and outp

	lxvw4x	48, 0, 3		#  vr16, constants
	lxvw4x	49, 17, 3		#  vr17, key 1
	lxvw4x	50, 18, 3		#  vr18, key 2
	lxvw4x	51, 19, 3		#  vr19, counter, nonce

	# create (0, 1, 2, 3) counters
	vspltisw 0, 0
	vspltisw 1, 1
	vspltisw 2, 2
	vspltisw 3, 3
	vmrghw	4, 0, 1
	vmrglw	5, 2, 3
	vsldoi	30, 4, 5, 8		# vr30 counter, 4 (0, 1, 2, 3)

	# rotate amounts 12 and 7; permute controls for the 16 and 8 steps
	vspltisw 21, 12
	vspltisw 23, 7

	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxvw4x	32+20, 0, 11
	lxvw4x	32+22, 17, 11

	sradi	8, 7, 1			# r8 = nrounds / 2

	mtctr 8

	# save constants to vsx
	xxlor	16, 48, 48
	xxlor	17, 49, 49
	xxlor	18, 50, 50
	xxlor	19, 51, 51

	vspltisw 25, 4
	vspltisw 26, 8

	xxlor	25, 32+26, 32+26	# vs25 = (8, 8, 8, 8)
	xxlor	24, 32+25, 32+25	# vs24 = (4, 4, 4, 4)

	vadduwm	31, 30, 25		# counter = (0, 1, 2, 3) + (4, 4, 4, 4)
	xxlor	30, 32+30, 32+30
	xxlor	31, 32+31, 32+31

	xxlor	20, 32+20, 32+20
	xxlor	21, 32+21, 32+21
	xxlor	22, 32+22, 32+22
	xxlor	23, 32+23, 32+23

	cmpdi	6, 512
	blt	Loop_last

Loop_8x:
	xxspltw  32+0, 16, 0
	xxspltw  32+1, 16, 1
	xxspltw  32+2, 16, 2
	xxspltw  32+3, 16, 3

	xxspltw  32+4, 17, 0
	xxspltw  32+5, 17, 1
	xxspltw  32+6, 17, 2
	xxspltw  32+7, 17, 3
	xxspltw  32+8, 18, 0
	xxspltw  32+9, 18, 1
	xxspltw  32+10, 18, 2
	xxspltw  32+11, 18, 3
	xxspltw  32+12, 19, 0
	xxspltw  32+13, 19, 1
	xxspltw  32+14, 19, 2
	xxspltw  32+15, 19, 3
	vadduwm	12, 12, 30	# increase counter

	xxspltw  32+16, 16, 0
	xxspltw  32+17, 16, 1
	xxspltw  32+18, 16, 2
	xxspltw  32+19, 16, 3

	xxspltw  32+20, 17, 0
	xxspltw  32+21, 17, 1
	xxspltw  32+22, 17, 2
	xxspltw  32+23, 17, 3
	xxspltw  32+24, 18, 0
	xxspltw  32+25, 18, 1
	xxspltw  32+26, 18, 2
	xxspltw  32+27, 18, 3
	xxspltw  32+28, 19, 0
	xxspltw  32+29, 19, 1
	vadduwm	28, 28, 31	# increase counter
	xxspltw  32+30, 19, 2
	xxspltw  32+31, 19, 3

.align 5
quarter_loop_8x:
	QT_loop_8x

	bdnz	quarter_loop_8x

	# first chunk: add the per-block counter offsets kept in vs30
	xxlor	0, 32+30, 32+30
	xxlor	32+30, 30, 30
	vadduwm	12, 12, 30
	xxlor	32+30, 0, 0
	TP_4x 0, 1, 2, 3
	TP_4x 4, 5, 6, 7
	TP_4x 8, 9, 10, 11
	TP_4x 12, 13, 14, 15

	# park vr16-19 (second chunk) in vs0-3 while they carry the state rows
	xxlor	0, 48, 48
	xxlor	1, 49, 49
	xxlor	2, 50, 50
	xxlor	3, 51, 51
	xxlor	48, 16, 16
	xxlor	49, 17, 17
	xxlor	50, 18, 18
	xxlor	51, 19, 19
	Add_state 0
	xxlor	48, 0, 0
	xxlor	49, 1, 1
	xxlor	50, 2, 2
	xxlor	51, 3, 3
	Write_256 0
	addi	14, 14, 256	# offset += 256
	addi	15, 15, -256	# len -= 256

	# second chunk: add the per-block counter offsets kept in vs31
	xxlor	5, 32+31, 32+31
	xxlor	32+31, 31, 31
	vadduwm	28, 28, 31
	xxlor	32+31, 5, 5
	TP_4x 16+0, 16+1, 16+2, 16+3
	TP_4x 16+4, 16+5, 16+6, 16+7
	TP_4x 16+8, 16+9, 16+10, 16+11
	TP_4x 16+12, 16+13, 16+14, 16+15

	# vr0-3 are free now, use them for the state rows
	xxlor	32, 16, 16
	xxlor	33, 17, 17
	xxlor	34, 18, 18
	xxlor	35, 19, 19
	Add_state 16
	Write_256 16
	addi	14, 14, 256	# offset += 256
	addi	15, 15, -256	# len -= 256

	# advance both counter offset vectors by 8 blocks
	xxlor	32+24, 24, 24
	xxlor	32+25, 25, 25
	xxlor	32+30, 30, 30
	vadduwm	30, 30, 25
	vadduwm	31, 30, 24
	xxlor	30, 32+30, 32+30
	xxlor	31, 32+31, 32+31

	cmpdi	15, 0
	beq	Out_loop

	cmpdi	15, 512
	blt	Loop_last

	mtctr 8
	b Loop_8x

Loop_last:
	# Every pass of Loop_4x reads and writes a full 256-byte chunk, so
	# stop here when fewer than 256 bytes remain.
	cmpdi	15, 256
	blt	Out_loop

	lxvw4x	48, 0, 3		#  vr16, constants
	lxvw4x	49, 17, 3		#  vr17, key 1
	lxvw4x	50, 18, 3		#  vr18, key 2
	lxvw4x	51, 19, 3		#  vr19, counter, nonce

	vspltisw 21, 12
	vspltisw 23, 7
	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxvw4x	32+20, 0, 11
	lxvw4x	32+22, 17, 11

	sradi	8, 7, 1
	mtctr 8

Loop_4x:
	vspltw  0, 16, 0
	vspltw  1, 16, 1
	vspltw  2, 16, 2
	vspltw  3, 16, 3

	vspltw  4, 17, 0
	vspltw  5, 17, 1
	vspltw  6, 17, 2
	vspltw  7, 17, 3
	vspltw  8, 18, 0
	vspltw  9, 18, 1
	vspltw  10, 18, 2
	vspltw  11, 18, 3
	vspltw  12, 19, 0
	vadduwm	12, 12, 30	# increase counter
	vspltw  13, 19, 1
	vspltw  14, 19, 2
	vspltw  15, 19, 3

.align 5
quarter_loop:
	QT_loop_4x

	bdnz	quarter_loop

	vadduwm	12, 12, 30
	TP_4x 0, 1, 2, 3
	TP_4x 4, 5, 6, 7
	TP_4x 8, 9, 10, 11
	TP_4x 12, 13, 14, 15

	Add_state 0
	Write_256 0
	addi	14, 14, 256	# offset += 256
	addi	15, 15, -256	# len -= 256

	# Update state counter
	vspltisw 25, 4
	vadduwm	30, 30, 25

	cmpdi	15, 0
	beq	Out_loop
	cmpdi	15, 256
	blt	Out_loop

	mtctr 8
	b Loop_4x

Out_loop:
	RESTORE_REGS
	blr

Out_no_chacha:
	li	3, 0
	blr
SYM_FUNC_END(chacha_p10le_8x)
836
# Two 16-byte control vectors for vpermxor, loaded by chacha_p10le_8x:
# the first into VR 20 (used in the "d ^= a; d <<<= 16" step) and the
# second into VR 22 (used in the "d ^= a; d <<<= 8" step).
SYM_DATA_START_LOCAL(PERMX)
.align 5
permx:
	.long	0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
	.long	0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
SYM_DATA_END(PERMX)
843