/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly uses vector/VSX/scalar code
#  - 26-bit limbs
#  - Handles multiple 64-byte blocks.
#
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
# p = 2^130 - 5
# a += m
# a = (r * a) % p
# a += s
#
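# The clamp step above, as an illustrative C sketch (not part of this file's
# build; variable names are only for illustration).  The two 64-bit masks are
# the same values kept in the rmask constant at the end of this file:
#
#	u64 r_lo = get_unaligned_le64(key)     & 0x0ffffffc0fffffffULL;
#	u64 r_hi = get_unaligned_le64(key + 8) & 0x0ffffffc0ffffffcULL;
#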
# Improve performance by breaking the polynomial down into the sum of products with
#     h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
#
#  07/22/21 - this revision is based on the above sum of products.  Setup r^4, r^3, r^2, r and s3, s2, s1, s0
#             to 9 vectors for multiplications.
#
# setup r^4, r^3, r^2, r vectors
#    vs    [r^1, r^3, r^2, r^4]
#    vs0 = [r0,.....]
#    vs1 = [r1,.....]
#    vs2 = [r2,.....]
#    vs3 = [r3,.....]
#    vs4 = [r4,.....]
#    vs5 = [r1*5,...]
#    vs6 = [r2*5,...]
#    vs7 = [r3*5,...]
#    vs8 = [r4*5,...]
#
#  Each word in a vector holds one member of "r/s" used in [a * r/s].
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
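#
# Why the "*5" columns: the limbs are base-2^26 digits and p = 2^130 - 5, so
# 2^130 == 5 (mod p).  Any partial product a_i * r_j that lands in digit
# (i + j) >= 5 wraps around to digit (i + j - 5) scaled by 5; for example
# a1 * r4 sits at digit 5, and a1 * r4 * 2^130 == a1 * (r4*5) (mod p), which
# is why every column above the diagonal carries a pre-scaled r*5 term.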
#
#
# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
#  k = 32-byte key
#  r3 = k (r, s)
#  r4 = mlen
#  r5 = m
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"

.text

.macro	SAVE_GPR GPR OFFSET FRAME
	std	\GPR,\OFFSET(\FRAME)
.endm

.macro	SAVE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	stvx	\VRS, 16, \FRAME
.endm

.macro	SAVE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	stxvx	\VSX, 16, \FRAME
.endm

.macro	RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR,\OFFSET(\FRAME)
.endm

.macro	RESTORE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	lvx	\VRS, 16, \FRAME
.endm

.macro	RESTORE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	lxvx	\VSX, 16, \FRAME
.endm

.macro SAVE_REGS
	mflr 0
	std 0, 16(1)
	stdu 1,-752(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	addi	9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
	addi	9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi    1, 1, 752
	ld 0, 16(1)
	mtlr 0
.endm # RESTORE_REGS

#
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
#
#    [r^2, r^3, r^1, r^4]
#    [m3,  m2,  m4,  m1]
#
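# The same five formulas as an illustrative C sketch (not assembled; the
# variable names are only for illustration).  A 64-bit accumulator is wide
# enough for the sum of five 26x29-bit products:
#
#	u64 p0 = (u64)a0*r0 + (u64)a1*r4_5 + (u64)a2*r3_5 + (u64)a3*r2_5 + (u64)a4*r1_5;
#	u64 p1 = (u64)a0*r1 + (u64)a1*r0   + (u64)a2*r4_5 + (u64)a3*r3_5 + (u64)a4*r2_5;
#	u64 p2 = (u64)a0*r2 + (u64)a1*r1   + (u64)a2*r0   + (u64)a3*r4_5 + (u64)a4*r3_5;
#	u64 p3 = (u64)a0*r3 + (u64)a1*r2   + (u64)a2*r1   + (u64)a3*r0   + (u64)a4*r4_5;
#	u64 p4 = (u64)a0*r4 + (u64)a1*r3   + (u64)a2*r2   + (u64)a3*r1   + (u64)a4*r0;
#
# where rN_5 stands for the pre-scaled rN*5 values held in vs5-vs8 above.
#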
# multiply odd and even words
.macro mul_odd
	vmulouw	14, 4, 26
	vmulouw	10, 5, 3
	vmulouw	11, 6, 2
	vmulouw	12, 7, 1
	vmulouw	13, 8, 0
	vmulouw	15, 4, 27
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vmulouw	10, 5, 26
	vmulouw	11, 6, 3
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vmulouw	12, 7, 2
	vmulouw	13, 8, 1
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1
	vmulouw	16, 4, 28
	vmulouw	10, 5, 27
	vmulouw	11, 6, 26
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vmulouw	12, 7, 3
	vmulouw	13, 8, 2
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2
	vmulouw	17, 4, 29
	vmulouw	10, 5, 28
	vmulouw	11, 6, 27
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vmulouw	12, 7, 26
	vmulouw	13, 8, 3
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3
	vmulouw	18, 4, 30
	vmulouw	10, 5, 29
	vmulouw	11, 6, 28
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vmulouw	12, 7, 27
	vmulouw	13, 8, 26
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm

.macro mul_even
	vmuleuw	9, 4, 26
	vmuleuw	10, 5, 3
	vmuleuw	11, 6, 2
	vmuleuw	12, 7, 1
	vmuleuw	13, 8, 0
	vaddudm	14, 14, 9
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0

	vmuleuw	9, 4, 27
	vmuleuw	10, 5, 26
	vmuleuw	11, 6, 3
	vmuleuw	12, 7, 2
	vmuleuw	13, 8, 1
	vaddudm	15, 15, 9
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1

	vmuleuw	9, 4, 28
	vmuleuw	10, 5, 27
	vmuleuw	11, 6, 26
	vmuleuw	12, 7, 3
	vmuleuw	13, 8, 2
	vaddudm	16, 16, 9
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2

	vmuleuw	9, 4, 29
	vmuleuw	10, 5, 28
	vmuleuw	11, 6, 27
	vmuleuw	12, 7, 26
	vmuleuw	13, 8, 3
	vaddudm	17, 17, 9
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3

	vmuleuw	9, 4, 30
	vmuleuw	10, 5, 29
	vmuleuw	11, 6, 28
	vmuleuw	12, 7, 27
	vmuleuw	13, 8, 26
	vaddudm	18, 18, 9
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm
#
# poly1305_setup_r
#
# setup r^4, r^3, r^2, r vectors
#    [r, r^3, r^2, r^4]
#    vs0 = [r0,...]
#    vs1 = [r1,...]
#    vs2 = [r2,...]
#    vs3 = [r3,...]
#    vs4 = [r4,...]
#    vs5 = [r1*5,...]
#    vs6 = [r2*5,...]
#    vs7 = [r3*5,...]
#    vs8 = [r4*5,...]
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
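# The rN*5 values are produced with a shift-and-add rather than a multiply;
# in illustrative C (not assembled), for each limb:
#
#	r_times_5 = (r << 2) + r;	/* matches the vsld-by-2 + vaddudm pairs below */
#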
.macro poly1305_setup_r

	# save r
	xxlor	26, 58, 58
	xxlor	27, 59, 59
	xxlor	28, 60, 60
	xxlor	29, 61, 61
	xxlor	30, 62, 62

	xxlxor	31, 31, 31

#    [r, r^3, r^2, r^4]
	# compute r^2
	vmr	4, 26
	vmr	5, 27
	vmr	6, 28
	vmr	7, 29
	vmr	8, 30
	bl	do_mul		# r^2 r^1
	xxpermdi 58, 58, 36, 0x3		# r0
	xxpermdi 59, 59, 37, 0x3		# r1
	xxpermdi 60, 60, 38, 0x3		# r2
	xxpermdi 61, 61, 39, 0x3		# r3
	xxpermdi 62, 62, 40, 0x3		# r4
	xxpermdi 36, 36, 36, 0x3
	xxpermdi 37, 37, 37, 0x3
	xxpermdi 38, 38, 38, 0x3
	xxpermdi 39, 39, 39, 0x3
	xxpermdi 40, 40, 40, 0x3
	vspltisb 13, 2
	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	bl	do_mul		# r^4 r^3
	vmrgow	26, 26, 4
	vmrgow	27, 27, 5
	vmrgow	28, 28, 6
	vmrgow	29, 29, 7
	vmrgow	30, 30, 8
	vspltisb 13, 2
	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	# r^2 r^4
	xxlor	0, 58, 58
	xxlor	1, 59, 59
	xxlor	2, 60, 60
	xxlor	3, 61, 61
	xxlor	4, 62, 62
	xxlor	5, 32, 32
	xxlor	6, 33, 33
	xxlor	7, 34, 34
	xxlor	8, 35, 35

	vspltw	9, 26, 3
	vspltw	10, 26, 2
	vmrgow	26, 10, 9
	vspltw	9, 27, 3
	vspltw	10, 27, 2
	vmrgow	27, 10, 9
	vspltw	9, 28, 3
	vspltw	10, 28, 2
	vmrgow	28, 10, 9
	vspltw	9, 29, 3
	vspltw	10, 29, 2
	vmrgow	29, 10, 9
	vspltw	9, 30, 3
	vspltw	10, 30, 2
	vmrgow	30, 10, 9

	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30
.endm

SYM_FUNC_START_LOCAL(do_mul)
	mul_odd

	# do reduction ( h %= p )
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11
	blr
SYM_FUNC_END(do_mul)
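
#
# The carry step in do_mul as an illustrative C sketch (not assembled):
# every limb is brought back below 2^26 and the carry out of limb 4 wraps to
# limb 0 scaled by 5 (2^130 == 5 mod p).  M26 stands for the 0x3ffffff mask
# kept in v25:
#
#	c0 = x0 >> 26;  h0 = x0 & M26;
#	c3 = x3 >> 26;  h3 = x3 & M26;
#	x4 += c3;  c4 = x4 >> 26;  h4 = x4 & M26;
#	x1 += c0;  c1 = x1 >> 26;  h1 = x1 & M26;
#	h0 += c4 + (c4 << 2);			/* h0 += c4 * 5 */
#	x2 += c1;  c2 = x2 >> 26;  h2 = x2 & M26;
#	c0 = h0 >> 26;  h0 &= M26;
#	h3 += c2;  c3 = h3 >> 26;  h3 &= M26;
#	h1 += c0;
#	h4 += c3;
#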

#
# init key
#
.macro do_poly1305_init
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l

	ld	11, 0(10)
	ld	12, 8(10)

	li	14, 16
	li	15, 32
	addis	10, 2, cnum@toc@ha
	addi	10, 10, cnum@toc@l
	lvx	25, 0, 10	# v25 - mask
	lvx	31, 14, 10	# v31 = 0x1a (26)
	lvx	19, 15, 10	# v19 = 1 << 24
	lxv	24, 48(10)	# vs24
	lxv	25, 64(10)	# vs25

	# initialize
	# load key from r3 to vectors
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11
	and.	10, 10, 12

	# break into 26-bit limbs
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	58, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	59, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	60, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	61, 0, 17
	mtvsrdd	62, 0, 18

	# r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
	li	9, 5
	mtvsrdd	36, 0, 9
	vmulouw	0, 27, 4		# v0 = rr0
	vmulouw	1, 28, 4		# v1 = rr1
	vmulouw	2, 29, 4		# v2 = rr2
	vmulouw	3, 30, 4		# v3 = rr3
.endm
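
#
# The extrdi/insrdi sequence above splits the clamped 128-bit r (lo, hi as
# little-endian 64-bit halves) into five 26-bit limbs; in illustrative C
# (not assembled):
#
#	r0 = lo & 0x3ffffff;
#	r1 = (lo >> 26) & 0x3ffffff;
#	r2 = ((lo >> 52) | (hi << 12)) & 0x3ffffff;
#	r3 = (hi >> 14) & 0x3ffffff;
#	r4 = hi >> 40;
#
# The same split is applied to the previous hash state and to each 16-byte
# message block before accumulation.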

#
# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
#  k = 32-byte key
#  r3 = k (r, s)
#  r4 = mlen
#  r5 = m
#
SYM_FUNC_START(poly1305_p10le_4blocks)
.align 5
	cmpdi	5, 64
	blt	Out_no_poly1305

	SAVE_REGS

	do_poly1305_init

	li	21, 0	# offset into the message

	poly1305_setup_r

	# load previous H state
	# break/convert it into 26-bit limbs
	ld	9, 0(3)
	ld	10, 8(3)
	ld	19, 16(3)
	sldi	19, 19, 24
	mtvsrdd	41, 0, 19
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	36, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	37, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	38, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	39, 0, 17
	mtvsrdd	40, 0, 18
	vor	8, 8, 9

	# input m1 m2
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	20, 4, 9
	vaddudm	21, 5, 10
	vaddudm	22, 6, 11
	vaddudm	23, 7, 12
	vaddudm	24, 8, 13

	# m3 m4
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vspltisb 13, 14
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	# Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
	vmrgow	4, 9, 20
	vmrgow	5, 10, 21
	vmrgow	6, 11, 22
	vmrgow	7, 12, 23
	vmrgow	8, 13, 24
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	li      9, 64
	divdu   31, 5, 9

	cmpdi	31, 0
	ble	Skip_block_loop

	mtctr	31

# h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
# Rewrite the polynomial sum of products as follows:
# h1 = (h0 + m1) * r^2,	h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2,	h4 = (h2 + m4) * r^2
#	--> h3 = (h0 + m1) * r^4 + m3 * r^2,	h4 = (h0 + m2) * r^4 + m4 * r^2
#  .... Repeat
# h5 = (h3 + m5) * r^2,	h6 = (h4 + m6) * r^2
# h7 = (h5 + m7) * r^2,	h8 = (h6 + m8) * r^1
#	--> ... + m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
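# Informally: inside the loop both interleaved accumulators are only ever
# multiplied by r^2; the differing powers of r are applied once, after the
# loop, when the word lanes holding [m3, m2, m4, m1] are multiplied by the
# matching [r^2, r^3, r^1, r^4] lanes (see Skip_block_loop below), which
# recovers the m1*r^4 + m2*r^3 + m3*r^2 + m4*r form above.
#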
loop_4blocks:

	# Multiply odd words and even words
	mul_odd
	mul_even
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11

	# input m1  m2  m3  m4
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	17, 11, 12, 17
	vperm	18, 11, 12, 18

	vand	20, 14, 25	# a0
	vand	9, 17, 25	# a0
	vsrd	21, 14, 31	# >> 26
	vsrd	22, 21, 31	# 12 bits left
	vsrd	10, 17, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left

	vand	21, 21, 25	# a1
	vand	10, 10, 25	# a1

	vspltisb 13, 12
	vand	16, 15, 25
	vsld	23, 16, 13
	vor	22, 22, 23
	vand	22, 22, 25	# a2
	vand	16, 18, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	23, 15, 13	# >> 14
	vsrd	24, 23, 31	# >> 26, a4
	vand	23, 23, 25	# a3
	vsrd	12, 18, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	4, 4, 20
	vaddudm	5, 5, 21
	vaddudm	6, 6, 22
	vaddudm	7, 7, 23
	vaddudm	8, 8, 24

	# Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
	vmrgow	4, 9, 4
	vmrgow	5, 10, 5
	vmrgow	6, 11, 6
	vmrgow	7, 12, 7
	vmrgow	8, 13, 8
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	bdnz	loop_4blocks

Skip_block_loop:
	xxlor	58, 0, 0
	xxlor	59, 1, 1
	xxlor	60, 2, 2
	xxlor	61, 3, 3
	xxlor	62, 4, 4
	xxlor	32, 5, 5
	xxlor	33, 6, 6
	xxlor	34, 7, 7
	xxlor	35, 8, 8

	# Multiply odd words and even words
	mul_odd
	mul_even

	# Sum the products.
	xxpermdi 41, 31, 46, 0
	xxpermdi 42, 31, 47, 0
	vaddudm	4, 14, 9
	xxpermdi 36, 31, 36, 3
	vaddudm	5, 15, 10
	xxpermdi 37, 31, 37, 3
	xxpermdi 43, 31, 48, 0
	vaddudm	6, 16, 11
	xxpermdi 38, 31, 38, 3
	xxpermdi 44, 31, 49, 0
	vaddudm	7, 17, 12
	xxpermdi 39, 31, 39, 3
	xxpermdi 45, 31, 50, 0
	vaddudm	8, 18, 13
	xxpermdi 40, 31, 40, 3

	# carry reduction
	vspltisb 9, 2
	vsrd	10, 4, 31
	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	8, 8, 11
	vsrd	12, 8, 31
	vaddudm	5, 5, 10

	vsrd	11, 5, 31
	vand	8, 8, 25
	vand	5, 5, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 6, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vsrd	10, 5, 31
	vand	5, 5, 25
	vaddudm	6, 6, 10
	vaddudm	8, 8, 11

	b	do_final_update

do_final_update:
	# combine 26 bit limbs
	# v4, v5, v6, v7 and v8 are 26 bit vectors
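	# In illustrative C (not assembled), the recombination below is:
	#	h0_64 = l0 | (l1 << 26) | (l2 << 52);
	#	h1_64 = (l2 >> 12) | (l3 << 14) | (l4 << 40);
	#	h2    = l4 >> 24;
	# where l0..l4 are the five 26-bit limbs in v4..v8.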
	vsld	5, 5, 31
	vor	20, 4, 5
	vspltisb 11, 12
	vsrd	12, 6, 11
	vsld	6, 6, 31
	vsld	6, 6, 31
	vor	20, 20, 6
	vspltisb 11, 14
	vsld	7, 7, 11
	vor	21, 7, 12
	mfvsrld	16, 40		# save last 2 bytes
	vsld	8, 8, 11
	vsld	8, 8, 31
	vor	21, 21, 8
	mfvsrld	17, 52
	mfvsrld	19, 53
	srdi	16, 16, 24

	std	17, 0(3)
	std	19, 8(3)
	stw	16, 16(3)

Out_loop:
	li	3, 0

	RESTORE_REGS

	blr

Out_no_poly1305:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_p10le_4blocks)

#
# =======================================================================
# The following functions implement poly1305 using 64 x 64 bit multiplication.
#
SYM_FUNC_START_LOCAL(Poly1305_init_64)
	#  mask 0x0FFFFFFC0FFFFFFC
	#  mask 0x0FFFFFFC0FFFFFFF
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l
	ld	11, 0(10)
	ld	12, 8(10)

	# initialize
	# load key from r3
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11	# clamp mask r0
	and.	10, 10, 12	# clamp mask r1

	srdi	21, 10, 2
	add	19, 21, 10	# s1 (r19) = r1 + (r1 >> 2) = (r1 >> 2) * 5

	# setup r and s
	li	25, 0
	mtvsrdd 32+0, 9, 19	# r0, s1
	mtvsrdd 32+1, 10, 9	# r1, r0
	mtvsrdd 32+2, 19, 25	# s1
	mtvsrdd 32+3, 9, 25	# r0

	blr
SYM_FUNC_END(Poly1305_init_64)

# Poly1305_mult
# v6 = (h0, h1), v8 = h2
# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
#
# Output: v7, v10, v11
#
SYM_FUNC_START_LOCAL(Poly1305_mult)
	#
	#	d0 = h0 * r0 + h1 * s1
	vmsumudm	7, 6, 0, 9		# h0 * r0, h1 * s1

	#	d1 = h0 * r1 + h1 * r0 + h2 * s1
	vmsumudm	11, 6, 1, 9		# h0 * r1, h1 * r0
	vmsumudm	10, 8, 2, 11		# d1 += h2 * s1

	#	d2 = h2 * r0
	vmsumudm	11, 8, 3, 9		# d2 = h2 * r0
	blr
SYM_FUNC_END(Poly1305_mult)
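
#
# In illustrative C (not assembled), with 128-bit intermediates:
#
#	d0 = (u128)h0 * r0 + (u128)h1 * s1;
#	d1 = (u128)h0 * r1 + (u128)h1 * r0 + (u128)h2 * s1;
#	d2 = (u128)h2 * r0;
#
# u128 stands for unsigned __int128; with the clamped r and the small h2
# produced by Carry_reduction below, these sums fit in 128 bits.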

#
# carry reduction
# h %= p
#
# Input: v7, v10, v11
# Output: r27, r28, r29
#
SYM_FUNC_START_LOCAL(Carry_reduction)
	mfvsrld	27, 32+7
	mfvsrld	28, 32+10
	mfvsrld	29, 32+11
	mfvsrd	20, 32+7	# h0.h
	mfvsrd	21, 32+10	# h1.h

	addc	28, 28, 20
	adde	29, 29, 21
	srdi	22, 29, 0x2
	sldi	23, 22, 0x2
	add	23, 23, 22	# (h2 >> 2) * 5
	addc	27, 27, 23	# h0
	addze	28, 28		# h1
	andi.	29, 29, 0x3	# h2
	blr
SYM_FUNC_END(Carry_reduction)
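
#
# In illustrative C (not assembled), with d0/d1/d2 as the 128-bit products
# from Poly1305_mult:
#
#	h0 = lo64(d0);
#	h1 = lo64(d1) + hi64(d0);		/* carries into h2 */
#	h2 = lo64(d2) + hi64(d1) + carry;
#	h0 += (h2 >> 2) * 5;			/* fold bits >= 2^130 back in */
#	h1 += carry;
#	h2 &= 3;
#
# lo64()/hi64() and carry are informal shorthand for the addc/adde/addze
# sequence above.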

#
# poly1305 multiplication
# h *= r, h %= p
#	d0 = h0 * r0 + h1 * s1
#	d1 = h0 * r1 + h1 * r0 + h2 * s1
#	d2 = h2 * r0
#
#
# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
#   - no highbit if final leftover block (highbit = 0)
#
SYM_FUNC_START(poly1305_64s)
	cmpdi	5, 0
	ble	Out_no_poly1305_64

	mflr 0
	std 0, 16(1)
	stdu 1,-400(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	# Init poly1305
	bl Poly1305_init_64

	li 25, 0			# offset to inp and outp

	add 11, 25, 4

	# load h
	# h0, h1, h2
	ld	27, 0(3)
	ld	28, 8(3)
	lwz	29, 16(3)

	li	30, 16
	divdu	31, 5, 30

	mtctr	31

	mr	24, 6		# highbit

Loop_block_64:
	vxor	9, 9, 9

	ld	20, 0(11)
	ld	21, 8(11)
	addi	11, 11, 16

	addc	27, 27, 20
	adde	28, 28, 21
	adde	29, 29, 24

	li	22, 0
	mtvsrdd	32+6, 27, 28	# h0, h1
	mtvsrdd	32+8, 29, 22	# h2

	bl	Poly1305_mult

	bl	Carry_reduction

	bdnz	Loop_block_64

	std	27, 0(3)
	std	28, 8(3)
	stw	29, 16(3)

	li	3, 0

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi    1, 1, 400
	ld 0, 16(1)
	mtlr 0

	blr

Out_no_poly1305_64:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_64s)

#
# Input: r3 = h, r4 = s, r5 = mac
# mac = h + s
#
SYM_FUNC_START(poly1305_emit_64)
	ld	10, 0(3)
	ld	11, 8(3)
	ld	12, 16(3)

	# compare modulus
	# h + 5 + (-p)
	mr	6, 10
	mr	7, 11
	mr	8, 12
	addic.	6, 6, 5
	addze	7, 7
	addze	8, 8
	srdi	9, 8, 2		# overflow?
	cmpdi	9, 0
	beq	Skip_h64
	mr	10, 6
	mr	11, 7
	mr	12, 8

Skip_h64:
	ld	6, 0(4)
	ld	7, 8(4)
	addc	10, 10, 6
	adde	11, 11, 7
	addze	12, 12

	std	10, 0(5)
	std	11, 8(5)
	blr
SYM_FUNC_END(poly1305_emit_64)
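
#
# In illustrative pseudo-C (not assembled): the final reduction picks h - p
# when h >= p by computing g = h + 5 and testing bit 130, then adds s and
# emits only the low 128 bits:
#
#	g = h + 5;			/* treated as a 130+ bit value */
#	if (g >> 130)			/* h >= p */
#		h = g;			/* h + 5 == h - p (mod 2^128) */
#	mac = lo128(h + s);		/* only the low 128 bits are stored */
#
# lo128() is informal shorthand for keeping the two 64-bit words written out
# above.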

SYM_DATA_START_LOCAL(RMASK)
.align 5
rmask:
.byte	0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long	0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
.long	0x1a, 0x00, 0x1a, 0x00
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000
.long	0x00010203, 0x04050607, 0x10111213, 0x14151617
.long	0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
SYM_DATA_END(RMASK)