# xref: /linux/lib/crypto/mips/poly1305-mips.pl (revision 07fdad3a93756b872da7b53647715c48d0f4a2d0)
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================

# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc
# R1x000	~5.5/+130%	(big-endian)
# Octeon II	2.50/+70%	(little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling reduction allows to omit dependency chain at the
# end of inner loop and improve performance. Also optimize MIPS32R2
# code path for MIPS 1004K core. Per René von Dorst's suggestions.
#
#		IALU/gcc
# R1x000	~9.8/?		(big-endian)
# Octeon II	3.65/+140%	(little-endian)
# MT7621/1004K	4.75/?		(little-endian)
#
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
# NUBI register layout: map symbolic register names to "$N" hardware
# register strings used when interpolating the assembly heredocs below.
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################

# First command-line argument selects the ABI flavour; default to 64.
$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64

# Register holding the function return value: $a0 under NUBI, $t0
# (former $v0) otherwise — see the coding rules in the header above.
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#

# Operand registers for poly1305_init: context and key pointers plus
# scratch registers used to load, byte-swap and clamp the 256-bit key.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

# poly1305_init(ctx, key): zero the 130-bit accumulator and store the
# clamped r key plus the precomputed s1 = r1 + (r1 >> 2).
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
#else
# define dmultu(rs,rt)		dmultu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_block_init
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$tmp0,$inp,7		# $inp % 8
	dsubu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	ld	$in0,0($inp)
	ld	$in1,8($inp)
	beqz	$tmp0,.Laligned_key
	ld	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	dsllv	$in0,$in0,$tmp0
	dsrlv	$tmp3,$in1,$tmp1
	dsllv	$in1,$in1,$tmp0
	dsrlv	$tmp2,$tmp2,$tmp1
# else
	dsrlv	$in0,$in0,$tmp0
	dsllv	$tmp3,$in1,$tmp1
	dsrlv	$in1,$in1,$tmp0
	dsllv	$tmp2,$tmp2,$tmp1
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_key:
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	 dsbh	$in1,$in1
	dshd	$in0,$in0
	 dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	 and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	 dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	 dsll	$tmp3,24
	and	$tmp2,$tmp0
	 and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	 and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	 dsrl	$in1,8
	dsll	$tmp2,8
	 dsll	$tmp4,8
	and	$in0,$tmp0
	 and	$in1,$tmp0
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	or	$in0,$tmp1
	 or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	 dsrl	$tmp3,$in1,32
	dsll	$in0,32
	 dsll	$in1,32
	or	$in0,$tmp1
	 or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32		# 0x0000000100000000
	daddiu	$tmp0,-63		# 0x00000000ffffffc1
	dsll	$tmp0,28		# 0x0ffffffc10000000
	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
# poly1305_blocks(ctx, inp, len, padbit): 64-bit inner loop.  Hash is
# kept as three 64-bit limbs ($h0..$h2); the reduction is
# modulo-scheduled into the top of the next iteration.  .mask
# advertises the callee-saved registers spilled by the frame.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
my ($shr,$shl) = ($s6,$s7);		# used on R6

$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	.frame	$sp,8*8,$ra
	.mask	$SAVED_REGS_MASK|0x000c0000,-8
	dsubu	$sp,8*8
	sd	$s7,56($sp)
	sd	$s6,48($sp)
#else
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	dsubu	$sp,6*8
#endif
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
$code.=<<___;
	.set	reorder

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$shr,$inp,7
	dsubu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
	subu	$shl,$zero,$shr
#endif

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	dsll	$len,4
	daddu	$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
# ifdef	MIPSEB
	dsllv	$in0,$in0,$shr
	dsrlv	$tmp3,$in1,$shl
	dsllv	$in1,$in1,$shr
	dsrlv	$tmp2,$tmp2,$shl
# else
	dsrlv	$in0,$in0,$shr
	dsllv	$tmp3,$in1,$shl
	dsrlv	$in1,$in1,$shr
	dsllv	$tmp2,$tmp2,$shl
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_inp:
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	 dsbh	$in1,$in1
	dshd	$in0,$in0
	 dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	 and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	 dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	 dsll	$tmp3,24
	and	$tmp2,$tmp0
	 and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	 and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	 dsrl	$in1,8
	dsll	$tmp2,8
	 dsll	$tmp4,8
	and	$in0,$tmp0
	 and	$in1,$tmp0
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	or	$in0,$tmp1
	 or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	 dsrl	$tmp3,$in1,32
	dsll	$in0,32
	 dsll	$in1,32
	or	$in0,$tmp1
	 or	$in1,$tmp3
# endif
#endif
	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
	andi	$h2,$h2,3
	dsll	$tmp0,$tmp1,2

	daddu	$d0,$h0,$in0		# accumulate input
	 daddu	$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	daddu	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	daddu	$d1,$h1,$in1
	daddu	$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	daddu	$d1,$tmp0

	dmultu	($r0,$d0)		# h0*r0
	 daddu	$d2,$h2,$padbit
	 sltu	$tmp0,$d1,$tmp0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	dmultu	($rs1,$d1)		# h1*5*r1
	 daddu	$d2,$tmp1
	 daddu	$d2,$tmp0
	mflo	($tmp0,$rs1,$d1)
	mfhi	($tmp1,$rs1,$d1)

	dmultu	($r1,$d0)		# h0*r1
	mflo	($tmp2,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	 daddu	$h0,$tmp0
	 daddu	$h1,$tmp1
	 sltu	$tmp0,$h0,$tmp0

	dmultu	($r0,$d1)		# h1*r0
	 daddu	$h1,$tmp0
	 daddu	$h1,$tmp2
	mflo	($tmp0,$r0,$d1)
	mfhi	($tmp1,$r0,$d1)

	dmultu	($rs1,$d2)		# h2*5*r1
	 sltu	$tmp2,$h1,$tmp2
	 daddu	$h2,$tmp2
	mflo	($tmp2,$rs1,$d2)

	dmultu	($r0,$d2)		# h2*r0
	 daddu	$h1,$tmp0
	 daddu	$h2,$tmp1
	mflo	($tmp3,$r0,$d2)
	 sltu	$tmp0,$h1,$tmp0
	 daddu	$h2,$tmp0

	daddu	$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	daddu	$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$s7,56($sp)
	ld	$s6,48($sp)
#endif
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
#if defined(_MIPS_ARCH_MIPS64R6)
	daddu	$sp,8*8
#else
	daddu	$sp,6*8
#endif
.end	poly1305_blocks_internal
___
}
{
# poly1305_emit(ctx, mac, nonce): final reduction mod 2^130-5, add the
# 128-bit nonce and store the 16-byte tag little-endian, one byte at a
# time so no output alignment is assumed.
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	li	$in0,-4			# final reduction
	dsrl	$in1,$tmp2,2
	and	$in0,$tmp2
	andi	$tmp2,$tmp2,3
	daddu	$in0,$in1

	daddu	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	 daddiu	$in0,$tmp0,5		# compare to modulus
	daddu	$tmp1,$tmp1,$in1
	 sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	 daddu	$in1,$tmp1,$tmp3
	daddu	$tmp2,$tmp2,$tmp4
	 sltu	$tmp3,$in1,$tmp3
	 daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	and	$in0,$tmp2
	and	$in1,$tmp2
	xor	$in0,$tmp0
	xor	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}} else {{{
######################################################################
# 32-bit code path
#

# Operand registers for the 32-bit poly1305_init: context and key
# pointers plus scratch registers for the four 32-bit key words.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);

$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     defined(_MIPS_ARCH_MIPS32R6)) \\
     && !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif

#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt)	mulu	rd,rs,rt
# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
#else
# define multu(rs,rt)	multu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_block_init
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$tmp0,$inp,3		# $inp % 4
	subu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	sllv	$in0,$in0,$tmp0
	srlv	$tmp3,$in1,$tmp1
	sllv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	srlv	$tmp3,$in2,$tmp1
	sllv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	srlv	$tmp3,$in3,$tmp1
	sllv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	srlv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# else
	srlv	$in0,$in0,$tmp0
	sllv	$tmp3,$in1,$tmp1
	srlv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllv	$tmp3,$in2,$tmp1
	srlv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllv	$tmp3,$in3,$tmp1
	srlv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# endif
.Laligned_key:
#else
	lwl	$in0,0+MSB($inp)
	lwl	$in1,4+MSB($inp)
	lwl	$in2,8+MSB($inp)
	lwl	$in3,12+MSB($inp)
	lwr	$in0,0+LSB($inp)
	lwr	$in1,4+LSB($inp)
	lwr	$in2,8+LSB($inp)
	lwr	$in3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$in0,$in0		# byte swap
	wsbh	$in1,$in1
	wsbh	$in2,$in2
	wsbh	$in3,$in3
	rotr	$in0,$in0,16
	rotr	$in1,$in1,16
	rotr	$in2,$in2,16
	rotr	$in3,$in3,16
# else
	srl	$tmp0,$in0,24		# byte swap
	srl	$tmp1,$in0,8
	andi	$tmp2,$in0,0xFF00
	sll	$in0,$in0,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in0,$tmp0
	 srl	$tmp0,$in1,24
	or	$tmp1,$tmp2
	 srl	$tmp2,$in1,8
	or	$in0,$tmp1
	 andi	$tmp1,$in1,0xFF00
	 sll	$in1,$in1,24
	 andi	$tmp2,0xFF00
	 sll	$tmp1,$tmp1,8
	 or	$in1,$tmp0
	srl	$tmp0,$in2,24
	 or	$tmp2,$tmp1
	srl	$tmp1,$in2,8
	 or	$in1,$tmp2
	andi	$tmp2,$in2,0xFF00
	sll	$in2,$in2,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in2,$tmp0
	 srl	$tmp0,$in3,24
	or	$tmp1,$tmp2
	 srl	$tmp2,$in3,8
	or	$in2,$tmp1
	 andi	$tmp1,$in3,0xFF00
	 sll	$in3,$in3,24
	 andi	$tmp2,0xFF00
	 sll	$tmp1,$tmp1,8
	 or	$in3,$tmp0
	 or	$tmp2,$tmp1
	 or	$in3,$tmp2
# endif
#endif
	lui	$tmp0,0x0fff
	ori	$tmp0,0xffff		# 0x0fffffff
	and	$in0,$in0,$tmp0
	subu	$tmp0,3			# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srl	$tmp1,$in1,2
	srl	$tmp2,$in2,2
	srl	$tmp3,$in3,2
	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addu	$in2,$in2,$tmp2
	addu	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
.Lno_key:
	li	$v0,0
	jr	$ra
.end	poly1305_init
___
{
# poly1305_blocks(ctx, inp, len, padbit): 32-bit inner loop.  Hash is
# kept as five 32-bit limbs ($h0..$h4); there is a multiply-add (maddu)
# path for pre-R6 R2 cores and a generic mflo/mfhi path otherwise.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
my $shr = $t2;		# used on R6
my $one = $t2;		# used on R2

$code.=<<___;
.globl	poly1305_blocks
.align	5
.ent	poly1305_blocks
poly1305_blocks:
	.frame	$sp,16*4,$ra
	.mask	$SAVED_REGS_MASK,-4
	.set	noreorder
	subu	$sp, $sp,4*12
	sw	$s11,4*11($sp)
	sw	$s10,4*10($sp)
	sw	$s9, 4*9($sp)
	sw	$s8, 4*8($sp)
	sw	$s7, 4*7($sp)
	sw	$s6, 4*6($sp)
	sw	$s5, 4*5($sp)
	sw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sw	$s3, 4*3($sp)
	sw	$s2, 4*2($sp)
	sw	$s1, 4*1($sp)
	sw	$s0, 4*0($sp)
___
$code.=<<___;
	.set	reorder

	srl	$len,4			# number of complete blocks
	li	$one,1
	beqz	$len,.Labort

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$shr,$inp,3
	subu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
#endif

	lw	$h0,0($ctx)		# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)		# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	sll	$len,4
	addu	$len,$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS32R6)
	lw	$d0,0($inp)		# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
	beqz	$shr,.Laligned_inp

	lw	$t0,16($inp)
	subu	$t1,$zero,$shr
# ifdef	MIPSEB
	sllv	$d0,$d0,$shr
	srlv	$at,$d1,$t1
	sllv	$d1,$d1,$shr
	or	$d0,$d0,$at
	srlv	$at,$d2,$t1
	sllv	$d2,$d2,$shr
	or	$d1,$d1,$at
	srlv	$at,$d3,$t1
	sllv	$d3,$d3,$shr
	or	$d2,$d2,$at
	srlv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# else
	srlv	$d0,$d0,$shr
	sllv	$at,$d1,$t1
	srlv	$d1,$d1,$shr
	or	$d0,$d0,$at
	sllv	$at,$d2,$t1
	srlv	$d2,$d2,$shr
	or	$d1,$d1,$at
	sllv	$at,$d3,$t1
	srlv	$d3,$d3,$shr
	or	$d2,$d2,$at
	sllv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# endif
.Laligned_inp:
#else
	lwl	$d0,0+MSB($inp)		# load input
	lwl	$d1,4+MSB($inp)
	lwl	$d2,8+MSB($inp)
	lwl	$d3,12+MSB($inp)
	lwr	$d0,0+LSB($inp)
	lwr	$d1,4+LSB($inp)
	lwr	$d2,8+LSB($inp)
	lwr	$d3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$d0,$d0			# byte swap
	wsbh	$d1,$d1
	wsbh	$d2,$d2
	wsbh	$d3,$d3
	rotr	$d0,$d0,16
	rotr	$d1,$d1,16
	rotr	$d2,$d2,16
	rotr	$d3,$d3,16
# else
	srl	$at,$d0,24		# byte swap
	srl	$t0,$d0,8
	andi	$t1,$d0,0xFF00
	sll	$d0,$d0,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d0,$at
	 srl	$at,$d1,24
	or	$t0,$t1
	 srl	$t1,$d1,8
	or	$d0,$t0
	 andi	$t0,$d1,0xFF00
	 sll	$d1,$d1,24
	 andi	$t1,0xFF00
	 sll	$t0,$t0,8
	 or	$d1,$at
	srl	$at,$d2,24
	 or	$t1,$t0
	srl	$t0,$d2,8
	 or	$d1,$t1
	andi	$t1,$d2,0xFF00
	sll	$d2,$d2,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d2,$at
	 srl	$at,$d3,24
	or	$t0,$t1
	 srl	$t1,$d3,8
	or	$d2,$t0
	 andi	$t0,$d3,0xFF00
	 sll	$d3,$d3,24
	 andi	$t1,0xFF00
	 sll	$t0,$t0,8
	 or	$d3,$at
	 or	$t1,$t0
	 or	$d3,$t1
# endif
#endif
	srl	$t0,$h4,2		# modulo-scheduled reduction
	andi	$h4,$h4,3
	sll	$at,$t0,2

	addu	$d0,$d0,$h0		# accumulate input
	 addu	$t0,$t0,$at
	sltu	$h0,$d0,$h0
	addu	$d0,$d0,$t0		# ... and residue
	sltu	$at,$d0,$t0

	addu	$d1,$d1,$h1
	 addu	$h0,$h0,$at		# carry
	sltu	$h1,$d1,$h1
	addu	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addu	$d2,$d2,$h2
	 addu	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addu	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addu	$d3,$d3,$h3
	 addu	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addu	$d3,$d3,$h2

#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
	multu	$r0,$d0			# d0*r0
	 sltu	$h2,$d3,$h2
	maddu	$rs3,$d1		# d1*s3
	 addu	$h3,$h3,$h2		# carry
	maddu	$rs2,$d2		# d2*s2
	 addu	$h4,$h4,$padbit
	maddu	$rs1,$d3		# d3*s1
	 addu	$h4,$h4,$h3
	mfhi	$at
	mflo	$h0

	multu	$r1,$d0			# d0*r1
	maddu	$r0,$d1			# d1*r0
	maddu	$rs3,$d2		# d2*s3
	maddu	$rs2,$d3		# d3*s2
	maddu	$rs1,$h4		# h4*s1
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h1

	multu	$r2,$d0			# d0*r2
	maddu	$r1,$d1			# d1*r1
	maddu	$r0,$d2			# d2*r0
	maddu	$rs3,$d3		# d3*s3
	maddu	$rs2,$h4		# h4*s2
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h2

	mul	$t0,$r0,$h4		# h4*r0

	multu	$r3,$d0			# d0*r3
	maddu	$r2,$d1			# d1*r2
	maddu	$r1,$d2			# d2*r1
	maddu	$r0,$d3			# d3*r0
	maddu	$rs3,$h4		# h4*s3
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h3

	 addiu	$inp,$inp,16

	addu	$h4,$t0,$at
#else
	multu	($r0,$d0)		# d0*r0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	 sltu	$h2,$d3,$h2
	 addu	$h3,$h3,$h2		# carry

	multu	($rs3,$d1)		# d1*s3
	mflo	($at,$rs3,$d1)
	mfhi	($t0,$rs3,$d1)

	 addu	$h4,$h4,$padbit
	 addiu	$inp,$inp,16
	 addu	$h4,$h4,$h3

	multu	($rs2,$d2)		# d2*s2
	mflo	($a3,$rs2,$d2)
	mfhi	($t1,$rs2,$d2)
	 addu	$h0,$h0,$at
	 addu	$h1,$h1,$t0
	multu	($rs1,$d3)		# d3*s1
	 sltu	$at,$h0,$at
	 addu	$h1,$h1,$at

	mflo	($at,$rs1,$d3)
	mfhi	($t0,$rs1,$d3)
	 addu	$h0,$h0,$a3
	 addu	$h1,$h1,$t1
	multu	($r1,$d0)		# d0*r1
	 sltu	$a3,$h0,$a3
	 addu	$h1,$h1,$a3


	mflo	($a3,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	 addu	$h0,$h0,$at
	 addu	$h1,$h1,$t0
	multu	($r0,$d1)		# d1*r0
	 sltu	$at,$h0,$at
	 addu	$h1,$h1,$at

	mflo	($at,$r0,$d1)
	mfhi	($t0,$r0,$d1)
	 addu	$h1,$h1,$a3
	 sltu	$a3,$h1,$a3
	multu	($rs3,$d2)		# d2*s3
	 addu	$h2,$h2,$a3

	mflo	($a3,$rs3,$d2)
	mfhi	($t1,$rs3,$d2)
	 addu	$h1,$h1,$at
	 addu	$h2,$h2,$t0
	multu	($rs2,$d3)		# d3*s2
	 sltu	$at,$h1,$at
	 addu	$h2,$h2,$at

	mflo	($at,$rs2,$d3)
	mfhi	($t0,$rs2,$d3)
	 addu	$h1,$h1,$a3
	 addu	$h2,$h2,$t1
	multu	($rs1,$h4)		# h4*s1
	 sltu	$a3,$h1,$a3
	 addu	$h2,$h2,$a3

	mflo	($a3,$rs1,$h4)
	 addu	$h1,$h1,$at
	 addu	$h2,$h2,$t0
	multu	($r2,$d0)		# d0*r2
	 sltu	$at,$h1,$at
	 addu	$h2,$h2,$at


	mflo	($at,$r2,$d0)
	mfhi	($h3,$r2,$d0)
	 addu	$h1,$h1,$a3
	 sltu	$a3,$h1,$a3
	multu	($r1,$d1)		# d1*r1
	 addu	$h2,$h2,$a3

	mflo	($a3,$r1,$d1)
	mfhi	($t1,$r1,$d1)
	 addu	$h2,$h2,$at
	 sltu	$at,$h2,$at
	multu	($r0,$d2)		# d2*r0
	 addu	$h3,$h3,$at

	mflo	($at,$r0,$d2)
	mfhi	($t0,$r0,$d2)
	 addu	$h2,$h2,$a3
	 addu	$h3,$h3,$t1
	multu	($rs3,$d3)		# d3*s3
	 sltu	$a3,$h2,$a3
	 addu	$h3,$h3,$a3

	mflo	($a3,$rs3,$d3)
	mfhi	($t1,$rs3,$d3)
	 addu	$h2,$h2,$at
	 addu	$h3,$h3,$t0
	multu	($rs2,$h4)		# h4*s2
	 sltu	$at,$h2,$at
	 addu	$h3,$h3,$at

	mflo	($at,$rs2,$h4)
	 addu	$h2,$h2,$a3
	 addu	$h3,$h3,$t1
	multu	($r3,$d0)		# d0*r3
	 sltu	$a3,$h2,$a3
	 addu	$h3,$h3,$a3


	mflo	($a3,$r3,$d0)
	mfhi	($t1,$r3,$d0)
	 addu	$h2,$h2,$at
	 sltu	$at,$h2,$at
	multu	($r2,$d1)		# d1*r2
	 addu	$h3,$h3,$at

	mflo	($at,$r2,$d1)
	mfhi	($t0,$r2,$d1)
	 addu	$h3,$h3,$a3
	 sltu	$a3,$h3,$a3
	multu	($r0,$d3)		# d3*r0
	 addu	$t1,$t1,$a3

	mflo	($a3,$r0,$d3)
	mfhi	($d3,$r0,$d3)
	 addu	$h3,$h3,$at
	 addu	$t1,$t1,$t0
	multu	($r1,$d2)		# d2*r1
	 sltu	$at,$h3,$at
	 addu	$t1,$t1,$at

	mflo	($at,$r1,$d2)
	mfhi	($t0,$r1,$d2)
	 addu	$h3,$h3,$a3
	 addu	$t1,$t1,$d3
	multu	($rs3,$h4)		# h4*s3
	 sltu	$a3,$h3,$a3
	 addu	$t1,$t1,$a3

	mflo	($a3,$rs3,$h4)
	 addu	$h3,$h3,$at
	 addu	$t1,$t1,$t0
	multu	($r0,$h4)		# h4*r0
	 sltu	$at,$h3,$at
	 addu	$t1,$t1,$at


	mflo	($h4,$r0,$h4)
	 addu	$h3,$h3,$a3
	 sltu	$a3,$h3,$a3
	 addu	$t1,$t1,$a3
	addu	$h4,$h4,$t1

	li	$padbit,1		# if we loop, padbit is 1
#endif
	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	.set	noreorder
.Labort:
	lw	$s11,4*11($sp)
	lw	$s10,4*10($sp)
	lw	$s9, 4*9($sp)
	lw	$s8, 4*8($sp)
	lw	$s7, 4*7($sp)
	lw	$s6, 4*6($sp)
	lw	$s5, 4*5($sp)
	lw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12
.end	poly1305_blocks
___
}
{
# poly1305_emit(ctx, mac, nonce), 32-bit: final reduction mod 2^130-5,
# add the 128-bit nonce and store the 16-byte tag byte-wise.  $ctx is
# reused as a carry/select scratch register once the limbs are loaded.
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	li	$in0,-4			# final reduction
	srl	$ctx,$tmp4,2
	and	$in0,$in0,$tmp4
	andi	$tmp4,$tmp4,3
	addu	$ctx,$ctx,$in0

	addu	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	 addiu	$in0,$tmp0,5		# compare to modulus
	addu	$tmp1,$tmp1,$ctx
	 sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	 addu	$in1,$in1,$tmp1
	addu	$tmp2,$tmp2,$ctx
	 sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	 addu	$in2,$in2,$tmp2
	addu	$tmp3,$tmp3,$ctx
	 sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	 addu	$in3,$in3,$tmp3
	addu	$tmp4,$tmp4,$ctx
	 sltu	$ctx,$in3,$tmp3
	 addu	$ctx,$tmp4

	srl	$ctx,2			# see if it carried/borrowed
	subu	$ctx,$zero,$ctx

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3
	and	$in0,$ctx
	and	$in1,$ctx
	and	$in2,$ctx
	and	$in3,$ctx
	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addu	$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addu	$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addu	$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addu	$ctx,$tmp1

	addu	$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addu	$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addu	$ctx,$tmp2

	addu	$in3,$tmp3
	addu	$in3,$ctx

	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}}

# Emit the accumulated code, redirecting STDOUT to the file named by
# the last command-line argument when one is given; otherwise write to
# the existing STDOUT.  Three-argument open avoids mode injection via
# the filename, and failures are reported instead of silently ignored.
my $output = pop;
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";
