# xref: /linux/lib/crypto/riscv/poly1305-riscv.pl (revision 03f76ddff5b04a808ae16c06418460151e2fdd4b)
1#!/usr/bin/env perl
2# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
3#
4# ====================================================================
5# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
6# ====================================================================
7#
8# Poly1305 hash for RISC-V.
9#
10# February 2019
11#
12# In the essence it's pretty straightforward transliteration of MIPS
13# module [without big-endian option].
14#
15# 1.8 cycles per byte on U74, >100% faster than compiler-generated
16# code. 1.9 cpb on C910, ~75% improvement. 3.3 on Spacemit X60, ~69%
17# improvement.
18#
19# June 2024.
20#
21# Add CHERI support.
22#
23######################################################################
24#
# RISC-V integer-register ABI names mapped to their architectural x<N>
# encodings; the assembly heredocs below interpolate these Perl globals.
($zero, $ra, $sp, $gp, $tp) = map { "x$_" } 0 .. 4;                # hard-wired / link / stack / global / thread
($t0, $t1, $t2, $t3, $t4, $t5, $t6) = map { "x$_" } 5 .. 7, 28 .. 31;   # caller-saved temporaries
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map { "x$_" } 10 .. 17;      # argument/return registers
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11)
	= map { "x$_" } 8, 9, 18 .. 27;                             # callee-saved registers
29#
30######################################################################
31
# Command-line handling: the first argument (default "64") selects the
# 64-bit or 32-bit code path; any remaining argument that looks like a
# filename (word chars + extension) becomes the output file.
$flavour = shift || "64";

for (@ARGV) {   $output=$_ if (/\w[\w\-]*\.\w+$/);   }
# Redirect STDOUT to the output file only when one was actually given;
# the original unconditionally did an unchecked two-arg
# `open STDOUT,">$output"`, which silently failed when $output was
# undefined and hid I/O errors otherwise.  Without an output argument
# the generated assembly goes to the already-open STDOUT.
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
36
# Common preamble emitted for every flavour: inside the kernel all calls
# to these routines are direct, so the Zicfilp landing-pad guard can be
# compiled out; CHERI pure-capability targets are treated as having fast
# misaligned accesses (selects the simpler load/store paths below).
$code.=<<___;
#ifdef __KERNEL__
# ifdef __riscv_zicfilp
#  undef __riscv_zicfilp // calls are expected to be direct
# endif
#endif

#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
# define __riscv_misaligned_fast 1
#endif
___
48
if ($flavour =~ /64/) {{{
######################################################################
# 64-bit code path...
#
# Register roles for the 64-bit routines (interpolated into the asm).
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);

# poly1305_init(ctx, key): zero the accumulator held in three 64-bit
# limbs at ctx+0/8/16; if key != NULL, load its low 16 bytes, clamp them
# per the Poly1305 spec (the li/slli/addi sequence below materializes
# 0x0ffffffc0fffffff, and -3 turns it into 0x0ffffffc0ffffffc), and
# store r0, r1 plus the precomputed s1 = r1 + (r1 >> 2) at ctx+24/32/40.
# Without __riscv_misaligned_fast, an unaligned key pointer is handled
# by merging aligned doubleword loads with srl/sll/or.
$code.=<<___;
#if __riscv_xlen == 64
# if __SIZEOF_POINTER__ == 16
#  define PUSH	csc
#  define POP	clc
# else
#  define PUSH	sd
#  define POP	ld
# endif
#else
# error "unsupported __riscv_xlen"
#endif

.option	pic
.text

.globl	poly1305_init
.type	poly1305_init,\@function
poly1305_init:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#ifndef	__riscv_misaligned_fast
	andi	$tmp0,$inp,7		# $inp % 8
	andi	$inp,$inp,-8		# align $inp
	slli	$tmp0,$tmp0,3		# byte to bit offset
#endif
	ld	$in0,0($inp)
	ld	$in1,8($inp)
#ifndef	__riscv_misaligned_fast
	beqz	$tmp0,.Laligned_key

	ld	$tmp2,16($inp)
	neg	$tmp1,$tmp0		# implicit &63 in sll
	srl	$in0,$in0,$tmp0
	sll	$tmp3,$in1,$tmp1
	srl	$in1,$in1,$tmp0
	sll	$tmp2,$tmp2,$tmp1
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2

.Laligned_key:
#endif
	li	$tmp0,1
	slli	$tmp0,$tmp0,32		# 0x0000000100000000
	addi	$tmp0,$tmp0,-63		# 0x00000000ffffffc1
	slli	$tmp0,$tmp0,28		# 0x0ffffffc10000000
	addi	$tmp0,$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$in0,$tmp0
	addi	$tmp0,$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$in1,$tmp0

	sd	$in0,24($ctx)
	srli	$tmp0,$in1,2
	sd	$in1,32($ctx)
	add	$tmp0,$tmp0,$in1	# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$a0,0			# return 0
	ret
.size	poly1305_init,.-poly1305_init
___
{
# Register roles for the 64-bit block loop; $d0-$d2 reuse $in0/$in1/$t2.
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
my ($shr,$shl) = ($t5,$t6);		# used on R6

# poly1305_blocks(ctx, inp, len, padbit): absorb len bytes (rounded down
# to a multiple of 16).  The accumulator lives in base-2^64 limbs
# h0,h1,h2 (ctx+0/8/16); r0,r1 and s1 = r1 + (r1>>2) come from
# ctx+24/32/40.  Each iteration first folds the previous h2's upper
# bits back into the low limbs ("modulo-scheduled reduction": since
# 2^130 == 5 (mod p), (h2 & ~3) + (h2 >> 2) == 5*(h2 >> 2)), then adds
# the next 16-byte block plus padbit, and multiplies by r with mul/mulhu
# partial products, propagating carries via sltu.  The extra-space
# indentation marks the carry/accumulate chain interleaved with the
# multiplies for scheduling.  caddi/PUSH/POP expand to CHERI or plain
# forms by the post-processing loop / cpp macros.
$code.=<<___;
.globl	poly1305_blocks
.type	poly1305_blocks,\@function
poly1305_blocks:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	andi	$len,$len,-16		# complete blocks only
	beqz	$len,.Lno_data

	caddi	$sp,$sp,-4*__SIZEOF_POINTER__
	PUSH	$s0,3*__SIZEOF_POINTER__($sp)
	PUSH	$s1,2*__SIZEOF_POINTER__($sp)
	PUSH	$s2,1*__SIZEOF_POINTER__($sp)
	PUSH	$s3,0*__SIZEOF_POINTER__($sp)

#ifndef	__riscv_misaligned_fast
	andi	$shr,$inp,7
	andi	$inp,$inp,-8		# align $inp
	slli	$shr,$shr,3		# byte to bit offset
	neg	$shl,$shr		# implicit &63 in sll
#endif

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	add	$len,$len,$inp		# end of buffer

.Loop:
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
#ifndef	__riscv_misaligned_fast
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
	srl	$in0,$in0,$shr
	sll	$tmp3,$in1,$shl
	srl	$in1,$in1,$shr
	sll	$tmp2,$tmp2,$shl
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2

.Laligned_inp:
#endif
	caddi	$inp,$inp,16

	andi	$tmp0,$h2,-4		# modulo-scheduled reduction
	srli	$tmp1,$h2,2
	andi	$h2,$h2,3

	add	$d0,$h0,$in0		# accumulate input
	 add	$tmp1,$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	add	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	add	$d1,$h1,$in1
	add	$tmp0,$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	add	$d1,$d1,$tmp0

	 add	$d2,$h2,$padbit
	 sltu	$tmp0,$d1,$tmp0
	mulhu	$h1,$r0,$d0		# h0*r0
	mul	$h0,$r0,$d0

	 add	$d2,$d2,$tmp1
	 add	$d2,$d2,$tmp0
	mulhu	$tmp1,$rs1,$d1		# h1*5*r1
	mul	$tmp0,$rs1,$d1

	mulhu	$h2,$r1,$d0		# h0*r1
	mul	$tmp2,$r1,$d0
	 add	$h0,$h0,$tmp0
	 add	$h1,$h1,$tmp1
	 sltu	$tmp0,$h0,$tmp0

	 add	$h1,$h1,$tmp0
	 add	$h1,$h1,$tmp2
	mulhu	$tmp1,$r0,$d1		# h1*r0
	mul	$tmp0,$r0,$d1

	 sltu	$tmp2,$h1,$tmp2
	 add	$h2,$h2,$tmp2
	mul	$tmp2,$rs1,$d2		# h2*5*r1

	 add	$h1,$h1,$tmp0
	 add	$h2,$h2,$tmp1
	mul	$tmp3,$r0,$d2		# h2*r0
	 sltu	$tmp0,$h1,$tmp0
	 add	$h2,$h2,$tmp0

	add	$h1,$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	add	$h2,$h2,$tmp2
	add	$h2,$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	POP	$s0,3*__SIZEOF_POINTER__($sp)		# epilogue
	POP	$s1,2*__SIZEOF_POINTER__($sp)
	POP	$s2,1*__SIZEOF_POINTER__($sp)
	POP	$s3,0*__SIZEOF_POINTER__($sp)
	caddi	$sp,$sp,4*__SIZEOF_POINTER__

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

# poly1305_emit(ctx, mac, nonce): perform the final reduction of the
# accumulator, then a constant-time conditional subtraction of the
# modulus: h+5 is computed alongside h, bit 2 of the resulting top limb
# (srli/neg produce an all-ones or all-zero mask) selects between the
# two candidates with the xor/and/xor idiom.  The 128-bit little-endian
# nonce is then added and the 16-byte tag stored little-endian —
# byte-by-byte unless __riscv_misaligned_fast permits two sd's.
$code.=<<___;
.globl	poly1305_emit
.type	poly1305_emit,\@function
poly1305_emit:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	andi	$in0,$tmp2,-4		# final reduction
	srl	$in1,$tmp2,2
	andi	$tmp2,$tmp2,3
	add	$in0,$in0,$in1

	add	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	 addi	$in0,$tmp0,5		# compare to modulus
	add	$tmp1,$tmp1,$in1
	 sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	 add	$in1,$tmp1,$tmp3
	add	$tmp2,$tmp2,$tmp4
	 sltu	$tmp3,$in1,$tmp3
	 add	$tmp2,$tmp2,$tmp3

	srli	$tmp2,$tmp2,2		# see if it carried/borrowed
	neg	$tmp2,$tmp2

	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1
	and	$in0,$in0,$tmp2
	and	$in1,$in1,$tmp2
	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	slli	$tmp1,$tmp1,32
	slli	$tmp3,$tmp3,32
	or	$tmp0,$tmp0,$tmp1
	or	$tmp2,$tmp2,$tmp3

	add	$in0,$in0,$tmp0		# accumulate nonce
	add	$in1,$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	add	$in1,$in1,$tmp0

#ifdef	__riscv_misaligned_fast
	sd	$in0,0($mac)		# write mac value
	sd	$in1,8($mac)
#else
	srli	$tmp0,$in0,8		# write mac value
	srli	$tmp1,$in0,16
	srli	$tmp2,$in0,24
	sb	$in0,0($mac)
	srli	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	srli	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	srli	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	srli	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	srli	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	srli	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	srli	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	srli	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	srli	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	srli	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	srli	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)
#endif

	ret
.size	poly1305_emit,.-poly1305_emit
.string	"Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
___
}
}}} else {{{
######################################################################
# 32-bit code path
#

# Register roles for the 32-bit routines (interpolated into the asm).
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);

# poly1305_init(ctx, key), 32-bit flavour: zero the accumulator held in
# five 32-bit limbs at ctx+0..16; if key != NULL, load its low 16 bytes,
# clamp each word (masks 0x0fffffff / 0x0ffffffc built from lui+addi),
# store r0..r3 at ctx+20..32 and the precomputed s_i = r_i + (r_i >> 2)
# at ctx+36..44.  The MULX cpp macro yields a 32x32->64 product: mulhu
# + mul on rv32, a single widening mul after zero-extension on rv64.
$code.=<<___;
#if __riscv_xlen == 32
# if __SIZEOF_POINTER__ == 8
#  define PUSH	csc
#  define POP	clc
# else
#  define PUSH	sw
#  define POP	lw
# endif
# define MULX(hi,lo,a,b)	mulhu hi,a,b; mul lo,a,b
# define srliw	srli
# define srlw	srl
# define sllw	sll
# define addw	add
# define addiw	addi
# define mulw	mul
#elif __riscv_xlen == 64
# if __SIZEOF_POINTER__ == 16
#  define PUSH	csc
#  define POP	clc
# else
#  define PUSH	sd
#  define POP	ld
# endif
# define MULX(hi,lo,a,b)	slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
#else
# error "unsupported __riscv_xlen"
#endif

.option	pic
.text

.globl	poly1305_init
.type	poly1305_init,\@function
poly1305_init:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#ifndef	__riscv_misaligned_fast
	andi	$tmp0,$inp,3		# $inp % 4
	sub	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
#endif
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
#ifndef	__riscv_misaligned_fast
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	sub	$tmp1,$zero,$tmp0
	srlw	$in0,$in0,$tmp0
	sllw	$tmp3,$in1,$tmp1
	srlw	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllw	$tmp3,$in2,$tmp1
	srlw	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllw	$tmp3,$in3,$tmp1
	srlw	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllw	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
.Laligned_key:
#endif

	lui	$tmp0,0x10000
	addi	$tmp0,$tmp0,-1		# 0x0fffffff
	and	$in0,$in0,$tmp0
	addi	$tmp0,$tmp0,-3		# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srlw	$tmp1,$in1,2
	srlw	$tmp2,$in2,2
	srlw	$tmp3,$in3,2
	addw	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addw	$in2,$in2,$tmp2
	addw	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
.Lno_key:
	li	$a0,0
	ret
.size	poly1305_init,.-poly1305_init
___
{
# Register roles for the 32-bit block loop; note $ra doubles as the
# misalignment shift count, hence it is saved with s0-s8 below.
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
my $shr = $ra;		# used on R6

# poly1305_blocks(ctx, inp, len, padbit), 32-bit flavour: absorb len
# bytes (rounded down to a multiple of 16).  The accumulator is kept in
# base-2^32 limbs h0..h4 (ctx+0..16); r0..r3 and s1..s3 = r_i + (r_i>>2)
# come from ctx+20..44.  Each iteration folds h4's upper bits back into
# the low limbs first (modulo-scheduled reduction: 2^130 == 5 (mod p),
# so (h4 & ~3) + (h4 >> 2) == 5*(h4 >> 2)), adds the next block plus
# padbit with sltu carry propagation, then accumulates the 4x4 grid of
# MULX partial products plus the h4 cross terms.  Extra-space
# indentation marks the interleaved carry/accumulate chain.  Prologue
# and epilogue use Zcmp cm.push/cm.popret when available.
$code.=<<___;
.globl	poly1305_blocks
.type	poly1305_blocks,\@function
poly1305_blocks:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	andi	$len,$len,-16		# complete blocks only
	beqz	$len,.Labort

#ifdef	__riscv_zcmp
	cm.push	{ra,s0-s8}, -48
#else
	caddi	$sp,$sp,-__SIZEOF_POINTER__*12
	PUSH	$ra, __SIZEOF_POINTER__*11($sp)
	PUSH	$s0, __SIZEOF_POINTER__*10($sp)
	PUSH	$s1, __SIZEOF_POINTER__*9($sp)
	PUSH	$s2, __SIZEOF_POINTER__*8($sp)
	PUSH	$s3, __SIZEOF_POINTER__*7($sp)
	PUSH	$s4, __SIZEOF_POINTER__*6($sp)
	PUSH	$s5, __SIZEOF_POINTER__*5($sp)
	PUSH	$s6, __SIZEOF_POINTER__*4($sp)
	PUSH	$s7, __SIZEOF_POINTER__*3($sp)
	PUSH	$s8, __SIZEOF_POINTER__*2($sp)
#endif

#ifndef	__riscv_misaligned_fast
	andi	$shr,$inp,3
	andi	$inp,$inp,-4		# align $inp
	slli	$shr,$shr,3		# byte to bit offset
#endif

	lw	$h0,0($ctx)		# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)		# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	add	$len,$len,$inp		# end of buffer

.Loop:
	lw	$d0,0($inp)		# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
#ifndef	__riscv_misaligned_fast
	beqz	$shr,.Laligned_inp

	lw	$t4,16($inp)
	sub	$t5,$zero,$shr
	srlw	$d0,$d0,$shr
	sllw	$t3,$d1,$t5
	srlw	$d1,$d1,$shr
	or	$d0,$d0,$t3
	sllw	$t3,$d2,$t5
	srlw	$d2,$d2,$shr
	or	$d1,$d1,$t3
	sllw	$t3,$d3,$t5
	srlw	$d3,$d3,$shr
	or	$d2,$d2,$t3
	sllw	$t4,$t4,$t5
	or	$d3,$d3,$t4

.Laligned_inp:
#endif
	srliw	$t3,$h4,2		# modulo-scheduled reduction
	andi	$t4,$h4,-4
	andi	$h4,$h4,3

	addw	$d0,$d0,$h0		# accumulate input
	 addw	$t4,$t4,$t3
	sltu	$h0,$d0,$h0
	addw	$d0,$d0,$t4		# ... and residue
	sltu	$t4,$d0,$t4

	addw	$d1,$d1,$h1
	 addw	$h0,$h0,$t4		# carry
	sltu	$h1,$d1,$h1
	addw	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addw	$d2,$d2,$h2
	 addw	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addw	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addw	$d3,$d3,$h3
	 addw	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addw	$d3,$d3,$h2

	MULX	($h1,$h0,$r0,$d0)	# d0*r0

	 sltu	$h2,$d3,$h2
	 addw	$h3,$h3,$h2		# carry

	MULX	($t4,$t3,$rs3,$d1)	# d1*s3

	 addw	$h4,$h4,$padbit
	 caddi	$inp,$inp,16
	 addw	$h4,$h4,$h3

	MULX	($t6,$a3,$rs2,$d2)	# d2*s2
	 addw	$h0,$h0,$t3
	 addw	$h1,$h1,$t4
	 sltu	$t3,$h0,$t3
	 addw	$h1,$h1,$t3

	MULX	($t4,$t3,$rs1,$d3)	# d3*s1
	 addw	$h0,$h0,$a3
	 addw	$h1,$h1,$t6
	 sltu	$a3,$h0,$a3
	 addw	$h1,$h1,$a3


	MULX	($h2,$a3,$r1,$d0)	# d0*r1
	 addw	$h0,$h0,$t3
	 addw	$h1,$h1,$t4
	 sltu	$t3,$h0,$t3
	 addw	$h1,$h1,$t3

	MULX	($t4,$t3,$r0,$d1)	# d1*r0
	 addw	$h1,$h1,$a3
	 sltu	$a3,$h1,$a3
	 addw	$h2,$h2,$a3

	MULX	($t6,$a3,$rs3,$d2)	# d2*s3
	 addw	$h1,$h1,$t3
	 addw	$h2,$h2,$t4
	 sltu	$t3,$h1,$t3
	 addw	$h2,$h2,$t3

	MULX	($t4,$t3,$rs2,$d3)	# d3*s2
	 addw	$h1,$h1,$a3
	 addw	$h2,$h2,$t6
	 sltu	$a3,$h1,$a3
	 addw	$h2,$h2,$a3

	mulw	$a3,$rs1,$h4		# h4*s1
	 addw	$h1,$h1,$t3
	 addw	$h2,$h2,$t4
	 sltu	$t3,$h1,$t3
	 addw	$h2,$h2,$t3


	MULX	($h3,$t3,$r2,$d0)	# d0*r2
	 addw	$h1,$h1,$a3
	 sltu	$a3,$h1,$a3
	 addw	$h2,$h2,$a3

	MULX	($t6,$a3,$r1,$d1)	# d1*r1
	 addw	$h2,$h2,$t3
	 sltu	$t3,$h2,$t3
	 addw	$h3,$h3,$t3

	MULX	($t4,$t3,$r0,$d2)	# d2*r0
	 addw	$h2,$h2,$a3
	 addw	$h3,$h3,$t6
	 sltu	$a3,$h2,$a3
	 addw	$h3,$h3,$a3

	MULX	($t6,$a3,$rs3,$d3)	# d3*s3
	 addw	$h2,$h2,$t3
	 addw	$h3,$h3,$t4
	 sltu	$t3,$h2,$t3
	 addw	$h3,$h3,$t3

	mulw	$t3,$rs2,$h4		# h4*s2
	 addw	$h2,$h2,$a3
	 addw	$h3,$h3,$t6
	 sltu	$a3,$h2,$a3
	 addw	$h3,$h3,$a3


	MULX	($t6,$a3,$r3,$d0)	# d0*r3
	 addw	$h2,$h2,$t3
	 sltu	$t3,$h2,$t3
	 addw	$h3,$h3,$t3

	MULX	($t4,$t3,$r2,$d1)	# d1*r2
	 addw	$h3,$h3,$a3
	 sltu	$a3,$h3,$a3
	 addw	$t6,$t6,$a3

	MULX	($a3,$d3,$r0,$d3)	# d3*r0
	 addw	$h3,$h3,$t3
	 addw	$t6,$t6,$t4
	 sltu	$t3,$h3,$t3
	 addw	$t6,$t6,$t3

	MULX	($t4,$t3,$r1,$d2)	# d2*r1
	 addw	$h3,$h3,$d3
	 addw	$t6,$t6,$a3
	 sltu	$d3,$h3,$d3
	 addw	$t6,$t6,$d3

	mulw	$a3,$rs3,$h4		# h4*s3
	 addw	$h3,$h3,$t3
	 addw	$t6,$t6,$t4
	 sltu	$t3,$h3,$t3
	 addw	$t6,$t6,$t3


	mulw	$h4,$r0,$h4		# h4*r0
	 addw	$h3,$h3,$a3
	 sltu	$a3,$h3,$a3
	 addw	$t6,$t6,$a3
	addw	$h4,$t6,$h4

	li	$padbit,1		# if we loop, padbit is 1

	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

#ifdef	__riscv_zcmp
	cm.popret	{ra,s0-s8}, 48
#else
	POP	$ra, __SIZEOF_POINTER__*11($sp)
	POP	$s0, __SIZEOF_POINTER__*10($sp)
	POP	$s1, __SIZEOF_POINTER__*9($sp)
	POP	$s2, __SIZEOF_POINTER__*8($sp)
	POP	$s3, __SIZEOF_POINTER__*7($sp)
	POP	$s4, __SIZEOF_POINTER__*6($sp)
	POP	$s5, __SIZEOF_POINTER__*5($sp)
	POP	$s6, __SIZEOF_POINTER__*4($sp)
	POP	$s7, __SIZEOF_POINTER__*3($sp)
	POP	$s8, __SIZEOF_POINTER__*2($sp)
	caddi	$sp,$sp,__SIZEOF_POINTER__*12
#endif
.Labort:
	ret
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
# $ctx is consumed early and then recycled as a scratch/carry register.
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

# poly1305_emit(ctx, mac, nonce), 32-bit flavour: final reduction of the
# five-limb accumulator, then a constant-time conditional subtraction of
# the modulus (h+5 is computed alongside h; srl/sub of the top carry
# produce an all-ones or all-zero mask that selects, via xor/and/xor,
# between h and h-p).  The 128-bit little-endian nonce is added with
# full sltu carry chains and the 16-byte tag is stored little-endian —
# byte-by-byte unless __riscv_misaligned_fast permits word stores.
$code.=<<___;
.globl	poly1305_emit
.type	poly1305_emit,\@function
poly1305_emit:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	srliw	$ctx,$tmp4,2		# final reduction
	andi	$in0,$tmp4,-4
	andi	$tmp4,$tmp4,3
	addw	$ctx,$ctx,$in0

	addw	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	 addiw	$in0,$tmp0,5		# compare to modulus
	addw	$tmp1,$tmp1,$ctx
	 sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	 addw	$in1,$in1,$tmp1
	addw	$tmp2,$tmp2,$ctx
	 sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	 addw	$in2,$in2,$tmp2
	addw	$tmp3,$tmp3,$ctx
	 sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	 addw	$in3,$in3,$tmp3
	addw	$tmp4,$tmp4,$ctx
	 sltu	$ctx,$in3,$tmp3
	 addw	$ctx,$ctx,$tmp4

	srl	$ctx,$ctx,2		# see if it carried/borrowed
	sub	$ctx,$zero,$ctx

	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1
	xor	$in2,$in2,$tmp2
	xor	$in3,$in3,$tmp3
	and	$in0,$in0,$ctx
	and	$in1,$in1,$ctx
	and	$in2,$in2,$ctx
	and	$in3,$in3,$ctx
	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1
	xor	$in2,$in2,$tmp2
	xor	$in3,$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addw	$in0,$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addw	$in1,$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addw	$in1,$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addw	$ctx,$ctx,$tmp1

	addw	$in2,$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addw	$in2,$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addw	$ctx,$ctx,$tmp2

	addw	$in3,$in3,$tmp3
	addw	$in3,$in3,$ctx

#ifdef	__riscv_misaligned_fast
	sw	$in0,0($mac)		# write mac value
	sw	$in1,4($mac)
	sw	$in2,8($mac)
	sw	$in3,12($mac)
#else
	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)
#endif

	ret
.size	poly1305_emit,.-poly1305_emit
.string	"Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
___
}
}}}
832
# Post-process and print the generated assembly line by line.  For CHERI
# flavours the integer mnemonics/operands are rewritten into their
# capability forms; for plain RISC-V the capability-style pseudo-ops
# used in the templates fall back to their integer equivalents.  The
# regex chains deliberately rely on s///'s success value with
# low-precedence and/or to sequence dependent rewrites — kept verbatim.
foreach (split("\n", $code)) {
    if ($flavour =~ /^cheri/) {
	# memory operand (xN) -> (cN), and the load/store mnemonic with it
	s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
	# PUSH/POP operands, or control transfers, to capability forms
	s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
	s/\b(ret|jal)\b/c$1/;
	# cadd/caddi -> cincoffset with all-c operands; cmove operands -> cN
	s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
	m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
    } else {
	# non-CHERI: capability pseudo-ops degrade to integer add/mv
	s/\bcaddi?\b/add/ or
	s/\bcmove\b/mv/;
    }
    print $_, "\n";
}

# STDOUT may be a redirected output file; buffered write errors only
# surface at close, so check it (the original `close STDOUT;` silently
# discarded them).
close STDOUT or die "error closing STDOUT: $!";
848