/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	AARCH64_VALID_CALL_TARGET
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]	// save key value
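	// The masks above implement the standard Poly1305 clamping of r
	// (RFC 8439): the top four bits of every 32-bit word of r are
	// cleared, along with the low two bits of the three upper words,
	// i.e. r &= 0x0ffffffc0ffffffc0ffffffc0fffffff.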

	tst	w17,#ARMV7_NEON

	adrp	x12,poly1305_blocks
	add	x12,x12,#:lo12:.Lpoly1305_blocks
	adrp	x7,poly1305_blocks_neon
	add	x7,x7,#:lo12:.Lpoly1305_blocks_neon
	adrp	x13,poly1305_emit
	add	x13,x13,#:lo12:.Lpoly1305_emit
	adrp	x8,poly1305_emit_neon
	add	x8,x8,#:lo12:.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif
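	// A two-entry function table is stored at [x2]: pointers to
	// either the scalar or the NEON blocks/emit routines, chosen by
	// the ARMV7_NEON capability bit tested above.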

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	// The symbol .Lpoly1305_blocks is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
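	// Clamping forces r1 = 0 (mod 4), so s1 = r1 + (r1>>2) = 5*r1/4
	// exactly; h1*s1 accumulated at limb weight 2^0 stands in for
	// h1*r1 at weight 2^128, since 2^128 = 5/4 (mod 2^130-5).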
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__AARCH64EB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3		// h2 += padbit (x3, the 4th argument)
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr
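	// x14 holds h2 plus carries, i.e. bits 128 and up. Everything at
	// or above bit 130 folds back into h0 as (x14>>2)*5, computed as
	// (x14 & -4) + (x14>>2), since 2^130 = 5 (mod 2^130-5); h2 keeps
	// only its low two bits.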

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	// The symbol .Lpoly1305_emit is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr
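	// If h >= 2^130-5, then h+5 carries into bit 130 (caught by the
	// tst below) and the reduced value h-(2^130-5) is the low 128
	// bits of h+5, which is all that gets emitted; otherwise h is
	// already fully reduced.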

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
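	// Same multiply-and-reduce as the body of .Loop above, factored
	// out as a subroutine; poly1305_blocks_neon uses it to compute
	// the powers r^2..r^4.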
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40
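	// Split the 130-bit value x6:x5:x4 into five 26-bit limbs:
	// r0 = bits 25-0, r1 = bits 51-26, r2 = bits 77-52,
	// r3 = bits 103-78, r4 = bits 129-104.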

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4
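	// One power of r (and its 5*r multiples) per 32-bit lane: the
	// 16-byte stride leaves three lanes free in each vector, which
	// the repeated splat calls below (with x0 stepped back by 4)
	// fill with the remaining powers.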

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	// The symbol .Lpoly1305_blocks_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks
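	// x17 = is_base2_26. Inputs shorter than 128 bytes whose hash is
	// still in base 2^64 take the scalar path above; once the hash
	// has been converted to base 2^26, stay in the NEON path.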

.Lblocks_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...
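	// h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104 reassembled into
	// two 64-bit limbs in x5:x4 plus a small top limb in x14.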

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3		// h2 += padbit

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3		// h2 += padbit

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]
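	// The table at ctx+48 now holds r^1..r^4 (with the 5*r^i
	// multiples) in base 2^26, one power per 32-bit lane, ready for
	// the vectorized loop.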

	add	x16,x1,#32
	adrp	x17,.Lzeros
	add	x17,x17,#:lo12:.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adrp	x17,.Lzeros
	add	x17,x17,#:lo12:.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24		// position padbit at bit 24 of limb 4 (bit 128 overall)
	add	x15,x0,#48

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38
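	// v31 = 0x03ffffff in each 64-bit lane (all-ones >> 38 = 2^26-1),
	// the limb mask used by the lazy reductions below.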

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4
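	// One carry pass per iteration suffices: limbs stay only
	// slightly above 26 bits and the 64-bit accumulators absorb the
	// excess until the next pass (the "lazy" part). The h4 carry
	// re-enters h0 multiplied by 5, via shl #2 plus two adds.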

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	// The symbol .Lpoly1305_emit_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.section	.rodata

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
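// The byte string below spells out the NUL-terminated attribution
// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>".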
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2