/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

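// Editorial note (not from the generator): poly1305_init is entered with
// x0 = hash context, x1 = key (only the first 16 bytes, the "r" half, are
// read here) and x2 = a two-slot table that receives the blocks/emit
// function pointers chosen at run time (scalar or NEON).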
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
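	// Editorial note: the two ands below clamp r as Poly1305 requires,
	// i.e. r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, clearing the top four
	// bits of each 32-bit word and the low two bits of the upper words.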
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
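	// Editorial note: r1 has its low two bits clamped to zero, so
	// s1 = r1 + (r1 >> 2) = 5*r1/4 exactly; multiplying h1 by s1 stands in
	// for the h1*r1*2^128 term, since 2^128 == 5/4 (mod 2^130 - 5).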
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

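	// Editorial note: x14 now holds bits 128 and up of h*r.  Bits at or
	// above 2^130 are folded back multiplied by 5 (2^130 == 5 mod p): with
	// c = x14>>2, 5*c = (x14 & -4) + (x14 >> 2), and h2 keeps only x14 & 3.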
	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

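	// Editorial note: if h + 5 sets bit 130 or above, then h >= p and the
	// reduced value is h + 5 - 2^130; the 2^130 term disappears because
	// only the low 128 bits are written out, so the csels pick x12:x13.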
	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

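// Editorial note: poly1305_splat splits the value in x4:x5:x6 into five
// 26-bit limbs and stores r_i and 5*r_i at a 16-byte stride, i.e. it fills
// one 32-bit lane of each vector in the r^4..r^1 power table.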
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

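// Editorial note: poly1305_blocks_neon falls back to the scalar
// poly1305_blocks for inputs shorter than 128 bytes unless the hash is
// already kept in base 2^26 (the is_base2_26 flag loaded from ctx+24).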
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

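	// Editorial note: the five 26-bit limbs satisfy
	// h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104 and are repacked
	// here into two 64-bit words plus a small top word in x14.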
	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
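	// Editorial note: r^1 is splatted into the highest lane first; each
	// subsequent poly1305_mult/poly1305_splat pair writes one lane lower
	// (the "sub x0,x0,#4"), so lane 0 ends up holding r^4 and lane 3 r^1.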
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo
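	// Editorial note: when fewer than 64 bytes remain, x16 is redirected
	// to .Lzeros so that the speculative inp[2:3] loads fetch zeros.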

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

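	// Editorial note: x3 is the 2^128 pad bit; shifted left by 24 it lands
	// in the top 26-bit limb when it is added to inp >> 40 further down.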
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
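	// Editorial note: each step below carries into the next limb only,
	// leaving limbs slightly above 26 bits; that slack is harmless because
	// the following multiply-accumulate round tolerates it.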

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

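	// Editorial note: the hash is kept in base 2^26 here, so it is first
	// repacked into base 2^64 and reduced; the rest matches poly1305_emit.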
	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
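// Editorial note: the .byte string above spells
// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>".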
.align	2
.align	2
