xref: /linux/arch/arm/crypto/blake2b-neon-core.S (revision d0034a7a4ac7fae708146ac0059b9c47a1543f0d)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

12*1862eb00SEric Biggers	.text
13*1862eb00SEric Biggers	.fpu		neon
14*1862eb00SEric Biggers
15*1862eb00SEric Biggers	// The arguments to blake2b_compress_neon()
16*1862eb00SEric Biggers	STATE		.req	r0
17*1862eb00SEric Biggers	BLOCK		.req	r1
18*1862eb00SEric Biggers	NBLOCKS		.req	r2
19*1862eb00SEric Biggers	INC		.req	r3
20*1862eb00SEric Biggers
21*1862eb00SEric Biggers	// Pointers to the rotation tables
22*1862eb00SEric Biggers	ROR24_TABLE	.req	r4
23*1862eb00SEric Biggers	ROR16_TABLE	.req	r5
24*1862eb00SEric Biggers
25*1862eb00SEric Biggers	// The original stack pointer
26*1862eb00SEric Biggers	ORIG_SP		.req	r6
27*1862eb00SEric Biggers
28*1862eb00SEric Biggers	// NEON registers which contain the message words of the current block.
29*1862eb00SEric Biggers	// M_0-M_3 are occasionally used for other purposes too.
30*1862eb00SEric Biggers	M_0		.req	d16
31*1862eb00SEric Biggers	M_1		.req	d17
32*1862eb00SEric Biggers	M_2		.req	d18
33*1862eb00SEric Biggers	M_3		.req	d19
34*1862eb00SEric Biggers	M_4		.req	d20
35*1862eb00SEric Biggers	M_5		.req	d21
36*1862eb00SEric Biggers	M_6		.req	d22
37*1862eb00SEric Biggers	M_7		.req	d23
38*1862eb00SEric Biggers	M_8		.req	d24
39*1862eb00SEric Biggers	M_9		.req	d25
40*1862eb00SEric Biggers	M_10		.req	d26
41*1862eb00SEric Biggers	M_11		.req	d27
42*1862eb00SEric Biggers	M_12		.req	d28
43*1862eb00SEric Biggers	M_13		.req	d29
44*1862eb00SEric Biggers	M_14		.req	d30
45*1862eb00SEric Biggers	M_15		.req	d31
46*1862eb00SEric Biggers
47*1862eb00SEric Biggers	.align		4
48*1862eb00SEric Biggers	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
49*1862eb00SEric Biggers	// instruction.  This is the most efficient way to implement these
50*1862eb00SEric Biggers	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
51*1862eb00SEric Biggers	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
52*1862eb00SEric Biggers.Lror24_table:
53*1862eb00SEric Biggers	.byte		3, 4, 5, 6, 7, 0, 1, 2
54*1862eb00SEric Biggers.Lror16_table:
55*1862eb00SEric Biggers	.byte		2, 3, 4, 5, 6, 7, 0, 1
56*1862eb00SEric Biggers	// The BLAKE2b initialization vector
57*1862eb00SEric Biggers.Lblake2b_IV:
58*1862eb00SEric Biggers	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
59*1862eb00SEric Biggers	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
60*1862eb00SEric Biggers	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
61*1862eb00SEric Biggers	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
62*1862eb00SEric Biggers
63*1862eb00SEric Biggers// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
64*1862eb00SEric Biggers// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
65*1862eb00SEric Biggers// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
66*1862eb00SEric Biggers// (M_0-M_3), so that they can be reloaded if they are used as temporary
67*1862eb00SEric Biggers// registers.  The macro arguments s0-s15 give the order in which the message
68*1862eb00SEric Biggers// words are used in this round.  'final' is 1 if this is the final round.
69*1862eb00SEric Biggers.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
70*1862eb00SEric Biggers			s8, s9, s10, s11, s12, s13, s14, s15, final=0
71*1862eb00SEric Biggers
72*1862eb00SEric Biggers	// Mix the columns:
73*1862eb00SEric Biggers	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
74*1862eb00SEric Biggers	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
75*1862eb00SEric Biggers
76*1862eb00SEric Biggers	// a += b + m[blake2b_sigma[r][2*i + 0]];
77*1862eb00SEric Biggers	vadd.u64	q0, q0, q2
78*1862eb00SEric Biggers	vadd.u64	q1, q1, q3
79*1862eb00SEric Biggers	vadd.u64	d0, d0, M_\s0
80*1862eb00SEric Biggers	vadd.u64	d1, d1, M_\s2
81*1862eb00SEric Biggers	vadd.u64	d2, d2, M_\s4
82*1862eb00SEric Biggers	vadd.u64	d3, d3, M_\s6
83*1862eb00SEric Biggers
84*1862eb00SEric Biggers	// d = ror64(d ^ a, 32);
85*1862eb00SEric Biggers	veor		q6, q6, q0
86*1862eb00SEric Biggers	veor		q7, q7, q1
87*1862eb00SEric Biggers	vrev64.32	q6, q6
88*1862eb00SEric Biggers	vrev64.32	q7, q7
89*1862eb00SEric Biggers
90*1862eb00SEric Biggers	// c += d;
91*1862eb00SEric Biggers	vadd.u64	q4, q4, q6
92*1862eb00SEric Biggers	vadd.u64	q5, q5, q7
93*1862eb00SEric Biggers
94*1862eb00SEric Biggers	// b = ror64(b ^ c, 24);
95*1862eb00SEric Biggers	vld1.8		{M_0}, [ROR24_TABLE, :64]
96*1862eb00SEric Biggers	veor		q2, q2, q4
97*1862eb00SEric Biggers	veor		q3, q3, q5
98*1862eb00SEric Biggers	vtbl.8		d4, {d4}, M_0
99*1862eb00SEric Biggers	vtbl.8		d5, {d5}, M_0
100*1862eb00SEric Biggers	vtbl.8		d6, {d6}, M_0
101*1862eb00SEric Biggers	vtbl.8		d7, {d7}, M_0
102*1862eb00SEric Biggers
103*1862eb00SEric Biggers	// a += b + m[blake2b_sigma[r][2*i + 1]];
104*1862eb00SEric Biggers	//
105*1862eb00SEric Biggers	// M_0 got clobbered above, so we have to reload it if any of the four
106*1862eb00SEric Biggers	// message words this step needs happens to be M_0.  Otherwise we don't
107*1862eb00SEric Biggers	// need to reload it here, as it will just get clobbered again below.
108*1862eb00SEric Biggers.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
109*1862eb00SEric Biggers	vld1.8		{M_0}, [sp, :64]
110*1862eb00SEric Biggers.endif
111*1862eb00SEric Biggers	vadd.u64	q0, q0, q2
112*1862eb00SEric Biggers	vadd.u64	q1, q1, q3
113*1862eb00SEric Biggers	vadd.u64	d0, d0, M_\s1
114*1862eb00SEric Biggers	vadd.u64	d1, d1, M_\s3
115*1862eb00SEric Biggers	vadd.u64	d2, d2, M_\s5
116*1862eb00SEric Biggers	vadd.u64	d3, d3, M_\s7
117*1862eb00SEric Biggers
118*1862eb00SEric Biggers	// d = ror64(d ^ a, 16);
119*1862eb00SEric Biggers	vld1.8		{M_0}, [ROR16_TABLE, :64]
120*1862eb00SEric Biggers	veor		q6, q6, q0
121*1862eb00SEric Biggers	veor		q7, q7, q1
122*1862eb00SEric Biggers	vtbl.8		d12, {d12}, M_0
123*1862eb00SEric Biggers	vtbl.8		d13, {d13}, M_0
124*1862eb00SEric Biggers	vtbl.8		d14, {d14}, M_0
125*1862eb00SEric Biggers	vtbl.8		d15, {d15}, M_0
126*1862eb00SEric Biggers
127*1862eb00SEric Biggers	// c += d;
128*1862eb00SEric Biggers	vadd.u64	q4, q4, q6
129*1862eb00SEric Biggers	vadd.u64	q5, q5, q7
130*1862eb00SEric Biggers
131*1862eb00SEric Biggers	// b = ror64(b ^ c, 63);
132*1862eb00SEric Biggers	//
133*1862eb00SEric Biggers	// This rotation amount isn't a multiple of 8, so it has to be
134*1862eb00SEric Biggers	// implemented using a pair of shifts, which requires temporary
135*1862eb00SEric Biggers	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
136*1862eb00SEric Biggers	veor		q8, q2, q4
137*1862eb00SEric Biggers	veor		q9, q3, q5
138*1862eb00SEric Biggers	vshr.u64	q2, q8, #63
139*1862eb00SEric Biggers	vshr.u64	q3, q9, #63
140*1862eb00SEric Biggers	vsli.u64	q2, q8, #1
141*1862eb00SEric Biggers	vsli.u64	q3, q9, #1
142*1862eb00SEric Biggers	vld1.8		{q8-q9}, [sp, :256]
143*1862eb00SEric Biggers
144*1862eb00SEric Biggers	// Mix the diagonals:
145*1862eb00SEric Biggers	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
146*1862eb00SEric Biggers	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
147*1862eb00SEric Biggers	//
148*1862eb00SEric Biggers	// There are two possible ways to do this: use 'vext' instructions to
149*1862eb00SEric Biggers	// shift the rows of the matrix so that the diagonals become columns,
150*1862eb00SEric Biggers	// and undo it afterwards; or just use 64-bit operations on 'd'
151*1862eb00SEric Biggers	// registers instead of 128-bit operations on 'q' registers.  We use the
152*1862eb00SEric Biggers	// latter approach, as it performs much better on Cortex-A7.
153*1862eb00SEric Biggers
154*1862eb00SEric Biggers	// a += b + m[blake2b_sigma[r][2*i + 0]];
155*1862eb00SEric Biggers	vadd.u64	d0, d0, d5
156*1862eb00SEric Biggers	vadd.u64	d1, d1, d6
157*1862eb00SEric Biggers	vadd.u64	d2, d2, d7
158*1862eb00SEric Biggers	vadd.u64	d3, d3, d4
159*1862eb00SEric Biggers	vadd.u64	d0, d0, M_\s8
160*1862eb00SEric Biggers	vadd.u64	d1, d1, M_\s10
161*1862eb00SEric Biggers	vadd.u64	d2, d2, M_\s12
162*1862eb00SEric Biggers	vadd.u64	d3, d3, M_\s14
163*1862eb00SEric Biggers
164*1862eb00SEric Biggers	// d = ror64(d ^ a, 32);
165*1862eb00SEric Biggers	veor		d15, d15, d0
166*1862eb00SEric Biggers	veor		d12, d12, d1
167*1862eb00SEric Biggers	veor		d13, d13, d2
168*1862eb00SEric Biggers	veor		d14, d14, d3
169*1862eb00SEric Biggers	vrev64.32	d15, d15
170*1862eb00SEric Biggers	vrev64.32	d12, d12
171*1862eb00SEric Biggers	vrev64.32	d13, d13
172*1862eb00SEric Biggers	vrev64.32	d14, d14
173*1862eb00SEric Biggers
174*1862eb00SEric Biggers	// c += d;
175*1862eb00SEric Biggers	vadd.u64	d10, d10, d15
176*1862eb00SEric Biggers	vadd.u64	d11, d11, d12
177*1862eb00SEric Biggers	vadd.u64	d8, d8, d13
178*1862eb00SEric Biggers	vadd.u64	d9, d9, d14
179*1862eb00SEric Biggers
180*1862eb00SEric Biggers	// b = ror64(b ^ c, 24);
181*1862eb00SEric Biggers	vld1.8		{M_0}, [ROR24_TABLE, :64]
182*1862eb00SEric Biggers	veor		d5, d5, d10
183*1862eb00SEric Biggers	veor		d6, d6, d11
184*1862eb00SEric Biggers	veor		d7, d7, d8
185*1862eb00SEric Biggers	veor		d4, d4, d9
186*1862eb00SEric Biggers	vtbl.8		d5, {d5}, M_0
187*1862eb00SEric Biggers	vtbl.8		d6, {d6}, M_0
188*1862eb00SEric Biggers	vtbl.8		d7, {d7}, M_0
189*1862eb00SEric Biggers	vtbl.8		d4, {d4}, M_0
190*1862eb00SEric Biggers
191*1862eb00SEric Biggers	// a += b + m[blake2b_sigma[r][2*i + 1]];
192*1862eb00SEric Biggers.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
193*1862eb00SEric Biggers	vld1.8		{M_0}, [sp, :64]
194*1862eb00SEric Biggers.endif
195*1862eb00SEric Biggers	vadd.u64	d0, d0, d5
196*1862eb00SEric Biggers	vadd.u64	d1, d1, d6
197*1862eb00SEric Biggers	vadd.u64	d2, d2, d7
198*1862eb00SEric Biggers	vadd.u64	d3, d3, d4
199*1862eb00SEric Biggers	vadd.u64	d0, d0, M_\s9
200*1862eb00SEric Biggers	vadd.u64	d1, d1, M_\s11
201*1862eb00SEric Biggers	vadd.u64	d2, d2, M_\s13
202*1862eb00SEric Biggers	vadd.u64	d3, d3, M_\s15
203*1862eb00SEric Biggers
204*1862eb00SEric Biggers	// d = ror64(d ^ a, 16);
205*1862eb00SEric Biggers	vld1.8		{M_0}, [ROR16_TABLE, :64]
206*1862eb00SEric Biggers	veor		d15, d15, d0
207*1862eb00SEric Biggers	veor		d12, d12, d1
208*1862eb00SEric Biggers	veor		d13, d13, d2
209*1862eb00SEric Biggers	veor		d14, d14, d3
210*1862eb00SEric Biggers	vtbl.8		d12, {d12}, M_0
211*1862eb00SEric Biggers	vtbl.8		d13, {d13}, M_0
212*1862eb00SEric Biggers	vtbl.8		d14, {d14}, M_0
213*1862eb00SEric Biggers	vtbl.8		d15, {d15}, M_0
214*1862eb00SEric Biggers
215*1862eb00SEric Biggers	// c += d;
216*1862eb00SEric Biggers	vadd.u64	d10, d10, d15
217*1862eb00SEric Biggers	vadd.u64	d11, d11, d12
218*1862eb00SEric Biggers	vadd.u64	d8, d8, d13
219*1862eb00SEric Biggers	vadd.u64	d9, d9, d14
220*1862eb00SEric Biggers
221*1862eb00SEric Biggers	// b = ror64(b ^ c, 63);
222*1862eb00SEric Biggers	veor		d16, d4, d9
223*1862eb00SEric Biggers	veor		d17, d5, d10
224*1862eb00SEric Biggers	veor		d18, d6, d11
225*1862eb00SEric Biggers	veor		d19, d7, d8
226*1862eb00SEric Biggers	vshr.u64	q2, q8, #63
227*1862eb00SEric Biggers	vshr.u64	q3, q9, #63
228*1862eb00SEric Biggers	vsli.u64	q2, q8, #1
229*1862eb00SEric Biggers	vsli.u64	q3, q9, #1
230*1862eb00SEric Biggers	// Reloading q8-q9 can be skipped on the final round.
231*1862eb00SEric Biggers.if ! \final
232*1862eb00SEric Biggers	vld1.8		{q8-q9}, [sp, :256]
233*1862eb00SEric Biggers.endif
234*1862eb00SEric Biggers.endm
235*1862eb00SEric Biggers
236*1862eb00SEric Biggers//
237*1862eb00SEric Biggers// void blake2b_compress_neon(struct blake2b_state *state,
238*1862eb00SEric Biggers//			      const u8 *block, size_t nblocks, u32 inc);
239*1862eb00SEric Biggers//
240*1862eb00SEric Biggers// Only the first three fields of struct blake2b_state are used:
241*1862eb00SEric Biggers//	u64 h[8];	(inout)
242*1862eb00SEric Biggers//	u64 t[2];	(inout)
243*1862eb00SEric Biggers//	u64 f[2];	(in)
244*1862eb00SEric Biggers//
245*1862eb00SEric Biggers	.align		5
246*1862eb00SEric BiggersENTRY(blake2b_compress_neon)
247*1862eb00SEric Biggers	push		{r4-r10}
248*1862eb00SEric Biggers
249*1862eb00SEric Biggers	// Allocate a 32-byte stack buffer that is 32-byte aligned.
250*1862eb00SEric Biggers	mov		ORIG_SP, sp
251*1862eb00SEric Biggers	sub		ip, sp, #32
252*1862eb00SEric Biggers	bic		ip, ip, #31
253*1862eb00SEric Biggers	mov		sp, ip
254*1862eb00SEric Biggers
255*1862eb00SEric Biggers	adr		ROR24_TABLE, .Lror24_table
256*1862eb00SEric Biggers	adr		ROR16_TABLE, .Lror16_table
257*1862eb00SEric Biggers
258*1862eb00SEric Biggers	mov		ip, STATE
259*1862eb00SEric Biggers	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
260*1862eb00SEric Biggers	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
261*1862eb00SEric Biggers.Lnext_block:
262*1862eb00SEric Biggers	  adr		r10, .Lblake2b_IV
263*1862eb00SEric Biggers	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
264*1862eb00SEric Biggers	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
265*1862eb00SEric Biggers	  vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
266*1862eb00SEric Biggers	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
267*1862eb00SEric Biggers	  adds		r7, r7, INC		// Increment counter
268*1862eb00SEric Biggers	bcs		.Lslow_inc_ctr
269*1862eb00SEric Biggers	vmov.i32	d28[0], r7
270*1862eb00SEric Biggers	vst1.64		{d28}, [ip]		// Update t[0]
271*1862eb00SEric Biggers.Linc_ctr_done:
272*1862eb00SEric Biggers
273*1862eb00SEric Biggers	// Load the next message block and finish initializing the state matrix
274*1862eb00SEric Biggers	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
275*1862eb00SEric Biggers	// entire state matrix in q0-q7 and the entire message block in q8-15.
276*1862eb00SEric Biggers	//
277*1862eb00SEric Biggers	// However, _blake2b_round also needs some extra registers for rotates,
278*1862eb00SEric Biggers	// so we have to spill some registers.  It's better to spill the message
279*1862eb00SEric Biggers	// registers than the state registers, as the message doesn't change.
280*1862eb00SEric Biggers	// Therefore we store a copy of the first 32 bytes of the message block
281*1862eb00SEric Biggers	// (q8-q9) in an aligned buffer on the stack so that they can be
282*1862eb00SEric Biggers	// reloaded when needed.  (We could just reload directly from the
283*1862eb00SEric Biggers	// message buffer, but it's faster to use aligned loads.)
284*1862eb00SEric Biggers	vld1.8		{q8-q9}, [BLOCK]!
285*1862eb00SEric Biggers	  veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
286*1862eb00SEric Biggers	vld1.8		{q10-q11}, [BLOCK]!
287*1862eb00SEric Biggers	  veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
288*1862eb00SEric Biggers	vld1.8		{q12-q13}, [BLOCK]!
289*1862eb00SEric Biggers	vst1.8		{q8-q9}, [sp, :256]
290*1862eb00SEric Biggers	  mov		ip, STATE
291*1862eb00SEric Biggers	vld1.8		{q14-q15}, [BLOCK]!
292*1862eb00SEric Biggers
293*1862eb00SEric Biggers	// Execute the rounds.  Each round is provided the order in which it
294*1862eb00SEric Biggers	// needs to use the message words.
295*1862eb00SEric Biggers	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
296*1862eb00SEric Biggers	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
297*1862eb00SEric Biggers	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
298*1862eb00SEric Biggers	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
299*1862eb00SEric Biggers	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
300*1862eb00SEric Biggers	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
301*1862eb00SEric Biggers	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
302*1862eb00SEric Biggers	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
303*1862eb00SEric Biggers	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
304*1862eb00SEric Biggers	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
305*1862eb00SEric Biggers	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
306*1862eb00SEric Biggers	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
307*1862eb00SEric Biggers			final=1
308*1862eb00SEric Biggers
309*1862eb00SEric Biggers	// Fold the final state matrix into the hash chaining value:
310*1862eb00SEric Biggers	//
311*1862eb00SEric Biggers	//	for (i = 0; i < 8; i++)
312*1862eb00SEric Biggers	//		h[i] ^= v[i] ^ v[i + 8];
313*1862eb00SEric Biggers	//
314*1862eb00SEric Biggers	  vld1.64	{q8-q9}, [ip]!		// Load old h[0..3]
315*1862eb00SEric Biggers	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
316*1862eb00SEric Biggers	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
317*1862eb00SEric Biggers	  vld1.64	{q10-q11}, [ip]		// Load old h[4..7]
318*1862eb00SEric Biggers	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
319*1862eb00SEric Biggers	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
320*1862eb00SEric Biggers	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
321*1862eb00SEric Biggers	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
322*1862eb00SEric Biggers	  mov		ip, STATE
323*1862eb00SEric Biggers	  subs		NBLOCKS, NBLOCKS, #1	// nblocks--
324*1862eb00SEric Biggers	  vst1.64	{q0-q1}, [ip]!		// Store new h[0..3]
325*1862eb00SEric Biggers	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
326*1862eb00SEric Biggers	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
327*1862eb00SEric Biggers	  vst1.64	{q2-q3}, [ip]!		// Store new h[4..7]
328*1862eb00SEric Biggers
329*1862eb00SEric Biggers	// Advance to the next block, if there is one.
330*1862eb00SEric Biggers	bne		.Lnext_block		// nblocks != 0?
331*1862eb00SEric Biggers
332*1862eb00SEric Biggers	mov		sp, ORIG_SP
333*1862eb00SEric Biggers	pop		{r4-r10}
334*1862eb00SEric Biggers	mov		pc, lr
335*1862eb00SEric Biggers
336*1862eb00SEric Biggers.Lslow_inc_ctr:
337*1862eb00SEric Biggers	// Handle the case where the counter overflowed its low 32 bits, by
338*1862eb00SEric Biggers	// carrying the overflow bit into the full 128-bit counter.
339*1862eb00SEric Biggers	vmov		r9, r10, d29
340*1862eb00SEric Biggers	adcs		r8, r8, #0
341*1862eb00SEric Biggers	adcs		r9, r9, #0
342*1862eb00SEric Biggers	adc		r10, r10, #0
343*1862eb00SEric Biggers	vmov		d28, r7, r8
344*1862eb00SEric Biggers	vmov		d29, r9, r10
345*1862eb00SEric Biggers	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
346*1862eb00SEric Biggers	b		.Linc_ctr_done
347*1862eb00SEric BiggersENDPROC(blake2b_compress_neon)
348