Lines Matching +full:6 +full:- +full:64
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
29 // M_0-M_3 are occasionally used for other purposes too.
50 // rotation amounts with NEON. (On Cortex-A53 it's the same speed as
51 // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
53 .byte 3, 4, 5, 6, 7, 0, 1, 2
55 .byte 2, 3, 4, 5, 6, 7, 0, 1
64 // NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
65 // pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
66 // (M_0-M_3), so that they can be reloaded if they are used as temporary
67 // registers. The macro arguments s0-s15 give the order in which the message
74 // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
95 vld1.8 {M_0}, [ROR24_TABLE, :64]
109 vld1.8 {M_0}, [sp, :64]
119 vld1.8 {M_0}, [ROR16_TABLE, :64]
135 // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
142 vld1.8 {q8-q9}, [sp, :256]
145 // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
150 // and undo it afterwards; or just use 64-bit operations on 'd'
151 // registers instead of 128-bit operations on 'q' registers. We use the
152 // latter approach, as it performs much better on Cortex-A7.
181 vld1.8 {M_0}, [ROR24_TABLE, :64]
193 vld1.8 {M_0}, [sp, :64]
205 vld1.8 {M_0}, [ROR16_TABLE, :64]
230 // Reloading q8-q9 can be skipped on the final round.
232 vld1.8 {q8-q9}, [sp, :256]
247 push {r4-r10}
249 // Allocate a 32-byte stack buffer that is 32-byte aligned.
259 vld1.64 {q0-q1}, [ip]! // Load h[0..3]
260 vld1.64 {q2-q3}, [ip]! // Load h[4..7]
263 vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1]
264 vld1.64 {q4-q5}, [r10]! // Load IV[0..3]
266 vld1.64 {q6-q7}, [r10] // Load IV[4..7]
270 vst1.64 {d28}, [ip] // Update t[0]
275 // entire state matrix in q0-q7 and the entire message block in q8-q15.
281 // (q8-q9) in an aligned buffer on the stack so that they can be
284 vld1.8 {q8-q9}, [BLOCK]!
286 vld1.8 {q10-q11}, [BLOCK]!
287 veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1]
288 vld1.8 {q12-q13}, [BLOCK]!
289 vst1.8 {q8-q9}, [sp, :256]
291 vld1.8 {q14-q15}, [BLOCK]!
295 _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
296 _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
297 _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
298 _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
299 _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
300 _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
301 _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
302 _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
303 _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
304 _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
305 _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
306 _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
314 vld1.64 {q8-q9}, [ip]! // Load old h[0..3]
317 vld1.64 {q10-q11}, [ip] // Load old h[4..7]
319 veor q3, q3, q7 // v[6..7] ^= v[14..15]
323 subs NBLOCKS, NBLOCKS, #1 // nblocks--
324 vst1.64 {q0-q1}, [ip]! // Store new h[0..3]
326 veor q3, q3, q11 // v[6..7] ^= h[6..7]
327 vst1.64 {q2-q3}, [ip]! // Store new h[4..7]
333 pop {r4-r10}
338 // carrying the overflow bit into the full 128-bit counter.
345 vst1.64 {q14}, [ip] // Update t[0] and t[1]