Lines Matching +full:sub +full:- +full:message
1 /* SPDX-License-Identifier: GPL-2.0-only */
3 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
12 .arch armv8-a+crypto
51 * The SHA-256 round constants
75 ld1 { v0.4s- v3.4s}, [\tmp], #64
76 ld1 { v4.4s- v7.4s}, [\tmp], #64
77 ld1 { v8.4s-v11.4s}, [\tmp], #64
78 ld1 {v12.4s-v15.4s}, [\tmp]
94 0: ld1 {v16.4s-v19.4s}, [x1], #64
95 sub x2, x2, #1
167 // x8-x9 are used as temporaries.
169 // v0-v15 are used to cache the SHA-256 round constants.
170 // v16-v19 are used for the message schedule for the first message.
171 // v20-v23 are used for the message schedule for the second message.
172 // v24-v31 are used for the state and temporaries as given below.
173 // *_a are for the first message and *_b for the second.
193 // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
194 // and m0_b contain the current 4 message schedule words for the first
195 // and second message respectively.
197 // If not all the message schedule words have been computed yet, then
198 // this also computes 4 more message schedule words for each message.
199 // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
200 // the first message, and likewise m1_b-m3_b for the second. After
237 // This function computes the SHA-256 digests of two messages |data1| and
241 // The instructions for the two SHA-256 operations are interleaved. On many
242 // CPUs, this is almost twice as fast as hashing each message individually due
243 // to taking better advantage of the CPU's SHA-256 and SIMD throughput.
246 sub sp, sp, #128
250 // Load the initial state from ctx->state.
251 ld1 {state0_a.4s-state1_a.4s}, [ctx]
253 // Load ctx->bytecount. Take the mod 64 of it to get the number of
254 // bytes that are buffered in ctx->buf. Also save it in a register with
261 // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
262 // followed by the first 64 - x8 bytes of data. Since len >= 64, we
263 // just load 64 bytes from each of ctx->buf, data1, and data2
266 ld1 {v16.16b-v19.16b}, [x9]
267 st1 {v16.16b-v19.16b}, [sp]
269 ld1 {v16.16b-v19.16b}, [data1], #64
271 st1 {v16.16b-v19.16b}, [x9]
272 ld1 {v16.4s-v19.4s}, [sp]
274 ld1 {v20.16b-v23.16b}, [data2], #64
275 st1 {v20.16b-v23.16b}, [x9]
276 ld1 {v20.4s-v23.4s}, [sp]
278 sub len, len, #64
279 sub data1, data1, x8
280 sub data2, data2, x8
287 sub len, len, #64
292 ld1 {v16.4s-v19.4s}, [data1], #64
293 ld1 {v20.4s-v23.4s}, [data2], #64
307 st1 {state0_a.4s-state1_b.4s}, [sp]
309 // Do the SHA-256 rounds on each block.
316 ld1 {v16.4s-v19.4s}, [sp]
323 sub len, len, #64
328 // final_step = 1: need to do count-only padding block
335 // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
337 // &sp[64]. Then for each message, copy the last 64 data bytes to sp
338 // and load from &sp[64 - len] to get the needed padding block. This
340 sub w8, len, #64 // w8 = len - 64
341 add data1, data1, w8, sxtw // data1 += len - 64
342 add data2, data2, w8, sxtw // data2 += len - 64
351 sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
357 mov final_step, #2 // won't need count-only block
360 mov final_step, #1 // will need count-only block
362 ld1 {v16.16b-v19.16b}, [data1]
363 st1 {v16.16b-v19.16b}, [sp]
364 ld1 {v16.4s-v19.4s}, [x9]
365 ld1 {v20.16b-v23.16b}, [data2]
366 st1 {v20.16b-v23.16b}, [sp]
367 ld1 {v20.4s-v23.4s}, [x9]
373 // This is for a block-aligned message.
376 // This is for a message whose length mod 64 is >= 56.
378 // Pre-swap the endianness of the words.
404 st1 {state0_a.4s-state1_a.4s}, [out1]
405 st1 {state0_b.4s-state1_b.4s}, [out2]