Lines Matching +full:- +full:4
1 /* SPDX-License-Identifier: GPL-2.0-only */
3 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
12 .arch armv8-a+crypto
32 add t1.4s, v\s0\().4s, \rc\().4s
33 sha256h dg0q, dg1q, t0.4s
34 sha256h2 dg1q, dg2q, t0.4s
37 add t0.4s, v\s0\().4s, \rc\().4s
39 sha256h dg0q, dg1q, t1.4s
40 sha256h2 dg1q, dg2q, t1.4s
45 sha256su0 v\s0\().4s, v\s1\().4s
47 sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
51 * The SHA-256 round constants
54 .align 4
75 ld1 { v0.4s- v3.4s}, [\tmp], #64
76 ld1 { v4.4s- v7.4s}, [\tmp], #64
77 ld1 { v8.4s-v11.4s}, [\tmp], #64
78 ld1 {v12.4s-v15.4s}, [\tmp]
91 ld1 {dgav.4s, dgbv.4s}, [x0]
94 0: ld1 {v16.4s-v19.4s}, [x1], #64
102 add t0.4s, v16.4s, v0.4s
127 add dgav.4s, dgav.4s, dg0v.4s
128 add dgbv.4s, dgbv.4s, dg1v.4s
137 1: st1 {dgav.4s, dgbv.4s}, [x0]
167 // x8-x9 are used as temporaries.
169 // v0-v15 are used to cache the SHA-256 round constants.
170 // v16-v19 are used for the message schedule for the first message.
171 // v20-v23 are used for the message schedule for the second message.
172 // v24-v31 are used for the state and temporaries as given below.
193 // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
194 // and m0_b contain the current 4 message schedule words for the first
198 // this also computes 4 more message schedule words for each message.
199 // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
200 // the first message, and likewise m1_b-m3_b for the second. After
208 add t0_a\().4s, \m0_a\().4s, \k\().4s
209 add t0_b\().4s, \m0_b\().4s, \k\().4s
211 sha256su0 \m0_a\().4s, \m1_a\().4s
212 sha256su0 \m0_b\().4s, \m1_b\().4s
213 sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
214 sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
218 sha256h state0_a_q, state1_a_q, t0_a\().4s
219 sha256h state0_b_q, state1_b_q, t0_b\().4s
220 sha256h2 state1_a_q, t1_a_q, t0_a\().4s
221 sha256h2 state1_b_q, t1_b_q, t0_b\().4s
226 do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
237 // This function computes the SHA-256 digests of two messages |data1| and
241 // The instructions for the two SHA-256 operations are interleaved. On many
243 // to taking better advantage of the CPU's SHA-256 and SIMD throughput.
250 // Load the initial state from ctx->state.
251 ld1 {state0_a.4s-state1_a.4s}, [ctx]
253 // Load ctx->bytecount. Take the mod 64 of it to get the number of
254 // bytes that are buffered in ctx->buf. Also save it in a register with
261 // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
262 // followed by the first 64 - x8 bytes of data. Since len >= 64, we
263 // just load 64 bytes from each of ctx->buf, data1, and data2
266 ld1 {v16.16b-v19.16b}, [x9]
267 st1 {v16.16b-v19.16b}, [sp]
269 ld1 {v16.16b-v19.16b}, [data1], #64
271 st1 {v16.16b-v19.16b}, [x9]
272 ld1 {v16.4s-v19.4s}, [sp]
274 ld1 {v20.16b-v23.16b}, [data2], #64
275 st1 {v20.16b-v23.16b}, [x9]
276 ld1 {v20.4s-v23.4s}, [sp]
292 ld1 {v16.4s-v19.4s}, [data1], #64
293 ld1 {v20.4s-v23.4s}, [data2], #64
307 st1 {state0_a.4s-state1_b.4s}, [sp]
309 // Do the SHA-256 rounds on each block.
316 ld1 {v16.4s-v19.4s}, [sp]
317 add state0_a.4s, state0_a.4s, v16.4s
318 add state1_a.4s, state1_a.4s, v17.4s
319 add state0_b.4s, state0_b.4s, v18.4s
320 add state1_b.4s, state1_b.4s, v19.4s
328 // final_step = 1: need to do count-only padding block
335 // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
338 // and load from &sp[64 - len] to get the needed padding block. This
340 sub w8, len, #64 // w8 = len - 64
341 add data1, data1, w8, sxtw // data1 += len - 64
342 add data2, data2, w8, sxtw // data2 += len - 64
351 sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
357 mov final_step, #2 // won't need count-only block
360 mov final_step, #1 // will need count-only block
362 ld1 {v16.16b-v19.16b}, [data1]
363 st1 {v16.16b-v19.16b}, [sp]
364 ld1 {v16.4s-v19.4s}, [x9]
365 ld1 {v20.16b-v23.16b}, [data2]
366 st1 {v20.16b-v23.16b}, [sp]
367 ld1 {v20.4s-v23.4s}, [x9]
378 // Pre-swap the endianness of the words.
404 st1 {state0_a.4s-state1_a.4s}, [out1]
405 st1 {state0_b.4s-state1_b.4s}, [out2]