/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
 */

        .arch           armv8-a+crypto
        mov             dg2v.16b, dg0v.16b
        /*
         * The SHA-256 round constants
         */
        ld1             { v0.4s- v3.4s}, [\tmp], #64
        ld1             { v4.4s- v7.4s}, [\tmp], #64
        ld1             { v8.4s-v11.4s}, [\tmp], #64
        ld1             {v12.4s-v15.4s}, [\tmp]
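        /*
         * Note: the 64 32-bit round constants come to exactly 256 bytes
         * (4 loads of 64 bytes above), which is why they fit precisely in
         * the sixteen 128-bit registers v0-v15.
         */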
0:      ld1             {v16.4s-v19.4s}, [x1], #64
        sub             x2, x2, #1

CPU_LE( rev32           v16.16b, v16.16b        )
CPU_LE( rev32           v17.16b, v17.16b        )
CPU_LE( rev32           v18.16b, v18.16b        )
CPU_LE( rev32           v19.16b, v19.16b        )
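        /*
         * The byte swaps above are needed because SHA-256 defines the
         * message block as sixteen big-endian 32-bit words, so on
         * little-endian CPUs each loaded word must be reversed first.
         */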
        mov             dg0v.16b, dgav.16b
        mov             dg1v.16b, dgbv.16b
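        /*
         * dgav/dgbv carry the running state across blocks; dg0v/dg1v are
         * the per-block working copies that the rounds below update.
         */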
        add_update      0,  v1, 16, 17, 18, 19
        add_update      1,  v2, 17, 18, 19, 16
        add_update      0,  v3, 18, 19, 16, 17
        add_update      1,  v4, 19, 16, 17, 18

        add_update      0,  v5, 16, 17, 18, 19
        add_update      1,  v6, 17, 18, 19, 16
        add_update      0,  v7, 18, 19, 16, 17
        add_update      1,  v8, 19, 16, 17, 18

        add_update      0,  v9, 16, 17, 18, 19
        add_update      1, v10, 17, 18, 19, 16
        add_update      0, v11, 18, 19, 16, 17
        add_update      1, v12, 19, 16, 17, 18

        add_only        0, v13, 17
        add_only        1, v14, 18
        add_only        0, v15, 19
        add_only        1
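        /*
         * Each macro invocation above covers four rounds, 64 in total.
         * add_update also extends the message schedule four words at a
         * time; the trailing add_only calls do not, since all 64 schedule
         * words have been computed by round 48.
         */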
        cond_yield      1f, x5, x6
1:      st1             {dgav.4s, dgbv.4s}, [x0]
        // x8-x9 are used as temporaries.
        //
        // v0-v15 are used to cache the SHA-256 round constants.
        // v16-v19 are used for the message schedule for the first message.
        // v20-v23 are used for the message schedule for the second message.
        // v24-v31 are used for the state and temporaries as given below.
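        // (Together that accounts for every SIMD register:
        // 16 + 4 + 4 + 8 = 32.)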
// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a
// and m0_b contain the current 4 message schedule words for the first
// and second message, respectively.
//
// If not all the message schedule words have been computed yet, then this
// also computes 4 more message schedule words for each message.
// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
// the first message, and likewise m1_b-m3_b for the second.  After
// consuming the current value of m0_a, this macro computes the group
// after m3_a and writes it to m0_a, and likewise for *_b, so the caller
// must cycle through the registers accordingly.
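// For reference, each new schedule word follows the standard SHA-256
// recurrence, which sha256su0/sha256su1 evaluate four words at a time
// (sigma0/sigma1 being the usual small-sigma functions):
//
//	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]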
        // sha256h overwrites state0, so stash a copy for the sha256h2 half.
        mov             t1_a.16b, state0_a.16b
        mov             t1_b.16b, state0_b.16b
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial state
// |ctx|.  |len| must be at least 64.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
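// A sketch of the C prototype this routine presumably corresponds to (the
// exact type and parameter names are assumptions, not taken from this
// excerpt):
//
//	void sha256_ce_finup2x(const struct sha256_state *ctx,
//			       const u8 *data1, const u8 *data2, int len,
//			       u8 out1[SHA256_DIGEST_SIZE],
//			       u8 out2[SHA256_DIGEST_SIZE]);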
        // Load the initial state from ctx->state.
        ld1             {state0_a.4s-state1_a.4s}, [ctx]
        // Load ctx->bytecount.  Take it mod 64 to get the number of bytes
        // that are buffered in ctx->buf.  Also save it in a register with
        // len added to it.
        // x8 bytes (1 to 63) are currently buffered in ctx->buf.  Load them
        // followed by the first 64 - x8 bytes of data.  Since len >= 64, we
        // just load 64 bytes from each of ctx->buf, data1, and data2
        // unconditionally and rearrange the data as needed.
        ld1             {v16.16b-v19.16b}, [x9]
        st1             {v16.16b-v19.16b}, [sp]
        ld1             {v16.16b-v19.16b}, [data1], #64
        add             x9, sp, x8
        st1             {v16.16b-v19.16b}, [x9]
        ld1             {v16.4s-v19.4s}, [sp]
        ld1             {v20.16b-v23.16b}, [data2], #64
        st1             {v20.16b-v23.16b}, [x9]
        ld1             {v20.4s-v23.4s}, [sp]
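        // The two load/store sequences above are effectively this C
        // pseudo-code for each message (with `block` being the 64-byte
        // chunk that gets hashed next, for data1 and likewise data2):
        //
        //	memcpy(block, ctx->buf, x8);
        //	memcpy(block + x8, data1, 64 - x8);
        //
        // done branch-free via overlapping 64-byte copies through the
        // stack buffer.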
        // The second message starts from the same initial state as the
        // first, so just copy it.
        mov             state0_b.16b, state0_a.16b
        mov             state1_b.16b, state1_a.16b
        b               .Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
        sub             len, len, #64
        mov             state0_b.16b, state0_a.16b
        mov             state1_b.16b, state1_a.16b
.Lfinup2x_loop:
        // Load the next two data blocks.
        ld1             {v16.4s-v19.4s}, [data1], #64
        ld1             {v20.4s-v23.4s}, [data2], #64

.Lfinup2x_loop_have_data:
        // Convert the words of the data blocks from big endian.
CPU_LE( rev32           v16.16b, v16.16b        )
CPU_LE( rev32           v17.16b, v17.16b        )
CPU_LE( rev32           v18.16b, v18.16b        )
CPU_LE( rev32           v19.16b, v19.16b        )
CPU_LE( rev32           v20.16b, v20.16b        )
CPU_LE( rev32           v21.16b, v21.16b        )
CPU_LE( rev32           v22.16b, v22.16b        )
CPU_LE( rev32           v23.16b, v23.16b        )
        // Save the original state for each message.
        st1             {state0_a.4s-state1_b.4s}, [sp]
        // Do the SHA-256 rounds on each block.
        do_16rounds_2x  0,  v0, v1, v2, v3
        do_16rounds_2x  16, v4, v5, v6, v7
        do_16rounds_2x  32, v8, v9, v10, v11
        do_16rounds_2x  48, v12, v13, v14, v15
        // Add the original state back in for each message.
        ld1             {v16.4s-v19.4s}, [sp]
        // final_step = 2: all done
        // final_step = 1: need to do count-only padding block
        // final_step = 0: need to do the block with the 0x80 padding byte
        tbnz            final_step, #1, .Lfinup2x_done
        // Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the block.
        // To do this, write the padding starting with the 0x80 byte to
        // &sp[64].  Then for each message, copy the last 64 data bytes to sp
        // and load from &sp[64 - len] to get the needed padding block.  This
        // code relies on the data buffers being >= 64 bytes in length.
        sub             w8, len, #64            // w8 = len - 64
        add             data1, data1, w8, sxtw  // data1 += len - 64
        add             data2, data2, w8, sxtw  // data2 += len - 64
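        // In C pseudo-code, the trick looks roughly like this for one
        // message (and likewise for the other):
        //
        //	memcpy(sp, data1 + len - 64, 64); /* ends at the data's end */
        //	block = sp + (64 - len);          /* len tail bytes + pad   */
        //
        // The load from &sp[64 - len] picks up the final len data bytes
        // followed by the padding previously written at &sp[64].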
CPU_BE( movi            v16.16b, #0             )
CPU_BE( mov             v16.d[1], x9            )
        movi            v17.16b, #0
        sub             x9, sp, w8, sxtw        // x9 = &sp[64 - len]
        cmp             len, #56
        b.ge            1f              // will count spill into its own block?
        lsl             count, count, #3
CPU_LE( rev             count, count    )
        str             count, [x9, #56]
        mov             final_step, #2  // won't need count-only block
        b               2f
1:
        mov             final_step, #1  // will need count-only block
2:
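        // Why 56 is the cutoff: the padding needs 1 byte for 0x80 plus
        // 8 bytes for the big-endian bit count, so up to 55 data bytes
        // fit in the final block (55 + 1 + 8 = 64).  With 56 to 63 data
        // bytes the count no longer fits, and an extra count-only block
        // is required.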
        ld1             {v16.16b-v19.16b}, [data1]
        st1             {v16.16b-v19.16b}, [sp]
        ld1             {v16.4s-v19.4s}, [x9]
        ld1             {v20.16b-v23.16b}, [data2]
        st1             {v20.16b-v23.16b}, [sp]
        ld1             {v20.4s-v23.4s}, [x9]
        // Pre-swap the endianness of the words.

        b               1f
1:
        mov             v19.d[1], count
        // The count-only block is identical for both messages; copy it.
        mov             v20.16b, v16.16b
        mov             v21.16b, v17.16b
        mov             v22.16b, v18.16b
        mov             v23.16b, v19.16b
        // The digest words are defined big-endian, so byte-swap them on
        // little-endian CPUs before storing the two digests.
CPU_LE( rev32           state0_a.16b, state0_a.16b      )
CPU_LE( rev32           state1_a.16b, state1_a.16b      )
CPU_LE( rev32           state0_b.16b, state0_b.16b      )
CPU_LE( rev32           state1_b.16b, state1_b.16b      )
        st1             {state0_a.4s-state1_a.4s}, [out1]
        st1             {state0_b.4s-state1_b.4s}, [out2]