Lines Matching +full:0 +full:- +full:4

1 /* SPDX-License-Identifier: GPL-2.0-only */
3 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
12 .arch armv8-a+crypto
32 add t1.4s, v\s0\().4s, \rc\().4s
33 sha256h dg0q, dg1q, t0.4s
34 sha256h2 dg1q, dg2q, t0.4s
37 add t0.4s, v\s0\().4s, \rc\().4s
39 sha256h dg0q, dg1q, t1.4s
40 sha256h2 dg1q, dg2q, t1.4s
45 sha256su0 v\s0\().4s, v\s1\().4s
47 sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
51 * The SHA-256 round constants
54 .align 4
56 .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
57 .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
58 .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
59 .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
60 .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
61 .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
62 .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
63 .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
64 .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
65 .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
66 .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
67 .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
68 .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
69 .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
70 .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
71 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
75 ld1 { v0.4s- v3.4s}, [\tmp], #64
76 ld1 { v4.4s- v7.4s}, [\tmp], #64
77 ld1 { v8.4s-v11.4s}, [\tmp], #64
78 ld1 {v12.4s-v15.4s}, [\tmp]
91 ld1 {dgav.4s, dgbv.4s}, [x0]
94 0: ld1 {v16.4s-v19.4s}, [x1], #64
102 add t0.4s, v16.4s, v0.4s
106 add_update 0, v1, 16, 17, 18, 19
108 add_update 0, v3, 18, 19, 16, 17
111 add_update 0, v5, 16, 17, 18, 19
113 add_update 0, v7, 18, 19, 16, 17
116 add_update 0, v9, 16, 17, 18, 19
118 add_update 0, v11, 18, 19, 16, 17
121 add_only 0, v13, 17
123 add_only 0, v15, 19
127 add dgav.4s, dgav.4s, dg0v.4s
128 add dgbv.4s, dgbv.4s, dg1v.4s
134 cbnz x2, 0b
137 1: st1 {dgav.4s, dgbv.4s}, [x0]
167 // x8-x9 are used as temporaries.
169 // v0-v15 are used to cache the SHA-256 round constants.
170 // v16-v19 are used for the message schedule for the first message.
171 // v20-v23 are used for the message schedule for the second message.
172 // v24-v31 are used for the state and temporaries as given below.
191 // offsetof(struct __sha256_ctx, state) is assumed to be 0.
193 // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
194 // and m0_b contain the current 4 message schedule words for the first
198 // this also computes 4 more message schedule words for each message.
199 // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
200 // the first message, and likewise m1_b-m3_b for the second. After
208 add t0_a\().4s, \m0_a\().4s, \k\().4s
209 add t0_b\().4s, \m0_b\().4s, \k\().4s
211 sha256su0 \m0_a\().4s, \m1_a\().4s
212 sha256su0 \m0_b\().4s, \m1_b\().4s
213 sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
214 sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
218 sha256h state0_a_q, state1_a_q, t0_a\().4s
219 sha256h state0_b_q, state1_b_q, t0_b\().4s
220 sha256h2 state1_a_q, t1_a_q, t0_a\().4s
221 sha256h2 state1_b_q, t1_b_q, t0_b\().4s
225 do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
226 do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
237 // This function computes the SHA-256 digests of two messages |data1| and
241 // The instructions for the two SHA-256 operations are interleaved. On many
243 // to taking better advantage of the CPU's SHA-256 and SIMD throughput.
247 mov final_step, #0
250 // Load the initial state from ctx->state.
251 ld1 {state0_a.4s-state1_a.4s}, [ctx]
253 // Load ctx->bytecount. Take the mod 64 of it to get the number of
254 // bytes that are buffered in ctx->buf. Also save it in a register with
261 // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
262 // followed by the first 64 - x8 bytes of data. Since len >= 64, we
263 // just load 64 bytes from each of ctx->buf, data1, and data2
266 ld1 {v16.16b-v19.16b}, [x9]
267 st1 {v16.16b-v19.16b}, [sp]
269 ld1 {v16.16b-v19.16b}, [data1], #64
271 st1 {v16.16b-v19.16b}, [x9]
272 ld1 {v16.4s-v19.4s}, [sp]
274 ld1 {v20.16b-v23.16b}, [data2], #64
275 st1 {v20.16b-v23.16b}, [x9]
276 ld1 {v20.4s-v23.4s}, [sp]
292 ld1 {v16.4s-v19.4s}, [data1], #64
293 ld1 {v20.4s-v23.4s}, [data2], #64
307 st1 {state0_a.4s-state1_b.4s}, [sp]
309 // Do the SHA-256 rounds on each block.
310 do_16rounds_2x 0, v0, v1, v2, v3
316 ld1 {v16.4s-v19.4s}, [sp]
317 add state0_a.4s, state0_a.4s, v16.4s
318 add state1_a.4s, state1_a.4s, v17.4s
319 add state0_b.4s, state0_b.4s, v18.4s
320 add state1_b.4s, state1_b.4s, v19.4s
324 tbz len, #31, .Lfinup2x_loop // len >= 0?
328 // final_step = 1: need to do count-only padding block
329 // final_step = 0: need to do the block with 0x80 padding byte
331 tbnz final_step, #0, .Lfinup2x_finalize_countonly
335 // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
336 // To do this, write the padding starting with the 0x80 byte to
338 // and load from &sp[64 - len] to get the needed padding block. This
340 sub w8, len, #64 // w8 = len - 64
341 add data1, data1, w8, sxtw // data1 += len - 64
342 add data2, data2, w8, sxtw // data2 += len - 64
343 CPU_LE( mov x9, #0x80 )
345 CPU_BE( movi v16.16b, #0 )
346 CPU_BE( mov x9, #0x8000000000000000 )
348 movi v17.16b, #0
351 sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
357 mov final_step, #2 // won't need count-only block
360 mov final_step, #1 // will need count-only block
362 ld1 {v16.16b-v19.16b}, [data1]
363 st1 {v16.16b-v19.16b}, [sp]
364 ld1 {v16.4s-v19.4s}, [x9]
365 ld1 {v20.16b-v23.16b}, [data2]
366 st1 {v20.16b-v23.16b}, [sp]
367 ld1 {v20.4s-v23.4s}, [x9]
372 // {0x80, 0, 0, 0, ..., count (as __be64)}
375 // { 0, 0, 0, 0, ..., count (as __be64)}
378 // Pre-swap the endianness of the words.
380 movi v16.2d, #0
383 mov x8, #0x80000000
386 movi v17.2d, #0
387 movi v18.2d, #0
389 mov v19.d[0], xzr
392 movi v21.2d, #0
393 movi v22.2d, #0
404 st1 {state0_a.4s-state1_a.4s}, [out1]
405 st1 {state0_b.4s-state1_b.4s}, [out2]