/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Core SHA-224/SHA-256 transform using v8 Crypto Extensions
 *
 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	/*
	 * Register aliases used by sha256_ce_transform() and its macros.
	 *
	 * hash_lo/hash_hi	hash state carried from block to block;
	 *			hash_lo is loaded from the first 16 bytes at
	 *			[x0] and hash_hi from the following 16 bytes
	 * wk_a/wk_b		message words with round constants added,
	 *			used alternately by successive round groups
	 * rnd_lo/rnd_hi	working copy of the state during the rounds
	 * old_lo		copy of rnd_lo taken before sha256h replaces it
	 */
	hash_lo_q	.req	q20
	hash_lo		.req	v20
	hash_hi_q	.req	q21
	hash_hi		.req	v21

	wk_a		.req	v22
	wk_b		.req	v23

	rnd_lo_q	.req	q24
	rnd_lo		.req	v24
	rnd_hi_q	.req	q25
	rnd_hi		.req	v25
	old_lo_q	.req	q26
	old_lo		.req	v26

	/*
	 * add_only: one sha256h/sha256h2 pair, without any message
	 * schedule update.
	 *
	 * par == 0: the rounds consume wk_a, and wk_b is set to the sum of
	 *           message register v<m> and round constant register k
	 *           for the next invocation.
	 * par != 0: the rounds consume wk_b, and wk_a is set to the same
	 *           kind of sum unless the m argument is left blank.
	 */
	.macro		add_only, par, k, m
	.if		\par
	.ifnb		\m
	add		wk_a.4s, v\m\().4s, \k\().4s
	.endif
	mov		old_lo.16b, rnd_lo.16b
	sha256h		rnd_lo_q, rnd_hi_q, wk_b.4s
	sha256h2	rnd_hi_q, old_lo_q, wk_b.4s
	.else
	add		wk_b.4s, v\m\().4s, \k\().4s
	mov		old_lo.16b, rnd_lo.16b
	sha256h		rnd_lo_q, rnd_hi_q, wk_a.4s
	sha256h2	rnd_hi_q, old_lo_q, wk_a.4s
	.endif
	.endm

	/*
	 * add_update: add_only plus a message schedule update.  Register
	 * v<m0> is rewritten by sha256su0/sha256su1 from v<m1>, v<m2> and
	 * v<m3>, while v<m1> is the register handed to add_only.
	 */
	.macro		add_update, par, k, m0, m1, m2, m3
	sha256su0	v\m0\().4s, v\m1\().4s
	add_only	\par, \k, \m1
	sha256su1	v\m0\().4s, v\m2\().4s, v\m3\().4s
	.endm

	/*
	 * The SHA-256 round constants (64 words, 16-byte aligned)
	 */
	.section	".rodata", "a"
	.p2align	4
.Lsha2_rcon:
	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

	/*
	 * Load all 64 round constants into v0-v15.  The scalar register
	 * passed as the argument is overwritten (it is used as the table
	 * pointer).
	 */
	.macro		load_round_constants	ptr
	adr_l		\ptr, .Lsha2_rcon
	ld1		{ v0.4s- v3.4s}, [\ptr], #64
	ld1		{ v4.4s- v7.4s}, [\ptr], #64
	ld1		{ v8.4s-v11.4s}, [\ptr], #64
	ld1		{v12.4s-v15.4s}, [\ptr]
	.endm

	/*
	 * void sha256_ce_transform(struct sha256_block_state *state,
	 *			    const u8 *data, size_t nblocks);
	 *
	 * x0 = state, x1 = data, x2 = nblocks.  The block counter is only
	 * tested after a block has been processed, so nblocks must be
	 * nonzero.  x8 and v0-v26 are overwritten.
	 */
	.text
SYM_FUNC_START(sha256_ce_transform)

	/* fetch the current hash state and the round constants */
	ld1		{hash_lo.4s, hash_hi.4s}, [x0]
	load_round_constants	x8

	/* per-block loop: fetch 64 bytes of input */
.Lsha256_ce_next_block:
	ld1		{v16.4s-v19.4s}, [x1], #64

	/* the rounds operate on a copy so the input state can be added back */
	mov		rnd_lo.16b, hash_lo.16b
	mov		rnd_hi.16b, hash_hi.16b

	/* byte-swap every 32-bit word of the input */
	rev32		v16.16b, v16.16b
	rev32		v17.16b, v17.16b
	rev32		v18.16b, v18.16b
	rev32		v19.16b, v19.16b

	/* prime wk_a for the first add_update below */
	add		wk_a.4s, v16.4s, v0.4s

	add_update	0,  v1, 16, 17, 18, 19
	add_update	1,  v2, 17, 18, 19, 16
	add_update	0,  v3, 18, 19, 16, 17
	add_update	1,  v4, 19, 16, 17, 18

	add_update	0,  v5, 16, 17, 18, 19
	add_update	1,  v6, 17, 18, 19, 16
	add_update	0,  v7, 18, 19, 16, 17
	add_update	1,  v8, 19, 16, 17, 18

	add_update	0,  v9, 16, 17, 18, 19
	add_update	1, v10, 17, 18, 19, 16
	add_update	0, v11, 18, 19, 16, 17
	add_update	1, v12, 19, 16, 17, 18

	/* the last four groups need no further schedule updates */
	add_only	0, v13, 17
	add_only	1, v14, 18
	add_only	0, v15, 19
	add_only	1

	/* fold the result of the rounds into the hash state */
	add		hash_lo.4s, hash_lo.4s, rnd_lo.4s
	add		hash_hi.4s, hash_hi.4s, rnd_hi.4s

	/* one block consumed; go around again while any remain */
	sub		x2, x2, #1
	cbnz		x2, .Lsha256_ce_next_block

	/* write the updated state back */
	st1		{hash_lo.4s, hash_hi.4s}, [x0]
	ret
SYM_FUNC_END(sha256_ce_transform)

	.unreq	hash_lo_q
	.unreq	hash_lo
	.unreq	hash_hi_q
	.unreq	hash_hi
	.unreq	wk_a
	.unreq	wk_b
	.unreq	rnd_lo_q
	.unreq	rnd_lo
	.unreq	rnd_hi_q
	.unreq	rnd_hi
	.unreq	old_lo_q
	.unreq	old_lo

	// parameters for sha256_ce_finup2x()
	ctx		.req	x0
	data1		.req	x1
	data2		.req	x2
	len		.req	w3
	out1		.req	x4
	out2		.req	x5

	// other scalar variables
	count		.req	x6
	final_step	.req	w7

	// x8-x9 are used as temporaries.
// Vector register usage in sha256_ce_finup2x():
//
//	v0-v15		cached SHA-256 round constants
//	v16-v19		message schedule for the first message
//	v20-v23		message schedule for the second message
//	v24-v31		state and temporaries, aliased below; names ending in
//			_a belong to the first message and names ending in _b
//			to the second
	state0_a	.req	v24
	state0_a_q	.req	q24
	state1_a	.req	v25
	state1_a_q	.req	q25

	state0_b	.req	v26
	state0_b_q	.req	q26
	state1_b	.req	v27
	state1_b_q	.req	q27

	t0_a		.req	v28
	t0_b		.req	v29

	t1_a		.req	v30
	t1_a_q		.req	q30
	t1_b		.req	v31
	t1_b_q		.req	q31

// Field offsets within struct __sha256_ctx.  The state field is assumed to
// be at offset 0.
// offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BYTECOUNT	32
// offsetof(struct __sha256_ctx, buf)
#define OFFSETOF_BUF		40

	// do_4rounds_2x performs 4 rounds of SHA-256 on each of two
	// messages, with the instructions for the two messages interleaved.
	// On entry m0_a and m0_b hold the current 4 message schedule words
	// of the first and second message respectively.
	//
	// While message schedule words remain to be computed, the macro
	// also produces the next 4 of them for each message.  m1_a-m3_a are
	// the 3 groups of 4 schedule words that follow m0_a for the first
	// message, and m1_b-m3_b are the same for the second.  Once the
	// current value of m0_a has been consumed, the group that comes
	// after m3_a is computed and written to m0_a, and likewise for *_b.
	// As a result the next (m0_a, m1_a, m2_a, m3_a) is the current
	// (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller
	// must rotate the register arguments accordingly on each call.
// do_4rounds_2x parameters: i is the number of the first round of the group
// and only decides whether the message schedule is extended (i < 48); k is
// the vector register holding the round constants for these 4 rounds.
	.macro	do_4rounds_2x	i, k, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
	// Keep a copy of state0, which sha256h overwrites but sha256h2
	// still needs.
	mov		t1_a.16b, state0_a.16b
	mov		t1_b.16b, state0_b.16b
	// Message words plus round constants; this must read m0 before the
	// schedule update below replaces it.
	add		t0_a.4s, \m0_a\().4s, \k\().4s
	add		t0_b.4s, \m0_b\().4s, \k\().4s
	.if		(\i) < 48
	sha256su0	\m0_a\().4s, \m1_a\().4s
	sha256su0	\m0_b\().4s, \m1_b\().4s
	sha256su1	\m0_a\().4s, \m2_a\().4s, \m3_a\().4s
	sha256su1	\m0_b\().4s, \m2_b\().4s, \m3_b\().4s
	.endif
	sha256h		state0_a_q, state1_a_q, t0_a.4s
	sha256h		state0_b_q, state1_b_q, t0_b.4s
	sha256h2	state1_a_q, t1_a_q, t0_a.4s
	sha256h2	state1_b_q, t1_b_q, t0_b.4s
	.endm

	// Do 16 rounds for each of the two messages, starting at round
	// number base and using round constant registers rc0-rc3, rotating
	// the message schedule registers after every group of 4 rounds.
	.macro	do_16rounds_2x	base, rc0, rc1, rc2, rc3
	do_4rounds_2x	\base + 0,  \rc0,  v16, v17, v18, v19,  v20, v21, v22, v23
	do_4rounds_2x	\base + 4,  \rc1,  v17, v18, v19, v16,  v21, v22, v23, v20
	do_4rounds_2x	\base + 8,  \rc2,  v18, v19, v16, v17,  v22, v23, v20, v21
	do_4rounds_2x	\base + 12, \rc3,  v19, v16, v17, v18,  v23, v20, v21, v22
	.endm

//
// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
//			  const u8 *data1, const u8 *data2, int len,
//			  u8 out1[SHA256_DIGEST_SIZE],
//			  u8 out2[SHA256_DIGEST_SIZE]);
//
// Compute the SHA-256 digests of the two messages |data1| and |data2|, each
// |len| bytes long, both continuing from the initial context |ctx|.  |len|
// must be at least SHA256_BLOCK_SIZE.
//
// The instructions of the two SHA-256 computations are interleaved.  On many
// CPUs this is almost twice as fast as hashing the messages one after the
// other, because it makes better use of the SHA-256 and SIMD throughput of
// the CPU.
//
// 128 bytes of stack are used as scratch space.  The saved state of both
// messages is kept in all 64 bytes at sp during the rounds.  The buffered
// and final partial blocks are assembled in the full 128 bytes.
//
SYM_FUNC_START(sha256_ce_finup2x)
	sub		sp, sp, #128
	load_round_constants	x8
	mov		final_step, #0

	// Both messages start from the same state, ctx->state.
	ld1		{state0_a.4s-state1_a.4s}, [ctx]
	mov		state0_b.16b, state0_a.16b
	mov		state1_b.16b, state1_a.16b

	// count = ctx->bytecount + len, kept for the final padding.
	// x8    = ctx->bytecount mod 64, the number of bytes buffered in
	//         ctx->buf.
	// len is reduced by the 64 bytes of the first block right away, on
	// both the buffered and the unbuffered path.
	ldr		x8, [ctx, #OFFSETOF_BYTECOUNT]
	add		count, x8, len, sxtw
	sub		len, len, #64
	and		x8, x8, #63
	cbz		x8, .Lfinup2x_loop	// No bytes buffered?

	// x8 bytes (1 to 63) are buffered in ctx->buf.  The first block of
	// each message is those bytes followed by the first 64 - x8 bytes
	// of data.  Because len >= 64, simply copy 64 bytes from ctx->buf
	// to sp, then 64 bytes of data to &sp[x8], and reload the block
	// from sp.
	add		x9, ctx, #OFFSETOF_BUF
	ld1		{v16.16b-v19.16b}, [x9]
	st1		{v16.16b-v19.16b}, [sp]
	add		x9, sp, x8

	ld1		{v16.16b-v19.16b}, [data1], #64
	st1		{v16.16b-v19.16b}, [x9]
	ld1		{v16.4s-v19.4s}, [sp]

	ld1		{v20.16b-v23.16b}, [data2], #64
	st1		{v20.16b-v23.16b}, [x9]
	ld1		{v20.4s-v23.4s}, [sp]

	// Only 64 - x8 data bytes were consumed, so give x8 bytes back to
	// the data pointers and to len.
	sub		data1, data1, x8
	sub		data2, data2, x8
	add		len, len, w8
	b		.Lfinup2x_loop_have_data

.Lfinup2x_loop:
	// Load the next two data blocks.
	ld1		{v16.4s-v19.4s}, [data1], #64
	ld1		{v20.4s-v23.4s}, [data2], #64
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	rev32		v16.16b, v16.16b
	rev32		v20.16b, v20.16b
	rev32		v17.16b, v17.16b
	rev32		v21.16b, v21.16b
	rev32		v18.16b, v18.16b
	rev32		v22.16b, v22.16b
	rev32		v19.16b, v19.16b
	rev32		v23.16b, v23.16b
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	st1		{state0_a.4s-state1_b.4s}, [sp]

	// Do the SHA-256 rounds on each block.
	do_16rounds_2x	0,  v0, v1, v2, v3
	do_16rounds_2x	16, v4, v5, v6, v7
	do_16rounds_2x	32, v8, v9, v10, v11
	do_16rounds_2x	48, v12, v13, v14, v15

	// Add the original state for each block.  v16-v19 are free again
	// at this point and receive the saved state.
	ld1		{v16.4s-v19.4s}, [sp]
	add		state0_a.4s, state0_a.4s, v16.4s
	add		state0_b.4s, state0_b.4s, v18.4s
	add		state1_a.4s, state1_a.4s, v17.4s
	add		state1_b.4s, state1_b.4s, v19.4s

	// Update len and loop back if more blocks remain.
	sub		len, len, #64
	tbz		len, #31, .Lfinup2x_loop	// len >= 0?

	// Check if any final blocks need to be handled.
	// final_step = 2: all done
	// final_step = 1: need to do count-only padding block
	// final_step = 0: need to do the block with 0x80 padding byte
	tbnz		final_step, #1, .Lfinup2x_done
	tbnz		final_step, #0, .Lfinup2x_finalize_countonly
	add		len, len, #64
	cbz		len, .Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the
	// block.  The padding, starting with the 0x80 byte, is written to
	// &sp[64].  Then, for each message, the last 64 data bytes are
	// copied to sp and the padded block is loaded from &sp[64 - len].
	// This relies on the data buffers being >= 64 bytes in length.
	sub		w8, len, #64		// w8 = len - 64
	mov		x9, #0x80
	fmov		d16, x9
	movi		v17.16b, #0
	stp		q16, q17, [sp, #64]
	stp		q17, q17, [sp, #96]
	add		data1, data1, w8, sxtw	// data1 += len - 64
	add		data2, data2, w8, sxtw	// data2 += len - 64
	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]

	// Assume that the count spills into a block of its own, then
	// correct that if it fits into this one.
	mov		final_step, #1	// will need count-only block
	cmp		len, #56
	b.ge		.Lfinup2x_load_padded
	lsl		count, count, #3
	rev		count, count
	str		count, [x9, #56]
	mov		final_step, #2	// won't need count-only block
.Lfinup2x_load_padded:
	ld1		{v16.16b-v19.16b}, [data1]
	st1		{v16.16b-v19.16b}, [sp]
	ld1		{v16.4s-v19.4s}, [x9]
	ld1		{v20.16b-v23.16b}, [data2]
	st1		{v20.16b-v23.16b}, [sp]
	ld1		{v20.4s-v23.4s}, [x9]
	b		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// The words are built with their endianness already swapped, which
	// is why this path rejoins the loop after the rev32 step.
.Lfinup2x_finalize_countonly:
	movi		v16.2d, #0
	b		.Lfinup2x_build_count_block
.Lfinup2x_finalize_blockaligned:
	mov		x8, #0x80000000
	fmov		d16, x8
.Lfinup2x_build_count_block:
	ror		count, count, #29	// ror(lsl(count, 3), 32)
	movi		v17.2d, #0
	movi		v18.2d, #0
	mov		v19.d[0], xzr
	mov		v19.d[1], count
	// The second message gets an identical padding block.
	mov		v20.16b, v16.16b
	movi		v21.2d, #0
	movi		v22.2d, #0
	mov		v23.16b, v19.16b
	mov		final_step, #2
	b		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	rev32		state0_a.16b, state0_a.16b
	rev32		state1_a.16b, state1_a.16b
	st1		{state0_a.4s-state1_a.4s}, [out1]
	rev32		state0_b.16b, state0_b.16b
	rev32		state1_b.16b, state1_b.16b
	st1		{state0_b.4s-state1_b.4s}, [out2]
	add		sp, sp, #128
	ret
SYM_FUNC_END(sha256_ce_finup2x)