/* SPDX-License-Identifier: GPL-2.0-or-later */
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.fpu neon

.text


/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/* Constants */

#define K1 0x5A827999
#define K2 0x6ED9EBA1
#define K3 0x8F1BBCDC
#define K4 0xCA62C1D6
.align 4
.LK_VEC:
.LK1:   .long K1, K1, K1, K1
.LK2:   .long K2, K2, K2, K2
.LK3:   .long K3, K3, K3, K3
.LK4:   .long K4, K4, K4, K4


/* Register macros */

#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr

#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...) code
#endif

/* Round function macros. */

#define WK_offs(i) (((i) & 15) * 4)

#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        bic RT0, d, b; \
        add e, e, a, ror #(32 - 5); \
        and RT1, c, b; \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add RT0, RT0, RT3; \
        add e, e, RT1; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT0;

#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        eor RT0, d, b; \
        add e, e, a, ror #(32 - 5); \
        eor RT0, RT0, c; \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT3; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT0; \

#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        eor RT0, b, c; \
        and RT1, b, c; \
        add e, e, a, ror #(32 - 5); \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        and RT0, RT0, d; \
        add RT1, RT1, RT3; \
        add e, e, RT0; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT1;

#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
               W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define R(a,b,c,d,e,f,i) \
        _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
               W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define dummy(...)

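/*
 * For reference, each _R_F* macro above implements one SHA-1 round,
 * roughly:
 *
 *      e += rol32(a, 5) + f(b, c, d) + W[i] + K;  b = rol32(b, 30);
 *
 * where f(b, c, d) is, per FIPS 180-4:
 *
 *      F1 (rounds  0-19): (b & c) | (~b & d)   (done as (c & b) + (d & ~b))
 *      F2 (rounds 20-39): b ^ c ^ d
 *      F3 (rounds 40-59): (b & c) | (b & d) | (c & d)
 *                         (done as (b & c) + ((b ^ c) & d))
 *      F4 (rounds 60-79): b ^ c ^ d, same as F2 but with constant K4
 *
 * W[i] + K is not computed here; it is read back from the 16-entry ring
 * buffer on the stack that the WPRECALC_* macros below fill (WK_offs()
 * masks the index with 15).
 */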

/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/

#define W_PRECALC_00_15() \
        add RWK, sp, #(WK_offs(0)); \
        \
        vld1.32 {W0, W7}, [RDATA]!; \
        ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
        vld1.32 {W6, W5}, [RDATA]!; \
        vadd.u32 tmp0, W0, curK; \
        ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
        ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
        vadd.u32 tmp1, W7, curK; \
        ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
        vadd.u32 tmp2, W6, curK; \
        vst1.32 {tmp0, tmp1}, [RWK]!; \
        vadd.u32 tmp3, W5, curK; \
        vst1.32 {tmp2, tmp3}, [RWK]; \

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vld1.32 {W0, W7}, [RDATA]!; \

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add RWK, sp, #(WK_offs(0)); \

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vld1.32 {W6, W5}, [RDATA]!; \

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp0, W0, curK; \

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp1, W7, curK; \

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp2, W6, curK; \

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32 {tmp0, tmp1}, [RWK]!; \

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp3, W5, curK; \

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32 {tmp2, tmp3}, [RWK]; \

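/*
 * The remaining message-schedule words follow the usual SHA-1 recurrence
 * (sketched here for reference):
 *
 *      W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
 *
 * and are produced four at a time in a q register, with the round constant
 * already added before the store to the stack ring buffer.  For rounds
 * 16-31 the W[i-3] input overlaps the vector being computed, so the top
 * lane is first computed with that term as zero and then patched up
 * (WPRECALC_16_31_5..9).  From round 32 on the equivalent form
 *
 *      W[i] = rol32(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2)
 *
 * uses only words at least four positions back, so all four lanes can be
 * computed independently.
 */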
/********* Precalc macros for rounds 16-31 ************************************/

#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor tmp0, tmp0; \
        vext.8 W, W_m16, W_m12, #8; \

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add RWK, sp, #(WK_offs(i)); \
        vext.8 tmp0, W_m04, tmp0, #4; \

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor tmp0, tmp0, W_m16; \
        veor.32 W, W, W_m08; \

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor tmp1, tmp1; \
        veor W, W, tmp0; \

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32 tmp0, W, #1; \

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vext.8 tmp1, tmp1, W, #(16-12); \
        vshr.u32 W, W, #31; \

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vorr tmp0, tmp0, W; \
        vshr.u32 W, tmp1, #30; \

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32 tmp1, tmp1, #2; \

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor tmp0, tmp0, W; \

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, tmp0, tmp1; \

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp0, W, curK; \

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32 {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/

#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vext.8 tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32 tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshr.u32 tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vorr W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32 {tmp0}, [RWK];


/*
 * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
 *
 * void sha1_transform_neon(struct sha1_block_state *state,
 *                          const u8 *data, size_t nblocks);
 */
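/*
 * For orientation, one block iteration below is roughly equivalent to this
 * C-like sketch (illustrative only; f() and the K constants are as
 * described for the _R_F* macros above):
 *
 *      for (i = 0; i < 80; i++) {
 *              t = rol32(a, 5) + f(i, b, c, d) + e + W[i] + K(i);
 *              e = d; d = c; c = rol32(b, 30); b = a; a = t;
 *      }
 *      h0..h4 += a..e;
 *
 * The register rotation is done by permuting the _R() macro arguments each
 * round rather than by moving registers, and the NEON W[]+K precalculation
 * for later rounds (and for the next block's rounds 0-15) runs interleaved
 * with the scalar rounds.
 */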
.align 3
ENTRY(sha1_transform_neon)
        /* input:
         *      r0: state
         *      r1: data (64*nblocks bytes)
         *      r2: nblocks
         */

        cmp RNBLKS, #0;
        beq .Ldo_nothing;

        push {r4-r12, lr};
        /*vpush {q4-q7};*/

        adr RT3, .LK_VEC;

        mov ROLDSTACK, sp;

        /* Align stack. */
        sub RT0, sp, #(16*4);
        and RT0, #(~(16-1));
        mov sp, RT0;

        vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

        /* Get the values of the chaining variables. */
        ldm RSTATE, {_a-_e};

        vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
        /* Precalc 0-15. */
        W_PRECALC_00_15();

.Loop:
        /* Transform 0-15 + Precalc 16-31. */
        _R( _a, _b, _c, _d, _e, F1, 0,
            WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
            W4, W5, W6, W7, W0, _, _, _ );
        _R( _e, _a, _b, _c, _d, F1, 1,
            WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
            W4, W5, W6, W7, W0, _, _, _ );
        _R( _d, _e, _a, _b, _c, F1, 2,
            WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
            W4, W5, W6, W7, W0, _, _, _ );
        _R( _c, _d, _e, _a, _b, F1, 3,
            WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
            W4, W5, W6, W7, W0, _, _, _ );

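        /*
         * The W[]+K precalculation runs 16 rounds ahead of the scalar
         * rounds, so curK is switched to qK2/qK3/qK4 at rounds 4, 24 and 44
         * below, 16 rounds before K2/K3/K4 are first consumed.
         */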
#undef curK
#define curK qK2
        _R( _b, _c, _d, _e, _a, F1, 4,
            WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
            W3, W4, W5, W6, W7, _, _, _ );
        _R( _a, _b, _c, _d, _e, F1, 5,
            WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
            W3, W4, W5, W6, W7, _, _, _ );
        _R( _e, _a, _b, _c, _d, F1, 6,
            WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
            W3, W4, W5, W6, W7, _, _, _ );
        _R( _d, _e, _a, _b, _c, F1, 7,
            WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
            W3, W4, W5, W6, W7, _, _, _ );

        _R( _c, _d, _e, _a, _b, F1, 8,
            WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
            W2, W3, W4, W5, W6, _, _, _ );
        _R( _b, _c, _d, _e, _a, F1, 9,
            WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
            W2, W3, W4, W5, W6, _, _, _ );
        _R( _a, _b, _c, _d, _e, F1, 10,
            WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
            W2, W3, W4, W5, W6, _, _, _ );
        _R( _e, _a, _b, _c, _d, F1, 11,
            WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
            W2, W3, W4, W5, W6, _, _, _ );

        _R( _d, _e, _a, _b, _c, F1, 12,
            WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
            W1, W2, W3, W4, W5, _, _, _ );
        _R( _c, _d, _e, _a, _b, F1, 13,
            WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
            W1, W2, W3, W4, W5, _, _, _ );
        _R( _b, _c, _d, _e, _a, F1, 14,
            WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
            W1, W2, W3, W4, W5, _, _, _ );
        _R( _a, _b, _c, _d, _e, F1, 15,
            WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
            W1, W2, W3, W4, W5, _, _, _ );

        /* Transform 16-63 + Precalc 32-79. */
        _R( _e, _a, _b, _c, _d, F1, 16,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _d, _e, _a, _b, _c, F1, 17,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _c, _d, _e, _a, _b, F1, 18,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _b, _c, _d, _e, _a, F1, 19,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
            W0, W1, W2, W3, W4, W5, W6, W7);

        _R( _a, _b, _c, _d, _e, F2, 20,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _e, _a, _b, _c, _d, F2, 21,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _d, _e, _a, _b, _c, F2, 22,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _c, _d, _e, _a, _b, F2, 23,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
            W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
        _R( _b, _c, _d, _e, _a, F2, 24,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _a, _b, _c, _d, _e, F2, 25,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _e, _a, _b, _c, _d, F2, 26,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _d, _e, _a, _b, _c, F2, 27,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
            W6, W7, W0, W1, W2, W3, W4, W5);

        _R( _c, _d, _e, _a, _b, F2, 28,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _b, _c, _d, _e, _a, F2, 29,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _a, _b, _c, _d, _e, F2, 30,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _e, _a, _b, _c, _d, F2, 31,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
            W5, W6, W7, W0, W1, W2, W3, W4);

        _R( _d, _e, _a, _b, _c, F2, 32,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
            W4, W5, W6, W7, W0, W1, W2, W3);
        _R( _c, _d, _e, _a, _b, F2, 33,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
            W4, W5, W6, W7, W0, W1, W2, W3);
        _R( _b, _c, _d, _e, _a, F2, 34,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
            W4, W5, W6, W7, W0, W1, W2, W3);
        _R( _a, _b, _c, _d, _e, F2, 35,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
            W4, W5, W6, W7, W0, W1, W2, W3);

        _R( _e, _a, _b, _c, _d, F2, 36,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
            W3, W4, W5, W6, W7, W0, W1, W2);
        _R( _d, _e, _a, _b, _c, F2, 37,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
            W3, W4, W5, W6, W7, W0, W1, W2);
        _R( _c, _d, _e, _a, _b, F2, 38,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
            W3, W4, W5, W6, W7, W0, W1, W2);
        _R( _b, _c, _d, _e, _a, F2, 39,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
            W3, W4, W5, W6, W7, W0, W1, W2);

        _R( _a, _b, _c, _d, _e, F3, 40,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
            W2, W3, W4, W5, W6, W7, W0, W1);
        _R( _e, _a, _b, _c, _d, F3, 41,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
            W2, W3, W4, W5, W6, W7, W0, W1);
        _R( _d, _e, _a, _b, _c, F3, 42,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
            W2, W3, W4, W5, W6, W7, W0, W1);
        _R( _c, _d, _e, _a, _b, F3, 43,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
            W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
        _R( _b, _c, _d, _e, _a, F3, 44,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
            W1, W2, W3, W4, W5, W6, W7, W0);
        _R( _a, _b, _c, _d, _e, F3, 45,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
            W1, W2, W3, W4, W5, W6, W7, W0);
        _R( _e, _a, _b, _c, _d, F3, 46,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
            W1, W2, W3, W4, W5, W6, W7, W0);
        _R( _d, _e, _a, _b, _c, F3, 47,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
            W1, W2, W3, W4, W5, W6, W7, W0);

        _R( _c, _d, _e, _a, _b, F3, 48,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _b, _c, _d, _e, _a, F3, 49,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _a, _b, _c, _d, _e, F3, 50,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _e, _a, _b, _c, _d, F3, 51,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
            W0, W1, W2, W3, W4, W5, W6, W7);

        _R( _d, _e, _a, _b, _c, F3, 52,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _c, _d, _e, _a, _b, F3, 53,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _b, _c, _d, _e, _a, F3, 54,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _a, _b, _c, _d, _e, F3, 55,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
            W7, W0, W1, W2, W3, W4, W5, W6);

        _R( _e, _a, _b, _c, _d, F3, 56,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _d, _e, _a, _b, _c, F3, 57,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _c, _d, _e, _a, _b, F3, 58,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _b, _c, _d, _e, _a, F3, 59,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
            W6, W7, W0, W1, W2, W3, W4, W5);

        subs RNBLKS, #1;

        _R( _a, _b, _c, _d, _e, F4, 60,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _e, _a, _b, _c, _d, F4, 61,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _d, _e, _a, _b, _c, F4, 62,
            WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _c, _d, _e, _a, _b, F4, 63,
            WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
            W5, W6, W7, W0, W1, W2, W3, W4);

        beq .Lend;

        /* Transform 64-79 + Precalc 0-15 of next block. */
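        /*
         * More blocks follow: rounds 64-79 of this block are interleaved
         * with loading, byte-swapping and storing W[0..15] + K1 of the next
         * block into the stack ring buffer (the WPRECALC_00_15_* steps).
         */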
#undef curK
#define curK qK1
        _R( _b, _c, _d, _e, _a, F4, 64,
            WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _a, _b, _c, _d, _e, F4, 65,
            WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _e, _a, _b, _c, _d, F4, 66,
            WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _d, _e, _a, _b, _c, F4, 67,
            WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

        _R( _c, _d, _e, _a, _b, F4, 68,
            dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _b, _c, _d, _e, _a, F4, 69,
            dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _a, _b, _c, _d, _e, F4, 70,
            WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _e, _a, _b, _c, _d, F4, 71,
            WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

        _R( _d, _e, _a, _b, _c, F4, 72,
            dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _c, _d, _e, _a, _b, F4, 73,
            dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _b, _c, _d, _e, _a, F4, 74,
            WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _a, _b, _c, _d, _e, F4, 75,
            WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

        _R( _e, _a, _b, _c, _d, F4, 76,
            WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _d, _e, _a, _b, _c, F4, 77,
            WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _c, _d, _e, _a, _b, F4, 78,
            WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _b, _c, _d, _e, _a, F4, 79,
            WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

        /* Update the chaining variables. */
        ldm RSTATE, {RT0-RT3};
        add _a, RT0;
        ldr RT0, [RSTATE, #state_h4];
        add _b, RT1;
        add _c, RT2;
        add _d, RT3;
        add _e, RT0;
        stm RSTATE, {_a-_e};

        b .Loop;

.Lend:
        /* Transform 64-79 */
        R( _b, _c, _d, _e, _a, F4, 64 );
        R( _a, _b, _c, _d, _e, F4, 65 );
        R( _e, _a, _b, _c, _d, F4, 66 );
        R( _d, _e, _a, _b, _c, F4, 67 );
        R( _c, _d, _e, _a, _b, F4, 68 );
        R( _b, _c, _d, _e, _a, F4, 69 );
        R( _a, _b, _c, _d, _e, F4, 70 );
        R( _e, _a, _b, _c, _d, F4, 71 );
        R( _d, _e, _a, _b, _c, F4, 72 );
        R( _c, _d, _e, _a, _b, F4, 73 );
        R( _b, _c, _d, _e, _a, F4, 74 );
        R( _a, _b, _c, _d, _e, F4, 75 );
        R( _e, _a, _b, _c, _d, F4, 76 );
        R( _d, _e, _a, _b, _c, F4, 77 );
        R( _c, _d, _e, _a, _b, F4, 78 );
        R( _b, _c, _d, _e, _a, F4, 79 );

        mov sp, ROLDSTACK;

        /* Update the chaining variables. */
        ldm RSTATE, {RT0-RT3};
        add _a, RT0;
        ldr RT0, [RSTATE, #state_h4];
        add _b, RT1;
        add _c, RT2;
        add _d, RT3;
        /*vpop {q4-q7};*/
        add _e, RT0;
        stm RSTATE, {_a-_e};

        pop {r4-r12, pc};

.Ldo_nothing:
        bx lr
ENDPROC(sha1_transform_neon)