/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32		0x3c
#define CHACHA20_BLOCK_SIZE	64
#define STACK_SIZE		32

#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE	$a0
#define OUT	$a1
#define IN	$a2
#define BYTES	$a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch the original value in memory.
 * Must be incremented every loop iteration.
 */
#define NONCE_0	$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used for handling the last bytes, which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

/* IS_UNALIGNED shares $s7 with SAVED_CA; the two are never live at the same time. */
#define IS_UNALIGNED	$s7

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif

#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
	lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
	addu	X ## x, NONCE_0; \
	.else; \
	addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
	lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
	addu	X ## x, NONCE_0; \
	.else; \
	addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);
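
/* The STORE_* bodies are emitted in reverse order (word 15 first, word 0
 * last), and each expansion is preceded by the label
 * .Lchacha20_mips_xor_(un)aligned_<x+1>_b. Branching from the jump tables
 * below to label <n> therefore xors and stores exactly the last n full
 * words (words n-1 down to 0) of a partial block.
 */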

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 * X15 is free here, so it is used to store the saved Xn (SAVED_X).
 * Every jump table entry must be the same size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha20_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
	addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
	addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha20_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
	addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
	addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotl	X(V), S; \
	rotl	X(W), S; \
	rotl	X(Y), S; \
	rotl	X(Z), S;
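
/* AXR performs one add-xor-rotate step of the ChaCha quarter round,
 * a += b; d ^= a; d <<<= S, on four word groups in parallel. In the round
 * loop below, the four AXR lines with shifts 16/12/8/7 make up the column
 * round and the next four the diagonal round, i.e. one double round per
 * iteration.
 */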

.text
.set	reorder
.set	noat
.globl	chacha20_mips
.ent	chacha20_mips
chacha20_mips:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Return early if BYTES = 0. */
	beqz	BYTES, .Lchacha20_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0, 0($sp)
	sw	$s1, 4($sp)
	sw	$s2, 8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test whether IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	/* Set number of rounds */
	li	$at, 20

	b	.Lchacha20_rounds_start

.align 4
.Loop_chacha20_rounds:
	addiu	IN, CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha20_rounds_start:
	lw	X0, 0(STATE)
	lw	X1, 4(STATE)
	lw	X2, 8(STATE)
	lw	X3, 12(STATE)

	lw	X4, 16(STATE)
	lw	X5, 20(STATE)
	lw	X6, 24(STATE)
	lw	X7, 28(STATE)
	lw	X8, 32(STATE)
	lw	X9, 36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha20_xor_rounds:
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha20_xor_rounds

	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* If src or dst is unaligned, take the unaligned path. */
	bnez	IS_UNALIGNED, .Loop_chacha20_unaligned

	/* Set the number of rounds here to fill the delay slot. */
	li	$at, 20

	/* BYTES < 0 means there is no full block left. */
	bltz	BYTES, .Lchacha20_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha20_rounds

	/* Place this here to fill the delay slot. */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle the last bytes. */
	bltz	BYTES, .Lchacha20_mips_xor_bytes

.Lchacha20_mips_xor_done:
	/* Restore used registers */
	lw	$s0, 0($sp)
	lw	$s1, 4($sp)
	lw	$s2, 8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to its location in the state. */
	sw	NONCE_0, 48(STATE)

.Lchacha20_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha20_mips_no_full_block_aligned:
	/* Restore BYTES (undo the block-size subtraction). */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of full words, as a byte count. */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset: $at * 2, as each entry is 8 bytes. */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)

	/* Read the corresponding word from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte count as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha20_unaligned:
	/* Set the number of rounds here to fill the delay slot. */
	li	$at, 20

	/* BYTES < 0 means there is no full block left. */
	bltz	BYTES, .Lchacha20_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha20_rounds

	/* Write NONCE_0 back to its location in the state. */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha20_mips_xor_done
.Lchacha20_mips_xor_unaligned_0_b:
.Lchacha20_mips_xor_aligned_0_b:
	/* Place this here to fill the delay slot. */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha20_mips_xor_bytes:
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha20_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha20_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha20_mips_xor_done

.Lchacha20_mips_no_full_block_unaligned:
	/* Restore BYTES (undo the block-size subtraction). */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of full words, as a byte count. */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset: $at * 2, as each entry is 8 bytes. */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)

	/* Read the corresponding word from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte count as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha20_mips
.set at
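
/* Assumed C prototype (a sketch inferred from the argument registers above;
 * the exact signature is an assumption, not part of this file):
 *
 *   void chacha20_mips(u32 state[16], u8 *out, const u8 *in, u32 bytes);
 *
 * state[12] (NONCE_0 above) is advanced by one per 64-byte block and
 * written back, so consecutive calls continue the same keystream.
 */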