/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row
 * 'b' need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

.macro _le32_bswap_4x	a, b, c, d,  tmp
#ifdef __ARMEB__
	rev_l		\a,  \tmp
	rev_l		\b,  \tmp
	rev_l		\c,  \tmp
	rev_l		\d,  \tmp
#endif
.endm

.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

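// How the deferred rotations in _halfround below map back to the reference
// quarterround (a rough C-style sketch; ror32() and the *_stored names are
// used only for illustration, they are not symbols in this file).  For the
// first step, "a += b; d ^= a; d = rol(d, 16);" becomes:
//
//	a += ror32(b_stored, brot);		// fold b's pending fix-up into the add
//	d_stored = a ^ ror32(d_stored, drot);	// d ^= a, with d's fix-up folded in
//	// rol(d, 16) is not performed here; instead drot becomes 16 and the
//	// rotation is applied for free the next time d is used as an operand.
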
.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	  _doubleround
	.endr
.endm

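// Overview of the _chacha macro below (a rough C-style sketch; 'x', 'ks' and
// 'state' are illustrative names, not symbols used in this file).  Per
// 64-byte block it effectively does:
//
//	do {
//		x[0..15] = state[0..15];	// working copy: registers + spill slots
//		_chacha_permute(nrounds);	// the rounds defined above
//		ks[i] = x[i] + state[i];	// i = 0..15, stored little-endian
//		XOR min(LEN, 64) bytes of ks into the data;	// the fast path
//							// fuses this with the adds
//		state[12]++;			// block counter (on-stack copy only)
//	} while ((LEN -= 64) > 0);
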
.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
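	//
	// Each group of four words below follows the same pattern: reload the
	// original state words from the stack and add them to the permuted
	// words (folding in any pending 'brot'/'drot' fix-up), byte-swap to
	// little-endian on big-endian builds, XOR with 16 bytes of input, and
	// store 16 bytes of output.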

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldm		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	add		r8, sp, #64
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.
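	// The block's keystream now occupies ks0-ks15 at the bottom of the
	// stack.  XOR it into the data below: a word at a time on ARMv6+
	// (where unaligned ldr/str are supported), falling back to the byte
	// loop on older CPUs when IN or OUT is misaligned.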

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	mov		r0, sp
	movle		r1, r9
	movgt		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const struct chacha_state *state, int nrounds);
 */
ENTRY(chacha_doarm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	ldr		ip, [sp]
	cmp		ip, #12

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.
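	// The extra copy fills the x10-x11 spill slot that _doubleround reads:
	// each column round saves (x8, x9) and reloads (x10, x11) from that
	// slot, so it must already hold x10-x11 when .Lnext_block is entered.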

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f
	_chacha		20

0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const struct chacha_state *state,
 *			  u32 out[HCHACHA_OUT_WORDS], int nrounds);
 */
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	cmp		r2, #12			// ChaCha12 ?

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8

	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)
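
/*
 * Usage note (illustrative sketch, not code taken from the glue layer):
 * chacha_doarm() reads the state through the const 'state' pointer and never
 * writes the incremented block counter back to it, so a caller that splits a
 * message across multiple calls is expected to advance the counter itself,
 * roughly:
 *
 *	chacha_doarm(dst, src, bytes, state, nrounds);
 *	state->x[12] += DIV_ROUND_UP(bytes, 64);
 *
 * The x[] word array follows the kernel's struct chacha_state layout, where
 * word 12 is the block counter; treat this as a sketch of the calling
 * contract rather than a definitive example.
 */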