/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2024 Christophe Leroy <christophe.leroy@csgroup.eu>, CS GROUP France
 */

#include <linux/linkage.h>

#include <asm/ppc_asm.h>

/* Incoming arguments. */
#define	dst_bytes	r3
#define	key		r4
#define	counter		r5
#define	nblocks		r6

/*
 * Scratch registers. idx_r0 is the remaining-block count; val4 reuses r4
 * (the key pointer) once the key has been loaded through it.
 */
#define	idx_r0		r0
#define	val4		r4

/* ChaCha constants: "expand 32-byte k" as four little-endian 32-bit words. */
#define	const0		0x61707865
#define	const1		0x3320646e
#define	const2		0x79622d32
#define	const3		0x6b206574

/*
 * The eight key words stay in registers for the whole call. key0 aliases the
 * counter pointer (r5) and key1 aliases nblocks (r6): both arguments are
 * consumed or saved before the key is loaded.
 */
#define	key0		r5
#define	key1		r6
#define	key2		r7
#define	key3		r8
#define	key4		r9
#define	key5		r10
#define	key6		r11
#define	key7		r12

/* Block counter: counter0 is the low word, counter1 the high word. */
#define	counter0	r14
#define	counter1	r15

/* The sixteen 32-bit words of the working state. */
#define	state0		r16
#define	state1		r17
#define	state2		r18
#define	state3		r19
#define	state4		r20
#define	state5		r21
#define	state6		r22
#define	state7		r23
#define	state8		r24
#define	state9		r25
#define	state10		r26
#define	state11		r27
#define	state12		r28
#define	state13		r29
#define	state14		r30
#define	state15		r31

/*
 * Four ChaCha quarter-rounds, interleaved step by step so that consecutive
 * instructions work on independent registers. For each (a, b, c, d) group:
 *
 *	a += b; d ^= a; d = rol32(d, 16);
 *	c += d; b ^= c; b = rol32(b, 12);
 *	a += b; d ^= a; d = rol32(d, 8);
 *	c += d; b ^= c; b = rol32(b, 7);
 */
.macro quarterround4 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 a4 b4 c4 d4
	add	\a1, \a1, \b1
	add	\a2, \a2, \b2
	add	\a3, \a3, \b3
	add	\a4, \a4, \b4
	xor	\d1, \d1, \a1
	xor	\d2, \d2, \a2
	xor	\d3, \d3, \a3
	xor	\d4, \d4, \a4
	rotlwi	\d1, \d1, 16
	rotlwi	\d2, \d2, 16
	rotlwi	\d3, \d3, 16
	rotlwi	\d4, \d4, 16
	add	\c1, \c1, \d1
	add	\c2, \c2, \d2
	add	\c3, \c3, \d3
	add	\c4, \c4, \d4
	xor	\b1, \b1, \c1
	xor	\b2, \b2, \c2
	xor	\b3, \b3, \c3
	xor	\b4, \b4, \c4
	rotlwi	\b1, \b1, 12
	rotlwi	\b2, \b2, 12
	rotlwi	\b3, \b3, 12
	rotlwi	\b4, \b4, 12
	add	\a1, \a1, \b1
	add	\a2, \a2, \b2
	add	\a3, \a3, \b3
	add	\a4, \a4, \b4
	xor	\d1, \d1, \a1
	xor	\d2, \d2, \a2
	xor	\d3, \d3, \a3
	xor	\d4, \d4, \a4
	rotlwi	\d1, \d1, 8
	rotlwi	\d2, \d2, 8
	rotlwi	\d3, \d3, 8
	rotlwi	\d4, \d4, 8
	add	\c1, \c1, \d1
	add	\c2, \c2, \d2
	add	\c3, \c3, \d3
	add	\c4, \c4, \d4
	xor	\b1, \b1, \c1
	xor	\b2, \b2, \c2
	xor	\b3, \b3, \c3
	xor	\b4, \b4, \c4
	rotlwi	\b1, \b1, 7
	rotlwi	\b2, \b2, 7
	rotlwi	\b3, \b3, 7
	rotlwi	\b4, \b4, 7
.endm

/*
 * Front end for quarterround4 taking state word indices (0-15) instead of
 * register names; each index N is pasted onto "state" to pick stateN.
 */
#define QUARTERROUND4(a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,a4,b4,c4,d4) \
	quarterround4 state##a1 state##b1 state##c1 state##d1 \
		      state##a2 state##b2 state##c2 state##d2 \
		      state##a3 state##b3 state##c3 state##d3 \
		      state##a4 state##b4 state##c4 state##d4

/*
 * Very basic 32 bits implementation of ChaCha20. Produces a given positive number
 * of blocks of output with a nonce of 0, taking an input key and 8-byte
 * counter. Importantly does not spill key or state words to the stack: the
 * only values stored there are the r14-r31 contents found on entry and the
 * counter pointer, all written before the key is loaded. Its arguments are:
 *
 *	r3: output bytes
 *	r4: 32-byte key input
 *	r5: 8-byte counter input/output (pointer saved on stack)
 *	r6: number of 64-byte blocks to write to output
 *
 * The number of blocks must not be zero: the block loop only tests the count
 * after a block has been produced.
 *
 * The counter is read and written back as two 32-bit words, low word at
 * offset 0 and high word at offset 4. Output words are stored little-endian
 * on both endiannesses.
 *
 * Register usage inside the function:
 *
 *	r0: counter of blocks (initialised with r6)
 *	r4: Value '4' after key has been read.
 *	r5-r12: key
 *	r14-r15: counter
 *	r16-r31: state
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
#ifdef __powerpc64__
	/*
	 * No stack frame is created on 64-bit; everything is stored at
	 * negative offsets from r1.
	 * NOTE(review): this presumably relies on the ABI's protected area
	 * below the stack pointer - confirm against the ABI in use.
	 */
	std	counter, -216(r1)

	std	r14, -144(r1)
	std	r15, -136(r1)
	std	r16, -128(r1)
	std	r17, -120(r1)
	std	r18, -112(r1)
	std	r19, -104(r1)
	std	r20, -96(r1)
	std	r21, -88(r1)
	std	r22, -80(r1)
	std	r23, -72(r1)
	std	r24, -64(r1)
	std	r25, -56(r1)
	std	r26, -48(r1)
	std	r27, -40(r1)
	std	r28, -32(r1)
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)
#else
	/* 96-byte frame: counter pointer at 20, r14-r31 at 24..92. */
	stwu	r1, -96(r1)
	stw	counter, 20(r1)
#ifdef __BIG_ENDIAN__
	stmw	r14, 24(r1)
#else
	/*
	 * Same layout as the stmw above, one register at a time.
	 * NOTE(review): presumably because stmw/lmw cannot be used in
	 * little-endian mode - confirm.
	 */
	stw	r14, 24(r1)
	stw	r15, 28(r1)
	stw	r16, 32(r1)
	stw	r17, 36(r1)
	stw	r18, 40(r1)
	stw	r19, 44(r1)
	stw	r20, 48(r1)
	stw	r21, 52(r1)
	stw	r22, 56(r1)
	stw	r23, 60(r1)
	stw	r24, 64(r1)
	stw	r25, 68(r1)
	stw	r26, 72(r1)
	stw	r27, 76(r1)
	stw	r28, 80(r1)
	stw	r29, 84(r1)
	stw	r30, 88(r1)
	stw	r31, 92(r1)
#endif
#endif /* __powerpc64__ */

	/*
	 * Fetch the counter before r5 is reused as key0 and move nblocks out
	 * of r6 before it is reused as key1.
	 */
	lwz	counter0, 0(counter)
	lwz	counter1, 4(counter)
#ifdef __powerpc64__
	/* counter0 = full 64-bit counter; counter1 still holds the high word. */
	rldimi	counter0, counter1, 32, 0
#endif
	mr	idx_r0, nblocks
	/*
	 * Bias the output pointer by -4 so that every block is written at
	 * displacements 4..64 and the last store of a block can leave the
	 * pointer ready for the next one.
	 */
	subi	dst_bytes, dst_bytes, 4

	lwz	key0, 0(key)
	lwz	key1, 4(key)
	lwz	key2, 8(key)
	lwz	key3, 12(key)
	lwz	key4, 16(key)
	lwz	key5, 20(key)
	lwz	key6, 24(key)
	lwz	key7, 28(key)

	/* Key pointer no longer needed: r4 becomes the index for stwbrx. */
	li	val4, 4
.Lblock:
	/* r31 (state15) is free here; use it to set up 10 double rounds. */
	li	r31, 10

	/* state0-3 = constants, built as high-adjusted half plus low half. */
	lis	state0, const0@ha
	lis	state1, const1@ha
	lis	state2, const2@ha
	lis	state3, const3@ha
	addi	state0, state0, const0@l
	addi	state1, state1, const1@l
	addi	state2, state2, const2@l
	addi	state3, state3, const3@l

	mtctr	r31

	/* state4-11 = key */
	mr	state4, key0
	mr	state5, key1
	mr	state6, key2
	mr	state7, key3
	mr	state8, key4
	mr	state9, key5
	mr	state10, key6
	mr	state11, key7

	/* state12-13 = block counter */
	mr	state12, counter0
	mr	state13, counter1

	/* state14-15 = nonce, always zero */
	li	state14, 0
	li	state15, 0

.Lpermute:
	/* One double round per iteration: columns, then diagonals. */
	QUARTERROUND4( 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15)
	QUARTERROUND4( 0, 5,10,15, 1, 6,11,12, 2, 7, 8,13, 3, 4, 9,14)

	bdnz	.Lpermute

	/* Add the initial state back in: constants, key, counter (nonce is 0). */
	addis	state0, state0, const0@ha
	addis	state1, state1, const1@ha
	addis	state2, state2, const2@ha
	addis	state3, state3, const3@ha
	addi	state0, state0, const0@l
	addi	state1, state1, const1@l
	addi	state2, state2, const2@l
	addi	state3, state3, const3@l

	add	state4, state4, key0
	add	state5, state5, key1
	add	state6, state6, key2
	add	state7, state7, key3
	add	state8, state8, key4
	add	state9, state9, key5
	add	state10, state10, key6
	add	state11, state11, key7

	add	state12, state12, counter0
	add	state13, state13, counter1

#ifdef __BIG_ENDIAN__
	/*
	 * Byte-reversed stores give little-endian output. stwbrx only has an
	 * indexed form, so words are written in pairs at dst_bytes + r4 (= 4)
	 * and, after advancing dst_bytes by 8, at dst_bytes + 0.
	 */
	stwbrx	state0, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state1, 0, dst_bytes
	stwbrx	state2, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state3, 0, dst_bytes
	stwbrx	state4, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state5, 0, dst_bytes
	stwbrx	state6, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state7, 0, dst_bytes
	stwbrx	state8, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state9, 0, dst_bytes
	stwbrx	state10, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state11, 0, dst_bytes
	stwbrx	state12, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state13, 0, dst_bytes
	stwbrx	state14, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state15, 0, dst_bytes
#else
	/* Plain stores; the final stwu also advances dst_bytes by 64. */
	stw	state0, 4(dst_bytes)
	stw	state1, 8(dst_bytes)
	stw	state2, 12(dst_bytes)
	stw	state3, 16(dst_bytes)
	stw	state4, 20(dst_bytes)
	stw	state5, 24(dst_bytes)
	stw	state6, 28(dst_bytes)
	stw	state7, 32(dst_bytes)
	stw	state8, 36(dst_bytes)
	stw	state9, 40(dst_bytes)
	stw	state10, 44(dst_bytes)
	stw	state11, 48(dst_bytes)
	stw	state12, 52(dst_bytes)
	stw	state13, 56(dst_bytes)
	stw	state14, 60(dst_bytes)
	stwu	state15, 64(dst_bytes)
#endif

	subic.	idx_r0, idx_r0, 1	/* subi. can't use r0 as source */

	/*
	 * Advance the block counter. None of these instructions is a record
	 * form, so CR0 from the subic. above is still valid for the bne.
	 */
#ifdef __powerpc64__
	addi	counter0, counter0, 1
	srdi	counter1, counter0, 32
#else
	addic	counter0, counter0, 1
	addze	counter1, counter1
#endif

	bne	.Lblock

	/* Reload the counter pointer (this overwrites key0 in r5) and write back. */
#ifdef __powerpc64__
	ld	counter, -216(r1)
#else
	lwz	counter, 20(r1)
#endif
	stw	counter0, 0(counter)
	stw	counter1, 4(counter)

	/* Clear the remaining key words (key1-key7) from the registers. */
	li	r6, 0
	li	r7, 0
	li	r8, 0
	li	r9, 0
	li	r10, 0
	li	r11, 0
	li	r12, 0

	/* Restore r14-r31, which also discards the counter and state words. */
#ifdef __powerpc64__
	ld	r14, -144(r1)
	ld	r15, -136(r1)
	ld	r16, -128(r1)
	ld	r17, -120(r1)
	ld	r18, -112(r1)
	ld	r19, -104(r1)
	ld	r20, -96(r1)
	ld	r21, -88(r1)
	ld	r22, -80(r1)
	ld	r23, -72(r1)
	ld	r24, -64(r1)
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
#else
#ifdef __BIG_ENDIAN__
	lmw	r14, 24(r1)
#else
	lwz	r14, 24(r1)
	lwz	r15, 28(r1)
	lwz	r16, 32(r1)
	lwz	r17, 36(r1)
	lwz	r18, 40(r1)
	lwz	r19, 44(r1)
	lwz	r20, 48(r1)
	lwz	r21, 52(r1)
	lwz	r22, 56(r1)
	lwz	r23, 60(r1)
	lwz	r24, 64(r1)
	lwz	r25, 68(r1)
	lwz	r26, 72(r1)
	lwz	r27, 76(r1)
	lwz	r28, 80(r1)
	lwz	r29, 84(r1)
	lwz	r30, 88(r1)
	lwz	r31, 92(r1)
#endif
	/* Drop the 96-byte frame created on entry. */
	addi	r1, r1, 96
#endif /* __powerpc64__ */
	blr
SYM_FUNC_END(__arch_chacha20_blocks_nostack)