/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly uses vector/VSX/scalar instructions
#   - 26-bit limbs
#   - handles multiple 64-byte blocks.
#
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
# p = 2^130 - 5
# a += m
# a = (r * a) % p
# a += s
#
# Improve performance by breaking the polynomial down into a sum of products:
#   h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
#
# (A plain-C sketch of this per-block math is given in a comment at the end of
# this file.)
#
# 07/22/21 - this revision is based on the above sum of products.  Set up r^4, r^3, r^2, r and s3, s2, s1, s0
#            in 9 vectors for the multiplications.
#
# setup r^4, r^3, r^2, r vectors
#    vs    [r^1, r^3, r^2, r^4]
#    vs0 = [r0,.....]
#    vs1 = [r1,.....]
#    vs2 = [r2,.....]
#    vs3 = [r3,.....]
#    vs4 = [r4,.....]
#    vs5 = [r4*5,...]
#    vs6 = [r3*5,...]
#    vs7 = [r2*5,...]
#    vs8 = [r1*5,...]
#
# Each word in a vector consists of one "r/s" member of [a * r/s].
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
#
# poly1305_p10le_4blocks(uint8_t *k, uint8_t *m, uint32_t mlen)
#   k = 32-byte key
#   r3 = k (r, s)
#   r4 = m
#   r5 = mlen
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"

.text

.macro	SAVE_GPR GPR OFFSET FRAME
	std	\GPR,\OFFSET(\FRAME)
.endm

.macro	SAVE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	stvx	\VRS, 16, \FRAME
.endm

.macro	SAVE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	stxvx	\VSX, 16, \FRAME
.endm

.macro	RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR,\OFFSET(\FRAME)
.endm

.macro	RESTORE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	lvx	\VRS, 16, \FRAME
.endm

.macro	RESTORE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	lxvx	\VSX, 16, \FRAME
.endm

.macro SAVE_REGS
	mflr	0
	std	0, 16(1)
	stdu	1,-752(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	addi	9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
	addi	9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi	1, 1, 752
	ld	0, 16(1)
	mtlr	0
.endm # RESTORE_REGS

#
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
#
#    [r^2, r^3, r^1, r^4]
#    [m3,  m2,  m4,  m1]
#
# multiply odd and even words
.macro mul_odd
	vmulouw	14, 4, 26
	vmulouw	10, 5, 3
	vmulouw	11, 6, 2
	vmulouw	12, 7, 1
	vmulouw	13, 8, 0
	vmulouw	15, 4, 27
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vmulouw	10, 5, 26
	vmulouw	11, 6, 3
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vmulouw	12, 7, 2
	vmulouw	13, 8, 1
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1
	vmulouw	16, 4, 28
	vmulouw	10, 5, 27
	vmulouw	11, 6, 26
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vmulouw	12, 7, 3
	vmulouw	13, 8, 2
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2
	vmulouw	17, 4, 29
	vmulouw	10, 5, 28
	vmulouw	11, 6, 27
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vmulouw	12, 7, 26
	vmulouw	13, 8, 3
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3
	vmulouw	18, 4, 30
	vmulouw	10, 5, 29
	vmulouw	11, 6, 28
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vmulouw	12, 7, 27
	vmulouw	13, 8, 26
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm

.macro mul_even
	vmuleuw	9, 4, 26
	vmuleuw	10, 5, 3
	vmuleuw	11, 6, 2
	vmuleuw	12, 7, 1
	vmuleuw	13, 8, 0
	vaddudm	14, 14, 9
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0

	vmuleuw	9, 4, 27
	vmuleuw	10, 5, 26
	vmuleuw	11, 6, 3
	vmuleuw	12, 7, 2
	vmuleuw	13, 8, 1
	vaddudm	15, 15, 9
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1

	vmuleuw	9, 4, 28
	vmuleuw	10, 5, 27
	vmuleuw	11, 6, 26
	vmuleuw	12, 7, 3
	vmuleuw	13, 8, 2
	vaddudm	16, 16, 9
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2

	vmuleuw	9, 4, 29
	vmuleuw	10, 5, 28
	vmuleuw	11, 6, 27
	vmuleuw	12, 7, 26
	vmuleuw	13, 8, 3
	vaddudm	17, 17, 9
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3

	vmuleuw	9, 4, 30
	vmuleuw	10, 5, 29
	vmuleuw	11, 6, 28
	vmuleuw	12, 7, 27
	vmuleuw	13, 8, 26
	vaddudm	18, 18, 9
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm

#
# poly1305_setup_r
#
# setup r^4, r^3, r^2, r vectors
#    [r, r^3, r^2, r^4]
#    vs0 = [r0,...]
#    vs1 = [r1,...]
#    vs2 = [r2,...]
#    vs3 = [r3,...]
#    vs4 = [r4,...]
#    vs5 = [r4*5,...]
#    vs6 = [r3*5,...]
#    vs7 = [r2*5,...]
#    vs8 = [r1*5,...]
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
.macro poly1305_setup_r

	# save r
	xxlor	26, 58, 58
	xxlor	27, 59, 59
	xxlor	28, 60, 60
	xxlor	29, 61, 61
	xxlor	30, 62, 62

	xxlxor	31, 31, 31

#    [r, r^3, r^2, r^4]
	# compute r^2
	vmr	4, 26
	vmr	5, 27
	vmr	6, 28
	vmr	7, 29
	vmr	8, 30
	bl	do_mul		# r^2 r^1
	xxpermdi 58, 58, 36, 0x3	# r0
	xxpermdi 59, 59, 37, 0x3	# r1
	xxpermdi 60, 60, 38, 0x3	# r2
	xxpermdi 61, 61, 39, 0x3	# r3
	xxpermdi 62, 62, 40, 0x3	# r4
	xxpermdi 36, 36, 36, 0x3
	xxpermdi 37, 37, 37, 0x3
	xxpermdi 38, 38, 38, 0x3
	xxpermdi 39, 39, 39, 0x3
	xxpermdi 40, 40, 40, 0x3
	vspltisb 13, 2
	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	bl	do_mul		# r^4 r^3
	vmrgow	26, 26, 4
	vmrgow	27, 27, 5
	vmrgow	28, 28, 6
	vmrgow	29, 29, 7
	vmrgow	30, 30, 8
	vspltisb 13, 2
	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	# r^2 r^4
	xxlor	0, 58, 58
	xxlor	1, 59, 59
	xxlor	2, 60, 60
	xxlor	3, 61, 61
	xxlor	4, 62, 62
	xxlor	5, 32, 32
	xxlor	6, 33, 33
	xxlor	7, 34, 34
	xxlor	8, 35, 35

	vspltw	9, 26, 3
	vspltw	10, 26, 2
	vmrgow	26, 10, 9
	vspltw	9, 27, 3
	vspltw	10, 27, 2
	vmrgow	27, 10, 9
	vspltw	9, 28, 3
	vspltw	10, 28, 2
	vmrgow	28, 10, 9
	vspltw	9, 29, 3
	vspltw	10, 29, 2
	vmrgow	29, 10, 9
	vspltw	9, 30, 3
	vspltw	10, 30, 2
	vmrgow	30, 10, 9

	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30
.endm

SYM_FUNC_START_LOCAL(do_mul)
	mul_odd

	# do reduction ( h %= p )
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11
	blr
SYM_FUNC_END(do_mul)

#
# init key
#
.macro do_poly1305_init
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l

	ld	11, 0(10)
	ld	12, 8(10)

	li	14, 16
	li	15, 32
	addis	10, 2, cnum@toc@ha
	addi	10, 10, cnum@toc@l
	lvx	25, 0, 10	# v25 - mask
	lvx	31, 14, 10	# v31 = 1a
	lvx	19, 15, 10	# v19 = 1 << 24
	lxv	24, 48(10)	# vs24
	lxv	25, 64(10)	# vs25

	# initialize
	# load key from r3 to vectors
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11
	and.	10, 10, 12

	# break 26 bits
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	58, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	59, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	60, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	61, 0, 17
	mtvsrdd	62, 0, 18

	# r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
	li	9, 5
	mtvsrdd	36, 0, 9
	vmulouw	0, 27, 4	# v0 = rr0
	vmulouw	1, 28, 4	# v1 = rr1
	vmulouw	2, 29, 4	# v2 = rr2
	vmulouw	3, 30, 4	# v3 = rr3
.endm

#
# poly1305_p10le_4blocks(uint8_t *k, uint8_t *m, uint32_t mlen)
#   k = 32-byte key
#   r3 = k (r, s)
#   r4 = m
#   r5 = mlen
#
SYM_FUNC_START(poly1305_p10le_4blocks)
.align 5
	cmpdi	5, 64
	blt	Out_no_poly1305

	SAVE_REGS

	do_poly1305_init

	li	21, 0	# counter to message

	poly1305_setup_r

	# load previous H state
	# break/convert r6 to 26 bits
	ld	9, 0(3)
	ld	10, 8(3)
	ld	19, 16(3)
	sldi	19, 19, 24
	mtvsrdd	41, 0, 19
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	36, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	37, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	38, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	39, 0, 17
	mtvsrdd	40, 0, 18
	vor	8, 8, 9

	# input m1 m2
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	20, 4, 9
	vaddudm	21, 5, 10
	vaddudm	22, 6, 11
	vaddudm	23, 7, 12
	vaddudm	24, 8, 13

	# m3 m4
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vspltisb 13, 14
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
	vmrgow	4, 9, 20
	vmrgow	5, 10, 21
	vmrgow	6, 11, 22
	vmrgow	7, 12, 23
	vmrgow	8, 13, 24
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	li	9, 64
	divdu	31, 5, 9

	cmpdi	31, 0
	ble	Skip_block_loop

	mtctr	31

# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
# Rewrite the polynomial sum of products as follows:
# h1 = (h0 + m1) * r^2,	h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2,	h4 = (h2 + m4) * r^2  -->  h3 = (h0 + m1) * r^4 + m3 * r^2,	h4 = (h0 + m2) * r^4 + m4 * r^2
#  .... Repeat
# h5 = (h3 + m5) * r^2,	h6 = (h4 + m6) * r^2  -->
# h7 = (h5 + m7) * r^2,	h8 = (h6 + m8) * r^1  -->  m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
loop_4blocks:

	# Multiply odd words and even words
	mul_odd
	mul_even
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11

	# input m1 m2 m3 m4
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	17, 11, 12, 17
	vperm	18, 11, 12, 18

	vand	20, 14, 25	# a0
	vand	9, 17, 25	# a0
	vsrd	21, 14, 31	# >> 26
	vsrd	22, 21, 31	# 12 bits left
	vsrd	10, 17, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left

	vand	21, 21, 25	# a1
	vand	10, 10, 25	# a1

	vspltisb 13, 12
	vand	16, 15, 25
	vsld	23, 16, 13
	vor	22, 22, 23
	vand	22, 22, 25	# a2
	vand	16, 18, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	23, 15, 13	# >> 14
	vsrd	24, 23, 31	# >> 26, a4
	vand	23, 23, 25	# a3
	vsrd	12, 18, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	4, 4, 20
	vaddudm	5, 5, 21
	vaddudm	6, 6, 22
	vaddudm	7, 7, 23
	vaddudm	8, 8, 24

	# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
	vmrgow	4, 9, 4
	vmrgow	5, 10, 5
	vmrgow	6, 11, 6
	vmrgow	7, 12, 7
	vmrgow	8, 13, 8
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	bdnz	loop_4blocks

Skip_block_loop:
	xxlor	58, 0, 0
	xxlor	59, 1, 1
	xxlor	60, 2, 2
	xxlor	61, 3, 3
	xxlor	62, 4, 4
	xxlor	32, 5, 5
	xxlor	33, 6, 6
	xxlor	34, 7, 7
	xxlor	35, 8, 8

	# Multiply odd words and even words
	mul_odd
	mul_even

	# Sum the products.
	xxpermdi 41, 31, 46, 0
	xxpermdi 42, 31, 47, 0
	vaddudm	4, 14, 9
	xxpermdi 36, 31, 36, 3
	vaddudm	5, 15, 10
	xxpermdi 37, 31, 37, 3
	xxpermdi 43, 31, 48, 0
	vaddudm	6, 16, 11
	xxpermdi 38, 31, 38, 3
	xxpermdi 44, 31, 49, 0
	vaddudm	7, 17, 12
	xxpermdi 39, 31, 39, 3
	xxpermdi 45, 31, 50, 0
	vaddudm	8, 18, 13
	xxpermdi 40, 31, 40, 3

	# carry reduction
	vspltisb 9, 2
	vsrd	10, 4, 31
	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	8, 8, 11
	vsrd	12, 8, 31
	vaddudm	5, 5, 10

	vsrd	11, 5, 31
	vand	8, 8, 25
	vand	5, 5, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 6, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vsrd	10, 5, 31
	vand	5, 5, 25
	vaddudm	6, 6, 10
	vaddudm	8, 8, 11

	b	do_final_update

do_final_update:
	# combine 26-bit limbs
	# v4, v5, v6, v7 and v8 are 26-bit vectors
	vsld	5, 5, 31
	vor	20, 4, 5
	vspltisb 11, 12
	vsrd	12, 6, 11
	vsld	6, 6, 31
	vsld	6, 6, 31
	vor	20, 20, 6
	vspltisb 11, 14
	vsld	7, 7, 11
	vor	21, 7, 12
	mfvsrld	16, 40	# save last 2 bytes
	vsld	8, 8, 11
	vsld	8, 8, 31
	vor	21, 21, 8
	mfvsrld	17, 52
	mfvsrld	19, 53
	srdi	16, 16, 24

	std	17, 0(3)
	std	19, 8(3)
	stw	16, 16(3)

Out_loop:
	li	3, 0

	RESTORE_REGS

	blr

Out_no_poly1305:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_p10le_4blocks)

#
# =======================================================================
# The following functions implement poly1305 using 64 x 64 bit multiplication.
#
SYM_FUNC_START_LOCAL(Poly1305_init_64)
	# mask 0x0FFFFFFC0FFFFFFC
	# mask 0x0FFFFFFC0FFFFFFF
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l
	ld	11, 0(10)
	ld	12, 8(10)

	# initialize
	# load key from r3
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11	# clamp mask r0
	and.	10, 10, 12	# clamp mask r1

	srdi	21, 10, 2
	add	19, 21, 10	# s1: r19 = r1 + (r1 >> 2) = (r1 >> 2) * 5

	# setup r and s
	li	25, 0
	mtvsrdd	32+0, 9, 19	# r0, s1
	mtvsrdd	32+1, 10, 9	# r1, r0
	mtvsrdd	32+2, 19, 25	# s1
	mtvsrdd	32+3, 9, 25	# r0

	blr
SYM_FUNC_END(Poly1305_init_64)

# Poly1305_mult
# v6 = (h0, h1), v8 = h2
# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
#
# Output: v7, v10, v11
#
SYM_FUNC_START_LOCAL(Poly1305_mult)
	#
	# d0 = h0 * r0 + h1 * s1
	vmsumudm	7, 6, 0, 9	# h0 * r0, h1 * s1

	# d1 = h0 * r1 + h1 * r0 + h2 * s1
	vmsumudm	11, 6, 1, 9	# h0 * r1, h1 * r0
	vmsumudm	10, 8, 2, 11	# d1 += h2 * s1

	# d2 = h2 * r0
	vmsumudm	11, 8, 3, 9	# d2 = h2 * r0
	blr
SYM_FUNC_END(Poly1305_mult)

#
# carry reduction
# h %= p
#
# Input: v7, v10, v11
# Output: r27, r28, r29
#
SYM_FUNC_START_LOCAL(Carry_reduction)
	mfvsrld	27, 32+7
	mfvsrld	28, 32+10
	mfvsrld	29, 32+11
	mfvsrd	20, 32+7	# h0.h
	mfvsrd	21, 32+10	# h1.h

	addc	28, 28, 20
	adde	29, 29, 21
	srdi	22, 29, 0x2
	sldi	23, 22, 0x2
	add	23, 23, 22	# (h2 >> 2) * 5
	addc	27, 27, 23	# h0
	addze	28, 28		# h1
	andi.	29, 29, 0x3	# h2
	blr
SYM_FUNC_END(Carry_reduction)

#
# poly1305 multiplication
# h *= r, h %= p
# d0 = h0 * r0 + h1 * s1
# d1 = h0 * r1 + h1 * r0 + h2 * s1
# d2 = h2 * r0
#
#
# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
#  - no highbit if final leftover block (highbit = 0)
#
SYM_FUNC_START(poly1305_64s)
	cmpdi	5, 0
	ble	Out_no_poly1305_64

	mflr	0
	std	0, 16(1)
	stdu	1,-400(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	# Init poly1305
	bl	Poly1305_init_64

	li	25, 0	# offset to inp and outp

	add	11, 25, 4

	# load h
	# h0, h1, h2
	ld	27, 0(3)
	ld	28, 8(3)
	lwz	29, 16(3)

	li	30, 16
	divdu	31, 5, 30

	mtctr	31

	mr	24, 6	# highbit

Loop_block_64:
	vxor	9, 9, 9

	ld	20, 0(11)
	ld	21, 8(11)
	addi	11, 11, 16

	addc	27, 27, 20
	adde	28, 28, 21
	adde	29, 29, 24

	li	22, 0
	mtvsrdd	32+6, 27, 28	# h0, h1
	mtvsrdd	32+8, 29, 22	# h2

	bl	Poly1305_mult

	bl	Carry_reduction

	bdnz	Loop_block_64

	std	27, 0(3)
	std	28, 8(3)
	stw	29, 16(3)

	li	3, 0

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi	1, 1, 400
	ld	0, 16(1)
	mtlr	0

	blr

Out_no_poly1305_64:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_64s)

#
# Input: r3 = h, r4 = s, r5 = mac
# mac = h + s
#
SYM_FUNC_START(poly1305_emit_64)
	ld	10, 0(3)
	ld	11, 8(3)
	ld	12, 16(3)

	# compare modulus
	# h + 5 + (-p)
	mr	6, 10
	mr	7, 11
	mr	8, 12
	addic.	6, 6, 5
	addze	7, 7
	addze	8, 8
	srdi	9, 8, 2	# overflow?
	cmpdi	9, 0
	beq	Skip_h64
	mr	10, 6
	mr	11, 7
	mr	12, 8

Skip_h64:
	ld	6, 0(4)
	ld	7, 8(4)
	addc	10, 10, 6
	adde	11, 11, 7
	addze	12, 12

	std	10, 0(5)
	std	11, 8(5)
	blr
SYM_FUNC_END(poly1305_emit_64)

SYM_DATA_START_LOCAL(RMASK)
.align 5
rmask:
.byte	0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long	0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
.long	0x1a, 0x00, 0x1a, 0x00
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000
.long	0x00010203, 0x04050607, 0x10111213, 0x14151617
.long	0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
SYM_DATA_END(RMASK)
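
#
# For reference: the per-block math described in the file header (radix 2^26,
# r*5 folding, carry reduction) written out as plain C. This is an
# illustrative, untested sketch only, not a kernel API: the function name
# poly1305_block_26() and its argument layout are invented for this comment,
# and it assumes a little-endian target with the 5 x 26-bit limb split used
# above. The vector path applies the same step to four blocks at once with
# r^4..r^1; the poly1305_64s path does the same multiply in radix 2^64 with
# s1 = (r1 >> 2) * 5.
#
#	#include <stdint.h>
#	#include <string.h>
#
#	/* a = (a + m + (hibit << 128)) * r mod (2^130 - 5), radix 2^26 */
#	static void poly1305_block_26(uint32_t a[5], const uint32_t r[5],
#				      const uint8_t m[16], uint32_t hibit)
#	{
#		uint32_t s1 = r[1] * 5, s2 = r[2] * 5, s3 = r[3] * 5, s4 = r[4] * 5;
#		uint64_t d[5], c, t0, t1;
#
#		memcpy(&t0, m, 8);		/* little-endian message words */
#		memcpy(&t1, m + 8, 8);
#
#		/* a += m, split into 26-bit limbs */
#		a[0] += t0 & 0x3ffffff;
#		a[1] += (t0 >> 26) & 0x3ffffff;
#		a[2] += ((t0 >> 52) | (t1 << 12)) & 0x3ffffff;
#		a[3] += (t1 >> 14) & 0x3ffffff;
#		a[4] += (uint32_t)(t1 >> 40) | (hibit << 24);
#
#		/* a *= r mod p: the r*5 terms fold limbs above 2^130 back in,
#		 * matching the p[0]..p[4] comment above mul_odd/mul_even */
#		d[0] = (uint64_t)a[0]*r[0] + (uint64_t)a[1]*s4 + (uint64_t)a[2]*s3 + (uint64_t)a[3]*s2 + (uint64_t)a[4]*s1;
#		d[1] = (uint64_t)a[0]*r[1] + (uint64_t)a[1]*r[0] + (uint64_t)a[2]*s4 + (uint64_t)a[3]*s3 + (uint64_t)a[4]*s2;
#		d[2] = (uint64_t)a[0]*r[2] + (uint64_t)a[1]*r[1] + (uint64_t)a[2]*r[0] + (uint64_t)a[3]*s4 + (uint64_t)a[4]*s3;
#		d[3] = (uint64_t)a[0]*r[3] + (uint64_t)a[1]*r[2] + (uint64_t)a[2]*r[1] + (uint64_t)a[3]*r[0] + (uint64_t)a[4]*s4;
#		d[4] = (uint64_t)a[0]*r[4] + (uint64_t)a[1]*r[3] + (uint64_t)a[2]*r[2] + (uint64_t)a[3]*r[1] + (uint64_t)a[4]*r[0];
#
#		/* carry reduction: propagate 26-bit carries; the top carry wraps
#		 * into limb 0 multiplied by 5, since 2^130 == 5 (mod p) */
#		c = d[0] >> 26;            a[0] = d[0] & 0x3ffffff;
#		d[1] += c; c = d[1] >> 26; a[1] = d[1] & 0x3ffffff;
#		d[2] += c; c = d[2] >> 26; a[2] = d[2] & 0x3ffffff;
#		d[3] += c; c = d[3] >> 26; a[3] = d[3] & 0x3ffffff;
#		d[4] += c; c = d[4] >> 26; a[4] = d[4] & 0x3ffffff;
#		a[0] += (uint32_t)c * 5;
#		a[1] += a[0] >> 26;
#		a[0] &= 0x3ffffff;
#	}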