/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/*
 * C-language stand-ins compiled only when linting; the real
 * implementations are the amd64 assembly routines in the #else
 * branch below.  Bodies are deliberately empty/zero.
 */

/* ARGSUSED */
uint64_t
big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }

/* ARGSUSED */
uint64_t
big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }

/* ARGSUSED */
void
big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
{}

#else	/* lint */

/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_set_vec which exploits
/  the 64X64->128 bit unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 32-bit "digits" along with descriptive
/  information.  The arrays of digits are only required to be
/  aligned on 32-bit boundary.
/ This implementation works only
/ when the two factors and the result happen to be 64 bit aligned
/ and have an even number of digits.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register use (System V AMD64 ABI args):
/   %rdi = r, %rsi = a, %rdx = len (copied to %r8 because the hardware
/   MUL writes %rdx), %rcx = digit.
/   %r9 holds the running carry, %r11 holds the prefetched next a[i],
/   %rdx:%rax receives each 128-bit product from mulq.
/ The main loop (.L15) processes 8 digits per pass; .L16 finishes the
/ remaining len % 8 digits (at most 7) one at a time.
/
	ENTRY(big_mul_set_vec64)
	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L17

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

.L15:
	cmpq	$8, %r8			/ 8 - len
	jb	.L16
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8 (64-bit digits)
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L17
	jmp	.L15

/ Tail: handle the remaining 1..7 digits, one multiply each.
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17
	/ at most 7 digits can reach .L16, so fall through here with r8 == 0

.L17:
	movq	%r9, %rax		/ return carry
	ret
	SET_SIZE(big_mul_set_vec64)

/
/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_add_vec which exploits
/  the 64X64->128 bit unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 32-bit "digits" along with descriptive
/  information.  The arrays of digits are only required to be
/  aligned on 32-bit boundary.  This implementation works only
/  when the two factors and the result happen to be 64 bit aligned
/  and have an even number of digits.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Same structure as big_mul_set_vec64 above, with one extra add per
/ digit: each 128-bit product also accumulates the existing r[i]
/ (prefetched into %r10) before the carry %r9 is folded in.
/
	ENTRY(big_mul_add_vec64)
	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

.L25:
	cmpq	$8, %r8			/ 8 - len
	jb	.L26
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8 (64-bit digits)
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L27
	jmp	.L25

/ Tail: handle the remaining 1..7 digits, one multiply each.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27
	/ at most 7 digits can reach .L26, so fall through here with r8 == 0

.L27:
	movq	%r9, %rax		/ return carry
	ret
	SET_SIZE(big_mul_add_vec64)


/ void
/ big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
/
/ Computes r = a * a, where a has len 64-bit digits and r receives
/ 2 * len digits.  Strategy, as visible below:
/   1. Accumulate the off-diagonal products a[i]*a[j] (i < j) into
/      r[1..2*len-2] via big_mul_set_vec64 / big_mul_add_vec64.
/   2. The .L33 loop then doubles that sum and adds the diagonal
/      squares a[row]**2, propagating carries in %r9/%rcx.
/ Callee-saved registers %rbx, %rbp, %r12-%r15 are saved and restored;
/ the three incoming args are parked on the stack across the calls.
/
	ENTRY(big_sqr_vec64)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	/ Phase 1: r[1..] = a[1..] * a[0]
	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec64
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	/ accumulate a[row+1..] * a[row] into r[2*row+1..]
	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec64
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:

/ No more function calls after this.
/ Restore arguments to registers.
/ However, don't use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8 == arg3 == len

	/ Phase 2 prologue: r[0] = lo(a[0]^2); r[1] = (r[1] << 1) + carry
	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1
.L33:
	cmpq	%r8, %r11		/ len - row
	jae	.L34			/ while (row < len)

	/ r[col] = lo64((r[col] << 1) + a[row]**2 + cy)
	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break

	/ r[col+1] = lo64((r[col+1] << 1) + cy)
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

	SET_SIZE(big_sqr_vec64)

#endif	/* lint */