/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/asm_linkage.h>

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/* ARGSUSED */
uint64_t
big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }

/* ARGSUSED */
uint64_t
big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }

/* ARGSUSED */
void
big_sqr_vec(uint64_t *r, uint64_t *a, int len)
{}

#else /* lint */

/ ------------------------------------------------------------------------
/
/ Implementation of big_mul_set_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ As defined in Sun's bignum library for pkcs11, bignums are
/ composed of an array of 64-bit "digits" or "chunks" along with
/ descriptive information.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
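/ For reference, a C-level sketch of the operation implemented below.
/ It is illustrative only and not part of the build; the 128-bit
/ intermediate is written with an assumed __uint128_t type, which this
/ file does not otherwise use.
/
/	uint64_t
/	big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/	{
/		uint64_t cy = 0;
/
/		while (len-- > 0) {
/			__uint128_t p = (__uint128_t)*a++ * digit + cy;
/			*r++ = (uint64_t)p;		/* lo(p) */
/			cy = (uint64_t)(p >> 64);	/* hi(p) */
/		}
/		return (cy);
/	}
/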
	ENTRY(big_mul_set_vec)
	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L17

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

.L15:
	cmpq	$8, %r8			/ 8 - len
	jb	.L16
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi
	addq	$64, %rdi
	subq	$8, %r8

	jz	.L17
	jmp	.L15

.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17


.L17:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_set_vec)


/ ------------------------------------------------------------------------
/
/ Implementation of big_mul_add_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ As defined in Sun's bignum library for pkcs11, bignums are
/ composed of an array of 64-bit "digits" or "chunks" along with
/ descriptive information.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
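/ For reference, a C-level sketch of the operation implemented below
/ (illustrative only, not part of the build; __uint128_t is again an
/ assumed stand-in for the 128-bit intermediate):
/
/	uint64_t
/	big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/	{
/		uint64_t cy = 0;
/
/		while (len-- > 0) {
/			__uint128_t p = (__uint128_t)*a++ * digit + *r + cy;
/			*r++ = (uint64_t)p;		/* lo(p) */
/			cy = (uint64_t)(p >> 64);	/* hi(p) */
/		}
/		return (cy);
/	}
/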
	ENTRY(big_mul_add_vec)
	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

.L25:
	cmpq	$8, %r8			/ 8 - len
	jb	.L26
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi
	addq	$64, %rdi
	subq	$8, %r8

	jz	.L27
	jmp	.L25

.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27


.L27:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_add_vec)


/ void
/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)

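/ r = a * a, where a has len digits and r has room for 2 * len digits.
/
/ For reference, a C-level sketch of the strategy used below
/ (illustrative only, not part of the build):
/
/	void
/	big_sqr_vec(uint64_t *r, uint64_t *a, int len)
/	{
/		int i;
/
/		/* Pass 1: accumulate every cross product a[i] * a[j], */
/		/* i < j, at digit position i + j of r, using the two  */
/		/* routines above.                                     */
/		r[len] = big_mul_set_vec(r + 1, a + 1, len - 1, a[0]);
/		for (i = 1; i < len - 1; i++)
/			r[len + i] = big_mul_add_vec(r + 1 + 2 * i,
/			    a + i + 1, len - 1 - i, a[i]);
/
/		/* Pass 2 (inlined after the calls below): double the  */
/		/* cross-product digits, since a[i] * a[j] appears     */
/		/* twice in the square, and add in a[i]^2 at positions */
/		/* 2*i and 2*i+1, carrying up to r[2*len - 1].         */
/	}
/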
	ENTRY(big_sqr_vec)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:

/ No more function calls after this.
/ Restore arguments to registers.
/ However, don't use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction. Use %r8, instead.
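/
/ Second pass: every cross product accumulated above occurs twice in
/ the square, so each of those digits is doubled, and a[row]^2 is added
/ in at digit positions 2*row and 2*row+1.  The running carry is kept
/ in %r9, with %rcx catching the case where it needs a 65th bit.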
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8 == arg3 == len

	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1
.L33:
	cmpq	%r8, %r11		/ len - row
	jae	.L34			/ while (row < len)

	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

	SET_SIZE(big_sqr_vec)

#endif /* lint */
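/*
 * Usage note (illustrative only): big_sqr_vec() writes 2 * len result
 * digits, so a hypothetical caller squaring a 4-digit bignum reserves
 * 8 result digits:
 *
 *	uint64_t a[4] = { 4, 3, 2, 1 };
 *	uint64_t r[8];
 *
 *	big_sqr_vec(r, a, 4);
 *
 * Similarly, callers of big_mul_set_vec()/big_mul_add_vec() store the
 * returned carry as the next most significant digit, as big_sqr_vec()
 * itself does above.
 */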