/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/asm_linkage.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>

#if defined(__lint)

#include <sys/types.h>

uint32_t
bignum_use_sse2()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r()
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{}

/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{}

#if defined(MMX_MANAGE)

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{}

#endif	/* MMX_MANAGE */

/*
 * UMUL
 *
 */

/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

#else	/* __lint */

#if defined(MMX_MANAGE)

#if defined(_KERNEL)

#define	KPREEMPT_DISABLE	call kpr_disable
#define	KPREEMPT_ENABLE		call kpr_enable
#define	TEST_TS(reg) \
	movl	%cr0, reg; \
	clts; \
	testl	$CR0_TS, reg

#else	/* _KERNEL */

#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE

#define	TEST_TS(reg) \
	movl	$0, reg; \
	testl	$CR0_TS, reg

#endif	/* _KERNEL */

#define	MMX_SIZE	8
#define	MMX_ALIGN	8

#define	SAVE_MMX_PROLOG(sreg, nreg) \
	subl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp; \
	movl	%esp, sreg; \
	addl	$MMX_ALIGN, sreg; \
	andl	$-1![MMX_ALIGN-1], sreg;

#define	RSTOR_MMX_EPILOG(nreg) \
	addl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;

#define	SAVE_MMX_0TO4(sreg) \
	SAVE_MMX_PROLOG(sreg, 5); \
	movq	%mm0, 0(sreg); \
	movq	%mm1, 8(sreg); \
	movq	%mm2, 16(sreg); \
	movq	%mm3, 24(sreg); \
	movq	%mm4, 32(sreg)

#define	RSTOR_MMX_0TO4(sreg) \
	movq	0(sreg), %mm0; \
	movq	8(sreg), %mm1; \
	movq	16(sreg), %mm2; \
	movq	24(sreg), %mm3; \
	movq	32(sreg), %mm4; \
	RSTOR_MMX_EPILOG(5)

#endif	/* MMX_MANAGE */

/ Note: this file contains implementations for
/	big_mul_set_vec()
/	big_mul_add_vec()
/	big_mul_vec()
/	big_sqr_vec()
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum implementation gets "imprinted"
/ on the correct implementation at the time of first use,
/ so none of the code for the other implementations is ever
/ executed.  So, it is a no-brainer to lay out the code to minimize
/ the "footprint" of executed code.

/ Can we use SSE2 instructions?  Return value is non-zero
/ if we can.
/
/ Note:
/	Using the cpuid instruction directly would work equally
/	well in userland and in the kernel, but we do not use the
/	cpuid instruction in the kernel; we use x86_featureset
/	instead.  This means we honor any decisions the kernel
/	startup code may have made in setting this variable,
/	including disabling SSE2.  It might even be a good idea
/	to honor this kind of setting in userland as well, but
/	the variable x86_featureset is not readily available to
/	userland processes.
/
/ uint32_t
/ bignum_use_sse2()

	ENTRY(bignum_use_sse2)
#if defined(_KERNEL)
	xor	%eax, %eax
	bt	$X86FSET_SSE2, x86_featureset
	adc	%eax, %eax
#else	/* _KERNEL */
	pushl	%ebx
	movl	$1, %eax		/ Get feature information
	cpuid
	movl	%edx, %eax		/ set return value
	popl	%ebx
	andl	$CPUID_INTC_EDX_SSE2, %eax
#endif	/* _KERNEL */
	ret
	SET_SIZE(bignum_use_sse2)


/ ------------------------------------------------------------------------
/		SSE2 Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
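/
/ For reference, a C-level sketch of what the unrolled loop below computes
/ (illustrative only, not the actual entry point: the real routine takes
/ its arguments in the registers listed above, not on the stack, and
/ leaves MMX state for the caller to clean up with emms):
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/ {
/	uint64_t p, cy = 0;
/	int i;
/
/	for (i = 0; i < len; ++i) {
/		p = (uint64_t)digit * a[i] + cy;
/		r[i] = (uint32_t)p;	/* product[31..0] */
/		cy = p >> 32;		/* product[63..32] */
/	}
/	return ((uint32_t)cy);
/ }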

#define	UNROLL		8
#define	UNROLL32	32

	ENTRY(big_mul_set_vec_sse2_r)
	xorl	%eax, %eax	/ if (len == 0) return (0);
	testl	%ecx, %ecx
	jz	.L17

	pxor	%mm0, %mm0	/ cy = 0

.L15:
	cmpl	$UNROLL, %ecx
	jl	.L16
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jz	.L17
	jmp	.L15

.L16:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L17:
	movd	%mm0, %eax	/ return (cy)
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_set_vec_sse2_r)


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r	 8(%ebp)	%edx
/ a	12(%ebp)	%ebx
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.

#if defined(MMX_MANAGE)
	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx
	jnz	.setvec_no_save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.setvec_rtn

.setvec_no_save:
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi

.setvec_rtn:
	emms
	popl	%ebx
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	movl	%esi, %eax
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

	ENTRY(big_mul_set_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r	 8(%ebp)	%edx
/ a	12(%ebp)	%ebx
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%mm3

	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

#endif	/* MMX_MANAGE */


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.

	ENTRY(big_mul_add_vec_sse2_r)
	xorl	%eax, %eax
	testl	%ecx, %ecx
	jz	.L27

	pxor	%mm0, %mm0	/ cy = 0

.L25:
	cmpl	$UNROLL, %ecx
	jl	.L26
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	movd	28(%edx), %mm2	/ 8: mm2 = r[i]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 8: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 8: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jz	.L27
	jmp	.L25

.L26:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L27:
	movd	%mm0, %eax
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_add_vec_sse2_r)


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r	 8(%ebp)	%edx
/ a	12(%ebp)	%ebx
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_add_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
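/
/ For reference, the MMX_MANAGE kernel wrapper below follows this pattern
/ (C-like pseudocode only; the actual save-area layout and CR0.TS handling
/ live in the TEST_TS, SAVE_MMX_0TO4, and RSTOR_MMX_0TO4 macros above):
/
/ uint32_t
/ big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/ {
/	uint32_t cy;
/
/	KPREEMPT_DISABLE;
/	if (CR0.TS was clear)		/* live MMX state; preserve it */
/		save %mm0..%mm4 on the stack;
/	cy = big_mul_add_vec_sse2_r(r, a, len, digit);	/* args in registers */
/	emms;
/	if (MMX state was saved)
/		restore %mm0..%mm4;
/	restore the saved CR0;
/	KPREEMPT_ENABLE;
/	return (cy);
/ }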


#if defined(MMX_MANAGE)

	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx
	jnz	.addvec_no_save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.addvec_rtn

.addvec_no_save:
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi

.addvec_rtn:
	emms
	popl	%ebx
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	movl	%esi, %eax
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

	ENTRY(big_mul_add_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2_nsv)


#else	/* !defined(MMX_MANAGE) */

	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

#endif	/* MMX_MANAGE */


/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/	int i;
/
/	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/	for (i = 1; i < blen; ++i)
/		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
/ }


#if defined(MMX_MANAGE)
	ENTRY(big_mul_vec_sse2_fc)
#else
	ENTRY(big_mul_vec_sse2)
#endif
	subl	$0x8, %esp
	pushl	%ebx
	pushl	%ebp
	pushl	%esi
	pushl	%edi
	movl	40(%esp), %eax
	movl	%eax, 20(%esp)
	pushl	(%eax)
	movl	40(%esp), %edi
	pushl	%edi
	movl	40(%esp), %esi
	pushl	%esi
	movl	40(%esp), %ebx
	pushl	%ebx
#if defined(MMX_MANAGE)
	call	big_mul_set_vec_sse2_nsv
#else
	call	big_mul_set_vec_sse2
#endif
	addl	$0x10, %esp
	movl	%eax, (%ebx,%edi,4)
	movl	44(%esp), %eax
	movl	%eax, 16(%esp)
	cmpl	$0x1, %eax
	jle	.mulvec_rtn
	movl	$0x1, %ebp

	.align 16
.mulvec_add:
	movl	20(%esp), %eax
	pushl	(%eax,%ebp,4)
	pushl	%edi
	pushl	%esi
	leal	(%ebx,%ebp,4), %eax
	pushl	%eax
#if defined(MMX_MANAGE)
	call	big_mul_add_vec_sse2_nsv
#else
	call	big_mul_add_vec_sse2
#endif
	addl	$0x10, %esp
	leal	(%ebp,%edi), %ecx
	movl	%eax, (%ebx,%ecx,4)
	incl	%ebp
	cmpl	16(%esp), %ebp
	jl	.mulvec_add
.mulvec_rtn:
#if defined(MMX_MANAGE)
	emms
#endif
	popl	%edi
	popl	%esi
	popl	%ebp
	popl	%ebx
	addl	$0x8, %esp
	ret
#if defined(MMX_MANAGE)
	SET_SIZE(big_mul_vec_sse2_fc)
#else
	SET_SIZE(big_mul_vec_sse2)
#endif

#if defined(MMX_MANAGE)

	ENTRY(big_mul_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	subl	$8, %esp
	pushl	%edi
	KPREEMPT_DISABLE
	TEST_TS(%eax)
	movl	%eax, -8(%ebp)
	jnz	.mulvec_no_save
	SAVE_MMX_0TO4(%edi)
	movl	%edi, -4(%ebp)
.mulvec_no_save:
	movl	24(%ebp), %eax		/ blen
	pushl	%eax
	movl	20(%ebp), %eax		/ b
	pushl	%eax
	movl	16(%ebp), %eax		/ alen
	pushl	%eax
	movl	12(%ebp), %eax		/ a
	pushl	%eax
	movl	8(%ebp), %eax		/ r
	pushl	%eax
	call	big_mul_vec_sse2_fc
	addl	$20, %esp
	movl	-8(%ebp), %eax
	testl	$CR0_TS, %eax
	jnz	.mulvec_no_rstr
	movl	-4(%ebp), %edi
	RSTOR_MMX_0TO4(%edi)
.mulvec_no_rstr:
	movl	%eax, %cr0
	KPREEMPT_ENABLE
	popl	%edi
	leave
	ret
	SET_SIZE(big_mul_vec_sse2)

#endif	/* MMX_MANAGE */



#undef	UNROLL
#undef	UNROLL32


/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is,
/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.

#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2_fc)
#else
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
#endif

	pushl	%ebx
	pushl	%edi
	pushl	%esi

	/ r[1..alen] = a[0] * a[1..alen-1]

	movl	8(%ebp), %edi		/ r = arg(r)
	movl	12(%ebp), %esi		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)
	movd	%ecx, %mm4		/ save_cnt = arg(alen)
	leal	4(%edi), %edx		/ dst = &r[1]
	movl	%esi, %ebx		/ src = a
	movd	0(%ebx), %mm3		/ mm3 = a[0]
	leal	4(%ebx), %ebx		/ src = &a[1]
	subl	$1, %ecx		/ --cnt
	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
	movl	%edi, %edx		/ dst = r
	movl	%esi, %ebx		/ src = a
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy

/ /* High-level vector C pseudocode */
/ for (i = 1; i < alen-1; ++i)
/	r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/ /* Same thing, but slightly lower level C-like pseudocode */
/ i = 1;
/ r = &arg_r[2*i + 1];
/ a = &arg_a[i + 1];
/ digit = arg_a[i];
/ cnt = alen - 3;
/ while (cnt != 0) {
/	r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/	r += 2;
/	++a;
/	--cnt;
/ }
/
/ /* Same thing, but even lower level
/  * For example, pointers are raw pointers,
/  * with no scaling by object size.
/  */
/ r = arg_r + 12;	/* i == 1; 2i + 1 == 3; 4*3 == 12; */
/ a = arg_a + 8;
/ digit = *(arg_a + 4);
/ cnt = alen - 3;
/ while (cnt != 0) {
/	cy = big_mul_add_vec_sse2_r();
/	*(r + 4 * cnt) = cy;
/	r += 8;
/	a += 4;
/	--cnt;
/ }

	leal	4(%edi), %edi		/ r += 4; r = &r[1]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]
	movd	%mm4, %ecx		/ cnt = save
	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
	movd	%ecx, %mm4		/ save_cnt
	jecxz	.L32			/ while (cnt != 0) {
.L31:
	movd	0(%esi), %mm3		/ digit = a[i]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
	movl	%edi, %edx		/ edx = r
	movl	%esi, %ebx		/ ebx = a
	cmp	$1, %ecx		/ The last triangle term is special
	jz	.L32
	call	big_mul_add_vec_sse2_r
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
	subl	$1, %ecx		/ --cnt
	movd	%ecx, %mm4		/ save_cnt = cnt
	jmp	.L31			/ }

.L32:
	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy
	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
	pxor	%mm2, %mm2
	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0

	movl	8(%ebp), %edx		/ r = arg(r)
	movl	12(%ebp), %ebx		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)

	/ compute low-order corner
	/ p = a[0]**2
	/ r[0] = lo32(p)
	/ cy = hi32(p)
	movd	0(%ebx), %mm2		/ mm2 = a[0]
	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)

	/ p = 2 * r[1]
	/ t = p + cy
	/ r[1] = lo32(t)
	/ cy = hi32(t)
	movd	4(%edx), %mm1		/ mm1 = r[1]
	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
	paddq	%mm1, %mm2		/ mm2 = t = p + cy
	movd	%mm2, 4(%edx)		/ r[1] = low32(t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)

	/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
	subl	$2, %ecx		/ cnt = alen - 2
.L34:
	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	movd	%mm2, %eax
	movd	%eax, %mm1		/ mm1 = lo32(t)
	psrlq	$32, %mm2		/ mm2 = hi32(t)

	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)
	psrlq	$32, %mm1
	paddq	%mm1, %mm2

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy
	leal	8(%edx), %edx		/ r += 2
	leal	4(%ebx), %ebx		/ ++a
	subl	$1, %ecx		/ --cnt
	jnz	.L34

	/ Carry from last triangle term must participate in doubling,
	/ but this step isn't paired up with squaring the elements
	/ of the inner diagonal.
	/ r[$-3..$-2] += 2 * r[$-3..$-2] + cy
	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy

	/ compute high-order corner and add it in
	/ p = a[alen - 1]**2
	/ t = p + cy
	/ r[alen + alen - 2] += lo32(t)
	/ cy = hi32(t)
	/ r[alen + alen - 1] = cy
	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
	movd	8(%edx), %mm3		/ mm3 = r[$-2]
	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)
	movd	12(%edx), %mm3
	paddq	%mm3, %mm2
	movd	%mm2, 12(%edx)		/ r[$-1] += cy

.L35:
	emms
	popl	%esi
	popl	%edi
	popl	%ebx

#if defined(MMX_MANAGE)
	ret
	SET_SIZE(big_sqr_vec_sse2_fc)
#else
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)
#endif


#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx
	jnz	.sqr_no_save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	call	big_sqr_vec_sse2_fc
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.sqr_rtn

.sqr_no_save:
	call	big_sqr_vec_sse2_fc

.sqr_rtn:
	popl	%ebx
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)

#endif	/* MMX_MANAGE */

/ ------------------------------------------------------------------------
/		UMUL Implementations
/ ------------------------------------------------------------------------


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	 8(%ebp)	%edx	%edi
/ a	12(%ebp)	%ebx	%esi
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%esi

	ENTRY(big_mul_set_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%ebp), %ecx
	xorl	%ebx, %ebx	/ cy = 0
	testl	%ecx, %ecx
	movl	8(%ebp), %edi
	movl	12(%ebp), %esi
	je	.L57

.L55:
	movl	(%esi), %eax	/ eax = a[i]
	leal	4(%esi), %esi	/ ++a
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	addl	%ebx, %eax
	adcl	$0, %edx	/ edx:eax = a[i] * digit + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	movl	%edx, %ebx	/ cy = product[63..32]
	leal	4(%edi), %edi	/ ++r
	decl	%ecx		/ --len
	jnz	.L55		/ while (len != 0)
.L57:
	movl	%ebx, %eax
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_set_vec_umul)


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	 8(%ebp)	%edx	%edi
/ a	12(%ebp)	%ebx	%esi
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%esi

	ENTRY(big_mul_add_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%ebp), %ecx
	xorl	%ebx, %ebx	/ cy = 0
	testl	%ecx, %ecx
	movl	8(%ebp), %edi
	movl	12(%ebp), %esi
	je	.L67
	.align 4
.L65:
	movl	(%esi), %eax	/ eax = a[i]
	leal	4(%esi), %esi	/ ++a
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	addl	(%edi), %eax
	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i]
	addl	%ebx, %eax
	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i] + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	movl	%edx, %ebx	/ cy = product[63..32]
	leal	4(%edi), %edi	/ ++r
	decl	%ecx		/ --len
	jnz	.L65		/ while (len != 0)
.L67:
	movl	%ebx, %eax
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_add_vec_umul)

#endif	/* __lint */
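
/ For reference, the UMUL fall-back big_mul_add_vec_umul() above computes
/ the equivalent of this C sketch (illustrative only; unlike the SSE2
/ internal routines, this function does follow the SYSV x86 calling
/ convention, as shown in the assembly):
/
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/ {
/	uint64_t p, cy = 0;
/	int i;
/
/	for (i = 0; i < len; ++i) {
/		p = (uint64_t)digit * a[i] + r[i] + cy;
/		r[i] = (uint32_t)p;	/* product[31..0] */
/		cy = p >> 32;		/* product[63..32] */
/	}
/	return ((uint32_t)cy);
/ }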