/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>

#if defined(__lint)

#include <sys/types.h>

uint32_t
bignum_use_sse2()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r()
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{}

/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{}

#if defined(MMX_MANAGE)

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{}

#endif	/* MMX_MANAGE */

/*
 * UMUL
 *
 */

/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

#else	/* __lint */

#if defined(MMX_MANAGE)

#if defined(_KERNEL)

#define	KPREEMPT_DISABLE	call kpr_disable
#define	KPREEMPT_ENABLE		call kpr_enable
#define	TEST_TS(reg)	\
	movl	%cr0, reg;	\
	clts;	\
	testl	$CR0_TS, reg

#else	/* _KERNEL */

#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE

#define	TEST_TS(reg)	\
	movl	$0, reg;	\
	testl	$CR0_TS, reg

#endif	/* _KERNEL */

#define	MMX_SIZE	8
#define	MMX_ALIGN	8

#define	SAVE_MMX_PROLOG(sreg, nreg)	\
	subl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;	\
	movl	%esp, sreg;	\
	addl	$MMX_ALIGN, sreg;	\
	andl	$-1![MMX_ALIGN-1], sreg;

#define	RSTOR_MMX_EPILOG(nreg)	\
	addl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;

#define	SAVE_MMX_0TO4(sreg)	\
	SAVE_MMX_PROLOG(sreg, 5);	\
	movq	%mm0, 0(sreg);	\
	movq	%mm1, 8(sreg);	\
	movq	%mm2, 16(sreg);	\
	movq	%mm3, 24(sreg);	\
	movq	%mm4, 32(sreg)

#define	RSTOR_MMX_0TO4(sreg)	\
	movq	0(sreg), %mm0;	\
	movq	8(sreg), %mm1;	\
	movq	16(sreg), %mm2;	\
	movq	24(sreg), %mm3;	\
	movq	32(sreg), %mm4;	\
	RSTOR_MMX_EPILOG(5)

#endif	/* MMX_MANAGE */

/ Note: this file contains implementations for
/	big_mul_set_vec()
/	big_mul_add_vec()
/	big_mul_vec()
/	big_sqr_vec()
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum implementation gets "imprinted"
/ on the correct implementation, at the time of first use,
/ so none of the code for the other implementations is ever
/ executed.  So, it is a no-brainer to lay out the code to minimize
/ the "footprint" of executed code.
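
/ For orientation only: the C side of the bignum library is expected to do
/ that imprinting once, at first use, along these lines.  This is a sketch;
/ the function-pointer name is illustrative and not the actual bignum code:
/
/	static uint32_t (*mul_set_vec)(uint32_t *, uint32_t *, int, uint32_t);
/
/	if (mul_set_vec == NULL)
/		mul_set_vec = bignum_use_sse2() ?
/		    big_mul_set_vec_sse2 : big_mul_set_vec_umul;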

/ Can we use SSE2 instructions?  Return value is non-zero
/ if we can.
/
/ Note:
/ Using the cpuid instruction directly would work equally
/ well in userland and in the kernel, but we do not use the
/ cpuid instruction in the kernel; we use the x86_feature
/ variable instead.  This means we honor any decisions
/ the kernel startup code may have made in setting this
/ variable, including disabling SSE2 because of settings
/ in /etc/system.  It might even be a good idea to honor
/ this kind of setting in userland, as well, but the variable,
/ x86_feature, is not readily available to userland processes.
/
/ uint32_t
/ bignum_use_sse2()

	ENTRY(bignum_use_sse2)
#if defined(_KERNEL)
	movl	x86_feature, %eax
	andl	$X86_SSE2, %eax
#else	/* _KERNEL */
	pushl	%ebx
	movl	$1, %eax		/ Get feature information
	cpuid
	movl	%edx, %eax		/ set return value
	popl	%ebx
	andl	$CPUID_INTC_EDX_SSE2, %eax
#endif	/* _KERNEL */
	ret
	SET_SIZE(bignum_use_sse2)


/ ------------------------------------------------------------------------
/		SSE2 Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
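
/ For reference, a C-level sketch of what big_mul_set_vec*() and
/ big_mul_add_vec*() compute (illustrative only; not part of the build):
/
/	uint32_t
/	big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p, cy = 0;
/		int i;
/
/		for (i = 0; i < len; ++i) {
/			p = (uint64_t)a[i] * digit + cy;
/			r[i] = (uint32_t)p;	/* product[31..0] */
/			cy = p >> 32;		/* product[63..32] */
/		}
/		return ((uint32_t)cy);
/	}
/
/ big_mul_add_vec() is the same, except that each step also adds in the
/ old r[i]:  p = (uint64_t)a[i] * digit + r[i] + cy;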

#define	UNROLL	8
#define	UNROLL32	32

	ENTRY(big_mul_set_vec_sse2_r)
	xorl	%eax, %eax	/ if (len == 0) return (0);
	testl	%ecx, %ecx
	jz	.L17

	pxor	%mm0, %mm0	/ cy = 0

.L15:
	cmpl	$UNROLL, %ecx
	jl	.L16
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jz	.L17
	jmp	.L15

.L16:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L17:
	movd	%mm0, %eax	/ return (cy)
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_set_vec_sse2_r)


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r	8(%ebp)		%edx
/ a	12(%ebp)	%ebx
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.

#if defined(MMX_MANAGE)
	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx
	jnz	.setvec_no_save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.setvec_rtn

.setvec_no_save:
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi

.setvec_rtn:
	emms
	popl	%ebx
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	movl	%esi, %eax
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

	ENTRY(big_mul_set_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r	8(%ebp)		%edx
/ a	12(%ebp)	%ebx
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%mm3

	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

#endif	/* MMX_MANAGE */


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.

	ENTRY(big_mul_add_vec_sse2_r)
	xorl	%eax, %eax
	testl	%ecx, %ecx
	jz	.L27

	pxor	%mm0, %mm0	/ cy = 0

.L25:
	cmpl	$UNROLL, %ecx
	jl	.L26
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	movd	28(%edx), %mm2	/ 8: mm2 = r[i]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 8: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 8: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jz	.L27
	jmp	.L25

.L26:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L27:
	movd	%mm0, %eax
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_add_vec_sse2_r)


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r	8(%ebp)		%edx
/ a	12(%ebp)	%ebx
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_add_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
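
/ For orientation, each MMX_MANAGE kernel wrapper in this file follows
/ roughly the pattern below (a C-like sketch of the assembly; kpreempt_*,
/ read_cr0/write_cr0, and the save/restore steps are stand-ins for the
/ KPREEMPT_*, TEST_TS, and SAVE/RSTOR_MMX_0TO4 macros defined above):
/
/	kpreempt_disable();
/	cr0 = read_cr0();		/* remember CR0, including TS */
/	clts();				/* use MMX without taking a #NM trap */
/	if ((cr0 & CR0_TS) == 0) {	/* registers hold live FPU/MMX state */
/		save %mm0-%mm4 to the stack;
/		cy = big_mul_add_vec_sse2_r(r, a, len, digit);
/		restore %mm0-%mm4;
/	} else {
/		cy = big_mul_add_vec_sse2_r(r, a, len, digit);
/	}
/	emms();
/	write_cr0(cr0);			/* put TS back the way it was */
/	kpreempt_enable();
/	return (cy);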

#if defined(MMX_MANAGE)

	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx
	jnz	.addvec_no_save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.addvec_rtn

.addvec_no_save:
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi

.addvec_rtn:
	emms
	popl	%ebx
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	movl	%esi, %eax
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

	ENTRY(big_mul_add_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2_nsv)


#else	/* !defined(MMX_MANAGE) */

	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

#endif	/* MMX_MANAGE */


/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/	int i;
/
/	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/	for (i = 1; i < blen; ++i)
/		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
/ }


#if defined(MMX_MANAGE)
	ENTRY(big_mul_vec_sse2_fc)
#else
	ENTRY(big_mul_vec_sse2)
#endif
	subl	$0x8, %esp
	pushl	%ebx
	pushl	%ebp
	pushl	%esi
	pushl	%edi
	movl	40(%esp), %eax
	movl	%eax, 20(%esp)
	pushl	(%eax)
	movl	40(%esp), %edi
	pushl	%edi
	movl	40(%esp), %esi
	pushl	%esi
	movl	40(%esp), %ebx
	pushl	%ebx
#if defined(MMX_MANAGE)
	call	big_mul_set_vec_sse2_nsv
#else
	call	big_mul_set_vec_sse2
#endif
	addl	$0x10, %esp
	movl	%eax, (%ebx,%edi,4)
	movl	44(%esp), %eax
	movl	%eax, 16(%esp)
	cmpl	$0x1, %eax
	jle	.mulvec_rtn
	movl	$0x1, %ebp

	.align 16
.mulvec_add:
	movl	20(%esp), %eax
	pushl	(%eax,%ebp,4)
	pushl	%edi
	pushl	%esi
	leal	(%ebx,%ebp,4), %eax
	pushl	%eax
#if defined(MMX_MANAGE)
	call	big_mul_add_vec_sse2_nsv
#else
	call	big_mul_add_vec_sse2
#endif
	addl	$0x10, %esp
	leal	(%ebp,%edi), %ecx
	movl	%eax, (%ebx,%ecx,4)
	incl	%ebp
	cmpl	16(%esp), %ebp
	jl	.mulvec_add
.mulvec_rtn:
#if defined(MMX_MANAGE)
	emms
#endif
	popl	%edi
	popl	%esi
	popl	%ebp
	popl	%ebx
	addl	$0x8, %esp
	ret
#if defined(MMX_MANAGE)
	SET_SIZE(big_mul_vec_sse2_fc)
#else
	SET_SIZE(big_mul_vec_sse2)
#endif
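
/ Note: as the C pseudocode above shows, big_mul_vec_sse2() writes digits
/ r[0 .. alen + blen - 1], so the caller must supply a result vector with
/ room for alen + blen digits.  For example (illustrative only; ALEN and
/ BLEN are placeholders, not names used by this file):
/
/	uint32_t r[ALEN + BLEN];	/* product needs alen + blen digits */
/	big_mul_vec_sse2(r, a, ALEN, b, BLEN);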

#if defined(MMX_MANAGE)

	ENTRY(big_mul_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	subl	$8, %esp
	pushl	%edi
	KPREEMPT_DISABLE
	TEST_TS(%eax)
	movl	%eax, -8(%ebp)
	jnz	.mulvec_no_save
	SAVE_MMX_0TO4(%edi)
	movl	%edi, -4(%ebp)
.mulvec_no_save:
	movl	24(%ebp), %eax		/ blen
	pushl	%eax
	movl	20(%ebp), %eax		/ b
	pushl	%eax
	movl	16(%ebp), %eax		/ alen
	pushl	%eax
	movl	12(%ebp), %eax		/ a
	pushl	%eax
	movl	8(%ebp), %eax		/ r
	pushl	%eax
	call	big_mul_vec_sse2_fc
	addl	$20, %esp
	movl	-8(%ebp), %eax
	testl	$CR0_TS, %eax
	jnz	.mulvec_no_rstr
	movl	-4(%ebp), %edi
	RSTOR_MMX_0TO4(%edi)
.mulvec_no_rstr:
	movl	%eax, %cr0
	KPREEMPT_ENABLE
	popl	%edi
	leave
	ret
	SET_SIZE(big_mul_vec_sse2)

#endif	/* MMX_MANAGE */


#undef	UNROLL
#undef	UNROLL32


/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is
/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.

#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2_fc)
#else
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
#endif

	pushl	%ebx
	pushl	%edi
	pushl	%esi

	/ r[1..alen] = a[0] * a[1..alen-1]

	movl	8(%ebp), %edi		/ r = arg(r)
	movl	12(%ebp), %esi		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)
	movd	%ecx, %mm4		/ save_cnt = arg(alen)
	leal	4(%edi), %edx		/ dst = &r[1]
	movl	%esi, %ebx		/ src = a
	movd	0(%ebx), %mm3		/ mm3 = a[0]
	leal	4(%ebx), %ebx		/ src = &a[1]
	subl	$1, %ecx		/ --cnt
	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
	movl	%edi, %edx		/ dst = r
	movl	%esi, %ebx		/ src = a
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy

/ /* High-level vector C pseudocode */
/ for (i = 1; i < alen-1; ++i)
/	r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/ /* Same thing, but slightly lower level C-like pseudocode */
/ i = 1;
/ r = &arg_r[2*i + 1];
/ a = &arg_a[i + 1];
/ digit = arg_a[i];
/ cnt = alen - 3;
/ while (cnt != 0) {
/	r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/	r += 2;
/	++a;
/	--cnt;
/ }
/
/ /* Same thing, but even lower level
/  * For example, pointers are raw pointers,
/  * with no scaling by object size.
/  */
/ r = arg_r + 12;	/* i == 1; 2i + 1 == 3; 4*3 == 12; */
/ a = arg_a + 8;
/ digit = *(arg_a + 4);
/ cnt = alen - 3;
/ while (cnt != 0) {
/	cy = big_mul_add_vec_sse2_r();
/	*(r + 4 * cnt) = cy;
/	r += 8;
/	a += 4;
/	--cnt;
/ }

	leal	4(%edi), %edi		/ r += 4; r = &r[1]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]
	movd	%mm4, %ecx		/ cnt = save
	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
	movd	%ecx, %mm4		/ save_cnt
	jecxz	.L32			/ while (cnt != 0) {
.L31:
	movd	0(%esi), %mm3		/ digit = a[i]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
	movl	%edi, %edx		/ edx = r
	movl	%esi, %ebx		/ ebx = a
	cmp	$1, %ecx		/ The last triangle term is special
	jz	.L32
	call	big_mul_add_vec_sse2_r
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
	subl	$1, %ecx		/ --cnt
	movd	%ecx, %mm4		/ save_cnt = cnt
	jmp	.L31			/ }

.L32:
	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy
	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
	pxor	%mm2, %mm2
	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0

	movl	8(%ebp), %edx		/ r = arg(r)
	movl	12(%ebp), %ebx		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)

	/ compute low-order corner
	/ p = a[0]**2
	/ r[0] = lo32(p)
	/ cy = hi32(p)
	movd	0(%ebx), %mm2		/ mm2 = a[0]
	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)

	/ p = 2 * r[1]
	/ t = p + cy
	/ r[1] = lo32(t)
	/ cy = hi32(t)
	movd	4(%edx), %mm1		/ mm1 = r[1]
	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
	paddq	%mm1, %mm2		/ mm2 = t = p + cy
	movd	%mm2, 4(%edx)		/ r[1] = lo32(t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)

	/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
	subl	$2, %ecx		/ cnt = alen - 2
.L34:
	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	movd	%mm2, %eax
	movd	%eax, %mm1		/ mm1 = lo32(t)
	psrlq	$32, %mm2		/ mm2 = hi32(t)

	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)
	psrlq	$32, %mm1
	paddq	%mm1, %mm2

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy
	leal	8(%edx), %edx		/ r += 2
	leal	4(%ebx), %ebx		/ ++a
	subl	$1, %ecx		/ --cnt
	jnz	.L34

	/ Carry from the last triangle term must participate in doubling,
	/ but this step is not paired up with squaring an element
	/ of the inner diagonal.
	/ r[$-3..$-2] += 2 * r[$-3..$-2] + cy
	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy

	/ compute high-order corner and add it in
	/ p = a[alen - 1]**2
	/ t = p + cy
	/ r[alen + alen - 2] += lo32(t)
	/ cy = hi32(t)
	/ r[alen + alen - 1] = cy
	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
	movd	8(%edx), %mm3		/ mm3 = r[$-2]
	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)
	movd	12(%edx), %mm3
	paddq	%mm3, %mm2
	movd	%mm2, 12(%edx)		/ r[$-1] += cy

.L35:
	emms
	popl	%esi
	popl	%edi
	popl	%ebx

#if defined(MMX_MANAGE)
	ret
	SET_SIZE(big_sqr_vec_sse2_fc)
#else
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)
#endif


#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx
	jnz	.sqr_no_save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	call	big_sqr_vec_sse2_fc
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.sqr_rtn

.sqr_no_save:
	call	big_sqr_vec_sse2_fc

.sqr_rtn:
	popl	%ebx
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)

#endif	/* MMX_MANAGE */

/ ------------------------------------------------------------------------
/		UMUL Implementations
/ ------------------------------------------------------------------------


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses the x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	8(%ebp)		%edx	%edi
/ a	12(%ebp)	%ebx	%esi
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%esi

	ENTRY(big_mul_set_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%ebp), %ecx
	xorl	%ebx, %ebx	/ cy = 0
	testl	%ecx, %ecx
	movl	8(%ebp), %edi
	movl	12(%ebp), %esi
	je	.L57

.L55:
	movl	(%esi), %eax	/ eax = a[i]
	leal	4(%esi), %esi	/ ++a
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	addl	%ebx, %eax
	adcl	$0, %edx	/ edx:eax = a[i] * digit + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	movl	%edx, %ebx	/ cy = product[63..32]
	leal	4(%edi), %edi	/ ++r
	decl	%ecx		/ --len
	jnz	.L55		/ while (len != 0)
.L57:
	movl	%ebx, %eax
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_set_vec_umul)


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses the x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	8(%ebp)		%edx	%edi
/ a	12(%ebp)	%ebx	%esi
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%esi

	ENTRY(big_mul_add_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%ebp), %ecx
	xorl	%ebx, %ebx	/ cy = 0
	testl	%ecx, %ecx
	movl	8(%ebp), %edi
	movl	12(%ebp), %esi
	je	.L67
	.align 4
.L65:
	movl	(%esi), %eax	/ eax = a[i]
	leal	4(%esi), %esi	/ ++a
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	addl	(%edi), %eax
	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i]
	addl	%ebx, %eax
	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i] + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	movl	%edx, %ebx	/ cy = product[63..32]
	leal	4(%edi), %edi	/ ++r
	decl	%ecx		/ --len
	jnz	.L65		/ while (len != 0)
.L67:
	movl	%ebx, %eax
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_add_vec_umul)

#endif	/* __lint */