/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#if !defined(__GNUC_AS__)

#include <sys/asm_linkage.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>

#if defined(__lint)

#include <sys/types.h>

uint32_t
bignum_use_sse2()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r()
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{}

/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{}

#if defined(MMX_MANAGE)

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{}

#endif	/* MMX_MANAGE */

/*
 * UMUL
 *
 */

/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

#else	/* __lint */

#if defined(MMX_MANAGE)

#if defined(_KERNEL)

#define	KPREEMPT_DISABLE	call kpr_disable
#define	KPREEMPT_ENABLE		call kpr_enable
#define	TEST_TS(reg)	\
	movl	%cr0, reg;	\
	clts;	\
	testl	$CR0_TS, reg

#else	/* _KERNEL */

#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE

#define	TEST_TS(reg)	\
	movl	$0, reg;	\
	testl	$CR0_TS, reg

#endif	/* _KERNEL */

#define	MMX_SIZE	8
#define	MMX_ALIGN	8

#define	SAVE_MMX_PROLOG(sreg, nreg)	\
	subl	$[MMX_SIZE \* nreg + MMX_ALIGN], %esp;	\
	movl	%esp, sreg;	\
	addl	$MMX_ALIGN, sreg;	\
	andl	$-1![MMX_ALIGN-1], sreg;

#define	RSTOR_MMX_EPILOG(nreg)	\
	addl	$[MMX_SIZE \* nreg + MMX_ALIGN], %esp;

#define	SAVE_MMX_0TO4(sreg)	\
	SAVE_MMX_PROLOG(sreg, 5);	\
	movq	%mm0, 0(sreg);	\
	movq	%mm1, 8(sreg);	\
	movq	%mm2, 16(sreg);	\
	movq	%mm3, 24(sreg);	\
	movq	%mm4, 32(sreg)

#define	RSTOR_MMX_0TO4(sreg)	\
	movq	0(sreg), %mm0;	\
	movq	8(sreg), %mm1;	\
	movq	16(sreg), %mm2;	\
	movq	24(sreg), %mm3;	\
	movq	32(sreg), %mm4;	\
	RSTOR_MMX_EPILOG(5)

#endif	/* MMX_MANAGE */

/ Note: this file contains implementations for
/	big_mul_set_vec()
/	big_mul_add_vec()
/	big_mul_vec()
/	big_sqr_vec()
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum implementation gets "imprinted"
/ on the correct implementation, at the time of first use,
/ so none of the code for the other implementations is ever
/ executed.  So, it is a no-brainer to lay out the code to minimize
/ the "footprint" of executed code.
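
/ For illustration only, the imprinting described above amounts to
/ something like the following C fragment (hypothetical names; the
/ actual selection logic lives in the common bignum C code, not here):
/
/	static uint32_t
/	(*mul_set_vec)(uint32_t *, uint32_t *, int, uint32_t) = NULL;
/
/	if (mul_set_vec == NULL) {
/		mul_set_vec = bignum_use_sse2() ?
/		    big_mul_set_vec_sse2 : big_mul_set_vec_umul;
/	}
/	cy = mul_set_vec(r, a, len, digit);
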
/ Can we use SSE2 instructions?  Return value is non-zero
/ if we can.
/
/ Note:
/	Using the cpuid instruction directly would work equally
/	well in userland and in the kernel, but we do not use the
/	cpuid instruction in the kernel, we use the x86_feature
/	variable, instead.  This means we honor any decisions
/	the kernel startup code may have made in setting this
/	variable, including disabling SSE2 because of settings
/	in /etc/system.  It might even be a good idea to honor
/	this kind of setting in userland, as well, but the variable,
/	x86_feature, is not readily available to userland processes.
/
/ uint32_t
/ bignum_use_sse2()

	ENTRY(bignum_use_sse2)
#if defined(_KERNEL)
	movl	x86_feature, %eax
	andl	$X86_SSE2, %eax
#else	/* _KERNEL */
	pushl	%ebx
	movl	$1, %eax		/ Get feature information
	cpuid
	movl	%edx, %eax		/ set return value
	popl	%ebx
	andl	$CPUID_INTC_EDX_SSE2, %eax
#endif	/* _KERNEL */
	ret
	SET_SIZE(bignum_use_sse2)


/ ------------------------------------------------------------------------
/		SSE2 Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
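/
/ For reference, the value this routine computes, expressed as a plain
/ C loop over 32-bit digits (an illustrative sketch only, not built):
/
/	uint64_t p, cy = 0;
/	for (i = 0; i < len; i++) {
/		p = (uint64_t)digit * a[i] + cy;
/		r[i] = (uint32_t)p;
/		cy = p >> 32;
/	}
/	return ((uint32_t)cy);		/* the carry digit */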

#define	UNROLL	8
#define	UNROLL32	32

	ENTRY(big_mul_set_vec_sse2_r)
	xorl	%eax, %eax	/ if (len == 0) return (0);
	testl	%ecx, %ecx
	jz	.L17

	pxor	%mm0, %mm0	/ cy = 0

.L15:
	cmpl	$UNROLL, %ecx
	jl	.L16
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jz	.L17
	jmp	.L15

.L16:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L17:
	movd	%mm0, %eax	/ return (cy)
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_set_vec_sse2_r)


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r	 8(%ebp)	%edx
/ a	12(%ebp)	%ebx
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
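/
/ The save/restore variant follows this pattern, shown here as C-like
/ pseudocode (illustrative only; the exact sequence is in the assembly
/ below):
/
/	disable kernel preemption;
/	cr0 = %cr0;
/	clts();			/* let MMX run without a #NM trap */
/	if ((cr0 & CR0_TS) == 0)
/		save %mm0..%mm4 to an aligned area on the stack;
/	cy = big_mul_set_vec_sse2_r(r, a, len, digit);	/* args in regs */
/	if ((cr0 & CR0_TS) == 0)
/		restore %mm0..%mm4;
/	%cr0 = cr0;		/* re-arm TS if it was set on entry */
/	re-enable kernel preemption;
/	return (cy);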

#if defined(MMX_MANAGE)
	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx
	jnz	.setvec_no_save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.setvec_rtn

.setvec_no_save:
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi

.setvec_rtn:
	emms
	popl	%ebx
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	movl	%esi, %eax
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

	ENTRY(big_mul_set_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r	 8(%ebp)	%edx
/ a	12(%ebp)	%ebx
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%mm3

	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

#endif	/* MMX_MANAGE */


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
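/
/ For reference, the computed value as a plain C loop (illustrative
/ sketch only, not built); the 64-bit intermediate cannot overflow,
/ which is why the carry always fits in a single 32-bit digit:
/
/	uint64_t p, cy = 0;
/	for (i = 0; i < len; i++) {
/		p = (uint64_t)digit * a[i] + r[i] + cy;
/		r[i] = (uint32_t)p;
/		cy = p >> 32;
/	}
/	return ((uint32_t)cy);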

	ENTRY(big_mul_add_vec_sse2_r)
	xorl	%eax, %eax
	testl	%ecx, %ecx
	jz	.L27

	pxor	%mm0, %mm0	/ cy = 0

.L25:
	cmpl	$UNROLL, %ecx
	jl	.L26
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	movd	28(%edx), %mm2	/ 8: mm2 = r[i]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 8: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 8: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jz	.L27
	jmp	.L25

.L26:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L27:
	movd	%mm0, %eax
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_add_vec_sse2_r)


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r	 8(%ebp)	%edx
/ a	12(%ebp)	%ebx
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_add_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.

#if defined(MMX_MANAGE)

	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx
	jnz	.addvec_no_save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.addvec_rtn

.addvec_no_save:
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi

.addvec_rtn:
	emms
	popl	%ebx
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	movl	%esi, %eax
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

	ENTRY(big_mul_add_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2_nsv)


#else	/* !defined(MMX_MANAGE) */

	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx
	movl	12(%ebp), %ebx
	movl	16(%ebp), %ecx
	movd	20(%ebp), %mm3
	call	big_mul_add_vec_sse2_r
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

#endif	/* MMX_MANAGE */


/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/	int i;
/
/	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/	for (i = 1; i < blen; ++i)
/		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
/ }


#if defined(MMX_MANAGE)
	ENTRY(big_mul_vec_sse2_fc)
#else
	ENTRY(big_mul_vec_sse2)
#endif
	subl	$0x8, %esp
	pushl	%ebx
	pushl	%ebp
	pushl	%esi
	pushl	%edi
	movl	40(%esp), %eax
	movl	%eax, 20(%esp)
	pushl	(%eax)
	movl	40(%esp), %edi
	pushl	%edi
	movl	40(%esp), %esi
	pushl	%esi
	movl	40(%esp), %ebx
	pushl	%ebx
#if defined(MMX_MANAGE)
	call	big_mul_set_vec_sse2_nsv
#else
	call	big_mul_set_vec_sse2
#endif
	addl	$0x10, %esp
	movl	%eax, (%ebx,%edi,4)
	movl	44(%esp), %eax
	movl	%eax, 16(%esp)
	cmpl	$0x1, %eax
	jle	.mulvec_rtn
	movl	$0x1, %ebp

	.zalign 16,8
.mulvec_add:
	movl	20(%esp), %eax
	pushl	(%eax,%ebp,4)
	pushl	%edi
	pushl	%esi
	leal	(%ebx,%ebp,4), %eax
	pushl	%eax
#if defined(MMX_MANAGE)
	call	big_mul_add_vec_sse2_nsv
#else
	call	big_mul_add_vec_sse2
#endif
	addl	$0x10, %esp
	leal	(%ebp,%edi), %ecx
	movl	%eax, (%ebx,%ecx,4)
	incl	%ebp
	cmpl	16(%esp), %ebp
	jl	.mulvec_add
.mulvec_rtn:
#if defined(MMX_MANAGE)
	emms
#endif
	popl	%edi
	popl	%esi
	popl	%ebp
	popl	%ebx
	addl	$0x8, %esp
	ret
#if defined(MMX_MANAGE)
	SET_SIZE(big_mul_vec_sse2_fc)
#else
	SET_SIZE(big_mul_vec_sse2)
#endif

#if defined(MMX_MANAGE)

	ENTRY(big_mul_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	subl	$8, %esp
	pushl	%edi
	KPREEMPT_DISABLE
	TEST_TS(%eax)
	movl	%eax, -8(%ebp)
	jnz	.mulvec_no_save
	SAVE_MMX_0TO4(%edi)
	movl	%edi, -4(%ebp)
.mulvec_no_save:
	movl	24(%ebp), %eax		/ blen
	pushl	%eax
	movl	20(%ebp), %eax		/ b
	pushl	%eax
	movl	16(%ebp), %eax		/ alen
	pushl	%eax
	movl	12(%ebp), %eax		/ a
	pushl	%eax
	movl	8(%ebp), %eax		/ r
	pushl	%eax
	call	big_mul_vec_sse2_fc
	addl	$20, %esp
	movl	-8(%ebp), %eax
	testl	$CR0_TS, %eax
	jnz	.mulvec_no_rstr
	movl	-4(%ebp), %edi
	RSTOR_MMX_0TO4(%edi)
.mulvec_no_rstr:
	movl	%eax, %cr0
	KPREEMPT_ENABLE
	popl	%edi
	leave
	ret
	SET_SIZE(big_mul_vec_sse2)

#endif	/* MMX_MANAGE */


#undef	UNROLL
#undef	UNROLL32


/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is,
/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
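/
/ The layout of the code below follows from the usual squaring identity
/ over 32-bit digits (B = 2**32 is the digit base):
/
/	a**2 = sum(a[i]*a[i] * B**(2*i), i = 0..alen-1)
/	     + 2 * sum(a[i]*a[j] * B**(i+j), 0 <= i < j <= alen-1)
/
/ The cross ("triangle") products are accumulated first, using the
/ mul_set/mul_add helpers; a final pass then doubles them and folds in
/ the squared diagonal terms, propagating carries along the way.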

#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2_fc)
#else
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
#endif

	pushl	%ebx
	pushl	%edi
	pushl	%esi

	/ r[1..alen] = a[0] * a[1..alen-1]

	movl	8(%ebp), %edi		/ r = arg(r)
	movl	12(%ebp), %esi		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)
	movd	%ecx, %mm4		/ save_cnt = arg(alen)
	leal	4(%edi), %edx		/ dst = &r[1]
	movl	%esi, %ebx		/ src = a
	movd	0(%ebx), %mm3		/ mm3 = a[0]
	leal	4(%ebx), %ebx		/ src = &a[1]
	subl	$1, %ecx		/ --cnt
	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
	movl	%edi, %edx		/ dst = r
	movl	%esi, %ebx		/ src = a
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy

/ /* High-level vector C pseudocode */
/ for (i = 1; i < alen-1; ++i)
/	r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/ /* Same thing, but slightly lower level C-like pseudocode */
/   i = 1;
/   r = &arg_r[2*i + 1];
/   a = &arg_a[i + 1];
/   digit = arg_a[i];
/   cnt = alen - 3;
/   while (cnt != 0) {
/	r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/	r += 2;
/	++a;
/	--cnt;
/   }
/
/ /* Same thing, but even lower level
/  * For example, pointers are raw pointers,
/  * with no scaling by object size.
/  */
/   r = arg_r + 12;	/* i == 1; 2i + 1 == 3; 4*3 == 12; */
/   a = arg_a + 8;
/   digit = *(arg_a + 4);
/   cnt = alen - 3;
/   while (cnt != 0) {
/	cy = big_mul_add_vec_sse2_r();
/	*(r + 4 * cnt) = cy;
/	r += 8;
/	a += 4;
/	--cnt;
/   }

	leal	4(%edi), %edi		/ r += 4; r = &r[1]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]
	movd	%mm4, %ecx		/ cnt = save
	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
	movd	%ecx, %mm4		/ save_cnt
	jecxz	.L32			/ while (cnt != 0) {
.L31:
	movd	0(%esi), %mm3		/ digit = a[i]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
	movl	%edi, %edx		/ edx = r
	movl	%esi, %ebx		/ ebx = a
	cmp	$1, %ecx		/ The last triangle term is special
	jz	.L32
	call	big_mul_add_vec_sse2_r
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
	subl	$1, %ecx		/ --cnt
	movd	%ecx, %mm4		/ save_cnt = cnt
	jmp	.L31			/ }

.L32:
	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy
	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
	pxor	%mm2, %mm2
	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0

	movl	8(%ebp), %edx		/ r = arg(r)
	movl	12(%ebp), %ebx		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)

	/ compute low-order corner
	/ p = a[0]**2
	/ r[0] = lo32(p)
	/ cy = hi32(p)
	movd	0(%ebx), %mm2		/ mm2 = a[0]
	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)

	/ p = 2 * r[1]
	/ t = p + cy
	/ r[1] = lo32(t)
	/ cy = hi32(t)
	movd	4(%edx), %mm1		/ mm1 = r[1]
	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
	paddq	%mm1, %mm2		/ mm2 = t = p + cy
	movd	%mm2, 4(%edx)		/ r[1] = low32(t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)

	/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
	subl	$2, %ecx		/ cnt = alen - 2
.L34:
	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	movd	%mm2, %eax
	movd	%eax, %mm1		/ mm1 = lo32(t)
	psrlq	$32, %mm2		/ mm2 = hi32(t)

	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)
	psrlq	$32, %mm1
	paddq	%mm1, %mm2

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy
	leal	8(%edx), %edx		/ r += 2
	leal	4(%ebx), %ebx		/ ++a
	subl	$1, %ecx		/ --cnt
	jnz	.L34

	/ Carry from last triangle term must participate in doubling,
	/ but this step isn't paired up with squaring the elements
	/ of the inner diagonal.
	/ r[$-3..$-2] += 2 * r[$-3..$-2] + cy
	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy

	/ compute high-order corner and add it in
	/ p = a[alen - 1]**2
	/ t = p + cy
	/ r[alen + alen - 2] += lo32(t)
	/ cy = hi32(t)
	/ r[alen + alen - 1] = cy
	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
	movd	8(%edx), %mm3		/ mm3 = r[$-2]
	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)
	movd	12(%edx), %mm3
	paddq	%mm3, %mm2
	movd	%mm2, 12(%edx)		/ r[$-1] += cy

.L35:
	emms
	popl	%esi
	popl	%edi
	popl	%ebx

#if defined(MMX_MANAGE)
	ret
	SET_SIZE(big_sqr_vec_sse2_fc)
#else
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)
#endif


#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx
	jnz	.sqr_no_save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	call	big_sqr_vec_sse2_fc
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.sqr_rtn

.sqr_no_save:
	call	big_sqr_vec_sse2_fc

.sqr_rtn:
	popl	%ebx
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)

#endif	/* MMX_MANAGE */

/ ------------------------------------------------------------------------
/		UMUL Implementations
/ ------------------------------------------------------------------------


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	 8(%ebp)	%edx	%edi
/ a	12(%ebp)	%ebx	%esi
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)	%esi

	ENTRY(big_mul_set_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%ebp), %ecx
	xorl	%ebx, %ebx	/ cy = 0
	testl	%ecx, %ecx
	movl	8(%ebp), %edi
	movl	12(%ebp), %esi
	je	.L57

.L55:
	movl	(%esi), %eax	/ eax = a[i]
	leal	4(%esi), %esi	/ ++a
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	addl	%ebx, %eax
	adcl	$0, %edx	/ edx:eax = a[i] * digit + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	movl	%edx, %ebx	/ cy = product[63..32]
	leal	4(%edi), %edi	/ ++r
	decl	%ecx		/ --len
	jnz	.L55		/ while (len != 0)
.L57:
	movl	%ebx, %eax
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_set_vec_umul)


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1171/ This is a fall-back implementation for x86 models that do not support 1172/ the PMULUDQ instruction. 1173/ 1174/ uint32_t 1175/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit) 1176/ 1177/ r 8(%ebp) %edx %edi 1178/ a 12(%ebp) %ebx %esi 1179/ len 16(%ebp) %ecx 1180/ digit 20(%ebp) %esi 1181 1182 ENTRY(big_mul_add_vec_umul) 1183 pushl %ebp 1184 movl %esp, %ebp 1185 pushl %esi 1186 pushl %edi 1187 pushl %ebx 1188 movl 16(%ebp), %ecx 1189 xorl %ebx, %ebx / cy = 0 1190 testl %ecx, %ecx 1191 movl 8(%ebp), %edi 1192 movl 12(%ebp), %esi 1193 je .L67 1194 .align 4 1195.L65: 1196 movl (%esi), %eax / eax = a[i] 1197 leal 4(%esi), %esi / ++a 1198 mull 20(%ebp) / edx:eax = a[i] * digit 1199 addl (%edi), %eax 1200 adcl $0, %edx / edx:eax = a[i] * digit + r[i] 1201 addl %ebx, %eax 1202 adcl $0, %edx / edx:eax = a[i] * digit + r[i] + cy 1203 movl %eax, (%edi) / r[i] = product[31..0] 1204 movl %edx, %ebx / cy = product[63..32] 1205 leal 4(%edi), %edi / ++r 1206 decl %ecx / --len 1207 jnz .L65 / while (len != 0) 1208.L67: 1209 movl %ebx, %eax 1210 popl %ebx 1211 popl %edi 1212 popl %esi 1213 leave 1214 ret 1215 SET_SIZE(big_mul_add_vec_umul) 1216 1217#endif /* __lint */ 1218 1219#endif /* !__GNUC_AS__ */ 1220