/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/asm_linkage.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>

#if defined(__lint)

/*
 * Lint-only C stubs.  They give lint a C-visible definition for every
 * entry point implemented in assembly below; none of them does any work.
 */

#include <sys/types.h>

uint32_t
bignum_use_sse2()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r()
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{}

/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{}

#if defined(MMX_MANAGE)

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{}

#endif	/* MMX_MANAGE */

/*
 * UMUL
 *
 */

/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

#else	/* __lint */

#if defined(MMX_MANAGE)

/*
 * KPREEMPT_DISABLE / KPREEMPT_ENABLE expand to calls to kpr_disable and
 * kpr_enable in the kernel and to nothing in userland.
 *
 * TEST_TS(reg) leaves ZF clear if the CR0_TS bit was set and ZF set
 * otherwise.  In the kernel, reg receives the current %cr0 and clts then
 * clears the TS flag.  In userland, reg is simply zeroed, so ZF is always
 * set.
 */

#if defined(_KERNEL)

#define	KPREEMPT_DISABLE call kpr_disable
#define	KPREEMPT_ENABLE call kpr_enable
#define	TEST_TS(reg)					\
	movl	%cr0, reg;				\
	clts;						\
	testl	$CR0_TS, reg

#else	/* _KERNEL */

#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE

#define	TEST_TS(reg)					\
	movl	$0, reg;				\
	testl	$CR0_TS, reg

#endif	/* _KERNEL */

/*
 * MMX save area, carved out of the stack.
 *
 * SAVE_MMX_PROLOG lowers %esp and leaves sreg pointing into the reserved
 * space, rounded by the andl mask (presumably to an MMX_ALIGN boundary --
 * confirm the meaning of the assembler's "!" operator).
 * RSTOR_MMX_EPILOG gives back exactly the amount the prolog reserved.
 * SAVE_MMX_0TO4 / RSTOR_MMX_0TO4 store and reload %mm0 through %mm4 at
 * offsets 0, 8, 16, 24 and 32 from sreg.
 *
 * NOTE(review): the size expression is _MUL(MMX_SIZE, nreg + MMX_ALIGN).
 * If _MUL is a plain product, this reserves more than nreg slots plus
 * alignment slack.  That is harmless because prolog and epilog agree,
 * but confirm against the definition of _MUL.
 */

#define	MMX_SIZE 8
#define	MMX_ALIGN 8

#define	SAVE_MMX_PROLOG(sreg, nreg)			\
	subl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;	\
	movl	%esp, sreg;				\
	addl	$MMX_ALIGN, sreg;			\
	andl	$-1![MMX_ALIGN-1], sreg;

#define	RSTOR_MMX_EPILOG(nreg)				\
	addl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;

#define	SAVE_MMX_0TO4(sreg)			\
	SAVE_MMX_PROLOG(sreg, 5);		\
	movq	%mm0, 0(sreg);			\
	movq	%mm1, 8(sreg);			\
	movq	%mm2, 16(sreg);			\
	movq	%mm3, 24(sreg);			\
	movq	%mm4, 32(sreg)

#define	RSTOR_MMX_0TO4(sreg)			\
	movq	0(sreg), %mm0;			\
	movq	8(sreg), %mm1;			\
	movq	16(sreg), %mm2;			\
	movq	24(sreg), %mm3;			\
	movq	32(sreg), %mm4;			\
	RSTOR_MMX_EPILOG(5)

#endif	/* MMX_MANAGE */

/ Note: this file contains implementations for
/	big_mul_set_vec()
/	big_mul_add_vec()
/	big_mul_vec()
/	big_sqr_vec()
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum implementation gets "imprinted"
/ on the correct implementation, at the time of first use,
/ so none of the code for the other implementations is ever
/ executed.  So, it is a no-brainer to lay out the code to minimize
/ the "footprint" of executed code.

/ Can we use SSE2 instructions?  Return value is non-zero
/ if we can.
/
/ Note:
/   Using the cpuid instruction directly would work equally
/   well in userland and in the kernel, but we do not use the
/   cpuid instruction in the kernel, we use x86_featureset,
/   instead.  This means we honor any decisions the kernel
/   startup code may have made in setting this variable,
/   including disabling SSE2.  It might even be a good idea
/   to honor this kind of setting in userland, as well, but
/   the variable, x86_featureset is not readily available to
/   userland processes.
/
/ uint32_t
/ bignum_use_sse2()

	ENTRY(bignum_use_sse2)
#if defined(_KERNEL)
	xor	%eax, %eax		/ eax = 0
	bt	$X86FSET_SSE2, x86_featureset	/ CF = SSE2 feature bit
	adc	%eax, %eax		/ eax = 0 + 0 + CF, i.e. 0 or 1
#else	/* _KERNEL */
	pushl	%ebx			/ cpuid overwrites ebx; preserve it
	movl	$1, %eax		/ Get feature information
	cpuid
	movl	%edx, %eax		/ set return value
	popl	%ebx
	andl	$CPUID_INTC_EDX_SSE2, %eax	/ keep only the SSE2 bit
#endif	/* _KERNEL */
	ret
	SET_SIZE(bignum_use_sse2)


/ ------------------------------------------------------------------------
/		SSE2 Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ Returns the carry digit in %eax.  When len is 0 nothing is stored
/ and 0 is returned.
/
/ Does not touch the following registers: %esi, %edi, %mm4
/ Uses %mm0 (running carry) and %mm1 (scratch); advances %ebx and %edx
/ and decrements %ecx as it goes.
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.

#define	UNROLL		8
#define	UNROLL32	32

	ENTRY(big_mul_set_vec_sse2_r)
	/ The carry is cleared before the length test because the common
	/ exit at .L17 returns the low half of mm0.  Clearing it first
	/ makes the len == 0 path return 0 rather than whatever mm0 held
	/ on entry.  pxor leaves the integer flags alone.
	pxor	%mm0, %mm0	/ cy = 0
	testl	%ecx, %ecx	/ if (len == 0) return (0);
	jz	.L17

.L15:
	cmpl	$UNROLL, %ecx
	jl	.L16		/ fewer than 8 digits left: finish one by one
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jz	.L17
	jmp	.L15

	/ Tail: at most 7 digits remain.  The seventh stanza needs no
	/ count check because the tail is only entered with len < 8.
.L16:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L17:
	movd	%mm0, %eax	/ return (cy)
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_set_vec_sse2_r)


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r		 8(%ebp)	%edx
/ a		12(%ebp)	%ebx
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.

#if defined(MMX_MANAGE)
	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx			/ keep the value to write back to cr0
					/ (pushl leaves the flags intact)
	jnz	.setvec_no_save		/ TS was set: skip the MMX save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)		/ edi = save area for mm0..mm4
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi		/ hold the carry while state is restored
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.setvec_rtn

.setvec_no_save:
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi		/ hold the carry while state is restored

.setvec_rtn:
	emms
	popl	%ebx			/ value saved by the TS test above
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	movl	%esi, %eax		/ return (cy)
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

	/ Same work as above without the preemption, TS and MMX save
	/ handling, and without emms.
	ENTRY(big_mul_set_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r		 8(%ebp)	%edx
/ a		12(%ebp)	%ebx
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%mm3

	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_set_vec_sse2_r	/ carry comes back in eax
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

#endif	/* MMX_MANAGE */


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
4997c478bd9Sstevel@tonic-gate 5007c478bd9Sstevel@tonic-gate ENTRY(big_mul_add_vec_sse2_r) 5017c478bd9Sstevel@tonic-gate xorl %eax, %eax 5027c478bd9Sstevel@tonic-gate testl %ecx, %ecx 5037c478bd9Sstevel@tonic-gate jz .L27 5047c478bd9Sstevel@tonic-gate 5057c478bd9Sstevel@tonic-gate pxor %mm0, %mm0 / cy = 0 5067c478bd9Sstevel@tonic-gate 5077c478bd9Sstevel@tonic-gate.L25: 5087c478bd9Sstevel@tonic-gate cmpl $UNROLL, %ecx 5097c478bd9Sstevel@tonic-gate jl .L26 5107c478bd9Sstevel@tonic-gate movd 0(%ebx), %mm1 / 1: mm1 = a[i] 5117c478bd9Sstevel@tonic-gate movd 0(%edx), %mm2 / 1: mm2 = r[i] 5127c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i] 5137c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i] 5147c478bd9Sstevel@tonic-gate movd 4(%ebx), %mm1 / 2: mm1 = a[i] 5157c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy; 5167c478bd9Sstevel@tonic-gate movd %mm0, 0(%edx) / 1: r[i] = product[31..0] 5177c478bd9Sstevel@tonic-gate movd 4(%edx), %mm2 / 2: mm2 = r[i] 5187c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 1: cy = product[63..32] 5197c478bd9Sstevel@tonic-gate 5207c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i] 5217c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i] 5227c478bd9Sstevel@tonic-gate movd 8(%ebx), %mm1 / 3: mm1 = a[i] 5237c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy; 5247c478bd9Sstevel@tonic-gate movd %mm0, 4(%edx) / 2: r[i] = product[31..0] 5257c478bd9Sstevel@tonic-gate movd 8(%edx), %mm2 / 3: mm2 = r[i] 5267c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 2: cy = product[63..32] 5277c478bd9Sstevel@tonic-gate 5287c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i] 5297c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i] 5307c478bd9Sstevel@tonic-gate movd 12(%ebx), %mm1 / 4: mm1 = a[i] 5317c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy; 
5327c478bd9Sstevel@tonic-gate movd %mm0, 8(%edx) / 3: r[i] = product[31..0] 5337c478bd9Sstevel@tonic-gate movd 12(%edx), %mm2 / 4: mm2 = r[i] 5347c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 3: cy = product[63..32] 5357c478bd9Sstevel@tonic-gate 5367c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i] 5377c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i] 5387c478bd9Sstevel@tonic-gate movd 16(%ebx), %mm1 / 5: mm1 = a[i] 5397c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy; 5407c478bd9Sstevel@tonic-gate movd %mm0, 12(%edx) / 4: r[i] = product[31..0] 5417c478bd9Sstevel@tonic-gate movd 16(%edx), %mm2 / 5: mm2 = r[i] 5427c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 4: cy = product[63..32] 5437c478bd9Sstevel@tonic-gate 5447c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i] 5457c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i] 5467c478bd9Sstevel@tonic-gate movd 20(%ebx), %mm1 / 6: mm1 = a[i] 5477c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy; 5487c478bd9Sstevel@tonic-gate movd %mm0, 16(%edx) / 5: r[i] = product[31..0] 5497c478bd9Sstevel@tonic-gate movd 20(%edx), %mm2 / 6: mm2 = r[i] 5507c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 5: cy = product[63..32] 5517c478bd9Sstevel@tonic-gate 5527c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i] 5537c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i] 5547c478bd9Sstevel@tonic-gate movd 24(%ebx), %mm1 / 7: mm1 = a[i] 5557c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy; 5567c478bd9Sstevel@tonic-gate movd %mm0, 20(%edx) / 6: r[i] = product[31..0] 5577c478bd9Sstevel@tonic-gate movd 24(%edx), %mm2 / 7: mm2 = r[i] 5587c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 6: cy = product[63..32] 5597c478bd9Sstevel@tonic-gate 5607c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i] 5617c478bd9Sstevel@tonic-gate paddq %mm1, 
%mm2 / 7: mm2 = digit * a[i] + r[i] 5627c478bd9Sstevel@tonic-gate movd 28(%ebx), %mm1 / 8: mm1 = a[i] 5637c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy; 5647c478bd9Sstevel@tonic-gate movd %mm0, 24(%edx) / 7: r[i] = product[31..0] 5657c478bd9Sstevel@tonic-gate movd 28(%edx), %mm2 / 8: mm2 = r[i] 5667c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 7: cy = product[63..32] 5677c478bd9Sstevel@tonic-gate 5687c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i] 5697c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 8: mm2 = digit * a[i] + r[i] 5707c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 8: mm0 = digit * a[i] + r[i] + cy; 5717c478bd9Sstevel@tonic-gate movd %mm0, 28(%edx) / 8: r[i] = product[31..0] 5727c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 8: cy = product[63..32] 5737c478bd9Sstevel@tonic-gate 5747c478bd9Sstevel@tonic-gate leal UNROLL32(%ebx), %ebx / a += UNROLL 5757c478bd9Sstevel@tonic-gate leal UNROLL32(%edx), %edx / r += UNROLL 5767c478bd9Sstevel@tonic-gate subl $UNROLL, %ecx / len -= UNROLL 5777c478bd9Sstevel@tonic-gate jz .L27 5787c478bd9Sstevel@tonic-gate jmp .L25 5797c478bd9Sstevel@tonic-gate 5807c478bd9Sstevel@tonic-gate.L26: 5817c478bd9Sstevel@tonic-gate movd 0(%ebx), %mm1 / 1: mm1 = a[i] 5827c478bd9Sstevel@tonic-gate movd 0(%edx), %mm2 / 1: mm2 = r[i] 5837c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i] 5847c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i] 5857c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy; 5867c478bd9Sstevel@tonic-gate movd %mm0, 0(%edx) / 1: r[i] = product[31..0] 5877c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 1: cy = product[63..32] 5887c478bd9Sstevel@tonic-gate subl $1, %ecx 5897c478bd9Sstevel@tonic-gate jz .L27 5907c478bd9Sstevel@tonic-gate 5917c478bd9Sstevel@tonic-gate movd 4(%ebx), %mm1 / 2: mm1 = a[i] 5927c478bd9Sstevel@tonic-gate movd 4(%edx), %mm2 / 2: mm2 = r[i] 5937c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 
2: mm1 = digit * a[i] 5947c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i] 5957c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy; 5967c478bd9Sstevel@tonic-gate movd %mm0, 4(%edx) / 2: r[i] = product[31..0] 5977c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 2: cy = product[63..32] 5987c478bd9Sstevel@tonic-gate subl $1, %ecx 5997c478bd9Sstevel@tonic-gate jz .L27 6007c478bd9Sstevel@tonic-gate 6017c478bd9Sstevel@tonic-gate movd 8(%ebx), %mm1 / 3: mm1 = a[i] 6027c478bd9Sstevel@tonic-gate movd 8(%edx), %mm2 / 3: mm2 = r[i] 6037c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i] 6047c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i] 6057c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy; 6067c478bd9Sstevel@tonic-gate movd %mm0, 8(%edx) / 3: r[i] = product[31..0] 6077c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 3: cy = product[63..32] 6087c478bd9Sstevel@tonic-gate subl $1, %ecx 6097c478bd9Sstevel@tonic-gate jz .L27 6107c478bd9Sstevel@tonic-gate 6117c478bd9Sstevel@tonic-gate movd 12(%ebx), %mm1 / 4: mm1 = a[i] 6127c478bd9Sstevel@tonic-gate movd 12(%edx), %mm2 / 4: mm2 = r[i] 6137c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i] 6147c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i] 6157c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy; 6167c478bd9Sstevel@tonic-gate movd %mm0, 12(%edx) / 4: r[i] = product[31..0] 6177c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 4: cy = product[63..32] 6187c478bd9Sstevel@tonic-gate subl $1, %ecx 6197c478bd9Sstevel@tonic-gate jz .L27 6207c478bd9Sstevel@tonic-gate 6217c478bd9Sstevel@tonic-gate movd 16(%ebx), %mm1 / 5: mm1 = a[i] 6227c478bd9Sstevel@tonic-gate movd 16(%edx), %mm2 / 5: mm2 = r[i] 6237c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i] 6247c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i] 
6257c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy; 6267c478bd9Sstevel@tonic-gate movd %mm0, 16(%edx) / 5: r[i] = product[31..0] 6277c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 5: cy = product[63..32] 6287c478bd9Sstevel@tonic-gate subl $1, %ecx 6297c478bd9Sstevel@tonic-gate jz .L27 6307c478bd9Sstevel@tonic-gate 6317c478bd9Sstevel@tonic-gate movd 20(%ebx), %mm1 / 6: mm1 = a[i] 6327c478bd9Sstevel@tonic-gate movd 20(%edx), %mm2 / 6: mm2 = r[i] 6337c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i] 6347c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i] 6357c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy; 6367c478bd9Sstevel@tonic-gate movd %mm0, 20(%edx) / 6: r[i] = product[31..0] 6377c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 6: cy = product[63..32] 6387c478bd9Sstevel@tonic-gate subl $1, %ecx 6397c478bd9Sstevel@tonic-gate jz .L27 6407c478bd9Sstevel@tonic-gate 6417c478bd9Sstevel@tonic-gate movd 24(%ebx), %mm1 / 7: mm1 = a[i] 6427c478bd9Sstevel@tonic-gate movd 24(%edx), %mm2 / 7: mm2 = r[i] 6437c478bd9Sstevel@tonic-gate pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i] 6447c478bd9Sstevel@tonic-gate paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i] 6457c478bd9Sstevel@tonic-gate paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy; 6467c478bd9Sstevel@tonic-gate movd %mm0, 24(%edx) / 7: r[i] = product[31..0] 6477c478bd9Sstevel@tonic-gate psrlq $32, %mm0 / 7: cy = product[63..32] 6487c478bd9Sstevel@tonic-gate 6497c478bd9Sstevel@tonic-gate.L27: 6507c478bd9Sstevel@tonic-gate movd %mm0, %eax 6517c478bd9Sstevel@tonic-gate / no emms. 
caller is responsible for emms 6527c478bd9Sstevel@tonic-gate ret 6537c478bd9Sstevel@tonic-gate SET_SIZE(big_mul_add_vec_sse2_r) 6547c478bd9Sstevel@tonic-gate 6557c478bd9Sstevel@tonic-gate 6567c478bd9Sstevel@tonic-gate/ r = r + a * digit, r and a are vectors of length len 6577c478bd9Sstevel@tonic-gate/ returns the carry digit 6587c478bd9Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions 6597c478bd9Sstevel@tonic-gate/ 6607c478bd9Sstevel@tonic-gate/ r 8(%ebp) %edx 6617c478bd9Sstevel@tonic-gate/ a 12(%ebp) %ebx 6627c478bd9Sstevel@tonic-gate/ len 16(%ebp) %ecx 6637c478bd9Sstevel@tonic-gate/ digit 20(%ebp) %mm3 6647c478bd9Sstevel@tonic-gate/ 6657c478bd9Sstevel@tonic-gate/ In userland, there is just the one function, big_mul_add_vec_sse2(). 6667c478bd9Sstevel@tonic-gate/ But in the kernel, there are two variations: 6677c478bd9Sstevel@tonic-gate/ 1. big_mul_add_vec_sse2() which does what is necessary to save and 6687c478bd9Sstevel@tonic-gate/ restore state, if necessary, and to ensure that preemption is 6697c478bd9Sstevel@tonic-gate/ disabled. 6707c478bd9Sstevel@tonic-gate/ 2. big_mul_add_vec_sse2_nsv() which just does the work; 6717c478bd9Sstevel@tonic-gate/ it is the caller's responsibility to ensure that MMX state 6727c478bd9Sstevel@tonic-gate/ does not need to be saved and restored and that preemption 6737c478bd9Sstevel@tonic-gate/ is already disabled. 
#if defined(MMX_MANAGE)

/ MMX_MANAGE build of big_mul_add_vec_sse2(r, a, len, digit).
/ Loads the C stack arguments into the registers that
/ big_mul_add_vec_sse2_r expects (%edx = r, %ebx = a, %ecx = len,
/ %mm3 = digit), calls it between KPREEMPT_DISABLE and KPREEMPT_ENABLE,
/ and returns the carry it produced in %eax.
/ When TEST_TS leaves the zero flag set, the call is additionally
/ bracketed by SAVE_MMX_0TO4 / RSTOR_MMX_0TO4.
/ NOTE(review): KPREEMPT_DISABLE, KPREEMPT_ENABLE, TEST_TS, SAVE_MMX_0TO4
/ and RSTOR_MMX_0TO4 are macros defined outside this section; presumably
/ TEST_TS copies %cr0 into its operand and tests the TS bit -- confirm
/ against the macro definitions.

	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ preserve caller %ebx (reused below)
	pushl	%esi			/ preserve caller %esi (holds the carry)
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx			/ keep the TEST_TS value for the %cr0 write
	jnz	.addvec_no_save		/ pushl does not alter the TEST_TS flags
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx		/ edx = r
	movl	12(%ebp), %ebx		/ ebx = a
	movl	16(%ebp), %ecx		/ ecx = len
	movd	20(%ebp), %mm3		/ mm3 = digit
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi		/ park the carry until the epilogue
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.addvec_rtn

.addvec_no_save:
	/ Same call sequence, without the MMX save/restore bracket.
	movl	8(%ebp), %edx		/ edx = r
	movl	12(%ebp), %ebx		/ ebx = a
	movl	16(%ebp), %ecx		/ ecx = len
	movd	20(%ebp), %mm3		/ mm3 = digit
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi		/ park the carry until the epilogue

.addvec_rtn:
	emms
	popl	%ebx			/ value pushed right after TEST_TS
	movl	%ebx, %cr0		/ write it back to %cr0
	KPREEMPT_ENABLE
	movl	%esi, %eax		/ return value = carry
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

/ big_mul_add_vec_sse2_nsv(r, a, len, digit): MMX_MANAGE-only entry point
/ that only marshals the stack arguments into registers and calls
/ big_mul_add_vec_sse2_r.  No emms, no MMX save/restore and no preemption
/ handling is done here.
	ENTRY(big_mul_add_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ preserve caller %ebx (used for a)
	movl	8(%ebp), %edx		/ edx = r
	movl	12(%ebp), %ebx		/ ebx = a
	movl	16(%ebp), %ecx		/ ecx = len
	movd	20(%ebp), %mm3		/ mm3 = digit
	call	big_mul_add_vec_sse2_r	/ carry comes back in %eax
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2_nsv)


#else	/* !defined(MMX_MANAGE) */

/ big_mul_add_vec_sse2(r, a, len, digit), build without MMX_MANAGE:
/ marshal the arguments, call big_mul_add_vec_sse2_r, then execute emms
/ before returning the carry that the call left in %eax.
	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ preserve caller %ebx (used for a)
	movl	8(%ebp), %edx		/ edx = r
	movl	12(%ebp), %ebx		/ ebx = a
	movl	16(%ebp), %ecx		/ ecx = len
	movd	20(%ebp), %mm3		/ mm3 = digit
	call	big_mul_add_vec_sse2_r	/ carry comes back in %eax
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

#endif	/* MMX_MANAGE */


/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/	int i;
/
/	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/	for (i = 1; i < blen; ++i)
/		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
/ }
/
/ The routine below implements the pseudocode above.  With MMX_MANAGE
/ defined it is assembled under the name big_mul_vec_sse2_fc and calls
/ the _nsv helpers; otherwise it is big_mul_vec_sse2 itself.
/
/ It is frameless: everything is addressed relative to %esp.  After the
/ subl and the four pushes, %esp is 24 bytes below its value at entry, so
/	16(%esp)	local copy of blen
/	20(%esp)	local copy of b
/	28(%esp)	r	32(%esp)  a	36(%esp)  alen
/	40(%esp)	b	44(%esp)  blen
/ Register use: %ebx = r, %esi = a, %edi = alen, %ebp = i.


#if defined(MMX_MANAGE)
	ENTRY(big_mul_vec_sse2_fc)
#else
	ENTRY(big_mul_vec_sse2)
#endif
	subl	$0x8, %esp		/ room for the two locals
	pushl	%ebx
	pushl	%ebp
	pushl	%esi
	pushl	%edi
	movl	40(%esp), %eax		/ eax = b
	movl	%eax, 20(%esp)		/ local b = b
	/ Each pushl below lowers %esp by 4, so the unchanged 40(%esp)
	/ offset walks down the argument list: alen, then a, then r.
	pushl	(%eax)			/ call arg: digit = b[0]
	movl	40(%esp), %edi		/ edi = alen
	pushl	%edi			/ call arg: len = alen
	movl	40(%esp), %esi		/ esi = a
	pushl	%esi			/ call arg: a
	movl	40(%esp), %ebx		/ ebx = r
	pushl	%ebx			/ call arg: r
#if defined(MMX_MANAGE)
	call	big_mul_set_vec_sse2_nsv
#else
	call	big_mul_set_vec_sse2
#endif
	addl	$0x10, %esp		/ drop the four call arguments
	movl	%eax, (%ebx,%edi,4)	/ r[alen] = carry
	movl	44(%esp), %eax		/ eax = blen
	movl	%eax, 16(%esp)		/ local blen = blen
	cmpl	$0x1, %eax
	jle	.mulvec_rtn		/ blen <= 1: nothing left to add
	movl	$0x1, %ebp		/ i = 1

	.align	16
.mulvec_add:
	movl	20(%esp), %eax		/ eax = b
	pushl	(%eax,%ebp,4)		/ call arg: digit = b[i]
	pushl	%edi			/ call arg: len = alen
	pushl	%esi			/ call arg: a
	leal	(%ebx,%ebp,4), %eax
	pushl	%eax			/ call arg: r + i
#if defined(MMX_MANAGE)
	call	big_mul_add_vec_sse2_nsv
#else
	call	big_mul_add_vec_sse2
#endif
	addl	$0x10, %esp		/ drop the four call arguments
	leal	(%ebp,%edi), %ecx	/ ecx = i + alen
	movl	%eax, (%ebx,%ecx,4)	/ r[alen + i] = carry
	incl	%ebp			/ ++i
	cmpl	16(%esp), %ebp
	jl	.mulvec_add		/ while (i < blen)
.mulvec_rtn:
#if defined(MMX_MANAGE)
	emms
#endif
	popl	%edi
	popl	%esi
	popl	%ebp
	popl	%ebx
	addl	$0x8, %esp		/ release the two locals
	ret
#if defined(MMX_MANAGE)
	SET_SIZE(big_mul_vec_sse2_fc)
#else
	SET_SIZE(big_mul_vec_sse2)
#endif

#if defined(MMX_MANAGE)

/ MMX_MANAGE build of big_mul_vec_sse2(r, a, alen, b, blen): wrapper that
/ calls big_mul_vec_sse2_fc with the same five arguments, between
/ KPREEMPT_DISABLE and KPREEMPT_ENABLE.
/ Frame locals:
/	-8(%ebp)	value TEST_TS left in %eax; written to %cr0 on exit
/	-4(%ebp)	%edi as left by SAVE_MMX_0TO4 (set only when saving)
/ NOTE(review): TEST_TS, SAVE_MMX_0TO4, RSTOR_MMX_0TO4 and the KPREEMPT
/ macros are defined outside this section; the exit path relies on
/ RSTOR_MMX_0TO4 leaving %eax intact -- confirm against the macro.

	ENTRY(big_mul_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	subl	$8, %esp		/ two locals, see above
	pushl	%edi
	KPREEMPT_DISABLE
	TEST_TS(%eax)
	movl	%eax, -8(%ebp)		/ movl leaves the TEST_TS flags intact
	jnz	.mulvec_no_save
	SAVE_MMX_0TO4(%edi)
	movl	%edi, -4(%ebp)		/ remember %edi for RSTOR_MMX_0TO4
.mulvec_no_save:
	movl	24(%ebp), %eax		/ blen
	pushl	%eax
	movl	20(%ebp), %eax		/ b
	pushl	%eax
	movl	16(%ebp), %eax		/ alen
	pushl	%eax
	movl	12(%ebp), %eax		/ a
	pushl	%eax
	movl	8(%ebp), %eax		/ r
	pushl	%eax
	call	big_mul_vec_sse2_fc
	addl	$20, %esp		/ drop the five call arguments
	movl	-8(%ebp), %eax		/ eax = value saved after TEST_TS
	testl	$CR0_TS, %eax
	jnz	.mulvec_no_rstr		/ same decision as at entry
	movl	-4(%ebp), %edi
	RSTOR_MMX_0TO4(%edi)
.mulvec_no_rstr:
	movl	%eax, %cr0		/ write the saved value back to %cr0
	KPREEMPT_ENABLE
	popl	%edi
	leave
	ret
	SET_SIZE(big_mul_vec_sse2)

#endif	/* MMX_MANAGE */



#undef UNROLL
#undef UNROLL32


/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is
/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
/ Arguments are read relative to %ebp: r = 8(%ebp), a = 12(%ebp),
/ alen = 16(%ebp).  With MMX_MANAGE defined this body is assembled as
/ big_sqr_vec_sse2_fc and sets up no frame of its own: it reads the
/ arguments through the %ebp established by the big_sqr_vec_sse2 wrapper
/ further down, which is its only caller in this file section.
/
/ Outline:
/   1. r[1..alen-1] = a[0] * a[1..alen-1], r[alen] = carry
/   2. add the remaining off-diagonal products a[i] * a[i+1..alen-1]
/   3. double that triangle and add the squares a[i]**2 of the diagonal

#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2_fc)
#else
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
#endif

	pushl	%ebx
	pushl	%edi
	pushl	%esi

	/ r[1..alen] = a[0] * a[1..alen-1]

	movl	8(%ebp), %edi		/ r = arg(r)
	movl	12(%ebp), %esi		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)
	movd	%ecx, %mm4		/ save_cnt = arg(alen)
	leal	4(%edi), %edx		/ dst = &r[1]
	movl	%esi, %ebx		/ src = a
	movd	0(%ebx), %mm3		/ mm3 = a[0]
	leal	4(%ebx), %ebx		/ src = &a[1]
	subl	$1, %ecx		/ --cnt
	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
	movl	%edi, %edx		/ dst = r
	movl	%esi, %ebx		/ src = a
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy

/	/* High-level vector C pseudocode */
/	for (i = 1; i < alen-1; ++i)
/		r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/	/* Same thing, but slightly lower level C-like pseudocode */
/	i = 1;
/	r = &arg_r[2*i + 1];
/	a = &arg_a[i + 1];
/	digit = arg_a[i];
/	cnt = alen - 3;
/	while (cnt != 0) {
/		r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/		r += 2;
/		++a;
/		--cnt;
/	}
/
/	/* Same thing, but even lower level
/	 * For example, pointers are raw pointers,
/	 * with no scaling by object size.
/	 */
/	r = arg_r + 12;	/* i == 1;  2i + 1 == 3;  4*3 == 12; */
/	a = arg_a + 8;
/	digit = *(arg_a + 4);
/	cnt = alen - 3;
/	while (cnt != 0) {
/		cy = big_mul_add_vec_sse2_r();
/		*(r + 4 * cnt) = cy;
/		r += 8;
/		a += 4;
/		--cnt;
/	}
/
/	NOTE: the pseudocode above is only a sketch.  The instructions below
/	start with cnt = alen - 2, reload digit from a[i] on every pass, and
/	peel the final cnt == 1 term off into the code at .L32 instead of
/	calling big_mul_add_vec_sse2_r for it.

	leal	4(%edi), %edi		/ r += 4; r = &r[1]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]
	movd	%mm4, %ecx		/ cnt = save
	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
	movd	%ecx, %mm4		/ save_cnt
	jecxz	.L32			/ while (cnt != 0) {
.L31:
	movd	0(%esi), %mm3		/ digit = a[i]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
	movl	%edi, %edx		/ edx = r
	movl	%esi, %ebx		/ ebx = a
	cmp	$1, %ecx		/ The last triangle term is special
	jz	.L32
	call	big_mul_add_vec_sse2_r
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
	subl	$1, %ecx		/ --cnt
	movd	%ecx, %mm4		/ save_cnt = cnt
	jmp	.L31			/ }

.L32:
	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy
	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
	pxor	%mm2, %mm2
	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0

	movl	8(%ebp), %edx		/ r = arg(r)
	movl	12(%ebp), %ebx		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)

	/ compute low-order corner
	/ p = a[0]**2
	/ r[0] = lo32(p)
	/ cy   = hi32(p)
	movd	0(%ebx), %mm2		/ mm2 = a[0]
	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)

	/ p = 2 * r[1]
	/ t = p + cy
	/ r[1] = lo32(t)
	/ cy   = hi32(t)
	movd	4(%edx), %mm1		/ mm1 = r[1]
	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
	paddq	%mm1, %mm2		/ mm2 = t = p + cy
	movd	%mm2, 4(%edx)		/ r[1] = low32(t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)

	/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
	subl	$2, %ecx		/ cnt = alen - 2
.L34:
	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	movd	%mm2, %eax		/ round trip through %eax keeps 32 bits
	movd	%eax, %mm1		/ mm1 = lo32(t)
	psrlq	$32, %mm2		/ mm2 = hi32(t)

	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)
	psrlq	$32, %mm1		/ mm1 = carry out of r[2*i]
	paddq	%mm1, %mm2		/ mm2 = hi32(t) + that carry

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy
	leal	8(%edx), %edx		/ r += 2
	leal	4(%ebx), %ebx		/ ++a
	subl	$1, %ecx		/ --cnt
	jnz	.L34

	/ Carry from last triangle term must participate in doubling,
	/ but this step isn't paired up with a squaring the elements
	/ of the inner diagonal.
	/ r[$-3..$-2] += 2 * r[$-3..$-2] + cy
	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy

	/ compute high-order corner and add it in
	/ p = a[alen - 1]**2
	/ t = p + cy
	/ r[alen + alen - 2] += lo32(t)
	/ cy = hi32(t)
	/ r[alen + alen - 1] = cy
	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
	movd	8(%edx), %mm3		/ mm3 = r[$-2]
	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)
	movd	12(%edx), %mm3		/ mm3 = r[$-1]
	paddq	%mm3, %mm2		/ mm2 = r[$-1] + cy
	movd	%mm2, 12(%edx)		/ r[$-1] += cy

.L35:
	emms
	popl	%esi
	popl	%edi
	popl	%ebx

#if defined(MMX_MANAGE)
	ret
	SET_SIZE(big_sqr_vec_sse2_fc)
#else
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)
#endif


#if defined(MMX_MANAGE)
/ MMX_MANAGE build of big_sqr_vec_sse2(r, a, len): wrapper that calls
/ big_sqr_vec_sse2_fc between KPREEMPT_DISABLE and KPREEMPT_ENABLE and,
/ when TEST_TS leaves the zero flag set, between SAVE_MMX_0TO4 and
/ RSTOR_MMX_0TO4.  No arguments are pushed for the call:
/ big_sqr_vec_sse2_fc reads them through this frame's %ebp.
/
/ %ebx carries the TEST_TS value that is written back to %cr0 on exit,
/ so the caller's %ebx is saved on entry and restored before leave,
/ the same way big_mul_add_vec_sse2 treats it.  Without that pair the
/ caller would get %ebx back holding the %cr0 value.
/ NOTE(review): the KPREEMPT, TEST_TS and MMX save/restore macros are
/ defined outside this section -- confirm none of them depends on the
/ stack depth at the point of use.
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ preserve caller %ebx
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx			/ keep the TEST_TS value for the %cr0 write
	jnz	.sqr_no_save		/ pushl does not alter the TEST_TS flags
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	call	big_sqr_vec_sse2_fc
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.sqr_rtn

.sqr_no_save:
	call	big_sqr_vec_sse2_fc

.sqr_rtn:
	popl	%ebx			/ value pushed right after TEST_TS
	movl	%ebx, %cr0		/ write it back to %cr0
	KPREEMPT_ENABLE
	popl	%ebx			/ restore caller %ebx
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)

#endif	/* MMX_MANAGE */

/ ------------------------------------------------------------------------
/		UMUL Implementations
/ ------------------------------------------------------------------------


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Argument and register use in the code below:
/	r	 8(%ebp)	%edi
/	a	12(%ebp)	%esi
/	len	16(%ebp)	%ecx
/	digit	20(%ebp)	read from memory by each mull
/	cy			%ebx
/	product			%edx:%eax
/
/ A len of zero skips the loop and returns a carry of 0.

	ENTRY(big_mul_set_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%ebp), %ecx		/ ecx = len
	xorl	%ebx, %ebx		/ cy = 0
	testl	%ecx, %ecx		/ ZF = (len == 0), consumed by je below
	movl	8(%ebp), %edi		/ edi = r (movl leaves flags alone)
	movl	12(%ebp), %esi		/ esi = a
	je	.L57			/ len == 0: return cy = 0

.L55:
	movl	(%esi), %eax		/ eax = a[i]
	leal	4(%esi), %esi		/ ++a
	mull	20(%ebp)		/ edx:eax = a[i] * digit
	addl	%ebx, %eax
	adcl	$0, %edx		/ edx:eax = a[i] * digit + cy
	movl	%eax, (%edi)		/ r[i] = product[31..0]
	movl	%edx, %ebx		/ cy = product[63..32]
	leal	4(%edi), %edi		/ ++r
	decl	%ecx			/ --len
	jnz	.L55			/ while (len != 0)
.L57:
	movl	%ebx, %eax		/ return cy
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_set_vec_umul)


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Argument and register use in the code below:
/	r	 8(%ebp)	%edi
/	a	12(%ebp)	%esi
/	len	16(%ebp)	%ecx
/	digit	20(%ebp)	read from memory by each mull
/	cy			%ebx
/	product			%edx:%eax
/
/ A len of zero skips the loop and returns a carry of 0.

	ENTRY(big_mul_add_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%ebp), %ecx		/ ecx = len
	xorl	%ebx, %ebx		/ cy = 0
	testl	%ecx, %ecx		/ ZF = (len == 0), consumed by je below
	movl	8(%ebp), %edi		/ edi = r (movl leaves flags alone)
	movl	12(%ebp), %esi		/ esi = a
	je	.L67			/ len == 0: return cy = 0
	.align	4
.L65:
	movl	(%esi), %eax		/ eax = a[i]
	leal	4(%esi), %esi		/ ++a
	mull	20(%ebp)		/ edx:eax = a[i] * digit
	addl	(%edi), %eax
	adcl	$0, %edx		/ edx:eax = a[i] * digit + r[i]
	addl	%ebx, %eax
	adcl	$0, %edx		/ edx:eax = a[i] * digit + r[i] + cy
	movl	%eax, (%edi)		/ r[i] = product[31..0]
	movl	%edx, %ebx		/ cy = product[63..32]
	leal	4(%edi), %edi		/ ++r
	decl	%ecx			/ --len
	jnz	.L65			/ while (len != 0)
.L67:
	movl	%ebx, %eax		/ return cy
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_add_vec_umul)

#endif	/* __lint */