/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#if !defined(__GNUC_AS__)

#include <sys/asm_linkage.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>

#if defined(__lint)

#include <sys/types.h>

uint32_t
bignum_use_sse2()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r()
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{}

/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{}

#if defined(MMX_MANAGE)

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{}

#endif	/* MMX_MANAGE */

/*
 * UMUL
 *
 */

/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

#else	/* __lint */

#if defined(MMX_MANAGE)

#if defined(_KERNEL)

#define KPREEMPT_DISABLE call kpr_disable
#define KPREEMPT_ENABLE call kpr_enable

#define TEST_TS(reg)					\
        movl    %cr0, reg;				\
        clts;						\
        testl   $CR0_TS, reg

#else	/* _KERNEL */

#define KPREEMPT_DISABLE
#define KPREEMPT_ENABLE

#define TEST_TS(reg)					\
        movl    $0, reg;				\
        testl   $CR0_TS, reg

#endif	/* _KERNEL */

#define MMX_SIZE 8
#define MMX_ALIGN 8

#define SAVE_MMX_PROLOG(sreg, nreg)			\
        subl    $[MMX_SIZE \* nreg + MMX_ALIGN], %esp;	\
        movl    %esp, sreg;				\
        addl    $MMX_ALIGN, sreg;			\
        andl    $-1![MMX_ALIGN-1], sreg;

#define RSTOR_MMX_EPILOG(nreg)				\
        addl    $[MMX_SIZE \* nreg + MMX_ALIGN], %esp;

#define SAVE_MMX_0TO4(sreg)				\
        SAVE_MMX_PROLOG(sreg, 5);			\
        movq    %mm0, 0(sreg);				\
        movq    %mm1, 8(sreg);				\
        movq    %mm2, 16(sreg);				\
        movq    %mm3, 24(sreg);				\
        movq    %mm4, 32(sreg)

#define RSTOR_MMX_0TO4(sreg)				\
        movq    0(sreg), %mm0;				\
        movq    8(sreg), %mm1;				\
        movq    16(sreg), %mm2;				\
        movq    24(sreg), %mm3;				\
        movq    32(sreg), %mm4;				\
        RSTOR_MMX_EPILOG(5)

#endif	/* MMX_MANAGE */

/ Note: this file contains implementations for
/	big_mul_set_vec()
/	big_mul_add_vec()
/	big_mul_vec()
/	big_sqr_vec()
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum implementation gets "imprinted"
/ on the correct implementation, at the time of first use,
/ so none of the code for the other implementations is ever
/ executed.  So, it is a no-brainer to lay out the code to minimize
/ the "footprint" of executed code.

/ Can we use SSE2 instructions?  Return value is non-zero
/ if we can.
/
/ Note:
/   Using the cpuid instruction directly would work equally
/   well in userland and in the kernel, but we do not use the
/   cpuid instruction in the kernel, we use the x86_feature
/   variable, instead.  This means we honor any decisions
/   the kernel startup code may have made in setting this
/   variable, including disabling SSE2 because of settings
/   in /etc/system.  It might even be a good idea to honor
/   this kind of setting in userland, as well, but the variable
/   x86_feature is not readily available to userland processes.
/
/ uint32_t
/ bignum_use_sse2()

        ENTRY(bignum_use_sse2)
#if defined(_KERNEL)
        movl    x86_feature, %eax
        andl    $X86_SSE2, %eax
#else	/* _KERNEL */
        pushl   %ebx
        movl    $1, %eax		/ Get feature information
        cpuid
        movl    %edx, %eax		/ set return value
        popl    %ebx
        andl    $CPUID_INTC_EDX_SSE2, %eax
#endif	/* _KERNEL */
        ret
        SET_SIZE(bignum_use_sse2)


/ ------------------------------------------------------------------------
/		SSE2 Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
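
/ For reference, here is a C-level sketch of what big_mul_set_vec_sse2_r()
/ computes (an illustrative sketch only, not part of the build; the name
/ big_mul_set_vec_c and the uint64_t intermediate are assumptions made
/ just for this comment):
/
/ uint32_t
/ big_mul_set_vec_c(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/ {
/	uint64_t p, cy = 0;
/	int i;
/
/	for (i = 0; i < len; i++) {
/		p = (uint64_t)a[i] * digit + cy;	/* 32x32->64 product */
/		r[i] = (uint32_t)p;			/* product[31..0] */
/		cy = p >> 32;				/* product[63..32] */
/	}
/	return ((uint32_t)cy);
/ }
/
/ The SSE2 code below keeps cy in %mm0 and digit in %mm3, forming the same
/ 64-bit product-plus-carry with PMULUDQ/PADDQ.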
#define UNROLL		8
#define UNROLL32	32

        ENTRY(big_mul_set_vec_sse2_r)
        xorl    %eax, %eax	/ if (len == 0) return (0);
        testl   %ecx, %ecx
        jz      .L17

        pxor    %mm0, %mm0	/ cy = 0

.L15:
        cmpl    $UNROLL, %ecx
        jl      .L16
        movd    0(%ebx), %mm1	/ 1: mm1 = a[i]
        pmuludq %mm3, %mm1	/ 1: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
        movd    4(%ebx), %mm1	/ 2: mm1 = a[i]
        movd    %mm0, 0(%edx)	/ 1: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 1: cy = product[63..32]
        pmuludq %mm3, %mm1	/ 2: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
        movd    8(%ebx), %mm1	/ 3: mm1 = a[i]
        movd    %mm0, 4(%edx)	/ 2: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 2: cy = product[63..32]
        pmuludq %mm3, %mm1	/ 3: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
        movd    12(%ebx), %mm1	/ 4: mm1 = a[i]
        movd    %mm0, 8(%edx)	/ 3: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 3: cy = product[63..32]
        pmuludq %mm3, %mm1	/ 4: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
        movd    16(%ebx), %mm1	/ 5: mm1 = a[i]
        movd    %mm0, 12(%edx)	/ 4: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 4: cy = product[63..32]
        pmuludq %mm3, %mm1	/ 5: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
        movd    20(%ebx), %mm1	/ 6: mm1 = a[i]
        movd    %mm0, 16(%edx)	/ 5: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 5: cy = product[63..32]
        pmuludq %mm3, %mm1	/ 6: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
        movd    24(%ebx), %mm1	/ 7: mm1 = a[i]
        movd    %mm0, 20(%edx)	/ 6: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 6: cy = product[63..32]
        pmuludq %mm3, %mm1	/ 7: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
        movd    28(%ebx), %mm1	/ 8: mm1 = a[i]
        movd    %mm0, 24(%edx)	/ 7: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 7: cy = product[63..32]
        pmuludq %mm3, %mm1	/ 8: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
        movd    %mm0, 28(%edx)	/ 8: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 8: cy = product[63..32]

        leal    UNROLL32(%ebx), %ebx	/ a += UNROLL
        leal    UNROLL32(%edx), %edx	/ r += UNROLL
        subl    $UNROLL, %ecx		/ len -= UNROLL
        jz      .L17
        jmp     .L15

.L16:
        movd    0(%ebx), %mm1	/ 1: mm1 = a[i]
        pmuludq %mm3, %mm1	/ 1: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
        movd    %mm0, 0(%edx)	/ 1: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 1: cy = product[63..32]
        subl    $1, %ecx
        jz      .L17
        movd    4(%ebx), %mm1	/ 2: mm1 = a[i]
        pmuludq %mm3, %mm1	/ 2: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
        movd    %mm0, 4(%edx)	/ 2: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 2: cy = product[63..32]
        subl    $1, %ecx
        jz      .L17
        movd    8(%ebx), %mm1	/ 3: mm1 = a[i]
        pmuludq %mm3, %mm1	/ 3: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
        movd    %mm0, 8(%edx)	/ 3: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 3: cy = product[63..32]
        subl    $1, %ecx
        jz      .L17
        movd    12(%ebx), %mm1	/ 4: mm1 = a[i]
        pmuludq %mm3, %mm1	/ 4: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
        movd    %mm0, 12(%edx)	/ 4: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 4: cy = product[63..32]
        subl    $1, %ecx
        jz      .L17
        movd    16(%ebx), %mm1	/ 5: mm1 = a[i]
        pmuludq %mm3, %mm1	/ 5: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
        movd    %mm0, 16(%edx)	/ 5: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 5: cy = product[63..32]
        subl    $1, %ecx
        jz      .L17
        movd    20(%ebx), %mm1	/ 6: mm1 = a[i]
        pmuludq %mm3, %mm1	/ 6: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
        movd    %mm0, 20(%edx)	/ 6: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 6: cy = product[63..32]
        subl    $1, %ecx
        jz      .L17
        movd    24(%ebx), %mm1	/ 7: mm1 = a[i]
        pmuludq %mm3, %mm1	/ 7: mm1 = digit * a[i]
        paddq   %mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
        movd    %mm0, 24(%edx)	/ 7: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 7: cy = product[63..32]

.L17:
        movd    %mm0, %eax	/ return (cy)
        / no emms.  caller is responsible for emms
        ret
        SET_SIZE(big_mul_set_vec_sse2_r)


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r		 8(%ebp)	%edx
/ a		12(%ebp)	%ebx
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.

#if defined(MMX_MANAGE)
        ENTRY(big_mul_set_vec_sse2)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ebx
        pushl   %esi
        KPREEMPT_DISABLE
        TEST_TS(%ebx)
        pushl   %ebx
        jnz     .setvec_no_save
        pushl   %edi
        SAVE_MMX_0TO4(%edi)
        movl    8(%ebp), %edx
        movl    12(%ebp), %ebx
        movl    16(%ebp), %ecx
        movd    20(%ebp), %mm3
        call    big_mul_set_vec_sse2_r
        movl    %eax, %esi
        RSTOR_MMX_0TO4(%edi)
        popl    %edi
        jmp     .setvec_rtn

.setvec_no_save:
        movl    8(%ebp), %edx
        movl    12(%ebp), %ebx
        movl    16(%ebp), %ecx
        movd    20(%ebp), %mm3
        call    big_mul_set_vec_sse2_r
        movl    %eax, %esi

.setvec_rtn:
        emms
        popl    %ebx
        movl    %ebx, %cr0
        KPREEMPT_ENABLE
        movl    %esi, %eax
        popl    %esi
        popl    %ebx
        leave
        ret
        SET_SIZE(big_mul_set_vec_sse2)

        ENTRY(big_mul_set_vec_sse2_nsv)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ebx
        movl    8(%ebp), %edx
        movl    12(%ebp), %ebx
        movl    16(%ebp), %ecx
        movd    20(%ebp), %mm3
        call    big_mul_set_vec_sse2_r
        popl    %ebx
        leave
        ret
        SET_SIZE(big_mul_set_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r		 8(%ebp)	%edx
/ a		12(%ebp)	%ebx
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%mm3

        ENTRY(big_mul_set_vec_sse2)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ebx
        movl    8(%ebp), %edx
        movl    12(%ebp), %ebx
        movl    16(%ebp), %ecx
        movd    20(%ebp), %mm3
        call    big_mul_set_vec_sse2_r
        popl    %ebx
        emms
        leave
        ret
        SET_SIZE(big_mul_set_vec_sse2)

#endif	/* MMX_MANAGE */


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
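
/ For reference, a C-level sketch of the multiply-accumulate step that
/ big_mul_add_vec_sse2_r() implements (illustrative only; the name
/ big_mul_add_vec_c and the uint64_t intermediate are assumptions made
/ just for this comment):
/
/ uint32_t
/ big_mul_add_vec_c(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/ {
/	uint64_t p, cy = 0;
/	int i;
/
/	for (i = 0; i < len; i++) {
/		p = (uint64_t)a[i] * digit + r[i] + cy;	/* fits in 64 bits */
/		r[i] = (uint32_t)p;			/* product[31..0] */
/		cy = p >> 32;				/* product[63..32] */
/	}
/	return ((uint32_t)cy);
/ }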
        ENTRY(big_mul_add_vec_sse2_r)
        xorl    %eax, %eax
        testl   %ecx, %ecx
        jz      .L27

        pxor    %mm0, %mm0	/ cy = 0

.L25:
        cmpl    $UNROLL, %ecx
        jl      .L26
        movd    0(%ebx), %mm1	/ 1: mm1 = a[i]
        movd    0(%edx), %mm2	/ 1: mm2 = r[i]
        pmuludq %mm3, %mm1	/ 1: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
        movd    4(%ebx), %mm1	/ 2: mm1 = a[i]
        paddq   %mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 0(%edx)	/ 1: r[i] = product[31..0]
        movd    4(%edx), %mm2	/ 2: mm2 = r[i]
        psrlq   $32, %mm0	/ 1: cy = product[63..32]

        pmuludq %mm3, %mm1	/ 2: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
        movd    8(%ebx), %mm1	/ 3: mm1 = a[i]
        paddq   %mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 4(%edx)	/ 2: r[i] = product[31..0]
        movd    8(%edx), %mm2	/ 3: mm2 = r[i]
        psrlq   $32, %mm0	/ 2: cy = product[63..32]

        pmuludq %mm3, %mm1	/ 3: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
        movd    12(%ebx), %mm1	/ 4: mm1 = a[i]
        paddq   %mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 8(%edx)	/ 3: r[i] = product[31..0]
        movd    12(%edx), %mm2	/ 4: mm2 = r[i]
        psrlq   $32, %mm0	/ 3: cy = product[63..32]

        pmuludq %mm3, %mm1	/ 4: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
        movd    16(%ebx), %mm1	/ 5: mm1 = a[i]
        paddq   %mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 12(%edx)	/ 4: r[i] = product[31..0]
        movd    16(%edx), %mm2	/ 5: mm2 = r[i]
        psrlq   $32, %mm0	/ 4: cy = product[63..32]

        pmuludq %mm3, %mm1	/ 5: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
        movd    20(%ebx), %mm1	/ 6: mm1 = a[i]
        paddq   %mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 16(%edx)	/ 5: r[i] = product[31..0]
        movd    20(%edx), %mm2	/ 6: mm2 = r[i]
        psrlq   $32, %mm0	/ 5: cy = product[63..32]

        pmuludq %mm3, %mm1	/ 6: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
        movd    24(%ebx), %mm1	/ 7: mm1 = a[i]
        paddq   %mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 20(%edx)	/ 6: r[i] = product[31..0]
        movd    24(%edx), %mm2	/ 7: mm2 = r[i]
        psrlq   $32, %mm0	/ 6: cy = product[63..32]

        pmuludq %mm3, %mm1	/ 7: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
        movd    28(%ebx), %mm1	/ 8: mm1 = a[i]
        paddq   %mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 24(%edx)	/ 7: r[i] = product[31..0]
        movd    28(%edx), %mm2	/ 8: mm2 = r[i]
        psrlq   $32, %mm0	/ 7: cy = product[63..32]

        pmuludq %mm3, %mm1	/ 8: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 8: mm2 = digit * a[i] + r[i]
        paddq   %mm2, %mm0	/ 8: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 28(%edx)	/ 8: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 8: cy = product[63..32]

        leal    UNROLL32(%ebx), %ebx	/ a += UNROLL
        leal    UNROLL32(%edx), %edx	/ r += UNROLL
        subl    $UNROLL, %ecx		/ len -= UNROLL
        jz      .L27
        jmp     .L25

.L26:
        movd    0(%ebx), %mm1	/ 1: mm1 = a[i]
        movd    0(%edx), %mm2	/ 1: mm2 = r[i]
        pmuludq %mm3, %mm1	/ 1: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
        paddq   %mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 0(%edx)	/ 1: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 1: cy = product[63..32]
        subl    $1, %ecx
        jz      .L27

        movd    4(%ebx), %mm1	/ 2: mm1 = a[i]
        movd    4(%edx), %mm2	/ 2: mm2 = r[i]
        pmuludq %mm3, %mm1	/ 2: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
        paddq   %mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 4(%edx)	/ 2: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 2: cy = product[63..32]
        subl    $1, %ecx
        jz      .L27

        movd    8(%ebx), %mm1	/ 3: mm1 = a[i]
        movd    8(%edx), %mm2	/ 3: mm2 = r[i]
        pmuludq %mm3, %mm1	/ 3: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
        paddq   %mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 8(%edx)	/ 3: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 3: cy = product[63..32]
        subl    $1, %ecx
        jz      .L27

        movd    12(%ebx), %mm1	/ 4: mm1 = a[i]
        movd    12(%edx), %mm2	/ 4: mm2 = r[i]
        pmuludq %mm3, %mm1	/ 4: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
        paddq   %mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 12(%edx)	/ 4: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 4: cy = product[63..32]
        subl    $1, %ecx
        jz      .L27

        movd    16(%ebx), %mm1	/ 5: mm1 = a[i]
        movd    16(%edx), %mm2	/ 5: mm2 = r[i]
        pmuludq %mm3, %mm1	/ 5: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
        paddq   %mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 16(%edx)	/ 5: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 5: cy = product[63..32]
        subl    $1, %ecx
        jz      .L27

        movd    20(%ebx), %mm1	/ 6: mm1 = a[i]
        movd    20(%edx), %mm2	/ 6: mm2 = r[i]
        pmuludq %mm3, %mm1	/ 6: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
        paddq   %mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 20(%edx)	/ 6: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 6: cy = product[63..32]
        subl    $1, %ecx
        jz      .L27

        movd    24(%ebx), %mm1	/ 7: mm1 = a[i]
        movd    24(%edx), %mm2	/ 7: mm2 = r[i]
        pmuludq %mm3, %mm1	/ 7: mm1 = digit * a[i]
        paddq   %mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
        paddq   %mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
        movd    %mm0, 24(%edx)	/ 7: r[i] = product[31..0]
        psrlq   $32, %mm0	/ 7: cy = product[63..32]

.L27:
        movd    %mm0, %eax
        / no emms.  caller is responsible for emms
        ret
        SET_SIZE(big_mul_add_vec_sse2_r)


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r		 8(%ebp)	%edx
/ a		12(%ebp)	%ebx
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_add_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
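
/ The MMX_MANAGE wrapper below follows this outline (a pseudocode sketch
/ added for reference; it paraphrases the assembly and refers to the
/ KPREEMPT_* and *_MMX_0TO4 macros defined near the top of this file):
/
/	KPREEMPT_DISABLE;
/	ts = CR0_TS bit of %cr0; clts;		/* TEST_TS() */
/	if (ts == 0)				/* live MMX state present */
/		SAVE_MMX_0TO4();
/	cy = big_mul_add_vec_sse2_r(r, a, len, digit);
/	if (ts == 0)
/		RSTOR_MMX_0TO4();
/	emms;
/	restore the saved %cr0 (including TS);
/	KPREEMPT_ENABLE;
/	return (cy);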
#if defined(MMX_MANAGE)

        ENTRY(big_mul_add_vec_sse2)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ebx
        pushl   %esi
        KPREEMPT_DISABLE
        TEST_TS(%ebx)
        pushl   %ebx
        jnz     .addvec_no_save
        pushl   %edi
        SAVE_MMX_0TO4(%edi)
        movl    8(%ebp), %edx
        movl    12(%ebp), %ebx
        movl    16(%ebp), %ecx
        movd    20(%ebp), %mm3
        call    big_mul_add_vec_sse2_r
        movl    %eax, %esi
        RSTOR_MMX_0TO4(%edi)
        popl    %edi
        jmp     .addvec_rtn

.addvec_no_save:
        movl    8(%ebp), %edx
        movl    12(%ebp), %ebx
        movl    16(%ebp), %ecx
        movd    20(%ebp), %mm3
        call    big_mul_add_vec_sse2_r
        movl    %eax, %esi

.addvec_rtn:
        emms
        popl    %ebx
        movl    %ebx, %cr0
        KPREEMPT_ENABLE
        movl    %esi, %eax
        popl    %esi
        popl    %ebx
        leave
        ret
        SET_SIZE(big_mul_add_vec_sse2)

        ENTRY(big_mul_add_vec_sse2_nsv)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ebx
        movl    8(%ebp), %edx
        movl    12(%ebp), %ebx
        movl    16(%ebp), %ecx
        movd    20(%ebp), %mm3
        call    big_mul_add_vec_sse2_r
        popl    %ebx
        leave
        ret
        SET_SIZE(big_mul_add_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */

        ENTRY(big_mul_add_vec_sse2)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ebx
        movl    8(%ebp), %edx
        movl    12(%ebp), %ebx
        movl    16(%ebp), %ecx
        movd    20(%ebp), %mm3
        call    big_mul_add_vec_sse2_r
        popl    %ebx
        emms
        leave
        ret
        SET_SIZE(big_mul_add_vec_sse2)

#endif	/* MMX_MANAGE */


/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/	int i;
/
/	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/	for (i = 1; i < blen; ++i)
/		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
/ }

#if defined(MMX_MANAGE)
        ENTRY(big_mul_vec_sse2_fc)
#else
        ENTRY(big_mul_vec_sse2)
#endif
        subl    $0x8, %esp
        pushl   %ebx
        pushl   %ebp
        pushl   %esi
        pushl   %edi
        movl    40(%esp), %eax
        movl    %eax, 20(%esp)
        pushl   (%eax)
        movl    40(%esp), %edi
        pushl   %edi
        movl    40(%esp), %esi
        pushl   %esi
        movl    40(%esp), %ebx
        pushl   %ebx
#if defined(MMX_MANAGE)
        call    big_mul_set_vec_sse2_nsv
#else
        call    big_mul_set_vec_sse2
#endif
        addl    $0x10, %esp
        movl    %eax, (%ebx,%edi,4)
        movl    44(%esp), %eax
        movl    %eax, 16(%esp)
        cmpl    $0x1, %eax
        jle     .mulvec_rtn
        movl    $0x1, %ebp

        .zalign 16,8
.mulvec_add:
        movl    20(%esp), %eax
        pushl   (%eax,%ebp,4)
        pushl   %edi
        pushl   %esi
        leal    (%ebx,%ebp,4), %eax
        pushl   %eax
#if defined(MMX_MANAGE)
        call    big_mul_add_vec_sse2_nsv
#else
        call    big_mul_add_vec_sse2
#endif
        addl    $0x10, %esp
        leal    (%ebp,%edi), %ecx
        movl    %eax, (%ebx,%ecx,4)
        incl    %ebp
        cmpl    16(%esp), %ebp
        jl      .mulvec_add

.mulvec_rtn:
#if defined(MMX_MANAGE)
        emms
#endif
        popl    %edi
        popl    %esi
        popl    %ebp
        popl    %ebx
        addl    $0x8, %esp
        ret
#if defined(MMX_MANAGE)
        SET_SIZE(big_mul_vec_sse2_fc)
#else
        SET_SIZE(big_mul_vec_sse2)
#endif

#if defined(MMX_MANAGE)

        ENTRY(big_mul_vec_sse2)
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        pushl   %edi
        KPREEMPT_DISABLE
        TEST_TS(%eax)
        movl    %eax, -8(%ebp)
        jnz     .mulvec_no_save
        SAVE_MMX_0TO4(%edi)
        movl    %edi, -4(%ebp)

.mulvec_no_save:
        movl    24(%ebp), %eax		/ blen
        pushl   %eax
        movl    20(%ebp), %eax		/ b
        pushl   %eax
        movl    16(%ebp), %eax		/ alen
        pushl   %eax
        movl    12(%ebp), %eax		/ a
        pushl   %eax
        movl    8(%ebp), %eax		/ r
        pushl   %eax
        call    big_mul_vec_sse2_fc
        addl    $20, %esp
        movl    -8(%ebp), %eax
        testl   $CR0_TS, %eax
        jnz     .mulvec_no_rstr
        movl    -4(%ebp), %edi
        RSTOR_MMX_0TO4(%edi)

.mulvec_no_rstr:
        movl    %eax, %cr0
        KPREEMPT_ENABLE
        popl    %edi
        leave
        ret
        SET_SIZE(big_mul_vec_sse2)

#endif	/* MMX_MANAGE */

#undef UNROLL
#undef UNROLL32


/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is
/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
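
/ For reference, the squaring code below computes the equivalent of this
/ C-level sketch (illustrative only; big_sqr_vec_c, big_mul_set_vec_c and
/ big_mul_add_vec_c are hypothetical names used in these comments, and the
/ uint64_t intermediates stand in for the 64-bit MMX arithmetic):
/
/ void
/ big_sqr_vec_c(uint32_t *r, uint32_t *a, int len)
/ {
/	uint64_t p, t, cy;
/	int i;
/
/	/* off-diagonal (triangle) products a[i] * a[j], i < j */
/	r[len] = big_mul_set_vec_c(&r[1], &a[1], len - 1, a[0]);
/	for (i = 1; i < len - 1; ++i)
/		r[len + i] = big_mul_add_vec_c(&r[2 * i + 1], &a[i + 1],
/		    len - 1 - i, a[i]);
/
/	/* double the triangle and add the diagonal squares a[i]**2 */
/	r[0] = 0;
/	r[2 * len - 1] = 0;
/	cy = 0;
/	for (i = 0; i < len; ++i) {
/		p = (uint64_t)a[i] * a[i];
/		t = ((uint64_t)r[2 * i] << 1) + (uint32_t)p + cy;
/		r[2 * i] = (uint32_t)t;
/		cy = t >> 32;
/		t = ((uint64_t)r[2 * i + 1] << 1) + (p >> 32) + cy;
/		r[2 * i + 1] = (uint32_t)t;
/		cy = t >> 32;
/	}
/ }
/
/ The assembly handles the low-order and high-order corners specially and,
/ as noted above, assumes the number has at least 3 digits.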
#if defined(MMX_MANAGE)
        ENTRY(big_sqr_vec_sse2_fc)
#else
        ENTRY(big_sqr_vec_sse2)
        pushl   %ebp
        movl    %esp, %ebp
#endif

        pushl   %ebx
        pushl   %edi
        pushl   %esi

        / r[1..alen] = a[0] * a[1..alen-1]

        movl    8(%ebp), %edi		/ r = arg(r)
        movl    12(%ebp), %esi		/ a = arg(a)
        movl    16(%ebp), %ecx		/ cnt = arg(alen)
        movd    %ecx, %mm4		/ save_cnt = arg(alen)
        leal    4(%edi), %edx		/ dst = &r[1]
        movl    %esi, %ebx		/ src = a
        movd    0(%ebx), %mm3		/ mm3 = a[0]
        leal    4(%ebx), %ebx		/ src = &a[1]
        subl    $1, %ecx		/ --cnt
        call    big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
        movl    %edi, %edx		/ dst = r
        movl    %esi, %ebx		/ src = a
        movd    %mm4, %ecx		/ cnt = save_cnt
        movl    %eax, (%edx, %ecx, 4)	/ r[cnt] = cy

/	/* High-level vector C pseudocode */
/	for (i = 1; i < alen-1; ++i)
/		r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/	/* Same thing, but slightly lower level C-like pseudocode */
/	i = 1;
/	r = &arg_r[2*i + 1];
/	a = &arg_a[i + 1];
/	digit = arg_a[i];
/	cnt = alen - 3;
/	while (cnt != 0) {
/		r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/		r += 2;
/		++a;
/		--cnt;
/	}
/
/	/* Same thing, but even lower level
/	 * For example, pointers are raw pointers,
/	 * with no scaling by object size.
/	 */
/	r = arg_r + 12;	/* i == 1; 2i + 1 == 3; 4*3 == 12; */
/	a = arg_a + 8;
/	digit = *(arg_a + 4);
/	cnt = alen - 3;
/	while (cnt != 0) {
/		cy = big_mul_add_vec_sse2_r();
/		*(r + 4 * cnt) = cy;
/		r += 8;
/		a += 4;
/		--cnt;
/	}

        leal    4(%edi), %edi		/ r += 4; r = &r[1]
        leal    4(%esi), %esi		/ a += 4; a = &a[1]
        movd    %mm4, %ecx		/ cnt = save
        subl    $2, %ecx		/ cnt = alen - 2; i in 1..alen-2
        movd    %ecx, %mm4		/ save_cnt
        jecxz   .L32			/ while (cnt != 0) {
.L31:
        movd    0(%esi), %mm3		/ digit = a[i]
        leal    4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
        leal    8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
        movl    %edi, %edx		/ edx = r
        movl    %esi, %ebx		/ ebx = a
        cmp     $1, %ecx		/ The last triangle term is special
        jz      .L32

        call    big_mul_add_vec_sse2_r
        movd    %mm4, %ecx		/ cnt = save_cnt
        movl    %eax, (%edi, %ecx, 4)	/ r[cnt] = cy

        subl    $1, %ecx		/ --cnt
        movd    %ecx, %mm4		/ save_cnt = cnt
        jmp     .L31			/ }

.L32:
        movd    0(%ebx), %mm1		/ mm1 = a[i + 1]
        movd    0(%edx), %mm2		/ mm2 = r[2 * i + 1]
        pmuludq %mm3, %mm1		/ mm1 = p = digit * a[i + 1]
        paddq   %mm1, %mm2		/ mm2 = r[2 * i + 1] + p
        movd    %mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
        psrlq   $32, %mm2		/ mm2 = cy
        movd    %mm2, 4(%edx)		/ r[2 * i + 2] = cy
        pxor    %mm2, %mm2
        movd    %mm2, 8(%edx)		/ r[2 * i + 3] = 0

        movl    8(%ebp), %edx		/ r = arg(r)
        movl    12(%ebp), %ebx		/ a = arg(a)
        movl    16(%ebp), %ecx		/ cnt = arg(alen)

        / compute low-order corner
        / p = a[0]**2
        / r[0] = lo32(p)
        / cy = hi32(p)
        movd    0(%ebx), %mm2		/ mm2 = a[0]
        pmuludq %mm2, %mm2		/ mm2 = p = a[0]**2
        movd    %mm2, 0(%edx)		/ r[0] = lo32(p)
        psrlq   $32, %mm2		/ mm2 = cy = hi32(p)

        / p = 2 * r[1]
        / t = p + cy
        / r[1] = lo32(t)
        / cy = hi32(t)
        movd    4(%edx), %mm1		/ mm1 = r[1]
        psllq   $1, %mm1		/ mm1 = p = 2 * r[1]
        paddq   %mm1, %mm2		/ mm2 = t = p + cy
        movd    %mm2, 4(%edx)		/ r[1] = low32(t)
        psrlq   $32, %mm2		/ mm2 = cy = hi32(t)

        / r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
        subl    $2, %ecx		/ cnt = alen - 2
.L34:
        movd    4(%ebx), %mm0		/ mm0 = diag = a[i+1]
        pmuludq %mm0, %mm0		/ mm0 = p = diag**2
        paddq   %mm0, %mm2		/ mm2 = t = p + cy
        movd    %mm2, %eax
        movd    %eax, %mm1		/ mm1 = lo32(t)
        psrlq   $32, %mm2		/ mm2 = hi32(t)

        movd    8(%edx), %mm3		/ mm3 = r[2*i]
        psllq   $1, %mm3		/ mm3 = 2*r[2*i]
        paddq   %mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
        movd    %mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)
        psrlq   $32, %mm1
        paddq   %mm1, %mm2

        movd    12(%edx), %mm3		/ mm3 = r[2*i+1]
        psllq   $1, %mm3		/ mm3 = 2*r[2*i+1]
        paddq   %mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
        movd    %mm2, 12(%edx)		/ r[2*i+1] = mm2
        psrlq   $32, %mm2		/ mm2 = cy
        leal    8(%edx), %edx		/ r += 2
        leal    4(%ebx), %ebx		/ ++a
        subl    $1, %ecx		/ --cnt
        jnz     .L34

        / Carry from last triangle term must participate in doubling,
        / but this step isn't paired up with a squaring of the elements
        / of the inner diagonal.
        / r[$-3..$-2] += 2 * r[$-3..$-2] + cy
        movd    8(%edx), %mm3		/ mm3 = r[2*i]
        psllq   $1, %mm3		/ mm3 = 2*r[2*i]
        paddq   %mm3, %mm2		/ mm2 = 2*r[2*i] + cy
        movd    %mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
        psrlq   $32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)

        movd    12(%edx), %mm3		/ mm3 = r[2*i+1]
        psllq   $1, %mm3		/ mm3 = 2*r[2*i+1]
        paddq   %mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
        movd    %mm2, 12(%edx)		/ r[2*i+1] = mm2
        psrlq   $32, %mm2		/ mm2 = cy

        / compute high-order corner and add it in
        / p = a[alen - 1]**2
        / t = p + cy
        / r[alen + alen - 2] += lo32(t)
        / cy = hi32(t)
        / r[alen + alen - 1] = cy
        movd    4(%ebx), %mm0		/ mm0 = a[$-1]
        movd    8(%edx), %mm3		/ mm3 = r[$-2]
        pmuludq %mm0, %mm0		/ mm0 = p = a[$-1]**2
        paddq   %mm0, %mm2		/ mm2 = t = p + cy
        paddq   %mm3, %mm2		/ mm2 = r[$-2] + t
        movd    %mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
        psrlq   $32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)
        movd    12(%edx), %mm3
        paddq   %mm3, %mm2
        movd    %mm2, 12(%edx)		/ r[$-1] += cy

.L35:
        emms
        popl    %esi
        popl    %edi
        popl    %ebx

#if defined(MMX_MANAGE)
        ret
        SET_SIZE(big_sqr_vec_sse2_fc)
#else
        leave
        ret
        SET_SIZE(big_sqr_vec_sse2)
#endif


#if defined(MMX_MANAGE)
        ENTRY(big_sqr_vec_sse2)
        pushl   %ebp
        movl    %esp, %ebp
        KPREEMPT_DISABLE
        TEST_TS(%ebx)
        pushl   %ebx
        jnz     .sqr_no_save
        pushl   %edi
        SAVE_MMX_0TO4(%edi)
        call    big_sqr_vec_sse2_fc
        RSTOR_MMX_0TO4(%edi)
        popl    %edi
        jmp     .sqr_rtn

.sqr_no_save:
        call    big_sqr_vec_sse2_fc

.sqr_rtn:
        popl    %ebx
        movl    %ebx, %cr0
        KPREEMPT_ENABLE
        leave
        ret
        SET_SIZE(big_sqr_vec_sse2)

#endif	/* MMX_MANAGE */


/ ------------------------------------------------------------------------
/		UMUL Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r		 8(%ebp)	%edx	%edi
/ a		12(%ebp)	%ebx	%esi
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%esi

        ENTRY(big_mul_set_vec_umul)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        movl    16(%ebp), %ecx
        xorl    %ebx, %ebx	/ cy = 0
        testl   %ecx, %ecx
        movl    8(%ebp), %edi
        movl    12(%ebp), %esi
        je      .L57

.L55:
        movl    (%esi), %eax	/ eax = a[i]
        leal    4(%esi), %esi	/ ++a
        mull    20(%ebp)	/ edx:eax = a[i] * digit
        addl    %ebx, %eax
        adcl    $0, %edx	/ edx:eax = a[i] * digit + cy
        movl    %eax, (%edi)	/ r[i] = product[31..0]
        movl    %edx, %ebx	/ cy = product[63..32]
        leal    4(%edi), %edi	/ ++r
        decl    %ecx		/ --len
        jnz     .L55		/ while (len != 0)
.L57:
        movl    %ebx, %eax
        popl    %ebx
        popl    %edi
        popl    %esi
        leave
        ret
        SET_SIZE(big_mul_set_vec_umul)


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r		 8(%ebp)	%edx	%edi
/ a		12(%ebp)	%ebx	%esi
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%esi

        ENTRY(big_mul_add_vec_umul)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        movl    16(%ebp), %ecx
        xorl    %ebx, %ebx	/ cy = 0
        testl   %ecx, %ecx
        movl    8(%ebp), %edi
        movl    12(%ebp), %esi
        je      .L67

        .align 4
.L65:
        movl    (%esi), %eax	/ eax = a[i]
        leal    4(%esi), %esi	/ ++a
        mull    20(%ebp)	/ edx:eax = a[i] * digit
        addl    (%edi), %eax
        adcl    $0, %edx	/ edx:eax = a[i] * digit + r[i]
        addl    %ebx, %eax
        adcl    $0, %edx	/ edx:eax = a[i] * digit + r[i] + cy
        movl    %eax, (%edi)	/ r[i] = product[31..0]
        movl    %edx, %ebx	/ cy = product[63..32]
        leal    4(%edi), %edi	/ ++r
        decl    %ecx		/ --len
        jnz     .L65		/ while (len != 0)
.L67:
        movl    %ebx, %eax
        popl    %ebx
        popl    %edi
        popl    %esi
        leave
        ret
        SET_SIZE(big_mul_add_vec_umul)

#endif	/* __lint */

#endif	/* !__GNUC_AS__ */