/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/asm_linkage.h>

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/*
 * Lint-only stand-ins.  They give lint the C prototypes of the three
 * assembly routines below; none of them performs any arithmetic.
 */

/* ARGSUSED */
uint64_t
big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }

/* ARGSUSED */
uint64_t
big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }

/* ARGSUSED */
void
big_sqr_vec(uint64_t *r, uint64_t *a, int len)
{}

#else	/* lint */

/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_set_vec which exploits
/  the 64X64->128 bit  unsigned multiply instruction.
/
/  As defined in the Sun bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register use:
/	%rdi	r (advanced by 8 digits per unrolled pass)
/	%rsi	a (advanced by 8 digits per unrolled pass)
/	%r8	remaining length (len arrives in %edx; mul clobbers %rdx)
/	%rcx	digit
/	%r9	cy, the running carry and the eventual return value
/	%r11	next digit of a, loaded ahead of the multiply
/
/ len is a C int, so only %edx is meaningful on entry.  It is
/ sign-extended here and any len <= 0 returns 0 without touching r or a.
/
	ENTRY(big_mul_set_vec)
	xorq	%r9, %r9		/ cy = 0; must be valid on every exit path
	movslq	%edx, %r8		/ r8 = (int)len; %rdx is used by mul
	testq	%r8, %r8
	jle	.L17			/ if (len <= 0) return (0)

.L15:
	cmpq	$8, %r8			/ len - 8
	jb	.L16			/ fewer than 8 digits left: use the tail
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L17			/ nothing left: done
	jmp	.L15

/ Tail: between 1 and 7 digits remain, handled one at a time.
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	/ The tail is entered with len < 8, so a[6] is the last possible
	/ digit: fall through to the exit.

.L17:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_set_vec)


/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_add_vec which exploits
/  the 64X64->128 bit  unsigned multiply instruction.
/
/  As defined in the Sun bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register use is the same as in big_mul_set_vec, plus:
/	%r10	current digit of r, loaded ahead of its use
/
/ Each step adds r[i] and then cy to the 128-bit product as two separate
/ add/adc pairs, so a carry out of either addition lands in hi(p).
/
/ len is a C int, so only %edx is meaningful on entry.  It is
/ sign-extended here and any len <= 0 returns 0 without touching r or a.
/
	ENTRY(big_mul_add_vec)
	xorq	%r9, %r9		/ cy = 0; must be valid on every exit path
	movslq	%edx, %r8		/ r8 = (int)len; %rdx is used by mul
	testq	%r8, %r8
	jle	.L27			/ if (len <= 0) return (0)

.L25:
	cmpq	$8, %r8			/ len - 8
	jb	.L26			/ fewer than 8 digits left: use the tail
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L27			/ nothing left: done
	jmp	.L25

/ Tail: between 1 and 7 digits remain, handled one at a time.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	/ The tail is entered with len < 8, so a[6] is the last possible
	/ digit: fall through to the exit.

.L27:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_add_vec)


/ ------------------------------------------------------------------------
/
/ r = a * a, a is a vector of length len; digits r[0] .. r[2 * len - 1]
/ are written.
/
/ void
/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
/
/ Two phases:
/
/   1. Off-diagonal products.  Row i multiplies a[i+1 .. len-1] by a[i]
/      and accumulates the result at r[2*i + 1], using big_mul_set_vec
/      for the first row and big_mul_add_vec for the rest.  The carry of
/      each row is stored just past the digits that row touched.
/
/   2. Doubling and diagonal.  Walking up r two digits per row, every
/      digit written in phase 1 is doubled and a[row]**2 is added at
/      r[2 * row], with the carry propagated into the next digit.
/
/ len is a C int, so only %edx is meaningful on entry.  len <= 0 returns
/ without reading a or writing r.  len == 1 is handled directly because
/ the row loop in phase 1 is only correct for len >= 2.
/
/ ------------------------------------------------------------------------

	ENTRY(big_sqr_vec)
	movslq	%edx, %rdx		/ rdx = (int)len
	cmpq	$1, %rdx
	jg	.L30			/ len >= 2: general case
	jl	.L35			/ len <= 0: nothing to do
	movq	0(%rsi), %rax		/ len == 1:
	mulq	%rax			/ %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64
	movq	%rdx, 8(%rdi)		/ r[1] = hi64
.L35:
	ret

.L30:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	/ Phase 1, first row: tr[0 .. tlen-1] = ta[1 .. tlen] * ta[0]
	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy

	/ Phase 1, remaining rows: each one is a digit shorter and starts
	/ two digits further up r.
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:

/ No more function calls after this.
/ Restore arguments to registers.
/ However, %rdx is not used for arg3, len, because it is heavily
/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8  == arg3 == len

	/ Phase 2, row 0: r[0] = lo64(a[0]**2), r[1] = 2 * r[1] + hi64
	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1

	/ Phase 2, rows 1 .. len-1.  cy is carried in %rcx:%r9 between the
	/ two halves of a row because it can exceed 64 bits there.
.L33:
	cmpq	%r8, %r11		/ row - len
	jae	.L34			/ while (row < len)

	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

	SET_SIZE(big_sqr_vec)

#endif	/* lint */