1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 23 */ 24/* 25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 26 * Use is subject to license terms. 27 */ 28 29 .file "__vsqrtf.S" 30 31#include "libm.h" 32 33 ENTRY(__vsqrtf) 34 push %rbp 35 movq %rsp,%rbp 36 37/ on entry: 38/ %edi = n 39/ %rsi = x 40/ %edx = stridex 41/ %rcx = y 42/ %r8d = stridey 43 44 movslq %edx,%rdx / sign extend and scale strides 45 shlq $2,%rdx 46 movslq %r8d,%r8 47 shlq $2,%r8 48 49 cmpl $4,%edi 50 jl .finish 51 52 cmpq $4,%rdx 53 jne .nonunit 54 cmpq $4,%r8 55 jne .nonunit 56 57/ unit-stride case 58 movq %rdx,%r9 59 shlq $2,%r9 60 movq %r8,%r10 61 shlq $2,%r10 62 63 .align 16 64.loop: 65 movups (%rsi),%xmm0 66 addq %r9,%rsi 67 sqrtps %xmm0,%xmm0 68 movups %xmm0,(%rcx) 69 addq %r10,%rcx 70 subl $4,%edi 71 cmpl $4,%edi 72 jge .loop 73 74.finish: 75 testl %edi,%edi 76 jle .done 77 78.finish_loop: 79 movss (%rsi),%xmm0 80 addq %rdx,%rsi 81 sqrtss %xmm0,%xmm0 82 movss %xmm0,(%rcx) 83 addq %r8,%rcx 84 decl %edi 85 jg .finish_loop 86 87.done: 88 leave 89 ret 90 91 .align 16 92.nonunit: 93 movss (%rsi),%xmm0 94 addq %rdx,%rsi 95 movss (%rsi),%xmm1 96 addq %rdx,%rsi 97 movss (%rsi),%xmm2 98 addq %rdx,%rsi 99 movss (%rsi),%xmm3 100 addq %rdx,%rsi 101 102 movlhps %xmm1,%xmm0 / xmm0: 0 x1 0 x0 103 movlhps %xmm3,%xmm2 / xmm2: 0 x3 0 x2 104 shufps $0x88,%xmm2,%xmm0 / xmm0: x3 x2 x1 x0 105 106 sqrtps %xmm0,%xmm0 / xmm0: y3 y2 y1 y0 107 108 movaps %xmm0,%xmm1 / xmm1: y3 y2 y1 y0 109 shufps $0xf5,%xmm0,%xmm1 / xmm1: y3 y3 y1 y1 110 movhlps %xmm0,%xmm2 / xmm2: 0 x3 y3 y2 111 movhlps %xmm1,%xmm3 / xmm3: 0 0 y3 y3 112 113 movss %xmm0,(%rcx) 114 addq %r8,%rcx 115 movss %xmm1,(%rcx) 116 addq %r8,%rcx 117 movss %xmm2,(%rcx) 118 addq %r8,%rcx 119 movss %xmm3,(%rcx) 120 addq %r8,%rcx 121 122 subl $4,%edi 123 cmpl $4,%edi 124 jge .nonunit 125 126 jmp .finish 127 128 SET_SIZE(__vsqrtf) 129