// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "../assembly.h"

// float __floatundisf(du_int a);
//
// Converts an unsigned 64-bit integer to single-precision float on i386.
//
// As read by the code below, the argument is taken from the stack: the low
// 32 bits at 4(%esp) and the high 32 bits at 8(%esp).  The result is returned
// on the x87 stack (the final instruction before ret is flds).  The argument
// slot at 4(%esp) is reused as scratch storage and is overwritten.

// Note that there is a hardware instruction, fildll, that does most of what
// this function needs to do.  However, because of our ia32 ABI, it will take
// a write-small read-large stall, so the software implementation here is
// actually several cycles faster.

// This is a branch-free implementation.  A branchy implementation might be
// faster for the common case if you know something a priori about the input
// distribution.

/* branch-free x87 implementation - one cycle slower than without x87.
// Kept for reference only; this whole variant is disabled by the enclosing
// block comment.
//
// Idea: fildll loads the 64 bits as a *signed* integer.  %eax becomes -1 when
// the top bit of the input is set and 0 otherwise, so TWOp64 selects either
// the quad just before twop64 (0x43f0000000000000, i.e. 2^64 as a double) or
// twop64 itself (0.0), which is then added to undo the signed interpretation.

#ifdef __i386__

CONST_SECTION
// NOTE(review): .balign takes a byte count and 3 is not a power of two;
// this would presumably need to be 8 if this variant is ever revived.
.balign 3

		.quad	0x43f0000000000000
twop64:	.quad	0x0000000000000000

#define			TWOp64			twop64-0b(%ecx,%eax,8)

.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	movl		8(%esp),		%eax
	movd		8(%esp),		%xmm1
	movd		4(%esp),		%xmm0
	punpckldq	%xmm1,			%xmm0
	calll		0f
0:	popl		%ecx
	sarl		$31,			%eax
	movq		%xmm0,			4(%esp)
	fildll		4(%esp)
	faddl		TWOp64
	fstps		4(%esp)
	flds		4(%esp)
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__

*/

// branch-free, x87-free implementation - faster at the expense of code size
//
// Outline:
//   * "big input" below means input >= 2^52, i.e. the value does not fit in
//     the 52-bit mantissa field used by the 2^52 bias trick.
//   * For a big input the value is shifted right by 12 and the 12 bits that
//     were shifted out are OR-ed back into the low bits so that they still
//     influence rounding (sticky bits).
//   * The integer is converted to double by OR-ing it into the mantissa of
//     0x1.0p52 and subtracting 0x1.0p52, then rounded to float by cvtsd2ss.
//   * For a big input the float exponent field is finally bumped by 12 to
//     undo the earlier right shift.

#ifdef __i386__

CONST_SECTION

	.balign 16
twop52:
	.quad 0x4330000000000000	// 0x1.0p52 as a double
	.quad 0x0000000000000fff	// sticky mask; reached as STICKY with %eax == -1

	.balign 16
sticky:
	.quad 0x0000000000000000	// zero mask; reached as STICKY with %eax == 0
	.long 0x00000012		// not read by the code in this file

	.balign 16
twelve:
	.long 0x00000000		// label is not referenced by the code in this file

// Both operands are PC-relative: %ecx holds the address of local label 0
// (obtained with the calll/popl pair below).  STICKY additionally indexes by
// %eax * 8, where %eax is 0 or -1, selecting sticky+0 or sticky-8.
#define			TWOp52			twop52-0b(%ecx)
#define			STICKY			sticky-0b(%ecx,%eax,8)

.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	movl		8(%esp),		%eax	// high 32 bits of input
	movd		8(%esp),		%xmm1	// high 32 bits of input
	movd		4(%esp),		%xmm0	// low 32 bits of input
	punpckldq	%xmm1,			%xmm0	// low 64 bits of xmm0 = input

	calll		0f				// push the address of label 0 ...
0:	popl		%ecx					// ... and pop it: %ecx = PC base for TWOp52/STICKY
	shrl		%eax					// high 31 bits of input as sint32
	addl		$0x7ff80000,	%eax			// sign bit becomes set iff input >= 2^52
	sarl		$31,			%eax	// (big input) ? -1 : 0
	movsd		STICKY,			%xmm1	// (big input) ? 0xfff : 0
	movl		$12,			%edx
	andl		%eax,			%edx	// (big input) ? 12 : 0
	movd		%edx,			%xmm3
	andpd		%xmm0,			%xmm1	// (big input) ? input & 0xfff : 0
	movsd		TWOp52,			%xmm2	// 0x1.0p52
	psrlq		%xmm3,			%xmm0	// (big input) ? input >> 12 : input
	orpd		%xmm2,			%xmm1	// 0x1.0p52 | ((big input) ? input & 0xfff : 0)
	orpd		%xmm1,			%xmm0	// 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
	subsd		%xmm2,			%xmm0	// (double)((big input) ? (input >> 12 | input & 0xfff) : input)
	cvtsd2ss	%xmm0,			%xmm0	// (float)((big input) ? (input >> 12 | input & 0xfff) : input)
	pslld		$23,			%xmm3	// (big input) ? 12 << 23 : 0, aligned with the float exponent field
	paddd		%xmm3,			%xmm0	// (float)input
	movd		%xmm0,			4(%esp)	// spill through the argument slot ...
	flds		4(%esp)				// ... to return the result on the x87 stack
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__

NO_EXEC_STACK_DIRECTIVE
