1 //===-- udivmodti4.c - Implement __udivmodti4 -----------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements __udivmodti4 for the compiler_rt library. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "int_lib.h" 14 15 #ifdef CRT_HAS_128BIT 16 17 // Returns the 128 bit division result by 64 bit. Result must fit in 64 bits. 18 // Remainder stored in r. 19 // Taken and adjusted from libdivide libdivide_128_div_64_to_64 division 20 // fallback. For a correctness proof see the reference for this algorithm 21 // in Knuth, Volume 2, section 4.3.1, Algorithm D. 22 UNUSED 23 static inline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v, 24 du_int *r) { 25 const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT; 26 const du_int b = (1ULL << (n_udword_bits / 2)); // Number base (32 bits) 27 du_int un1, un0; // Norm. dividend LSD's 28 du_int vn1, vn0; // Norm. divisor digits 29 du_int q1, q0; // Quotient digits 30 du_int un64, un21, un10; // Dividend digit pairs 31 du_int rhat; // A remainder 32 si_int s; // Shift amount for normalization 33 34 s = __builtin_clzll(v); 35 if (s > 0) { 36 // Normalize the divisor. 37 v = v << s; 38 un64 = (u1 << s) | (u0 >> (n_udword_bits - s)); 39 un10 = u0 << s; // Shift dividend left 40 } else { 41 // Avoid undefined behavior of (u0 >> 64). 42 un64 = u1; 43 un10 = u0; 44 } 45 46 // Break divisor up into two 32-bit digits. 47 vn1 = v >> (n_udword_bits / 2); 48 vn0 = v & 0xFFFFFFFF; 49 50 // Break right half of dividend into two digits. 51 un1 = un10 >> (n_udword_bits / 2); 52 un0 = un10 & 0xFFFFFFFF; 53 54 // Compute the first quotient digit, q1. 55 q1 = un64 / vn1; 56 rhat = un64 - q1 * vn1; 57 58 // q1 has at most error 2. No more than 2 iterations. 59 while (q1 >= b || q1 * vn0 > b * rhat + un1) { 60 q1 = q1 - 1; 61 rhat = rhat + vn1; 62 if (rhat >= b) 63 break; 64 } 65 66 un21 = un64 * b + un1 - q1 * v; 67 68 // Compute the second quotient digit. 69 q0 = un21 / vn1; 70 rhat = un21 - q0 * vn1; 71 72 // q0 has at most error 2. No more than 2 iterations. 73 while (q0 >= b || q0 * vn0 > b * rhat + un0) { 74 q0 = q0 - 1; 75 rhat = rhat + vn1; 76 if (rhat >= b) 77 break; 78 } 79 80 *r = (un21 * b + un0 - q0 * v) >> s; 81 return q1 * b + q0; 82 } 83 84 static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v, 85 du_int *r) { 86 #if defined(__x86_64__) 87 du_int result; 88 __asm__("divq %[v]" 89 : "=a"(result), "=d"(*r) 90 : [ v ] "r"(v), "a"(u0), "d"(u1)); 91 return result; 92 #else 93 return udiv128by64to64default(u1, u0, v, r); 94 #endif 95 } 96 97 // Effects: if rem != 0, *rem = a % b 98 // Returns: a / b 99 100 COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem) { 101 const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT; 102 utwords dividend; 103 dividend.all = a; 104 utwords divisor; 105 divisor.all = b; 106 utwords quotient; 107 utwords remainder; 108 if (divisor.all > dividend.all) { 109 if (rem) 110 *rem = dividend.all; 111 return 0; 112 } 113 // When the divisor fits in 64 bits, we can use an optimized path. 114 if (divisor.s.high == 0) { 115 remainder.s.high = 0; 116 if (dividend.s.high < divisor.s.low) { 117 // The result fits in 64 bits. 118 quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, 119 divisor.s.low, &remainder.s.low); 120 quotient.s.high = 0; 121 } else { 122 // First, divide with the high part to get the remainder in dividend.s.high. 123 // After that dividend.s.high < divisor.s.low. 124 quotient.s.high = dividend.s.high / divisor.s.low; 125 dividend.s.high = dividend.s.high % divisor.s.low; 126 quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, 127 divisor.s.low, &remainder.s.low); 128 } 129 if (rem) 130 *rem = remainder.all; 131 return quotient.all; 132 } 133 // 0 <= shift <= 63. 134 si_int shift = 135 __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high); 136 divisor.all <<= shift; 137 quotient.s.high = 0; 138 quotient.s.low = 0; 139 for (; shift >= 0; --shift) { 140 quotient.s.low <<= 1; 141 // Branch free version of. 142 // if (dividend.all >= divisor.all) 143 // { 144 // dividend.all -= divisor.all; 145 // carry = 1; 146 // } 147 const ti_int s = 148 (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1); 149 quotient.s.low |= s & 1; 150 dividend.all -= divisor.all & s; 151 divisor.all >>= 1; 152 } 153 if (rem) 154 *rem = dividend.all; 155 return quotient.all; 156 } 157 158 #endif // CRT_HAS_128BIT 159