/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divqs/__remqs: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divls/__remls: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - tmp1: candidate modulus (modulus - divisor)
 *	$4 - tmp2: candidate quotient (quotient + mask), divide builds only
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 * The nop/unop padding and the .align directives exist only to keep the
 * instruction groups lined up as annotated; they do not affect results.
 */

#include <asm/export.h>
#define halt .long 0

/*
 * Select function type and registers
 *
 * The same source is built several times.  With DIV defined the result
 * register $27 holds the quotient and the running modulus lives in $2;
 * without DIV the roles are swapped so that $27 holds the modulus.
 * STACK is the frame size in bytes: the divide build needs one extra
 * save slot (for tmp2) compared with the remainder build.
 * GETSIGN(x) leaves in x a value whose sign is the sign the signed
 * result must have: dividend XOR divisor for a quotient, the dividend
 * itself for a remainder.
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * LONGIFY zero-extends the low 32 bits (zapnot keeps bytes 0-3),
 * SLONGIFY sign-extends them (addl).  Both expand to nothing for the
 * 64-bit build.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide / remainder.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (DIV build) or remainder (otherwise)
 *	$28 is clobbered; $0-$3 (and $4 in the DIV build) are saved in
 *	the frame and restored before returning.
 *
 * Frame layout (offsets from $30 after the subq):
 *	 0: $1    8: $2   16: $0   24: tmp1 ($3)   32: tmp2 ($4, DIV only)
 *
 * A zero divisor skips both loops and goes straight to the epilogue,
 * so the DIV build returns 0 and the remainder build returns the
 * (zero-extended, for 32-bit) dividend.
 *
 * Local label 7 is a secondary entry point used by sfunction, which
 * has already allocated an identical STACK-byte frame when it branches
 * here.
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit case: shift divisor and mask left one bit at a time
	 * while the divisor is still below the modulus.  The blt stops
	 * the scan as soon as the top bit of the divisor is set, before
	 * another doubling could shift it out.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
	/*
	 * Each pass tests one bit position: when divisor <= modulus the
	 * divisor is subtracted from the modulus and (DIV build) the
	 * current mask bit is added to the quotient.  Divisor and mask
	 * are then shifted right; the loop ends when mask reaches zero.
	 */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/* Epilogue: restore the saved temporaries, pop the frame, return. */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 */
/*
 * sfunction: signed divide / remainder.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (DIV build) or remainder (otherwise)
 *	$28 is clobbered; $24, $25, $23 and tmp1 are restored from the
 *	frame on the slow path.
 *
 * Fast path: if the OR of both operands is non-negative (neither is
 * negative), branch to label 7 inside ufunction, reusing the frame
 * allocated here; ufunction's epilogue then pops it and returns to
 * the original caller.
 *
 * Slow path: save the operands and return address, replace both
 * operands by their absolute values, call ufunction, then negate the
 * result when GETSIGN reports a negative sign.
 *
 * Frame layout on the slow path (offsets from $30):
 *	 0: $24    8: $25   16: $23   24: tmp1 ($3)
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */	# E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */	# E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	/* Reload the original operands so GETSIGN sees their signs. */
	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
EXPORT_SYMBOL(sfunction)