/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 *	.seg	"data"
 *	.asciz	"Copyr 1987 Sun Micro"
 *	.align	4
 */
	.seg	"text"

#ident "%Z%%M% %I% %E% SMI"

! Copyright (c) 1987 by Sun Microsystems, Inc.


#include <sys/asm_linkage.h>

/*
 * .umul: procedure to perform a 32 by 32 unsigned integer multiply,
 * built from the SPARC multiply-step instruction (mulscc).
 *
 * Inputs:
 *	%o0	multiplier
 *	%o1	multiplicand
 *
 * Outputs:
 *	%o0	least significant 32 bits of the product
 *	%o1	most significant 32 bits of the product
 *	Z	set if the most significant 32 bits are zero (no overflow),
 *		clear if the product did not fit in 32 bits
 *
 * Clobbers:
 *	%o4, %o5, %y and the integer condition codes.
 *	This is a leaf routine: it returns with retl and does not touch
 *	the stack or the register window.
 *
 * Most unsigned integer multiplies involve small numbers, so it is
 * worthwhile to optimize for short multiplies at the expense of long
 * multiplies.  This code checks the size of the multiplier, and has
 * special cases for the following:
 *
 *	4 or fewer bit multipliers:	19 or 21 instruction cycles
 *	8 or fewer bit multipliers:	26 or 28 instruction cycles
 *	12 or fewer bit multipliers:	34 or 36 instruction cycles
 *	16 or fewer bit multipliers:	42 or 44 instruction cycles
 *
 * Long multipliers require 58 or 60 instruction cycles.
 *
 * How the multiply-step sequences work:
 *
 *	Each mulscc shifts the partial product in %o4 right by one bit
 *	(the bit shifted out moves into the top of %y) and then, if the
 *	low bit of %y was set, adds the multiplicand.  Because the shift
 *	happens before the add, an n-bit multiplier needs n steps with
 *	the multiplicand followed by one extra step with %g0 that only
 *	shifts.  After those n+1 steps %o4 holds product bits (n+31)..n
 *	and the top n bits of %y hold product bits (n-1)..0.
 *
 * This code indicates that overflow has occurred, by leaving the Z condition
 * code clear. The following call sequence would be used if you wish to
 * deal with overflow:
 *
 *		call	.umul
 *		nop		( or set up last parameter here )
 *		bnz	overflow_code	(or tnz to overflow handler)
 */

!	RTENTRY(.umul)
	.global .umul
.umul:
	wr	%o0, %y			! multiplier to Y register

	andncc	%o0, 0xf, %o4		! mask out lower 4 bits; if branch
					! taken, %o4, N and V have been cleared

	be	umul_4bit		! 4-bit multiplier
	sethi	%hi(0xffff0000), %o5	! mask for 16-bit case; have to
					! wait 3 instructions after wr
					! before %y has stabilized anyway
					! (delay slot: runs on both paths)

	!
	! In the three tests below the branch is annulled (be,a), so the
	! mulscc in the delay slot executes only when the branch is taken.
	! In that case the preceding andncc/andcc produced zero, so %o4 is
	! already a cleared partial product and N and V are clear, which is
	! the state the first multiply step requires.
	!
	andncc	%o0, 0xff, %o4
	be,a	umul_8bit		! 8-bit multiplier
	mulscc	%o4, %o1, %o4		! first iteration of 9

	andncc	%o0, 0xfff, %o4
	be,a	umul_12bit		! 12-bit multiplier
	mulscc	%o4, %o1, %o4		! first iteration of 13

	andcc	%o0, %o5, %o4		! any of the upper 16 bits set?
	be,a	umul_16bit		! 16-bit multiplier
	mulscc	%o4, %o1, %o4		! first iteration of 17

	andcc	%g0, %g0, %o4		! zero the partial product
					! and clear N and V conditions
	!
	! long multiply
	!
	mulscc	%o4, %o1, %o4		! first iteration of 33
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 8th iteration
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 16th iteration
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 24th iteration
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 32nd iteration
	mulscc	%o4, %g0, %o4		! last iteration only shifts
	!
	! For unsigned multiplies, a pure shift-and-add approach yields the
	! correct result.  Signed multiplies introduce complications.
	!
	! With 32-bit twos-complement numbers, -x can be represented as
	!
	!	((2 - (x/(2**32))) mod 2) * 2**32.
	!
	! To simplify the equations, the radix point can be moved to just
	! to the left of the sign bit.  So:
	!
	!	 x *  y	= (xy) mod 2
	!	-x *  y	= (2 - x) mod 2 * y = (2y - xy) mod 2
	!	 x * -y	= x * (2 - y) mod 2 = (2x - xy) mod 2
	!	-x * -y = (2 - x) * (2 - y) = (4 - 2x - 2y + xy) mod 2
	!
	! Because of the way the shift into the partial product is calculated
	! (N xor V), the extra term is automagically removed for negative
	! multiplicands, so no adjustment is necessary.
	!
	! But for unsigned multiplies, the high-order bit of the multiplicand
	! is incorrectly treated as a sign bit.  For unsigned multiplies where
	! the high-order bit of the multiplicand is one, the result is
	!
	!	xy - y * (2**32)
	!
	! we fix that here
	!
	tst	%o1			! multiplicand's high bit set?
	bge	1f			! no: high word needs no correction
	nop

	add	%o4, %o0, %o4		! add (2**32) * %o0; bits 63-32
					! of the product are in %o4
	!
	! The multiply hasn't overflowed if the high-order bits are 0
	!
	! if you are not interested in detecting overflow,
	! replace the following code with:
	!
	!	1:
	!		rd	%y, %o0
	!		retl
	!		mov	%o4, %o1
	!
1:
	rd	%y, %o0			! low 32 bits of the product
	retl				! leaf routine return
	addcc	%o4, %g0, %o1		! return high-order bits and set Z if
					! high order bits are 0
	!
	! 4-bit multiply
	!
	! Entered with %o4 == 0 and N, V clear (from the andncc at .umul).
	! All four multiply steps are performed here; the branch to this
	! label is not annulled and its delay slot holds the sethi.
	!
umul_4bit:
	mulscc	%o4, %o1, %o4		! first iteration of 5
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 4th iteration
	mulscc	%o4, %g0, %o4		! last iteration only shifts

	rd	%y, %o5			! low 4 product bits are in %y<31:28>
	!
	! The following code adds (2**32) * %o0 to the product if the
	! multiplicand had its high bit set (see 32-bit case for explanation)
	!
	! The sra sits in the delay slot of a non-annulled branch, so it
	! executes on both paths.  It overwrites %o1, which is safe because
	! the tst has already captured the multiplicand's sign in the
	! condition codes.  %o0 still holds the multiplier at the add.
	!
	tst	%o1
	bge	2f
	sra	%o4, 28, %o1		! right shift high bits by 28 bits

	add	%o1, %o0, %o1
	!
	! The multiply hasn't overflowed if high-order bits are 0
	!
	! if you are not interested in detecting overflow,
	! replace the following code with:
	!
	!	2:
	!		sll	%o4, 4, %o0
	!		srl	%o5, 28, %o5
	!		retl
	!		or	%o5, %o0, %o0
	!
2:
	sll	%o4, 4, %o0		! left shift middle bits by 4 bits
	srl	%o5, 28, %o5		! right shift low bits by 28 bits
	or	%o5, %o0, %o0		! merge for true product
	retl				! leaf routine return
	tst	%o1			! set Z if high order bits are 0
	!
	! 8-bit multiply
	!
	! The first multiply step was executed in the annulled delay slot
	! of the branch that came here, so only steps 2..8 remain.
	!
umul_8bit:
	mulscc	%o4, %o1, %o4		! second iteration of 9
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 8th iteration
	mulscc	%o4, %g0, %o4		! last iteration only shifts

	rd	%y, %o5			! low 8 product bits are in %y<31:24>
	!
	! The following code adds (2**32) * %o0 to the product if the
	! multiplicand had its high bit set (see 32-bit case for explanation)
	!
	! As in the 4-bit case, the delay-slot sra runs on both paths and
	! may overwrite %o1 because tst has already set the condition codes.
	!
	tst	%o1
	bge	3f
	sra	%o4, 24, %o1		! right shift high bits by 24 bits

	add	%o1, %o0, %o1
	!
	! The multiply hasn't overflowed if high-order bits are 0
	!
	! if you are not interested in detecting overflow,
	! replace the following code with:
	!
	!	3:
	!		sll	%o4, 8, %o0
	!		srl	%o5, 24, %o5
	!		retl
	!		or	%o5, %o0, %o0
	!
3:
	sll	%o4, 8, %o0		! left shift middle bits by 8 bits
	srl	%o5, 24, %o5		! right shift low bits by 24 bits
	or	%o5, %o0, %o0		! merge for true product
	retl				! leaf routine return
	tst	%o1			! set Z if high order bits are 0
	!
	! 12-bit multiply
	!
	! The first multiply step was executed in the annulled delay slot
	! of the branch that came here, so only steps 2..12 remain.
	!
umul_12bit:
	mulscc	%o4, %o1, %o4		! second iteration of 13
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 12th iteration
	mulscc	%o4, %g0, %o4		! last iteration only shifts

	rd	%y, %o5			! low 12 product bits are in %y<31:20>
	!
	! The following code adds (2**32) * %o0 to the product if the
	! multiplicand had its high bit set (see 32-bit case for explanation)
	!
	! As in the 4-bit case, the delay-slot sra runs on both paths and
	! may overwrite %o1 because tst has already set the condition codes.
	!
	tst	%o1
	bge	4f
	sra	%o4, 20, %o1		! right shift high bits by 20 bits

	add	%o1, %o0, %o1
	!
	! The multiply hasn't overflowed if high-order bits are 0
	!
	! if you are not interested in detecting overflow,
	! replace the following code with:
	!
	!	4:
	!		sll	%o4, 12, %o0
	!		srl	%o5, 20, %o5
	!		retl
	!		or	%o5, %o0, %o0
	!
4:
	sll	%o4, 12, %o0		! left shift middle bits by 12 bits
	srl	%o5, 20, %o5		! right shift low bits by 20 bits
	or	%o5, %o0, %o0		! merge for true product
	retl				! leaf routine return
	tst	%o1			! set Z if high order bits are 0
	!
	! 16-bit multiply
	!
	! The first multiply step was executed in the annulled delay slot
	! of the branch that came here, so only steps 2..16 remain.
	!
umul_16bit:
	mulscc	%o4, %o1, %o4		! second iteration of 17
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 16th iteration
	mulscc	%o4, %g0, %o4		! last iteration only shifts

	rd	%y, %o5			! low 16 product bits are in %y<31:16>
	!
	! The following code adds (2**32) * %o0 to the product if the
	! multiplicand had its high bit set (see 32-bit case for explanation)
	!
	! As in the 4-bit case, the delay-slot sra runs on both paths and
	! may overwrite %o1 because tst has already set the condition codes.
	!
	tst	%o1
	bge	5f
	sra	%o4, 16, %o1		! right shift high bits by 16 bits

	add	%o1, %o0, %o1
	!
	! The multiply hasn't overflowed if high-order bits are 0
	!
	! if you are not interested in detecting overflow,
	! replace the following code with:
	!
	!	5:
	!		sll	%o4, 16, %o0
	!		srl	%o5, 16, %o5
	!		retl
	!		or	%o5, %o0, %o0
	!
5:
	sll	%o4, 16, %o0		! left shift middle bits by 16 bits
	srl	%o5, 16, %o5		! right shift low bits by 16 bits
	or	%o5, %o0, %o0		! merge for true product
	retl				! leaf routine return
	tst	%o1			! set Z if high order bits are 0