//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
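  // For example (roughly): once ISD::LOAD of MVT::f32 is promoted to MVT::i32
  // below, an (f32 (load %p)) is legalized into (bitcast:f32 (i32 (load %p))),
  // so only the integer load patterns have to be written in the .td files.
  // (%p is just a placeholder operand for illustration.)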
64 setOperationAction(ISD::LOAD, MVT::f32, Promote); 65 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); 66 67 setOperationAction(ISD::LOAD, MVT::v2f32, Promote); 68 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); 69 70 setOperationAction(ISD::LOAD, MVT::v3f32, Promote); 71 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32); 72 73 setOperationAction(ISD::LOAD, MVT::v4f32, Promote); 74 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); 75 76 setOperationAction(ISD::LOAD, MVT::v5f32, Promote); 77 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32); 78 79 setOperationAction(ISD::LOAD, MVT::v6f32, Promote); 80 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32); 81 82 setOperationAction(ISD::LOAD, MVT::v7f32, Promote); 83 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32); 84 85 setOperationAction(ISD::LOAD, MVT::v8f32, Promote); 86 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); 87 88 setOperationAction(ISD::LOAD, MVT::v9f32, Promote); 89 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32); 90 91 setOperationAction(ISD::LOAD, MVT::v10f32, Promote); 92 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32); 93 94 setOperationAction(ISD::LOAD, MVT::v11f32, Promote); 95 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32); 96 97 setOperationAction(ISD::LOAD, MVT::v12f32, Promote); 98 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32); 99 100 setOperationAction(ISD::LOAD, MVT::v16f32, Promote); 101 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); 102 103 setOperationAction(ISD::LOAD, MVT::v32f32, Promote); 104 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32); 105 106 setOperationAction(ISD::LOAD, MVT::i64, Promote); 107 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); 108 109 setOperationAction(ISD::LOAD, MVT::v2i64, Promote); 110 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); 111 112 setOperationAction(ISD::LOAD, MVT::f64, Promote); 113 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32); 114 115 setOperationAction(ISD::LOAD, MVT::v2f64, Promote); 116 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32); 117 118 setOperationAction(ISD::LOAD, MVT::v3i64, Promote); 119 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32); 120 121 setOperationAction(ISD::LOAD, MVT::v4i64, Promote); 122 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32); 123 124 setOperationAction(ISD::LOAD, MVT::v3f64, Promote); 125 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32); 126 127 setOperationAction(ISD::LOAD, MVT::v4f64, Promote); 128 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32); 129 130 setOperationAction(ISD::LOAD, MVT::v8i64, Promote); 131 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32); 132 133 setOperationAction(ISD::LOAD, MVT::v8f64, Promote); 134 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32); 135 136 setOperationAction(ISD::LOAD, MVT::v16i64, Promote); 137 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32); 138 139 setOperationAction(ISD::LOAD, MVT::v16f64, Promote); 140 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32); 141 142 setOperationAction(ISD::LOAD, MVT::i128, Promote); 143 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32); 144 145 // There are no 64-bit extloads. These should be done as a 32-bit extload and 146 // an extension to 64-bit. 
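  // In effect, an (i64 (zextload [i32] %p)) marked Expand below is legalized
  // into (zero_extend:i64 (i32 (load %p))); sextload/extload get the
  // corresponding extension. (Illustrative DAG, not exact dump syntax.)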
147 for (MVT VT : MVT::integer_valuetypes()) 148 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT, 149 Expand); 150 151 for (MVT VT : MVT::integer_valuetypes()) { 152 if (VT == MVT::i64) 153 continue; 154 155 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) { 156 setLoadExtAction(Op, VT, MVT::i1, Promote); 157 setLoadExtAction(Op, VT, MVT::i8, Legal); 158 setLoadExtAction(Op, VT, MVT::i16, Legal); 159 setLoadExtAction(Op, VT, MVT::i32, Expand); 160 } 161 } 162 163 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) 164 for (auto MemVT : 165 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16}) 166 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT, 167 Expand); 168 169 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); 170 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); 171 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); 172 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand); 173 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand); 174 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand); 175 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); 176 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand); 177 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); 178 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand); 179 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand); 180 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand); 181 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand); 182 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand); 183 184 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); 185 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); 186 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand); 187 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); 188 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand); 189 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand); 190 191 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); 192 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); 193 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); 194 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand); 195 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand); 196 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand); 197 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); 198 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand); 199 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); 200 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand); 201 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand); 202 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand); 203 204 setOperationAction(ISD::STORE, MVT::f32, Promote); 205 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); 206 207 setOperationAction(ISD::STORE, MVT::v2f32, Promote); 208 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); 209 210 setOperationAction(ISD::STORE, MVT::v3f32, Promote); 211 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32); 212 213 setOperationAction(ISD::STORE, MVT::v4f32, Promote); 214 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); 215 216 setOperationAction(ISD::STORE, MVT::v5f32, Promote); 217 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32); 218 219 
setOperationAction(ISD::STORE, MVT::v6f32, Promote); 220 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32); 221 222 setOperationAction(ISD::STORE, MVT::v7f32, Promote); 223 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32); 224 225 setOperationAction(ISD::STORE, MVT::v8f32, Promote); 226 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); 227 228 setOperationAction(ISD::STORE, MVT::v9f32, Promote); 229 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32); 230 231 setOperationAction(ISD::STORE, MVT::v10f32, Promote); 232 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32); 233 234 setOperationAction(ISD::STORE, MVT::v11f32, Promote); 235 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32); 236 237 setOperationAction(ISD::STORE, MVT::v12f32, Promote); 238 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32); 239 240 setOperationAction(ISD::STORE, MVT::v16f32, Promote); 241 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); 242 243 setOperationAction(ISD::STORE, MVT::v32f32, Promote); 244 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32); 245 246 setOperationAction(ISD::STORE, MVT::i64, Promote); 247 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); 248 249 setOperationAction(ISD::STORE, MVT::v2i64, Promote); 250 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); 251 252 setOperationAction(ISD::STORE, MVT::f64, Promote); 253 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32); 254 255 setOperationAction(ISD::STORE, MVT::v2f64, Promote); 256 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); 257 258 setOperationAction(ISD::STORE, MVT::v3i64, Promote); 259 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32); 260 261 setOperationAction(ISD::STORE, MVT::v3f64, Promote); 262 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32); 263 264 setOperationAction(ISD::STORE, MVT::v4i64, Promote); 265 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32); 266 267 setOperationAction(ISD::STORE, MVT::v4f64, Promote); 268 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32); 269 270 setOperationAction(ISD::STORE, MVT::v8i64, Promote); 271 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32); 272 273 setOperationAction(ISD::STORE, MVT::v8f64, Promote); 274 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32); 275 276 setOperationAction(ISD::STORE, MVT::v16i64, Promote); 277 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32); 278 279 setOperationAction(ISD::STORE, MVT::v16f64, Promote); 280 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32); 281 282 setOperationAction(ISD::STORE, MVT::i128, Promote); 283 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32); 284 285 setTruncStoreAction(MVT::i64, MVT::i1, Expand); 286 setTruncStoreAction(MVT::i64, MVT::i8, Expand); 287 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 288 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 289 290 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); 291 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand); 292 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand); 293 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); 294 295 setTruncStoreAction(MVT::f32, MVT::bf16, Expand); 296 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 297 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); 298 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand); 299 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); 300 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); 301 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand); 302 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand); 303 304 
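  // In effect, a truncating FP store such as (truncstore:f16 f32:%x, %p) that
  // is marked Expand is emitted as a conversion plus a plain store, i.e.
  // (store (fp_round %x), %p); the f64 cases below follow the same pattern.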
  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported; just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so they reach
  // the scalarization code. Can be removed when IS_FPCLASS expand isn't called
  // by default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
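  // i.e. (fsub f64:%a, f64:%b) becomes (fadd %a, (fneg %b)), where the fneg is
  // expected to later fold into a source modifier on the add.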
381 setOperationAction(ISD::FSUB, MVT::f64, Expand); 382 383 setOperationAction(ISD::CONCAT_VECTORS, 384 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32, 385 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, 386 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32, 387 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32, 388 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32}, 389 Custom); 390 391 // FIXME: Why is v8f16/v8bf16 missing? 392 setOperationAction( 393 ISD::EXTRACT_SUBVECTOR, 394 {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16, 395 MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, 396 MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, 397 MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, 398 MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, 399 MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16, 400 MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, 401 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, 402 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64, 403 MVT::v32i16, MVT::v32f16, MVT::v32bf16}, 404 Custom); 405 406 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 407 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom); 408 409 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 410 for (MVT VT : ScalarIntVTs) { 411 // These should use [SU]DIVREM, so set them to expand 412 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT, 413 Expand); 414 415 // GPU does not have divrem function for signed or unsigned. 416 setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom); 417 418 // GPU does not have [S|U]MUL_LOHI functions as a single instruction. 419 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand); 420 421 setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand); 422 423 // AMDGPU uses ADDC/SUBC/ADDE/SUBE 424 setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal); 425 } 426 427 // The hardware supports 32-bit FSHR, but not FSHL. 428 setOperationAction(ISD::FSHR, MVT::i32, Legal); 429 430 // The hardware supports 32-bit ROTR, but not ROTL. 431 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand); 432 setOperationAction(ISD::ROTR, MVT::i64, Expand); 433 434 setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand); 435 436 setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand); 437 setOperationAction( 438 {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, 439 MVT::i64, Custom); 440 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 441 442 setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, 443 Legal); 444 445 setOperationAction( 446 {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, 447 MVT::i64, Custom); 448 449 for (auto VT : {MVT::i8, MVT::i16}) 450 setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom); 451 452 static const MVT::SimpleValueType VectorIntTypes[] = { 453 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32, 454 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32}; 455 456 for (MVT VT : VectorIntTypes) { 457 // Expand the following operations for the current type by default. 
458 setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT, 459 ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU, 460 ISD::MULHS, ISD::OR, ISD::SHL, 461 ISD::SRA, ISD::SRL, ISD::ROTL, 462 ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP, 463 ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV, 464 ISD::SREM, ISD::UREM, ISD::SMUL_LOHI, 465 ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM, 466 ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC, 467 ISD::XOR, ISD::BSWAP, ISD::CTPOP, 468 ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE, 469 ISD::SETCC}, 470 VT, Expand); 471 } 472 473 static const MVT::SimpleValueType FloatVectorTypes[] = { 474 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, 475 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32}; 476 477 for (MVT VT : FloatVectorTypes) { 478 setOperationAction( 479 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, 480 ISD::FADD, ISD::FCEIL, ISD::FCOS, 481 ISD::FDIV, ISD::FEXP2, ISD::FEXP, 482 ISD::FEXP10, ISD::FLOG2, ISD::FREM, 483 ISD::FLOG, ISD::FLOG10, ISD::FPOW, 484 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL, 485 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT, 486 ISD::FSQRT, ISD::FSIN, ISD::FSUB, 487 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC, 488 ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC, 489 ISD::FCANONICALIZE, ISD::FROUNDEVEN}, 490 VT, Expand); 491 } 492 493 // This causes using an unrolled select operation rather than expansion with 494 // bit operations. This is in general better, but the alternative using BFI 495 // instructions may be better if the select sources are SGPRs. 496 setOperationAction(ISD::SELECT, MVT::v2f32, Promote); 497 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32); 498 499 setOperationAction(ISD::SELECT, MVT::v3f32, Promote); 500 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32); 501 502 setOperationAction(ISD::SELECT, MVT::v4f32, Promote); 503 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); 504 505 setOperationAction(ISD::SELECT, MVT::v5f32, Promote); 506 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32); 507 508 setOperationAction(ISD::SELECT, MVT::v6f32, Promote); 509 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32); 510 511 setOperationAction(ISD::SELECT, MVT::v7f32, Promote); 512 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32); 513 514 setOperationAction(ISD::SELECT, MVT::v9f32, Promote); 515 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32); 516 517 setOperationAction(ISD::SELECT, MVT::v10f32, Promote); 518 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32); 519 520 setOperationAction(ISD::SELECT, MVT::v11f32, Promote); 521 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32); 522 523 setOperationAction(ISD::SELECT, MVT::v12f32, Promote); 524 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32); 525 526 // Disable most libcalls. 527 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) { 528 if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16) 529 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr); 530 } 531 532 setSchedulingPreference(Sched::RegPressure); 533 setJumpIsExpensive(true); 534 535 // FIXME: This is only partially true. If we have to do vector compares, any 536 // SGPR pair can be a condition register. If we have a uniform condition, we 537 // are better off doing SALU operations, where there is only one SCC. For now, 538 // we don't have a way of knowing during instruction selection if a condition 539 // will be uniform and we always use vector compares. Assume we are using 540 // vector compares until that is fixed. 
541 setHasMultipleConditionRegisters(true); 542 543 setMinCmpXchgSizeInBits(32); 544 setSupportsUnalignedAtomics(false); 545 546 PredictableSelectIsExpensive = false; 547 548 // We want to find all load dependencies for long chains of stores to enable 549 // merging into very wide vectors. The problem is with vectors with > 4 550 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 551 // vectors are a legal type, even though we have to split the loads 552 // usually. When we can more precisely specify load legality per address 553 // space, we should be able to make FindBetterChain/MergeConsecutiveStores 554 // smarter so that they can figure out what to do in 2 iterations without all 555 // N > 4 stores on the same chain. 556 GatherAllAliasesMaxDepth = 16; 557 558 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry 559 // about these during lowering. 560 MaxStoresPerMemcpy = 0xffffffff; 561 MaxStoresPerMemmove = 0xffffffff; 562 MaxStoresPerMemset = 0xffffffff; 563 564 // The expansion for 64-bit division is enormous. 565 if (AMDGPUBypassSlowDiv) 566 addBypassSlowDiv(64, 32); 567 568 setTargetDAGCombine({ISD::BITCAST, ISD::SHL, 569 ISD::SRA, ISD::SRL, 570 ISD::TRUNCATE, ISD::MUL, 571 ISD::SMUL_LOHI, ISD::UMUL_LOHI, 572 ISD::MULHU, ISD::MULHS, 573 ISD::SELECT, ISD::SELECT_CC, 574 ISD::STORE, ISD::FADD, 575 ISD::FSUB, ISD::FNEG, 576 ISD::FABS, ISD::AssertZext, 577 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN}); 578 579 setMaxAtomicSizeInBitsSupported(64); 580 setMaxDivRemBitWidthSupported(64); 581 } 582 583 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const { 584 if (getTargetMachine().Options.NoSignedZerosFPMath) 585 return true; 586 587 const auto Flags = Op.getNode()->getFlags(); 588 if (Flags.hasNoSignedZeros()) 589 return true; 590 591 return false; 592 } 593 594 //===----------------------------------------------------------------------===// 595 // Target Information 596 //===----------------------------------------------------------------------===// 597 598 LLVM_READNONE 599 static bool fnegFoldsIntoOpcode(unsigned Opc) { 600 switch (Opc) { 601 case ISD::FADD: 602 case ISD::FSUB: 603 case ISD::FMUL: 604 case ISD::FMA: 605 case ISD::FMAD: 606 case ISD::FMINNUM: 607 case ISD::FMAXNUM: 608 case ISD::FMINNUM_IEEE: 609 case ISD::FMAXNUM_IEEE: 610 case ISD::FMINIMUM: 611 case ISD::FMAXIMUM: 612 case ISD::SELECT: 613 case ISD::FSIN: 614 case ISD::FTRUNC: 615 case ISD::FRINT: 616 case ISD::FNEARBYINT: 617 case ISD::FROUNDEVEN: 618 case ISD::FCANONICALIZE: 619 case AMDGPUISD::RCP: 620 case AMDGPUISD::RCP_LEGACY: 621 case AMDGPUISD::RCP_IFLAG: 622 case AMDGPUISD::SIN_HW: 623 case AMDGPUISD::FMUL_LEGACY: 624 case AMDGPUISD::FMIN_LEGACY: 625 case AMDGPUISD::FMAX_LEGACY: 626 case AMDGPUISD::FMED3: 627 // TODO: handle llvm.amdgcn.fma.legacy 628 return true; 629 case ISD::BITCAST: 630 llvm_unreachable("bitcast is special cased"); 631 default: 632 return false; 633 } 634 } 635 636 static bool fnegFoldsIntoOp(const SDNode *N) { 637 unsigned Opc = N->getOpcode(); 638 if (Opc == ISD::BITCAST) { 639 // TODO: Is there a benefit to checking the conditions performFNegCombine 640 // does? We don't for the other cases. 
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the operand type of ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and each one would have to switch to the VOP3 encoding to
  // accept the modifier, there will be a code size increase. Try to avoid
  // increasing code size unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
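  // For example, an i1/i8/i16 extended return value becomes i32 here, and an
  // i48 one becomes i64.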
741 unsigned Size = VT.getSizeInBits(); 742 if (Size <= 32) 743 return MVT::i32; 744 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32)); 745 } 746 747 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { 748 return MVT::i32; 749 } 750 751 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { 752 return true; 753 } 754 755 // The backend supports 32 and 64 bit floating point immediates. 756 // FIXME: Why are we reporting vectors of FP immediates as legal? 757 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, 758 bool ForCodeSize) const { 759 EVT ScalarVT = VT.getScalarType(); 760 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 || 761 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts())); 762 } 763 764 // We don't want to shrink f64 / f32 constants. 765 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { 766 EVT ScalarVT = VT.getScalarType(); 767 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); 768 } 769 770 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, 771 ISD::LoadExtType ExtTy, 772 EVT NewVT) const { 773 // TODO: This may be worth removing. Check regression tests for diffs. 774 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT)) 775 return false; 776 777 unsigned NewSize = NewVT.getStoreSizeInBits(); 778 779 // If we are reducing to a 32-bit load or a smaller multi-dword load, 780 // this is always better. 781 if (NewSize >= 32) 782 return true; 783 784 EVT OldVT = N->getValueType(0); 785 unsigned OldSize = OldVT.getStoreSizeInBits(); 786 787 MemSDNode *MN = cast<MemSDNode>(N); 788 unsigned AS = MN->getAddressSpace(); 789 // Do not shrink an aligned scalar load to sub-dword. 790 // Scalar engine cannot do sub-dword loads. 791 // TODO: Update this for GFX12 which does have scalar sub-dword loads. 792 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) && 793 (AS == AMDGPUAS::CONSTANT_ADDRESS || 794 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 795 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS && 796 MN->isInvariant())) && 797 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand())) 798 return false; 799 800 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar 801 // extloads, so doing one requires using a buffer_load. In cases where we 802 // still couldn't use a scalar load, using the wider load shouldn't really 803 // hurt anything. 804 805 // If the old size already had to be an extload, there's no harm in continuing 806 // to reduce the width. 807 return (OldSize < 32); 808 } 809 810 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy, 811 const SelectionDAG &DAG, 812 const MachineMemOperand &MMO) const { 813 814 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits()); 815 816 if (LoadTy.getScalarType() == MVT::i32) 817 return false; 818 819 unsigned LScalarSize = LoadTy.getScalarSizeInBits(); 820 unsigned CastScalarSize = CastTy.getScalarSizeInBits(); 821 822 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32)) 823 return false; 824 825 unsigned Fast = 0; 826 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 827 CastTy, MMO, &Fast) && 828 Fast; 829 } 830 831 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also 832 // profitable with the expansion for 64-bit since it's generally good to 833 // speculate things. 
834 bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { 835 return true; 836 } 837 838 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { 839 return true; 840 } 841 842 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const { 843 switch (N->getOpcode()) { 844 case ISD::EntryToken: 845 case ISD::TokenFactor: 846 return true; 847 case ISD::INTRINSIC_WO_CHAIN: { 848 unsigned IntrID = N->getConstantOperandVal(0); 849 switch (IntrID) { 850 case Intrinsic::amdgcn_readfirstlane: 851 case Intrinsic::amdgcn_readlane: 852 return true; 853 } 854 return false; 855 } 856 case ISD::LOAD: 857 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() == 858 AMDGPUAS::CONSTANT_ADDRESS_32BIT) 859 return true; 860 return false; 861 case AMDGPUISD::SETCC: // ballot-style instruction 862 return true; 863 } 864 return false; 865 } 866 867 SDValue AMDGPUTargetLowering::getNegatedExpression( 868 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, 869 NegatibleCost &Cost, unsigned Depth) const { 870 871 switch (Op.getOpcode()) { 872 case ISD::FMA: 873 case ISD::FMAD: { 874 // Negating a fma is not free if it has users without source mods. 875 if (!allUsesHaveSourceMods(Op.getNode())) 876 return SDValue(); 877 break; 878 } 879 case AMDGPUISD::RCP: { 880 SDValue Src = Op.getOperand(0); 881 EVT VT = Op.getValueType(); 882 SDLoc SL(Op); 883 884 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations, 885 ForCodeSize, Cost, Depth + 1); 886 if (NegSrc) 887 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags()); 888 return SDValue(); 889 } 890 default: 891 break; 892 } 893 894 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, 895 ForCodeSize, Cost, Depth); 896 } 897 898 //===---------------------------------------------------------------------===// 899 // Target Properties 900 //===---------------------------------------------------------------------===// 901 902 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { 903 assert(VT.isFloatingPoint()); 904 905 // Packed operations do not have a fabs modifier. 906 return VT == MVT::f32 || VT == MVT::f64 || 907 (Subtarget->has16BitInsts() && VT == MVT::f16); 908 } 909 910 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { 911 assert(VT.isFloatingPoint()); 912 // Report this based on the end legalized type. 913 VT = VT.getScalarType(); 914 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16; 915 } 916 917 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, 918 unsigned NumElem, 919 unsigned AS) const { 920 return true; 921 } 922 923 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const { 924 // There are few operations which truly have vector input operands. Any vector 925 // operation is going to involve operations on each component, and a 926 // build_vector will be a copy per element, so it always makes sense to use a 927 // build_vector input in place of the extracted element to avoid a copy into a 928 // super register. 929 // 930 // We should probably only do this if all users are extracts only, but this 931 // should be the common case. 932 return true; 933 } 934 935 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { 936 // Truncate is just accessing a subregister. 

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra mov 0 needed to form the 64-bit value is
  // free. As used, this will enable reducing 64-bit operations to 32-bit,
  // which is always good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits
  // is not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1006 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) { 1007 if (LHS.getOpcode() != ISD::SHL) 1008 return false; 1009 auto *RHSLd = dyn_cast<LoadSDNode>(RHS); 1010 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0)); 1011 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 1012 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD && 1013 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() && 1014 RHSLd->getExtensionType() == ISD::ZEXTLOAD; 1015 }; 1016 SDValue LHS = N->getOperand(0).getOperand(0); 1017 SDValue RHS = N->getOperand(0).getOperand(1); 1018 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS)); 1019 } 1020 1021 //===---------------------------------------------------------------------===// 1022 // TargetLowering Callbacks 1023 //===---------------------------------------------------------------------===// 1024 1025 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, 1026 bool IsVarArg) { 1027 switch (CC) { 1028 case CallingConv::AMDGPU_VS: 1029 case CallingConv::AMDGPU_GS: 1030 case CallingConv::AMDGPU_PS: 1031 case CallingConv::AMDGPU_CS: 1032 case CallingConv::AMDGPU_HS: 1033 case CallingConv::AMDGPU_ES: 1034 case CallingConv::AMDGPU_LS: 1035 return CC_AMDGPU; 1036 case CallingConv::AMDGPU_CS_Chain: 1037 case CallingConv::AMDGPU_CS_ChainPreserve: 1038 return CC_AMDGPU_CS_CHAIN; 1039 case CallingConv::C: 1040 case CallingConv::Fast: 1041 case CallingConv::Cold: 1042 return CC_AMDGPU_Func; 1043 case CallingConv::AMDGPU_Gfx: 1044 return CC_SI_Gfx; 1045 case CallingConv::AMDGPU_KERNEL: 1046 case CallingConv::SPIR_KERNEL: 1047 default: 1048 report_fatal_error("Unsupported calling convention for call"); 1049 } 1050 } 1051 1052 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, 1053 bool IsVarArg) { 1054 switch (CC) { 1055 case CallingConv::AMDGPU_KERNEL: 1056 case CallingConv::SPIR_KERNEL: 1057 llvm_unreachable("kernels should not be handled here"); 1058 case CallingConv::AMDGPU_VS: 1059 case CallingConv::AMDGPU_GS: 1060 case CallingConv::AMDGPU_PS: 1061 case CallingConv::AMDGPU_CS: 1062 case CallingConv::AMDGPU_CS_Chain: 1063 case CallingConv::AMDGPU_CS_ChainPreserve: 1064 case CallingConv::AMDGPU_HS: 1065 case CallingConv::AMDGPU_ES: 1066 case CallingConv::AMDGPU_LS: 1067 return RetCC_SI_Shader; 1068 case CallingConv::AMDGPU_Gfx: 1069 return RetCC_SI_Gfx; 1070 case CallingConv::C: 1071 case CallingConv::Fast: 1072 case CallingConv::Cold: 1073 return RetCC_AMDGPU_Func; 1074 default: 1075 report_fatal_error("Unsupported calling convention."); 1076 } 1077 } 1078 1079 /// The SelectionDAGBuilder will automatically promote function arguments 1080 /// with illegal types. However, this does not work for the AMDGPU targets 1081 /// since the function arguments are stored in memory as these illegal types. 1082 /// In order to handle this properly we need to get the original types sizes 1083 /// from the LLVM IR Function and fixup the ISD:InputArg values before 1084 /// passing them to AnalyzeFormalArguments() 1085 1086 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting 1087 /// input values across multiple registers. Each item in the Ins array 1088 /// represents a single value that will be stored in registers. Ins[x].VT is 1089 /// the value type of the value that will be stored in the register, so 1090 /// whatever SDNode we lower the argument to needs to be this type. 
1091 /// 1092 /// In order to correctly lower the arguments we need to know the size of each 1093 /// argument. Since Ins[x].VT gives us the size of the register that will 1094 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type 1095 /// for the original function argument so that we can deduce the correct memory 1096 /// type to use for Ins[x]. In most cases the correct memory type will be 1097 /// Ins[x].ArgVT. However, this will not always be the case. If, for example, 1098 /// we have a kernel argument of type v8i8, this argument will be split into 1099 /// 8 parts and each part will be represented by its own item in the Ins array. 1100 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of 1101 /// the argument before it was split. From this, we deduce that the memory type 1102 /// for each individual part is i8. We pass the memory type as LocVT to the 1103 /// calling convention analysis function and the register type (Ins[x].VT) as 1104 /// the ValVT. 1105 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( 1106 CCState &State, 1107 const SmallVectorImpl<ISD::InputArg> &Ins) const { 1108 const MachineFunction &MF = State.getMachineFunction(); 1109 const Function &Fn = MF.getFunction(); 1110 LLVMContext &Ctx = Fn.getParent()->getContext(); 1111 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); 1112 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(); 1113 CallingConv::ID CC = Fn.getCallingConv(); 1114 1115 Align MaxAlign = Align(1); 1116 uint64_t ExplicitArgOffset = 0; 1117 const DataLayout &DL = Fn.getParent()->getDataLayout(); 1118 1119 unsigned InIndex = 0; 1120 1121 for (const Argument &Arg : Fn.args()) { 1122 const bool IsByRef = Arg.hasByRefAttr(); 1123 Type *BaseArgTy = Arg.getType(); 1124 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy; 1125 Align Alignment = DL.getValueOrABITypeAlignment( 1126 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy); 1127 MaxAlign = std::max(Alignment, MaxAlign); 1128 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy); 1129 1130 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset; 1131 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize; 1132 1133 // We're basically throwing away everything passed into us and starting over 1134 // to get accurate in-memory offsets. The "PartOffset" is completely useless 1135 // to us as computed in Ins. 1136 // 1137 // We also need to figure out what type legalization is trying to do to get 1138 // the correct memory offsets. 1139 1140 SmallVector<EVT, 16> ValueVTs; 1141 SmallVector<uint64_t, 16> Offsets; 1142 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); 1143 1144 for (unsigned Value = 0, NumValues = ValueVTs.size(); 1145 Value != NumValues; ++Value) { 1146 uint64_t BasePartOffset = Offsets[Value]; 1147 1148 EVT ArgVT = ValueVTs[Value]; 1149 EVT MemVT = ArgVT; 1150 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT); 1151 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT); 1152 1153 if (NumRegs == 1) { 1154 // This argument is not split, so the IR type is the memory type. 1155 if (ArgVT.isExtended()) { 1156 // We have an extended type, like i24, so we should just use the 1157 // register type. 
1158 MemVT = RegisterVT; 1159 } else { 1160 MemVT = ArgVT; 1161 } 1162 } else if (ArgVT.isVector() && RegisterVT.isVector() && 1163 ArgVT.getScalarType() == RegisterVT.getScalarType()) { 1164 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements()); 1165 // We have a vector value which has been split into a vector with 1166 // the same scalar type, but fewer elements. This should handle 1167 // all the floating-point vector types. 1168 MemVT = RegisterVT; 1169 } else if (ArgVT.isVector() && 1170 ArgVT.getVectorNumElements() == NumRegs) { 1171 // This arg has been split so that each element is stored in a separate 1172 // register. 1173 MemVT = ArgVT.getScalarType(); 1174 } else if (ArgVT.isExtended()) { 1175 // We have an extended type, like i65. 1176 MemVT = RegisterVT; 1177 } else { 1178 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs; 1179 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0); 1180 if (RegisterVT.isInteger()) { 1181 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); 1182 } else if (RegisterVT.isVector()) { 1183 assert(!RegisterVT.getScalarType().isFloatingPoint()); 1184 unsigned NumElements = RegisterVT.getVectorNumElements(); 1185 assert(MemoryBits % NumElements == 0); 1186 // This vector type has been split into another vector type with 1187 // a different elements size. 1188 EVT ScalarVT = EVT::getIntegerVT(State.getContext(), 1189 MemoryBits / NumElements); 1190 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); 1191 } else { 1192 llvm_unreachable("cannot deduce memory type."); 1193 } 1194 } 1195 1196 // Convert one element vectors to scalar. 1197 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) 1198 MemVT = MemVT.getScalarType(); 1199 1200 // Round up vec3/vec5 argument. 1201 if (MemVT.isVector() && !MemVT.isPow2VectorType()) { 1202 assert(MemVT.getVectorNumElements() == 3 || 1203 MemVT.getVectorNumElements() == 5 || 1204 (MemVT.getVectorNumElements() >= 9 && 1205 MemVT.getVectorNumElements() <= 12)); 1206 MemVT = MemVT.getPow2VectorType(State.getContext()); 1207 } else if (!MemVT.isSimple() && !MemVT.isVector()) { 1208 MemVT = MemVT.getRoundIntegerType(State.getContext()); 1209 } 1210 1211 unsigned PartOffset = 0; 1212 for (unsigned i = 0; i != NumRegs; ++i) { 1213 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT, 1214 BasePartOffset + PartOffset, 1215 MemVT.getSimpleVT(), 1216 CCValAssign::Full)); 1217 PartOffset += MemVT.getStoreSize(); 1218 } 1219 } 1220 } 1221 } 1222 1223 SDValue AMDGPUTargetLowering::LowerReturn( 1224 SDValue Chain, CallingConv::ID CallConv, 1225 bool isVarArg, 1226 const SmallVectorImpl<ISD::OutputArg> &Outs, 1227 const SmallVectorImpl<SDValue> &OutVals, 1228 const SDLoc &DL, SelectionDAG &DAG) const { 1229 // FIXME: Fails for r600 tests 1230 //assert(!isVarArg && Outs.empty() && OutVals.empty() && 1231 // "wave terminate should not have return values"); 1232 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain); 1233 } 1234 1235 //===---------------------------------------------------------------------===// 1236 // Target specific lowering 1237 //===---------------------------------------------------------------------===// 1238 1239 /// Selects the correct CCAssignFn for a given CallingConvention value. 
1240 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1241 bool IsVarArg) { 1242 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg); 1243 } 1244 1245 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1246 bool IsVarArg) { 1247 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); 1248 } 1249 1250 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain, 1251 SelectionDAG &DAG, 1252 MachineFrameInfo &MFI, 1253 int ClobberedFI) const { 1254 SmallVector<SDValue, 8> ArgChains; 1255 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); 1256 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; 1257 1258 // Include the original chain at the beginning of the list. When this is 1259 // used by target LowerCall hooks, this helps legalize find the 1260 // CALLSEQ_BEGIN node. 1261 ArgChains.push_back(Chain); 1262 1263 // Add a chain value for each stack argument corresponding 1264 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) { 1265 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) { 1266 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) { 1267 if (FI->getIndex() < 0) { 1268 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); 1269 int64_t InLastByte = InFirstByte; 1270 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; 1271 1272 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 1273 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 1274 ArgChains.push_back(SDValue(L, 1)); 1275 } 1276 } 1277 } 1278 } 1279 1280 // Build a tokenfactor for all the chains. 1281 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); 1282 } 1283 1284 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, 1285 SmallVectorImpl<SDValue> &InVals, 1286 StringRef Reason) const { 1287 SDValue Callee = CLI.Callee; 1288 SelectionDAG &DAG = CLI.DAG; 1289 1290 const Function &Fn = DAG.getMachineFunction().getFunction(); 1291 1292 StringRef FuncName("<unknown>"); 1293 1294 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) 1295 FuncName = G->getSymbol(); 1296 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 1297 FuncName = G->getGlobal()->getName(); 1298 1299 DiagnosticInfoUnsupported NoCalls( 1300 Fn, Reason + FuncName, CLI.DL.getDebugLoc()); 1301 DAG.getContext()->diagnose(NoCalls); 1302 1303 if (!CLI.IsTailCall) { 1304 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) 1305 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); 1306 } 1307 1308 return DAG.getEntryNode(); 1309 } 1310 1311 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, 1312 SmallVectorImpl<SDValue> &InVals) const { 1313 return lowerUnhandledCall(CLI, InVals, "unsupported call to function "); 1314 } 1315 1316 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 1317 SelectionDAG &DAG) const { 1318 const Function &Fn = DAG.getMachineFunction().getFunction(); 1319 1320 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca", 1321 SDLoc(Op).getDebugLoc()); 1322 DAG.getContext()->diagnose(NoDynamicAlloca); 1323 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)}; 1324 return DAG.getMergeValues(Ops, SDLoc()); 1325 } 1326 1327 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, 1328 SelectionDAG &DAG) const { 1329 switch (Op.getOpcode()) { 1330 default: 1331 Op->print(errs(), &DAG); 1332 llvm_unreachable("Custom lowering code for this " 1333 "instruction is not implemented yet!"); 
1334 break; 1335 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); 1336 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 1337 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 1338 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); 1339 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); 1340 case ISD::FREM: return LowerFREM(Op, DAG); 1341 case ISD::FCEIL: return LowerFCEIL(Op, DAG); 1342 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); 1343 case ISD::FRINT: return LowerFRINT(Op, DAG); 1344 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); 1345 case ISD::FROUNDEVEN: 1346 return LowerFROUNDEVEN(Op, DAG); 1347 case ISD::FROUND: return LowerFROUND(Op, DAG); 1348 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); 1349 case ISD::FLOG2: 1350 return LowerFLOG2(Op, DAG); 1351 case ISD::FLOG: 1352 case ISD::FLOG10: 1353 return LowerFLOGCommon(Op, DAG); 1354 case ISD::FEXP: 1355 case ISD::FEXP10: 1356 return lowerFEXP(Op, DAG); 1357 case ISD::FEXP2: 1358 return lowerFEXP2(Op, DAG); 1359 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 1360 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 1361 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); 1362 case ISD::FP_TO_SINT: 1363 case ISD::FP_TO_UINT: 1364 return LowerFP_TO_INT(Op, DAG); 1365 case ISD::CTTZ: 1366 case ISD::CTTZ_ZERO_UNDEF: 1367 case ISD::CTLZ: 1368 case ISD::CTLZ_ZERO_UNDEF: 1369 return LowerCTLZ_CTTZ(Op, DAG); 1370 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 1371 } 1372 return Op; 1373 } 1374 1375 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, 1376 SmallVectorImpl<SDValue> &Results, 1377 SelectionDAG &DAG) const { 1378 switch (N->getOpcode()) { 1379 case ISD::SIGN_EXTEND_INREG: 1380 // Different parts of legalization seem to interpret which type of 1381 // sign_extend_inreg is the one to check for custom lowering. The extended 1382 // from type is what really matters, but some places check for custom 1383 // lowering of the result type. This results in trying to use 1384 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do 1385 // nothing here and let the illegal result integer be handled normally. 
1386 return; 1387 case ISD::FLOG2: 1388 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG)) 1389 Results.push_back(Lowered); 1390 return; 1391 case ISD::FLOG: 1392 case ISD::FLOG10: 1393 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG)) 1394 Results.push_back(Lowered); 1395 return; 1396 case ISD::FEXP2: 1397 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG)) 1398 Results.push_back(Lowered); 1399 return; 1400 case ISD::FEXP: 1401 case ISD::FEXP10: 1402 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG)) 1403 Results.push_back(Lowered); 1404 return; 1405 case ISD::CTLZ: 1406 case ISD::CTLZ_ZERO_UNDEF: 1407 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG)) 1408 Results.push_back(Lowered); 1409 return; 1410 default: 1411 return; 1412 } 1413 } 1414 1415 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, 1416 SDValue Op, 1417 SelectionDAG &DAG) const { 1418 1419 const DataLayout &DL = DAG.getDataLayout(); 1420 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); 1421 const GlobalValue *GV = G->getGlobal(); 1422 1423 if (!MFI->isModuleEntryFunction()) { 1424 if (std::optional<uint32_t> Address = 1425 AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) { 1426 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType()); 1427 } 1428 } 1429 1430 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 1431 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) { 1432 if (!MFI->isModuleEntryFunction() && 1433 !GV->getName().equals("llvm.amdgcn.module.lds")) { 1434 SDLoc DL(Op); 1435 const Function &Fn = DAG.getMachineFunction().getFunction(); 1436 DiagnosticInfoUnsupported BadLDSDecl( 1437 Fn, "local memory global used by non-kernel function", 1438 DL.getDebugLoc(), DS_Warning); 1439 DAG.getContext()->diagnose(BadLDSDecl); 1440 1441 // We currently don't have a way to correctly allocate LDS objects that 1442 // aren't directly associated with a kernel. We do force inlining of 1443 // functions that use local objects. However, if these dead functions are 1444 // not eliminated, we don't want a compile time error. Just emit a warning 1445 // and a trap, since there should be no callable path here. 1446 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode()); 1447 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 1448 Trap, DAG.getRoot()); 1449 DAG.setRoot(OutputChain); 1450 return DAG.getUNDEF(Op.getValueType()); 1451 } 1452 1453 // XXX: What does the value of G->getOffset() mean? 1454 assert(G->getOffset() == 0 && 1455 "Do not know what to do with an non-zero offset"); 1456 1457 // TODO: We could emit code to handle the initialization somewhere. 1458 // We ignore the initializer for now and legalize it to allow selection. 1459 // The initializer will anyway get errored out during assembly emission. 1460 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV)); 1461 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); 1462 } 1463 return SDValue(); 1464 } 1465 1466 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, 1467 SelectionDAG &DAG) const { 1468 SmallVector<SDValue, 8> Args; 1469 SDLoc SL(Op); 1470 1471 EVT VT = Op.getValueType(); 1472 if (VT.getVectorElementType().getSizeInBits() < 32) { 1473 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits(); 1474 if (OpBitSize >= 32 && OpBitSize % 32 == 0) { 1475 unsigned NewNumElt = OpBitSize / 32; 1476 EVT NewEltVT = (NewNumElt == 1) ? 
MVT::i32 1477 : EVT::getVectorVT(*DAG.getContext(), 1478 MVT::i32, NewNumElt); 1479 for (const SDUse &U : Op->ops()) { 1480 SDValue In = U.get(); 1481 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In); 1482 if (NewNumElt > 1) 1483 DAG.ExtractVectorElements(NewIn, Args); 1484 else 1485 Args.push_back(NewIn); 1486 } 1487 1488 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 1489 NewNumElt * Op.getNumOperands()); 1490 SDValue BV = DAG.getBuildVector(NewVT, SL, Args); 1491 return DAG.getNode(ISD::BITCAST, SL, VT, BV); 1492 } 1493 } 1494 1495 for (const SDUse &U : Op->ops()) 1496 DAG.ExtractVectorElements(U.get(), Args); 1497 1498 return DAG.getBuildVector(Op.getValueType(), SL, Args); 1499 } 1500 1501 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 1502 SelectionDAG &DAG) const { 1503 SDLoc SL(Op); 1504 SmallVector<SDValue, 8> Args; 1505 unsigned Start = Op.getConstantOperandVal(1); 1506 EVT VT = Op.getValueType(); 1507 EVT SrcVT = Op.getOperand(0).getValueType(); 1508 1509 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) { 1510 unsigned NumElt = VT.getVectorNumElements(); 1511 unsigned NumSrcElt = SrcVT.getVectorNumElements(); 1512 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types"); 1513 1514 // Extract 32-bit registers at a time. 1515 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2); 1516 EVT NewVT = NumElt == 2 1517 ? MVT::i32 1518 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2); 1519 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0)); 1520 1521 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2); 1522 if (NumElt == 2) 1523 Tmp = Args[0]; 1524 else 1525 Tmp = DAG.getBuildVector(NewVT, SL, Args); 1526 1527 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp); 1528 } 1529 1530 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, 1531 VT.getVectorNumElements()); 1532 1533 return DAG.getBuildVector(Op.getValueType(), SL, Args); 1534 } 1535 1536 // TODO: Handle fabs too 1537 static SDValue peekFNeg(SDValue Val) { 1538 if (Val.getOpcode() == ISD::FNEG) 1539 return Val.getOperand(0); 1540 1541 return Val; 1542 } 1543 1544 static SDValue peekFPSignOps(SDValue Val) { 1545 if (Val.getOpcode() == ISD::FNEG) 1546 Val = Val.getOperand(0); 1547 if (Val.getOpcode() == ISD::FABS) 1548 Val = Val.getOperand(0); 1549 if (Val.getOpcode() == ISD::FCOPYSIGN) 1550 Val = Val.getOperand(0); 1551 return Val; 1552 } 1553 1554 SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl( 1555 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, 1556 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const { 1557 SelectionDAG &DAG = DCI.DAG; 1558 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1559 switch (CCOpcode) { 1560 case ISD::SETOEQ: 1561 case ISD::SETONE: 1562 case ISD::SETUNE: 1563 case ISD::SETNE: 1564 case ISD::SETUEQ: 1565 case ISD::SETEQ: 1566 case ISD::SETFALSE: 1567 case ISD::SETFALSE2: 1568 case ISD::SETTRUE: 1569 case ISD::SETTRUE2: 1570 case ISD::SETUO: 1571 case ISD::SETO: 1572 break; 1573 case ISD::SETULE: 1574 case ISD::SETULT: { 1575 if (LHS == True) 1576 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); 1577 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); 1578 } 1579 case ISD::SETOLE: 1580 case ISD::SETOLT: 1581 case ISD::SETLE: 1582 case ISD::SETLT: { 1583 // Ordered. Assume ordered for undefined. 1584 1585 // Only do this after legalization to avoid interfering with other combines 1586 // which might occur. 
1587 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && 1588 !DCI.isCalledByLegalizer()) 1589 return SDValue(); 1590 1591 // We need to permute the operands to get the correct NaN behavior. The 1592 // selected operand is the second one based on the failing compare with NaN, 1593 // so permute it based on the compare type the hardware uses. 1594 if (LHS == True) 1595 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); 1596 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); 1597 } 1598 case ISD::SETUGE: 1599 case ISD::SETUGT: { 1600 if (LHS == True) 1601 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); 1602 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); 1603 } 1604 case ISD::SETGT: 1605 case ISD::SETGE: 1606 case ISD::SETOGE: 1607 case ISD::SETOGT: { 1608 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && 1609 !DCI.isCalledByLegalizer()) 1610 return SDValue(); 1611 1612 if (LHS == True) 1613 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); 1614 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); 1615 } 1616 case ISD::SETCC_INVALID: 1617 llvm_unreachable("Invalid setcc condcode!"); 1618 } 1619 return SDValue(); 1620 } 1621 1622 /// Generate Min/Max node 1623 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, 1624 SDValue LHS, SDValue RHS, 1625 SDValue True, SDValue False, 1626 SDValue CC, 1627 DAGCombinerInfo &DCI) const { 1628 if ((LHS == True && RHS == False) || (LHS == False && RHS == True)) 1629 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI); 1630 1631 SelectionDAG &DAG = DCI.DAG; 1632 1633 // If we can't directly match this, try to see if we can fold an fneg to 1634 // match. 1635 1636 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 1637 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False); 1638 SDValue NegTrue = peekFNeg(True); 1639 1640 // Undo the combine foldFreeOpFromSelect does if it helps us match the 1641 // fmin/fmax. 
1642 // 1643 // select (fcmp olt (lhs, K)), (fneg lhs), -K 1644 // -> fneg (fmin_legacy lhs, K) 1645 // 1646 // TODO: Use getNegatedExpression 1647 if (LHS == NegTrue && CFalse && CRHS) { 1648 APFloat NegRHS = neg(CRHS->getValueAPF()); 1649 if (NegRHS == CFalse->getValueAPF()) { 1650 SDValue Combined = 1651 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI); 1652 if (Combined) 1653 return DAG.getNode(ISD::FNEG, DL, VT, Combined); 1654 return SDValue(); 1655 } 1656 } 1657 1658 return SDValue(); 1659 } 1660 1661 std::pair<SDValue, SDValue> 1662 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const { 1663 SDLoc SL(Op); 1664 1665 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1666 1667 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 1668 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 1669 1670 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); 1671 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); 1672 1673 return std::pair(Lo, Hi); 1674 } 1675 1676 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const { 1677 SDLoc SL(Op); 1678 1679 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1680 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 1681 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); 1682 } 1683 1684 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const { 1685 SDLoc SL(Op); 1686 1687 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1688 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 1689 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); 1690 } 1691 1692 // Split a vector type into two parts. The first part is a power of two vector. 1693 // The second part is whatever is left over, and is a scalar if it would 1694 // otherwise be a 1-vector. 1695 std::pair<EVT, EVT> 1696 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const { 1697 EVT LoVT, HiVT; 1698 EVT EltVT = VT.getVectorElementType(); 1699 unsigned NumElts = VT.getVectorNumElements(); 1700 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2); 1701 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts); 1702 HiVT = NumElts - LoNumElts == 1 1703 ? EltVT 1704 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts); 1705 return std::pair(LoVT, HiVT); 1706 } 1707 1708 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be 1709 // scalar. 1710 std::pair<SDValue, SDValue> 1711 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, 1712 const EVT &LoVT, const EVT &HiVT, 1713 SelectionDAG &DAG) const { 1714 assert(LoVT.getVectorNumElements() + 1715 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <= 1716 N.getValueType().getVectorNumElements() && 1717 "More vector elements requested than available!"); 1718 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, 1719 DAG.getVectorIdxConstant(0, DL)); 1720 SDValue Hi = DAG.getNode( 1721 HiVT.isVector() ? 
ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL, 1722 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL)); 1723 return std::pair(Lo, Hi); 1724 } 1725 1726 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, 1727 SelectionDAG &DAG) const { 1728 LoadSDNode *Load = cast<LoadSDNode>(Op); 1729 EVT VT = Op.getValueType(); 1730 SDLoc SL(Op); 1731 1732 1733 // If this is a 2 element vector, we really want to scalarize and not create 1734 // weird 1 element vectors. 1735 if (VT.getVectorNumElements() == 2) { 1736 SDValue Ops[2]; 1737 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG); 1738 return DAG.getMergeValues(Ops, SL); 1739 } 1740 1741 SDValue BasePtr = Load->getBasePtr(); 1742 EVT MemVT = Load->getMemoryVT(); 1743 1744 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); 1745 1746 EVT LoVT, HiVT; 1747 EVT LoMemVT, HiMemVT; 1748 SDValue Lo, Hi; 1749 1750 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); 1751 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); 1752 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG); 1753 1754 unsigned Size = LoMemVT.getStoreSize(); 1755 Align BaseAlign = Load->getAlign(); 1756 Align HiAlign = commonAlignment(BaseAlign, Size); 1757 1758 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, 1759 Load->getChain(), BasePtr, SrcValue, LoMemVT, 1760 BaseAlign, Load->getMemOperand()->getFlags()); 1761 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size)); 1762 SDValue HiLoad = 1763 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), 1764 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), 1765 HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); 1766 1767 SDValue Join; 1768 if (LoVT == HiVT) { 1769 // This is the case that the vector is power of two so was evenly split. 1770 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad); 1771 } else { 1772 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad, 1773 DAG.getVectorIdxConstant(0, SL)); 1774 Join = DAG.getNode( 1775 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL, 1776 VT, Join, HiLoad, 1777 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL)); 1778 } 1779 1780 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other, 1781 LoLoad.getValue(1), HiLoad.getValue(1))}; 1782 1783 return DAG.getMergeValues(Ops, SL); 1784 } 1785 1786 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op, 1787 SelectionDAG &DAG) const { 1788 LoadSDNode *Load = cast<LoadSDNode>(Op); 1789 EVT VT = Op.getValueType(); 1790 SDValue BasePtr = Load->getBasePtr(); 1791 EVT MemVT = Load->getMemoryVT(); 1792 SDLoc SL(Op); 1793 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); 1794 Align BaseAlign = Load->getAlign(); 1795 unsigned NumElements = MemVT.getVectorNumElements(); 1796 1797 // Widen from vec3 to vec4 when the load is at least 8-byte aligned 1798 // or 16-byte fully dereferenceable. Otherwise, split the vector load. 
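// For example, a <3 x i32> load that is 8-byte aligned (or has 16
// dereferenceable bytes) becomes a <4 x i32> load whose extra element is
// dropped with EXTRACT_SUBVECTOR; otherwise SplitVectorLoad breaks it into
// a <2 x i32> load plus a scalar i32 load.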
1799 if (NumElements != 3 || 1800 (BaseAlign < Align(8) && 1801 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout()))) 1802 return SplitVectorLoad(Op, DAG); 1803 1804 assert(NumElements == 3); 1805 1806 EVT WideVT = 1807 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); 1808 EVT WideMemVT = 1809 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4); 1810 SDValue WideLoad = DAG.getExtLoad( 1811 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue, 1812 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags()); 1813 return DAG.getMergeValues( 1814 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad, 1815 DAG.getVectorIdxConstant(0, SL)), 1816 WideLoad.getValue(1)}, 1817 SL); 1818 } 1819 1820 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, 1821 SelectionDAG &DAG) const { 1822 StoreSDNode *Store = cast<StoreSDNode>(Op); 1823 SDValue Val = Store->getValue(); 1824 EVT VT = Val.getValueType(); 1825 1826 // If this is a 2 element vector, we really want to scalarize and not create 1827 // weird 1 element vectors. 1828 if (VT.getVectorNumElements() == 2) 1829 return scalarizeVectorStore(Store, DAG); 1830 1831 EVT MemVT = Store->getMemoryVT(); 1832 SDValue Chain = Store->getChain(); 1833 SDValue BasePtr = Store->getBasePtr(); 1834 SDLoc SL(Op); 1835 1836 EVT LoVT, HiVT; 1837 EVT LoMemVT, HiMemVT; 1838 SDValue Lo, Hi; 1839 1840 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); 1841 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); 1842 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG); 1843 1844 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); 1845 1846 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); 1847 Align BaseAlign = Store->getAlign(); 1848 unsigned Size = LoMemVT.getStoreSize(); 1849 Align HiAlign = commonAlignment(BaseAlign, Size); 1850 1851 SDValue LoStore = 1852 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, 1853 Store->getMemOperand()->getFlags()); 1854 SDValue HiStore = 1855 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), 1856 HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); 1857 1858 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); 1859 } 1860 1861 // This is a shortcut for integer division because we have fast i32<->f32 1862 // conversions, and fast f32 reciprocal instructions. The fractional part of a 1863 // float is enough to accurately represent up to a 24-bit signed integer. 1864 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, 1865 bool Sign) const { 1866 SDLoc DL(Op); 1867 EVT VT = Op.getValueType(); 1868 SDValue LHS = Op.getOperand(0); 1869 SDValue RHS = Op.getOperand(1); 1870 MVT IntVT = MVT::i32; 1871 MVT FltVT = MVT::f32; 1872 1873 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS); 1874 if (LHSSignBits < 9) 1875 return SDValue(); 1876 1877 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS); 1878 if (RHSSignBits < 9) 1879 return SDValue(); 1880 1881 unsigned BitSize = VT.getSizeInBits(); 1882 unsigned SignBits = std::min(LHSSignBits, RHSSignBits); 1883 unsigned DivBits = BitSize - SignBits; 1884 if (Sign) 1885 ++DivBits; 1886 1887 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; 1888 ISD::NodeType ToInt = Sign ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1889
1890 SDValue jq = DAG.getConstant(1, DL, IntVT);
1891
1892 if (Sign) {
1893 // char|short jq = ia ^ ib;
1894 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1895
1896 // jq = jq >> (bitsize - 2)
1897 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1898 DAG.getConstant(BitSize - 2, DL, VT));
1899
1900 // jq = jq | 0x1
1901 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1902 }
1903
1904 // int ia = (int)LHS;
1905 SDValue ia = LHS;
1906
1907 // int ib = (int)RHS;
1908 SDValue ib = RHS;
1909
1910 // float fa = (float)ia;
1911 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1912
1913 // float fb = (float)ib;
1914 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1915
1916 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1917 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1918
1919 // fq = trunc(fq);
1920 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1921
1922 // float fqneg = -fq;
1923 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1924
1925 MachineFunction &MF = DAG.getMachineFunction();
1926
1927 bool UseFmadFtz = false;
1928 if (Subtarget->isGCN()) {
1929 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1930 UseFmadFtz =
1931 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1932 }
1933
1934 // float fr = mad(fqneg, fb, fa);
1935 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1936 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1937 : (unsigned)ISD::FMAD;
1938 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1939
1940 // int iq = (int)fq;
1941 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1942
1943 // fr = fabs(fr);
1944 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1945
1946 // fb = fabs(fb);
1947 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1948
1949 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1950
1951 // int cv = fr >= fb;
1952 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1953
1954 // jq = (cv ? jq : 0);
1955 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1956
1957 // dst = iq + jq;
1958 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1959
1960 // Rem needs compensation; it's easier to recompute it.
1961 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1962 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1963
1964 // Truncate to the number of bits this divide really is.
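// For example, with 24 known sign bits on each operand, DivBits is 8 for the
// unsigned case and 9 for the signed case, so only the low DivBits of Div and
// Rem are meaningful; the rest are cleared (unsigned) or sign-extended
// (signed) below.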
1965 if (Sign) { 1966 SDValue InRegSize 1967 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits)); 1968 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize); 1969 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize); 1970 } else { 1971 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT); 1972 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask); 1973 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask); 1974 } 1975 1976 return DAG.getMergeValues({ Div, Rem }, DL); 1977 } 1978 1979 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, 1980 SelectionDAG &DAG, 1981 SmallVectorImpl<SDValue> &Results) const { 1982 SDLoc DL(Op); 1983 EVT VT = Op.getValueType(); 1984 1985 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64"); 1986 1987 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 1988 1989 SDValue One = DAG.getConstant(1, DL, HalfVT); 1990 SDValue Zero = DAG.getConstant(0, DL, HalfVT); 1991 1992 //HiLo split 1993 SDValue LHS_Lo, LHS_Hi; 1994 SDValue LHS = Op.getOperand(0); 1995 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT); 1996 1997 SDValue RHS_Lo, RHS_Hi; 1998 SDValue RHS = Op.getOperand(1); 1999 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT); 2000 2001 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && 2002 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { 2003 2004 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 2005 LHS_Lo, RHS_Lo); 2006 2007 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero}); 2008 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero}); 2009 2010 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV)); 2011 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM)); 2012 return; 2013 } 2014 2015 if (isTypeLegal(MVT::i64)) { 2016 // The algorithm here is based on ideas from "Software Integer Division", 2017 // Tom Rodeheffer, August 2008. 2018 2019 MachineFunction &MF = DAG.getMachineFunction(); 2020 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2021 2022 // Compute denominator reciprocal. 2023 unsigned FMAD = 2024 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA 2025 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign() 2026 ? 
(unsigned)ISD::FMAD 2027 : (unsigned)AMDGPUISD::FMAD_FTZ; 2028 2029 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); 2030 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); 2031 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi, 2032 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32), 2033 Cvt_Lo); 2034 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1); 2035 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp, 2036 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32)); 2037 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1, 2038 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32)); 2039 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2); 2040 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc, 2041 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32), 2042 Mul1); 2043 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2); 2044 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc); 2045 SDValue Rcp64 = DAG.getBitcast(VT, 2046 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi})); 2047 2048 SDValue Zero64 = DAG.getConstant(0, DL, VT); 2049 SDValue One64 = DAG.getConstant(1, DL, VT); 2050 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1); 2051 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1); 2052 2053 // First round of UNR (Unsigned integer Newton-Raphson). 2054 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS); 2055 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64); 2056 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1); 2057 SDValue Mulhi1_Lo, Mulhi1_Hi; 2058 std::tie(Mulhi1_Lo, Mulhi1_Hi) = 2059 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT); 2060 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo, 2061 Mulhi1_Lo, Zero1); 2062 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi, 2063 Mulhi1_Hi, Add1_Lo.getValue(1)); 2064 SDValue Add1 = DAG.getBitcast(VT, 2065 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi})); 2066 2067 // Second round of UNR. 
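// Each Newton-Raphson iteration roughly doubles the number of correct bits in
// the reciprocal estimate, so a second round is enough to cover the full
// 64-bit width before the quotient is formed.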
2068 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2069 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2070 SDValue Mulhi2_Lo, Mulhi2_Hi;
2071 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2072 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2073 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2074 Mulhi2_Lo, Zero1);
2075 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2076 Mulhi2_Hi, Add2_Lo.getValue(1));
2077 SDValue Add2 = DAG.getBitcast(VT,
2078 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2079
2080 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2081
2082 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2083
2084 SDValue Mul3_Lo, Mul3_Hi;
2085 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2086 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2087 Mul3_Lo, Zero1);
2088 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2089 Mul3_Hi, Sub1_Lo.getValue(1));
2090 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2091 SDValue Sub1 = DAG.getBitcast(VT,
2092 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2093
2094 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2095 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2096 ISD::SETUGE);
2097 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2098 ISD::SETUGE);
2099 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2100
2101 // TODO: Portions of the code here and below could be enclosed in if/endif.
2102 // Currently control flow is unconditional and we have 4 selects after the
2103 // potential endif to substitute PHIs.
2104
2105 // if C3 != 0 ...
2106 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2107 RHS_Lo, Zero1);
2108 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2109 RHS_Hi, Sub1_Lo.getValue(1));
2110 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2111 Zero, Sub2_Lo.getValue(1));
2112 SDValue Sub2 = DAG.getBitcast(VT,
2113 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2114
2115 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2116
2117 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2118 ISD::SETUGE);
2119 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2120 ISD::SETUGE);
2121 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2122
2123 // if (C6 != 0)
2124 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2125
2126 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2127 RHS_Lo, Zero1);
2128 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2129 RHS_Hi, Sub2_Lo.getValue(1));
2130 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2131 Zero, Sub3_Lo.getValue(1));
2132 SDValue Sub3 = DAG.getBitcast(VT,
2133 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2134
2135 // endif C6
2136 // endif C3
2137
2138 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2139 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2140
2141 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2142 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2143
2144 Results.push_back(Div);
2145 Results.push_back(Rem);
2146
2147 return;
2148 }
2149
2150 // r600 expansion.
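// This is a restoring long division over the low 32 bits of the dividend:
// the speculative udiv/urem below seed the quotient's high word and the
// running remainder for the case where the divisor fits in 32 bits, and each
// loop iteration shifts one dividend bit into the remainder and produces one
// quotient bit.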
2151 // Get Speculative values 2152 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); 2153 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); 2154 2155 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ); 2156 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero}); 2157 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); 2158 2159 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ); 2160 SDValue DIV_Lo = Zero; 2161 2162 const unsigned halfBitWidth = HalfVT.getSizeInBits(); 2163 2164 for (unsigned i = 0; i < halfBitWidth; ++i) { 2165 const unsigned bitPos = halfBitWidth - i - 1; 2166 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); 2167 // Get value of high bit 2168 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); 2169 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One); 2170 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); 2171 2172 // Shift 2173 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); 2174 // Add LHS high bit 2175 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); 2176 2177 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); 2178 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE); 2179 2180 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); 2181 2182 // Update REM 2183 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); 2184 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); 2185 } 2186 2187 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi}); 2188 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV); 2189 Results.push_back(DIV); 2190 Results.push_back(REM); 2191 } 2192 2193 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, 2194 SelectionDAG &DAG) const { 2195 SDLoc DL(Op); 2196 EVT VT = Op.getValueType(); 2197 2198 if (VT == MVT::i64) { 2199 SmallVector<SDValue, 2> Results; 2200 LowerUDIVREM64(Op, DAG, Results); 2201 return DAG.getMergeValues(Results, DL); 2202 } 2203 2204 if (VT == MVT::i32) { 2205 if (SDValue Res = LowerDIVREM24(Op, DAG, false)) 2206 return Res; 2207 } 2208 2209 SDValue X = Op.getOperand(0); 2210 SDValue Y = Op.getOperand(1); 2211 2212 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2213 // algorithm used here. 2214 2215 // Initial estimate of inv(y). 2216 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y); 2217 2218 // One round of UNR. 2219 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y); 2220 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z); 2221 Z = DAG.getNode(ISD::ADD, DL, VT, Z, 2222 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ)); 2223 2224 // Quotient/remainder estimate. 2225 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z); 2226 SDValue R = 2227 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y)); 2228 2229 // First quotient/remainder refinement. 2230 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2231 SDValue One = DAG.getConstant(1, DL, VT); 2232 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE); 2233 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2234 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q); 2235 R = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2236 DAG.getNode(ISD::SUB, DL, VT, R, Y), R); 2237 2238 // Second quotient/remainder refinement. 
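// This relies on the estimate from the single Newton-Raphson round above
// being at most two below the true quotient, so two conditional corrections
// make the result exact.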
2239 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE); 2240 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2241 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q); 2242 R = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2243 DAG.getNode(ISD::SUB, DL, VT, R, Y), R); 2244 2245 return DAG.getMergeValues({Q, R}, DL); 2246 } 2247 2248 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, 2249 SelectionDAG &DAG) const { 2250 SDLoc DL(Op); 2251 EVT VT = Op.getValueType(); 2252 2253 SDValue LHS = Op.getOperand(0); 2254 SDValue RHS = Op.getOperand(1); 2255 2256 SDValue Zero = DAG.getConstant(0, DL, VT); 2257 SDValue NegOne = DAG.getConstant(-1, DL, VT); 2258 2259 if (VT == MVT::i32) { 2260 if (SDValue Res = LowerDIVREM24(Op, DAG, true)) 2261 return Res; 2262 } 2263 2264 if (VT == MVT::i64 && 2265 DAG.ComputeNumSignBits(LHS) > 32 && 2266 DAG.ComputeNumSignBits(RHS) > 32) { 2267 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 2268 2269 //HiLo split 2270 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); 2271 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); 2272 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 2273 LHS_Lo, RHS_Lo); 2274 SDValue Res[2] = { 2275 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), 2276 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) 2277 }; 2278 return DAG.getMergeValues(Res, DL); 2279 } 2280 2281 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); 2282 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); 2283 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); 2284 SDValue RSign = LHSign; // Remainder sign is the same as LHS 2285 2286 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); 2287 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); 2288 2289 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); 2290 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); 2291 2292 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); 2293 SDValue Rem = Div.getValue(1); 2294 2295 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); 2296 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); 2297 2298 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); 2299 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); 2300 2301 SDValue Res[2] = { 2302 Div, 2303 Rem 2304 }; 2305 return DAG.getMergeValues(Res, DL); 2306 } 2307 2308 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x) 2309 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { 2310 SDLoc SL(Op); 2311 EVT VT = Op.getValueType(); 2312 auto Flags = Op->getFlags(); 2313 SDValue X = Op.getOperand(0); 2314 SDValue Y = Op.getOperand(1); 2315 2316 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags); 2317 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags); 2318 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags); 2319 // TODO: For f32 use FMAD instead if !hasFastFMA32? 
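// A single fma computes x - trunc(x/y)*y with only one rounding step, which
// a separate multiply and subtract would not.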
2320 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags); 2321 } 2322 2323 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { 2324 SDLoc SL(Op); 2325 SDValue Src = Op.getOperand(0); 2326 2327 // result = trunc(src) 2328 // if (src > 0.0 && src != result) 2329 // result += 1.0 2330 2331 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2332 2333 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); 2334 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 2335 2336 EVT SetCCVT = 2337 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2338 2339 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); 2340 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 2341 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 2342 2343 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); 2344 // TODO: Should this propagate fast-math-flags? 2345 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 2346 } 2347 2348 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, 2349 SelectionDAG &DAG) { 2350 const unsigned FractBits = 52; 2351 const unsigned ExpBits = 11; 2352 2353 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, 2354 Hi, 2355 DAG.getConstant(FractBits - 32, SL, MVT::i32), 2356 DAG.getConstant(ExpBits, SL, MVT::i32)); 2357 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, 2358 DAG.getConstant(1023, SL, MVT::i32)); 2359 2360 return Exp; 2361 } 2362 2363 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { 2364 SDLoc SL(Op); 2365 SDValue Src = Op.getOperand(0); 2366 2367 assert(Op.getValueType() == MVT::f64); 2368 2369 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 2370 2371 // Extract the upper half, since this is where we will find the sign and 2372 // exponent. 2373 SDValue Hi = getHiHalf64(Src, DAG); 2374 2375 SDValue Exp = extractF64Exponent(Hi, SL, DAG); 2376 2377 const unsigned FractBits = 52; 2378 2379 // Extract the sign bit. 2380 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); 2381 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); 2382 2383 // Extend back to 64-bits. 
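// Element 0 of the v2i32 ends up in the low 32 bits of the i64, so placing
// SignBit in element 1 puts it at bit 63.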
2384 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit}); 2385 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); 2386 2387 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); 2388 const SDValue FractMask 2389 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); 2390 2391 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); 2392 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); 2393 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); 2394 2395 EVT SetCCVT = 2396 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); 2397 2398 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); 2399 2400 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); 2401 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); 2402 2403 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); 2404 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); 2405 2406 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); 2407 } 2408 2409 SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op, 2410 SelectionDAG &DAG) const { 2411 SDLoc SL(Op); 2412 SDValue Src = Op.getOperand(0); 2413 2414 assert(Op.getValueType() == MVT::f64); 2415 2416 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2417 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); 2418 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); 2419 2420 // TODO: Should this propagate fast-math-flags? 2421 2422 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); 2423 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); 2424 2425 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); 2426 2427 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2428 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); 2429 2430 EVT SetCCVT = 2431 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2432 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); 2433 2434 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); 2435 } 2436 2437 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, 2438 SelectionDAG &DAG) const { 2439 // FNEARBYINT and FRINT are the same, except in their handling of FP 2440 // exceptions. Those aren't really meaningful for us, and OpenCL only has 2441 // rint, so just treat them as equivalent. 2442 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(), 2443 Op.getOperand(0)); 2444 } 2445 2446 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { 2447 auto VT = Op.getValueType(); 2448 auto Arg = Op.getOperand(0u); 2449 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg); 2450 } 2451 2452 // XXX - May require not supporting f32 denormals? 2453 2454 // Don't handle v2f16. The extra instructions to scalarize and repack around the 2455 // compare and vselect end up producing worse code than scalarizing the whole 2456 // operation. 2457 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2458 SDLoc SL(Op); 2459 SDValue X = Op.getOperand(0); 2460 EVT VT = Op.getValueType(); 2461 2462 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X); 2463 2464 // TODO: Should this propagate fast-math-flags? 
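// round(x) = trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1.0 : 0.0, x),
// i.e. halfway cases round away from zero.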
2465 2466 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T); 2467 2468 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff); 2469 2470 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2471 const SDValue One = DAG.getConstantFP(1.0, SL, VT); 2472 2473 EVT SetCCVT = 2474 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2475 2476 const SDValue Half = DAG.getConstantFP(0.5, SL, VT); 2477 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); 2478 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero); 2479 2480 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X); 2481 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset); 2482 } 2483 2484 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { 2485 SDLoc SL(Op); 2486 SDValue Src = Op.getOperand(0); 2487 2488 // result = trunc(src); 2489 // if (src < 0.0 && src != result) 2490 // result += -1.0. 2491 2492 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2493 2494 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); 2495 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); 2496 2497 EVT SetCCVT = 2498 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2499 2500 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); 2501 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 2502 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 2503 2504 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); 2505 // TODO: Should this propagate fast-math-flags? 2506 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 2507 } 2508 2509 /// Return true if it's known that \p Src can never be an f32 denormal value. 2510 static bool valueIsKnownNeverF32Denorm(SDValue Src) { 2511 switch (Src.getOpcode()) { 2512 case ISD::FP_EXTEND: 2513 return Src.getOperand(0).getValueType() == MVT::f16; 2514 case ISD::FP16_TO_FP: 2515 case ISD::FFREXP: 2516 return true; 2517 case ISD::INTRINSIC_WO_CHAIN: { 2518 unsigned IntrinsicID = Src.getConstantOperandVal(0); 2519 switch (IntrinsicID) { 2520 case Intrinsic::amdgcn_frexp_mant: 2521 return true; 2522 default: 2523 return false; 2524 } 2525 } 2526 default: 2527 return false; 2528 } 2529 2530 llvm_unreachable("covered opcode switch"); 2531 } 2532 2533 bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG, 2534 SDNodeFlags Flags) { 2535 if (Flags.hasApproximateFuncs()) 2536 return true; 2537 auto &Options = DAG.getTarget().Options; 2538 return Options.UnsafeFPMath || Options.ApproxFuncFPMath; 2539 } 2540 2541 bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG, 2542 SDValue Src, 2543 SDNodeFlags Flags) { 2544 return !valueIsKnownNeverF32Denorm(Src) && 2545 DAG.getMachineFunction() 2546 .getDenormalMode(APFloat::IEEEsingle()) 2547 .Input != DenormalMode::PreserveSign; 2548 } 2549 2550 SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG, 2551 SDValue Src, 2552 SDNodeFlags Flags) const { 2553 SDLoc SL(Src); 2554 EVT VT = Src.getValueType(); 2555 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT); 2556 SDValue SmallestNormal = 2557 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT); 2558 2559 // Want to scale denormals up, but negatives and 0 work just as well on the 2560 // scaled path. 
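// SETOLT is false for NaN inputs, so NaNs also take the unscaled path.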
2561 SDValue IsLtSmallestNormal = DAG.getSetCC( 2562 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src, 2563 SmallestNormal, ISD::SETOLT); 2564 2565 return IsLtSmallestNormal; 2566 } 2567 2568 SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src, 2569 SDNodeFlags Flags) const { 2570 SDLoc SL(Src); 2571 EVT VT = Src.getValueType(); 2572 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT); 2573 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT); 2574 2575 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags); 2576 SDValue IsFinite = DAG.getSetCC( 2577 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs, 2578 Inf, ISD::SETOLT); 2579 return IsFinite; 2580 } 2581 2582 /// If denormal handling is required return the scaled input to FLOG2, and the 2583 /// check for denormal range. Otherwise, return null values. 2584 std::pair<SDValue, SDValue> 2585 AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, 2586 SDValue Src, SDNodeFlags Flags) const { 2587 if (!needsDenormHandlingF32(DAG, Src, Flags)) 2588 return {}; 2589 2590 MVT VT = MVT::f32; 2591 const fltSemantics &Semantics = APFloat::IEEEsingle(); 2592 SDValue SmallestNormal = 2593 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT); 2594 2595 SDValue IsLtSmallestNormal = DAG.getSetCC( 2596 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src, 2597 SmallestNormal, ISD::SETOLT); 2598 2599 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT); 2600 SDValue One = DAG.getConstantFP(1.0, SL, VT); 2601 SDValue ScaleFactor = 2602 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags); 2603 2604 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags); 2605 return {ScaledInput, IsLtSmallestNormal}; 2606 } 2607 2608 SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const { 2609 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. 2610 // If we have to handle denormals, scale up the input and adjust the result. 2611 2612 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) 2613 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0) 2614 2615 SDLoc SL(Op); 2616 EVT VT = Op.getValueType(); 2617 SDValue Src = Op.getOperand(0); 2618 SDNodeFlags Flags = Op->getFlags(); 2619 2620 if (VT == MVT::f16) { 2621 // Nothing in half is a denormal when promoted to f32. 
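// (The smallest f16 denormal, 2^-24, is still a normal value in f32.)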
2622 assert(!Subtarget->has16BitInsts()); 2623 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); 2624 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags); 2625 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, 2626 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 2627 } 2628 2629 auto [ScaledInput, IsLtSmallestNormal] = 2630 getScaledLogInput(DAG, SL, Src, Flags); 2631 if (!ScaledInput) 2632 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags); 2633 2634 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags); 2635 2636 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT); 2637 SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2638 SDValue ResultOffset = 2639 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero); 2640 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags); 2641 } 2642 2643 static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, 2644 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) { 2645 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags); 2646 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags); 2647 } 2648 2649 SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, 2650 SelectionDAG &DAG) const { 2651 SDValue X = Op.getOperand(0); 2652 EVT VT = Op.getValueType(); 2653 SDNodeFlags Flags = Op->getFlags(); 2654 SDLoc DL(Op); 2655 2656 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10; 2657 assert(IsLog10 || Op.getOpcode() == ISD::FLOG); 2658 2659 const auto &Options = getTargetMachine().Options; 2660 if (VT == MVT::f16 || Flags.hasApproximateFuncs() || 2661 Options.ApproxFuncFPMath || Options.UnsafeFPMath) { 2662 2663 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { 2664 // Log and multiply in f32 is good enough for f16. 2665 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags); 2666 } 2667 2668 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags); 2669 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { 2670 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered, 2671 DAG.getTargetConstant(0, DL, MVT::i32), Flags); 2672 } 2673 2674 return Lowered; 2675 } 2676 2677 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags); 2678 if (ScaledInput) 2679 X = ScaledInput; 2680 2681 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags); 2682 2683 SDValue R; 2684 if (Subtarget->hasFastFMAF32()) { 2685 // c+cc are ln(2)/ln(10) to more than 49 bits 2686 const float c_log10 = 0x1.344134p-2f; 2687 const float cc_log10 = 0x1.09f79ep-26f; 2688 2689 // c + cc is ln(2) to more than 49 bits 2690 const float c_log = 0x1.62e42ep-1f; 2691 const float cc_log = 0x1.efa39ep-25f; 2692 2693 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); 2694 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); 2695 2696 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); 2697 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); 2698 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags); 2699 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags); 2700 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags); 2701 } else { 2702 // ch+ct is ln(2)/ln(10) to more than 36 bits 2703 const float ch_log10 = 0x1.344000p-2f; 2704 const float ct_log10 = 0x1.3509f6p-18f; 2705 2706 // ch + ct is ln(2) to more than 36 bits 2707 const float ch_log = 0x1.62e000p-1f; 2708 const float ct_log = 0x1.0bfbe8p-15f; 2709 2710 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT); 2711 SDValue CT = DAG.getConstantFP(IsLog10 ? 
ct_log10 : ct_log, DL, VT); 2712 2713 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y); 2714 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32); 2715 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst); 2716 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); 2717 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); 2718 2719 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); 2720 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags); 2721 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags); 2722 R = getMad(DAG, DL, VT, YH, CH, Mad1); 2723 } 2724 2725 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && 2726 (Flags.hasNoInfs() || Options.NoInfsFPMath); 2727 2728 // TODO: Check if known finite from source value. 2729 if (!IsFiniteOnly) { 2730 SDValue IsFinite = getIsFinite(DAG, Y, Flags); 2731 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags); 2732 } 2733 2734 if (IsScaled) { 2735 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT); 2736 SDValue ShiftK = 2737 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT); 2738 SDValue Shift = 2739 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags); 2740 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags); 2741 } 2742 2743 return R; 2744 } 2745 2746 SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const { 2747 return LowerFLOGCommon(Op, DAG); 2748 } 2749 2750 // Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a 2751 // promote f16 operation. 2752 SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL, 2753 SelectionDAG &DAG, bool IsLog10, 2754 SDNodeFlags Flags) const { 2755 EVT VT = Src.getValueType(); 2756 unsigned LogOp = 2757 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2; 2758 2759 double Log2BaseInverted = 2760 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; 2761 2762 if (VT == MVT::f32) { 2763 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags); 2764 if (ScaledInput) { 2765 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags); 2766 SDValue ScaledResultOffset = 2767 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT); 2768 2769 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT); 2770 2771 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled, 2772 ScaledResultOffset, Zero, Flags); 2773 2774 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT); 2775 2776 if (Subtarget->hasFastFMAF32()) 2777 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset, 2778 Flags); 2779 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags); 2780 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset); 2781 } 2782 } 2783 2784 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags); 2785 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT); 2786 2787 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand, 2788 Flags); 2789 } 2790 2791 SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { 2792 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. 2793 // If we have to handle denormals, scale up the input and adjust the result. 2794 2795 SDLoc SL(Op); 2796 EVT VT = Op.getValueType(); 2797 SDValue Src = Op.getOperand(0); 2798 SDNodeFlags Flags = Op->getFlags(); 2799 2800 if (VT == MVT::f16) { 2801 // Nothing in half is a denormal when promoted to f32. 
2802 assert(!Subtarget->has16BitInsts()); 2803 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); 2804 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags); 2805 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, 2806 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 2807 } 2808 2809 assert(VT == MVT::f32); 2810 2811 if (!needsDenormHandlingF32(DAG, Src, Flags)) 2812 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags); 2813 2814 // bool needs_scaling = x < -0x1.f80000p+6f; 2815 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); 2816 2817 // -nextafter(128.0, -1) 2818 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT); 2819 2820 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2821 2822 SDValue NeedsScaling = 2823 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT); 2824 2825 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT); 2826 SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2827 2828 SDValue AddOffset = 2829 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero); 2830 2831 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags); 2832 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags); 2833 2834 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT); 2835 SDValue One = DAG.getConstantFP(1.0, SL, VT); 2836 SDValue ResultScale = 2837 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One); 2838 2839 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags); 2840 } 2841 2842 SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL, 2843 SelectionDAG &DAG, 2844 SDNodeFlags Flags) const { 2845 EVT VT = X.getValueType(); 2846 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT); 2847 2848 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { 2849 // exp2(M_LOG2E_F * f); 2850 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags); 2851 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP 2852 : (unsigned)ISD::FEXP2, 2853 SL, VT, Mul, Flags); 2854 } 2855 2856 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2857 2858 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT); 2859 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT); 2860 2861 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT); 2862 2863 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags); 2864 2865 SDValue AdjustedX = 2866 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); 2867 2868 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags); 2869 2870 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags); 2871 2872 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT); 2873 SDValue AdjustedResult = 2874 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags); 2875 2876 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2, 2877 Flags); 2878 } 2879 2880 /// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be 2881 /// handled correctly. 2882 SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL, 2883 SelectionDAG &DAG, 2884 SDNodeFlags Flags) const { 2885 const EVT VT = X.getValueType(); 2886 const unsigned Exp2Op = VT == MVT::f32 ? 
AMDGPUISD::EXP : ISD::FEXP2; 2887 2888 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { 2889 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f); 2890 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT); 2891 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT); 2892 2893 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags); 2894 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags); 2895 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags); 2896 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags); 2897 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1); 2898 } 2899 2900 // bool s = x < -0x1.2f7030p+5f; 2901 // x += s ? 0x1.0p+5f : 0.0f; 2902 // exp10 = exp2(x * 0x1.a92000p+1f) * 2903 // exp2(x * 0x1.4f0978p-11f) * 2904 // (s ? 0x1.9f623ep-107f : 1.0f); 2905 2906 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2907 2908 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT); 2909 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT); 2910 2911 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT); 2912 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags); 2913 SDValue AdjustedX = 2914 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); 2915 2916 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT); 2917 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT); 2918 2919 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags); 2920 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags); 2921 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags); 2922 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags); 2923 2924 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags); 2925 2926 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT); 2927 SDValue AdjustedResult = 2928 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags); 2929 2930 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps, 2931 Flags); 2932 } 2933 2934 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { 2935 EVT VT = Op.getValueType(); 2936 SDLoc SL(Op); 2937 SDValue X = Op.getOperand(0); 2938 SDNodeFlags Flags = Op->getFlags(); 2939 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10; 2940 2941 if (VT.getScalarType() == MVT::f16) { 2942 // v_exp_f16 (fmul x, log2e) 2943 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast? 2944 return lowerFEXPUnsafe(X, SL, DAG, Flags); 2945 2946 if (VT.isVector()) 2947 return SDValue(); 2948 2949 // exp(f16 x) -> 2950 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 2951 2952 // Nothing in half is a denormal when promoted to f32. 2953 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags); 2954 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags); 2955 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered, 2956 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 2957 } 2958 2959 assert(VT == MVT::f32); 2960 2961 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 2962 // library behavior. Also, is known-not-daz source sufficient? 2963 if (allowApproxFunc(DAG, Flags)) { 2964 return IsExp10 ? 
lowerFEXP10Unsafe(X, SL, DAG, Flags) 2965 : lowerFEXPUnsafe(X, SL, DAG, Flags); 2966 } 2967 2968 // Algorithm: 2969 // 2970 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 2971 // 2972 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 2973 // n = 64*m + j, 0 <= j < 64 2974 // 2975 // e^x = 2^((64*m + j + f)/64) 2976 // = (2^m) * (2^(j/64)) * 2^(f/64) 2977 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 2978 // 2979 // f = x*(64/ln(2)) - n 2980 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 2981 // 2982 // e^x = (2^m) * (2^(j/64)) * e^r 2983 // 2984 // (2^(j/64)) is precomputed 2985 // 2986 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 2987 // e^r = 1 + q 2988 // 2989 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 2990 // 2991 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 2992 SDNodeFlags FlagsNoContract = Flags; 2993 FlagsNoContract.setAllowContract(false); 2994 2995 SDValue PH, PL; 2996 if (Subtarget->hasFastFMAF32()) { 2997 const float c_exp = numbers::log2ef; 2998 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 2999 const float c_exp10 = 0x1.a934f0p+1f; 3000 const float cc_exp10 = 0x1.2f346ep-24f; 3001 3002 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT); 3003 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT); 3004 3005 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags); 3006 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags); 3007 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags); 3008 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags); 3009 } else { 3010 const float ch_exp = 0x1.714000p+0f; 3011 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3012 3013 const float ch_exp10 = 0x1.a92000p+1f; 3014 const float cl_exp10 = 0x1.4f0978p-11f; 3015 3016 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT); 3017 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT); 3018 3019 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X); 3020 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32); 3021 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst); 3022 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt); 3023 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags); 3024 3025 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags); 3026 3027 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags); 3028 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags); 3029 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags); 3030 } 3031 3032 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags); 3033 3034 // It is unsafe to contract this fsub into the PH multiply. 3035 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract); 3036 3037 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags); 3038 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E); 3039 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags); 3040 3041 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags); 3042 3043 SDValue UnderflowCheckConst = 3044 DAG.getConstantFP(IsExp10 ? 
-0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT); 3045 3046 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 3047 SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 3048 SDValue Underflow = 3049 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT); 3050 3051 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R); 3052 const auto &Options = getTargetMachine().Options; 3053 3054 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) { 3055 SDValue OverflowCheckConst = 3056 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT); 3057 SDValue Overflow = 3058 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT); 3059 SDValue Inf = 3060 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT); 3061 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R); 3062 } 3063 3064 return R; 3065 } 3066 3067 static bool isCtlzOpc(unsigned Opc) { 3068 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; 3069 } 3070 3071 static bool isCttzOpc(unsigned Opc) { 3072 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; 3073 } 3074 3075 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op, 3076 SelectionDAG &DAG) const { 3077 auto SL = SDLoc(Op); 3078 auto Arg = Op.getOperand(0u); 3079 auto ResultVT = Op.getValueType(); 3080 3081 if (ResultVT != MVT::i8 && ResultVT != MVT::i16) 3082 return {}; 3083 3084 assert(isCtlzOpc(Op.getOpcode())); 3085 assert(ResultVT == Arg.getValueType()); 3086 3087 auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits(); 3088 auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg); 3089 auto ShiftVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32); 3090 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, ShiftVal); 3091 NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp); 3092 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp); 3093 } 3094 3095 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { 3096 SDLoc SL(Op); 3097 SDValue Src = Op.getOperand(0); 3098 3099 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode())); 3100 bool Ctlz = isCtlzOpc(Op.getOpcode()); 3101 unsigned NewOpc = Ctlz ? 
AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3102
3103 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3104 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3105 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3106
3107 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3108 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3109 // (cttz hi:lo) -> (umin (ffbl src), 32)
3110 // (ctlz_zero_undef src) -> (ffbh src)
3111 // (cttz_zero_undef src) -> (ffbl src)
3112
3113 // The 64-bit scalar version produces a 32-bit result:
3114 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3115 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3116 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3117 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3118 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3119 if (!ZeroUndef) {
3120 const SDValue ConstVal = DAG.getConstant(
3121 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3122 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3123 }
3124 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3125 }
3126
3127 SDValue Lo, Hi;
3128 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3129
3130 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3131 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3132
3133 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3134 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3135 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3136 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3137
3138 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3139 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3140 if (Ctlz)
3141 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3142 else
3143 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3144
3145 SDValue NewOpr;
3146 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3147 if (!ZeroUndef) {
3148 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3149 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3150 }
3151
3152 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3153 }
3154
3155 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3156 bool Signed) const {
3157 // The regular method of converting a 64-bit integer to a float roughly consists of
3158 // 2 steps: normalization and rounding. In fact, after normalization, the
3159 // conversion from a 64-bit integer to a float is essentially the same as the
3160 // one from a 32-bit integer. The only difference is that it has more
3161 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3162 // 64-bit integer could be preprocessed to fit into a 32-bit integer and then
3163 // converted into the correct float number. The basic steps for the unsigned
3164 // conversion are illustrated in the following pseudo code:
3165 //
3166 // f32 uitofp(i64 u) {
3167 // i32 hi, lo = split(u);
3168 // // Only count the leading zeros in hi as we have native support for the
3169 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3170 // // reduced to a 32-bit one automatically.
3171 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3172 // u <<= shamt;
3173 // hi, lo = split(u);
3174 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3175 // // Convert it as a 32-bit integer and scale the result back.
3176 // return uitofp(hi) * 2^(32 - shamt);
3177 // }
3178 //
3179 // The signed conversion follows the same principle but uses 'ffbh_i32' to
3180 // count its sign bits instead. If 'ffbh_i32' is not available, the absolute
3181 // value is converted instead, followed by negation based on the original sign bit.
3182
3183 SDLoc SL(Op);
3184 SDValue Src = Op.getOperand(0);
3185
3186 SDValue Lo, Hi;
3187 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3188 SDValue Sign;
3189 SDValue ShAmt;
3190 if (Signed && Subtarget->isGCN()) {
3191 // We also need to consider the sign bit in Lo if Hi has only sign bits,
3192 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3193 // account. That is, the maximal shift is
3194 // - 32 if Lo and Hi have opposite signs;
3195 // - 33 if Lo and Hi have the same sign.
3196 //
3197 // Or, MaxShAmt = 33 + OppositeSign, where
3198 //
3199 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3200 // - -1 if Lo and Hi have opposite signs; and
3201 // - 0 otherwise.
3202 //
3203 // All in all, ShAmt is calculated as
3204 //
3205 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3206 //
3207 // or
3208 //
3209 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3210 //
3211 // The latter form is used to reduce the critical path.
3212 SDValue OppositeSign = DAG.getNode(
3213 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3214 DAG.getConstant(31, SL, MVT::i32));
3215 SDValue MaxShAmt =
3216 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3217 OppositeSign);
3218 // Count the leading sign bits.
3219 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3220 // Unlike the unsigned conversion, the shift should be one bit less to
3221 // preserve the sign bit.
3222 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3223 DAG.getConstant(1, SL, MVT::i32));
3224 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3225 } else {
3226 if (Signed) {
3227 // Without 'ffbh_i32', only leading zeros can be counted. Take the
3228 // absolute value first.
3229 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3230 DAG.getConstant(63, SL, MVT::i64));
3231 SDValue Abs =
3232 DAG.getNode(ISD::XOR, SL, MVT::i64,
3233 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3234 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3235 }
3236 // Count the leading zeros.
3237 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3238 // The shift amount for signed integers is [0, 32].
3239 }
3240 // Normalize the given 64-bit integer.
3241 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3242 // Split it again.
3243 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3244 // Calculate the adjust bit for rounding.
3245 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3246 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3247 DAG.getConstant(1, SL, MVT::i32), Lo);
3248 // Get the 32-bit normalized integer.
3249 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3250 // Convert the normalized 32-bit integer into f32.
3251 unsigned Opc =
3252 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3253 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3254
3255 // Finally, we need to scale back the converted floating-point number, as the
3256 // original 64-bit integer was converted as a 32-bit one.
3257 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3258 ShAmt);
3259 // On GCN, use LDEXP directly.
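// FLDEXP computes FVal * 2^ShAmt; since ShAmt now holds (32 - shamt), this is
// exactly the scale-back step from the pseudo code above.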
3260 if (Subtarget->isGCN()) 3261 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt); 3262 3263 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent 3264 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit 3265 // exponent is enough to avoid overflowing into the sign bit. 3266 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt, 3267 DAG.getConstant(23, SL, MVT::i32)); 3268 SDValue IVal = 3269 DAG.getNode(ISD::ADD, SL, MVT::i32, 3270 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp); 3271 if (Signed) { 3272 // Set the sign bit. 3273 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32, 3274 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign), 3275 DAG.getConstant(31, SL, MVT::i32)); 3276 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign); 3277 } 3278 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal); 3279 } 3280 3281 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, 3282 bool Signed) const { 3283 SDLoc SL(Op); 3284 SDValue Src = Op.getOperand(0); 3285 3286 SDValue Lo, Hi; 3287 std::tie(Lo, Hi) = split64BitValue(Src, DAG); 3288 3289 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, 3290 SL, MVT::f64, Hi); 3291 3292 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); 3293 3294 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi, 3295 DAG.getConstant(32, SL, MVT::i32)); 3296 // TODO: Should this propagate fast-math-flags? 3297 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); 3298 } 3299 3300 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, 3301 SelectionDAG &DAG) const { 3302 // TODO: Factor out code common with LowerSINT_TO_FP. 3303 EVT DestVT = Op.getValueType(); 3304 SDValue Src = Op.getOperand(0); 3305 EVT SrcVT = Src.getValueType(); 3306 3307 if (SrcVT == MVT::i16) { 3308 if (DestVT == MVT::f16) 3309 return Op; 3310 SDLoc DL(Op); 3311 3312 // Promote src to i32 3313 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); 3314 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext); 3315 } 3316 3317 if (DestVT == MVT::bf16) { 3318 SDLoc SL(Op); 3319 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src); 3320 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); 3321 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); 3322 } 3323 3324 if (SrcVT != MVT::i64) 3325 return Op; 3326 3327 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { 3328 SDLoc DL(Op); 3329 3330 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); 3331 SDValue FPRoundFlag = 3332 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true); 3333 SDValue FPRound = 3334 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); 3335 3336 return FPRound; 3337 } 3338 3339 if (DestVT == MVT::f32) 3340 return LowerINT_TO_FP32(Op, DAG, false); 3341 3342 assert(DestVT == MVT::f64); 3343 return LowerINT_TO_FP64(Op, DAG, false); 3344 } 3345 3346 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, 3347 SelectionDAG &DAG) const { 3348 EVT DestVT = Op.getValueType(); 3349 3350 SDValue Src = Op.getOperand(0); 3351 EVT SrcVT = Src.getValueType(); 3352 3353 if (SrcVT == MVT::i16) { 3354 if (DestVT == MVT::f16) 3355 return Op; 3356 3357 SDLoc DL(Op); 3358 // Promote src to i32 3359 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src); 3360 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext); 3361 } 3362 3363 if (DestVT == MVT::bf16) { 3364 SDLoc SL(Op); 3365 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, 
Src); 3366 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); 3367 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); 3368 } 3369 3370 if (SrcVT != MVT::i64) 3371 return Op; 3372 3373 // TODO: Factor out code common with LowerUINT_TO_FP. 3374 3375 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { 3376 SDLoc DL(Op); 3377 SDValue Src = Op.getOperand(0); 3378 3379 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); 3380 SDValue FPRoundFlag = 3381 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true); 3382 SDValue FPRound = 3383 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); 3384 3385 return FPRound; 3386 } 3387 3388 if (DestVT == MVT::f32) 3389 return LowerINT_TO_FP32(Op, DAG, true); 3390 3391 assert(DestVT == MVT::f64); 3392 return LowerINT_TO_FP64(Op, DAG, true); 3393 } 3394 3395 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, 3396 bool Signed) const { 3397 SDLoc SL(Op); 3398 3399 SDValue Src = Op.getOperand(0); 3400 EVT SrcVT = Src.getValueType(); 3401 3402 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64); 3403 3404 // The basic idea of converting a floating point number into a pair of 32-bit 3405 // integers is illustrated as follows: 3406 // 3407 // tf := trunc(val); 3408 // hif := floor(tf * 2^-32); 3409 // lof := tf - hif * 2^32; // lof is always positive due to floor. 3410 // hi := fptoi(hif); 3411 // lo := fptoi(lof); 3412 // 3413 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src); 3414 SDValue Sign; 3415 if (Signed && SrcVT == MVT::f32) { 3416 // However, a 32-bit floating point number has only 23 bits mantissa and 3417 // it's not enough to hold all the significant bits of `lof` if val is 3418 // negative. To avoid the loss of precision, We need to take the absolute 3419 // value after truncating and flip the result back based on the original 3420 // signedness. 3421 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32, 3422 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc), 3423 DAG.getConstant(31, SL, MVT::i32)); 3424 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc); 3425 } 3426 3427 SDValue K0, K1; 3428 if (SrcVT == MVT::f64) { 3429 K0 = DAG.getConstantFP( 3430 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL, 3431 SrcVT); 3432 K1 = DAG.getConstantFP( 3433 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL, 3434 SrcVT); 3435 } else { 3436 K0 = DAG.getConstantFP( 3437 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT); 3438 K1 = DAG.getConstantFP( 3439 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT); 3440 } 3441 // TODO: Should this propagate fast-math-flags? 3442 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0); 3443 3444 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul); 3445 3446 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc); 3447 3448 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT 3449 : ISD::FP_TO_UINT, 3450 SL, MVT::i32, FloorMul); 3451 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); 3452 3453 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, 3454 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi})); 3455 3456 if (Signed && SrcVT == MVT::f32) { 3457 assert(Sign); 3458 // Flip the result based on the signedness, which is either all 0s or 1s. 
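// Note that (r ^ s) - s leaves r unchanged when s == 0 and yields -r (two's
// complement negation) when s == -1.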
3459 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64, 3460 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign})); 3461 // r := xor(r, sign) - sign; 3462 Result = 3463 DAG.getNode(ISD::SUB, SL, MVT::i64, 3464 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign); 3465 } 3466 3467 return Result; 3468 } 3469 3470 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { 3471 SDLoc DL(Op); 3472 SDValue N0 = Op.getOperand(0); 3473 3474 // Convert to target node to get known bits 3475 if (N0.getValueType() == MVT::f32) 3476 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); 3477 3478 if (getTargetMachine().Options.UnsafeFPMath) { 3479 // There is a generic expand for FP_TO_FP16 with unsafe fast math. 3480 return SDValue(); 3481 } 3482 3483 assert(N0.getSimpleValueType() == MVT::f64); 3484 3485 // f64 -> f16 conversion using round-to-nearest-even rounding mode. 3486 const unsigned ExpMask = 0x7ff; 3487 const unsigned ExpBiasf64 = 1023; 3488 const unsigned ExpBiasf16 = 15; 3489 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 3490 SDValue One = DAG.getConstant(1, DL, MVT::i32); 3491 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0); 3492 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U, 3493 DAG.getConstant(32, DL, MVT::i64)); 3494 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32); 3495 U = DAG.getZExtOrTrunc(U, DL, MVT::i32); 3496 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3497 DAG.getConstant(20, DL, MVT::i64)); 3498 E = DAG.getNode(ISD::AND, DL, MVT::i32, E, 3499 DAG.getConstant(ExpMask, DL, MVT::i32)); 3500 // Subtract the fp64 exponent bias (1023) to get the real exponent and 3501 // add the f16 bias (15) to get the biased exponent for the f16 format. 3502 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E, 3503 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32)); 3504 3505 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3506 DAG.getConstant(8, DL, MVT::i32)); 3507 M = DAG.getNode(ISD::AND, DL, MVT::i32, M, 3508 DAG.getConstant(0xffe, DL, MVT::i32)); 3509 3510 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH, 3511 DAG.getConstant(0x1ff, DL, MVT::i32)); 3512 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U); 3513 3514 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ); 3515 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set); 3516 3517 // (M != 0 ? 
0x0200 : 0) | 0x7c00; 3518 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32, 3519 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32), 3520 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32)); 3521 3522 // N = M | (E << 12); 3523 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M, 3524 DAG.getNode(ISD::SHL, DL, MVT::i32, E, 3525 DAG.getConstant(12, DL, MVT::i32))); 3526 3527 // B = clamp(1-E, 0, 13); 3528 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, 3529 One, E); 3530 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero); 3531 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B, 3532 DAG.getConstant(13, DL, MVT::i32)); 3533 3534 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M, 3535 DAG.getConstant(0x1000, DL, MVT::i32)); 3536 3537 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B); 3538 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B); 3539 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE); 3540 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1); 3541 3542 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT); 3543 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V, 3544 DAG.getConstant(0x7, DL, MVT::i32)); 3545 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V, 3546 DAG.getConstant(2, DL, MVT::i32)); 3547 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32), 3548 One, Zero, ISD::SETEQ); 3549 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32), 3550 One, Zero, ISD::SETGT); 3551 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1); 3552 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1); 3553 3554 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32), 3555 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT); 3556 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32), 3557 I, V, ISD::SETEQ); 3558 3559 // Extract the sign bit. 3560 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3561 DAG.getConstant(16, DL, MVT::i32)); 3562 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign, 3563 DAG.getConstant(0x8000, DL, MVT::i32)); 3564 3565 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); 3566 return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); 3567 } 3568 3569 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op, 3570 SelectionDAG &DAG) const { 3571 SDValue Src = Op.getOperand(0); 3572 unsigned OpOpcode = Op.getOpcode(); 3573 EVT SrcVT = Src.getValueType(); 3574 EVT DestVT = Op.getValueType(); 3575 3576 // Will be selected natively 3577 if (SrcVT == MVT::f16 && DestVT == MVT::i16) 3578 return Op; 3579 3580 if (SrcVT == MVT::bf16) { 3581 SDLoc DL(Op); 3582 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); 3583 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc); 3584 } 3585 3586 // Promote i16 to i32 3587 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { 3588 SDLoc DL(Op); 3589 3590 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src); 3591 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32); 3592 } 3593 3594 if (DestVT != MVT::i64) 3595 return Op; 3596 3597 if (SrcVT == MVT::f16 || 3598 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) { 3599 SDLoc DL(Op); 3600 3601 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src); 3602 unsigned Ext = 3603 OpOpcode == ISD::FP_TO_SINT ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3604 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32); 3605 } 3606 3607 if (SrcVT == MVT::f32 || SrcVT == MVT::f64) 3608 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT); 3609 3610 return SDValue(); 3611 } 3612 3613 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 3614 SelectionDAG &DAG) const { 3615 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 3616 MVT VT = Op.getSimpleValueType(); 3617 MVT ScalarVT = VT.getScalarType(); 3618 3619 assert(VT.isVector()); 3620 3621 SDValue Src = Op.getOperand(0); 3622 SDLoc DL(Op); 3623 3624 // TODO: Don't scalarize on Evergreen? 3625 unsigned NElts = VT.getVectorNumElements(); 3626 SmallVector<SDValue, 8> Args; 3627 DAG.ExtractVectorElements(Src, Args, 0, NElts); 3628 3629 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); 3630 for (unsigned I = 0; I < NElts; ++I) 3631 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); 3632 3633 return DAG.getBuildVector(VT, DL, Args); 3634 } 3635 3636 //===----------------------------------------------------------------------===// 3637 // Custom DAG optimizations 3638 //===----------------------------------------------------------------------===// 3639 3640 static bool isU24(SDValue Op, SelectionDAG &DAG) { 3641 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24; 3642 } 3643 3644 static bool isI24(SDValue Op, SelectionDAG &DAG) { 3645 EVT VT = Op.getValueType(); 3646 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated 3647 // as unsigned 24-bit values. 3648 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24; 3649 } 3650 3651 static SDValue simplifyMul24(SDNode *Node24, 3652 TargetLowering::DAGCombinerInfo &DCI) { 3653 SelectionDAG &DAG = DCI.DAG; 3654 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 3655 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; 3656 3657 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); 3658 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); 3659 unsigned NewOpcode = Node24->getOpcode(); 3660 if (IsIntrin) { 3661 unsigned IID = Node24->getConstantOperandVal(0); 3662 switch (IID) { 3663 case Intrinsic::amdgcn_mul_i24: 3664 NewOpcode = AMDGPUISD::MUL_I24; 3665 break; 3666 case Intrinsic::amdgcn_mul_u24: 3667 NewOpcode = AMDGPUISD::MUL_U24; 3668 break; 3669 case Intrinsic::amdgcn_mulhi_i24: 3670 NewOpcode = AMDGPUISD::MULHI_I24; 3671 break; 3672 case Intrinsic::amdgcn_mulhi_u24: 3673 NewOpcode = AMDGPUISD::MULHI_U24; 3674 break; 3675 default: 3676 llvm_unreachable("Expected 24-bit mul intrinsic"); 3677 } 3678 } 3679 3680 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); 3681 3682 // First try to simplify using SimplifyMultipleUseDemandedBits which allows 3683 // the operands to have other uses, but will only perform simplifications that 3684 // involve bypassing some nodes for this user. 3685 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG); 3686 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG); 3687 if (DemandedLHS || DemandedRHS) 3688 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), 3689 DemandedLHS ? DemandedLHS : LHS, 3690 DemandedRHS ? DemandedRHS : RHS); 3691 3692 // Now try SimplifyDemandedBits which can simplify the nodes used by our 3693 // operands if this node is the only user. 
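// Returning the node itself tells the combiner its operands were updated in
// place and the node should be revisited.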
3694 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI)) 3695 return SDValue(Node24, 0); 3696 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI)) 3697 return SDValue(Node24, 0); 3698 3699 return SDValue(); 3700 } 3701 3702 template <typename IntTy> 3703 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, 3704 uint32_t Width, const SDLoc &DL) { 3705 if (Width + Offset < 32) { 3706 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); 3707 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); 3708 return DAG.getConstant(Result, DL, MVT::i32); 3709 } 3710 3711 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); 3712 } 3713 3714 static bool hasVolatileUser(SDNode *Val) { 3715 for (SDNode *U : Val->uses()) { 3716 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) { 3717 if (M->isVolatile()) 3718 return true; 3719 } 3720 } 3721 3722 return false; 3723 } 3724 3725 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { 3726 // i32 vectors are the canonical memory type. 3727 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT)) 3728 return false; 3729 3730 if (!VT.isByteSized()) 3731 return false; 3732 3733 unsigned Size = VT.getStoreSize(); 3734 3735 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector()) 3736 return false; 3737 3738 if (Size == 3 || (Size > 4 && (Size % 4 != 0))) 3739 return false; 3740 3741 return true; 3742 } 3743 3744 // Replace load of an illegal type with a store of a bitcast to a friendlier 3745 // type. 3746 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, 3747 DAGCombinerInfo &DCI) const { 3748 if (!DCI.isBeforeLegalize()) 3749 return SDValue(); 3750 3751 LoadSDNode *LN = cast<LoadSDNode>(N); 3752 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN)) 3753 return SDValue(); 3754 3755 SDLoc SL(N); 3756 SelectionDAG &DAG = DCI.DAG; 3757 EVT VT = LN->getMemoryVT(); 3758 3759 unsigned Size = VT.getStoreSize(); 3760 Align Alignment = LN->getAlign(); 3761 if (Alignment < Size && isTypeLegal(VT)) { 3762 unsigned IsFast; 3763 unsigned AS = LN->getAddressSpace(); 3764 3765 // Expand unaligned loads earlier than legalization. Due to visitation order 3766 // problems during legalization, the emitted instructions to pack and unpack 3767 // the bytes again are not eliminated in the case of an unaligned copy. 3768 if (!allowsMisalignedMemoryAccesses( 3769 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) { 3770 if (VT.isVector()) 3771 return SplitVectorLoad(SDValue(LN, 0), DAG); 3772 3773 SDValue Ops[2]; 3774 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); 3775 3776 return DAG.getMergeValues(Ops, SDLoc(N)); 3777 } 3778 3779 if (!IsFast) 3780 return SDValue(); 3781 } 3782 3783 if (!shouldCombineMemoryType(VT)) 3784 return SDValue(); 3785 3786 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 3787 3788 SDValue NewLoad 3789 = DAG.getLoad(NewVT, SL, LN->getChain(), 3790 LN->getBasePtr(), LN->getMemOperand()); 3791 3792 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad); 3793 DCI.CombineTo(N, BC, NewLoad.getValue(1)); 3794 return SDValue(N, 0); 3795 } 3796 3797 // Replace store of an illegal type with a store of a bitcast to a friendlier 3798 // type. 
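// As with performLoadCombine above, unaligned stores are expanded or split
// first; remaining illegal types are then bitcast to the equivalent i32-based
// memory type.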
3799 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, 3800 DAGCombinerInfo &DCI) const { 3801 if (!DCI.isBeforeLegalize()) 3802 return SDValue(); 3803 3804 StoreSDNode *SN = cast<StoreSDNode>(N); 3805 if (!SN->isSimple() || !ISD::isNormalStore(SN)) 3806 return SDValue(); 3807 3808 EVT VT = SN->getMemoryVT(); 3809 unsigned Size = VT.getStoreSize(); 3810 3811 SDLoc SL(N); 3812 SelectionDAG &DAG = DCI.DAG; 3813 Align Alignment = SN->getAlign(); 3814 if (Alignment < Size && isTypeLegal(VT)) { 3815 unsigned IsFast; 3816 unsigned AS = SN->getAddressSpace(); 3817 3818 // Expand unaligned stores earlier than legalization. Due to visitation 3819 // order problems during legalization, the emitted instructions to pack and 3820 // unpack the bytes again are not eliminated in the case of an unaligned 3821 // copy. 3822 if (!allowsMisalignedMemoryAccesses( 3823 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) { 3824 if (VT.isVector()) 3825 return SplitVectorStore(SDValue(SN, 0), DAG); 3826 3827 return expandUnalignedStore(SN, DAG); 3828 } 3829 3830 if (!IsFast) 3831 return SDValue(); 3832 } 3833 3834 if (!shouldCombineMemoryType(VT)) 3835 return SDValue(); 3836 3837 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 3838 SDValue Val = SN->getValue(); 3839 3840 //DCI.AddToWorklist(Val.getNode()); 3841 3842 bool OtherUses = !Val.hasOneUse(); 3843 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val); 3844 if (OtherUses) { 3845 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal); 3846 DAG.ReplaceAllUsesOfValueWith(Val, CastBack); 3847 } 3848 3849 return DAG.getStore(SN->getChain(), SL, CastVal, 3850 SN->getBasePtr(), SN->getMemOperand()); 3851 } 3852 3853 // FIXME: This should go in generic DAG combiner with an isTruncateFree check, 3854 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU 3855 // issues. 3856 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, 3857 DAGCombinerInfo &DCI) const { 3858 SelectionDAG &DAG = DCI.DAG; 3859 SDValue N0 = N->getOperand(0); 3860 3861 // (vt2 (assertzext (truncate vt0:x), vt1)) -> 3862 // (vt2 (truncate (assertzext vt0:x, vt1))) 3863 if (N0.getOpcode() == ISD::TRUNCATE) { 3864 SDValue N1 = N->getOperand(1); 3865 EVT ExtVT = cast<VTSDNode>(N1)->getVT(); 3866 SDLoc SL(N); 3867 3868 SDValue Src = N0.getOperand(0); 3869 EVT SrcVT = Src.getValueType(); 3870 if (SrcVT.bitsGE(ExtVT)) { 3871 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); 3872 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); 3873 } 3874 } 3875 3876 return SDValue(); 3877 } 3878 3879 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( 3880 SDNode *N, DAGCombinerInfo &DCI) const { 3881 unsigned IID = N->getConstantOperandVal(0); 3882 switch (IID) { 3883 case Intrinsic::amdgcn_mul_i24: 3884 case Intrinsic::amdgcn_mul_u24: 3885 case Intrinsic::amdgcn_mulhi_i24: 3886 case Intrinsic::amdgcn_mulhi_u24: 3887 return simplifyMul24(N, DCI); 3888 case Intrinsic::amdgcn_fract: 3889 case Intrinsic::amdgcn_rsq: 3890 case Intrinsic::amdgcn_rcp_legacy: 3891 case Intrinsic::amdgcn_rsq_legacy: 3892 case Intrinsic::amdgcn_rsq_clamp: { 3893 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted 3894 SDValue Src = N->getOperand(1); 3895 return Src.isUndef() ? 
Src : SDValue(); 3896 } 3897 case Intrinsic::amdgcn_frexp_exp: { 3898 // frexp_exp (fneg x) -> frexp_exp x 3899 // frexp_exp (fabs x) -> frexp_exp x 3900 // frexp_exp (fneg (fabs x)) -> frexp_exp x 3901 SDValue Src = N->getOperand(1); 3902 SDValue PeekSign = peekFPSignOps(Src); 3903 if (PeekSign == Src) 3904 return SDValue(); 3905 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign), 3906 0); 3907 } 3908 default: 3909 return SDValue(); 3910 } 3911 } 3912 3913 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the 3914 /// binary operation \p Opc to it with the corresponding constant operands. 3915 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( 3916 DAGCombinerInfo &DCI, const SDLoc &SL, 3917 unsigned Opc, SDValue LHS, 3918 uint32_t ValLo, uint32_t ValHi) const { 3919 SelectionDAG &DAG = DCI.DAG; 3920 SDValue Lo, Hi; 3921 std::tie(Lo, Hi) = split64BitValue(LHS, DAG); 3922 3923 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32); 3924 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32); 3925 3926 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS); 3927 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS); 3928 3929 // Re-visit the ands. It's possible we eliminated one of them and it could 3930 // simplify the vector. 3931 DCI.AddToWorklist(Lo.getNode()); 3932 DCI.AddToWorklist(Hi.getNode()); 3933 3934 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd}); 3935 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 3936 } 3937 3938 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, 3939 DAGCombinerInfo &DCI) const { 3940 EVT VT = N->getValueType(0); 3941 3942 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3943 if (!RHS) 3944 return SDValue(); 3945 3946 SDValue LHS = N->getOperand(0); 3947 unsigned RHSVal = RHS->getZExtValue(); 3948 if (!RHSVal) 3949 return LHS; 3950 3951 SDLoc SL(N); 3952 SelectionDAG &DAG = DCI.DAG; 3953 3954 switch (LHS->getOpcode()) { 3955 default: 3956 break; 3957 case ISD::ZERO_EXTEND: 3958 case ISD::SIGN_EXTEND: 3959 case ISD::ANY_EXTEND: { 3960 SDValue X = LHS->getOperand(0); 3961 3962 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 && 3963 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) { 3964 // Prefer build_vector as the canonical form if packed types are legal. 3965 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x 3966 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, 3967 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) }); 3968 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); 3969 } 3970 3971 // shl (ext x) => zext (shl x), if shift does not overflow int 3972 if (VT != MVT::i64) 3973 break; 3974 KnownBits Known = DAG.computeKnownBits(X); 3975 unsigned LZ = Known.countMinLeadingZeros(); 3976 if (LZ < RHSVal) 3977 break; 3978 EVT XVT = X.getValueType(); 3979 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); 3980 return DAG.getZExtOrTrunc(Shl, SL, VT); 3981 } 3982 } 3983 3984 if (VT != MVT::i64) 3985 return SDValue(); 3986 3987 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) 3988 3989 // On some subtargets, 64-bit shift is a quarter rate instruction. In the 3990 // common case, splitting this into a move and a 32-bit shift is faster and 3991 // the same code size. 
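// When the shift amount is 32 or more, the low 32 bits of the result are
// always zero, so only the low half of x needs to be shifted (by C - 32) and
// placed into the high half.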
3992 if (RHSVal < 32) 3993 return SDValue(); 3994 3995 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32); 3996 3997 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); 3998 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt); 3999 4000 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 4001 4002 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift}); 4003 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 4004 } 4005 4006 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, 4007 DAGCombinerInfo &DCI) const { 4008 if (N->getValueType(0) != MVT::i64) 4009 return SDValue(); 4010 4011 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 4012 if (!RHS) 4013 return SDValue(); 4014 4015 SelectionDAG &DAG = DCI.DAG; 4016 SDLoc SL(N); 4017 unsigned RHSVal = RHS->getZExtValue(); 4018 4019 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31) 4020 if (RHSVal == 32) { 4021 SDValue Hi = getHiHalf64(N->getOperand(0), DAG); 4022 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, 4023 DAG.getConstant(31, SL, MVT::i32)); 4024 4025 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift}); 4026 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); 4027 } 4028 4029 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31) 4030 if (RHSVal == 63) { 4031 SDValue Hi = getHiHalf64(N->getOperand(0), DAG); 4032 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, 4033 DAG.getConstant(31, SL, MVT::i32)); 4034 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift}); 4035 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); 4036 } 4037 4038 return SDValue(); 4039 } 4040 4041 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, 4042 DAGCombinerInfo &DCI) const { 4043 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 4044 if (!RHS) 4045 return SDValue(); 4046 4047 EVT VT = N->getValueType(0); 4048 SDValue LHS = N->getOperand(0); 4049 unsigned ShiftAmt = RHS->getZExtValue(); 4050 SelectionDAG &DAG = DCI.DAG; 4051 SDLoc SL(N); 4052 4053 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1) 4054 // this improves the ability to match BFE patterns in isel. 
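// The fold only applies when the mask is a shifted mask whose lowest set bit
// is at the shift amount, so the AND can be re-applied after the shift with
// the mask shifted down by the same amount.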
4055 if (LHS.getOpcode() == ISD::AND) { 4056 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) { 4057 unsigned MaskIdx, MaskLen; 4058 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) && 4059 MaskIdx == ShiftAmt) { 4060 return DAG.getNode( 4061 ISD::AND, SL, VT, 4062 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)), 4063 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1))); 4064 } 4065 } 4066 } 4067 4068 if (VT != MVT::i64) 4069 return SDValue(); 4070 4071 if (ShiftAmt < 32) 4072 return SDValue(); 4073 4074 // srl i64:x, C for C >= 32 4075 // => 4076 // build_pair (srl hi_32(x), C - 32), 0 4077 SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 4078 4079 SDValue Hi = getHiHalf64(LHS, DAG); 4080 4081 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32); 4082 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst); 4083 4084 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero}); 4085 4086 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); 4087 } 4088 4089 SDValue AMDGPUTargetLowering::performTruncateCombine( 4090 SDNode *N, DAGCombinerInfo &DCI) const { 4091 SDLoc SL(N); 4092 SelectionDAG &DAG = DCI.DAG; 4093 EVT VT = N->getValueType(0); 4094 SDValue Src = N->getOperand(0); 4095 4096 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x) 4097 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) { 4098 SDValue Vec = Src.getOperand(0); 4099 if (Vec.getOpcode() == ISD::BUILD_VECTOR) { 4100 SDValue Elt0 = Vec.getOperand(0); 4101 EVT EltVT = Elt0.getValueType(); 4102 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) { 4103 if (EltVT.isFloatingPoint()) { 4104 Elt0 = DAG.getNode(ISD::BITCAST, SL, 4105 EltVT.changeTypeToInteger(), Elt0); 4106 } 4107 4108 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0); 4109 } 4110 } 4111 } 4112 4113 // Equivalent of above for accessing the high element of a vector as an 4114 // integer operation. 4115 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y) 4116 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) { 4117 if (auto K = isConstOrConstSplat(Src.getOperand(1))) { 4118 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) { 4119 SDValue BV = stripBitcast(Src.getOperand(0)); 4120 if (BV.getOpcode() == ISD::BUILD_VECTOR && 4121 BV.getValueType().getVectorNumElements() == 2) { 4122 SDValue SrcElt = BV.getOperand(1); 4123 EVT SrcEltVT = SrcElt.getValueType(); 4124 if (SrcEltVT.isFloatingPoint()) { 4125 SrcElt = DAG.getNode(ISD::BITCAST, SL, 4126 SrcEltVT.changeTypeToInteger(), SrcElt); 4127 } 4128 4129 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); 4130 } 4131 } 4132 } 4133 } 4134 4135 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit. 4136 // 4137 // i16 (trunc (srl i64:x, K)), K <= 16 -> 4138 // i16 (trunc (srl (i32 (trunc x), K))) 4139 if (VT.getScalarSizeInBits() < 32) { 4140 EVT SrcVT = Src.getValueType(); 4141 if (SrcVT.getScalarSizeInBits() > 32 && 4142 (Src.getOpcode() == ISD::SRL || 4143 Src.getOpcode() == ISD::SRA || 4144 Src.getOpcode() == ISD::SHL)) { 4145 SDValue Amt = Src.getOperand(1); 4146 KnownBits Known = DAG.computeKnownBits(Amt); 4147 4148 // - For left shifts, do the transform as long as the shift 4149 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31) 4150 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid 4151 // losing information stored in the high bits when truncating. 
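// For example, with an i16 result a right-shift amount of up to 16 is safe,
// since every bit of the final value then still comes from the low 32 bits of
// the source.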
4152 const unsigned MaxCstSize = 4153 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits()); 4154 if (Known.getMaxValue().ule(MaxCstSize)) { 4155 EVT MidVT = VT.isVector() ? 4156 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4157 VT.getVectorNumElements()) : MVT::i32; 4158 4159 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout()); 4160 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT, 4161 Src.getOperand(0)); 4162 DCI.AddToWorklist(Trunc.getNode()); 4163 4164 if (Amt.getValueType() != NewShiftVT) { 4165 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT); 4166 DCI.AddToWorklist(Amt.getNode()); 4167 } 4168 4169 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT, 4170 Trunc, Amt); 4171 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift); 4172 } 4173 } 4174 } 4175 4176 return SDValue(); 4177 } 4178 4179 // We need to specifically handle i64 mul here to avoid unnecessary conversion 4180 // instructions. If we only match on the legalized i64 mul expansion, 4181 // SimplifyDemandedBits will be unable to remove them because there will be 4182 // multiple uses due to the separate mul + mulh[su]. 4183 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, 4184 SDValue N0, SDValue N1, unsigned Size, bool Signed) { 4185 if (Size <= 32) { 4186 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 4187 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1); 4188 } 4189 4190 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 4191 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24; 4192 4193 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1); 4194 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1); 4195 4196 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi); 4197 } 4198 4199 /// If \p V is an add of a constant 1, returns the other operand. Otherwise 4200 /// return SDValue(). 4201 static SDValue getAddOneOp(const SDNode *V) { 4202 if (V->getOpcode() != ISD::ADD) 4203 return SDValue(); 4204 4205 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue(); 4206 } 4207 4208 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, 4209 DAGCombinerInfo &DCI) const { 4210 EVT VT = N->getValueType(0); 4211 4212 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4213 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4214 // unnecessarily). isDivergent() is used as an approximation of whether the 4215 // value is in an SGPR. 4216 if (!N->isDivergent()) 4217 return SDValue(); 4218 4219 unsigned Size = VT.getSizeInBits(); 4220 if (VT.isVector() || Size > 64) 4221 return SDValue(); 4222 4223 SelectionDAG &DAG = DCI.DAG; 4224 SDLoc DL(N); 4225 4226 SDValue N0 = N->getOperand(0); 4227 SDValue N1 = N->getOperand(1); 4228 4229 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad 4230 // matching. 
4231 4232 // mul x, (add y, 1) -> add (mul x, y), x 4233 auto IsFoldableAdd = [](SDValue V) -> SDValue { 4234 SDValue AddOp = getAddOneOp(V.getNode()); 4235 if (!AddOp) 4236 return SDValue(); 4237 4238 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool { 4239 return U->getOpcode() == ISD::MUL; 4240 })) 4241 return AddOp; 4242 4243 return SDValue(); 4244 }; 4245 4246 // FIXME: The selection pattern is not properly checking for commuted 4247 // operands, so we have to place the mul in the LHS 4248 if (SDValue MulOper = IsFoldableAdd(N0)) { 4249 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper); 4250 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1); 4251 } 4252 4253 if (SDValue MulOper = IsFoldableAdd(N1)) { 4254 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper); 4255 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0); 4256 } 4257 4258 // Skip if already mul24. 4259 if (N->getOpcode() != ISD::MUL) 4260 return SDValue(); 4261 4262 // There are i16 integer mul/mad. 4263 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) 4264 return SDValue(); 4265 4266 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends 4267 // in the source into any_extends if the result of the mul is truncated. Since 4268 // we can assume the high bits are whatever we want, use the underlying value 4269 // to avoid the unknown high bits from interfering. 4270 if (N0.getOpcode() == ISD::ANY_EXTEND) 4271 N0 = N0.getOperand(0); 4272 4273 if (N1.getOpcode() == ISD::ANY_EXTEND) 4274 N1 = N1.getOperand(0); 4275 4276 SDValue Mul; 4277 4278 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 4279 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4280 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4281 Mul = getMul24(DAG, DL, N0, N1, Size, false); 4282 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 4283 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4284 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4285 Mul = getMul24(DAG, DL, N0, N1, Size, true); 4286 } else { 4287 return SDValue(); 4288 } 4289 4290 // We need to use sext even for MUL_U24, because MUL_U24 is used 4291 // for signed multiply of 8 and 16-bit types. 4292 return DAG.getSExtOrTrunc(Mul, DL, VT); 4293 } 4294 4295 SDValue 4296 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N, 4297 DAGCombinerInfo &DCI) const { 4298 if (N->getValueType(0) != MVT::i32) 4299 return SDValue(); 4300 4301 SelectionDAG &DAG = DCI.DAG; 4302 SDLoc DL(N); 4303 4304 SDValue N0 = N->getOperand(0); 4305 SDValue N1 = N->getOperand(1); 4306 4307 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends 4308 // in the source into any_extends if the result of the mul is truncated. Since 4309 // we can assume the high bits are whatever we want, use the underlying value 4310 // to avoid the unknown high bits from interfering. 4311 if (N0.getOpcode() == ISD::ANY_EXTEND) 4312 N0 = N0.getOperand(0); 4313 if (N1.getOpcode() == ISD::ANY_EXTEND) 4314 N1 = N1.getOperand(0); 4315 4316 // Try to use two fast 24-bit multiplies (one for each half of the result) 4317 // instead of one slow extending multiply. 
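// MUL_*24 yields the low 32 bits of the product and MULHI_*24 the high 32
// bits, so the pair covers the full widened product of two 24-bit inputs.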
4318 unsigned LoOpcode, HiOpcode; 4319 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 4320 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4321 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4322 LoOpcode = AMDGPUISD::MUL_U24; 4323 HiOpcode = AMDGPUISD::MULHI_U24; 4324 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 4325 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4326 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4327 LoOpcode = AMDGPUISD::MUL_I24; 4328 HiOpcode = AMDGPUISD::MULHI_I24; 4329 } else { 4330 return SDValue(); 4331 } 4332 4333 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1); 4334 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1); 4335 DCI.CombineTo(N, Lo, Hi); 4336 return SDValue(N, 0); 4337 } 4338 4339 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, 4340 DAGCombinerInfo &DCI) const { 4341 EVT VT = N->getValueType(0); 4342 4343 if (!Subtarget->hasMulI24() || VT.isVector()) 4344 return SDValue(); 4345 4346 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4347 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4348 // unnecessarily). isDivergent() is used as an approximation of whether the 4349 // value is in an SGPR. 4350 // This doesn't apply if no s_mul_hi is available (since we'll end up with a 4351 // valu op anyway) 4352 if (Subtarget->hasSMulHi() && !N->isDivergent()) 4353 return SDValue(); 4354 4355 SelectionDAG &DAG = DCI.DAG; 4356 SDLoc DL(N); 4357 4358 SDValue N0 = N->getOperand(0); 4359 SDValue N1 = N->getOperand(1); 4360 4361 if (!isI24(N0, DAG) || !isI24(N1, DAG)) 4362 return SDValue(); 4363 4364 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4365 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4366 4367 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1); 4368 DCI.AddToWorklist(Mulhi.getNode()); 4369 return DAG.getSExtOrTrunc(Mulhi, DL, VT); 4370 } 4371 4372 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, 4373 DAGCombinerInfo &DCI) const { 4374 EVT VT = N->getValueType(0); 4375 4376 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) 4377 return SDValue(); 4378 4379 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4380 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4381 // unnecessarily). isDivergent() is used as an approximation of whether the 4382 // value is in an SGPR. 
4383 // This doesn't apply if no s_mul_hi is available (since we'll end up with a 4384 // valu op anyway) 4385 if (Subtarget->hasSMulHi() && !N->isDivergent()) 4386 return SDValue(); 4387 4388 SelectionDAG &DAG = DCI.DAG; 4389 SDLoc DL(N); 4390 4391 SDValue N0 = N->getOperand(0); 4392 SDValue N1 = N->getOperand(1); 4393 4394 if (!isU24(N0, DAG) || !isU24(N1, DAG)) 4395 return SDValue(); 4396 4397 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4398 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4399 4400 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1); 4401 DCI.AddToWorklist(Mulhi.getNode()); 4402 return DAG.getZExtOrTrunc(Mulhi, DL, VT); 4403 } 4404 4405 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, 4406 SDValue Op, 4407 const SDLoc &DL, 4408 unsigned Opc) const { 4409 EVT VT = Op.getValueType(); 4410 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); 4411 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && 4412 LegalVT != MVT::i16)) 4413 return SDValue(); 4414 4415 if (VT != MVT::i32) 4416 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); 4417 4418 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op); 4419 if (VT != MVT::i32) 4420 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); 4421 4422 return FFBX; 4423 } 4424 4425 // The native instructions return -1 on 0 input. Optimize out a select that 4426 // produces -1 on 0. 4427 // 4428 // TODO: If zero is not undef, we could also do this if the output is compared 4429 // against the bitwidth. 4430 // 4431 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. 4432 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, 4433 SDValue LHS, SDValue RHS, 4434 DAGCombinerInfo &DCI) const { 4435 if (!isNullConstant(Cond.getOperand(1))) 4436 return SDValue(); 4437 4438 SelectionDAG &DAG = DCI.DAG; 4439 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 4440 SDValue CmpLHS = Cond.getOperand(0); 4441 4442 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x 4443 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x 4444 if (CCOpcode == ISD::SETEQ && 4445 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && 4446 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) { 4447 unsigned Opc = 4448 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; 4449 return getFFBX_U32(DAG, CmpLHS, SL, Opc); 4450 } 4451 4452 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x 4453 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x 4454 if (CCOpcode == ISD::SETNE && 4455 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) && 4456 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) { 4457 unsigned Opc = 4458 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; 4459 4460 return getFFBX_U32(DAG, CmpLHS, SL, Opc); 4461 } 4462 4463 return SDValue(); 4464 } 4465 4466 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, 4467 unsigned Op, 4468 const SDLoc &SL, 4469 SDValue Cond, 4470 SDValue N1, 4471 SDValue N2) { 4472 SelectionDAG &DAG = DCI.DAG; 4473 EVT VT = N1.getValueType(); 4474 4475 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, 4476 N1.getOperand(0), N2.getOperand(0)); 4477 DCI.AddToWorklist(NewSelect.getNode()); 4478 return DAG.getNode(Op, SL, VT, NewSelect); 4479 } 4480 4481 // Pull a free FP operation out of a select so it may fold into uses. 
4482 // 4483 // select c, (fneg x), (fneg y) -> fneg (select c, x, y) 4484 // select c, (fneg x), k -> fneg (select c, x, (fneg k)) 4485 // 4486 // select c, (fabs x), (fabs y) -> fabs (select c, x, y) 4487 // select c, (fabs x), +k -> fabs (select c, x, k) 4488 SDValue 4489 AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, 4490 SDValue N) const { 4491 SelectionDAG &DAG = DCI.DAG; 4492 SDValue Cond = N.getOperand(0); 4493 SDValue LHS = N.getOperand(1); 4494 SDValue RHS = N.getOperand(2); 4495 4496 EVT VT = N.getValueType(); 4497 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || 4498 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { 4499 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) 4500 return SDValue(); 4501 4502 return distributeOpThroughSelect(DCI, LHS.getOpcode(), 4503 SDLoc(N), Cond, LHS, RHS); 4504 } 4505 4506 bool Inv = false; 4507 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) { 4508 std::swap(LHS, RHS); 4509 Inv = true; 4510 } 4511 4512 // TODO: Support vector constants. 4513 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 4514 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS && 4515 !selectSupportsSourceMods(N.getNode())) { 4516 SDLoc SL(N); 4517 // If one side is an fneg/fabs and the other is a constant, we can push the 4518 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. 4519 SDValue NewLHS = LHS.getOperand(0); 4520 SDValue NewRHS = RHS; 4521 4522 // Careful: if the neg can be folded up, don't try to pull it back down. 4523 bool ShouldFoldNeg = true; 4524 4525 if (NewLHS.hasOneUse()) { 4526 unsigned Opc = NewLHS.getOpcode(); 4527 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode())) 4528 ShouldFoldNeg = false; 4529 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) 4530 ShouldFoldNeg = false; 4531 } 4532 4533 if (ShouldFoldNeg) { 4534 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative()) 4535 return SDValue(); 4536 4537 // We're going to be forced to use a source modifier anyway, there's no 4538 // point to pulling the negate out unless we can get a size reduction by 4539 // negating the constant. 4540 // 4541 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know 4542 // about cheaper constants. 
4543 if (NewLHS.getOpcode() == ISD::FABS &&
4544 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4545 return SDValue();
4546
4547 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4548 return SDValue();
4549
4550 if (LHS.getOpcode() == ISD::FNEG)
4551 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4552
4553 if (Inv)
4554 std::swap(NewLHS, NewRHS);
4555
4556 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4557 Cond, NewLHS, NewRHS);
4558 DCI.AddToWorklist(NewSelect.getNode());
4559 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4560 }
4561 }
4562
4563 return SDValue();
4564 }
4565
4566 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4567 DAGCombinerInfo &DCI) const {
4568 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4569 return Folded;
4570
4571 SDValue Cond = N->getOperand(0);
4572 if (Cond.getOpcode() != ISD::SETCC)
4573 return SDValue();
4574
4575 EVT VT = N->getValueType(0);
4576 SDValue LHS = Cond.getOperand(0);
4577 SDValue RHS = Cond.getOperand(1);
4578 SDValue CC = Cond.getOperand(2);
4579
4580 SDValue True = N->getOperand(1);
4581 SDValue False = N->getOperand(2);
4582
4583 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4584 SelectionDAG &DAG = DCI.DAG;
4585 if (DAG.isConstantValueOfAnyType(True) &&
4586 !DAG.isConstantValueOfAnyType(False)) {
4587 // Swap cmp + select pair to move constant to false input.
4588 // This will allow using VOPC cndmasks more often.
4589 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4590
4591 SDLoc SL(N);
4592 ISD::CondCode NewCC =
4593 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4594
4595 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4596 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4597 }
4598
4599 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4600 SDValue MinMax
4601 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4602 // Revisit this node so we can catch min3/max3/med3 patterns.
4603 //DCI.AddToWorklist(MinMax.getNode());
4604 return MinMax;
4605 }
4606 }
4607
4608 // There's no reason not to do this if the condition has other uses.
4609 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4610 }
4611
4612 static bool isInv2Pi(const APFloat &APF) {
4613 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4614 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4615 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4616
4617 return APF.bitwiseIsEqual(KF16) ||
4618 APF.bitwiseIsEqual(KF32) ||
4619 APF.bitwiseIsEqual(KF64);
4620 }
4621
4622 // 0 and 1.0 / (2.0 * pi) do not have inline immediates, so there is an
4623 // additional cost to negate them.
4624 TargetLowering::NegatibleCost
4625 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4626 if (C->isZero())
4627 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4628
4629 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4630 return C->isNegative() ?
NegatibleCost::Cheaper : NegatibleCost::Expensive; 4631 4632 return NegatibleCost::Neutral; 4633 } 4634 4635 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const { 4636 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) 4637 return getConstantNegateCost(C) == NegatibleCost::Expensive; 4638 return false; 4639 } 4640 4641 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const { 4642 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) 4643 return getConstantNegateCost(C) == NegatibleCost::Cheaper; 4644 return false; 4645 } 4646 4647 static unsigned inverseMinMax(unsigned Opc) { 4648 switch (Opc) { 4649 case ISD::FMAXNUM: 4650 return ISD::FMINNUM; 4651 case ISD::FMINNUM: 4652 return ISD::FMAXNUM; 4653 case ISD::FMAXNUM_IEEE: 4654 return ISD::FMINNUM_IEEE; 4655 case ISD::FMINNUM_IEEE: 4656 return ISD::FMAXNUM_IEEE; 4657 case ISD::FMAXIMUM: 4658 return ISD::FMINIMUM; 4659 case ISD::FMINIMUM: 4660 return ISD::FMAXIMUM; 4661 case AMDGPUISD::FMAX_LEGACY: 4662 return AMDGPUISD::FMIN_LEGACY; 4663 case AMDGPUISD::FMIN_LEGACY: 4664 return AMDGPUISD::FMAX_LEGACY; 4665 default: 4666 llvm_unreachable("invalid min/max opcode"); 4667 } 4668 } 4669 4670 /// \return true if it's profitable to try to push an fneg into its source 4671 /// instruction. 4672 bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) { 4673 // If the input has multiple uses and we can either fold the negate down, or 4674 // the other uses cannot, give up. This both prevents unprofitable 4675 // transformations and infinite loops: we won't repeatedly try to fold around 4676 // a negate that has no 'good' form. 4677 if (N0.hasOneUse()) { 4678 // This may be able to fold into the source, but at a code size cost. Don't 4679 // fold if the fold into the user is free. 4680 if (allUsesHaveSourceMods(N, 0)) 4681 return false; 4682 } else { 4683 if (fnegFoldsIntoOp(N0.getNode()) && 4684 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode()))) 4685 return false; 4686 } 4687 4688 return true; 4689 } 4690 4691 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, 4692 DAGCombinerInfo &DCI) const { 4693 SelectionDAG &DAG = DCI.DAG; 4694 SDValue N0 = N->getOperand(0); 4695 EVT VT = N->getValueType(0); 4696 4697 unsigned Opc = N0.getOpcode(); 4698 4699 if (!shouldFoldFNegIntoSrc(N, N0)) 4700 return SDValue(); 4701 4702 SDLoc SL(N); 4703 switch (Opc) { 4704 case ISD::FADD: { 4705 if (!mayIgnoreSignedZero(N0)) 4706 return SDValue(); 4707 4708 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) 4709 SDValue LHS = N0.getOperand(0); 4710 SDValue RHS = N0.getOperand(1); 4711 4712 if (LHS.getOpcode() != ISD::FNEG) 4713 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 4714 else 4715 LHS = LHS.getOperand(0); 4716 4717 if (RHS.getOpcode() != ISD::FNEG) 4718 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 4719 else 4720 RHS = RHS.getOperand(0); 4721 4722 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); 4723 if (Res.getOpcode() != ISD::FADD) 4724 return SDValue(); // Op got folded away. 
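// If the original fadd has other users, rewrite them below in terms of the
// new node: fneg (fadd (fneg x), (fneg y)) yields the original sum again, so
// those users keep seeing the same value.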
4725 if (!N0.hasOneUse()) 4726 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 4727 return Res; 4728 } 4729 case ISD::FMUL: 4730 case AMDGPUISD::FMUL_LEGACY: { 4731 // (fneg (fmul x, y)) -> (fmul x, (fneg y)) 4732 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) 4733 SDValue LHS = N0.getOperand(0); 4734 SDValue RHS = N0.getOperand(1); 4735 4736 if (LHS.getOpcode() == ISD::FNEG) 4737 LHS = LHS.getOperand(0); 4738 else if (RHS.getOpcode() == ISD::FNEG) 4739 RHS = RHS.getOperand(0); 4740 else 4741 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 4742 4743 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); 4744 if (Res.getOpcode() != Opc) 4745 return SDValue(); // Op got folded away. 4746 if (!N0.hasOneUse()) 4747 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 4748 return Res; 4749 } 4750 case ISD::FMA: 4751 case ISD::FMAD: { 4752 // TODO: handle llvm.amdgcn.fma.legacy 4753 if (!mayIgnoreSignedZero(N0)) 4754 return SDValue(); 4755 4756 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) 4757 SDValue LHS = N0.getOperand(0); 4758 SDValue MHS = N0.getOperand(1); 4759 SDValue RHS = N0.getOperand(2); 4760 4761 if (LHS.getOpcode() == ISD::FNEG) 4762 LHS = LHS.getOperand(0); 4763 else if (MHS.getOpcode() == ISD::FNEG) 4764 MHS = MHS.getOperand(0); 4765 else 4766 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); 4767 4768 if (RHS.getOpcode() != ISD::FNEG) 4769 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 4770 else 4771 RHS = RHS.getOperand(0); 4772 4773 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); 4774 if (Res.getOpcode() != Opc) 4775 return SDValue(); // Op got folded away. 4776 if (!N0.hasOneUse()) 4777 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 4778 return Res; 4779 } 4780 case ISD::FMAXNUM: 4781 case ISD::FMINNUM: 4782 case ISD::FMAXNUM_IEEE: 4783 case ISD::FMINNUM_IEEE: 4784 case ISD::FMINIMUM: 4785 case ISD::FMAXIMUM: 4786 case AMDGPUISD::FMAX_LEGACY: 4787 case AMDGPUISD::FMIN_LEGACY: { 4788 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) 4789 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y) 4790 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y) 4791 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y) 4792 4793 SDValue LHS = N0.getOperand(0); 4794 SDValue RHS = N0.getOperand(1); 4795 4796 // 0 doesn't have a negated inline immediate. 4797 // TODO: This constant check should be generalized to other operations. 4798 if (isConstantCostlierToNegate(RHS)) 4799 return SDValue(); 4800 4801 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 4802 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 4803 unsigned Opposite = inverseMinMax(Opc); 4804 4805 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); 4806 if (Res.getOpcode() != Opposite) 4807 return SDValue(); // Op got folded away. 4808 if (!N0.hasOneUse()) 4809 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 4810 return Res; 4811 } 4812 case AMDGPUISD::FMED3: { 4813 SDValue Ops[3]; 4814 for (unsigned I = 0; I < 3; ++I) 4815 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags()); 4816 4817 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags()); 4818 if (Res.getOpcode() != AMDGPUISD::FMED3) 4819 return SDValue(); // Op got folded away. 
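// Same multi-use handling as in the cases above, except that the users of the
// replacement fneg are also queued so the combiner revisits them.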
4820
4821 if (!N0.hasOneUse()) {
4822 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4823 DAG.ReplaceAllUsesWith(N0, Neg);
4824
4825 for (SDNode *U : Neg->uses())
4826 DCI.AddToWorklist(U);
4827 }
4828
4829 return Res;
4830 }
4831 case ISD::FP_EXTEND:
4832 case ISD::FTRUNC:
4833 case ISD::FRINT:
4834 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4835 case ISD::FROUNDEVEN:
4836 case ISD::FSIN:
4837 case ISD::FCANONICALIZE:
4838 case AMDGPUISD::RCP:
4839 case AMDGPUISD::RCP_LEGACY:
4840 case AMDGPUISD::RCP_IFLAG:
4841 case AMDGPUISD::SIN_HW: {
4842 SDValue CvtSrc = N0.getOperand(0);
4843 if (CvtSrc.getOpcode() == ISD::FNEG) {
4844 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4845 // (fneg (rcp (fneg x))) -> (rcp x)
4846 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4847 }
4848
4849 if (!N0.hasOneUse())
4850 return SDValue();
4851
4852 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4853 // (fneg (rcp x)) -> (rcp (fneg x))
4854 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4855 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4856 }
4857 case ISD::FP_ROUND: {
4858 SDValue CvtSrc = N0.getOperand(0);
4859
4860 if (CvtSrc.getOpcode() == ISD::FNEG) {
4861 // (fneg (fp_round (fneg x))) -> (fp_round x)
4862 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4863 CvtSrc.getOperand(0), N0.getOperand(1));
4864 }
4865
4866 if (!N0.hasOneUse())
4867 return SDValue();
4868
4869 // (fneg (fp_round x)) -> (fp_round (fneg x))
4870 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4871 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4872 }
4873 case ISD::FP16_TO_FP: {
4874 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4875 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4876 // Put the fneg back as a legal source operation that can be matched later.
4877 SDLoc SL(N);
4878
4879 SDValue Src = N0.getOperand(0);
4880 EVT SrcVT = Src.getValueType();
4881
4882 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4883 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4884 DAG.getConstant(0x8000, SL, SrcVT));
4885 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4886 }
4887 case ISD::SELECT: {
4888 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4889 // TODO: Invert conditions of foldFreeOpFromSelect
4890 return SDValue();
4891 }
4892 case ISD::BITCAST: {
4893 SDLoc SL(N);
4894 SDValue BCSrc = N0.getOperand(0);
4895 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4896 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4897 if (HighBits.getValueType().getSizeInBits() != 32 ||
4898 !fnegFoldsIntoOp(HighBits.getNode()))
4899 return SDValue();
4900
4901 // f64 fneg only really needs to operate on the high half of the
4902 // register, so try to force it to an f32 operation to help make use of
4903 // source modifiers.
4904 // 4905 // 4906 // fneg (f64 (bitcast (build_vector x, y))) -> 4907 // f64 (bitcast (build_vector (bitcast i32:x to f32), 4908 // (fneg (bitcast i32:y to f32))) 4909 4910 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits); 4911 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi); 4912 SDValue CastBack = 4913 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi); 4914 4915 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end()); 4916 Ops.back() = CastBack; 4917 DCI.AddToWorklist(NegHi.getNode()); 4918 SDValue Build = 4919 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops); 4920 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build); 4921 4922 if (!N0.hasOneUse()) 4923 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result)); 4924 return Result; 4925 } 4926 4927 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 && 4928 BCSrc.hasOneUse()) { 4929 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) -> 4930 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32) 4931 4932 // TODO: Cast back result for multiple uses is beneficial in some cases. 4933 4934 SDValue LHS = 4935 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1)); 4936 SDValue RHS = 4937 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2)); 4938 4939 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS); 4940 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS); 4941 4942 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS, 4943 NegRHS); 4944 } 4945 4946 return SDValue(); 4947 } 4948 default: 4949 return SDValue(); 4950 } 4951 } 4952 4953 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, 4954 DAGCombinerInfo &DCI) const { 4955 SelectionDAG &DAG = DCI.DAG; 4956 SDValue N0 = N->getOperand(0); 4957 4958 if (!N0.hasOneUse()) 4959 return SDValue(); 4960 4961 switch (N0.getOpcode()) { 4962 case ISD::FP16_TO_FP: { 4963 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); 4964 SDLoc SL(N); 4965 SDValue Src = N0.getOperand(0); 4966 EVT SrcVT = Src.getValueType(); 4967 4968 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff) 4969 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src, 4970 DAG.getConstant(0x7fff, SL, SrcVT)); 4971 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs); 4972 } 4973 default: 4974 return SDValue(); 4975 } 4976 } 4977 4978 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, 4979 DAGCombinerInfo &DCI) const { 4980 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 4981 if (!CFP) 4982 return SDValue(); 4983 4984 // XXX - Should this flush denormals? 4985 const APFloat &Val = CFP->getValueAPF(); 4986 APFloat One(Val.getSemantics(), "1.0"); 4987 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); 4988 } 4989 4990 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, 4991 DAGCombinerInfo &DCI) const { 4992 SelectionDAG &DAG = DCI.DAG; 4993 SDLoc DL(N); 4994 4995 switch(N->getOpcode()) { 4996 default: 4997 break; 4998 case ISD::BITCAST: { 4999 EVT DestVT = N->getValueType(0); 5000 5001 // Push casts through vector builds. This helps avoid emitting a large 5002 // number of copies when materializing floating point vector constants. 
5003 // 5004 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) => 5005 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) 5006 if (DestVT.isVector()) { 5007 SDValue Src = N->getOperand(0); 5008 if (Src.getOpcode() == ISD::BUILD_VECTOR && 5009 (DCI.getDAGCombineLevel() < AfterLegalizeDAG || 5010 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) { 5011 EVT SrcVT = Src.getValueType(); 5012 unsigned NElts = DestVT.getVectorNumElements(); 5013 5014 if (SrcVT.getVectorNumElements() == NElts) { 5015 EVT DestEltVT = DestVT.getVectorElementType(); 5016 5017 SmallVector<SDValue, 8> CastedElts; 5018 SDLoc SL(N); 5019 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) { 5020 SDValue Elt = Src.getOperand(I); 5021 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt)); 5022 } 5023 5024 return DAG.getBuildVector(DestVT, SL, CastedElts); 5025 } 5026 } 5027 } 5028 5029 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector()) 5030 break; 5031 5032 // Fold bitcasts of constants. 5033 // 5034 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k) 5035 // TODO: Generalize and move to DAGCombiner 5036 SDValue Src = N->getOperand(0); 5037 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) { 5038 SDLoc SL(N); 5039 uint64_t CVal = C->getZExtValue(); 5040 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 5041 DAG.getConstant(Lo_32(CVal), SL, MVT::i32), 5042 DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); 5043 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV); 5044 } 5045 5046 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) { 5047 const APInt &Val = C->getValueAPF().bitcastToAPInt(); 5048 SDLoc SL(N); 5049 uint64_t CVal = Val.getZExtValue(); 5050 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 5051 DAG.getConstant(Lo_32(CVal), SL, MVT::i32), 5052 DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); 5053 5054 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec); 5055 } 5056 5057 break; 5058 } 5059 case ISD::SHL: { 5060 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 5061 break; 5062 5063 return performShlCombine(N, DCI); 5064 } 5065 case ISD::SRL: { 5066 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 5067 break; 5068 5069 return performSrlCombine(N, DCI); 5070 } 5071 case ISD::SRA: { 5072 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 5073 break; 5074 5075 return performSraCombine(N, DCI); 5076 } 5077 case ISD::TRUNCATE: 5078 return performTruncateCombine(N, DCI); 5079 case ISD::MUL: 5080 return performMulCombine(N, DCI); 5081 case AMDGPUISD::MUL_U24: 5082 case AMDGPUISD::MUL_I24: { 5083 if (SDValue Simplified = simplifyMul24(N, DCI)) 5084 return Simplified; 5085 return performMulCombine(N, DCI); 5086 } 5087 case AMDGPUISD::MULHI_I24: 5088 case AMDGPUISD::MULHI_U24: 5089 return simplifyMul24(N, DCI); 5090 case ISD::SMUL_LOHI: 5091 case ISD::UMUL_LOHI: 5092 return performMulLoHiCombine(N, DCI); 5093 case ISD::MULHS: 5094 return performMulhsCombine(N, DCI); 5095 case ISD::MULHU: 5096 return performMulhuCombine(N, DCI); 5097 case ISD::SELECT: 5098 return performSelectCombine(N, DCI); 5099 case ISD::FNEG: 5100 return performFNegCombine(N, DCI); 5101 case ISD::FABS: 5102 return performFAbsCombine(N, DCI); 5103 case AMDGPUISD::BFE_I32: 5104 case AMDGPUISD::BFE_U32: { 5105 assert(!N->getValueType(0).isVector() && 5106 "Vector handling of BFE not implemented"); 5107 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); 5108 if (!Width) 5109 break; 5110 5111 uint32_t WidthVal = Width->getZExtValue() & 0x1f; 5112 if (WidthVal == 0) 5113 return 
DAG.getConstant(0, DL, MVT::i32);
5114
5115 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5116 if (!Offset)
5117 break;
5118
5119 SDValue BitsFrom = N->getOperand(0);
5120 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5121
5122 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5123
5124 if (OffsetVal == 0) {
5125 // This is already sign / zero extended, so try to fold away extra BFEs.
5126 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5127
5128 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5129 if (OpSignBits >= SignBits)
5130 return BitsFrom;
5131
5132 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5133 if (Signed) {
5134 // This is a sign_extend_inreg. Replace it to take advantage of existing
5135 // DAG Combines. If not eliminated, we will match back to BFE during
5136 // selection.
5137
5138 // TODO: The sext_inreg of extended types ends up split into multiple
5139 // operations, although we could handle them in a single BFE.
5140 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5141 DAG.getValueType(SmallVT));
5142 }
5143
5144 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5145 }
5146
5147 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5148 if (Signed) {
5149 return constantFoldBFE<int32_t>(DAG,
5150 CVal->getSExtValue(),
5151 OffsetVal,
5152 WidthVal,
5153 DL);
5154 }
5155
5156 return constantFoldBFE<uint32_t>(DAG,
5157 CVal->getZExtValue(),
5158 OffsetVal,
5159 WidthVal,
5160 DL);
5161 }
5162
5163 if ((OffsetVal + WidthVal) >= 32 &&
5164 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5165 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5166 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5167 BitsFrom, ShiftVal);
5168 }
5169
5170 if (BitsFrom.hasOneUse()) {
5171 APInt Demanded = APInt::getBitsSet(32,
5172 OffsetVal,
5173 OffsetVal + WidthVal);
5174
5175 KnownBits Known;
5176 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5177 !DCI.isBeforeLegalizeOps());
5178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5179 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5180 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5181 DCI.CommitTargetLoweringOpt(TLO);
5182 }
5183 }
5184
5185 break;
5186 }
5187 case ISD::LOAD:
5188 return performLoadCombine(N, DCI);
5189 case ISD::STORE:
5190 return performStoreCombine(N, DCI);
5191 case AMDGPUISD::RCP:
5192 case AMDGPUISD::RCP_IFLAG:
5193 return performRcpCombine(N, DCI);
5194 case ISD::AssertZext:
5195 case ISD::AssertSext:
5196 return performAssertSZExtCombine(N, DCI);
5197 case ISD::INTRINSIC_WO_CHAIN:
5198 return performIntrinsicWOChainCombine(N, DCI);
5199 case AMDGPUISD::FMAD_FTZ: {
5200 SDValue N0 = N->getOperand(0);
5201 SDValue N1 = N->getOperand(1);
5202 SDValue N2 = N->getOperand(2);
5203 EVT VT = N->getValueType(0);
5204
5205 // FMAD_FTZ is an FMAD + flush denormals to zero.
5206 // We flush the inputs, the intermediate step, and the output.
5207 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5208 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5209 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5210 if (N0CFP && N1CFP && N2CFP) {
5211 const auto FTZ = [](const APFloat &V) {
5212 if (V.isDenormal()) {
5213 APFloat Zero(V.getSemantics(), 0);
5214 return V.isNegative() ?
-Zero : Zero; 5215 } 5216 return V; 5217 }; 5218 5219 APFloat V0 = FTZ(N0CFP->getValueAPF()); 5220 APFloat V1 = FTZ(N1CFP->getValueAPF()); 5221 APFloat V2 = FTZ(N2CFP->getValueAPF()); 5222 V0.multiply(V1, APFloat::rmNearestTiesToEven); 5223 V0 = FTZ(V0); 5224 V0.add(V2, APFloat::rmNearestTiesToEven); 5225 return DAG.getConstantFP(FTZ(V0), DL, VT); 5226 } 5227 break; 5228 } 5229 } 5230 return SDValue(); 5231 } 5232 5233 //===----------------------------------------------------------------------===// 5234 // Helper functions 5235 //===----------------------------------------------------------------------===// 5236 5237 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 5238 const TargetRegisterClass *RC, 5239 Register Reg, EVT VT, 5240 const SDLoc &SL, 5241 bool RawReg) const { 5242 MachineFunction &MF = DAG.getMachineFunction(); 5243 MachineRegisterInfo &MRI = MF.getRegInfo(); 5244 Register VReg; 5245 5246 if (!MRI.isLiveIn(Reg)) { 5247 VReg = MRI.createVirtualRegister(RC); 5248 MRI.addLiveIn(Reg, VReg); 5249 } else { 5250 VReg = MRI.getLiveInVirtReg(Reg); 5251 } 5252 5253 if (RawReg) 5254 return DAG.getRegister(VReg, VT); 5255 5256 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); 5257 } 5258 5259 // This may be called multiple times, and nothing prevents creating multiple 5260 // objects at the same offset. See if we already defined this object. 5261 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, 5262 int64_t Offset) { 5263 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { 5264 if (MFI.getObjectOffset(I) == Offset) { 5265 assert(MFI.getObjectSize(I) == Size); 5266 return I; 5267 } 5268 } 5269 5270 return MFI.CreateFixedObject(Size, Offset, true); 5271 } 5272 5273 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, 5274 EVT VT, 5275 const SDLoc &SL, 5276 int64_t Offset) const { 5277 MachineFunction &MF = DAG.getMachineFunction(); 5278 MachineFrameInfo &MFI = MF.getFrameInfo(); 5279 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset); 5280 5281 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); 5282 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); 5283 5284 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4), 5285 MachineMemOperand::MODereferenceable | 5286 MachineMemOperand::MOInvariant); 5287 } 5288 5289 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, 5290 const SDLoc &SL, 5291 SDValue Chain, 5292 SDValue ArgVal, 5293 int64_t Offset) const { 5294 MachineFunction &MF = DAG.getMachineFunction(); 5295 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); 5296 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 5297 5298 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); 5299 // Stores to the argument stack area are relative to the stack pointer. 5300 SDValue SP = 5301 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32); 5302 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr); 5303 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4), 5304 MachineMemOperand::MODereferenceable); 5305 return Store; 5306 } 5307 5308 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, 5309 const TargetRegisterClass *RC, 5310 EVT VT, const SDLoc &SL, 5311 const ArgDescriptor &Arg) const { 5312 assert(Arg && "Attempting to load missing argument"); 5313 5314 SDValue V = Arg.isRegister() ? 
5315 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) : 5316 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); 5317 5318 if (!Arg.isMasked()) 5319 return V; 5320 5321 unsigned Mask = Arg.getMask(); 5322 unsigned Shift = llvm::countr_zero<unsigned>(Mask); 5323 V = DAG.getNode(ISD::SRL, SL, VT, V, 5324 DAG.getShiftAmountConstant(Shift, VT, SL)); 5325 return DAG.getNode(ISD::AND, SL, VT, V, 5326 DAG.getConstant(Mask >> Shift, SL, VT)); 5327 } 5328 5329 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( 5330 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const { 5331 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset(); 5332 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr(); 5333 uint64_t ArgOffset = 5334 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset; 5335 switch (Param) { 5336 case FIRST_IMPLICIT: 5337 return ArgOffset; 5338 case PRIVATE_BASE: 5339 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET; 5340 case SHARED_BASE: 5341 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET; 5342 case QUEUE_PTR: 5343 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET; 5344 } 5345 llvm_unreachable("unexpected implicit parameter type"); 5346 } 5347 5348 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( 5349 const MachineFunction &MF, const ImplicitParameter Param) const { 5350 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); 5351 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param); 5352 } 5353 5354 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; 5355 5356 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { 5357 switch ((AMDGPUISD::NodeType)Opcode) { 5358 case AMDGPUISD::FIRST_NUMBER: break; 5359 // AMDIL DAG nodes 5360 NODE_NAME_CASE(UMUL); 5361 NODE_NAME_CASE(BRANCH_COND); 5362 5363 // AMDGPU DAG nodes 5364 NODE_NAME_CASE(IF) 5365 NODE_NAME_CASE(ELSE) 5366 NODE_NAME_CASE(LOOP) 5367 NODE_NAME_CASE(CALL) 5368 NODE_NAME_CASE(TC_RETURN) 5369 NODE_NAME_CASE(TC_RETURN_GFX) 5370 NODE_NAME_CASE(TC_RETURN_CHAIN) 5371 NODE_NAME_CASE(TRAP) 5372 NODE_NAME_CASE(RET_GLUE) 5373 NODE_NAME_CASE(WAVE_ADDRESS) 5374 NODE_NAME_CASE(RETURN_TO_EPILOG) 5375 NODE_NAME_CASE(ENDPGM) 5376 NODE_NAME_CASE(ENDPGM_TRAP) 5377 NODE_NAME_CASE(DWORDADDR) 5378 NODE_NAME_CASE(FRACT) 5379 NODE_NAME_CASE(SETCC) 5380 NODE_NAME_CASE(SETREG) 5381 NODE_NAME_CASE(DENORM_MODE) 5382 NODE_NAME_CASE(FMA_W_CHAIN) 5383 NODE_NAME_CASE(FMUL_W_CHAIN) 5384 NODE_NAME_CASE(CLAMP) 5385 NODE_NAME_CASE(COS_HW) 5386 NODE_NAME_CASE(SIN_HW) 5387 NODE_NAME_CASE(FMAX_LEGACY) 5388 NODE_NAME_CASE(FMIN_LEGACY) 5389 NODE_NAME_CASE(FMAX3) 5390 NODE_NAME_CASE(SMAX3) 5391 NODE_NAME_CASE(UMAX3) 5392 NODE_NAME_CASE(FMIN3) 5393 NODE_NAME_CASE(SMIN3) 5394 NODE_NAME_CASE(UMIN3) 5395 NODE_NAME_CASE(FMED3) 5396 NODE_NAME_CASE(SMED3) 5397 NODE_NAME_CASE(UMED3) 5398 NODE_NAME_CASE(FMAXIMUM3) 5399 NODE_NAME_CASE(FMINIMUM3) 5400 NODE_NAME_CASE(FDOT2) 5401 NODE_NAME_CASE(URECIP) 5402 NODE_NAME_CASE(DIV_SCALE) 5403 NODE_NAME_CASE(DIV_FMAS) 5404 NODE_NAME_CASE(DIV_FIXUP) 5405 NODE_NAME_CASE(FMAD_FTZ) 5406 NODE_NAME_CASE(RCP) 5407 NODE_NAME_CASE(RSQ) 5408 NODE_NAME_CASE(RCP_LEGACY) 5409 NODE_NAME_CASE(RCP_IFLAG) 5410 NODE_NAME_CASE(LOG) 5411 NODE_NAME_CASE(EXP) 5412 NODE_NAME_CASE(FMUL_LEGACY) 5413 NODE_NAME_CASE(RSQ_CLAMP) 5414 NODE_NAME_CASE(FP_CLASS) 5415 NODE_NAME_CASE(DOT4) 5416 NODE_NAME_CASE(CARRY) 5417 NODE_NAME_CASE(BORROW) 5418 NODE_NAME_CASE(BFE_U32) 5419 NODE_NAME_CASE(BFE_I32) 5420 
NODE_NAME_CASE(BFI) 5421 NODE_NAME_CASE(BFM) 5422 NODE_NAME_CASE(FFBH_U32) 5423 NODE_NAME_CASE(FFBH_I32) 5424 NODE_NAME_CASE(FFBL_B32) 5425 NODE_NAME_CASE(MUL_U24) 5426 NODE_NAME_CASE(MUL_I24) 5427 NODE_NAME_CASE(MULHI_U24) 5428 NODE_NAME_CASE(MULHI_I24) 5429 NODE_NAME_CASE(MAD_U24) 5430 NODE_NAME_CASE(MAD_I24) 5431 NODE_NAME_CASE(MAD_I64_I32) 5432 NODE_NAME_CASE(MAD_U64_U32) 5433 NODE_NAME_CASE(PERM) 5434 NODE_NAME_CASE(TEXTURE_FETCH) 5435 NODE_NAME_CASE(R600_EXPORT) 5436 NODE_NAME_CASE(CONST_ADDRESS) 5437 NODE_NAME_CASE(REGISTER_LOAD) 5438 NODE_NAME_CASE(REGISTER_STORE) 5439 NODE_NAME_CASE(SAMPLE) 5440 NODE_NAME_CASE(SAMPLEB) 5441 NODE_NAME_CASE(SAMPLED) 5442 NODE_NAME_CASE(SAMPLEL) 5443 NODE_NAME_CASE(CVT_F32_UBYTE0) 5444 NODE_NAME_CASE(CVT_F32_UBYTE1) 5445 NODE_NAME_CASE(CVT_F32_UBYTE2) 5446 NODE_NAME_CASE(CVT_F32_UBYTE3) 5447 NODE_NAME_CASE(CVT_PKRTZ_F16_F32) 5448 NODE_NAME_CASE(CVT_PKNORM_I16_F32) 5449 NODE_NAME_CASE(CVT_PKNORM_U16_F32) 5450 NODE_NAME_CASE(CVT_PK_I16_I32) 5451 NODE_NAME_CASE(CVT_PK_U16_U32) 5452 NODE_NAME_CASE(FP_TO_FP16) 5453 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) 5454 NODE_NAME_CASE(CONST_DATA_PTR) 5455 NODE_NAME_CASE(PC_ADD_REL_OFFSET) 5456 NODE_NAME_CASE(LDS) 5457 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD) 5458 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD) 5459 NODE_NAME_CASE(DUMMY_CHAIN) 5460 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; 5461 NODE_NAME_CASE(LOAD_D16_HI) 5462 NODE_NAME_CASE(LOAD_D16_LO) 5463 NODE_NAME_CASE(LOAD_D16_HI_I8) 5464 NODE_NAME_CASE(LOAD_D16_HI_U8) 5465 NODE_NAME_CASE(LOAD_D16_LO_I8) 5466 NODE_NAME_CASE(LOAD_D16_LO_U8) 5467 NODE_NAME_CASE(STORE_MSKOR) 5468 NODE_NAME_CASE(LOAD_CONSTANT) 5469 NODE_NAME_CASE(TBUFFER_STORE_FORMAT) 5470 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) 5471 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) 5472 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) 5473 NODE_NAME_CASE(DS_ORDERED_COUNT) 5474 NODE_NAME_CASE(ATOMIC_CMP_SWAP) 5475 NODE_NAME_CASE(ATOMIC_LOAD_FMIN) 5476 NODE_NAME_CASE(ATOMIC_LOAD_FMAX) 5477 NODE_NAME_CASE(BUFFER_LOAD) 5478 NODE_NAME_CASE(BUFFER_LOAD_UBYTE) 5479 NODE_NAME_CASE(BUFFER_LOAD_USHORT) 5480 NODE_NAME_CASE(BUFFER_LOAD_BYTE) 5481 NODE_NAME_CASE(BUFFER_LOAD_SHORT) 5482 NODE_NAME_CASE(BUFFER_LOAD_FORMAT) 5483 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE) 5484 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) 5485 NODE_NAME_CASE(SBUFFER_LOAD) 5486 NODE_NAME_CASE(SBUFFER_LOAD_BYTE) 5487 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE) 5488 NODE_NAME_CASE(SBUFFER_LOAD_SHORT) 5489 NODE_NAME_CASE(SBUFFER_LOAD_USHORT) 5490 NODE_NAME_CASE(BUFFER_STORE) 5491 NODE_NAME_CASE(BUFFER_STORE_BYTE) 5492 NODE_NAME_CASE(BUFFER_STORE_SHORT) 5493 NODE_NAME_CASE(BUFFER_STORE_FORMAT) 5494 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) 5495 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) 5496 NODE_NAME_CASE(BUFFER_ATOMIC_ADD) 5497 NODE_NAME_CASE(BUFFER_ATOMIC_SUB) 5498 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) 5499 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) 5500 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) 5501 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) 5502 NODE_NAME_CASE(BUFFER_ATOMIC_AND) 5503 NODE_NAME_CASE(BUFFER_ATOMIC_OR) 5504 NODE_NAME_CASE(BUFFER_ATOMIC_XOR) 5505 NODE_NAME_CASE(BUFFER_ATOMIC_INC) 5506 NODE_NAME_CASE(BUFFER_ATOMIC_DEC) 5507 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) 5508 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) 5509 NODE_NAME_CASE(BUFFER_ATOMIC_FADD) 5510 NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16) 5511 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) 5512 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) 5513 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) 5514 5515 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; 5516 } 5517 return nullptr; 5518 } 5519 
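// Both estimate hooks below return the raw hardware approximation for f32 and
// request zero Newton-Raphson refinement steps; every other type falls back to
// the generic expansion by returning an empty SDValue.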
5520 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5521 SelectionDAG &DAG, int Enabled,
5522 int &RefinementSteps,
5523 bool &UseOneConstNR,
5524 bool Reciprocal) const {
5525 EVT VT = Operand.getValueType();
5526
5527 if (VT == MVT::f32) {
5528 RefinementSteps = 0;
5529 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5530 }
5531
5532 // TODO: There is also an f64 rsq instruction, but the documentation is less
5533 // clear on its precision.
5534
5535 return SDValue();
5536 }
5537
5538 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5539 SelectionDAG &DAG, int Enabled,
5540 int &RefinementSteps) const {
5541 EVT VT = Operand.getValueType();
5542
5543 if (VT == MVT::f32) {
5544 // Reciprocal, < 1 ulp error.
5545 //
5546 // This reciprocal approximation converges to < 0.5 ulp error with one
5547 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5548
5549 RefinementSteps = 0;
5550 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5551 }
5552
5553 // TODO: There is also an f64 rcp instruction, but the documentation is less
5554 // clear on its precision.
5555
5556 return SDValue();
5557 }
5558
5559 static unsigned workitemIntrinsicDim(unsigned ID) {
5560 switch (ID) {
5561 case Intrinsic::amdgcn_workitem_id_x:
5562 return 0;
5563 case Intrinsic::amdgcn_workitem_id_y:
5564 return 1;
5565 case Intrinsic::amdgcn_workitem_id_z:
5566 return 2;
5567 default:
5568 llvm_unreachable("not a workitem intrinsic");
5569 }
5570 }
5571
5572 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5573 const SDValue Op, KnownBits &Known,
5574 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5575
5576 Known.resetAll(); // Don't know anything.
5577
5578 unsigned Opc = Op.getOpcode();
5579
5580 switch (Opc) {
5581 default:
5582 break;
5583 case AMDGPUISD::CARRY:
5584 case AMDGPUISD::BORROW: {
5585 Known.Zero = APInt::getHighBitsSet(32, 31);
5586 break;
5587 }
5588
5589 case AMDGPUISD::BFE_I32:
5590 case AMDGPUISD::BFE_U32: {
5591 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5592 if (!CWidth)
5593 return;
5594
5595 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5596
5597 if (Opc == AMDGPUISD::BFE_U32)
5598 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5599
5600 break;
5601 }
5602 case AMDGPUISD::FP_TO_FP16: {
5603 unsigned BitWidth = Known.getBitWidth();
5604
5605 // High bits are zero.
5606 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5607 break;
5608 }
5609 case AMDGPUISD::MUL_U24:
5610 case AMDGPUISD::MUL_I24: {
5611 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5612 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5613 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5614 RHSKnown.countMinTrailingZeros();
5615 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5616 // Skip extra check if all bits are known zeros.
5617 if (TrailZ >= 32)
5618 break;
5619
5620 // Truncate to 24 bits.
5621 LHSKnown = LHSKnown.trunc(24); 5622 RHSKnown = RHSKnown.trunc(24); 5623 5624 if (Opc == AMDGPUISD::MUL_I24) { 5625 unsigned LHSValBits = LHSKnown.countMaxSignificantBits(); 5626 unsigned RHSValBits = RHSKnown.countMaxSignificantBits(); 5627 unsigned MaxValBits = LHSValBits + RHSValBits; 5628 if (MaxValBits > 32) 5629 break; 5630 unsigned SignBits = 32 - MaxValBits + 1; 5631 bool LHSNegative = LHSKnown.isNegative(); 5632 bool LHSNonNegative = LHSKnown.isNonNegative(); 5633 bool LHSPositive = LHSKnown.isStrictlyPositive(); 5634 bool RHSNegative = RHSKnown.isNegative(); 5635 bool RHSNonNegative = RHSKnown.isNonNegative(); 5636 bool RHSPositive = RHSKnown.isStrictlyPositive(); 5637 5638 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative)) 5639 Known.Zero.setHighBits(SignBits); 5640 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative)) 5641 Known.One.setHighBits(SignBits); 5642 } else { 5643 unsigned LHSValBits = LHSKnown.countMaxActiveBits(); 5644 unsigned RHSValBits = RHSKnown.countMaxActiveBits(); 5645 unsigned MaxValBits = LHSValBits + RHSValBits; 5646 if (MaxValBits >= 32) 5647 break; 5648 Known.Zero.setBitsFrom(MaxValBits); 5649 } 5650 break; 5651 } 5652 case AMDGPUISD::PERM: { 5653 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 5654 if (!CMask) 5655 return; 5656 5657 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 5658 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 5659 unsigned Sel = CMask->getZExtValue(); 5660 5661 for (unsigned I = 0; I < 32; I += 8) { 5662 unsigned SelBits = Sel & 0xff; 5663 if (SelBits < 4) { 5664 SelBits *= 8; 5665 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 5666 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 5667 } else if (SelBits < 7) { 5668 SelBits = (SelBits & 3) * 8; 5669 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 5670 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 5671 } else if (SelBits == 0x0c) { 5672 Known.Zero |= 0xFFull << I; 5673 } else if (SelBits > 0x0c) { 5674 Known.One |= 0xFFull << I; 5675 } 5676 Sel >>= 8; 5677 } 5678 break; 5679 } 5680 case AMDGPUISD::BUFFER_LOAD_UBYTE: { 5681 Known.Zero.setHighBits(24); 5682 break; 5683 } 5684 case AMDGPUISD::BUFFER_LOAD_USHORT: { 5685 Known.Zero.setHighBits(16); 5686 break; 5687 } 5688 case AMDGPUISD::LDS: { 5689 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode()); 5690 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout()); 5691 5692 Known.Zero.setHighBits(16); 5693 Known.Zero.setLowBits(Log2(Alignment)); 5694 break; 5695 } 5696 case AMDGPUISD::SMIN3: 5697 case AMDGPUISD::SMAX3: 5698 case AMDGPUISD::SMED3: 5699 case AMDGPUISD::UMIN3: 5700 case AMDGPUISD::UMAX3: 5701 case AMDGPUISD::UMED3: { 5702 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); 5703 if (Known2.isUnknown()) 5704 break; 5705 5706 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 5707 if (Known1.isUnknown()) 5708 break; 5709 5710 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 5711 if (Known0.isUnknown()) 5712 break; 5713 5714 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling. 
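// The result of a 3-input min/max/med is always one of its operands, so a bit
// is known here only if all three operands agree on it.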
5715 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero; 5716 Known.One = Known0.One & Known1.One & Known2.One; 5717 break; 5718 } 5719 case ISD::INTRINSIC_WO_CHAIN: { 5720 unsigned IID = Op.getConstantOperandVal(0); 5721 switch (IID) { 5722 case Intrinsic::amdgcn_workitem_id_x: 5723 case Intrinsic::amdgcn_workitem_id_y: 5724 case Intrinsic::amdgcn_workitem_id_z: { 5725 unsigned MaxValue = Subtarget->getMaxWorkitemID( 5726 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID)); 5727 Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); 5728 break; 5729 } 5730 default: 5731 break; 5732 } 5733 } 5734 } 5735 } 5736 5737 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( 5738 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, 5739 unsigned Depth) const { 5740 switch (Op.getOpcode()) { 5741 case AMDGPUISD::BFE_I32: { 5742 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 5743 if (!Width) 5744 return 1; 5745 5746 unsigned SignBits = 32 - Width->getZExtValue() + 1; 5747 if (!isNullConstant(Op.getOperand(1))) 5748 return SignBits; 5749 5750 // TODO: Could probably figure something out with non-0 offsets. 5751 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 5752 return std::max(SignBits, Op0SignBits); 5753 } 5754 5755 case AMDGPUISD::BFE_U32: { 5756 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 5757 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; 5758 } 5759 5760 case AMDGPUISD::CARRY: 5761 case AMDGPUISD::BORROW: 5762 return 31; 5763 case AMDGPUISD::BUFFER_LOAD_BYTE: 5764 return 25; 5765 case AMDGPUISD::BUFFER_LOAD_SHORT: 5766 return 17; 5767 case AMDGPUISD::BUFFER_LOAD_UBYTE: 5768 return 24; 5769 case AMDGPUISD::BUFFER_LOAD_USHORT: 5770 return 16; 5771 case AMDGPUISD::FP_TO_FP16: 5772 return 16; 5773 case AMDGPUISD::SMIN3: 5774 case AMDGPUISD::SMAX3: 5775 case AMDGPUISD::SMED3: 5776 case AMDGPUISD::UMIN3: 5777 case AMDGPUISD::UMAX3: 5778 case AMDGPUISD::UMED3: { 5779 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1); 5780 if (Tmp2 == 1) 5781 return 1; // Early out. 5782 5783 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1); 5784 if (Tmp1 == 1) 5785 return 1; // Early out. 5786 5787 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 5788 if (Tmp0 == 1) 5789 return 1; // Early out. 5790 5791 return std::min(Tmp0, std::min(Tmp1, Tmp2)); 5792 } 5793 default: 5794 return 1; 5795 } 5796 } 5797 5798 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr( 5799 GISelKnownBits &Analysis, Register R, 5800 const APInt &DemandedElts, const MachineRegisterInfo &MRI, 5801 unsigned Depth) const { 5802 const MachineInstr *MI = MRI.getVRegDef(R); 5803 if (!MI) 5804 return 1; 5805 5806 // TODO: Check range metadata on MMO. 
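// For example, a signed byte buffer load produces a value sign-extended from
// 8 bits, so 32 - 8 + 1 = 25 of the result bits are copies of the sign bit;
// the unsigned forms are zero-extended instead, giving one fewer guaranteed
// copy (24 and 16).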
5807 switch (MI->getOpcode()) { 5808 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 5809 return 25; 5810 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 5811 return 17; 5812 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 5813 return 24; 5814 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 5815 return 16; 5816 case AMDGPU::G_AMDGPU_SMED3: 5817 case AMDGPU::G_AMDGPU_UMED3: { 5818 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); 5819 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1); 5820 if (Tmp2 == 1) 5821 return 1; 5822 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1); 5823 if (Tmp1 == 1) 5824 return 1; 5825 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1); 5826 if (Tmp0 == 1) 5827 return 1; 5828 return std::min(Tmp0, std::min(Tmp1, Tmp2)); 5829 } 5830 default: 5831 return 1; 5832 } 5833 } 5834 5835 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, 5836 const SelectionDAG &DAG, 5837 bool SNaN, 5838 unsigned Depth) const { 5839 unsigned Opcode = Op.getOpcode(); 5840 switch (Opcode) { 5841 case AMDGPUISD::FMIN_LEGACY: 5842 case AMDGPUISD::FMAX_LEGACY: { 5843 if (SNaN) 5844 return true; 5845 5846 // TODO: Can check no nans on one of the operands for each one, but which 5847 // one? 5848 return false; 5849 } 5850 case AMDGPUISD::FMUL_LEGACY: 5851 case AMDGPUISD::CVT_PKRTZ_F16_F32: { 5852 if (SNaN) 5853 return true; 5854 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 5855 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 5856 } 5857 case AMDGPUISD::FMED3: 5858 case AMDGPUISD::FMIN3: 5859 case AMDGPUISD::FMAX3: 5860 case AMDGPUISD::FMINIMUM3: 5861 case AMDGPUISD::FMAXIMUM3: 5862 case AMDGPUISD::FMAD_FTZ: { 5863 if (SNaN) 5864 return true; 5865 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 5866 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 5867 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 5868 } 5869 case AMDGPUISD::CVT_F32_UBYTE0: 5870 case AMDGPUISD::CVT_F32_UBYTE1: 5871 case AMDGPUISD::CVT_F32_UBYTE2: 5872 case AMDGPUISD::CVT_F32_UBYTE3: 5873 return true; 5874 5875 case AMDGPUISD::RCP: 5876 case AMDGPUISD::RSQ: 5877 case AMDGPUISD::RCP_LEGACY: 5878 case AMDGPUISD::RSQ_CLAMP: { 5879 if (SNaN) 5880 return true; 5881 5882 // TODO: Need is known positive check. 5883 return false; 5884 } 5885 case ISD::FLDEXP: 5886 case AMDGPUISD::FRACT: { 5887 if (SNaN) 5888 return true; 5889 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); 5890 } 5891 case AMDGPUISD::DIV_SCALE: 5892 case AMDGPUISD::DIV_FMAS: 5893 case AMDGPUISD::DIV_FIXUP: 5894 // TODO: Refine on operands. 
5895 return SNaN; 5896 case AMDGPUISD::SIN_HW: 5897 case AMDGPUISD::COS_HW: { 5898 // TODO: Need check for infinity 5899 return SNaN; 5900 } 5901 case ISD::INTRINSIC_WO_CHAIN: { 5902 unsigned IntrinsicID = Op.getConstantOperandVal(0); 5903 // TODO: Handle more intrinsics 5904 switch (IntrinsicID) { 5905 case Intrinsic::amdgcn_cubeid: 5906 return true; 5907 5908 case Intrinsic::amdgcn_frexp_mant: { 5909 if (SNaN) 5910 return true; 5911 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 5912 } 5913 case Intrinsic::amdgcn_cvt_pkrtz: { 5914 if (SNaN) 5915 return true; 5916 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 5917 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 5918 } 5919 case Intrinsic::amdgcn_rcp: 5920 case Intrinsic::amdgcn_rsq: 5921 case Intrinsic::amdgcn_rcp_legacy: 5922 case Intrinsic::amdgcn_rsq_legacy: 5923 case Intrinsic::amdgcn_rsq_clamp: { 5924 if (SNaN) 5925 return true; 5926 5927 // TODO: Need is known positive check. 5928 return false; 5929 } 5930 case Intrinsic::amdgcn_trig_preop: 5931 case Intrinsic::amdgcn_fdot2: 5932 // TODO: Refine on operand 5933 return SNaN; 5934 case Intrinsic::amdgcn_fma_legacy: 5935 if (SNaN) 5936 return true; 5937 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 5938 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) && 5939 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1); 5940 default: 5941 return false; 5942 } 5943 } 5944 default: 5945 return false; 5946 } 5947 } 5948 5949 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, 5950 Register N0, Register N1) const { 5951 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks 5952 } 5953 5954 TargetLowering::AtomicExpansionKind 5955 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { 5956 switch (RMW->getOperation()) { 5957 case AtomicRMWInst::Nand: 5958 case AtomicRMWInst::FAdd: 5959 case AtomicRMWInst::FSub: 5960 case AtomicRMWInst::FMax: 5961 case AtomicRMWInst::FMin: 5962 return AtomicExpansionKind::CmpXChg; 5963 default: { 5964 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) { 5965 unsigned Size = IntTy->getBitWidth(); 5966 if (Size == 32 || Size == 64) 5967 return AtomicExpansionKind::None; 5968 } 5969 5970 return AtomicExpansionKind::CmpXChg; 5971 } 5972 } 5973 } 5974 5975 /// Whether it is profitable to sink the operands of an 5976 /// Instruction I to the basic block of I. 5977 /// This helps using several modifiers (like abs and neg) more often. 5978 bool AMDGPUTargetLowering::shouldSinkOperands( 5979 Instruction *I, SmallVectorImpl<Use *> &Ops) const { 5980 using namespace PatternMatch; 5981 5982 for (auto &Op : I->operands()) { 5983 // Ensure we are not already sinking this operand. 5984 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); })) 5985 continue; 5986 5987 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) 5988 Ops.push_back(&Op); 5989 } 5990 5991 return !Ops.empty(); 5992 } 5993
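// Illustrative IR (not from this file): if the fneg below is defined in a
// different block from its only use, sinking it next to the fmul lets
// instruction selection fold it as a VOP source modifier (e.g. a negated
// operand of v_mul_f32) instead of emitting a separate instruction:
//
//   %n = fneg float %x
//   ...
//   %r = fmul float %n, %y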