//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "AMDGPUMemoryUtils.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  if (StoreSize % 32 == 0)
    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);

  return VT;
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Enable ganging up loads and stores in the memcpy DAG lowering.
  MaxGluedStoresPerMemcpy = 16;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
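  // For example, promoting an f32 LOAD to i32 roughly rewrites
  //   (f32 (load addr))  -->  (f32 (bitcast (i32 (load addr))))
  // during legalization, so tablegen only needs patterns for the integer
  // loads; the bitcast is free at the register level.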
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // TODO: Would be better to consume as directly legal
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }
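  // e.g. an (i32 (zextload i8)) stays legal because a single byte load
  // (buffer_load_ubyte / global_load_ubyte) zero-extends into a full 32-bit
  // register, while an i64-result extload is first narrowed to a 32-bit
  // extload and then widened with a separate 64-bit extension.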
  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
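  // e.g. f32 FCEIL, FFLOOR, and FTRUNC each map to a single instruction
  // (v_ceil_f32, v_floor_f32, v_trunc_f32), so marking them Legal avoids the
  // generic libcall/expansion path.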
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
  setOperationAction({ISD::LROUND, ISD::LLROUND},
                     {MVT::f16, MVT::f32, MVT::f64}, Expand);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
                     Expand);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal);
  }

  // FIXME: These IS_FPCLASS vector fp types are marked custom so they reach
  // the scalarization code. Can be removed when IS_FPCLASS expand isn't called
  // by default unless marked custom/legal.
  setOperationAction(ISD::IS_FPCLASS,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
                      MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
                      MVT::v16f64},
                     Custom);

  if (isTypeLegal(MVT::f16))
    setOperationAction(ISD::IS_FPCLASS,
                       {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
                       Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
       MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
       MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
       MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
       MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = {MVT::i32, MVT::i64};
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
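    // [SU]MUL_LOHI therefore expands into a separate MUL plus MULHU/MULHS
    // pair, which for i32 map to v_mul_lo_u32 and v_mul_hi_u32/v_mul_hi_i32.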
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
                        ISD::MULHS, ISD::OR, ISD::SHL,
                        ISD::SRA, ISD::SRL, ISD::ROTL,
                        ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
                        ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR, ISD::BSWAP, ISD::CTPOP,
                        ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
                        ISD::SETCC, ISD::ADDRSPACECAST},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
         ISD::FADD, ISD::FCEIL, ISD::FCOS,
         ISD::FDIV, ISD::FEXP2, ISD::FEXP,
         ISD::FEXP10, ISD::FLOG2, ISD::FREM,
         ISD::FLOG, ISD::FLOG10, ISD::FPOW,
         ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
         ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
         ISD::FSQRT, ISD::FSIN, ISD::FSUB,
         ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
         ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
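  // e.g. a <2 x float> select is unrolled into two i32 selects on the
  // promoted <2 x i32> operands (each a v_cndmask_b32), rather than being
  // expanded into and/or/not bit arithmetic on the packed value.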
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform, so we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
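  // addBypassSlowDiv(64, 32) inserts a runtime check: when both i64 operands
  // are dynamically found to fit in 32 bits, the much cheaper 32-bit division
  // path is taken instead of the full 64-bit expansion.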
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
                       ISD::SRA, ISD::SRL,
                       ISD::TRUNCATE, ISD::MUL,
                       ISD::SMUL_LOHI, ISD::UMUL_LOHI,
                       ISD::MULHU, ISD::MULHS,
                       ISD::SELECT, ISD::SELECT_CC,
                       ISD::STORE, ISD::FADD,
                       ISD::FSUB, ISD::FNEG,
                       ISD::FABS, ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the given type of ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
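// For example, (fadd (fneg x), y) can still select to a single v_add_f32 by
// setting the neg source modifier (using the VOP3 encoding), so the fneg
// itself needs no separate instruction.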
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and using a source modifier would force each of them into
  // the VOP3 encoding, there will be a code size increase. Try to avoid
  // increasing code size unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->users()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
  return 32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(
    SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPU::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in
  // continuing to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably
// also profitable with the expansion for 64-bit, since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(1);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
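    // e.g. -(fma a, b, c) could be built as (fma -a, b, -c) via source
    // modifiers, but roughly speaking, if the original fma also has users
    // that cannot fold an fneg for free, both the fma and its negation end
    // up materialized.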
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any
  // vector operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into
  // a super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.
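  // e.g. (zext i32 %x to i64) only needs the register pair (%x, 0); the high
  // half is a zero that is essentially free to materialize.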
  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                                 EVT DestVT) const {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (Subtarget->has16BitInsts() &&
        (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
      // Don't narrow back down to i16 if promoted to i32 already.
      if (!N->isDivergent() && DestVT.isInteger() &&
          DestVT.getScalarSizeInBits() > 1 &&
          DestVT.getScalarSizeInBits() <= 16 &&
          SrcVT.getScalarSizeInBits() > 16) {
        return false;
      }
    }
    return true;
  default:
    break;
  }

  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits
  // is not profitable, and may actually be harmful.
  if (isa<LoadSDNode>(N))
    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;

  return true;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  SDValue ShiftLHS = N->getOperand(0);
  if (!ShiftLHS->hasOneUse())
    return false;

  if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
      !ShiftLHS.getOperand(0)->hasOneUse())
    return false;

  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
      (N->user_begin()->getOpcode() == ISD::SRA ||
       N->user_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    reportFatalUsageError("unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    reportFatalUsageError("unsupported calling convention");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting
    // over to get accurate in-memory offsets. The "PartOffset" is completely
    // useless to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different element size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 arguments.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  //       "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load whose frame object
  // overlaps the clobbered one.
  for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));

  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(DAG.getPOISON(Arg.VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
      Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
1482 return; 1483 case ISD::FLOG2: 1484 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG)) 1485 Results.push_back(Lowered); 1486 return; 1487 case ISD::FLOG: 1488 case ISD::FLOG10: 1489 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG)) 1490 Results.push_back(Lowered); 1491 return; 1492 case ISD::FEXP2: 1493 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG)) 1494 Results.push_back(Lowered); 1495 return; 1496 case ISD::FEXP: 1497 case ISD::FEXP10: 1498 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG)) 1499 Results.push_back(Lowered); 1500 return; 1501 case ISD::CTLZ: 1502 case ISD::CTLZ_ZERO_UNDEF: 1503 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG)) 1504 Results.push_back(Lowered); 1505 return; 1506 default: 1507 return; 1508 } 1509 } 1510 1511 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, 1512 SDValue Op, 1513 SelectionDAG &DAG) const { 1514 1515 const DataLayout &DL = DAG.getDataLayout(); 1516 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); 1517 const GlobalValue *GV = G->getGlobal(); 1518 1519 if (!MFI->isModuleEntryFunction()) { 1520 if (std::optional<uint32_t> Address = 1521 AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) { 1522 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType()); 1523 } 1524 } 1525 1526 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 1527 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) { 1528 if (!MFI->isModuleEntryFunction() && 1529 GV->getName() != "llvm.amdgcn.module.lds" && 1530 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) { 1531 SDLoc DL(Op); 1532 const Function &Fn = DAG.getMachineFunction().getFunction(); 1533 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 1534 Fn, "local memory global used by non-kernel function", 1535 DL.getDebugLoc(), DS_Warning)); 1536 1537 // We currently don't have a way to correctly allocate LDS objects that 1538 // aren't directly associated with a kernel. We do force inlining of 1539 // functions that use local objects. However, if these dead functions are 1540 // not eliminated, we don't want a compile time error. Just emit a warning 1541 // and a trap, since there should be no callable path here. 1542 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode()); 1543 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 1544 Trap, DAG.getRoot()); 1545 DAG.setRoot(OutputChain); 1546 return DAG.getPOISON(Op.getValueType()); 1547 } 1548 1549 // XXX: What does the value of G->getOffset() mean? 1550 assert(G->getOffset() == 0 && 1551 "Do not know what to do with an non-zero offset"); 1552 1553 // TODO: We could emit code to handle the initialization somewhere. 1554 // We ignore the initializer for now and legalize it to allow selection. 1555 // The initializer will anyway get errored out during assembly emission. 1556 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV)); 1557 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); 1558 } 1559 return SDValue(); 1560 } 1561 1562 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, 1563 SelectionDAG &DAG) const { 1564 SmallVector<SDValue, 8> Args; 1565 SDLoc SL(Op); 1566 1567 EVT VT = Op.getValueType(); 1568 if (VT.getVectorElementType().getSizeInBits() < 32) { 1569 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits(); 1570 if (OpBitSize >= 32 && OpBitSize % 32 == 0) { 1571 unsigned NewNumElt = OpBitSize / 32; 1572 EVT NewEltVT = (NewNumElt == 1) ? 
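
// Illustrative sketch (hypothetical IR, for exposition only): a kernel that
// references
//
//   @shared = internal addrspace(3) global [64 x i32] poison, align 4
//
// has its local-memory objects laid out per kernel, so the address of
// @shared lowers to a plain constant byte offset into LDS (e.g. 0 if it is
// the first allocation for that kernel), rather than to a relocation.
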
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(*DAG.getContext(),
                                                         MVT::i32, NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(NewIn, Args);
        else
          Args.push_back(NewIn);
      }

      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
      return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    }
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SmallVector<SDValue, 8> Args;
  unsigned Start = Op.getConstantOperandVal(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
    unsigned NumElt = VT.getVectorNumElements();
    unsigned NumSrcElt = SrcVT.getVectorNumElements();
    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

    // Extract 32-bit registers at a time.
    EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
    EVT NewVT = NumElt == 2
                    ? MVT::i32
                    : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
    SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));

    DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
    if (NumElt == 2)
      Tmp = Args[0];
    else
      Tmp = DAG.getBuildVector(NewVT, SL, Args);

    return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
  }

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

// TODO: Handle fabs too
static SDValue peekFNeg(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    return Val.getOperand(0);

  return Val;
}

static SDValue peekFPSignOps(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FABS)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FCOPYSIGN)
    Val = Val.getOperand(0);
  return Val;
}
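
// Worked example of the 32-bit repacking above (illustration only): for
// (concat_vectors v4f16:%a, v4f16:%b), each 64-bit operand is bitcast to
// v2i32, the four i32 halves are collected, and the result is rebuilt as
// (bitcast v8f16 (build_vector a0, a1, b0, b1)), avoiding per-f16-element
// shuffling. EXTRACT_SUBVECTOR at an even 16-bit element index applies the
// same trick in reverse: extract whole i32 lanes, then bitcast back.
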
SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}
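
// Illustration of the operand permutation above (assumes the legacy min/max
// semantics described in the comments: the node returns its second operand
// when the underlying compare fails, as it does for NaN inputs). For
//   select (setolt x, y), x, y
// we emit fmin_legacy(x, y): if x is NaN the compare fails and y is
// returned, matching the select, which also picks the false value y.
// Swapping the operands instead would return the NaN.
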
/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
  SDValue NegTrue = peekFNeg(True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(CRHS->getValueAPF());
    if (NegRHS == CFalse->getValueAPF()) {
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(ISD::FNEG, DL, VT, Combined);
      return SDValue();
    }
  }

  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::pair(LoVT, HiVT);
}
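
// Worked examples of the split above (illustration only): v8i32 splits
// evenly into (v4i32, v4i32); v7i32 gives LoNumElts = PowerOf2Ceil(4) = 4,
// so the result is (v4i32, v3i32); v3f32 gives LoNumElts = 2 with a single
// element left over, so the high part degenerates to the scalar f32:
// (v2f32, f32).
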
// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}
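
// Example of the split-load shape above (illustration only): a v6f32 load
// with 16-byte base alignment becomes a v4f32 load at offset 0 plus a v2f32
// load at offset 16, with HiAlign = commonAlignment(16, 16) = 16; the two
// results are recombined through INSERT_SUBVECTOR and the chains merged with
// a TokenFactor.
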
SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  unsigned Size = LoMemVT.getStoreSize();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}

// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib = (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  MachineFunction &MF = DAG.getMachineFunction();

  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    UseFmadFtz =
        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
  }

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
                    : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
                                 : (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation; it's easier to recompute it.
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to the number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
        = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}
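
// Worked unsigned example of the shortcut above (illustration only): for
// 100 / 7, fq = trunc(100.0f * rcp(7.0f)). Since rcp is only approximate,
// fq may land on 13 or 14. The residual fr = |fa - fq*fb| measures how far
// off the estimate is: if fq = 13, fr = 100 - 91 = 9 >= 7, so the quotient
// was one too small and jq = 1 is added back, giving 14; if fq = 14,
// fr = 2 < 7 and jq is zeroed. Either way Div = 14, and the remainder is
// recomputed as 100 - 14*7 = 2.
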
void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                          SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &Results) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue One = DAG.getConstant(1, DL, HalfVT);
  SDValue Zero = DAG.getConstant(0, DL, HalfVT);

  // HiLo split
  SDValue LHS_Lo, LHS_Hi;
  SDValue LHS = Op.getOperand(0);
  std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);

  SDValue RHS_Lo, RHS_Hi;
  SDValue RHS = Op.getOperand(1);
  std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);

  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  if (isTypeLegal(MVT::i64)) {
    // The algorithm here is based on ideas from "Software Integer Division",
    // Tom Rodeheffer, August 2008.

    MachineFunction &MF = DAG.getMachineFunction();
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    // Compute denominator reciprocal.
    unsigned FMAD =
        !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
        : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
            ? (unsigned)ISD::FMAD
            : (unsigned)AMDGPUISD::FMAD_FTZ;

    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
        DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
        Cvt_Lo);
    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
        DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
        DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
        DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
        Mul1);
    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    SDValue Rcp64 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

    SDValue Zero64 = DAG.getConstant(0, DL, VT);
    SDValue One64 = DAG.getConstant(1, DL, VT);
    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

    // First round of UNR (Unsigned integer Newton-Raphson).
    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    SDValue Mulhi1_Lo, Mulhi1_Hi;
    std::tie(Mulhi1_Lo, Mulhi1_Hi) =
        DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
    SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
                                  Mulhi1_Lo, Zero1);
    SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
                                  Mulhi1_Hi, Add1_Lo.getValue(1));
    SDValue Add1 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

    // Second round of UNR.
    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo, Mulhi2_Hi;
    std::tie(Mulhi2_Lo, Mulhi2_Hi) =
        DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
    SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
                                  Mulhi2_Hi, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));

    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    SDValue Mul3_Lo, Mul3_Hi;
    std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
    SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below portions of the code can be enclosed into if/endif.
    // Currently control flow is unconditional and we have 4 selects after
    // potential endif to substitute PHIs.

    // if C3 != 0 ...
    SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
  // Get speculative values.
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
  SDValue DIV_Lo = Zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}
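
// The r600 fallback above is classic restoring long division. A sketch in
// plain C of the per-bit step (illustration only, names hypothetical):
//
//   uint64_t rem = initial_rem;  // speculative high-half remainder
//   uint32_t div_lo = 0;
//   for (int bit = 31; bit >= 0; --bit) {
//     rem = (rem << 1) | ((lhs_lo >> bit) & 1); // shift in next dividend bit
//     if (rem >= rhs) {                         // does the divisor fit?
//       div_lo |= 1u << bit;                    // record a quotient bit
//       rem -= rhs;                             // restore the remainder
//     }
//   }
//
// The DAG version expresses the branch as two selects on the same
// (rem >= rhs) condition.
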
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
      return Res;
  }

  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);

  // One round of UNR.
  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
                  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));

  // Quotient/remainder estimate.
  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
  SDValue R =
      DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));

  // First quotient/remainder refinement.
  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);

  return DAG.getMergeValues({Q, R}, DL);
}

SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue NegOne = DAG.getAllOnesConstant(DL, VT);

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
      return Res;
  }

  if (VT == MVT::i64 &&
      DAG.ComputeNumSignBits(LHS) > 32 &&
      DAG.ComputeNumSignBits(RHS) > 32) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    // HiLo split
    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                                 LHS_Lo, RHS_Lo);
    SDValue Res[2] = {
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
    };
    return DAG.getMergeValues(Res, DL);
  }

  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS

  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);

  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);

  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
  SDValue Rem = Div.getValue(1);

  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);

  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Res, DL);
}
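
// Worked example of the sign fixups above (illustration only): with the mask
// s = (x < 0) ? -1 : 0, the sequence (x + s) ^ s computes |x| in two's
// complement, and (u ^ s) - s converts back. For x = -7: s = -1, so
// (-7 + -1) ^ -1 = -8 ^ -1 = 7. The unsigned divide then runs on |x| and
// |y|; applying (q ^ DSign) - DSign makes the quotient negative iff exactly
// one operand was negative, while the remainder takes the dividend's sign.
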
// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  auto Flags = Op->getFlags();
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
  // TODO: For f32 use FMAD instead if !hasFastFMA32?
  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
}

SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
                                  SelectionDAG &DAG) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                Hi,
                                DAG.getConstant(FractBits - 32, SL, MVT::i32),
                                DAG.getConstant(ExpBits, SL, MVT::i32));
  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
                            DAG.getConstant(1023, SL, MVT::i32));

  return Exp;
}
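
// Worked example of the exponent extraction above (illustration only): for
// the f64 value 8.0 the high 32 bits are 0x40200000. BFE_U32 pulls out the
// 11-bit field starting at bit 20 (bit 52 of the full value), giving the
// biased exponent 0x402 = 1026; subtracting the f64 bias of 1023 yields the
// unbiased exponent 3, since 8.0 = 1.0 * 2^3.
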
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = getHiHalf64(Src, DAG);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);

  // Extend back to 64-bits.
  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);

  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
  const SDValue FractMask
      = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);

  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);

  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);

  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);

  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
}

SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);

  // TODO: Should this propagate fast-math-flags?

  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);

  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);

  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);

  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
}
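
// The add/subtract pair above is the classic 2^52 magic-number rounding
// trick: once |x| + 2^52 is formed, the f64 mantissa has no bits below the
// ones place, so the default round-to-nearest-even mode rounds x to an
// integer, and subtracting 2^52 back recovers it. E.g. 2.5 + 2^52 rounds to
// 2^52 + 2 (ties to even), and the subtract leaves 2.0. Inputs with
// |x| > 0x1.fffffffffffffp+51 are already integral and are passed through.
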
SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
                                              SelectionDAG &DAG) const {
  // FNEARBYINT and FRINT are the same, except in their handling of FP
  // exceptions. Those aren't really meaningful for us, and OpenCL only has
  // rint, so just treat them as equivalent.
  return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
                     Op.getOperand(0));
}

SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
  auto VT = Op.getValueType();
  auto Arg = Op.getOperand(0u);
  return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
}

// XXX - May require not supporting f32 denormals?

// Don't handle v2f16. The extra instructions to scalarize and repack around
// the compare and vselect end up producing worse code than scalarizing the
// whole operation.
SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);

  // TODO: Should this propagate fast-math-flags?

  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);

  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  const SDValue One = DAG.getConstantFP(1.0, SL, VT);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
  SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);

  SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
  return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
}
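
// Worked example of the round-half-away-from-zero expansion above
// (illustration only): for x = -2.5, T = trunc(x) = -2.0 and
// |x - T| = 0.5 >= 0.5, so the offset is copysign(1.0, x) = -1.0 and the
// result is -3.0. For x = 2.4 the fractional part 0.4 is below 0.5, the
// select yields 0.0, and the result is simply T = 2.0.
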
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src);
  // if (src < 0.0 && src != result)
  //   result += -1.0.

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

/// Return true if it's known that \p Src can never be an f32 denormal value.
static bool valueIsKnownNeverF32Denorm(SDValue Src) {
  switch (Src.getOpcode()) {
  case ISD::FP_EXTEND:
    return Src.getOperand(0).getValueType() == MVT::f16;
  case ISD::FP16_TO_FP:
  case ISD::FFREXP:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Src.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_frexp_mant:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }

  llvm_unreachable("covered opcode switch");
}

bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
                                           SDNodeFlags Flags) {
  if (Flags.hasApproximateFuncs())
    return true;
  auto &Options = DAG.getTarget().Options;
  return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
}

bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
                                                  SDValue Src,
                                                  SDNodeFlags Flags) {
  return !valueIsKnownNeverF32Denorm(Src) &&
         DAG.getMachineFunction()
                 .getDenormalMode(APFloat::IEEEsingle())
                 .Input != DenormalMode::PreserveSign;
}

SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
                                                    SDValue Src,
                                                    SDNodeFlags Flags) const {
  SDLoc SL(Src);
  EVT VT = Src.getValueType();
  const fltSemantics &Semantics = VT.getFltSemantics();
  SDValue SmallestNormal =
      DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);

  // Want to scale denormals up, but negatives and 0 work just as well on the
  // scaled path.
  SDValue IsLtSmallestNormal = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
      SmallestNormal, ISD::SETOLT);

  return IsLtSmallestNormal;
}

SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
                                          SDNodeFlags Flags) const {
  SDLoc SL(Src);
  EVT VT = Src.getValueType();
  const fltSemantics &Semantics = VT.getFltSemantics();
  SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);

  SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
  SDValue IsFinite = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
      Inf, ISD::SETOLT);
  return IsFinite;
}

/// If denormal handling is required return the scaled input to FLOG2, and the
/// check for denormal range. Otherwise, return null values.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
                                        SDValue Src, SDNodeFlags Flags) const {
  if (!needsDenormHandlingF32(DAG, Src, Flags))
    return {};

  MVT VT = MVT::f32;
  const fltSemantics &Semantics = APFloat::IEEEsingle();
  SDValue SmallestNormal =
      DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);

  SDValue IsLtSmallestNormal = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
      SmallestNormal, ISD::SETOLT);

  SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
  SDValue One = DAG.getConstantFP(1.0, SL, VT);
  SDValue ScaleFactor =
      DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);

  SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
  return {ScaledInput, IsLtSmallestNormal};
}
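
// Worked example of the scaling above (illustration only): the f32 denormal
// x = 0x1.0p-140 is out of range for v_log_f32, but x * 0x1.0p+32 =
// 0x1.0p-108 is a normal value. Since log2(x * 2^32) = log2(x) + 32, the
// callers below compensate by subtracting 32.0 (or 32 * log(2) in the other
// log bases) from the result on the scaled path.
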
SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
    assert(!Subtarget->has16BitInsts());
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
    SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  auto [ScaledInput, IsLtSmallestNormal] =
      getScaledLogInput(DAG, SL, Src, Flags);
  if (!ScaledInput)
    return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);

  SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);

  SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  SDValue ResultOffset =
      DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
  return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
}

static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
                      SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
  return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
}

SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue X = Op.getOperand(0);
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op->getFlags();
  SDLoc DL(Op);

  const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
  assert(IsLog10 || Op.getOpcode() == ISD::FLOG);

  const auto &Options = getTargetMachine().Options;
  if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
      Options.ApproxFuncFPMath || Options.UnsafeFPMath) {

    if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
      // Log and multiply in f32 is good enough for f16.
      X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
    }

    SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
    if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
      return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
                         DAG.getTargetConstant(0, DL, MVT::i32), Flags);
    }

    return Lowered;
  }

  auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);

  SDValue R;
  if (Subtarget->hasFastFMAF32()) {
    // c + cc is ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
    SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);

    R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
    SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
    SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
    SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
    R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
  } else {
    // ch + ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
    SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);

    SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
    SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
    SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
    SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
    SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);

    SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
    SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
    SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
    R = getMad(DAG, DL, VT, YH, CH, Mad1);
  }

  const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
                            (Flags.hasNoInfs() || Options.NoInfsFPMath);

  // TODO: Check if known finite from source value.
  if (!IsFiniteOnly) {
    SDValue IsFinite = getIsFinite(DAG, Y, Flags);
    R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
  }

  if (IsScaled) {
    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue ShiftK =
        DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
    SDValue Shift =
        DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
    R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
  }

  return R;
}
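
// Sketch of the extended-precision multiply above (illustration only): with
// y = log2(x), the product y * ln(2) is computed as a head/tail sum. On the
// FMA path, R = y*c rounds away some low bits; FMA0 = fma(y, c, -R)
// recovers exactly that rounding error, and FMA1 folds in y*cc, the tail of
// the constant. R + FMA1 therefore carries roughly double-f32 precision
// through the base conversion. The non-FMA path instead splits y itself
// into a head YH (mantissa truncated to 12 bits) and tail YT so that each
// partial product is nearly exact.
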
SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
  return LowerFLOGCommon(Op, DAG);
}

// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for
// a promoted f16 operation.
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
                                              SelectionDAG &DAG, bool IsLog10,
                                              SDNodeFlags Flags) const {
  EVT VT = Src.getValueType();
  unsigned LogOp =
      VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;

  double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  if (VT == MVT::f32) {
    auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
    if (ScaledInput) {
      SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
      SDValue ScaledResultOffset =
          DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);

      SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);

      SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
                                         ScaledResultOffset, Zero, Flags);

      SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);

      if (Subtarget->hasFastFMAF32())
        return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
                           Flags);
      SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
      return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
    }
  }

  SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);

  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
                     Flags);
}
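
// Worked example of the change of base above (illustration only):
// log10(x) = log2(x) * (ln(2)/ln(10)) ~= log2(x) * 0.30103. For x = 1000,
// log2(1000) ~= 9.9658 and 9.9658 * 0.30103 ~= 3.0, i.e. log10(1000). On
// the scaled denormal path, the -32.0 * Log2BaseInverted offset undoes the
// 2^32 pre-scaling in the target base.
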
SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
    assert(!Subtarget->has16BitInsts());
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
    SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  if (!needsDenormHandlingF32(DAG, Src, Flags))
    return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);

  // bool needs_scaling = x < -0x1.f80000p+6f;
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);

  // -nextafter(128.0, -1)
  SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue NeedsScaling =
      DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);

  SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);

  SDValue AddOffset =
      DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);

  SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
  SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);

  SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
  SDValue One = DAG.getConstantFP(1.0, SL, VT);
  SDValue ResultScale =
      DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);

  return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
}
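
// Worked example of the exp2 scaling above (illustration only): for
// x = -130.0, the plain result 2^-130 is an f32 denormal, which v_exp_f32
// flushes. Since x is below the -0x1.f80000p+6 (= -126.0) threshold, the
// input is biased by +64: v_exp_f32(-66.0) = 2^-66 is a normal value, and
// the final multiply by 2^-64 rescales the result to the intended 2^-130.
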
static_cast<unsigned>(AMDGPUISD::EXP) 2983 : static_cast<unsigned>(ISD::FEXP2); 2984 2985 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { 2986 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f); 2987 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT); 2988 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT); 2989 2990 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags); 2991 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags); 2992 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags); 2993 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags); 2994 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1); 2995 } 2996 2997 // bool s = x < -0x1.2f7030p+5f; 2998 // x += s ? 0x1.0p+5f : 0.0f; 2999 // exp10 = exp2(x * 0x1.a92000p+1f) * 3000 // exp2(x * 0x1.4f0978p-11f) * 3001 // (s ? 0x1.9f623ep-107f : 1.0f); 3002 3003 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 3004 3005 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT); 3006 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT); 3007 3008 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT); 3009 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags); 3010 SDValue AdjustedX = 3011 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); 3012 3013 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT); 3014 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT); 3015 3016 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags); 3017 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags); 3018 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags); 3019 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags); 3020 3021 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags); 3022 3023 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT); 3024 SDValue AdjustedResult = 3025 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags); 3026 3027 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps, 3028 Flags); 3029 } 3030 3031 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { 3032 EVT VT = Op.getValueType(); 3033 SDLoc SL(Op); 3034 SDValue X = Op.getOperand(0); 3035 SDNodeFlags Flags = Op->getFlags(); 3036 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10; 3037 3038 if (VT.getScalarType() == MVT::f16) { 3039 // v_exp_f16 (fmul x, log2e) 3040 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast? 3041 return lowerFEXPUnsafe(X, SL, DAG, Flags); 3042 3043 if (VT.isVector()) 3044 return SDValue(); 3045 3046 // exp(f16 x) -> 3047 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 3048 3049 // Nothing in half is a denormal when promoted to f32. 3050 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags); 3051 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags); 3052 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered, 3053 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 3054 } 3055 3056 assert(VT == MVT::f32); 3057 3058 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 3059 // library behavior. Also, is known-not-daz source sufficient? 3060 if (allowApproxFunc(DAG, Flags)) { 3061 return IsExp10 ? 
lowerFEXP10Unsafe(X, SL, DAG, Flags) 3062 : lowerFEXPUnsafe(X, SL, DAG, Flags); 3063 } 3064 3065 // Algorithm: 3066 // 3067 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 3068 // 3069 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 3070 // n = 64*m + j, 0 <= j < 64 3071 // 3072 // e^x = 2^((64*m + j + f)/64) 3073 // = (2^m) * (2^(j/64)) * 2^(f/64) 3074 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 3075 // 3076 // f = x*(64/ln(2)) - n 3077 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 3078 // 3079 // e^x = (2^m) * (2^(j/64)) * e^r 3080 // 3081 // (2^(j/64)) is precomputed 3082 // 3083 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3084 // e^r = 1 + q 3085 // 3086 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3087 // 3088 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 3089 SDNodeFlags FlagsNoContract = Flags; 3090 FlagsNoContract.setAllowContract(false); 3091 3092 SDValue PH, PL; 3093 if (Subtarget->hasFastFMAF32()) { 3094 const float c_exp = numbers::log2ef; 3095 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 3096 const float c_exp10 = 0x1.a934f0p+1f; 3097 const float cc_exp10 = 0x1.2f346ep-24f; 3098 3099 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT); 3100 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT); 3101 3102 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags); 3103 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags); 3104 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags); 3105 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags); 3106 } else { 3107 const float ch_exp = 0x1.714000p+0f; 3108 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3109 3110 const float ch_exp10 = 0x1.a92000p+1f; 3111 const float cl_exp10 = 0x1.4f0978p-11f; 3112 3113 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT); 3114 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT); 3115 3116 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X); 3117 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32); 3118 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst); 3119 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt); 3120 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags); 3121 3122 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags); 3123 3124 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags); 3125 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags); 3126 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags); 3127 } 3128 3129 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags); 3130 3131 // It is unsafe to contract this fsub into the PH multiply. 3132 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract); 3133 3134 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags); 3135 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E); 3136 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags); 3137 3138 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags); 3139 3140 SDValue UnderflowCheckConst = 3141 DAG.getConstantFP(IsExp10 ? 
-0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT); 3142 3143 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 3144 SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 3145 SDValue Underflow = 3146 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT); 3147 3148 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R); 3149 const auto &Options = getTargetMachine().Options; 3150 3151 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) { 3152 SDValue OverflowCheckConst = 3153 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT); 3154 SDValue Overflow = 3155 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT); 3156 SDValue Inf = 3157 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT); 3158 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R); 3159 } 3160 3161 return R; 3162 } 3163 3164 static bool isCtlzOpc(unsigned Opc) { 3165 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; 3166 } 3167 3168 static bool isCttzOpc(unsigned Opc) { 3169 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; 3170 } 3171 3172 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op, 3173 SelectionDAG &DAG) const { 3174 auto SL = SDLoc(Op); 3175 auto Opc = Op.getOpcode(); 3176 auto Arg = Op.getOperand(0u); 3177 auto ResultVT = Op.getValueType(); 3178 3179 if (ResultVT != MVT::i8 && ResultVT != MVT::i16) 3180 return {}; 3181 3182 assert(isCtlzOpc(Opc)); 3183 assert(ResultVT == Arg.getValueType()); 3184 3185 const uint64_t NumBits = ResultVT.getFixedSizeInBits(); 3186 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32); 3187 SDValue NewOp; 3188 3189 if (Opc == ISD::CTLZ_ZERO_UNDEF) { 3190 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg); 3191 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits); 3192 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp); 3193 } else { 3194 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg); 3195 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp); 3196 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits); 3197 } 3198 3199 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp); 3200 } 3201 3202 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { 3203 SDLoc SL(Op); 3204 SDValue Src = Op.getOperand(0); 3205 3206 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode())); 3207 bool Ctlz = isCtlzOpc(Op.getOpcode()); 3208 unsigned NewOpc = Ctlz ? 
AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3209
3210 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3211 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3212 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3213
3214 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3215 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3216 // (cttz hi:lo) -> (umin (ffbl src), 32)
3217 // (ctlz_zero_undef src) -> (ffbh src)
3218 // (cttz_zero_undef src) -> (ffbl src)
3219
3220 // The 64-bit scalar version produces a 32-bit result:
3221 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3222 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3223 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3224 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3225 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3226 if (!ZeroUndef) {
3227 const SDValue ConstVal = DAG.getConstant(
3228 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3229 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3230 }
3231 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3232 }
3233
3234 SDValue Lo, Hi;
3235 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3236
3237 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3238 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3239
3240 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3241 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3242 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3243 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3244
3245 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3246 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3247 if (Ctlz)
3248 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3249 else
3250 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3251
3252 SDValue NewOpr;
3253 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3254 if (!ZeroUndef) {
3255 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3256 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3257 }
3258
3259 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3260 }
3261
3262 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3263 bool Signed) const {
3264 // The regular method of converting a 64-bit integer to a float roughly
3265 // consists of two steps: normalization and rounding. In fact, after
3266 // normalization, converting a 64-bit integer is essentially the same as
3267 // converting a 32-bit integer; the only difference is that there are more
3268 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3269 // 64-bit integer can be preprocessed to fit into a 32-bit integer and then
3270 // converted into the correct float number. The basic steps for the
3271 // unsigned conversion are illustrated in the following pseudo code:
3272 //
3273 // f32 uitofp(i64 u) {
3274 // i32 hi, lo = split(u);
3275 // // Only count the leading zeros in hi as we have native support of the
3276 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3277 // // reduced to a 32-bit one automatically.
3278 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3279 // u <<= shamt;
3280 // hi, lo = split(u);
3281 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3282 // // Convert it as a 32-bit integer and scale the result back.
3283 // return uitofp(hi) * 2^(32 - shamt);
3284 // }
3285 //
3286 // The signed version follows the same principle but uses 'ffbh_i32' to count
3287 // its sign bits instead. If 'ffbh_i32' is not available, the absolute value
3288 // is converted instead, followed by negation based on its sign bit.
3289
3290 SDLoc SL(Op);
3291 SDValue Src = Op.getOperand(0);
3292
3293 SDValue Lo, Hi;
3294 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3295 SDValue Sign;
3296 SDValue ShAmt;
3297 if (Signed && Subtarget->isGCN()) {
3298 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3299 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3300 // account. That is, the maximal shift is
3301 // - 32 if Lo and Hi have opposite signs;
3302 // - 33 if Lo and Hi have the same sign.
3303 //
3304 // Or, MaxShAmt = 33 + OppositeSign, where
3305 //
3306 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3307 // - -1 if Lo and Hi have opposite signs; and
3308 // - 0 otherwise.
3309 //
3310 // All in all, ShAmt is calculated as
3311 //
3312 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3313 //
3314 // or
3315 //
3316 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3317 //
3318 // to reduce the critical path.
3319 SDValue OppositeSign = DAG.getNode(
3320 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3321 DAG.getConstant(31, SL, MVT::i32));
3322 SDValue MaxShAmt =
3323 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3324 OppositeSign);
3325 // Count the leading sign bits.
3326 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3327 // Different from unsigned conversion, the shift should be one bit less to
3328 // preserve the sign bit.
3329 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3330 DAG.getConstant(1, SL, MVT::i32));
3331 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3332 } else {
3333 if (Signed) {
3334 // Without 'ffbh_i32', only leading zeros can be counted. Take the
3335 // absolute value first.
3336 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3337 DAG.getConstant(63, SL, MVT::i64));
3338 SDValue Abs =
3339 DAG.getNode(ISD::XOR, SL, MVT::i64,
3340 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3341 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3342 }
3343 // Count the leading zeros.
3344 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3345 // The shift amount for signed integers is [0, 32].
3346 }
3347 // Normalize the given 64-bit integer.
3348 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3349 // Split it again.
3350 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3351 // Calculate the adjust bit for rounding.
3352 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3353 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3354 DAG.getConstant(1, SL, MVT::i32), Lo);
3355 // Get the 32-bit normalized integer.
3356 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3357 // Convert the normalized 32-bit integer into f32.
3358 unsigned Opc =
3359 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3360 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3361
3362 // Finally, we need to scale back the converted floating point number, as
3363 // the original 64-bit integer was converted as a 32-bit one.
3364 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3365 ShAmt);
3366 // On GCN, use LDEXP directly.
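// A worked example of the pseudo code above: for u = 2^32 we get
// shamt = clz(hi) = 31, so u << 31 has hi = 0x80000000 and lo = 0;
// uitofp(hi) = 2^31, and scaling by 2^(32 - 31) recovers exactly 2^32.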
3367 if (Subtarget->isGCN()) 3368 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt); 3369 3370 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent 3371 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit 3372 // exponent is enough to avoid overflowing into the sign bit. 3373 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt, 3374 DAG.getConstant(23, SL, MVT::i32)); 3375 SDValue IVal = 3376 DAG.getNode(ISD::ADD, SL, MVT::i32, 3377 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp); 3378 if (Signed) { 3379 // Set the sign bit. 3380 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32, 3381 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign), 3382 DAG.getConstant(31, SL, MVT::i32)); 3383 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign); 3384 } 3385 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal); 3386 } 3387 3388 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, 3389 bool Signed) const { 3390 SDLoc SL(Op); 3391 SDValue Src = Op.getOperand(0); 3392 3393 SDValue Lo, Hi; 3394 std::tie(Lo, Hi) = split64BitValue(Src, DAG); 3395 3396 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, 3397 SL, MVT::f64, Hi); 3398 3399 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); 3400 3401 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi, 3402 DAG.getConstant(32, SL, MVT::i32)); 3403 // TODO: Should this propagate fast-math-flags? 3404 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); 3405 } 3406 3407 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, 3408 SelectionDAG &DAG) const { 3409 // TODO: Factor out code common with LowerSINT_TO_FP. 3410 EVT DestVT = Op.getValueType(); 3411 SDValue Src = Op.getOperand(0); 3412 EVT SrcVT = Src.getValueType(); 3413 3414 if (SrcVT == MVT::i16) { 3415 if (DestVT == MVT::f16) 3416 return Op; 3417 SDLoc DL(Op); 3418 3419 // Promote src to i32 3420 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); 3421 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext); 3422 } 3423 3424 if (DestVT == MVT::bf16) { 3425 SDLoc SL(Op); 3426 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src); 3427 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); 3428 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); 3429 } 3430 3431 if (SrcVT != MVT::i64) 3432 return Op; 3433 3434 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { 3435 SDLoc DL(Op); 3436 3437 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); 3438 SDValue FPRoundFlag = 3439 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true); 3440 SDValue FPRound = 3441 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); 3442 3443 return FPRound; 3444 } 3445 3446 if (DestVT == MVT::f32) 3447 return LowerINT_TO_FP32(Op, DAG, false); 3448 3449 assert(DestVT == MVT::f64); 3450 return LowerINT_TO_FP64(Op, DAG, false); 3451 } 3452 3453 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, 3454 SelectionDAG &DAG) const { 3455 EVT DestVT = Op.getValueType(); 3456 3457 SDValue Src = Op.getOperand(0); 3458 EVT SrcVT = Src.getValueType(); 3459 3460 if (SrcVT == MVT::i16) { 3461 if (DestVT == MVT::f16) 3462 return Op; 3463 3464 SDLoc DL(Op); 3465 // Promote src to i32 3466 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src); 3467 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext); 3468 } 3469 3470 if (DestVT == MVT::bf16) { 3471 SDLoc SL(Op); 3472 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, 
Src);
3473 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3474 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3475 }
3476
3477 if (SrcVT != MVT::i64)
3478 return Op;
3479
3480 // TODO: Factor out code common with LowerUINT_TO_FP.
3481
3482 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3483 SDLoc DL(Op);
3484 SDValue Src = Op.getOperand(0);
3485
3486 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3487 SDValue FPRoundFlag =
3488 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3489 SDValue FPRound =
3490 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3491
3492 return FPRound;
3493 }
3494
3495 if (DestVT == MVT::f32)
3496 return LowerINT_TO_FP32(Op, DAG, true);
3497
3498 assert(DestVT == MVT::f64);
3499 return LowerINT_TO_FP64(Op, DAG, true);
3500 }
3501
3502 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3503 bool Signed) const {
3504 SDLoc SL(Op);
3505
3506 SDValue Src = Op.getOperand(0);
3507 EVT SrcVT = Src.getValueType();
3508
3509 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3510
3511 // The basic idea of converting a floating point number into a pair of 32-bit
3512 // integers is illustrated as follows:
3513 //
3514 // tf := trunc(val);
3515 // hif := floor(tf * 2^-32);
3516 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3517 // hi := fptoi(hif);
3518 // lo := fptoi(lof);
3519 //
3520 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3521 SDValue Sign;
3522 if (Signed && SrcVT == MVT::f32) {
3523 // However, a 32-bit floating point number has only a 23-bit mantissa,
3524 // which is not enough to hold all the significant bits of `lof` if val is
3525 // negative. To avoid the loss of precision, we need to take the absolute
3526 // value after truncating and flip the result back based on the original
3527 // signedness.
3528 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3529 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3530 DAG.getConstant(31, SL, MVT::i32));
3531 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3532 }
3533
3534 SDValue K0, K1;
3535 if (SrcVT == MVT::f64) {
3536 K0 = DAG.getConstantFP(
3537 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3538 SrcVT);
3539 K1 = DAG.getConstantFP(
3540 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3541 SrcVT);
3542 } else {
3543 K0 = DAG.getConstantFP(
3544 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3545 K1 = DAG.getConstantFP(
3546 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3547 }
3548 // TODO: Should this propagate fast-math-flags?
3549 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3550
3551 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3552
3553 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3554
3555 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3556 : ISD::FP_TO_UINT,
3557 SL, MVT::i32, FloorMul);
3558 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3559
3560 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3561 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3562
3563 if (Signed && SrcVT == MVT::f32) {
3564 assert(Sign);
3565 // Flip the result based on the signedness, which is either all 0s or 1s.
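// (This is the usual branch-free conditional negation: with Sign being all
// 0s or all 1s, (r ^ s) - s gives r when s == 0 and ~r + 1 == -r when
// s == -1.)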
3566 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64, 3567 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign})); 3568 // r := xor(r, sign) - sign; 3569 Result = 3570 DAG.getNode(ISD::SUB, SL, MVT::i64, 3571 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign); 3572 } 3573 3574 return Result; 3575 } 3576 3577 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { 3578 SDLoc DL(Op); 3579 SDValue N0 = Op.getOperand(0); 3580 3581 // Convert to target node to get known bits 3582 if (N0.getValueType() == MVT::f32) 3583 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); 3584 3585 if (getTargetMachine().Options.UnsafeFPMath) { 3586 // There is a generic expand for FP_TO_FP16 with unsafe fast math. 3587 return SDValue(); 3588 } 3589 3590 return LowerF64ToF16Safe(N0, DL, DAG); 3591 } 3592 3593 // return node in i32 3594 SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, 3595 SelectionDAG &DAG) const { 3596 assert(Src.getSimpleValueType() == MVT::f64); 3597 3598 // f64 -> f16 conversion using round-to-nearest-even rounding mode. 3599 // TODO: We can generate better code for True16. 3600 const unsigned ExpMask = 0x7ff; 3601 const unsigned ExpBiasf64 = 1023; 3602 const unsigned ExpBiasf16 = 15; 3603 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 3604 SDValue One = DAG.getConstant(1, DL, MVT::i32); 3605 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src); 3606 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U, 3607 DAG.getConstant(32, DL, MVT::i64)); 3608 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32); 3609 U = DAG.getZExtOrTrunc(U, DL, MVT::i32); 3610 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3611 DAG.getConstant(20, DL, MVT::i64)); 3612 E = DAG.getNode(ISD::AND, DL, MVT::i32, E, 3613 DAG.getConstant(ExpMask, DL, MVT::i32)); 3614 // Subtract the fp64 exponent bias (1023) to get the real exponent and 3615 // add the f16 bias (15) to get the biased exponent for the f16 format. 3616 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E, 3617 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32)); 3618 3619 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3620 DAG.getConstant(8, DL, MVT::i32)); 3621 M = DAG.getNode(ISD::AND, DL, MVT::i32, M, 3622 DAG.getConstant(0xffe, DL, MVT::i32)); 3623 3624 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH, 3625 DAG.getConstant(0x1ff, DL, MVT::i32)); 3626 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U); 3627 3628 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ); 3629 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set); 3630 3631 // (M != 0 ? 
0x0200 : 0) | 0x7c00; 3632 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32, 3633 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32), 3634 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32)); 3635 3636 // N = M | (E << 12); 3637 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M, 3638 DAG.getNode(ISD::SHL, DL, MVT::i32, E, 3639 DAG.getConstant(12, DL, MVT::i32))); 3640 3641 // B = clamp(1-E, 0, 13); 3642 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, 3643 One, E); 3644 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero); 3645 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B, 3646 DAG.getConstant(13, DL, MVT::i32)); 3647 3648 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M, 3649 DAG.getConstant(0x1000, DL, MVT::i32)); 3650 3651 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B); 3652 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B); 3653 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE); 3654 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1); 3655 3656 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT); 3657 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V, 3658 DAG.getConstant(0x7, DL, MVT::i32)); 3659 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V, 3660 DAG.getConstant(2, DL, MVT::i32)); 3661 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32), 3662 One, Zero, ISD::SETEQ); 3663 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32), 3664 One, Zero, ISD::SETGT); 3665 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1); 3666 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1); 3667 3668 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32), 3669 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT); 3670 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32), 3671 I, V, ISD::SETEQ); 3672 3673 // Extract the sign bit. 3674 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3675 DAG.getConstant(16, DL, MVT::i32)); 3676 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign, 3677 DAG.getConstant(0x8000, DL, MVT::i32)); 3678 3679 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); 3680 } 3681 3682 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op, 3683 SelectionDAG &DAG) const { 3684 SDValue Src = Op.getOperand(0); 3685 unsigned OpOpcode = Op.getOpcode(); 3686 EVT SrcVT = Src.getValueType(); 3687 EVT DestVT = Op.getValueType(); 3688 3689 // Will be selected natively 3690 if (SrcVT == MVT::f16 && DestVT == MVT::i16) 3691 return Op; 3692 3693 if (SrcVT == MVT::bf16) { 3694 SDLoc DL(Op); 3695 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); 3696 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc); 3697 } 3698 3699 // Promote i16 to i32 3700 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { 3701 SDLoc DL(Op); 3702 3703 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src); 3704 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32); 3705 } 3706 3707 if (DestVT != MVT::i64) 3708 return Op; 3709 3710 if (SrcVT == MVT::f16 || 3711 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) { 3712 SDLoc DL(Op); 3713 3714 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src); 3715 unsigned Ext = 3716 OpOpcode == ISD::FP_TO_SINT ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3717 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32); 3718 } 3719 3720 if (SrcVT == MVT::f32 || SrcVT == MVT::f64) 3721 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT); 3722 3723 return SDValue(); 3724 } 3725 3726 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 3727 SelectionDAG &DAG) const { 3728 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 3729 MVT VT = Op.getSimpleValueType(); 3730 MVT ScalarVT = VT.getScalarType(); 3731 3732 assert(VT.isVector()); 3733 3734 SDValue Src = Op.getOperand(0); 3735 SDLoc DL(Op); 3736 3737 // TODO: Don't scalarize on Evergreen? 3738 unsigned NElts = VT.getVectorNumElements(); 3739 SmallVector<SDValue, 8> Args; 3740 DAG.ExtractVectorElements(Src, Args, 0, NElts); 3741 3742 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); 3743 for (unsigned I = 0; I < NElts; ++I) 3744 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); 3745 3746 return DAG.getBuildVector(VT, DL, Args); 3747 } 3748 3749 //===----------------------------------------------------------------------===// 3750 // Custom DAG optimizations 3751 //===----------------------------------------------------------------------===// 3752 3753 static bool isU24(SDValue Op, SelectionDAG &DAG) { 3754 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24; 3755 } 3756 3757 static bool isI24(SDValue Op, SelectionDAG &DAG) { 3758 EVT VT = Op.getValueType(); 3759 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated 3760 // as unsigned 24-bit values. 3761 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24; 3762 } 3763 3764 static SDValue simplifyMul24(SDNode *Node24, 3765 TargetLowering::DAGCombinerInfo &DCI) { 3766 SelectionDAG &DAG = DCI.DAG; 3767 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 3768 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; 3769 3770 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); 3771 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); 3772 unsigned NewOpcode = Node24->getOpcode(); 3773 if (IsIntrin) { 3774 unsigned IID = Node24->getConstantOperandVal(0); 3775 switch (IID) { 3776 case Intrinsic::amdgcn_mul_i24: 3777 NewOpcode = AMDGPUISD::MUL_I24; 3778 break; 3779 case Intrinsic::amdgcn_mul_u24: 3780 NewOpcode = AMDGPUISD::MUL_U24; 3781 break; 3782 case Intrinsic::amdgcn_mulhi_i24: 3783 NewOpcode = AMDGPUISD::MULHI_I24; 3784 break; 3785 case Intrinsic::amdgcn_mulhi_u24: 3786 NewOpcode = AMDGPUISD::MULHI_U24; 3787 break; 3788 default: 3789 llvm_unreachable("Expected 24-bit mul intrinsic"); 3790 } 3791 } 3792 3793 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); 3794 3795 // First try to simplify using SimplifyMultipleUseDemandedBits which allows 3796 // the operands to have other uses, but will only perform simplifications that 3797 // involve bypassing some nodes for this user. 3798 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG); 3799 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG); 3800 if (DemandedLHS || DemandedRHS) 3801 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), 3802 DemandedLHS ? DemandedLHS : LHS, 3803 DemandedRHS ? DemandedRHS : RHS); 3804 3805 // Now try SimplifyDemandedBits which can simplify the nodes used by our 3806 // operands if this node is the only user. 
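// For example, if LHS is (and x, 0xffffff) with no other uses, the mask is
// redundant under the 24 demanded bits and SimplifyDemandedBits strips the
// and; returning SDValue(Node24, 0) then signals that the node was updated
// in place.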
3807 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI)) 3808 return SDValue(Node24, 0); 3809 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI)) 3810 return SDValue(Node24, 0); 3811 3812 return SDValue(); 3813 } 3814 3815 template <typename IntTy> 3816 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, 3817 uint32_t Width, const SDLoc &DL) { 3818 if (Width + Offset < 32) { 3819 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); 3820 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); 3821 if constexpr (std::is_signed_v<IntTy>) { 3822 return DAG.getSignedConstant(Result, DL, MVT::i32); 3823 } else { 3824 return DAG.getConstant(Result, DL, MVT::i32); 3825 } 3826 } 3827 3828 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); 3829 } 3830 3831 static bool hasVolatileUser(SDNode *Val) { 3832 for (SDNode *U : Val->users()) { 3833 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) { 3834 if (M->isVolatile()) 3835 return true; 3836 } 3837 } 3838 3839 return false; 3840 } 3841 3842 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { 3843 // i32 vectors are the canonical memory type. 3844 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT)) 3845 return false; 3846 3847 if (!VT.isByteSized()) 3848 return false; 3849 3850 unsigned Size = VT.getStoreSize(); 3851 3852 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector()) 3853 return false; 3854 3855 if (Size == 3 || (Size > 4 && (Size % 4 != 0))) 3856 return false; 3857 3858 return true; 3859 } 3860 3861 // Replace load of an illegal type with a bitcast from a load of a friendlier 3862 // type. 3863 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, 3864 DAGCombinerInfo &DCI) const { 3865 if (!DCI.isBeforeLegalize()) 3866 return SDValue(); 3867 3868 LoadSDNode *LN = cast<LoadSDNode>(N); 3869 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN)) 3870 return SDValue(); 3871 3872 SDLoc SL(N); 3873 SelectionDAG &DAG = DCI.DAG; 3874 EVT VT = LN->getMemoryVT(); 3875 3876 unsigned Size = VT.getStoreSize(); 3877 Align Alignment = LN->getAlign(); 3878 if (Alignment < Size && isTypeLegal(VT)) { 3879 unsigned IsFast; 3880 unsigned AS = LN->getAddressSpace(); 3881 3882 // Expand unaligned loads earlier than legalization. Due to visitation order 3883 // problems during legalization, the emitted instructions to pack and unpack 3884 // the bytes again are not eliminated in the case of an unaligned copy. 3885 if (!allowsMisalignedMemoryAccesses( 3886 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) { 3887 if (VT.isVector()) 3888 return SplitVectorLoad(SDValue(LN, 0), DAG); 3889 3890 SDValue Ops[2]; 3891 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); 3892 3893 return DAG.getMergeValues(Ops, SDLoc(N)); 3894 } 3895 3896 if (!IsFast) 3897 return SDValue(); 3898 } 3899 3900 if (!shouldCombineMemoryType(VT)) 3901 return SDValue(); 3902 3903 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 3904 3905 SDValue NewLoad 3906 = DAG.getLoad(NewVT, SL, LN->getChain(), 3907 LN->getBasePtr(), LN->getMemOperand()); 3908 3909 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad); 3910 DCI.CombineTo(N, BC, NewLoad.getValue(1)); 3911 return SDValue(N, 0); 3912 } 3913 3914 // Replace store of an illegal type with a store of a bitcast to a friendlier 3915 // type. 
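// For example, a v4i8 store is rewritten as a store of its i32 bitcast, and
// an i96 store becomes a v3i32 store (see getEquivalentMemType).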
3916 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, 3917 DAGCombinerInfo &DCI) const { 3918 if (!DCI.isBeforeLegalize()) 3919 return SDValue(); 3920 3921 StoreSDNode *SN = cast<StoreSDNode>(N); 3922 if (!SN->isSimple() || !ISD::isNormalStore(SN)) 3923 return SDValue(); 3924 3925 EVT VT = SN->getMemoryVT(); 3926 unsigned Size = VT.getStoreSize(); 3927 3928 SDLoc SL(N); 3929 SelectionDAG &DAG = DCI.DAG; 3930 Align Alignment = SN->getAlign(); 3931 if (Alignment < Size && isTypeLegal(VT)) { 3932 unsigned IsFast; 3933 unsigned AS = SN->getAddressSpace(); 3934 3935 // Expand unaligned stores earlier than legalization. Due to visitation 3936 // order problems during legalization, the emitted instructions to pack and 3937 // unpack the bytes again are not eliminated in the case of an unaligned 3938 // copy. 3939 if (!allowsMisalignedMemoryAccesses( 3940 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) { 3941 if (VT.isVector()) 3942 return SplitVectorStore(SDValue(SN, 0), DAG); 3943 3944 return expandUnalignedStore(SN, DAG); 3945 } 3946 3947 if (!IsFast) 3948 return SDValue(); 3949 } 3950 3951 if (!shouldCombineMemoryType(VT)) 3952 return SDValue(); 3953 3954 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 3955 SDValue Val = SN->getValue(); 3956 3957 //DCI.AddToWorklist(Val.getNode()); 3958 3959 bool OtherUses = !Val.hasOneUse(); 3960 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val); 3961 if (OtherUses) { 3962 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal); 3963 DAG.ReplaceAllUsesOfValueWith(Val, CastBack); 3964 } 3965 3966 return DAG.getStore(SN->getChain(), SL, CastVal, 3967 SN->getBasePtr(), SN->getMemOperand()); 3968 } 3969 3970 // FIXME: This should go in generic DAG combiner with an isTruncateFree check, 3971 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU 3972 // issues. 3973 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, 3974 DAGCombinerInfo &DCI) const { 3975 SelectionDAG &DAG = DCI.DAG; 3976 SDValue N0 = N->getOperand(0); 3977 3978 // (vt2 (assertzext (truncate vt0:x), vt1)) -> 3979 // (vt2 (truncate (assertzext vt0:x, vt1))) 3980 if (N0.getOpcode() == ISD::TRUNCATE) { 3981 SDValue N1 = N->getOperand(1); 3982 EVT ExtVT = cast<VTSDNode>(N1)->getVT(); 3983 SDLoc SL(N); 3984 3985 SDValue Src = N0.getOperand(0); 3986 EVT SrcVT = Src.getValueType(); 3987 if (SrcVT.bitsGE(ExtVT)) { 3988 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); 3989 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); 3990 } 3991 } 3992 3993 return SDValue(); 3994 } 3995 3996 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( 3997 SDNode *N, DAGCombinerInfo &DCI) const { 3998 unsigned IID = N->getConstantOperandVal(0); 3999 switch (IID) { 4000 case Intrinsic::amdgcn_mul_i24: 4001 case Intrinsic::amdgcn_mul_u24: 4002 case Intrinsic::amdgcn_mulhi_i24: 4003 case Intrinsic::amdgcn_mulhi_u24: 4004 return simplifyMul24(N, DCI); 4005 case Intrinsic::amdgcn_fract: 4006 case Intrinsic::amdgcn_rsq: 4007 case Intrinsic::amdgcn_rcp_legacy: 4008 case Intrinsic::amdgcn_rsq_legacy: 4009 case Intrinsic::amdgcn_rsq_clamp: 4010 case Intrinsic::amdgcn_tanh: { 4011 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted 4012 SDValue Src = N->getOperand(1); 4013 return Src.isUndef() ? 
Src : SDValue();
4014 }
4015 case Intrinsic::amdgcn_frexp_exp: {
4016 // frexp_exp (fneg x) -> frexp_exp x
4017 // frexp_exp (fabs x) -> frexp_exp x
4018 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4019 SDValue Src = N->getOperand(1);
4020 SDValue PeekSign = peekFPSignOps(Src);
4021 if (PeekSign == Src)
4022 return SDValue();
4023 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4024 0);
4025 }
4026 default:
4027 return SDValue();
4028 }
4029 }
4030
4031 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4032 /// binary operation \p Opc to it with the corresponding constant operands.
4033 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4034 DAGCombinerInfo &DCI, const SDLoc &SL,
4035 unsigned Opc, SDValue LHS,
4036 uint32_t ValLo, uint32_t ValHi) const {
4037 SelectionDAG &DAG = DCI.DAG;
4038 SDValue Lo, Hi;
4039 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4040
4041 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4042 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4043
4044 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4045 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4046
4047 // Re-visit the ands. It's possible we eliminated one of them and it could
4048 // simplify the vector.
4049 DCI.AddToWorklist(Lo.getNode());
4050 DCI.AddToWorklist(Hi.getNode());
4051
4052 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4053 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4054 }
4055
4056 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4057 DAGCombinerInfo &DCI) const {
4058 EVT VT = N->getValueType(0);
4059 SDValue LHS = N->getOperand(0);
4060 SDValue RHS = N->getOperand(1);
4061 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4062 SDLoc SL(N);
4063 SelectionDAG &DAG = DCI.DAG;
4064
4065 unsigned RHSVal;
4066 if (CRHS) {
4067 RHSVal = CRHS->getZExtValue();
4068 if (!RHSVal)
4069 return LHS;
4070
4071 switch (LHS->getOpcode()) {
4072 default:
4073 break;
4074 case ISD::ZERO_EXTEND:
4075 case ISD::SIGN_EXTEND:
4076 case ISD::ANY_EXTEND: {
4077 SDValue X = LHS->getOperand(0);
4078
4079 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4080 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4081 // Prefer build_vector as the canonical form if packed types are legal.
4082 // (shl ([asz]ext i16:x), 16) -> (build_vector 0, x)
4083 SDValue Vec = DAG.getBuildVector(
4084 MVT::v2i16, SL,
4085 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4086 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4087 }
4088
4089 // shl (ext x) => zext (shl x), if shift does not overflow int
4090 if (VT != MVT::i64)
4091 break;
4092 KnownBits Known = DAG.computeKnownBits(X);
4093 unsigned LZ = Known.countMinLeadingZeros();
4094 if (LZ < RHSVal)
4095 break;
4096 EVT XVT = X.getValueType();
4097 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4098 return DAG.getZExtOrTrunc(Shl, SL, VT);
4099 }
4100 }
4101 }
4102
4103 if (VT.getScalarType() != MVT::i64)
4104 return SDValue();
4105
4106 // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
4107
4108 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4109 // common case, splitting this into a move and a 32-bit shift is faster and
4110 // the same code size.
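// For example, (shl i64:x, 40) becomes
// (bitcast (build_vector 0, (shl (trunc x), 8))): the low word is known to
// be zero, and only a single 32-bit shift of the low half of x remains.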
4111 KnownBits Known = DAG.computeKnownBits(RHS);
4112
4113 EVT ElementType = VT.getScalarType();
4114 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4115 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4116 : TargetScalarType;
4117
4118 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4119 return SDValue();
4120 SDValue ShiftAmt;
4121
4122 if (CRHS) {
4123 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4124 TargetType);
4125 } else {
4126 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4127 const SDValue ShiftMask =
4128 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4129 // This AND instruction will clamp out of bounds shift values.
4130 // It will also be removed during later instruction selection.
4131 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4132 }
4133
4134 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4135 SDValue NewShift =
4136 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4137
4138 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4139 SDValue Vec;
4140
4141 if (VT.isVector()) {
4142 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4143 unsigned NElts = TargetType.getVectorNumElements();
4144 SmallVector<SDValue, 8> HiOps;
4145 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4146
4147 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4148 for (unsigned I = 0; I != NElts; ++I)
4149 HiAndLoOps[2 * I + 1] = HiOps[I];
4150 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4151 } else {
4152 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4153 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4154 }
4155 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4156 }
4157
4158 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4159 DAGCombinerInfo &DCI) const {
4160 SDValue RHS = N->getOperand(1);
4161 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4162 EVT VT = N->getValueType(0);
4163 SDValue LHS = N->getOperand(0);
4164 SelectionDAG &DAG = DCI.DAG;
4165 SDLoc SL(N);
4166
4167 if (VT.getScalarType() != MVT::i64)
4168 return SDValue();
4169
4170 // For C >= 32
4171 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
4172
4173 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4174 // common case, splitting this into a move and a 32-bit shift is faster and
4175 // the same code size.
4176 KnownBits Known = DAG.computeKnownBits(RHS);
4177
4178 EVT ElementType = VT.getScalarType();
4179 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4180 EVT TargetType = VT.isVector() ?
VT.changeVectorElementType(TargetScalarType) 4181 : TargetScalarType; 4182 4183 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits()) 4184 return SDValue(); 4185 4186 SDValue ShiftFullAmt = 4187 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType); 4188 SDValue ShiftAmt; 4189 if (CRHS) { 4190 unsigned RHSVal = CRHS->getZExtValue(); 4191 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL, 4192 TargetType); 4193 } else if (Known.getMinValue().getZExtValue() == 4194 (ElementType.getSizeInBits() - 1)) { 4195 ShiftAmt = ShiftFullAmt; 4196 } else { 4197 SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS); 4198 const SDValue ShiftMask = 4199 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType); 4200 // This AND instruction will clamp out of bounds shift values. 4201 // It will also be removed during later instruction selection. 4202 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask); 4203 } 4204 4205 EVT ConcatType; 4206 SDValue Hi; 4207 SDLoc LHSSL(LHS); 4208 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi 4209 if (VT.isVector()) { 4210 unsigned NElts = TargetType.getVectorNumElements(); 4211 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext()); 4212 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS); 4213 SmallVector<SDValue, 8> HiOps(NElts); 4214 SmallVector<SDValue, 16> HiAndLoOps; 4215 4216 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2); 4217 for (unsigned I = 0; I != NElts; ++I) { 4218 HiOps[I] = HiAndLoOps[2 * I + 1]; 4219 } 4220 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps); 4221 } else { 4222 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType); 4223 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2); 4224 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS); 4225 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One); 4226 } 4227 4228 KnownBits KnownLHS = DAG.computeKnownBits(LHS); 4229 SDValue HiShift; 4230 if (KnownLHS.isNegative()) { 4231 HiShift = DAG.getAllOnesConstant(SL, TargetType); 4232 } else { 4233 Hi = DAG.getFreeze(Hi); 4234 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt); 4235 } 4236 SDValue NewShift = 4237 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags()); 4238 4239 SDValue Vec; 4240 if (VT.isVector()) { 4241 unsigned NElts = TargetType.getVectorNumElements(); 4242 SmallVector<SDValue, 8> HiOps; 4243 SmallVector<SDValue, 8> LoOps; 4244 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2); 4245 4246 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts); 4247 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts); 4248 for (unsigned I = 0; I != NElts; ++I) { 4249 HiAndLoOps[2 * I + 1] = HiOps[I]; 4250 HiAndLoOps[2 * I] = LoOps[I]; 4251 } 4252 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps); 4253 } else { 4254 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift}); 4255 } 4256 return DAG.getNode(ISD::BITCAST, SL, VT, Vec); 4257 } 4258 4259 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, 4260 DAGCombinerInfo &DCI) const { 4261 SDValue RHS = N->getOperand(1); 4262 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 4263 EVT VT = N->getValueType(0); 4264 SDValue LHS = N->getOperand(0); 4265 SelectionDAG &DAG = DCI.DAG; 4266 SDLoc SL(N); 4267 unsigned RHSVal; 4268 4269 if (CRHS) { 4270 RHSVal = CRHS->getZExtValue(); 4271 4272 // fold (srl (and 
x, c1 << c2), c2) -> (and (srl x, c2), c1)
4273 // This improves the ability to match BFE patterns in isel.
4274 if (LHS.getOpcode() == ISD::AND) {
4275 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4276 unsigned MaskIdx, MaskLen;
4277 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4278 MaskIdx == RHSVal) {
4279 return DAG.getNode(ISD::AND, SL, VT,
4280 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4281 N->getOperand(1)),
4282 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4283 N->getOperand(1)));
4284 }
4285 }
4286 }
4287 }
4288
4289 if (VT.getScalarType() != MVT::i64)
4290 return SDValue();
4291
4292 // For C >= 32
4293 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4294
4295 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4296 // common case, splitting this into a move and a 32-bit shift is faster and
4297 // the same code size.
4298 KnownBits Known = DAG.computeKnownBits(RHS);
4299
4300 EVT ElementType = VT.getScalarType();
4301 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4302 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4303 : TargetScalarType;
4304
4305 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4306 return SDValue();
4307
4308 SDValue ShiftAmt;
4309 if (CRHS) {
4310 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4311 TargetType);
4312 } else {
4313 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4314 const SDValue ShiftMask =
4315 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4316 // This AND instruction will clamp out of bounds shift values.
4317 // It will also be removed during later instruction selection.
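// (The mask is sound because the Known check above guarantees the shift
// amount is at least 32, and for amounts in [32, 63] the masked value is
// exactly ShiftAmt - 32.)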
4318 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask); 4319 } 4320 4321 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType); 4322 EVT ConcatType; 4323 SDValue Hi; 4324 SDLoc LHSSL(LHS); 4325 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi 4326 if (VT.isVector()) { 4327 unsigned NElts = TargetType.getVectorNumElements(); 4328 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext()); 4329 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS); 4330 SmallVector<SDValue, 8> HiOps(NElts); 4331 SmallVector<SDValue, 16> HiAndLoOps; 4332 4333 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2); 4334 for (unsigned I = 0; I != NElts; ++I) 4335 HiOps[I] = HiAndLoOps[2 * I + 1]; 4336 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps); 4337 } else { 4338 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType); 4339 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2); 4340 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS); 4341 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One); 4342 } 4343 4344 SDValue NewShift = 4345 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags()); 4346 4347 SDValue Vec; 4348 if (VT.isVector()) { 4349 unsigned NElts = TargetType.getVectorNumElements(); 4350 SmallVector<SDValue, 8> LoOps; 4351 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero); 4352 4353 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts); 4354 for (unsigned I = 0; I != NElts; ++I) 4355 HiAndLoOps[2 * I] = LoOps[I]; 4356 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps); 4357 } else { 4358 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero}); 4359 } 4360 return DAG.getNode(ISD::BITCAST, SL, VT, Vec); 4361 } 4362 4363 SDValue AMDGPUTargetLowering::performTruncateCombine( 4364 SDNode *N, DAGCombinerInfo &DCI) const { 4365 SDLoc SL(N); 4366 SelectionDAG &DAG = DCI.DAG; 4367 EVT VT = N->getValueType(0); 4368 SDValue Src = N->getOperand(0); 4369 4370 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x) 4371 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) { 4372 SDValue Vec = Src.getOperand(0); 4373 if (Vec.getOpcode() == ISD::BUILD_VECTOR) { 4374 SDValue Elt0 = Vec.getOperand(0); 4375 EVT EltVT = Elt0.getValueType(); 4376 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) { 4377 if (EltVT.isFloatingPoint()) { 4378 Elt0 = DAG.getNode(ISD::BITCAST, SL, 4379 EltVT.changeTypeToInteger(), Elt0); 4380 } 4381 4382 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0); 4383 } 4384 } 4385 } 4386 4387 // Equivalent of above for accessing the high element of a vector as an 4388 // integer operation. 
4389 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y) 4390 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) { 4391 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) { 4392 SDValue BV = stripBitcast(Src.getOperand(0)); 4393 if (BV.getOpcode() == ISD::BUILD_VECTOR) { 4394 EVT SrcEltVT = BV.getOperand(0).getValueType(); 4395 unsigned SrcEltSize = SrcEltVT.getSizeInBits(); 4396 unsigned BitIndex = K->getZExtValue(); 4397 unsigned PartIndex = BitIndex / SrcEltSize; 4398 4399 if (PartIndex * SrcEltSize == BitIndex && 4400 PartIndex < BV.getNumOperands()) { 4401 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) { 4402 SDValue SrcElt = 4403 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(), 4404 BV.getOperand(PartIndex)); 4405 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); 4406 } 4407 } 4408 } 4409 } 4410 } 4411 4412 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit. 4413 // 4414 // i16 (trunc (srl i64:x, K)), K <= 16 -> 4415 // i16 (trunc (srl (i32 (trunc x), K))) 4416 if (VT.getScalarSizeInBits() < 32) { 4417 EVT SrcVT = Src.getValueType(); 4418 if (SrcVT.getScalarSizeInBits() > 32 && 4419 (Src.getOpcode() == ISD::SRL || 4420 Src.getOpcode() == ISD::SRA || 4421 Src.getOpcode() == ISD::SHL)) { 4422 SDValue Amt = Src.getOperand(1); 4423 KnownBits Known = DAG.computeKnownBits(Amt); 4424 4425 // - For left shifts, do the transform as long as the shift 4426 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31) 4427 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid 4428 // losing information stored in the high bits when truncating. 4429 const unsigned MaxCstSize = 4430 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits()); 4431 if (Known.getMaxValue().ule(MaxCstSize)) { 4432 EVT MidVT = VT.isVector() ? 4433 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4434 VT.getVectorNumElements()) : MVT::i32; 4435 4436 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout()); 4437 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT, 4438 Src.getOperand(0)); 4439 DCI.AddToWorklist(Trunc.getNode()); 4440 4441 if (Amt.getValueType() != NewShiftVT) { 4442 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT); 4443 DCI.AddToWorklist(Amt.getNode()); 4444 } 4445 4446 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT, 4447 Trunc, Amt); 4448 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift); 4449 } 4450 } 4451 } 4452 4453 return SDValue(); 4454 } 4455 4456 // We need to specifically handle i64 mul here to avoid unnecessary conversion 4457 // instructions. If we only match on the legalized i64 mul expansion, 4458 // SimplifyDemandedBits will be unable to remove them because there will be 4459 // multiple uses due to the separate mul + mulh[su]. 4460 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, 4461 SDValue N0, SDValue N1, unsigned Size, bool Signed) { 4462 if (Size <= 32) { 4463 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 4464 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1); 4465 } 4466 4467 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 4468 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24; 4469 4470 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1); 4471 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1); 4472 4473 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi); 4474 } 4475 4476 /// If \p V is an add of a constant 1, returns the other operand. Otherwise 4477 /// return SDValue(). 
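/// For example, (add x, 1) yields x, while (add x, 2) or (or x, 1) yield a
/// null SDValue().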
4478 static SDValue getAddOneOp(const SDNode *V) { 4479 if (V->getOpcode() != ISD::ADD) 4480 return SDValue(); 4481 4482 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue(); 4483 } 4484 4485 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, 4486 DAGCombinerInfo &DCI) const { 4487 assert(N->getOpcode() == ISD::MUL); 4488 EVT VT = N->getValueType(0); 4489 4490 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4491 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4492 // unnecessarily). isDivergent() is used as an approximation of whether the 4493 // value is in an SGPR. 4494 if (!N->isDivergent()) 4495 return SDValue(); 4496 4497 unsigned Size = VT.getSizeInBits(); 4498 if (VT.isVector() || Size > 64) 4499 return SDValue(); 4500 4501 SelectionDAG &DAG = DCI.DAG; 4502 SDLoc DL(N); 4503 4504 SDValue N0 = N->getOperand(0); 4505 SDValue N1 = N->getOperand(1); 4506 4507 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad 4508 // matching. 4509 4510 // mul x, (add y, 1) -> add (mul x, y), x 4511 auto IsFoldableAdd = [](SDValue V) -> SDValue { 4512 SDValue AddOp = getAddOneOp(V.getNode()); 4513 if (!AddOp) 4514 return SDValue(); 4515 4516 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool { 4517 return U->getOpcode() == ISD::MUL; 4518 })) 4519 return AddOp; 4520 4521 return SDValue(); 4522 }; 4523 4524 // FIXME: The selection pattern is not properly checking for commuted 4525 // operands, so we have to place the mul in the LHS 4526 if (SDValue MulOper = IsFoldableAdd(N0)) { 4527 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper); 4528 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1); 4529 } 4530 4531 if (SDValue MulOper = IsFoldableAdd(N1)) { 4532 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper); 4533 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0); 4534 } 4535 4536 // There are i16 integer mul/mad. 4537 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) 4538 return SDValue(); 4539 4540 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends 4541 // in the source into any_extends if the result of the mul is truncated. Since 4542 // we can assume the high bits are whatever we want, use the underlying value 4543 // to avoid the unknown high bits from interfering. 4544 if (N0.getOpcode() == ISD::ANY_EXTEND) 4545 N0 = N0.getOperand(0); 4546 4547 if (N1.getOpcode() == ISD::ANY_EXTEND) 4548 N1 = N1.getOperand(0); 4549 4550 SDValue Mul; 4551 4552 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 4553 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4554 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4555 Mul = getMul24(DAG, DL, N0, N1, Size, false); 4556 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 4557 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4558 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4559 Mul = getMul24(DAG, DL, N0, N1, Size, true); 4560 } else { 4561 return SDValue(); 4562 } 4563 4564 // We need to use sext even for MUL_U24, because MUL_U24 is used 4565 // for signed multiply of 8 and 16-bit types. 
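// (Only the low VT bits of the 24-bit product need to be correct here: e.g.
// for i16, -1 * -1 reaches MUL_U24 zero-extended as 0xffff * 0xffff =
// 0xfffe0001, and truncation back to i16 still yields the expected 1.)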
4566 return DAG.getSExtOrTrunc(Mul, DL, VT); 4567 } 4568 4569 SDValue 4570 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N, 4571 DAGCombinerInfo &DCI) const { 4572 if (N->getValueType(0) != MVT::i32) 4573 return SDValue(); 4574 4575 SelectionDAG &DAG = DCI.DAG; 4576 SDLoc DL(N); 4577 4578 bool Signed = N->getOpcode() == ISD::SMUL_LOHI; 4579 SDValue N0 = N->getOperand(0); 4580 SDValue N1 = N->getOperand(1); 4581 4582 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends 4583 // in the source into any_extends if the result of the mul is truncated. Since 4584 // we can assume the high bits are whatever we want, use the underlying value 4585 // to avoid the unknown high bits from interfering. 4586 if (N0.getOpcode() == ISD::ANY_EXTEND) 4587 N0 = N0.getOperand(0); 4588 if (N1.getOpcode() == ISD::ANY_EXTEND) 4589 N1 = N1.getOperand(0); 4590 4591 // Try to use two fast 24-bit multiplies (one for each half of the result) 4592 // instead of one slow extending multiply. 4593 unsigned LoOpcode = 0; 4594 unsigned HiOpcode = 0; 4595 if (Signed) { 4596 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 4597 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4598 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4599 LoOpcode = AMDGPUISD::MUL_I24; 4600 HiOpcode = AMDGPUISD::MULHI_I24; 4601 } 4602 } else { 4603 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 4604 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4605 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4606 LoOpcode = AMDGPUISD::MUL_U24; 4607 HiOpcode = AMDGPUISD::MULHI_U24; 4608 } 4609 } 4610 if (!LoOpcode) 4611 return SDValue(); 4612 4613 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1); 4614 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1); 4615 DCI.CombineTo(N, Lo, Hi); 4616 return SDValue(N, 0); 4617 } 4618 4619 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, 4620 DAGCombinerInfo &DCI) const { 4621 EVT VT = N->getValueType(0); 4622 4623 if (!Subtarget->hasMulI24() || VT.isVector()) 4624 return SDValue(); 4625 4626 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4627 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4628 // unnecessarily). isDivergent() is used as an approximation of whether the 4629 // value is in an SGPR. 4630 // This doesn't apply if no s_mul_hi is available (since we'll end up with a 4631 // valu op anyway) 4632 if (Subtarget->hasSMulHi() && !N->isDivergent()) 4633 return SDValue(); 4634 4635 SelectionDAG &DAG = DCI.DAG; 4636 SDLoc DL(N); 4637 4638 SDValue N0 = N->getOperand(0); 4639 SDValue N1 = N->getOperand(1); 4640 4641 if (!isI24(N0, DAG) || !isI24(N1, DAG)) 4642 return SDValue(); 4643 4644 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4645 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4646 4647 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1); 4648 DCI.AddToWorklist(Mulhi.getNode()); 4649 return DAG.getSExtOrTrunc(Mulhi, DL, VT); 4650 } 4651 4652 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, 4653 DAGCombinerInfo &DCI) const { 4654 EVT VT = N->getValueType(0); 4655 4656 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) 4657 return SDValue(); 4658 4659 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4660 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4661 // unnecessarily). isDivergent() is used as an approximation of whether the 4662 // value is in an SGPR. 
4663 // This doesn't apply if no s_mul_hi is available (since we'll end up with a 4664 // valu op anyway) 4665 if (Subtarget->hasSMulHi() && !N->isDivergent()) 4666 return SDValue(); 4667 4668 SelectionDAG &DAG = DCI.DAG; 4669 SDLoc DL(N); 4670 4671 SDValue N0 = N->getOperand(0); 4672 SDValue N1 = N->getOperand(1); 4673 4674 if (!isU24(N0, DAG) || !isU24(N1, DAG)) 4675 return SDValue(); 4676 4677 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4678 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4679 4680 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1); 4681 DCI.AddToWorklist(Mulhi.getNode()); 4682 return DAG.getZExtOrTrunc(Mulhi, DL, VT); 4683 } 4684 4685 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, 4686 SDValue Op, 4687 const SDLoc &DL, 4688 unsigned Opc) const { 4689 EVT VT = Op.getValueType(); 4690 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); 4691 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && 4692 LegalVT != MVT::i16)) 4693 return SDValue(); 4694 4695 if (VT != MVT::i32) 4696 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); 4697 4698 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op); 4699 if (VT != MVT::i32) 4700 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); 4701 4702 return FFBX; 4703 } 4704 4705 // The native instructions return -1 on 0 input. Optimize out a select that 4706 // produces -1 on 0. 4707 // 4708 // TODO: If zero is not undef, we could also do this if the output is compared 4709 // against the bitwidth. 4710 // 4711 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. 4712 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, 4713 SDValue LHS, SDValue RHS, 4714 DAGCombinerInfo &DCI) const { 4715 if (!isNullConstant(Cond.getOperand(1))) 4716 return SDValue(); 4717 4718 SelectionDAG &DAG = DCI.DAG; 4719 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 4720 SDValue CmpLHS = Cond.getOperand(0); 4721 4722 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x 4723 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x 4724 if (CCOpcode == ISD::SETEQ && 4725 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && 4726 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) { 4727 unsigned Opc = 4728 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; 4729 return getFFBX_U32(DAG, CmpLHS, SL, Opc); 4730 } 4731 4732 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x 4733 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x 4734 if (CCOpcode == ISD::SETNE && 4735 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) && 4736 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) { 4737 unsigned Opc = 4738 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; 4739 4740 return getFFBX_U32(DAG, CmpLHS, SL, Opc); 4741 } 4742 4743 return SDValue(); 4744 } 4745 4746 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, 4747 unsigned Op, 4748 const SDLoc &SL, 4749 SDValue Cond, 4750 SDValue N1, 4751 SDValue N2) { 4752 SelectionDAG &DAG = DCI.DAG; 4753 EVT VT = N1.getValueType(); 4754 4755 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, 4756 N1.getOperand(0), N2.getOperand(0)); 4757 DCI.AddToWorklist(NewSelect.getNode()); 4758 return DAG.getNode(Op, SL, VT, NewSelect); 4759 } 4760 4761 // Pull a free FP operation out of a select so it may fold into uses. 
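// ("Free" here means an fneg or fabs, which can usually be folded into a
// source modifier on the instruction that consumes the select.)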
4762 // 4763 // select c, (fneg x), (fneg y) -> fneg (select c, x, y) 4764 // select c, (fneg x), k -> fneg (select c, x, (fneg k)) 4765 // 4766 // select c, (fabs x), (fabs y) -> fabs (select c, x, y) 4767 // select c, (fabs x), +k -> fabs (select c, x, k) 4768 SDValue 4769 AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, 4770 SDValue N) const { 4771 SelectionDAG &DAG = DCI.DAG; 4772 SDValue Cond = N.getOperand(0); 4773 SDValue LHS = N.getOperand(1); 4774 SDValue RHS = N.getOperand(2); 4775 4776 EVT VT = N.getValueType(); 4777 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || 4778 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { 4779 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) 4780 return SDValue(); 4781 4782 return distributeOpThroughSelect(DCI, LHS.getOpcode(), 4783 SDLoc(N), Cond, LHS, RHS); 4784 } 4785 4786 bool Inv = false; 4787 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) { 4788 std::swap(LHS, RHS); 4789 Inv = true; 4790 } 4791 4792 // TODO: Support vector constants. 4793 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 4794 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS && 4795 !selectSupportsSourceMods(N.getNode())) { 4796 SDLoc SL(N); 4797 // If one side is an fneg/fabs and the other is a constant, we can push the 4798 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. 4799 SDValue NewLHS = LHS.getOperand(0); 4800 SDValue NewRHS = RHS; 4801 4802 // Careful: if the neg can be folded up, don't try to pull it back down. 4803 bool ShouldFoldNeg = true; 4804 4805 if (NewLHS.hasOneUse()) { 4806 unsigned Opc = NewLHS.getOpcode(); 4807 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode())) 4808 ShouldFoldNeg = false; 4809 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) 4810 ShouldFoldNeg = false; 4811 } 4812 4813 if (ShouldFoldNeg) { 4814 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative()) 4815 return SDValue(); 4816 4817 // We're going to be forced to use a source modifier anyway, there's no 4818 // point to pulling the negate out unless we can get a size reduction by 4819 // negating the constant. 4820 // 4821 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know 4822 // about cheaper constants. 
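// (A constant counts as cheaper to negate when its negated form is an inline
// immediate while the original form is not, e.g. -0.0 -> +0.0; see
// getConstantNegateCost below.)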
4823 if (NewLHS.getOpcode() == ISD::FABS &&
4824 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4825 return SDValue();
4826
4827 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4828 return SDValue();
4829
4830 if (LHS.getOpcode() == ISD::FNEG)
4831 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4832
4833 if (Inv)
4834 std::swap(NewLHS, NewRHS);
4835
4836 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4837 Cond, NewLHS, NewRHS);
4838 DCI.AddToWorklist(NewSelect.getNode());
4839 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4840 }
4841 }
4842
4843 return SDValue();
4844 }
4845
4846 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4847 DAGCombinerInfo &DCI) const {
4848 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4849 return Folded;
4850
4851 SDValue Cond = N->getOperand(0);
4852 if (Cond.getOpcode() != ISD::SETCC)
4853 return SDValue();
4854
4855 EVT VT = N->getValueType(0);
4856 SDValue LHS = Cond.getOperand(0);
4857 SDValue RHS = Cond.getOperand(1);
4858 SDValue CC = Cond.getOperand(2);
4859
4860 SDValue True = N->getOperand(1);
4861 SDValue False = N->getOperand(2);
4862
4863 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4864 SelectionDAG &DAG = DCI.DAG;
4865 if (DAG.isConstantValueOfAnyType(True) &&
4866 !DAG.isConstantValueOfAnyType(False)) {
4867 // Swap cmp + select pair to move constant to false input.
4868 // This will allow using VOPC cndmasks more often.
4869 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4870
4871 SDLoc SL(N);
4872 ISD::CondCode NewCC =
4873 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4874
4875 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4876 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4877 }
4878
4879 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4880 SDValue MinMax
4881 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4882 // Revisit this node so we can catch min3/max3/med3 patterns.
4883 //DCI.AddToWorklist(MinMax.getNode());
4884 return MinMax;
4885 }
4886 }
4887
4888 // There's no reason not to do this if the condition has other uses.
4889 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4890 }
4891
4892 static bool isInv2Pi(const APFloat &APF) {
4893 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4894 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4895 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4896
4897 return APF.bitwiseIsEqual(KF16) ||
4898 APF.bitwiseIsEqual(KF32) ||
4899 APF.bitwiseIsEqual(KF64);
4900 }
4901
4902 // -0.0 and -1.0 / (2.0 * pi) do not have inline immediates, so there is an
4903 // additional cost to negate +0.0 and +1.0 / (2.0 * pi).
4904 TargetLowering::NegatibleCost
4905 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4906 if (C->isZero())
4907 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4908
4909 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4910 return C->isNegative() ?
NegatibleCost::Cheaper : NegatibleCost::Expensive; 4911 4912 return NegatibleCost::Neutral; 4913 } 4914 4915 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const { 4916 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) 4917 return getConstantNegateCost(C) == NegatibleCost::Expensive; 4918 return false; 4919 } 4920 4921 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const { 4922 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) 4923 return getConstantNegateCost(C) == NegatibleCost::Cheaper; 4924 return false; 4925 } 4926 4927 static unsigned inverseMinMax(unsigned Opc) { 4928 switch (Opc) { 4929 case ISD::FMAXNUM: 4930 return ISD::FMINNUM; 4931 case ISD::FMINNUM: 4932 return ISD::FMAXNUM; 4933 case ISD::FMAXNUM_IEEE: 4934 return ISD::FMINNUM_IEEE; 4935 case ISD::FMINNUM_IEEE: 4936 return ISD::FMAXNUM_IEEE; 4937 case ISD::FMAXIMUM: 4938 return ISD::FMINIMUM; 4939 case ISD::FMINIMUM: 4940 return ISD::FMAXIMUM; 4941 case ISD::FMAXIMUMNUM: 4942 return ISD::FMINIMUMNUM; 4943 case ISD::FMINIMUMNUM: 4944 return ISD::FMAXIMUMNUM; 4945 case AMDGPUISD::FMAX_LEGACY: 4946 return AMDGPUISD::FMIN_LEGACY; 4947 case AMDGPUISD::FMIN_LEGACY: 4948 return AMDGPUISD::FMAX_LEGACY; 4949 default: 4950 llvm_unreachable("invalid min/max opcode"); 4951 } 4952 } 4953 4954 /// \return true if it's profitable to try to push an fneg into its source 4955 /// instruction. 4956 bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) { 4957 // If the input has multiple uses and we can either fold the negate down, or 4958 // the other uses cannot, give up. This both prevents unprofitable 4959 // transformations and infinite loops: we won't repeatedly try to fold around 4960 // a negate that has no 'good' form. 4961 if (N0.hasOneUse()) { 4962 // This may be able to fold into the source, but at a code size cost. Don't 4963 // fold if the fold into the user is free. 4964 if (allUsesHaveSourceMods(N, 0)) 4965 return false; 4966 } else { 4967 if (fnegFoldsIntoOp(N0.getNode()) && 4968 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode()))) 4969 return false; 4970 } 4971 4972 return true; 4973 } 4974 4975 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, 4976 DAGCombinerInfo &DCI) const { 4977 SelectionDAG &DAG = DCI.DAG; 4978 SDValue N0 = N->getOperand(0); 4979 EVT VT = N->getValueType(0); 4980 4981 unsigned Opc = N0.getOpcode(); 4982 4983 if (!shouldFoldFNegIntoSrc(N, N0)) 4984 return SDValue(); 4985 4986 SDLoc SL(N); 4987 switch (Opc) { 4988 case ISD::FADD: { 4989 if (!mayIgnoreSignedZero(N0)) 4990 return SDValue(); 4991 4992 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) 4993 SDValue LHS = N0.getOperand(0); 4994 SDValue RHS = N0.getOperand(1); 4995 4996 if (LHS.getOpcode() != ISD::FNEG) 4997 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 4998 else 4999 LHS = LHS.getOperand(0); 5000 5001 if (RHS.getOpcode() != ISD::FNEG) 5002 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 5003 else 5004 RHS = RHS.getOperand(0); 5005 5006 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); 5007 if (Res.getOpcode() != ISD::FADD) 5008 return SDValue(); // Op got folded away. 
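// If the fadd had other uses, those uses still expect the un-negated value,
// so rewrite them in terms of an fneg of the new node.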
5009 if (!N0.hasOneUse()) 5010 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 5011 return Res; 5012 } 5013 case ISD::FMUL: 5014 case AMDGPUISD::FMUL_LEGACY: { 5015 // (fneg (fmul x, y)) -> (fmul x, (fneg y)) 5016 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) 5017 SDValue LHS = N0.getOperand(0); 5018 SDValue RHS = N0.getOperand(1); 5019 5020 if (LHS.getOpcode() == ISD::FNEG) 5021 LHS = LHS.getOperand(0); 5022 else if (RHS.getOpcode() == ISD::FNEG) 5023 RHS = RHS.getOperand(0); 5024 else 5025 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 5026 5027 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); 5028 if (Res.getOpcode() != Opc) 5029 return SDValue(); // Op got folded away. 5030 if (!N0.hasOneUse()) 5031 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 5032 return Res; 5033 } 5034 case ISD::FMA: 5035 case ISD::FMAD: { 5036 // TODO: handle llvm.amdgcn.fma.legacy 5037 if (!mayIgnoreSignedZero(N0)) 5038 return SDValue(); 5039 5040 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) 5041 SDValue LHS = N0.getOperand(0); 5042 SDValue MHS = N0.getOperand(1); 5043 SDValue RHS = N0.getOperand(2); 5044 5045 if (LHS.getOpcode() == ISD::FNEG) 5046 LHS = LHS.getOperand(0); 5047 else if (MHS.getOpcode() == ISD::FNEG) 5048 MHS = MHS.getOperand(0); 5049 else 5050 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); 5051 5052 if (RHS.getOpcode() != ISD::FNEG) 5053 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 5054 else 5055 RHS = RHS.getOperand(0); 5056 5057 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); 5058 if (Res.getOpcode() != Opc) 5059 return SDValue(); // Op got folded away. 5060 if (!N0.hasOneUse()) 5061 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 5062 return Res; 5063 } 5064 case ISD::FMAXNUM: 5065 case ISD::FMINNUM: 5066 case ISD::FMAXNUM_IEEE: 5067 case ISD::FMINNUM_IEEE: 5068 case ISD::FMINIMUM: 5069 case ISD::FMAXIMUM: 5070 case ISD::FMINIMUMNUM: 5071 case ISD::FMAXIMUMNUM: 5072 case AMDGPUISD::FMAX_LEGACY: 5073 case AMDGPUISD::FMIN_LEGACY: { 5074 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) 5075 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y) 5076 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y) 5077 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y) 5078 5079 SDValue LHS = N0.getOperand(0); 5080 SDValue RHS = N0.getOperand(1); 5081 5082 // 0 doesn't have a negated inline immediate. 5083 // TODO: This constant check should be generalized to other operations. 5084 if (isConstantCostlierToNegate(RHS)) 5085 return SDValue(); 5086 5087 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 5088 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 5089 unsigned Opposite = inverseMinMax(Opc); 5090 5091 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); 5092 if (Res.getOpcode() != Opposite) 5093 return SDValue(); // Op got folded away. 5094 if (!N0.hasOneUse()) 5095 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 5096 return Res; 5097 } 5098 case AMDGPUISD::FMED3: { 5099 SDValue Ops[3]; 5100 for (unsigned I = 0; I < 3; ++I) 5101 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags()); 5102 5103 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags()); 5104 if (Res.getOpcode() != AMDGPUISD::FMED3) 5105 return SDValue(); // Op got folded away. 
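// As in the cases above, keep any other users of the med3 correct by
// negating the result, and queue the new fneg's users so the combiner
// revisits them.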
5106
5107 if (!N0.hasOneUse()) {
5108 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5109 DAG.ReplaceAllUsesWith(N0, Neg);
5110
5111 for (SDNode *U : Neg->users())
5112 DCI.AddToWorklist(U);
5113 }
5114
5115 return Res;
5116 }
5117 case ISD::FP_EXTEND:
5118 case ISD::FTRUNC:
5119 case ISD::FRINT:
5120 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5121 case ISD::FROUNDEVEN:
5122 case ISD::FSIN:
5123 case ISD::FCANONICALIZE:
5124 case AMDGPUISD::RCP:
5125 case AMDGPUISD::RCP_LEGACY:
5126 case AMDGPUISD::RCP_IFLAG:
5127 case AMDGPUISD::SIN_HW: {
5128 SDValue CvtSrc = N0.getOperand(0);
5129 if (CvtSrc.getOpcode() == ISD::FNEG) {
5130 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5131 // (fneg (rcp (fneg x))) -> (rcp x)
5132 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5133 }
5134
5135 if (!N0.hasOneUse())
5136 return SDValue();
5137
5138 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5139 // (fneg (rcp x)) -> (rcp (fneg x))
5140 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5141 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5142 }
5143 case ISD::FP_ROUND: {
5144 SDValue CvtSrc = N0.getOperand(0);
5145
5146 if (CvtSrc.getOpcode() == ISD::FNEG) {
5147 // (fneg (fp_round (fneg x))) -> (fp_round x)
5148 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5149 CvtSrc.getOperand(0), N0.getOperand(1));
5150 }
5151
5152 if (!N0.hasOneUse())
5153 return SDValue();
5154
5155 // (fneg (fp_round x)) -> (fp_round (fneg x))
5156 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5157 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5158 }
5159 case ISD::FP16_TO_FP: {
5160 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5161 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5162 // Put the fneg back as a legal source operation that can be matched later.
5163 SDLoc SL(N);
5164
5165 SDValue Src = N0.getOperand(0);
5166 EVT SrcVT = Src.getValueType();
5167
5168 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5169 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5170 DAG.getConstant(0x8000, SL, SrcVT));
5171 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5172 }
5173 case ISD::SELECT: {
5174 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5175 // TODO: Invert conditions of foldFreeOpFromSelect
5176 return SDValue();
5177 }
5178 case ISD::BITCAST: {
5179 SDLoc SL(N);
5180 SDValue BCSrc = N0.getOperand(0);
5181 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5182 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5183 if (HighBits.getValueType().getSizeInBits() != 32 ||
5184 !fnegFoldsIntoOp(HighBits.getNode()))
5185 return SDValue();
5186
5187 // f64 fneg only really needs to operate on the high half of the
5188 // register, so try to force it to an f32 operation to help make use of
5189 // source modifiers.
5190 // 5191 // 5192 // fneg (f64 (bitcast (build_vector x, y))) -> 5193 // f64 (bitcast (build_vector (bitcast i32:x to f32), 5194 // (fneg (bitcast i32:y to f32))) 5195 5196 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits); 5197 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi); 5198 SDValue CastBack = 5199 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi); 5200 5201 SmallVector<SDValue, 8> Ops(BCSrc->ops()); 5202 Ops.back() = CastBack; 5203 DCI.AddToWorklist(NegHi.getNode()); 5204 SDValue Build = 5205 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops); 5206 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build); 5207 5208 if (!N0.hasOneUse()) 5209 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result)); 5210 return Result; 5211 } 5212 5213 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 && 5214 BCSrc.hasOneUse()) { 5215 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) -> 5216 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32) 5217 5218 // TODO: Cast back result for multiple uses is beneficial in some cases. 5219 5220 SDValue LHS = 5221 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1)); 5222 SDValue RHS = 5223 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2)); 5224 5225 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS); 5226 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS); 5227 5228 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS, 5229 NegRHS); 5230 } 5231 5232 return SDValue(); 5233 } 5234 default: 5235 return SDValue(); 5236 } 5237 } 5238 5239 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, 5240 DAGCombinerInfo &DCI) const { 5241 SelectionDAG &DAG = DCI.DAG; 5242 SDValue N0 = N->getOperand(0); 5243 5244 if (!N0.hasOneUse()) 5245 return SDValue(); 5246 5247 switch (N0.getOpcode()) { 5248 case ISD::FP16_TO_FP: { 5249 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); 5250 SDLoc SL(N); 5251 SDValue Src = N0.getOperand(0); 5252 EVT SrcVT = Src.getValueType(); 5253 5254 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff) 5255 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src, 5256 DAG.getConstant(0x7fff, SL, SrcVT)); 5257 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs); 5258 } 5259 default: 5260 return SDValue(); 5261 } 5262 } 5263 5264 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, 5265 DAGCombinerInfo &DCI) const { 5266 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 5267 if (!CFP) 5268 return SDValue(); 5269 5270 // XXX - Should this flush denormals? 5271 const APFloat &Val = CFP->getValueAPF(); 5272 APFloat One(Val.getSemantics(), "1.0"); 5273 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); 5274 } 5275 5276 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, 5277 DAGCombinerInfo &DCI) const { 5278 SelectionDAG &DAG = DCI.DAG; 5279 SDLoc DL(N); 5280 5281 switch(N->getOpcode()) { 5282 default: 5283 break; 5284 case ISD::BITCAST: { 5285 EVT DestVT = N->getValueType(0); 5286 5287 // Push casts through vector builds. This helps avoid emitting a large 5288 // number of copies when materializing floating point vector constants. 
5289 //
5290 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5291 // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5292 if (DestVT.isVector()) {
5293 SDValue Src = N->getOperand(0);
5294 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5295 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5296 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5297 EVT SrcVT = Src.getValueType();
5298 unsigned NElts = DestVT.getVectorNumElements();
5299
5300 if (SrcVT.getVectorNumElements() == NElts) {
5301 EVT DestEltVT = DestVT.getVectorElementType();
5302
5303 SmallVector<SDValue, 8> CastedElts;
5304 SDLoc SL(N);
5305 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5306 SDValue Elt = Src.getOperand(I);
5307 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5308 }
5309
5310 return DAG.getBuildVector(DestVT, SL, CastedElts);
5311 }
5312 }
5313 }
5314
5315 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5316 break;
5317
5318 // Fold bitcasts of constants.
5319 //
5320 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5321 // TODO: Generalize and move to DAGCombiner
5322 SDValue Src = N->getOperand(0);
5323 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5324 SDLoc SL(N);
5325 uint64_t CVal = C->getZExtValue();
5326 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5327 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5328 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5329 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5330 }
5331
5332 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5333 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5334 SDLoc SL(N);
5335 uint64_t CVal = Val.getZExtValue();
5336 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5337 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5338 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5339
5340 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5341 }
5342
5343 break;
5344 }
5345 case ISD::SHL:
5346 case ISD::SRA:
5347 case ISD::SRL: {
5348 // Range metadata can be invalidated when loads are converted to legal types
5349 // (e.g. v2i64 -> v4i32).
5350 // Try to convert vector shl/sra/srl before type legalization so that range
5351 // metadata can be utilized.
5352 if (!(N->getValueType(0).isVector() &&
5353 DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
5354 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5355 break;
5356 if (N->getOpcode() == ISD::SHL)
5357 return performShlCombine(N, DCI);
5358 if (N->getOpcode() == ISD::SRA)
5359 return performSraCombine(N, DCI);
5360 return performSrlCombine(N, DCI);
5361 }
5362 case ISD::TRUNCATE:
5363 return performTruncateCombine(N, DCI);
5364 case ISD::MUL:
5365 return performMulCombine(N, DCI);
5366 case AMDGPUISD::MUL_U24:
5367 case AMDGPUISD::MUL_I24: {
5368 if (SDValue Simplified = simplifyMul24(N, DCI))
5369 return Simplified;
5370 break;
5371 }
5372 case AMDGPUISD::MULHI_I24:
5373 case AMDGPUISD::MULHI_U24:
5374 return simplifyMul24(N, DCI);
5375 case ISD::SMUL_LOHI:
5376 case ISD::UMUL_LOHI:
5377 return performMulLoHiCombine(N, DCI);
5378 case ISD::MULHS:
5379 return performMulhsCombine(N, DCI);
5380 case ISD::MULHU:
5381 return performMulhuCombine(N, DCI);
5382 case ISD::SELECT:
5383 return performSelectCombine(N, DCI);
5384 case ISD::FNEG:
5385 return performFNegCombine(N, DCI);
5386 case ISD::FABS:
5387 return performFAbsCombine(N, DCI);
5388 case AMDGPUISD::BFE_I32:
5389 case AMDGPUISD::BFE_U32: {
5390 assert(!N->getValueType(0).isVector() &&
5391 "Vector handling of BFE not implemented");
5392 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5393 if (!Width)
5394 break;
5395
5396 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5397 if (WidthVal == 0)
5398 return DAG.getConstant(0, DL, MVT::i32);
5399
5400 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5401 if (!Offset)
5402 break;
5403
5404 SDValue BitsFrom = N->getOperand(0);
5405 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5406
5407 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5408
5409 if (OffsetVal == 0) {
5410 // This is already sign / zero extended, so try to fold away extra BFEs.
5411 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5412
5413 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5414 if (OpSignBits >= SignBits)
5415 return BitsFrom;
5416
5417 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5418 if (Signed) {
5419 // This is a sign_extend_inreg. Replace it to take advantage of existing
5420 // DAG Combines. If not eliminated, we will match back to BFE during
5421 // selection.
5422
5423 // TODO: The sext_inreg of extended types ends up as multiple operations,
5424 // although we could handle them in a single BFE.
5425 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5426 DAG.getValueType(SmallVT));
5427 }
5428
5429 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5430 }
5431
5432 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5433 if (Signed) {
5434 return constantFoldBFE<int32_t>(DAG,
5435 CVal->getSExtValue(),
5436 OffsetVal,
5437 WidthVal,
5438 DL);
5439 }
5440
5441 return constantFoldBFE<uint32_t>(DAG,
5442 CVal->getZExtValue(),
5443 OffsetVal,
5444 WidthVal,
5445 DL);
5446 }
5447
5448 if ((OffsetVal + WidthVal) >= 32 &&
5449 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5450 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5451 return DAG.getNode(Signed ?
ISD::SRA : ISD::SRL, DL, MVT::i32, 5452 BitsFrom, ShiftVal); 5453 } 5454 5455 if (BitsFrom.hasOneUse()) { 5456 APInt Demanded = APInt::getBitsSet(32, 5457 OffsetVal, 5458 OffsetVal + WidthVal); 5459 5460 KnownBits Known; 5461 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 5462 !DCI.isBeforeLegalizeOps()); 5463 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5464 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || 5465 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) { 5466 DCI.CommitTargetLoweringOpt(TLO); 5467 } 5468 } 5469 5470 break; 5471 } 5472 case ISD::LOAD: 5473 return performLoadCombine(N, DCI); 5474 case ISD::STORE: 5475 return performStoreCombine(N, DCI); 5476 case AMDGPUISD::RCP: 5477 case AMDGPUISD::RCP_IFLAG: 5478 return performRcpCombine(N, DCI); 5479 case ISD::AssertZext: 5480 case ISD::AssertSext: 5481 return performAssertSZExtCombine(N, DCI); 5482 case ISD::INTRINSIC_WO_CHAIN: 5483 return performIntrinsicWOChainCombine(N, DCI); 5484 case AMDGPUISD::FMAD_FTZ: { 5485 SDValue N0 = N->getOperand(0); 5486 SDValue N1 = N->getOperand(1); 5487 SDValue N2 = N->getOperand(2); 5488 EVT VT = N->getValueType(0); 5489 5490 // FMAD_FTZ is a FMAD + flush denormals to zero. 5491 // We flush the inputs, the intermediate step, and the output. 5492 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); 5493 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); 5494 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); 5495 if (N0CFP && N1CFP && N2CFP) { 5496 const auto FTZ = [](const APFloat &V) { 5497 if (V.isDenormal()) { 5498 APFloat Zero(V.getSemantics(), 0); 5499 return V.isNegative() ? -Zero : Zero; 5500 } 5501 return V; 5502 }; 5503 5504 APFloat V0 = FTZ(N0CFP->getValueAPF()); 5505 APFloat V1 = FTZ(N1CFP->getValueAPF()); 5506 APFloat V2 = FTZ(N2CFP->getValueAPF()); 5507 V0.multiply(V1, APFloat::rmNearestTiesToEven); 5508 V0 = FTZ(V0); 5509 V0.add(V2, APFloat::rmNearestTiesToEven); 5510 return DAG.getConstantFP(FTZ(V0), DL, VT); 5511 } 5512 break; 5513 } 5514 } 5515 return SDValue(); 5516 } 5517 5518 //===----------------------------------------------------------------------===// 5519 // Helper functions 5520 //===----------------------------------------------------------------------===// 5521 5522 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 5523 const TargetRegisterClass *RC, 5524 Register Reg, EVT VT, 5525 const SDLoc &SL, 5526 bool RawReg) const { 5527 MachineFunction &MF = DAG.getMachineFunction(); 5528 MachineRegisterInfo &MRI = MF.getRegInfo(); 5529 Register VReg; 5530 5531 if (!MRI.isLiveIn(Reg)) { 5532 VReg = MRI.createVirtualRegister(RC); 5533 MRI.addLiveIn(Reg, VReg); 5534 } else { 5535 VReg = MRI.getLiveInVirtReg(Reg); 5536 } 5537 5538 if (RawReg) 5539 return DAG.getRegister(VReg, VT); 5540 5541 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); 5542 } 5543 5544 // This may be called multiple times, and nothing prevents creating multiple 5545 // objects at the same offset. See if we already defined this object. 
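// (Fixed frame objects are assigned negative frame indices, which is why the
// scan below runs from getObjectIndexBegin(), a negative value, up to 0.)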
5546 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, 5547 int64_t Offset) { 5548 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { 5549 if (MFI.getObjectOffset(I) == Offset) { 5550 assert(MFI.getObjectSize(I) == Size); 5551 return I; 5552 } 5553 } 5554 5555 return MFI.CreateFixedObject(Size, Offset, true); 5556 } 5557 5558 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, 5559 EVT VT, 5560 const SDLoc &SL, 5561 int64_t Offset) const { 5562 MachineFunction &MF = DAG.getMachineFunction(); 5563 MachineFrameInfo &MFI = MF.getFrameInfo(); 5564 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset); 5565 5566 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); 5567 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); 5568 5569 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4), 5570 MachineMemOperand::MODereferenceable | 5571 MachineMemOperand::MOInvariant); 5572 } 5573 5574 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, 5575 const SDLoc &SL, 5576 SDValue Chain, 5577 SDValue ArgVal, 5578 int64_t Offset) const { 5579 MachineFunction &MF = DAG.getMachineFunction(); 5580 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); 5581 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 5582 5583 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); 5584 // Stores to the argument stack area are relative to the stack pointer. 5585 SDValue SP = 5586 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32); 5587 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr); 5588 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4), 5589 MachineMemOperand::MODereferenceable); 5590 return Store; 5591 } 5592 5593 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, 5594 const TargetRegisterClass *RC, 5595 EVT VT, const SDLoc &SL, 5596 const ArgDescriptor &Arg) const { 5597 assert(Arg && "Attempting to load missing argument"); 5598 5599 SDValue V = Arg.isRegister() ? 
5600 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) : 5601 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); 5602 5603 if (!Arg.isMasked()) 5604 return V; 5605 5606 unsigned Mask = Arg.getMask(); 5607 unsigned Shift = llvm::countr_zero<unsigned>(Mask); 5608 V = DAG.getNode(ISD::SRL, SL, VT, V, 5609 DAG.getShiftAmountConstant(Shift, VT, SL)); 5610 return DAG.getNode(ISD::AND, SL, VT, V, 5611 DAG.getConstant(Mask >> Shift, SL, VT)); 5612 } 5613 5614 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( 5615 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const { 5616 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset(); 5617 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr(); 5618 uint64_t ArgOffset = 5619 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset; 5620 switch (Param) { 5621 case FIRST_IMPLICIT: 5622 return ArgOffset; 5623 case PRIVATE_BASE: 5624 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET; 5625 case SHARED_BASE: 5626 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET; 5627 case QUEUE_PTR: 5628 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET; 5629 } 5630 llvm_unreachable("unexpected implicit parameter type"); 5631 } 5632 5633 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( 5634 const MachineFunction &MF, const ImplicitParameter Param) const { 5635 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); 5636 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param); 5637 } 5638 5639 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; 5640 5641 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { 5642 switch ((AMDGPUISD::NodeType)Opcode) { 5643 case AMDGPUISD::FIRST_NUMBER: break; 5644 // AMDIL DAG nodes 5645 NODE_NAME_CASE(BRANCH_COND); 5646 5647 // AMDGPU DAG nodes 5648 NODE_NAME_CASE(IF) 5649 NODE_NAME_CASE(ELSE) 5650 NODE_NAME_CASE(LOOP) 5651 NODE_NAME_CASE(CALL) 5652 NODE_NAME_CASE(TC_RETURN) 5653 NODE_NAME_CASE(TC_RETURN_GFX) 5654 NODE_NAME_CASE(TC_RETURN_CHAIN) 5655 NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR) 5656 NODE_NAME_CASE(TRAP) 5657 NODE_NAME_CASE(RET_GLUE) 5658 NODE_NAME_CASE(WAVE_ADDRESS) 5659 NODE_NAME_CASE(RETURN_TO_EPILOG) 5660 NODE_NAME_CASE(ENDPGM) 5661 NODE_NAME_CASE(ENDPGM_TRAP) 5662 NODE_NAME_CASE(SIMULATED_TRAP) 5663 NODE_NAME_CASE(DWORDADDR) 5664 NODE_NAME_CASE(FRACT) 5665 NODE_NAME_CASE(SETCC) 5666 NODE_NAME_CASE(DENORM_MODE) 5667 NODE_NAME_CASE(FMA_W_CHAIN) 5668 NODE_NAME_CASE(FMUL_W_CHAIN) 5669 NODE_NAME_CASE(CLAMP) 5670 NODE_NAME_CASE(COS_HW) 5671 NODE_NAME_CASE(SIN_HW) 5672 NODE_NAME_CASE(FMAX_LEGACY) 5673 NODE_NAME_CASE(FMIN_LEGACY) 5674 NODE_NAME_CASE(FMAX3) 5675 NODE_NAME_CASE(SMAX3) 5676 NODE_NAME_CASE(UMAX3) 5677 NODE_NAME_CASE(FMIN3) 5678 NODE_NAME_CASE(SMIN3) 5679 NODE_NAME_CASE(UMIN3) 5680 NODE_NAME_CASE(FMED3) 5681 NODE_NAME_CASE(SMED3) 5682 NODE_NAME_CASE(UMED3) 5683 NODE_NAME_CASE(FMAXIMUM3) 5684 NODE_NAME_CASE(FMINIMUM3) 5685 NODE_NAME_CASE(FDOT2) 5686 NODE_NAME_CASE(URECIP) 5687 NODE_NAME_CASE(DIV_SCALE) 5688 NODE_NAME_CASE(DIV_FMAS) 5689 NODE_NAME_CASE(DIV_FIXUP) 5690 NODE_NAME_CASE(FMAD_FTZ) 5691 NODE_NAME_CASE(RCP) 5692 NODE_NAME_CASE(RSQ) 5693 NODE_NAME_CASE(RCP_LEGACY) 5694 NODE_NAME_CASE(RCP_IFLAG) 5695 NODE_NAME_CASE(LOG) 5696 NODE_NAME_CASE(EXP) 5697 NODE_NAME_CASE(FMUL_LEGACY) 5698 NODE_NAME_CASE(RSQ_CLAMP) 5699 NODE_NAME_CASE(FP_CLASS) 5700 NODE_NAME_CASE(DOT4) 5701 NODE_NAME_CASE(CARRY) 5702 NODE_NAME_CASE(BORROW) 5703 NODE_NAME_CASE(BFE_U32) 5704 
NODE_NAME_CASE(BFE_I32) 5705 NODE_NAME_CASE(BFI) 5706 NODE_NAME_CASE(BFM) 5707 NODE_NAME_CASE(FFBH_U32) 5708 NODE_NAME_CASE(FFBH_I32) 5709 NODE_NAME_CASE(FFBL_B32) 5710 NODE_NAME_CASE(MUL_U24) 5711 NODE_NAME_CASE(MUL_I24) 5712 NODE_NAME_CASE(MULHI_U24) 5713 NODE_NAME_CASE(MULHI_I24) 5714 NODE_NAME_CASE(MAD_U24) 5715 NODE_NAME_CASE(MAD_I24) 5716 NODE_NAME_CASE(MAD_I64_I32) 5717 NODE_NAME_CASE(MAD_U64_U32) 5718 NODE_NAME_CASE(PERM) 5719 NODE_NAME_CASE(TEXTURE_FETCH) 5720 NODE_NAME_CASE(R600_EXPORT) 5721 NODE_NAME_CASE(CONST_ADDRESS) 5722 NODE_NAME_CASE(REGISTER_LOAD) 5723 NODE_NAME_CASE(REGISTER_STORE) 5724 NODE_NAME_CASE(CVT_F32_UBYTE0) 5725 NODE_NAME_CASE(CVT_F32_UBYTE1) 5726 NODE_NAME_CASE(CVT_F32_UBYTE2) 5727 NODE_NAME_CASE(CVT_F32_UBYTE3) 5728 NODE_NAME_CASE(CVT_PKRTZ_F16_F32) 5729 NODE_NAME_CASE(CVT_PKNORM_I16_F32) 5730 NODE_NAME_CASE(CVT_PKNORM_U16_F32) 5731 NODE_NAME_CASE(CVT_PK_I16_I32) 5732 NODE_NAME_CASE(CVT_PK_U16_U32) 5733 NODE_NAME_CASE(FP_TO_FP16) 5734 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) 5735 NODE_NAME_CASE(CONST_DATA_PTR) 5736 NODE_NAME_CASE(PC_ADD_REL_OFFSET) 5737 NODE_NAME_CASE(LDS) 5738 NODE_NAME_CASE(DUMMY_CHAIN) 5739 NODE_NAME_CASE(LOAD_D16_HI) 5740 NODE_NAME_CASE(LOAD_D16_LO) 5741 NODE_NAME_CASE(LOAD_D16_HI_I8) 5742 NODE_NAME_CASE(LOAD_D16_HI_U8) 5743 NODE_NAME_CASE(LOAD_D16_LO_I8) 5744 NODE_NAME_CASE(LOAD_D16_LO_U8) 5745 NODE_NAME_CASE(STORE_MSKOR) 5746 NODE_NAME_CASE(TBUFFER_STORE_FORMAT) 5747 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) 5748 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) 5749 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) 5750 NODE_NAME_CASE(DS_ORDERED_COUNT) 5751 NODE_NAME_CASE(ATOMIC_CMP_SWAP) 5752 NODE_NAME_CASE(BUFFER_LOAD) 5753 NODE_NAME_CASE(BUFFER_LOAD_UBYTE) 5754 NODE_NAME_CASE(BUFFER_LOAD_USHORT) 5755 NODE_NAME_CASE(BUFFER_LOAD_BYTE) 5756 NODE_NAME_CASE(BUFFER_LOAD_SHORT) 5757 NODE_NAME_CASE(BUFFER_LOAD_TFE) 5758 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE) 5759 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE) 5760 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE) 5761 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE) 5762 NODE_NAME_CASE(BUFFER_LOAD_FORMAT) 5763 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE) 5764 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) 5765 NODE_NAME_CASE(SBUFFER_LOAD) 5766 NODE_NAME_CASE(SBUFFER_LOAD_BYTE) 5767 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE) 5768 NODE_NAME_CASE(SBUFFER_LOAD_SHORT) 5769 NODE_NAME_CASE(SBUFFER_LOAD_USHORT) 5770 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA) 5771 NODE_NAME_CASE(BUFFER_STORE) 5772 NODE_NAME_CASE(BUFFER_STORE_BYTE) 5773 NODE_NAME_CASE(BUFFER_STORE_SHORT) 5774 NODE_NAME_CASE(BUFFER_STORE_FORMAT) 5775 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) 5776 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) 5777 NODE_NAME_CASE(BUFFER_ATOMIC_ADD) 5778 NODE_NAME_CASE(BUFFER_ATOMIC_SUB) 5779 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) 5780 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) 5781 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) 5782 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) 5783 NODE_NAME_CASE(BUFFER_ATOMIC_AND) 5784 NODE_NAME_CASE(BUFFER_ATOMIC_OR) 5785 NODE_NAME_CASE(BUFFER_ATOMIC_XOR) 5786 NODE_NAME_CASE(BUFFER_ATOMIC_INC) 5787 NODE_NAME_CASE(BUFFER_ATOMIC_DEC) 5788 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) 5789 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) 5790 NODE_NAME_CASE(BUFFER_ATOMIC_FADD) 5791 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) 5792 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) 5793 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) 5794 } 5795 return nullptr; 5796 } 5797 5798 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, 5799 SelectionDAG &DAG, int Enabled, 5800 int &RefinementSteps, 5801 bool &UseOneConstNR, 5802 bool Reciprocal) const 
{
5803 EVT VT = Operand.getValueType();
5804
5805 if (VT == MVT::f32) {
5806 RefinementSteps = 0;
5807 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5808 }
5809
5810 // TODO: There is also an f64 rsq instruction, but the documentation is less
5811 // clear on its precision.
5812
5813 return SDValue();
5814 }
5815
5816 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5817 SelectionDAG &DAG, int Enabled,
5818 int &RefinementSteps) const {
5819 EVT VT = Operand.getValueType();
5820
5821 if (VT == MVT::f32) {
5822 // Reciprocal, < 1 ulp error.
5823 //
5824 // This reciprocal approximation converges to < 0.5 ulp error with one
5825 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5826
5827 RefinementSteps = 0;
5828 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5829 }
5830
5831 // TODO: There is also an f64 rcp instruction, but the documentation is less
5832 // clear on its precision.
5833
5834 return SDValue();
5835 }
5836
5837 static unsigned workitemIntrinsicDim(unsigned ID) {
5838 switch (ID) {
5839 case Intrinsic::amdgcn_workitem_id_x:
5840 return 0;
5841 case Intrinsic::amdgcn_workitem_id_y:
5842 return 1;
5843 case Intrinsic::amdgcn_workitem_id_z:
5844 return 2;
5845 default:
5846 llvm_unreachable("not a workitem intrinsic");
5847 }
5848 }
5849
5850 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5851 const SDValue Op, KnownBits &Known,
5852 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5853
5854 Known.resetAll(); // Don't know anything.
5855
5856 unsigned Opc = Op.getOpcode();
5857
5858 switch (Opc) {
5859 default:
5860 break;
5861 case AMDGPUISD::CARRY:
5862 case AMDGPUISD::BORROW: {
5863 Known.Zero = APInt::getHighBitsSet(32, 31);
5864 break;
5865 }
5866
5867 case AMDGPUISD::BFE_I32:
5868 case AMDGPUISD::BFE_U32: {
5869 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5870 if (!CWidth)
5871 return;
5872
5873 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5874
5875 if (Opc == AMDGPUISD::BFE_U32)
5876 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5877
5878 break;
5879 }
5880 case AMDGPUISD::FP_TO_FP16: {
5881 unsigned BitWidth = Known.getBitWidth();
5882
5883 // High bits are zero.
5884 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5885 break;
5886 }
5887 case AMDGPUISD::MUL_U24:
5888 case AMDGPUISD::MUL_I24: {
5889 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5890 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5891 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5892 RHSKnown.countMinTrailingZeros();
5893 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5894 // Skip the extra check if all bits are known to be zero.
5895 if (TrailZ >= 32)
5896 break;
5897
5898 // Truncate to 24 bits.
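// Only the low 24 bits of each operand participate in a 24-bit multiply, so
// the known bits of the product can be computed from the truncated operands.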
5899 LHSKnown = LHSKnown.trunc(24); 5900 RHSKnown = RHSKnown.trunc(24); 5901 5902 if (Opc == AMDGPUISD::MUL_I24) { 5903 unsigned LHSValBits = LHSKnown.countMaxSignificantBits(); 5904 unsigned RHSValBits = RHSKnown.countMaxSignificantBits(); 5905 unsigned MaxValBits = LHSValBits + RHSValBits; 5906 if (MaxValBits > 32) 5907 break; 5908 unsigned SignBits = 32 - MaxValBits + 1; 5909 bool LHSNegative = LHSKnown.isNegative(); 5910 bool LHSNonNegative = LHSKnown.isNonNegative(); 5911 bool LHSPositive = LHSKnown.isStrictlyPositive(); 5912 bool RHSNegative = RHSKnown.isNegative(); 5913 bool RHSNonNegative = RHSKnown.isNonNegative(); 5914 bool RHSPositive = RHSKnown.isStrictlyPositive(); 5915 5916 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative)) 5917 Known.Zero.setHighBits(SignBits); 5918 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative)) 5919 Known.One.setHighBits(SignBits); 5920 } else { 5921 unsigned LHSValBits = LHSKnown.countMaxActiveBits(); 5922 unsigned RHSValBits = RHSKnown.countMaxActiveBits(); 5923 unsigned MaxValBits = LHSValBits + RHSValBits; 5924 if (MaxValBits >= 32) 5925 break; 5926 Known.Zero.setBitsFrom(MaxValBits); 5927 } 5928 break; 5929 } 5930 case AMDGPUISD::PERM: { 5931 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 5932 if (!CMask) 5933 return; 5934 5935 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 5936 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 5937 unsigned Sel = CMask->getZExtValue(); 5938 5939 for (unsigned I = 0; I < 32; I += 8) { 5940 unsigned SelBits = Sel & 0xff; 5941 if (SelBits < 4) { 5942 SelBits *= 8; 5943 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 5944 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 5945 } else if (SelBits < 7) { 5946 SelBits = (SelBits & 3) * 8; 5947 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 5948 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 5949 } else if (SelBits == 0x0c) { 5950 Known.Zero |= 0xFFull << I; 5951 } else if (SelBits > 0x0c) { 5952 Known.One |= 0xFFull << I; 5953 } 5954 Sel >>= 8; 5955 } 5956 break; 5957 } 5958 case AMDGPUISD::BUFFER_LOAD_UBYTE: { 5959 Known.Zero.setHighBits(24); 5960 break; 5961 } 5962 case AMDGPUISD::BUFFER_LOAD_USHORT: { 5963 Known.Zero.setHighBits(16); 5964 break; 5965 } 5966 case AMDGPUISD::LDS: { 5967 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode()); 5968 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout()); 5969 5970 Known.Zero.setHighBits(16); 5971 Known.Zero.setLowBits(Log2(Alignment)); 5972 break; 5973 } 5974 case AMDGPUISD::SMIN3: 5975 case AMDGPUISD::SMAX3: 5976 case AMDGPUISD::SMED3: 5977 case AMDGPUISD::UMIN3: 5978 case AMDGPUISD::UMAX3: 5979 case AMDGPUISD::UMED3: { 5980 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); 5981 if (Known2.isUnknown()) 5982 break; 5983 5984 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 5985 if (Known1.isUnknown()) 5986 break; 5987 5988 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 5989 if (Known0.isUnknown()) 5990 break; 5991 5992 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling. 
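// The result of a min3/max3/med3 is always one of its three operands, so a
// result bit is known only if it is known the same way in all three.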
5993 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero; 5994 Known.One = Known0.One & Known1.One & Known2.One; 5995 break; 5996 } 5997 case ISD::INTRINSIC_WO_CHAIN: { 5998 unsigned IID = Op.getConstantOperandVal(0); 5999 switch (IID) { 6000 case Intrinsic::amdgcn_workitem_id_x: 6001 case Intrinsic::amdgcn_workitem_id_y: 6002 case Intrinsic::amdgcn_workitem_id_z: { 6003 unsigned MaxValue = Subtarget->getMaxWorkitemID( 6004 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID)); 6005 Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); 6006 break; 6007 } 6008 default: 6009 break; 6010 } 6011 } 6012 } 6013 } 6014 6015 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( 6016 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, 6017 unsigned Depth) const { 6018 switch (Op.getOpcode()) { 6019 case AMDGPUISD::BFE_I32: { 6020 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 6021 if (!Width) 6022 return 1; 6023 6024 unsigned SignBits = 32 - Width->getZExtValue() + 1; 6025 if (!isNullConstant(Op.getOperand(1))) 6026 return SignBits; 6027 6028 // TODO: Could probably figure something out with non-0 offsets. 6029 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 6030 return std::max(SignBits, Op0SignBits); 6031 } 6032 6033 case AMDGPUISD::BFE_U32: { 6034 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 6035 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; 6036 } 6037 6038 case AMDGPUISD::CARRY: 6039 case AMDGPUISD::BORROW: 6040 return 31; 6041 case AMDGPUISD::BUFFER_LOAD_BYTE: 6042 return 25; 6043 case AMDGPUISD::BUFFER_LOAD_SHORT: 6044 return 17; 6045 case AMDGPUISD::BUFFER_LOAD_UBYTE: 6046 return 24; 6047 case AMDGPUISD::BUFFER_LOAD_USHORT: 6048 return 16; 6049 case AMDGPUISD::FP_TO_FP16: 6050 return 16; 6051 case AMDGPUISD::SMIN3: 6052 case AMDGPUISD::SMAX3: 6053 case AMDGPUISD::SMED3: 6054 case AMDGPUISD::UMIN3: 6055 case AMDGPUISD::UMAX3: 6056 case AMDGPUISD::UMED3: { 6057 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1); 6058 if (Tmp2 == 1) 6059 return 1; // Early out. 6060 6061 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1); 6062 if (Tmp1 == 1) 6063 return 1; // Early out. 6064 6065 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 6066 if (Tmp0 == 1) 6067 return 1; // Early out. 6068 6069 return std::min({Tmp0, Tmp1, Tmp2}); 6070 } 6071 default: 6072 return 1; 6073 } 6074 } 6075 6076 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr( 6077 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, 6078 const MachineRegisterInfo &MRI, unsigned Depth) const { 6079 const MachineInstr *MI = MRI.getVRegDef(R); 6080 if (!MI) 6081 return 1; 6082 6083 // TODO: Check range metadata on MMO. 
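// These constants mirror the SelectionDAG cases above: a sign-extending byte
// load has 32 - 8 + 1 = 25 known sign bits and a short load 32 - 16 + 1 = 17,
// while the zero-extending forms have 24 and 16 leading zeros respectively.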
6084 switch (MI->getOpcode()) { 6085 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 6086 return 25; 6087 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 6088 return 17; 6089 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 6090 return 24; 6091 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 6092 return 16; 6093 case AMDGPU::G_AMDGPU_SMED3: 6094 case AMDGPU::G_AMDGPU_UMED3: { 6095 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); 6096 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1); 6097 if (Tmp2 == 1) 6098 return 1; 6099 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1); 6100 if (Tmp1 == 1) 6101 return 1; 6102 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1); 6103 if (Tmp0 == 1) 6104 return 1; 6105 return std::min({Tmp0, Tmp1, Tmp2}); 6106 } 6107 default: 6108 return 1; 6109 } 6110 } 6111 6112 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode( 6113 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN, 6114 unsigned Depth) const { 6115 unsigned Opcode = Op.getOpcode(); 6116 switch (Opcode) { 6117 case AMDGPUISD::FMIN_LEGACY: 6118 case AMDGPUISD::FMAX_LEGACY: { 6119 if (SNaN) 6120 return true; 6121 6122 // TODO: Can check no nans on one of the operands for each one, but which 6123 // one? 6124 return false; 6125 } 6126 case AMDGPUISD::FMUL_LEGACY: 6127 case AMDGPUISD::CVT_PKRTZ_F16_F32: { 6128 if (SNaN) 6129 return true; 6130 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 6131 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 6132 } 6133 case AMDGPUISD::FMED3: 6134 case AMDGPUISD::FMIN3: 6135 case AMDGPUISD::FMAX3: 6136 case AMDGPUISD::FMINIMUM3: 6137 case AMDGPUISD::FMAXIMUM3: 6138 case AMDGPUISD::FMAD_FTZ: { 6139 if (SNaN) 6140 return true; 6141 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 6142 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 6143 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 6144 } 6145 case AMDGPUISD::CVT_F32_UBYTE0: 6146 case AMDGPUISD::CVT_F32_UBYTE1: 6147 case AMDGPUISD::CVT_F32_UBYTE2: 6148 case AMDGPUISD::CVT_F32_UBYTE3: 6149 return true; 6150 6151 case AMDGPUISD::RCP: 6152 case AMDGPUISD::RSQ: 6153 case AMDGPUISD::RCP_LEGACY: 6154 case AMDGPUISD::RSQ_CLAMP: { 6155 if (SNaN) 6156 return true; 6157 6158 // TODO: Need is known positive check. 6159 return false; 6160 } 6161 case ISD::FLDEXP: 6162 case AMDGPUISD::FRACT: { 6163 if (SNaN) 6164 return true; 6165 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); 6166 } 6167 case AMDGPUISD::DIV_SCALE: 6168 case AMDGPUISD::DIV_FMAS: 6169 case AMDGPUISD::DIV_FIXUP: 6170 // TODO: Refine on operands. 
6171 return SNaN; 6172 case AMDGPUISD::SIN_HW: 6173 case AMDGPUISD::COS_HW: { 6174 // TODO: Need check for infinity 6175 return SNaN; 6176 } 6177 case ISD::INTRINSIC_WO_CHAIN: { 6178 unsigned IntrinsicID = Op.getConstantOperandVal(0); 6179 // TODO: Handle more intrinsics 6180 switch (IntrinsicID) { 6181 case Intrinsic::amdgcn_cubeid: 6182 case Intrinsic::amdgcn_cvt_off_f32_i4: 6183 return true; 6184 6185 case Intrinsic::amdgcn_frexp_mant: { 6186 if (SNaN) 6187 return true; 6188 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 6189 } 6190 case Intrinsic::amdgcn_cvt_pkrtz: { 6191 if (SNaN) 6192 return true; 6193 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 6194 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 6195 } 6196 case Intrinsic::amdgcn_rcp: 6197 case Intrinsic::amdgcn_rsq: 6198 case Intrinsic::amdgcn_rcp_legacy: 6199 case Intrinsic::amdgcn_rsq_legacy: 6200 case Intrinsic::amdgcn_rsq_clamp: 6201 case Intrinsic::amdgcn_tanh: { 6202 if (SNaN) 6203 return true; 6204 6205 // TODO: Need is known positive check. 6206 return false; 6207 } 6208 case Intrinsic::amdgcn_trig_preop: 6209 case Intrinsic::amdgcn_fdot2: 6210 // TODO: Refine on operand 6211 return SNaN; 6212 case Intrinsic::amdgcn_fma_legacy: 6213 if (SNaN) 6214 return true; 6215 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 6216 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) && 6217 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1); 6218 default: 6219 return false; 6220 } 6221 } 6222 default: 6223 return false; 6224 } 6225 } 6226 6227 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, 6228 Register N0, Register N1) const { 6229 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks 6230 } 6231