//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "AMDGPUMemoryUtils.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  if (StoreSize % 32 == 0)
    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);

  return VT;
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Enable ganging up loads and stores in the memcpy DAG lowering.
  MaxGluedStoresPerMemcpy = 16;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
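  // Illustrative sketch of what the Promote/AddPromotedToType pairs below do:
  // an f32 load is legalized as an i32 load whose result is bitcast back to
  // f32, so tablegen only needs integer load patterns, e.g. for
  //   %v = load float, ptr addrspace(1) %p
  // the DAG legalizer rewrites (f32 (load %p)) to (f32 (bitcast (i32 (load
  // %p)))). (Example is illustrative; the exact DAG is produced by the
  // legalizer.)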
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // TODO: Would be better to consume as directly legal
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
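  // For example, because ISD::FCEIL is marked Legal for f32 below, a call to
  // @llvm.ceil.f32 should select directly to a single hardware ceil
  // instruction rather than expanding into a libcall. (Illustrative; the
  // exact instruction chosen is determined by the tablegen patterns.)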
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC},
                     {MVT::f16, MVT::f32}, Legal);
  setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
  setOperationAction({ISD::LROUND, ISD::LLROUND},
                     {MVT::f16, MVT::f32, MVT::f64}, Expand);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
                     Expand);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
  } else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10},
                     MVT::f16, Custom);

  setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal);
  }

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(ISD::IS_FPCLASS,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
                      MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
                      MVT::v16f64},
                     Custom);

  if (isTypeLegal(MVT::f16))
    setOperationAction(ISD::IS_FPCLASS,
                       {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
                       Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
       MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
       MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
       MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
       MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = {MVT::i32, MVT::i64};
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
                        ISD::MULHS, ISD::OR, ISD::SHL,
                        ISD::SRA, ISD::SRL, ISD::ROTL,
                        ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
                        ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR, ISD::BSWAP, ISD::CTPOP,
                        ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
                        ISD::SETCC, ISD::ADDRSPACECAST},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
         ISD::FADD, ISD::FCEIL, ISD::FCOS,
         ISD::FDIV, ISD::FEXP2, ISD::FEXP,
         ISD::FEXP10, ISD::FLOG2, ISD::FREM,
         ISD::FLOG, ISD::FLOG10, ISD::FPOW,
         ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
         ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
         ISD::FSQRT, ISD::FSIN, ISD::FSUB,
         ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
         ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes us to use an unrolled select operation rather than expansion
  // with bit operations. This is in general better, but the alternative using
  // BFI instructions may be better if the select sources are SGPRs.
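  // Illustrative sketch: with the promotions below, a select on v2f32 is
  // rewritten to operate on v2i32 and then unrolled into two 32-bit selects
  // (v_cndmask_b32 on the VALU path), instead of being expanded into
  // and/or/not bit operations on the whole vector.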
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For
  // now, we don't have a way of knowing during instruction selection if a
  // condition will be uniform and we always use vector compares. Assume we are
  // using vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to
  // worry about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
                       ISD::SRA, ISD::SRL,
                       ISD::TRUNCATE, ISD::MUL,
                       ISD::SMUL_LOHI, ISD::UMUL_LOHI,
                       ISD::MULHU, ISD::MULHS,
                       ISD::SELECT, ISD::SELECT_CC,
                       ISD::STORE, ISD::FADD,
                       ISD::FSUB, ISD::FNEG,
                       ISD::FABS, ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the type for ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
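// Source modifiers let an instruction negate or take the absolute value of an
// operand for free, e.g. (illustrative assembly):
//   v_add_f32 v0, -v1, |v2|   ; computes (-v1) + |v2| in one instruction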
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and using a source modifier would force each of them into
  // a VOP3 encoding, there will be a code size increase. Try to avoid
  // increasing code size unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->users()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
  return 32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(
    SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPU::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in
  // continuing to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(
    EVT LoadTy, EVT CastTy, const SelectionDAG &DAG,
    const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably
// also profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(1);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any
  // vector operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into
  // a super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.
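  //
  // For example (illustrative), zext i32 %x to i64 materializes as the pair
  // {%x, 0}: the low 32 bits are already in a register, and the high half is
  // a single mov of zero.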

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                                 EVT DestVT) const {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (Subtarget->has16BitInsts() &&
        (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
      // Don't narrow back down to i16 if promoted to i32 already.
      if (!N->isDivergent() && DestVT.isInteger() &&
          DestVT.getScalarSizeInBits() > 1 &&
          DestVT.getScalarSizeInBits() <= 16 &&
          SrcVT.getScalarSizeInBits() > 16) {
        return false;
      }
    }
    return true;
  default:
    break;
  }

  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits
  // is not profitable, and may actually be harmful.
  if (isa<LoadSDNode>(N))
    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;

  return true;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  SDValue ShiftLHS = N->getOperand(0);
  if (!ShiftLHS->hasOneUse())
    return false;

  if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
      !ShiftLHS.getOperand(0)->hasOneUse())
    return false;

  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
      (N->user_begin()->getOpcode() == ISD::SRA ||
       N->user_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd &&
           LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() ==
               LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    reportFatalUsageError("unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    reportFatalUsageError("unsupported calling convention");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting
    // over to get accurate in-memory offsets. The "PartOffset" is completely
    // useless to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() >
               RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different element size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
    SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  //       "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load that overlaps the
  // clobbered object.
  for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(
    CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals,
    StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));

  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(DAG.getPOISON(Arg.VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
      Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
      return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds" &&
        !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning));

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getPOISON(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will be diagnosed during assembly emission anyway.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ?
MVT::i32 1575 : EVT::getVectorVT(*DAG.getContext(), 1576 MVT::i32, NewNumElt); 1577 for (const SDUse &U : Op->ops()) { 1578 SDValue In = U.get(); 1579 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In); 1580 if (NewNumElt > 1) 1581 DAG.ExtractVectorElements(NewIn, Args); 1582 else 1583 Args.push_back(NewIn); 1584 } 1585 1586 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 1587 NewNumElt * Op.getNumOperands()); 1588 SDValue BV = DAG.getBuildVector(NewVT, SL, Args); 1589 return DAG.getNode(ISD::BITCAST, SL, VT, BV); 1590 } 1591 } 1592 1593 for (const SDUse &U : Op->ops()) 1594 DAG.ExtractVectorElements(U.get(), Args); 1595 1596 return DAG.getBuildVector(Op.getValueType(), SL, Args); 1597 } 1598 1599 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 1600 SelectionDAG &DAG) const { 1601 SDLoc SL(Op); 1602 SmallVector<SDValue, 8> Args; 1603 unsigned Start = Op.getConstantOperandVal(1); 1604 EVT VT = Op.getValueType(); 1605 EVT SrcVT = Op.getOperand(0).getValueType(); 1606 1607 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) { 1608 unsigned NumElt = VT.getVectorNumElements(); 1609 unsigned NumSrcElt = SrcVT.getVectorNumElements(); 1610 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types"); 1611 1612 // Extract 32-bit registers at a time. 1613 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2); 1614 EVT NewVT = NumElt == 2 1615 ? MVT::i32 1616 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2); 1617 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0)); 1618 1619 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2); 1620 if (NumElt == 2) 1621 Tmp = Args[0]; 1622 else 1623 Tmp = DAG.getBuildVector(NewVT, SL, Args); 1624 1625 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp); 1626 } 1627 1628 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, 1629 VT.getVectorNumElements()); 1630 1631 return DAG.getBuildVector(Op.getValueType(), SL, Args); 1632 } 1633 1634 // TODO: Handle fabs too 1635 static SDValue peekFNeg(SDValue Val) { 1636 if (Val.getOpcode() == ISD::FNEG) 1637 return Val.getOperand(0); 1638 1639 return Val; 1640 } 1641 1642 static SDValue peekFPSignOps(SDValue Val) { 1643 if (Val.getOpcode() == ISD::FNEG) 1644 Val = Val.getOperand(0); 1645 if (Val.getOpcode() == ISD::FABS) 1646 Val = Val.getOperand(0); 1647 if (Val.getOpcode() == ISD::FCOPYSIGN) 1648 Val = Val.getOperand(0); 1649 return Val; 1650 } 1651 1652 SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl( 1653 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, 1654 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const { 1655 SelectionDAG &DAG = DCI.DAG; 1656 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1657 switch (CCOpcode) { 1658 case ISD::SETOEQ: 1659 case ISD::SETONE: 1660 case ISD::SETUNE: 1661 case ISD::SETNE: 1662 case ISD::SETUEQ: 1663 case ISD::SETEQ: 1664 case ISD::SETFALSE: 1665 case ISD::SETFALSE2: 1666 case ISD::SETTRUE: 1667 case ISD::SETTRUE2: 1668 case ISD::SETUO: 1669 case ISD::SETO: 1670 break; 1671 case ISD::SETULE: 1672 case ISD::SETULT: { 1673 if (LHS == True) 1674 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); 1675 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); 1676 } 1677 case ISD::SETOLE: 1678 case ISD::SETOLT: 1679 case ISD::SETLE: 1680 case ISD::SETLT: { 1681 // Ordered. Assume ordered for undefined. 1682 1683 // Only do this after legalization to avoid interfering with other combines 1684 // which might occur. 
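    // Reminder: fmin_legacy(a, b) behaves like (a < b) ? a : b, so when either
    // input is NaN the compare fails and the second operand is returned. For
    // example, "select (setolt x, y), x, y" yields y when x is NaN, matching
    // fmin_legacy(x, y).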
1685 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && 1686 !DCI.isCalledByLegalizer()) 1687 return SDValue(); 1688 1689 // We need to permute the operands to get the correct NaN behavior. The 1690 // selected operand is the second one based on the failing compare with NaN, 1691 // so permute it based on the compare type the hardware uses. 1692 if (LHS == True) 1693 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); 1694 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); 1695 } 1696 case ISD::SETUGE: 1697 case ISD::SETUGT: { 1698 if (LHS == True) 1699 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); 1700 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); 1701 } 1702 case ISD::SETGT: 1703 case ISD::SETGE: 1704 case ISD::SETOGE: 1705 case ISD::SETOGT: { 1706 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && 1707 !DCI.isCalledByLegalizer()) 1708 return SDValue(); 1709 1710 if (LHS == True) 1711 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); 1712 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); 1713 } 1714 case ISD::SETCC_INVALID: 1715 llvm_unreachable("Invalid setcc condcode!"); 1716 } 1717 return SDValue(); 1718 } 1719 1720 /// Generate Min/Max node 1721 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, 1722 SDValue LHS, SDValue RHS, 1723 SDValue True, SDValue False, 1724 SDValue CC, 1725 DAGCombinerInfo &DCI) const { 1726 if ((LHS == True && RHS == False) || (LHS == False && RHS == True)) 1727 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI); 1728 1729 SelectionDAG &DAG = DCI.DAG; 1730 1731 // If we can't directly match this, try to see if we can fold an fneg to 1732 // match. 1733 1734 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 1735 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False); 1736 SDValue NegTrue = peekFNeg(True); 1737 1738 // Undo the combine foldFreeOpFromSelect does if it helps us match the 1739 // fmin/fmax. 
1740 // 1741 // select (fcmp olt (lhs, K)), (fneg lhs), -K 1742 // -> fneg (fmin_legacy lhs, K) 1743 // 1744 // TODO: Use getNegatedExpression 1745 if (LHS == NegTrue && CFalse && CRHS) { 1746 APFloat NegRHS = neg(CRHS->getValueAPF()); 1747 if (NegRHS == CFalse->getValueAPF()) { 1748 SDValue Combined = 1749 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI); 1750 if (Combined) 1751 return DAG.getNode(ISD::FNEG, DL, VT, Combined); 1752 return SDValue(); 1753 } 1754 } 1755 1756 return SDValue(); 1757 } 1758 1759 std::pair<SDValue, SDValue> 1760 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const { 1761 SDLoc SL(Op); 1762 1763 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1764 1765 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 1766 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 1767 1768 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); 1769 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); 1770 1771 return std::pair(Lo, Hi); 1772 } 1773 1774 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const { 1775 SDLoc SL(Op); 1776 1777 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1778 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 1779 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); 1780 } 1781 1782 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const { 1783 SDLoc SL(Op); 1784 1785 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1786 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 1787 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); 1788 } 1789 1790 // Split a vector type into two parts. The first part is a power of two vector. 1791 // The second part is whatever is left over, and is a scalar if it would 1792 // otherwise be a 1-vector. 1793 std::pair<EVT, EVT> 1794 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const { 1795 EVT LoVT, HiVT; 1796 EVT EltVT = VT.getVectorElementType(); 1797 unsigned NumElts = VT.getVectorNumElements(); 1798 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2); 1799 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts); 1800 HiVT = NumElts - LoNumElts == 1 1801 ? EltVT 1802 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts); 1803 return std::pair(LoVT, HiVT); 1804 } 1805 1806 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be 1807 // scalar. 1808 std::pair<SDValue, SDValue> 1809 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, 1810 const EVT &LoVT, const EVT &HiVT, 1811 SelectionDAG &DAG) const { 1812 assert(LoVT.getVectorNumElements() + 1813 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <= 1814 N.getValueType().getVectorNumElements() && 1815 "More vector elements requested than available!"); 1816 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, 1817 DAG.getVectorIdxConstant(0, DL)); 1818 SDValue Hi = DAG.getNode( 1819 HiVT.isVector() ? 
ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case where the vector length is a power of two, so it was
    // evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
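  // For example, a <3 x i32> load with 16-byte alignment is widened to a
  // <4 x i32> load plus an EXTRACT_SUBVECTOR, while a 4-byte-aligned one is
  // split into a <2 x i32> load and a scalar i32 load.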
1897 if (NumElements != 3 || 1898 (BaseAlign < Align(8) && 1899 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout()))) 1900 return SplitVectorLoad(Op, DAG); 1901 1902 assert(NumElements == 3); 1903 1904 EVT WideVT = 1905 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); 1906 EVT WideMemVT = 1907 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4); 1908 SDValue WideLoad = DAG.getExtLoad( 1909 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue, 1910 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags()); 1911 return DAG.getMergeValues( 1912 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad, 1913 DAG.getVectorIdxConstant(0, SL)), 1914 WideLoad.getValue(1)}, 1915 SL); 1916 } 1917 1918 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, 1919 SelectionDAG &DAG) const { 1920 StoreSDNode *Store = cast<StoreSDNode>(Op); 1921 SDValue Val = Store->getValue(); 1922 EVT VT = Val.getValueType(); 1923 1924 // If this is a 2 element vector, we really want to scalarize and not create 1925 // weird 1 element vectors. 1926 if (VT.getVectorNumElements() == 2) 1927 return scalarizeVectorStore(Store, DAG); 1928 1929 EVT MemVT = Store->getMemoryVT(); 1930 SDValue Chain = Store->getChain(); 1931 SDValue BasePtr = Store->getBasePtr(); 1932 SDLoc SL(Op); 1933 1934 EVT LoVT, HiVT; 1935 EVT LoMemVT, HiMemVT; 1936 SDValue Lo, Hi; 1937 1938 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); 1939 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); 1940 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG); 1941 1942 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); 1943 1944 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); 1945 Align BaseAlign = Store->getAlign(); 1946 unsigned Size = LoMemVT.getStoreSize(); 1947 Align HiAlign = commonAlignment(BaseAlign, Size); 1948 1949 SDValue LoStore = 1950 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, 1951 Store->getMemOperand()->getFlags()); 1952 SDValue HiStore = 1953 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), 1954 HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); 1955 1956 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); 1957 } 1958 1959 // This is a shortcut for integer division because we have fast i32<->f32 1960 // conversions, and fast f32 reciprocal instructions. The fractional part of a 1961 // float is enough to accurately represent up to a 24-bit signed integer. 1962 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, 1963 bool Sign) const { 1964 SDLoc DL(Op); 1965 EVT VT = Op.getValueType(); 1966 SDValue LHS = Op.getOperand(0); 1967 SDValue RHS = Op.getOperand(1); 1968 MVT IntVT = MVT::i32; 1969 MVT FltVT = MVT::f32; 1970 1971 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS); 1972 if (LHSSignBits < 9) 1973 return SDValue(); 1974 1975 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS); 1976 if (RHSSignBits < 9) 1977 return SDValue(); 1978 1979 unsigned BitSize = VT.getSizeInBits(); 1980 unsigned SignBits = std::min(LHSSignBits, RHSSignBits); 1981 unsigned DivBits = BitSize - SignBits; 1982 if (Sign) 1983 ++DivBits; 1984 1985 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; 1986 ISD::NodeType ToInt = Sign ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib = (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  MachineFunction &MF = DAG.getMachineFunction();

  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    UseFmadFtz =
        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
  }

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
                    : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
                                 : (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation; it's easier to recompute it.
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to the number of bits this divide really needs.
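  // For example, for an i32 sdiv where both operands have at least 25 sign
  // bits: SignBits = 25, DivBits = 32 - 25 + 1 = 8, and the results below are
  // sign-extended in-register from i8.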
2063 if (Sign) { 2064 SDValue InRegSize 2065 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits)); 2066 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize); 2067 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize); 2068 } else { 2069 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT); 2070 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask); 2071 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask); 2072 } 2073 2074 return DAG.getMergeValues({ Div, Rem }, DL); 2075 } 2076 2077 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, 2078 SelectionDAG &DAG, 2079 SmallVectorImpl<SDValue> &Results) const { 2080 SDLoc DL(Op); 2081 EVT VT = Op.getValueType(); 2082 2083 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64"); 2084 2085 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 2086 2087 SDValue One = DAG.getConstant(1, DL, HalfVT); 2088 SDValue Zero = DAG.getConstant(0, DL, HalfVT); 2089 2090 //HiLo split 2091 SDValue LHS_Lo, LHS_Hi; 2092 SDValue LHS = Op.getOperand(0); 2093 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT); 2094 2095 SDValue RHS_Lo, RHS_Hi; 2096 SDValue RHS = Op.getOperand(1); 2097 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT); 2098 2099 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && 2100 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { 2101 2102 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 2103 LHS_Lo, RHS_Lo); 2104 2105 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero}); 2106 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero}); 2107 2108 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV)); 2109 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM)); 2110 return; 2111 } 2112 2113 if (isTypeLegal(MVT::i64)) { 2114 // The algorithm here is based on ideas from "Software Integer Division", 2115 // Tom Rodeheffer, August 2008. 2116 2117 MachineFunction &MF = DAG.getMachineFunction(); 2118 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2119 2120 // Compute denominator reciprocal. 2121 unsigned FMAD = 2122 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA 2123 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign() 2124 ? 
(unsigned)ISD::FMAD 2125 : (unsigned)AMDGPUISD::FMAD_FTZ; 2126 2127 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); 2128 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); 2129 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi, 2130 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32), 2131 Cvt_Lo); 2132 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1); 2133 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp, 2134 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32)); 2135 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1, 2136 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32)); 2137 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2); 2138 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc, 2139 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32), 2140 Mul1); 2141 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2); 2142 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc); 2143 SDValue Rcp64 = DAG.getBitcast(VT, 2144 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi})); 2145 2146 SDValue Zero64 = DAG.getConstant(0, DL, VT); 2147 SDValue One64 = DAG.getConstant(1, DL, VT); 2148 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1); 2149 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1); 2150 2151 // First round of UNR (Unsigned integer Newton-Raphson). 2152 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS); 2153 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64); 2154 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1); 2155 SDValue Mulhi1_Lo, Mulhi1_Hi; 2156 std::tie(Mulhi1_Lo, Mulhi1_Hi) = 2157 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT); 2158 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo, 2159 Mulhi1_Lo, Zero1); 2160 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi, 2161 Mulhi1_Hi, Add1_Lo.getValue(1)); 2162 SDValue Add1 = DAG.getBitcast(VT, 2163 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi})); 2164 2165 // Second round of UNR. 
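    // Each UNR round refines the 64-bit fixed-point reciprocal estimate r of
    // 2^64 / RHS as r' = r + mulhu(r, r * -RHS), i.e.
    // r' = r + ((r * (2^64 - r * RHS)) >> 64), roughly doubling the number of
    // accurate bits per round.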
    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo, Mulhi2_Hi;
    std::tie(Mulhi2_Lo, Mulhi2_Hi) =
        DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
    SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
                                  Mulhi2_Hi, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));

    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    SDValue Mul3_Lo, Mul3_Hi;
    std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
    SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below, portions of the code could be enclosed in
    // if/endif. Currently the control flow is unconditional, and we have four
    // selects after the potential endif to substitute for PHIs.

    // if C3 != 0 ...
    SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
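  // Fall back to restoring long division, producing one quotient bit per
  // iteration: shift the partial remainder left, bring in the next bit of
  // LHS_Lo, and subtract RHS whenever the remainder stays non-negative.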
2249 // Get Speculative values 2250 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); 2251 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); 2252 2253 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ); 2254 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero}); 2255 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); 2256 2257 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ); 2258 SDValue DIV_Lo = Zero; 2259 2260 const unsigned halfBitWidth = HalfVT.getSizeInBits(); 2261 2262 for (unsigned i = 0; i < halfBitWidth; ++i) { 2263 const unsigned bitPos = halfBitWidth - i - 1; 2264 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); 2265 // Get value of high bit 2266 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); 2267 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One); 2268 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); 2269 2270 // Shift 2271 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); 2272 // Add LHS high bit 2273 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); 2274 2275 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); 2276 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE); 2277 2278 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); 2279 2280 // Update REM 2281 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); 2282 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); 2283 } 2284 2285 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi}); 2286 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV); 2287 Results.push_back(DIV); 2288 Results.push_back(REM); 2289 } 2290 2291 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, 2292 SelectionDAG &DAG) const { 2293 SDLoc DL(Op); 2294 EVT VT = Op.getValueType(); 2295 2296 if (VT == MVT::i64) { 2297 SmallVector<SDValue, 2> Results; 2298 LowerUDIVREM64(Op, DAG, Results); 2299 return DAG.getMergeValues(Results, DL); 2300 } 2301 2302 if (VT == MVT::i32) { 2303 if (SDValue Res = LowerDIVREM24(Op, DAG, false)) 2304 return Res; 2305 } 2306 2307 SDValue X = Op.getOperand(0); 2308 SDValue Y = Op.getOperand(1); 2309 2310 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2311 // algorithm used here. 2312 2313 // Initial estimate of inv(y). 2314 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y); 2315 2316 // One round of UNR. 2317 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y); 2318 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z); 2319 Z = DAG.getNode(ISD::ADD, DL, VT, Z, 2320 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ)); 2321 2322 // Quotient/remainder estimate. 2323 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z); 2324 SDValue R = 2325 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y)); 2326 2327 // First quotient/remainder refinement. 2328 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2329 SDValue One = DAG.getConstant(1, DL, VT); 2330 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE); 2331 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2332 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q); 2333 R = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2334 DAG.getNode(ISD::SUB, DL, VT, R, Y), R); 2335 2336 // Second quotient/remainder refinement. 
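  // The quotient estimate never exceeds the true quotient, and each refinement
  // conditionally bumps Q by one while subtracting Y from R; two rounds are
  // enough for an exact result (see the analysis in expandDivRem32).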
2337 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE); 2338 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2339 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q); 2340 R = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2341 DAG.getNode(ISD::SUB, DL, VT, R, Y), R); 2342 2343 return DAG.getMergeValues({Q, R}, DL); 2344 } 2345 2346 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, 2347 SelectionDAG &DAG) const { 2348 SDLoc DL(Op); 2349 EVT VT = Op.getValueType(); 2350 2351 SDValue LHS = Op.getOperand(0); 2352 SDValue RHS = Op.getOperand(1); 2353 2354 SDValue Zero = DAG.getConstant(0, DL, VT); 2355 SDValue NegOne = DAG.getAllOnesConstant(DL, VT); 2356 2357 if (VT == MVT::i32) { 2358 if (SDValue Res = LowerDIVREM24(Op, DAG, true)) 2359 return Res; 2360 } 2361 2362 if (VT == MVT::i64 && 2363 DAG.ComputeNumSignBits(LHS) > 32 && 2364 DAG.ComputeNumSignBits(RHS) > 32) { 2365 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 2366 2367 //HiLo split 2368 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); 2369 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); 2370 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 2371 LHS_Lo, RHS_Lo); 2372 SDValue Res[2] = { 2373 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), 2374 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) 2375 }; 2376 return DAG.getMergeValues(Res, DL); 2377 } 2378 2379 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); 2380 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); 2381 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); 2382 SDValue RSign = LHSign; // Remainder sign is the same as LHS 2383 2384 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); 2385 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); 2386 2387 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); 2388 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); 2389 2390 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); 2391 SDValue Rem = Div.getValue(1); 2392 2393 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); 2394 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); 2395 2396 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); 2397 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); 2398 2399 SDValue Res[2] = { 2400 Div, 2401 Rem 2402 }; 2403 return DAG.getMergeValues(Res, DL); 2404 } 2405 2406 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x) 2407 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { 2408 SDLoc SL(Op); 2409 EVT VT = Op.getValueType(); 2410 auto Flags = Op->getFlags(); 2411 SDValue X = Op.getOperand(0); 2412 SDValue Y = Op.getOperand(1); 2413 2414 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags); 2415 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags); 2416 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags); 2417 // TODO: For f32 use FMAD instead if !hasFastFMA32? 
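  // Worked example: frem(5.5, 2.0) -> fdiv = 2.75, ftrunc = 2.0, and
  // fma(-2.0, 2.0, 5.5) = 1.5.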
  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
}

SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
                                  SelectionDAG &DAG) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                Hi,
                                DAG.getConstant(FractBits - 32, SL, MVT::i32),
                                DAG.getConstant(ExpBits, SL, MVT::i32));
  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
                            DAG.getConstant(1023, SL, MVT::i32));

  return Exp;
}

SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = getHiHalf64(Src, DAG);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);

  // Extend back to 64-bits.
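  // (Bit 63 of the double lives in bit 31 of the high word, so pairing the
  // masked sign with a zero low word rebuilds a 64-bit signed zero; that is
  // the result selected below whenever the exponent is negative.)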
2482 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit}); 2483 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); 2484 2485 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); 2486 const SDValue FractMask 2487 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); 2488 2489 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); 2490 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); 2491 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); 2492 2493 EVT SetCCVT = 2494 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); 2495 2496 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); 2497 2498 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); 2499 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); 2500 2501 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); 2502 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); 2503 2504 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); 2505 } 2506 2507 SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op, 2508 SelectionDAG &DAG) const { 2509 SDLoc SL(Op); 2510 SDValue Src = Op.getOperand(0); 2511 2512 assert(Op.getValueType() == MVT::f64); 2513 2514 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2515 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); 2516 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); 2517 2518 // TODO: Should this propagate fast-math-flags? 2519 2520 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); 2521 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); 2522 2523 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); 2524 2525 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2526 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); 2527 2528 EVT SetCCVT = 2529 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2530 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); 2531 2532 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); 2533 } 2534 2535 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, 2536 SelectionDAG &DAG) const { 2537 // FNEARBYINT and FRINT are the same, except in their handling of FP 2538 // exceptions. Those aren't really meaningful for us, and OpenCL only has 2539 // rint, so just treat them as equivalent. 2540 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(), 2541 Op.getOperand(0)); 2542 } 2543 2544 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { 2545 auto VT = Op.getValueType(); 2546 auto Arg = Op.getOperand(0u); 2547 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg); 2548 } 2549 2550 // XXX - May require not supporting f32 denormals? 2551 2552 // Don't handle v2f16. The extra instructions to scalarize and repack around the 2553 // compare and vselect end up producing worse code than scalarizing the whole 2554 // operation. 2555 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2556 SDLoc SL(Op); 2557 SDValue X = Op.getOperand(0); 2558 EVT VT = Op.getValueType(); 2559 2560 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X); 2561 2562 // TODO: Should this propagate fast-math-flags? 
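  // round(x) is computed as trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1.0 :
  // 0.0, x), which rounds halfway cases away from zero.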
2563 2564 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T); 2565 2566 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff); 2567 2568 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2569 const SDValue One = DAG.getConstantFP(1.0, SL, VT); 2570 2571 EVT SetCCVT = 2572 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2573 2574 const SDValue Half = DAG.getConstantFP(0.5, SL, VT); 2575 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); 2576 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero); 2577 2578 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X); 2579 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset); 2580 } 2581 2582 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { 2583 SDLoc SL(Op); 2584 SDValue Src = Op.getOperand(0); 2585 2586 // result = trunc(src); 2587 // if (src < 0.0 && src != result) 2588 // result += -1.0. 2589 2590 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2591 2592 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); 2593 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); 2594 2595 EVT SetCCVT = 2596 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2597 2598 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); 2599 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 2600 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 2601 2602 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); 2603 // TODO: Should this propagate fast-math-flags? 2604 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 2605 } 2606 2607 /// Return true if it's known that \p Src can never be an f32 denormal value. 2608 static bool valueIsKnownNeverF32Denorm(SDValue Src) { 2609 switch (Src.getOpcode()) { 2610 case ISD::FP_EXTEND: 2611 return Src.getOperand(0).getValueType() == MVT::f16; 2612 case ISD::FP16_TO_FP: 2613 case ISD::FFREXP: 2614 return true; 2615 case ISD::INTRINSIC_WO_CHAIN: { 2616 unsigned IntrinsicID = Src.getConstantOperandVal(0); 2617 switch (IntrinsicID) { 2618 case Intrinsic::amdgcn_frexp_mant: 2619 return true; 2620 default: 2621 return false; 2622 } 2623 } 2624 default: 2625 return false; 2626 } 2627 2628 llvm_unreachable("covered opcode switch"); 2629 } 2630 2631 bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG, 2632 SDNodeFlags Flags) { 2633 if (Flags.hasApproximateFuncs()) 2634 return true; 2635 auto &Options = DAG.getTarget().Options; 2636 return Options.UnsafeFPMath || Options.ApproxFuncFPMath; 2637 } 2638 2639 bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG, 2640 SDValue Src, 2641 SDNodeFlags Flags) { 2642 return !valueIsKnownNeverF32Denorm(Src) && 2643 DAG.getMachineFunction() 2644 .getDenormalMode(APFloat::IEEEsingle()) 2645 .Input != DenormalMode::PreserveSign; 2646 } 2647 2648 SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG, 2649 SDValue Src, 2650 SDNodeFlags Flags) const { 2651 SDLoc SL(Src); 2652 EVT VT = Src.getValueType(); 2653 const fltSemantics &Semantics = VT.getFltSemantics(); 2654 SDValue SmallestNormal = 2655 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT); 2656 2657 // Want to scale denormals up, but negatives and 0 work just as well on the 2658 // scaled path. 
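  // Note that SETOLT is also true for -inf, negative values, and +/-0.0, so
  // those all take the scaled path too; this is harmless since the log of a
  // non-positive input is -inf or NaN regardless of scaling.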
2659 SDValue IsLtSmallestNormal = DAG.getSetCC( 2660 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src, 2661 SmallestNormal, ISD::SETOLT); 2662 2663 return IsLtSmallestNormal; 2664 } 2665 2666 SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src, 2667 SDNodeFlags Flags) const { 2668 SDLoc SL(Src); 2669 EVT VT = Src.getValueType(); 2670 const fltSemantics &Semantics = VT.getFltSemantics(); 2671 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT); 2672 2673 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags); 2674 SDValue IsFinite = DAG.getSetCC( 2675 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs, 2676 Inf, ISD::SETOLT); 2677 return IsFinite; 2678 } 2679 2680 /// If denormal handling is required return the scaled input to FLOG2, and the 2681 /// check for denormal range. Otherwise, return null values. 2682 std::pair<SDValue, SDValue> 2683 AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, 2684 SDValue Src, SDNodeFlags Flags) const { 2685 if (!needsDenormHandlingF32(DAG, Src, Flags)) 2686 return {}; 2687 2688 MVT VT = MVT::f32; 2689 const fltSemantics &Semantics = APFloat::IEEEsingle(); 2690 SDValue SmallestNormal = 2691 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT); 2692 2693 SDValue IsLtSmallestNormal = DAG.getSetCC( 2694 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src, 2695 SmallestNormal, ISD::SETOLT); 2696 2697 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT); 2698 SDValue One = DAG.getConstantFP(1.0, SL, VT); 2699 SDValue ScaleFactor = 2700 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags); 2701 2702 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags); 2703 return {ScaledInput, IsLtSmallestNormal}; 2704 } 2705 2706 SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const { 2707 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. 2708 // If we have to handle denormals, scale up the input and adjust the result. 2709 2710 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) 2711 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0) 2712 2713 SDLoc SL(Op); 2714 EVT VT = Op.getValueType(); 2715 SDValue Src = Op.getOperand(0); 2716 SDNodeFlags Flags = Op->getFlags(); 2717 2718 if (VT == MVT::f16) { 2719 // Nothing in half is a denormal when promoted to f32. 
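    // (The smallest f16 denormal is 2^-24, far above the f32 normal threshold
    // of 2^-126.)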
2720 assert(!Subtarget->has16BitInsts()); 2721 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); 2722 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags); 2723 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, 2724 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 2725 } 2726 2727 auto [ScaledInput, IsLtSmallestNormal] = 2728 getScaledLogInput(DAG, SL, Src, Flags); 2729 if (!ScaledInput) 2730 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags); 2731 2732 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags); 2733 2734 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT); 2735 SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2736 SDValue ResultOffset = 2737 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero); 2738 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags); 2739 } 2740 2741 static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, 2742 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) { 2743 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags); 2744 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags); 2745 } 2746 2747 SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, 2748 SelectionDAG &DAG) const { 2749 SDValue X = Op.getOperand(0); 2750 EVT VT = Op.getValueType(); 2751 SDNodeFlags Flags = Op->getFlags(); 2752 SDLoc DL(Op); 2753 2754 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10; 2755 assert(IsLog10 || Op.getOpcode() == ISD::FLOG); 2756 2757 const auto &Options = getTargetMachine().Options; 2758 if (VT == MVT::f16 || Flags.hasApproximateFuncs() || 2759 Options.ApproxFuncFPMath || Options.UnsafeFPMath) { 2760 2761 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { 2762 // Log and multiply in f32 is good enough for f16. 2763 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags); 2764 } 2765 2766 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags); 2767 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { 2768 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered, 2769 DAG.getTargetConstant(0, DL, MVT::i32), Flags); 2770 } 2771 2772 return Lowered; 2773 } 2774 2775 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags); 2776 if (ScaledInput) 2777 X = ScaledInput; 2778 2779 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags); 2780 2781 SDValue R; 2782 if (Subtarget->hasFastFMAF32()) { 2783 // c+cc are ln(2)/ln(10) to more than 49 bits 2784 const float c_log10 = 0x1.344134p-2f; 2785 const float cc_log10 = 0x1.09f79ep-26f; 2786 2787 // c + cc is ln(2) to more than 49 bits 2788 const float c_log = 0x1.62e42ep-1f; 2789 const float cc_log = 0x1.efa39ep-25f; 2790 2791 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); 2792 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); 2793 2794 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); 2795 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); 2796 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags); 2797 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags); 2798 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags); 2799 } else { 2800 // ch+ct is ln(2)/ln(10) to more than 36 bits 2801 const float ch_log10 = 0x1.344000p-2f; 2802 const float ct_log10 = 0x1.3509f6p-18f; 2803 2804 // ch + ct is ln(2) to more than 36 bits 2805 const float ch_log = 0x1.62e000p-1f; 2806 const float ct_log = 0x1.0bfbe8p-15f; 2807 2808 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT); 2809 SDValue CT = DAG.getConstantFP(IsLog10 ? 
ct_log10 : ct_log, DL, VT);

    SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
    SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
    SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
    SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
    SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);

    SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
    SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
    SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
    R = getMad(DAG, DL, VT, YH, CH, Mad1);
  }

  const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
                            (Flags.hasNoInfs() || Options.NoInfsFPMath);

  // TODO: Check if known finite from source value.
  if (!IsFiniteOnly) {
    SDValue IsFinite = getIsFinite(DAG, Y, Flags);
    R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
  }

  if (IsScaled) {
    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue ShiftK =
        DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
    SDValue Shift =
        DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
    R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
  }

  return R;
}

SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
  return LowerFLOGCommon(Op, DAG);
}

// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for
// a promoted f16 operation.
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
                                              SelectionDAG &DAG, bool IsLog10,
                                              SDNodeFlags Flags) const {
  EVT VT = Src.getValueType();
  unsigned LogOp =
      VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;

  double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  if (VT == MVT::f32) {
    auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
    if (ScaledInput) {
      SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
      SDValue ScaledResultOffset =
          DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);

      SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);

      SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
                                         ScaledResultOffset, Zero, Flags);

      SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);

      if (Subtarget->hasFastFMAF32())
        return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
                           Flags);
      SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
      return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
    }
  }

  SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);

  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
                     Flags);
}

SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
2900 assert(!Subtarget->has16BitInsts()); 2901 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); 2902 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags); 2903 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, 2904 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 2905 } 2906 2907 assert(VT == MVT::f32); 2908 2909 if (!needsDenormHandlingF32(DAG, Src, Flags)) 2910 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags); 2911 2912 // bool needs_scaling = x < -0x1.f80000p+6f; 2913 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); 2914 2915 // -nextafter(128.0, -1) 2916 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT); 2917 2918 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2919 2920 SDValue NeedsScaling = 2921 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT); 2922 2923 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT); 2924 SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2925 2926 SDValue AddOffset = 2927 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero); 2928 2929 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags); 2930 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags); 2931 2932 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT); 2933 SDValue One = DAG.getConstantFP(1.0, SL, VT); 2934 SDValue ResultScale = 2935 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One); 2936 2937 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags); 2938 } 2939 2940 SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL, 2941 SelectionDAG &DAG, 2942 SDNodeFlags Flags) const { 2943 EVT VT = X.getValueType(); 2944 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT); 2945 2946 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { 2947 // exp2(M_LOG2E_F * f); 2948 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags); 2949 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP 2950 : (unsigned)ISD::FEXP2, 2951 SL, VT, Mul, Flags); 2952 } 2953 2954 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2955 2956 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT); 2957 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT); 2958 2959 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT); 2960 2961 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags); 2962 2963 SDValue AdjustedX = 2964 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); 2965 2966 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags); 2967 2968 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags); 2969 2970 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT); 2971 SDValue AdjustedResult = 2972 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags); 2973 2974 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2, 2975 Flags); 2976 } 2977 2978 /// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be 2979 /// handled correctly. 2980 SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL, 2981 SelectionDAG &DAG, 2982 SDNodeFlags Flags) const { 2983 const EVT VT = X.getValueType(); 2984 const unsigned Exp2Op = VT == MVT::f32 ? 
static_cast<unsigned>(AMDGPUISD::EXP) 2985 : static_cast<unsigned>(ISD::FEXP2); 2986 2987 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { 2988 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f); 2989 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT); 2990 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT); 2991 2992 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags); 2993 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags); 2994 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags); 2995 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags); 2996 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1); 2997 } 2998 2999 // bool s = x < -0x1.2f7030p+5f; 3000 // x += s ? 0x1.0p+5f : 0.0f; 3001 // exp10 = exp2(x * 0x1.a92000p+1f) * 3002 // exp2(x * 0x1.4f0978p-11f) * 3003 // (s ? 0x1.9f623ep-107f : 1.0f); 3004 3005 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 3006 3007 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT); 3008 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT); 3009 3010 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT); 3011 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags); 3012 SDValue AdjustedX = 3013 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); 3014 3015 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT); 3016 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT); 3017 3018 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags); 3019 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags); 3020 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags); 3021 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags); 3022 3023 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags); 3024 3025 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT); 3026 SDValue AdjustedResult = 3027 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags); 3028 3029 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps, 3030 Flags); 3031 } 3032 3033 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { 3034 EVT VT = Op.getValueType(); 3035 SDLoc SL(Op); 3036 SDValue X = Op.getOperand(0); 3037 SDNodeFlags Flags = Op->getFlags(); 3038 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10; 3039 3040 if (VT.getScalarType() == MVT::f16) { 3041 // v_exp_f16 (fmul x, log2e) 3042 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast? 3043 return lowerFEXPUnsafe(X, SL, DAG, Flags); 3044 3045 if (VT.isVector()) 3046 return SDValue(); 3047 3048 // exp(f16 x) -> 3049 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 3050 3051 // Nothing in half is a denormal when promoted to f32. 3052 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags); 3053 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags); 3054 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered, 3055 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 3056 } 3057 3058 assert(VT == MVT::f32); 3059 3060 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 3061 // library behavior. Also, is known-not-daz source sufficient? 3062 if (allowApproxFunc(DAG, Flags)) { 3063 return IsExp10 ? 
lowerFEXP10Unsafe(X, SL, DAG, Flags) 3064 : lowerFEXPUnsafe(X, SL, DAG, Flags); 3065 } 3066 3067 // Algorithm: 3068 // 3069 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 3070 // 3071 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 3072 // n = 64*m + j, 0 <= j < 64 3073 // 3074 // e^x = 2^((64*m + j + f)/64) 3075 // = (2^m) * (2^(j/64)) * 2^(f/64) 3076 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 3077 // 3078 // f = x*(64/ln(2)) - n 3079 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 3080 // 3081 // e^x = (2^m) * (2^(j/64)) * e^r 3082 // 3083 // (2^(j/64)) is precomputed 3084 // 3085 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3086 // e^r = 1 + q 3087 // 3088 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3089 // 3090 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 3091 SDNodeFlags FlagsNoContract = Flags; 3092 FlagsNoContract.setAllowContract(false); 3093 3094 SDValue PH, PL; 3095 if (Subtarget->hasFastFMAF32()) { 3096 const float c_exp = numbers::log2ef; 3097 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 3098 const float c_exp10 = 0x1.a934f0p+1f; 3099 const float cc_exp10 = 0x1.2f346ep-24f; 3100 3101 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT); 3102 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT); 3103 3104 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags); 3105 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags); 3106 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags); 3107 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags); 3108 } else { 3109 const float ch_exp = 0x1.714000p+0f; 3110 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3111 3112 const float ch_exp10 = 0x1.a92000p+1f; 3113 const float cl_exp10 = 0x1.4f0978p-11f; 3114 3115 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT); 3116 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT); 3117 3118 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X); 3119 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32); 3120 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst); 3121 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt); 3122 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags); 3123 3124 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags); 3125 3126 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags); 3127 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags); 3128 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags); 3129 } 3130 3131 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags); 3132 3133 // It is unsafe to contract this fsub into the PH multiply. 3134 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract); 3135 3136 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags); 3137 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E); 3138 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags); 3139 3140 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags); 3141 3142 SDValue UnderflowCheckConst = 3143 DAG.getConstantFP(IsExp10 ? 
-0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT); 3144 3145 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 3146 SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 3147 SDValue Underflow = 3148 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT); 3149 3150 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R); 3151 const auto &Options = getTargetMachine().Options; 3152 3153 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) { 3154 SDValue OverflowCheckConst = 3155 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT); 3156 SDValue Overflow = 3157 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT); 3158 SDValue Inf = 3159 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT); 3160 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R); 3161 } 3162 3163 return R; 3164 } 3165 3166 static bool isCtlzOpc(unsigned Opc) { 3167 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; 3168 } 3169 3170 static bool isCttzOpc(unsigned Opc) { 3171 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; 3172 } 3173 3174 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op, 3175 SelectionDAG &DAG) const { 3176 auto SL = SDLoc(Op); 3177 auto Opc = Op.getOpcode(); 3178 auto Arg = Op.getOperand(0u); 3179 auto ResultVT = Op.getValueType(); 3180 3181 if (ResultVT != MVT::i8 && ResultVT != MVT::i16) 3182 return {}; 3183 3184 assert(isCtlzOpc(Opc)); 3185 assert(ResultVT == Arg.getValueType()); 3186 3187 const uint64_t NumBits = ResultVT.getFixedSizeInBits(); 3188 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32); 3189 SDValue NewOp; 3190 3191 if (Opc == ISD::CTLZ_ZERO_UNDEF) { 3192 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg); 3193 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits); 3194 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp); 3195 } else { 3196 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg); 3197 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp); 3198 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits); 3199 } 3200 3201 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp); 3202 } 3203 3204 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { 3205 SDLoc SL(Op); 3206 SDValue Src = Op.getOperand(0); 3207 3208 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode())); 3209 bool Ctlz = isCtlzOpc(Op.getOpcode()); 3210 unsigned NewOpc = Ctlz ? 
      AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;

  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
                   Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;

  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
    // (ctlz hi:lo) -> (umin (ffbh src), 32)
    // (cttz hi:lo) -> (umin (ffbl src), 32)
    // (ctlz_zero_undef src) -> (ffbh src)
    // (cttz_zero_undef src) -> (ffbl src)

    // The 64-bit scalar version produces a 32-bit result:
    // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
    // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
    // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
    // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
    SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
    if (!ZeroUndef) {
      const SDValue ConstVal = DAG.getConstant(
          Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
    }
    return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
  }

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);

  SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
  SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);

  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))

  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
  if (Ctlz)
    OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
  else
    OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);

  SDValue NewOpr;
  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
  if (!ZeroUndef) {
    const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
    NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
  }

  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
}

SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  // The regular method of converting a 64-bit integer to float roughly
  // consists of 2 steps: normalization and rounding. In fact, after
  // normalization, the conversion from a 64-bit integer to a float is
  // essentially the same as the one from a 32-bit integer. The only
  // difference is that it has more trailing bits to be rounded. To leverage
  // the native 32-bit conversion, a 64-bit integer could be preprocessed and
  // fit into a 32-bit integer then converted into the correct float number.
  // The basic steps for the unsigned conversion are illustrated in the
  // following pseudo code:
  //
  // f32 uitofp(i64 u) {
  //   i32 hi, lo = split(u);
  //   // Only count the leading zeros in hi as we have native support of the
  //   // conversion from i32 to f32. If hi is all 0s, the conversion is
  //   // reduced to a 32-bit one automatically.
  //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
  //   u <<= shamt;
  //   hi, lo = split(u);
  //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
  //   // convert it as a 32-bit integer and scale the result back.
  //   return uitofp(hi) * 2^(32 - shamt);
  // }
  //
  // The signed one follows the same principle but uses 'ffbh_i32' to count
  // its sign bits instead. If 'ffbh_i32' is not available, its absolute value
  // is converted instead, followed by negation based on its sign bit.

  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
  SDValue Sign;
  SDValue ShAmt;
  if (Signed && Subtarget->isGCN()) {
    // We also need to consider the sign bit in Lo if Hi has just sign bits,
    // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
    // account. That is, the maximal shift is
    // - 32 if Lo and Hi have opposite signs;
    // - 33 if Lo and Hi have the same sign.
    //
    // Or, MaxShAmt = 33 + OppositeSign, where
    //
    // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
    // - -1 if Lo and Hi have opposite signs; and
    // - 0 otherwise.
    //
    // All in all, ShAmt is calculated as
    //
    //   umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
    //
    // or
    //
    //   umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
    //
    // to reduce the critical path.
    SDValue OppositeSign = DAG.getNode(
        ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
        DAG.getConstant(31, SL, MVT::i32));
    SDValue MaxShAmt =
        DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                    OppositeSign);
    // Count the leading sign bits.
    ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
    // Different from unsigned conversion, the shift should be one bit less to
    // preserve the sign bit.
    ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
                        DAG.getConstant(1, SL, MVT::i32));
    ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
  } else {
    if (Signed) {
      // Without 'ffbh_i32', only leading zeros could be counted. Take the
      // absolute value first.
      Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
                         DAG.getConstant(63, SL, MVT::i64));
      SDValue Abs =
          DAG.getNode(ISD::XOR, SL, MVT::i64,
                      DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
      std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
    }
    // Count the leading zeros.
    ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
    // The shift amount for signed integers is [0, 32].
  }
  // Normalize the given 64-bit integer.
  SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
  // Split it again.
  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
  // Calculate the adjust bit for rounding.
  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
                               DAG.getConstant(1, SL, MVT::i32), Lo);
  // Get the 32-bit normalized integer.
  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
  // Convert the normalized 32-bit integer into f32.
  unsigned Opc =
      (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);

  // Finally, need to scale back the converted floating number as the original
  // 64-bit integer is converted as a 32-bit one.
  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                      ShAmt);
  // On GCN, use LDEXP directly.
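  // (For example, for u = 1 << 40: shamt = clz(0x100) = 23, Norm becomes
  // 0x80000000 -> 2^31 as f32, and ldexp(2^31, 32 - 23) = 2^40 as expected.)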
3369 if (Subtarget->isGCN()) 3370 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt); 3371 3372 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent 3373 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit 3374 // exponent is enough to avoid overflowing into the sign bit. 3375 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt, 3376 DAG.getConstant(23, SL, MVT::i32)); 3377 SDValue IVal = 3378 DAG.getNode(ISD::ADD, SL, MVT::i32, 3379 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp); 3380 if (Signed) { 3381 // Set the sign bit. 3382 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32, 3383 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign), 3384 DAG.getConstant(31, SL, MVT::i32)); 3385 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign); 3386 } 3387 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal); 3388 } 3389 3390 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, 3391 bool Signed) const { 3392 SDLoc SL(Op); 3393 SDValue Src = Op.getOperand(0); 3394 3395 SDValue Lo, Hi; 3396 std::tie(Lo, Hi) = split64BitValue(Src, DAG); 3397 3398 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, 3399 SL, MVT::f64, Hi); 3400 3401 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); 3402 3403 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi, 3404 DAG.getConstant(32, SL, MVT::i32)); 3405 // TODO: Should this propagate fast-math-flags? 3406 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); 3407 } 3408 3409 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, 3410 SelectionDAG &DAG) const { 3411 // TODO: Factor out code common with LowerSINT_TO_FP. 3412 EVT DestVT = Op.getValueType(); 3413 SDValue Src = Op.getOperand(0); 3414 EVT SrcVT = Src.getValueType(); 3415 3416 if (SrcVT == MVT::i16) { 3417 if (DestVT == MVT::f16) 3418 return Op; 3419 SDLoc DL(Op); 3420 3421 // Promote src to i32 3422 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); 3423 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext); 3424 } 3425 3426 if (DestVT == MVT::bf16) { 3427 SDLoc SL(Op); 3428 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src); 3429 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); 3430 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); 3431 } 3432 3433 if (SrcVT != MVT::i64) 3434 return Op; 3435 3436 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { 3437 SDLoc DL(Op); 3438 3439 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); 3440 SDValue FPRoundFlag = 3441 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true); 3442 SDValue FPRound = 3443 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); 3444 3445 return FPRound; 3446 } 3447 3448 if (DestVT == MVT::f32) 3449 return LowerINT_TO_FP32(Op, DAG, false); 3450 3451 assert(DestVT == MVT::f64); 3452 return LowerINT_TO_FP64(Op, DAG, false); 3453 } 3454 3455 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, 3456 SelectionDAG &DAG) const { 3457 EVT DestVT = Op.getValueType(); 3458 3459 SDValue Src = Op.getOperand(0); 3460 EVT SrcVT = Src.getValueType(); 3461 3462 if (SrcVT == MVT::i16) { 3463 if (DestVT == MVT::f16) 3464 return Op; 3465 3466 SDLoc DL(Op); 3467 // Promote src to i32 3468 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src); 3469 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext); 3470 } 3471 3472 if (DestVT == MVT::bf16) { 3473 SDLoc SL(Op); 3474 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, 
                              Src);
    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
  }

  if (SrcVT != MVT::i64)
    return Op;

  // TODO: Factor out code common with LowerUINT_TO_FP.

  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    SDLoc DL(Op);
    SDValue Src = Op.getOperand(0);

    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    SDValue FPRoundFlag =
        DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
    SDValue FPRound =
        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);

    return FPRound;
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, true);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, true);
}

SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //   tf := trunc(val);
  //   hif := floor(tf * 2^-32);
  //   lof := tf - hif * 2^32; // lof is always positive due to floor.
  //   hi := fptoi(hif);
  //   lo := fptoi(lof);
  //
  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
  SDValue Sign;
  if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
                       DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
                       DAG.getConstant(31, SL, MVT::i32));
    Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
  }

  SDValue K0, K1;
  if (SrcVT == MVT::f64) {
    K0 = DAG.getConstantFP(
        llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
        SrcVT);
    K1 = DAG.getConstantFP(
        llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
        SrcVT);
  } else {
    K0 = DAG.getConstantFP(
        llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
    K1 = DAG.getConstantFP(
        llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
  }
  // TODO: Should this propagate fast-math-flags?
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);

  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);

  SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);

  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
                                                         : ISD::FP_TO_UINT,
                           SL, MVT::i32, FloorMul);
  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);

  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
                               DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));

  if (Signed && SrcVT == MVT::f32) {
    assert(Sign);
    // Flip the result based on the signedness, which is either all 0s or 1s.
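    // This is branchless conditional negation: Sign is either all zeros or
    // all ones, and (r ^ s) - s yields -r when s is all ones and leaves r
    // unchanged when s is zero.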
3568 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64, 3569 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign})); 3570 // r := xor(r, sign) - sign; 3571 Result = 3572 DAG.getNode(ISD::SUB, SL, MVT::i64, 3573 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign); 3574 } 3575 3576 return Result; 3577 } 3578 3579 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { 3580 SDLoc DL(Op); 3581 SDValue N0 = Op.getOperand(0); 3582 3583 // Convert to target node to get known bits 3584 if (N0.getValueType() == MVT::f32) 3585 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); 3586 3587 if (getTargetMachine().Options.UnsafeFPMath) { 3588 // There is a generic expand for FP_TO_FP16 with unsafe fast math. 3589 return SDValue(); 3590 } 3591 3592 return LowerF64ToF16Safe(N0, DL, DAG); 3593 } 3594 3595 // return node in i32 3596 SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, 3597 SelectionDAG &DAG) const { 3598 assert(Src.getSimpleValueType() == MVT::f64); 3599 3600 // f64 -> f16 conversion using round-to-nearest-even rounding mode. 3601 // TODO: We can generate better code for True16. 3602 const unsigned ExpMask = 0x7ff; 3603 const unsigned ExpBiasf64 = 1023; 3604 const unsigned ExpBiasf16 = 15; 3605 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 3606 SDValue One = DAG.getConstant(1, DL, MVT::i32); 3607 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src); 3608 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U, 3609 DAG.getConstant(32, DL, MVT::i64)); 3610 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32); 3611 U = DAG.getZExtOrTrunc(U, DL, MVT::i32); 3612 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3613 DAG.getConstant(20, DL, MVT::i64)); 3614 E = DAG.getNode(ISD::AND, DL, MVT::i32, E, 3615 DAG.getConstant(ExpMask, DL, MVT::i32)); 3616 // Subtract the fp64 exponent bias (1023) to get the real exponent and 3617 // add the f16 bias (15) to get the biased exponent for the f16 format. 3618 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E, 3619 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32)); 3620 3621 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3622 DAG.getConstant(8, DL, MVT::i32)); 3623 M = DAG.getNode(ISD::AND, DL, MVT::i32, M, 3624 DAG.getConstant(0xffe, DL, MVT::i32)); 3625 3626 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH, 3627 DAG.getConstant(0x1ff, DL, MVT::i32)); 3628 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U); 3629 3630 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ); 3631 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set); 3632 3633 // (M != 0 ? 
0x0200 : 0) | 0x7c00; 3634 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32, 3635 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32), 3636 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32)); 3637 3638 // N = M | (E << 12); 3639 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M, 3640 DAG.getNode(ISD::SHL, DL, MVT::i32, E, 3641 DAG.getConstant(12, DL, MVT::i32))); 3642 3643 // B = clamp(1-E, 0, 13); 3644 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, 3645 One, E); 3646 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero); 3647 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B, 3648 DAG.getConstant(13, DL, MVT::i32)); 3649 3650 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M, 3651 DAG.getConstant(0x1000, DL, MVT::i32)); 3652 3653 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B); 3654 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B); 3655 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE); 3656 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1); 3657 3658 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT); 3659 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V, 3660 DAG.getConstant(0x7, DL, MVT::i32)); 3661 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V, 3662 DAG.getConstant(2, DL, MVT::i32)); 3663 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32), 3664 One, Zero, ISD::SETEQ); 3665 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32), 3666 One, Zero, ISD::SETGT); 3667 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1); 3668 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1); 3669 3670 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32), 3671 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT); 3672 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32), 3673 I, V, ISD::SETEQ); 3674 3675 // Extract the sign bit. 3676 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3677 DAG.getConstant(16, DL, MVT::i32)); 3678 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign, 3679 DAG.getConstant(0x8000, DL, MVT::i32)); 3680 3681 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); 3682 } 3683 3684 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op, 3685 SelectionDAG &DAG) const { 3686 SDValue Src = Op.getOperand(0); 3687 unsigned OpOpcode = Op.getOpcode(); 3688 EVT SrcVT = Src.getValueType(); 3689 EVT DestVT = Op.getValueType(); 3690 3691 // Will be selected natively 3692 if (SrcVT == MVT::f16 && DestVT == MVT::i16) 3693 return Op; 3694 3695 if (SrcVT == MVT::bf16) { 3696 SDLoc DL(Op); 3697 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); 3698 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc); 3699 } 3700 3701 // Promote i16 to i32 3702 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { 3703 SDLoc DL(Op); 3704 3705 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src); 3706 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32); 3707 } 3708 3709 if (DestVT != MVT::i64) 3710 return Op; 3711 3712 if (SrcVT == MVT::f16 || 3713 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) { 3714 SDLoc DL(Op); 3715 3716 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src); 3717 unsigned Ext = 3718 OpOpcode == ISD::FP_TO_SINT ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3719 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32); 3720 } 3721 3722 if (SrcVT == MVT::f32 || SrcVT == MVT::f64) 3723 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT); 3724 3725 return SDValue(); 3726 } 3727 3728 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 3729 SelectionDAG &DAG) const { 3730 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 3731 MVT VT = Op.getSimpleValueType(); 3732 MVT ScalarVT = VT.getScalarType(); 3733 3734 assert(VT.isVector()); 3735 3736 SDValue Src = Op.getOperand(0); 3737 SDLoc DL(Op); 3738 3739 // TODO: Don't scalarize on Evergreen? 3740 unsigned NElts = VT.getVectorNumElements(); 3741 SmallVector<SDValue, 8> Args; 3742 DAG.ExtractVectorElements(Src, Args, 0, NElts); 3743 3744 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); 3745 for (unsigned I = 0; I < NElts; ++I) 3746 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); 3747 3748 return DAG.getBuildVector(VT, DL, Args); 3749 } 3750 3751 //===----------------------------------------------------------------------===// 3752 // Custom DAG optimizations 3753 //===----------------------------------------------------------------------===// 3754 3755 static bool isU24(SDValue Op, SelectionDAG &DAG) { 3756 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24; 3757 } 3758 3759 static bool isI24(SDValue Op, SelectionDAG &DAG) { 3760 EVT VT = Op.getValueType(); 3761 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated 3762 // as unsigned 24-bit values. 3763 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24; 3764 } 3765 3766 static SDValue simplifyMul24(SDNode *Node24, 3767 TargetLowering::DAGCombinerInfo &DCI) { 3768 SelectionDAG &DAG = DCI.DAG; 3769 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 3770 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; 3771 3772 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); 3773 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); 3774 unsigned NewOpcode = Node24->getOpcode(); 3775 if (IsIntrin) { 3776 unsigned IID = Node24->getConstantOperandVal(0); 3777 switch (IID) { 3778 case Intrinsic::amdgcn_mul_i24: 3779 NewOpcode = AMDGPUISD::MUL_I24; 3780 break; 3781 case Intrinsic::amdgcn_mul_u24: 3782 NewOpcode = AMDGPUISD::MUL_U24; 3783 break; 3784 case Intrinsic::amdgcn_mulhi_i24: 3785 NewOpcode = AMDGPUISD::MULHI_I24; 3786 break; 3787 case Intrinsic::amdgcn_mulhi_u24: 3788 NewOpcode = AMDGPUISD::MULHI_U24; 3789 break; 3790 default: 3791 llvm_unreachable("Expected 24-bit mul intrinsic"); 3792 } 3793 } 3794 3795 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); 3796 3797 // First try to simplify using SimplifyMultipleUseDemandedBits which allows 3798 // the operands to have other uses, but will only perform simplifications that 3799 // involve bypassing some nodes for this user. 3800 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG); 3801 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG); 3802 if (DemandedLHS || DemandedRHS) 3803 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), 3804 DemandedLHS ? DemandedLHS : LHS, 3805 DemandedRHS ? DemandedRHS : RHS); 3806 3807 // Now try SimplifyDemandedBits which can simplify the nodes used by our 3808 // operands if this node is the only user. 
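  // Note: when SimplifyDemandedBits reports a change it has already rewritten
  // the operand in place, so returning SDValue(Node24, 0) below only tells
  // the combiner to revisit this node rather than replace it with a new one.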
3809 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI)) 3810 return SDValue(Node24, 0); 3811 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI)) 3812 return SDValue(Node24, 0); 3813 3814 return SDValue(); 3815 } 3816 3817 template <typename IntTy> 3818 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, 3819 uint32_t Width, const SDLoc &DL) { 3820 if (Width + Offset < 32) { 3821 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); 3822 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); 3823 if constexpr (std::is_signed_v<IntTy>) { 3824 return DAG.getSignedConstant(Result, DL, MVT::i32); 3825 } else { 3826 return DAG.getConstant(Result, DL, MVT::i32); 3827 } 3828 } 3829 3830 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); 3831 } 3832 3833 static bool hasVolatileUser(SDNode *Val) { 3834 for (SDNode *U : Val->users()) { 3835 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) { 3836 if (M->isVolatile()) 3837 return true; 3838 } 3839 } 3840 3841 return false; 3842 } 3843 3844 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { 3845 // i32 vectors are the canonical memory type. 3846 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT)) 3847 return false; 3848 3849 if (!VT.isByteSized()) 3850 return false; 3851 3852 unsigned Size = VT.getStoreSize(); 3853 3854 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector()) 3855 return false; 3856 3857 if (Size == 3 || (Size > 4 && (Size % 4 != 0))) 3858 return false; 3859 3860 return true; 3861 } 3862 3863 // Replace load of an illegal type with a bitcast from a load of a friendlier 3864 // type. 3865 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, 3866 DAGCombinerInfo &DCI) const { 3867 if (!DCI.isBeforeLegalize()) 3868 return SDValue(); 3869 3870 LoadSDNode *LN = cast<LoadSDNode>(N); 3871 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN)) 3872 return SDValue(); 3873 3874 SDLoc SL(N); 3875 SelectionDAG &DAG = DCI.DAG; 3876 EVT VT = LN->getMemoryVT(); 3877 3878 unsigned Size = VT.getStoreSize(); 3879 Align Alignment = LN->getAlign(); 3880 if (Alignment < Size && isTypeLegal(VT)) { 3881 unsigned IsFast; 3882 unsigned AS = LN->getAddressSpace(); 3883 3884 // Expand unaligned loads earlier than legalization. Due to visitation order 3885 // problems during legalization, the emitted instructions to pack and unpack 3886 // the bytes again are not eliminated in the case of an unaligned copy. 3887 if (!allowsMisalignedMemoryAccesses( 3888 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) { 3889 if (VT.isVector()) 3890 return SplitVectorLoad(SDValue(LN, 0), DAG); 3891 3892 SDValue Ops[2]; 3893 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); 3894 3895 return DAG.getMergeValues(Ops, SDLoc(N)); 3896 } 3897 3898 if (!IsFast) 3899 return SDValue(); 3900 } 3901 3902 if (!shouldCombineMemoryType(VT)) 3903 return SDValue(); 3904 3905 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 3906 3907 SDValue NewLoad 3908 = DAG.getLoad(NewVT, SL, LN->getChain(), 3909 LN->getBasePtr(), LN->getMemOperand()); 3910 3911 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad); 3912 DCI.CombineTo(N, BC, NewLoad.getValue(1)); 3913 return SDValue(N, 0); 3914 } 3915 3916 // Replace store of an illegal type with a store of a bitcast to a friendlier 3917 // type. 
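// For example, a v4i8 store becomes an i32 store of the bitcast value, since
// i32 is the canonical memory type of the same width.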
3918 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, 3919 DAGCombinerInfo &DCI) const { 3920 if (!DCI.isBeforeLegalize()) 3921 return SDValue(); 3922 3923 StoreSDNode *SN = cast<StoreSDNode>(N); 3924 if (!SN->isSimple() || !ISD::isNormalStore(SN)) 3925 return SDValue(); 3926 3927 EVT VT = SN->getMemoryVT(); 3928 unsigned Size = VT.getStoreSize(); 3929 3930 SDLoc SL(N); 3931 SelectionDAG &DAG = DCI.DAG; 3932 Align Alignment = SN->getAlign(); 3933 if (Alignment < Size && isTypeLegal(VT)) { 3934 unsigned IsFast; 3935 unsigned AS = SN->getAddressSpace(); 3936 3937 // Expand unaligned stores earlier than legalization. Due to visitation 3938 // order problems during legalization, the emitted instructions to pack and 3939 // unpack the bytes again are not eliminated in the case of an unaligned 3940 // copy. 3941 if (!allowsMisalignedMemoryAccesses( 3942 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) { 3943 if (VT.isVector()) 3944 return SplitVectorStore(SDValue(SN, 0), DAG); 3945 3946 return expandUnalignedStore(SN, DAG); 3947 } 3948 3949 if (!IsFast) 3950 return SDValue(); 3951 } 3952 3953 if (!shouldCombineMemoryType(VT)) 3954 return SDValue(); 3955 3956 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 3957 SDValue Val = SN->getValue(); 3958 3959 //DCI.AddToWorklist(Val.getNode()); 3960 3961 bool OtherUses = !Val.hasOneUse(); 3962 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val); 3963 if (OtherUses) { 3964 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal); 3965 DAG.ReplaceAllUsesOfValueWith(Val, CastBack); 3966 } 3967 3968 return DAG.getStore(SN->getChain(), SL, CastVal, 3969 SN->getBasePtr(), SN->getMemOperand()); 3970 } 3971 3972 // FIXME: This should go in generic DAG combiner with an isTruncateFree check, 3973 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU 3974 // issues. 3975 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, 3976 DAGCombinerInfo &DCI) const { 3977 SelectionDAG &DAG = DCI.DAG; 3978 SDValue N0 = N->getOperand(0); 3979 3980 // (vt2 (assertzext (truncate vt0:x), vt1)) -> 3981 // (vt2 (truncate (assertzext vt0:x, vt1))) 3982 if (N0.getOpcode() == ISD::TRUNCATE) { 3983 SDValue N1 = N->getOperand(1); 3984 EVT ExtVT = cast<VTSDNode>(N1)->getVT(); 3985 SDLoc SL(N); 3986 3987 SDValue Src = N0.getOperand(0); 3988 EVT SrcVT = Src.getValueType(); 3989 if (SrcVT.bitsGE(ExtVT)) { 3990 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); 3991 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); 3992 } 3993 } 3994 3995 return SDValue(); 3996 } 3997 3998 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( 3999 SDNode *N, DAGCombinerInfo &DCI) const { 4000 unsigned IID = N->getConstantOperandVal(0); 4001 switch (IID) { 4002 case Intrinsic::amdgcn_mul_i24: 4003 case Intrinsic::amdgcn_mul_u24: 4004 case Intrinsic::amdgcn_mulhi_i24: 4005 case Intrinsic::amdgcn_mulhi_u24: 4006 return simplifyMul24(N, DCI); 4007 case Intrinsic::amdgcn_fract: 4008 case Intrinsic::amdgcn_rsq: 4009 case Intrinsic::amdgcn_rcp_legacy: 4010 case Intrinsic::amdgcn_rsq_legacy: 4011 case Intrinsic::amdgcn_rsq_clamp: 4012 case Intrinsic::amdgcn_tanh: { 4013 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted 4014 SDValue Src = N->getOperand(1); 4015 return Src.isUndef() ? 
                          Src : SDValue();
  }
  case Intrinsic::amdgcn_frexp_exp: {
    // frexp_exp (fneg x) -> frexp_exp x
    // frexp_exp (fabs x) -> frexp_exp x
    // frexp_exp (fneg (fabs x)) -> frexp_exp x
    SDValue Src = N->getOperand(1);
    SDValue PeekSign = peekFPSignOps(Src);
    if (PeekSign == Src)
      return SDValue();
    return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
                   0);
  }
  default:
    return SDValue();
  }
}

/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
    DAGCombinerInfo &DCI, const SDLoc &SL,
    unsigned Opc, SDValue LHS,
    uint32_t ValLo, uint32_t ValHi) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);

  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);

  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);

  // Re-visit the ands. It's possible we eliminated one of them and it could
  // simplify the vector.
  DCI.AddToWorklist(Lo.getNode());
  DCI.AddToWorklist(Hi.getNode());

  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}

SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  unsigned RHSVal;
  if (CRHS) {
    RHSVal = CRHS->getZExtValue();
    if (!RHSVal)
      return LHS;

    switch (LHS->getOpcode()) {
    default:
      break;
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND:
    case ISD::ANY_EXTEND: {
      SDValue X = LHS->getOperand(0);

      if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
          isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
        // Prefer build_vector as the canonical form if packed types are legal.
        // (shl ([asz]ext i16:x), 16) -> (build_vector 0, x)
        SDValue Vec = DAG.getBuildVector(
            MVT::v2i16, SL,
            {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
        return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
      }

      // shl (ext x) => zext (shl x), if shift does not overflow int
      if (VT != MVT::i64)
        break;
      KnownBits Known = DAG.computeKnownBits(X);
      unsigned LZ = Known.countMinLeadingZeros();
      if (LZ < RHSVal)
        break;
      EVT XVT = X.getValueType();
      SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
      return DAG.getZExtOrTrunc(Shl, SL, VT);
    }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
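  // E.g. (shl i64:x, 40) becomes a bitcast of (build_vector 0,
  // (shl (i32 (trunc x)), 8)): the low word is known zero and only a 32-bit
  // shift of the low half remains.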
  KnownBits Known = DAG.computeKnownBits(RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
  EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
                                 : TargetScalarType;

  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();
  SDValue ShiftAmt;

  if (CRHS) {
    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                               TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
    const SDValue ShiftMask =
        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
  }

  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
  SDValue NewShift =
      DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());

  const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
  SDValue Vec;

  if (VT.isVector()) {
    EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I + 1] = HiOps[I];
    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
  } else {
    EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
    Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
  }
  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
}

SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // For C >= 32
  // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
  EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
                                 : TargetScalarType;

  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftFullAmt =
      DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
  SDValue ShiftAmt;
  if (CRHS) {
    unsigned RHSVal = CRHS->getZExtValue();
    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                               TargetType);
  } else if (Known.getMinValue().getZExtValue() ==
             (ElementType.getSizeInBits() - 1)) {
    ShiftAmt = ShiftFullAmt;
  } else {
    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
    const SDValue ShiftMask =
        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
  }

  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
    for (unsigned I = 0; I != NElts; ++I) {
      HiOps[I] = HiAndLoOps[2 * I + 1];
    }
    Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
  } else {
    const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
    ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
    Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
  }

  KnownBits KnownLHS = DAG.computeKnownBits(LHS);
  SDValue HiShift;
  if (KnownLHS.isNegative()) {
    HiShift = DAG.getAllOnesConstant(SL, TargetType);
  } else {
    Hi = DAG.getFreeze(Hi);
    HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
  }
  SDValue NewShift =
      DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());

  SDValue Vec;
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 8> LoOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);

    DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
    DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
    for (unsigned I = 0; I != NElts; ++I) {
      HiAndLoOps[2 * I + 1] = HiOps[I];
      HiAndLoOps[2 * I] = LoOps[I];
    }
    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
  }
  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
}

SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned RHSVal;

  if (CRHS) {
    RHSVal = CRHS->getZExtValue();

    // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
    // This improves the ability to match BFE patterns in isel.
    if (LHS.getOpcode() == ISD::AND) {
      if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
        unsigned MaskIdx, MaskLen;
        if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
            MaskIdx == RHSVal) {
          return DAG.getNode(ISD::AND, SL, VT,
                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
                                         N->getOperand(1)),
                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
                                         N->getOperand(1)));
        }
      }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // For C >= 32
  // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
  EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
                                 : TargetScalarType;

  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftAmt;
  if (CRHS) {
    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                               TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
    const SDValue ShiftMask =
        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
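    // (The 32-bit shift instructions only honor the low five bits of the
    // shift amount anyway, so the mask matches what the hardware does.)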
4320 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask); 4321 } 4322 4323 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType); 4324 EVT ConcatType; 4325 SDValue Hi; 4326 SDLoc LHSSL(LHS); 4327 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi 4328 if (VT.isVector()) { 4329 unsigned NElts = TargetType.getVectorNumElements(); 4330 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext()); 4331 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS); 4332 SmallVector<SDValue, 8> HiOps(NElts); 4333 SmallVector<SDValue, 16> HiAndLoOps; 4334 4335 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2); 4336 for (unsigned I = 0; I != NElts; ++I) 4337 HiOps[I] = HiAndLoOps[2 * I + 1]; 4338 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps); 4339 } else { 4340 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType); 4341 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2); 4342 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS); 4343 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One); 4344 } 4345 4346 SDValue NewShift = 4347 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags()); 4348 4349 SDValue Vec; 4350 if (VT.isVector()) { 4351 unsigned NElts = TargetType.getVectorNumElements(); 4352 SmallVector<SDValue, 8> LoOps; 4353 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero); 4354 4355 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts); 4356 for (unsigned I = 0; I != NElts; ++I) 4357 HiAndLoOps[2 * I] = LoOps[I]; 4358 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps); 4359 } else { 4360 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero}); 4361 } 4362 return DAG.getNode(ISD::BITCAST, SL, VT, Vec); 4363 } 4364 4365 SDValue AMDGPUTargetLowering::performTruncateCombine( 4366 SDNode *N, DAGCombinerInfo &DCI) const { 4367 SDLoc SL(N); 4368 SelectionDAG &DAG = DCI.DAG; 4369 EVT VT = N->getValueType(0); 4370 SDValue Src = N->getOperand(0); 4371 4372 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x) 4373 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) { 4374 SDValue Vec = Src.getOperand(0); 4375 if (Vec.getOpcode() == ISD::BUILD_VECTOR) { 4376 SDValue Elt0 = Vec.getOperand(0); 4377 EVT EltVT = Elt0.getValueType(); 4378 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) { 4379 if (EltVT.isFloatingPoint()) { 4380 Elt0 = DAG.getNode(ISD::BITCAST, SL, 4381 EltVT.changeTypeToInteger(), Elt0); 4382 } 4383 4384 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0); 4385 } 4386 } 4387 } 4388 4389 // Equivalent of above for accessing the high element of a vector as an 4390 // integer operation. 
4391 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y) 4392 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) { 4393 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) { 4394 SDValue BV = stripBitcast(Src.getOperand(0)); 4395 if (BV.getOpcode() == ISD::BUILD_VECTOR) { 4396 EVT SrcEltVT = BV.getOperand(0).getValueType(); 4397 unsigned SrcEltSize = SrcEltVT.getSizeInBits(); 4398 unsigned BitIndex = K->getZExtValue(); 4399 unsigned PartIndex = BitIndex / SrcEltSize; 4400 4401 if (PartIndex * SrcEltSize == BitIndex && 4402 PartIndex < BV.getNumOperands()) { 4403 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) { 4404 SDValue SrcElt = 4405 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(), 4406 BV.getOperand(PartIndex)); 4407 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); 4408 } 4409 } 4410 } 4411 } 4412 } 4413 4414 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit. 4415 // 4416 // i16 (trunc (srl i64:x, K)), K <= 16 -> 4417 // i16 (trunc (srl (i32 (trunc x), K))) 4418 if (VT.getScalarSizeInBits() < 32) { 4419 EVT SrcVT = Src.getValueType(); 4420 if (SrcVT.getScalarSizeInBits() > 32 && 4421 (Src.getOpcode() == ISD::SRL || 4422 Src.getOpcode() == ISD::SRA || 4423 Src.getOpcode() == ISD::SHL)) { 4424 SDValue Amt = Src.getOperand(1); 4425 KnownBits Known = DAG.computeKnownBits(Amt); 4426 4427 // - For left shifts, do the transform as long as the shift 4428 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31) 4429 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid 4430 // losing information stored in the high bits when truncating. 4431 const unsigned MaxCstSize = 4432 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits()); 4433 if (Known.getMaxValue().ule(MaxCstSize)) { 4434 EVT MidVT = VT.isVector() ? 4435 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4436 VT.getVectorNumElements()) : MVT::i32; 4437 4438 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout()); 4439 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT, 4440 Src.getOperand(0)); 4441 DCI.AddToWorklist(Trunc.getNode()); 4442 4443 if (Amt.getValueType() != NewShiftVT) { 4444 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT); 4445 DCI.AddToWorklist(Amt.getNode()); 4446 } 4447 4448 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT, 4449 Trunc, Amt); 4450 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift); 4451 } 4452 } 4453 } 4454 4455 return SDValue(); 4456 } 4457 4458 // We need to specifically handle i64 mul here to avoid unnecessary conversion 4459 // instructions. If we only match on the legalized i64 mul expansion, 4460 // SimplifyDemandedBits will be unable to remove them because there will be 4461 // multiple uses due to the separate mul + mulh[su]. 4462 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, 4463 SDValue N0, SDValue N1, unsigned Size, bool Signed) { 4464 if (Size <= 32) { 4465 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 4466 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1); 4467 } 4468 4469 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 4470 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24; 4471 4472 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1); 4473 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1); 4474 4475 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi); 4476 } 4477 4478 /// If \p V is an add of a constant 1, returns the other operand. Otherwise 4479 /// return SDValue(). 
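/// E.g. given (add x, 1) this returns x; for any other opcode or addend it
/// returns the empty SDValue.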
4480 static SDValue getAddOneOp(const SDNode *V) { 4481 if (V->getOpcode() != ISD::ADD) 4482 return SDValue(); 4483 4484 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue(); 4485 } 4486 4487 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, 4488 DAGCombinerInfo &DCI) const { 4489 assert(N->getOpcode() == ISD::MUL); 4490 EVT VT = N->getValueType(0); 4491 4492 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4493 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4494 // unnecessarily). isDivergent() is used as an approximation of whether the 4495 // value is in an SGPR. 4496 if (!N->isDivergent()) 4497 return SDValue(); 4498 4499 unsigned Size = VT.getSizeInBits(); 4500 if (VT.isVector() || Size > 64) 4501 return SDValue(); 4502 4503 SelectionDAG &DAG = DCI.DAG; 4504 SDLoc DL(N); 4505 4506 SDValue N0 = N->getOperand(0); 4507 SDValue N1 = N->getOperand(1); 4508 4509 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad 4510 // matching. 4511 4512 // mul x, (add y, 1) -> add (mul x, y), x 4513 auto IsFoldableAdd = [](SDValue V) -> SDValue { 4514 SDValue AddOp = getAddOneOp(V.getNode()); 4515 if (!AddOp) 4516 return SDValue(); 4517 4518 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool { 4519 return U->getOpcode() == ISD::MUL; 4520 })) 4521 return AddOp; 4522 4523 return SDValue(); 4524 }; 4525 4526 // FIXME: The selection pattern is not properly checking for commuted 4527 // operands, so we have to place the mul in the LHS 4528 if (SDValue MulOper = IsFoldableAdd(N0)) { 4529 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper); 4530 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1); 4531 } 4532 4533 if (SDValue MulOper = IsFoldableAdd(N1)) { 4534 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper); 4535 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0); 4536 } 4537 4538 // There are i16 integer mul/mad. 4539 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) 4540 return SDValue(); 4541 4542 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends 4543 // in the source into any_extends if the result of the mul is truncated. Since 4544 // we can assume the high bits are whatever we want, use the underlying value 4545 // to avoid the unknown high bits from interfering. 4546 if (N0.getOpcode() == ISD::ANY_EXTEND) 4547 N0 = N0.getOperand(0); 4548 4549 if (N1.getOpcode() == ISD::ANY_EXTEND) 4550 N1 = N1.getOperand(0); 4551 4552 SDValue Mul; 4553 4554 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 4555 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4556 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4557 Mul = getMul24(DAG, DL, N0, N1, Size, false); 4558 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 4559 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4560 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4561 Mul = getMul24(DAG, DL, N0, N1, Size, true); 4562 } else { 4563 return SDValue(); 4564 } 4565 4566 // We need to use sext even for MUL_U24, because MUL_U24 is used 4567 // for signed multiply of 8 and 16-bit types. 
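  // (Here the 32-bit or 64-bit mul result is always at least as wide as VT,
  // so getSExtOrTrunc resolves to a truncate or a no-op in practice.)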
4568 return DAG.getSExtOrTrunc(Mul, DL, VT); 4569 } 4570 4571 SDValue 4572 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N, 4573 DAGCombinerInfo &DCI) const { 4574 if (N->getValueType(0) != MVT::i32) 4575 return SDValue(); 4576 4577 SelectionDAG &DAG = DCI.DAG; 4578 SDLoc DL(N); 4579 4580 bool Signed = N->getOpcode() == ISD::SMUL_LOHI; 4581 SDValue N0 = N->getOperand(0); 4582 SDValue N1 = N->getOperand(1); 4583 4584 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends 4585 // in the source into any_extends if the result of the mul is truncated. Since 4586 // we can assume the high bits are whatever we want, use the underlying value 4587 // to avoid the unknown high bits from interfering. 4588 if (N0.getOpcode() == ISD::ANY_EXTEND) 4589 N0 = N0.getOperand(0); 4590 if (N1.getOpcode() == ISD::ANY_EXTEND) 4591 N1 = N1.getOperand(0); 4592 4593 // Try to use two fast 24-bit multiplies (one for each half of the result) 4594 // instead of one slow extending multiply. 4595 unsigned LoOpcode = 0; 4596 unsigned HiOpcode = 0; 4597 if (Signed) { 4598 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 4599 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4600 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4601 LoOpcode = AMDGPUISD::MUL_I24; 4602 HiOpcode = AMDGPUISD::MULHI_I24; 4603 } 4604 } else { 4605 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 4606 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4607 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4608 LoOpcode = AMDGPUISD::MUL_U24; 4609 HiOpcode = AMDGPUISD::MULHI_U24; 4610 } 4611 } 4612 if (!LoOpcode) 4613 return SDValue(); 4614 4615 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1); 4616 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1); 4617 DCI.CombineTo(N, Lo, Hi); 4618 return SDValue(N, 0); 4619 } 4620 4621 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, 4622 DAGCombinerInfo &DCI) const { 4623 EVT VT = N->getValueType(0); 4624 4625 if (!Subtarget->hasMulI24() || VT.isVector()) 4626 return SDValue(); 4627 4628 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4629 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4630 // unnecessarily). isDivergent() is used as an approximation of whether the 4631 // value is in an SGPR. 4632 // This doesn't apply if no s_mul_hi is available (since we'll end up with a 4633 // valu op anyway) 4634 if (Subtarget->hasSMulHi() && !N->isDivergent()) 4635 return SDValue(); 4636 4637 SelectionDAG &DAG = DCI.DAG; 4638 SDLoc DL(N); 4639 4640 SDValue N0 = N->getOperand(0); 4641 SDValue N1 = N->getOperand(1); 4642 4643 if (!isI24(N0, DAG) || !isI24(N1, DAG)) 4644 return SDValue(); 4645 4646 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4647 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4648 4649 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1); 4650 DCI.AddToWorklist(Mulhi.getNode()); 4651 return DAG.getSExtOrTrunc(Mulhi, DL, VT); 4652 } 4653 4654 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, 4655 DAGCombinerInfo &DCI) const { 4656 EVT VT = N->getValueType(0); 4657 4658 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) 4659 return SDValue(); 4660 4661 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4662 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4663 // unnecessarily). isDivergent() is used as an approximation of whether the 4664 // value is in an SGPR. 
4665 // This doesn't apply if no s_mul_hi is available (since we'll end up with a 4666 // valu op anyway) 4667 if (Subtarget->hasSMulHi() && !N->isDivergent()) 4668 return SDValue(); 4669 4670 SelectionDAG &DAG = DCI.DAG; 4671 SDLoc DL(N); 4672 4673 SDValue N0 = N->getOperand(0); 4674 SDValue N1 = N->getOperand(1); 4675 4676 if (!isU24(N0, DAG) || !isU24(N1, DAG)) 4677 return SDValue(); 4678 4679 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4680 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4681 4682 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1); 4683 DCI.AddToWorklist(Mulhi.getNode()); 4684 return DAG.getZExtOrTrunc(Mulhi, DL, VT); 4685 } 4686 4687 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, 4688 SDValue Op, 4689 const SDLoc &DL, 4690 unsigned Opc) const { 4691 EVT VT = Op.getValueType(); 4692 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); 4693 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && 4694 LegalVT != MVT::i16)) 4695 return SDValue(); 4696 4697 if (VT != MVT::i32) 4698 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); 4699 4700 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op); 4701 if (VT != MVT::i32) 4702 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); 4703 4704 return FFBX; 4705 } 4706 4707 // The native instructions return -1 on 0 input. Optimize out a select that 4708 // produces -1 on 0. 4709 // 4710 // TODO: If zero is not undef, we could also do this if the output is compared 4711 // against the bitwidth. 4712 // 4713 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. 4714 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, 4715 SDValue LHS, SDValue RHS, 4716 DAGCombinerInfo &DCI) const { 4717 if (!isNullConstant(Cond.getOperand(1))) 4718 return SDValue(); 4719 4720 SelectionDAG &DAG = DCI.DAG; 4721 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 4722 SDValue CmpLHS = Cond.getOperand(0); 4723 4724 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x 4725 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x 4726 if (CCOpcode == ISD::SETEQ && 4727 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && 4728 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) { 4729 unsigned Opc = 4730 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; 4731 return getFFBX_U32(DAG, CmpLHS, SL, Opc); 4732 } 4733 4734 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x 4735 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x 4736 if (CCOpcode == ISD::SETNE && 4737 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) && 4738 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) { 4739 unsigned Opc = 4740 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; 4741 4742 return getFFBX_U32(DAG, CmpLHS, SL, Opc); 4743 } 4744 4745 return SDValue(); 4746 } 4747 4748 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, 4749 unsigned Op, 4750 const SDLoc &SL, 4751 SDValue Cond, 4752 SDValue N1, 4753 SDValue N2) { 4754 SelectionDAG &DAG = DCI.DAG; 4755 EVT VT = N1.getValueType(); 4756 4757 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, 4758 N1.getOperand(0), N2.getOperand(0)); 4759 DCI.AddToWorklist(NewSelect.getNode()); 4760 return DAG.getNode(Op, SL, VT, NewSelect); 4761 } 4762 4763 // Pull a free FP operation out of a select so it may fold into uses. 
4764 // 4765 // select c, (fneg x), (fneg y) -> fneg (select c, x, y) 4766 // select c, (fneg x), k -> fneg (select c, x, (fneg k)) 4767 // 4768 // select c, (fabs x), (fabs y) -> fabs (select c, x, y) 4769 // select c, (fabs x), +k -> fabs (select c, x, k) 4770 SDValue 4771 AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, 4772 SDValue N) const { 4773 SelectionDAG &DAG = DCI.DAG; 4774 SDValue Cond = N.getOperand(0); 4775 SDValue LHS = N.getOperand(1); 4776 SDValue RHS = N.getOperand(2); 4777 4778 EVT VT = N.getValueType(); 4779 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || 4780 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { 4781 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) 4782 return SDValue(); 4783 4784 return distributeOpThroughSelect(DCI, LHS.getOpcode(), 4785 SDLoc(N), Cond, LHS, RHS); 4786 } 4787 4788 bool Inv = false; 4789 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) { 4790 std::swap(LHS, RHS); 4791 Inv = true; 4792 } 4793 4794 // TODO: Support vector constants. 4795 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 4796 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS && 4797 !selectSupportsSourceMods(N.getNode())) { 4798 SDLoc SL(N); 4799 // If one side is an fneg/fabs and the other is a constant, we can push the 4800 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. 4801 SDValue NewLHS = LHS.getOperand(0); 4802 SDValue NewRHS = RHS; 4803 4804 // Careful: if the neg can be folded up, don't try to pull it back down. 4805 bool ShouldFoldNeg = true; 4806 4807 if (NewLHS.hasOneUse()) { 4808 unsigned Opc = NewLHS.getOpcode(); 4809 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode())) 4810 ShouldFoldNeg = false; 4811 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) 4812 ShouldFoldNeg = false; 4813 } 4814 4815 if (ShouldFoldNeg) { 4816 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative()) 4817 return SDValue(); 4818 4819 // We're going to be forced to use a source modifier anyway, there's no 4820 // point to pulling the negate out unless we can get a size reduction by 4821 // negating the constant. 4822 // 4823 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know 4824 // about cheaper constants. 
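// Example of the constant-negation cost model used below (see
// getConstantNegateCost): negating -0.0 is Cheaper because the result +0.0
// is an inline immediate, while negating +0.0 is Expensive because -0.0 has
// no inline immediate.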
4825 if (NewLHS.getOpcode() == ISD::FABS && 4826 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper) 4827 return SDValue(); 4828 4829 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) 4830 return SDValue(); 4831 4832 if (LHS.getOpcode() == ISD::FNEG) 4833 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 4834 4835 if (Inv) 4836 std::swap(NewLHS, NewRHS); 4837 4838 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, 4839 Cond, NewLHS, NewRHS); 4840 DCI.AddToWorklist(NewSelect.getNode()); 4841 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); 4842 } 4843 } 4844 4845 return SDValue(); 4846 } 4847 4848 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, 4849 DAGCombinerInfo &DCI) const { 4850 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) 4851 return Folded; 4852 4853 SDValue Cond = N->getOperand(0); 4854 if (Cond.getOpcode() != ISD::SETCC) 4855 return SDValue(); 4856 4857 EVT VT = N->getValueType(0); 4858 SDValue LHS = Cond.getOperand(0); 4859 SDValue RHS = Cond.getOperand(1); 4860 SDValue CC = Cond.getOperand(2); 4861 4862 SDValue True = N->getOperand(1); 4863 SDValue False = N->getOperand(2); 4864 4865 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses. 4866 SelectionDAG &DAG = DCI.DAG; 4867 if (DAG.isConstantValueOfAnyType(True) && 4868 !DAG.isConstantValueOfAnyType(False)) { 4869 // Swap cmp + select pair to move constant to false input. 4870 // This will allow using VOPC cndmasks more often. 4871 // select (setcc x, y), k, x -> select (setccinv x, y), x, k 4872 4873 SDLoc SL(N); 4874 ISD::CondCode NewCC = 4875 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType()); 4876 4877 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC); 4878 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True); 4879 } 4880 4881 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) { 4882 SDValue MinMax 4883 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); 4884 // Revisit this node so we can catch min3/max3/med3 patterns. 4885 //DCI.AddToWorklist(MinMax.getNode()); 4886 return MinMax; 4887 } 4888 } 4889 4890 // There's no reason not to do this if the condition has other uses. 4891 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI); 4892 } 4893 4894 static bool isInv2Pi(const APFloat &APF) { 4895 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118)); 4896 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983)); 4897 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882)); 4898 4899 return APF.bitwiseIsEqual(KF16) || 4900 APF.bitwiseIsEqual(KF32) || 4901 APF.bitwiseIsEqual(KF64); 4902 } 4903 4904 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an 4905 // additional cost to negate them. 4906 TargetLowering::NegatibleCost 4907 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const { 4908 if (C->isZero()) 4909 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive; 4910 4911 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF())) 4912 return C->isNegative() ?
NegatibleCost::Cheaper : NegatibleCost::Expensive; 4913 4914 return NegatibleCost::Neutral; 4915 } 4916 4917 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const { 4918 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) 4919 return getConstantNegateCost(C) == NegatibleCost::Expensive; 4920 return false; 4921 } 4922 4923 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const { 4924 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) 4925 return getConstantNegateCost(C) == NegatibleCost::Cheaper; 4926 return false; 4927 } 4928 4929 static unsigned inverseMinMax(unsigned Opc) { 4930 switch (Opc) { 4931 case ISD::FMAXNUM: 4932 return ISD::FMINNUM; 4933 case ISD::FMINNUM: 4934 return ISD::FMAXNUM; 4935 case ISD::FMAXNUM_IEEE: 4936 return ISD::FMINNUM_IEEE; 4937 case ISD::FMINNUM_IEEE: 4938 return ISD::FMAXNUM_IEEE; 4939 case ISD::FMAXIMUM: 4940 return ISD::FMINIMUM; 4941 case ISD::FMINIMUM: 4942 return ISD::FMAXIMUM; 4943 case ISD::FMAXIMUMNUM: 4944 return ISD::FMINIMUMNUM; 4945 case ISD::FMINIMUMNUM: 4946 return ISD::FMAXIMUMNUM; 4947 case AMDGPUISD::FMAX_LEGACY: 4948 return AMDGPUISD::FMIN_LEGACY; 4949 case AMDGPUISD::FMIN_LEGACY: 4950 return AMDGPUISD::FMAX_LEGACY; 4951 default: 4952 llvm_unreachable("invalid min/max opcode"); 4953 } 4954 } 4955 4956 /// \return true if it's profitable to try to push an fneg into its source 4957 /// instruction. 4958 bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) { 4959 // If the input has multiple uses and we can either fold the negate down, or 4960 // the other uses cannot, give up. This both prevents unprofitable 4961 // transformations and infinite loops: we won't repeatedly try to fold around 4962 // a negate that has no 'good' form. 4963 if (N0.hasOneUse()) { 4964 // This may be able to fold into the source, but at a code size cost. Don't 4965 // fold if the fold into the user is free. 4966 if (allUsesHaveSourceMods(N, 0)) 4967 return false; 4968 } else { 4969 if (fnegFoldsIntoOp(N0.getNode()) && 4970 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode()))) 4971 return false; 4972 } 4973 4974 return true; 4975 } 4976 4977 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, 4978 DAGCombinerInfo &DCI) const { 4979 SelectionDAG &DAG = DCI.DAG; 4980 SDValue N0 = N->getOperand(0); 4981 EVT VT = N->getValueType(0); 4982 4983 unsigned Opc = N0.getOpcode(); 4984 4985 if (!shouldFoldFNegIntoSrc(N, N0)) 4986 return SDValue(); 4987 4988 SDLoc SL(N); 4989 switch (Opc) { 4990 case ISD::FADD: { 4991 if (!mayIgnoreSignedZero(N0)) 4992 return SDValue(); 4993 4994 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) 4995 SDValue LHS = N0.getOperand(0); 4996 SDValue RHS = N0.getOperand(1); 4997 4998 if (LHS.getOpcode() != ISD::FNEG) 4999 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 5000 else 5001 LHS = LHS.getOperand(0); 5002 5003 if (RHS.getOpcode() != ISD::FNEG) 5004 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 5005 else 5006 RHS = RHS.getOperand(0); 5007 5008 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); 5009 if (Res.getOpcode() != ISD::FADD) 5010 return SDValue(); // Op got folded away. 
5011 if (!N0.hasOneUse()) 5012 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 5013 return Res; 5014 } 5015 case ISD::FMUL: 5016 case AMDGPUISD::FMUL_LEGACY: { 5017 // (fneg (fmul x, y)) -> (fmul x, (fneg y)) 5018 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) 5019 SDValue LHS = N0.getOperand(0); 5020 SDValue RHS = N0.getOperand(1); 5021 5022 if (LHS.getOpcode() == ISD::FNEG) 5023 LHS = LHS.getOperand(0); 5024 else if (RHS.getOpcode() == ISD::FNEG) 5025 RHS = RHS.getOperand(0); 5026 else 5027 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 5028 5029 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); 5030 if (Res.getOpcode() != Opc) 5031 return SDValue(); // Op got folded away. 5032 if (!N0.hasOneUse()) 5033 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 5034 return Res; 5035 } 5036 case ISD::FMA: 5037 case ISD::FMAD: { 5038 // TODO: handle llvm.amdgcn.fma.legacy 5039 if (!mayIgnoreSignedZero(N0)) 5040 return SDValue(); 5041 5042 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) 5043 SDValue LHS = N0.getOperand(0); 5044 SDValue MHS = N0.getOperand(1); 5045 SDValue RHS = N0.getOperand(2); 5046 5047 if (LHS.getOpcode() == ISD::FNEG) 5048 LHS = LHS.getOperand(0); 5049 else if (MHS.getOpcode() == ISD::FNEG) 5050 MHS = MHS.getOperand(0); 5051 else 5052 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); 5053 5054 if (RHS.getOpcode() != ISD::FNEG) 5055 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 5056 else 5057 RHS = RHS.getOperand(0); 5058 5059 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); 5060 if (Res.getOpcode() != Opc) 5061 return SDValue(); // Op got folded away. 5062 if (!N0.hasOneUse()) 5063 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 5064 return Res; 5065 } 5066 case ISD::FMAXNUM: 5067 case ISD::FMINNUM: 5068 case ISD::FMAXNUM_IEEE: 5069 case ISD::FMINNUM_IEEE: 5070 case ISD::FMINIMUM: 5071 case ISD::FMAXIMUM: 5072 case ISD::FMINIMUMNUM: 5073 case ISD::FMAXIMUMNUM: 5074 case AMDGPUISD::FMAX_LEGACY: 5075 case AMDGPUISD::FMIN_LEGACY: { 5076 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) 5077 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y) 5078 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y) 5079 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y) 5080 5081 SDValue LHS = N0.getOperand(0); 5082 SDValue RHS = N0.getOperand(1); 5083 5084 // 0 doesn't have a negated inline immediate. 5085 // TODO: This constant check should be generalized to other operations. 5086 if (isConstantCostlierToNegate(RHS)) 5087 return SDValue(); 5088 5089 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 5090 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 5091 unsigned Opposite = inverseMinMax(Opc); 5092 5093 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); 5094 if (Res.getOpcode() != Opposite) 5095 return SDValue(); // Op got folded away. 5096 if (!N0.hasOneUse()) 5097 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 5098 return Res; 5099 } 5100 case AMDGPUISD::FMED3: { 5101 SDValue Ops[3]; 5102 for (unsigned I = 0; I < 3; ++I) 5103 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags()); 5104 5105 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags()); 5106 if (Res.getOpcode() != AMDGPUISD::FMED3) 5107 return SDValue(); // Op got folded away. 
5108 5109 if (!N0.hasOneUse()) { 5110 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res); 5111 DAG.ReplaceAllUsesWith(N0, Neg); 5112 5113 for (SDNode *U : Neg->users()) 5114 DCI.AddToWorklist(U); 5115 } 5116 5117 return Res; 5118 } 5119 case ISD::FP_EXTEND: 5120 case ISD::FTRUNC: 5121 case ISD::FRINT: 5122 case ISD::FNEARBYINT: // XXX - Should fround be handled? 5123 case ISD::FROUNDEVEN: 5124 case ISD::FSIN: 5125 case ISD::FCANONICALIZE: 5126 case AMDGPUISD::RCP: 5127 case AMDGPUISD::RCP_LEGACY: 5128 case AMDGPUISD::RCP_IFLAG: 5129 case AMDGPUISD::SIN_HW: { 5130 SDValue CvtSrc = N0.getOperand(0); 5131 if (CvtSrc.getOpcode() == ISD::FNEG) { 5132 // (fneg (fp_extend (fneg x))) -> (fp_extend x) 5133 // (fneg (rcp (fneg x))) -> (rcp x) 5134 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); 5135 } 5136 5137 if (!N0.hasOneUse()) 5138 return SDValue(); 5139 5140 // (fneg (fp_extend x)) -> (fp_extend (fneg x)) 5141 // (fneg (rcp x)) -> (rcp (fneg x)) 5142 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); 5143 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags()); 5144 } 5145 case ISD::FP_ROUND: { 5146 SDValue CvtSrc = N0.getOperand(0); 5147 5148 if (CvtSrc.getOpcode() == ISD::FNEG) { 5149 // (fneg (fp_round (fneg x))) -> (fp_round x) 5150 return DAG.getNode(ISD::FP_ROUND, SL, VT, 5151 CvtSrc.getOperand(0), N0.getOperand(1)); 5152 } 5153 5154 if (!N0.hasOneUse()) 5155 return SDValue(); 5156 5157 // (fneg (fp_round x)) -> (fp_round (fneg x)) 5158 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); 5159 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); 5160 } 5161 case ISD::FP16_TO_FP: { 5162 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal 5163 // f16, but legalization of f16 fneg ends up pulling it out of the source. 5164 // Put the fneg back as a legal source operation that can be matched later. 5165 SDLoc SL(N); 5166 5167 SDValue Src = N0.getOperand(0); 5168 EVT SrcVT = Src.getValueType(); 5169 5170 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000) 5171 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src, 5172 DAG.getConstant(0x8000, SL, SrcVT)); 5173 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg); 5174 } 5175 case ISD::SELECT: { 5176 // fneg (select c, a, b) -> select c, (fneg a), (fneg b) 5177 // TODO: Invert conditions of foldFreeOpFromSelect 5178 return SDValue(); 5179 } 5180 case ISD::BITCAST: { 5181 SDLoc SL(N); 5182 SDValue BCSrc = N0.getOperand(0); 5183 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) { 5184 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1); 5185 if (HighBits.getValueType().getSizeInBits() != 32 || 5186 !fnegFoldsIntoOp(HighBits.getNode())) 5187 return SDValue(); 5188 5189 // f64 fneg only really needs to operate on the high half of the 5190 // register, so try to force it to an f32 operation to help make use of 5191 // source modifiers.
5192 // 5193 // 5194 // fneg (f64 (bitcast (build_vector x, y))) -> 5195 // f64 (bitcast (build_vector (bitcast i32:x to f32), 5196 // (fneg (bitcast i32:y to f32))) 5197 5198 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits); 5199 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi); 5200 SDValue CastBack = 5201 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi); 5202 5203 SmallVector<SDValue, 8> Ops(BCSrc->ops()); 5204 Ops.back() = CastBack; 5205 DCI.AddToWorklist(NegHi.getNode()); 5206 SDValue Build = 5207 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops); 5208 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build); 5209 5210 if (!N0.hasOneUse()) 5211 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result)); 5212 return Result; 5213 } 5214 5215 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 && 5216 BCSrc.hasOneUse()) { 5217 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) -> 5218 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32) 5219 5220 // TODO: Cast back result for multiple uses is beneficial in some cases. 5221 5222 SDValue LHS = 5223 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1)); 5224 SDValue RHS = 5225 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2)); 5226 5227 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS); 5228 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS); 5229 5230 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS, 5231 NegRHS); 5232 } 5233 5234 return SDValue(); 5235 } 5236 default: 5237 return SDValue(); 5238 } 5239 } 5240 5241 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, 5242 DAGCombinerInfo &DCI) const { 5243 SelectionDAG &DAG = DCI.DAG; 5244 SDValue N0 = N->getOperand(0); 5245 5246 if (!N0.hasOneUse()) 5247 return SDValue(); 5248 5249 switch (N0.getOpcode()) { 5250 case ISD::FP16_TO_FP: { 5251 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); 5252 SDLoc SL(N); 5253 SDValue Src = N0.getOperand(0); 5254 EVT SrcVT = Src.getValueType(); 5255 5256 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff) 5257 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src, 5258 DAG.getConstant(0x7fff, SL, SrcVT)); 5259 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs); 5260 } 5261 default: 5262 return SDValue(); 5263 } 5264 } 5265 5266 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, 5267 DAGCombinerInfo &DCI) const { 5268 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 5269 if (!CFP) 5270 return SDValue(); 5271 5272 // XXX - Should this flush denormals? 5273 const APFloat &Val = CFP->getValueAPF(); 5274 APFloat One(Val.getSemantics(), "1.0"); 5275 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); 5276 } 5277 5278 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, 5279 DAGCombinerInfo &DCI) const { 5280 SelectionDAG &DAG = DCI.DAG; 5281 SDLoc DL(N); 5282 5283 switch(N->getOpcode()) { 5284 default: 5285 break; 5286 case ISD::BITCAST: { 5287 EVT DestVT = N->getValueType(0); 5288 5289 // Push casts through vector builds. This helps avoid emitting a large 5290 // number of copies when materializing floating point vector constants. 
// 5292 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) => 5293 // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) 5294 if (DestVT.isVector()) { 5295 SDValue Src = N->getOperand(0); 5296 if (Src.getOpcode() == ISD::BUILD_VECTOR && 5297 (DCI.getDAGCombineLevel() < AfterLegalizeDAG || 5298 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) { 5299 EVT SrcVT = Src.getValueType(); 5300 unsigned NElts = DestVT.getVectorNumElements(); 5301 5302 if (SrcVT.getVectorNumElements() == NElts) { 5303 EVT DestEltVT = DestVT.getVectorElementType(); 5304 5305 SmallVector<SDValue, 8> CastedElts; 5306 SDLoc SL(N); 5307 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) { 5308 SDValue Elt = Src.getOperand(I); 5309 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt)); 5310 } 5311 5312 return DAG.getBuildVector(DestVT, SL, CastedElts); 5313 } 5314 } 5315 } 5316 5317 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector()) 5318 break; 5319 5320 // Fold bitcasts of constants. 5321 // 5322 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k) 5323 // TODO: Generalize and move to DAGCombiner 5324 SDValue Src = N->getOperand(0); 5325 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) { 5326 SDLoc SL(N); 5327 uint64_t CVal = C->getZExtValue(); 5328 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 5329 DAG.getConstant(Lo_32(CVal), SL, MVT::i32), 5330 DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); 5331 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV); 5332 } 5333 5334 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) { 5335 const APInt &Val = C->getValueAPF().bitcastToAPInt(); 5336 SDLoc SL(N); 5337 uint64_t CVal = Val.getZExtValue(); 5338 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 5339 DAG.getConstant(Lo_32(CVal), SL, MVT::i32), 5340 DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); 5341 5342 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec); 5343 } 5344 5345 break; 5346 } 5347 case ISD::SHL: 5348 case ISD::SRA: 5349 case ISD::SRL: { 5350 // Range metadata can be invalidated when loads are converted to legal types 5351 // (e.g. v2i64 -> v4i32). 5352 // Try to convert vector shl/sra/srl before type legalization so that range 5353 // metadata can be utilized.
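// Illustrative example (hypothetical IR; assumes v2i64 loads are legalized
// to v4i32 on this target):
//   %v = load <2 x i64>, ptr %p, !range !0   ; elements known to be < 2^16
//   %s = shl <2 x i64> %v, <i64 2, i64 2>
// Combining the shift before type legalization lets the !range fact shrink
// the 64-bit shift; once the load has been legalized, that information is
// gone.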
5354 if (!(N->getValueType(0).isVector() && 5355 DCI.getDAGCombineLevel() == BeforeLegalizeTypes) && 5356 DCI.getDAGCombineLevel() < AfterLegalizeDAG) 5357 break; 5358 if (N->getOpcode() == ISD::SHL) 5359 return performShlCombine(N, DCI); 5360 if (N->getOpcode() == ISD::SRA) 5361 return performSraCombine(N, DCI); 5362 return performSrlCombine(N, DCI); 5363 } 5364 case ISD::TRUNCATE: 5365 return performTruncateCombine(N, DCI); 5366 case ISD::MUL: 5367 return performMulCombine(N, DCI); 5368 case AMDGPUISD::MUL_U24: 5369 case AMDGPUISD::MUL_I24: { 5370 if (SDValue Simplified = simplifyMul24(N, DCI)) 5371 return Simplified; 5372 break; 5373 } 5374 case AMDGPUISD::MULHI_I24: 5375 case AMDGPUISD::MULHI_U24: 5376 return simplifyMul24(N, DCI); 5377 case ISD::SMUL_LOHI: 5378 case ISD::UMUL_LOHI: 5379 return performMulLoHiCombine(N, DCI); 5380 case ISD::MULHS: 5381 return performMulhsCombine(N, DCI); 5382 case ISD::MULHU: 5383 return performMulhuCombine(N, DCI); 5384 case ISD::SELECT: 5385 return performSelectCombine(N, DCI); 5386 case ISD::FNEG: 5387 return performFNegCombine(N, DCI); 5388 case ISD::FABS: 5389 return performFAbsCombine(N, DCI); 5390 case AMDGPUISD::BFE_I32: 5391 case AMDGPUISD::BFE_U32: { 5392 assert(!N->getValueType(0).isVector() && 5393 "Vector handling of BFE not implemented"); 5394 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); 5395 if (!Width) 5396 break; 5397 5398 uint32_t WidthVal = Width->getZExtValue() & 0x1f; 5399 if (WidthVal == 0) 5400 return DAG.getConstant(0, DL, MVT::i32); 5401 5402 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 5403 if (!Offset) 5404 break; 5405 5406 SDValue BitsFrom = N->getOperand(0); 5407 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; 5408 5409 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; 5410 5411 if (OffsetVal == 0) { 5412 // This is already sign / zero extended, so try to fold away extra BFEs. 5413 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); 5414 5415 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); 5416 if (OpSignBits >= SignBits) 5417 return BitsFrom; 5418 5419 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); 5420 if (Signed) { 5421 // This is a sign_extend_inreg. Replace it to take advantage of existing 5422 // DAG Combines. If not eliminated, we will match back to BFE during 5423 // selection. 5424 5425 // TODO: The sext_inreg of extended types ends, although we could 5426 // handle them in a single BFE. 5427 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, 5428 DAG.getValueType(SmallVT)); 5429 } 5430 5431 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); 5432 } 5433 5434 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) { 5435 if (Signed) { 5436 return constantFoldBFE<int32_t>(DAG, 5437 CVal->getSExtValue(), 5438 OffsetVal, 5439 WidthVal, 5440 DL); 5441 } 5442 5443 return constantFoldBFE<uint32_t>(DAG, 5444 CVal->getZExtValue(), 5445 OffsetVal, 5446 WidthVal, 5447 DL); 5448 } 5449 5450 if ((OffsetVal + WidthVal) >= 32 && 5451 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { 5452 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); 5453 return DAG.getNode(Signed ?
ISD::SRA : ISD::SRL, DL, MVT::i32, 5454 BitsFrom, ShiftVal); 5455 } 5456 5457 if (BitsFrom.hasOneUse()) { 5458 APInt Demanded = APInt::getBitsSet(32, 5459 OffsetVal, 5460 OffsetVal + WidthVal); 5461 5462 KnownBits Known; 5463 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 5464 !DCI.isBeforeLegalizeOps()); 5465 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5466 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || 5467 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) { 5468 DCI.CommitTargetLoweringOpt(TLO); 5469 } 5470 } 5471 5472 break; 5473 } 5474 case ISD::LOAD: 5475 return performLoadCombine(N, DCI); 5476 case ISD::STORE: 5477 return performStoreCombine(N, DCI); 5478 case AMDGPUISD::RCP: 5479 case AMDGPUISD::RCP_IFLAG: 5480 return performRcpCombine(N, DCI); 5481 case ISD::AssertZext: 5482 case ISD::AssertSext: 5483 return performAssertSZExtCombine(N, DCI); 5484 case ISD::INTRINSIC_WO_CHAIN: 5485 return performIntrinsicWOChainCombine(N, DCI); 5486 case AMDGPUISD::FMAD_FTZ: { 5487 SDValue N0 = N->getOperand(0); 5488 SDValue N1 = N->getOperand(1); 5489 SDValue N2 = N->getOperand(2); 5490 EVT VT = N->getValueType(0); 5491 5492 // FMAD_FTZ is a FMAD + flush denormals to zero. 5493 // We flush the inputs, the intermediate step, and the output. 5494 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); 5495 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); 5496 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); 5497 if (N0CFP && N1CFP && N2CFP) { 5498 const auto FTZ = [](const APFloat &V) { 5499 if (V.isDenormal()) { 5500 APFloat Zero(V.getSemantics(), 0); 5501 return V.isNegative() ? -Zero : Zero; 5502 } 5503 return V; 5504 }; 5505 5506 APFloat V0 = FTZ(N0CFP->getValueAPF()); 5507 APFloat V1 = FTZ(N1CFP->getValueAPF()); 5508 APFloat V2 = FTZ(N2CFP->getValueAPF()); 5509 V0.multiply(V1, APFloat::rmNearestTiesToEven); 5510 V0 = FTZ(V0); 5511 V0.add(V2, APFloat::rmNearestTiesToEven); 5512 return DAG.getConstantFP(FTZ(V0), DL, VT); 5513 } 5514 break; 5515 } 5516 } 5517 return SDValue(); 5518 } 5519 5520 //===----------------------------------------------------------------------===// 5521 // Helper functions 5522 //===----------------------------------------------------------------------===// 5523 5524 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 5525 const TargetRegisterClass *RC, 5526 Register Reg, EVT VT, 5527 const SDLoc &SL, 5528 bool RawReg) const { 5529 MachineFunction &MF = DAG.getMachineFunction(); 5530 MachineRegisterInfo &MRI = MF.getRegInfo(); 5531 Register VReg; 5532 5533 if (!MRI.isLiveIn(Reg)) { 5534 VReg = MRI.createVirtualRegister(RC); 5535 MRI.addLiveIn(Reg, VReg); 5536 } else { 5537 VReg = MRI.getLiveInVirtReg(Reg); 5538 } 5539 5540 if (RawReg) 5541 return DAG.getRegister(VReg, VT); 5542 5543 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); 5544 } 5545 5546 // This may be called multiple times, and nothing prevents creating multiple 5547 // objects at the same offset. See if we already defined this object. 
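// Illustrative usage sketch (hypothetical size/offset values): repeated
// queries at the same offset return one frame index rather than creating
// overlapping fixed objects.
//   int FI0 = getOrCreateFixedStackObject(MFI, 4, /*Offset=*/16);
//   int FI1 = getOrCreateFixedStackObject(MFI, 4, /*Offset=*/16);
//   assert(FI0 == FI1 && "same offset must reuse the existing object");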
5548 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, 5549 int64_t Offset) { 5550 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { 5551 if (MFI.getObjectOffset(I) == Offset) { 5552 assert(MFI.getObjectSize(I) == Size); 5553 return I; 5554 } 5555 } 5556 5557 return MFI.CreateFixedObject(Size, Offset, true); 5558 } 5559 5560 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, 5561 EVT VT, 5562 const SDLoc &SL, 5563 int64_t Offset) const { 5564 MachineFunction &MF = DAG.getMachineFunction(); 5565 MachineFrameInfo &MFI = MF.getFrameInfo(); 5566 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset); 5567 5568 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); 5569 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); 5570 5571 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4), 5572 MachineMemOperand::MODereferenceable | 5573 MachineMemOperand::MOInvariant); 5574 } 5575 5576 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, 5577 const SDLoc &SL, 5578 SDValue Chain, 5579 SDValue ArgVal, 5580 int64_t Offset) const { 5581 MachineFunction &MF = DAG.getMachineFunction(); 5582 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); 5583 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 5584 5585 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); 5586 // Stores to the argument stack area are relative to the stack pointer. 5587 SDValue SP = 5588 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32); 5589 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr); 5590 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4), 5591 MachineMemOperand::MODereferenceable); 5592 return Store; 5593 } 5594 5595 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, 5596 const TargetRegisterClass *RC, 5597 EVT VT, const SDLoc &SL, 5598 const ArgDescriptor &Arg) const { 5599 assert(Arg && "Attempting to load missing argument"); 5600 5601 SDValue V = Arg.isRegister() ? 
5602 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) : 5603 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); 5604 5605 if (!Arg.isMasked()) 5606 return V; 5607 5608 unsigned Mask = Arg.getMask(); 5609 unsigned Shift = llvm::countr_zero<unsigned>(Mask); 5610 V = DAG.getNode(ISD::SRL, SL, VT, V, 5611 DAG.getShiftAmountConstant(Shift, VT, SL)); 5612 return DAG.getNode(ISD::AND, SL, VT, V, 5613 DAG.getConstant(Mask >> Shift, SL, VT)); 5614 } 5615 5616 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( 5617 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const { 5618 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset(); 5619 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr(); 5620 uint64_t ArgOffset = 5621 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset; 5622 switch (Param) { 5623 case FIRST_IMPLICIT: 5624 return ArgOffset; 5625 case PRIVATE_BASE: 5626 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET; 5627 case SHARED_BASE: 5628 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET; 5629 case QUEUE_PTR: 5630 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET; 5631 } 5632 llvm_unreachable("unexpected implicit parameter type"); 5633 } 5634 5635 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( 5636 const MachineFunction &MF, const ImplicitParameter Param) const { 5637 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); 5638 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param); 5639 } 5640 5641 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; 5642 5643 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { 5644 switch ((AMDGPUISD::NodeType)Opcode) { 5645 case AMDGPUISD::FIRST_NUMBER: break; 5646 // AMDIL DAG nodes 5647 NODE_NAME_CASE(BRANCH_COND); 5648 5649 // AMDGPU DAG nodes 5650 NODE_NAME_CASE(IF) 5651 NODE_NAME_CASE(ELSE) 5652 NODE_NAME_CASE(LOOP) 5653 NODE_NAME_CASE(CALL) 5654 NODE_NAME_CASE(TC_RETURN) 5655 NODE_NAME_CASE(TC_RETURN_GFX) 5656 NODE_NAME_CASE(TC_RETURN_CHAIN) 5657 NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR) 5658 NODE_NAME_CASE(TRAP) 5659 NODE_NAME_CASE(RET_GLUE) 5660 NODE_NAME_CASE(WAVE_ADDRESS) 5661 NODE_NAME_CASE(RETURN_TO_EPILOG) 5662 NODE_NAME_CASE(ENDPGM) 5663 NODE_NAME_CASE(ENDPGM_TRAP) 5664 NODE_NAME_CASE(SIMULATED_TRAP) 5665 NODE_NAME_CASE(DWORDADDR) 5666 NODE_NAME_CASE(FRACT) 5667 NODE_NAME_CASE(SETCC) 5668 NODE_NAME_CASE(DENORM_MODE) 5669 NODE_NAME_CASE(FMA_W_CHAIN) 5670 NODE_NAME_CASE(FMUL_W_CHAIN) 5671 NODE_NAME_CASE(CLAMP) 5672 NODE_NAME_CASE(COS_HW) 5673 NODE_NAME_CASE(SIN_HW) 5674 NODE_NAME_CASE(FMAX_LEGACY) 5675 NODE_NAME_CASE(FMIN_LEGACY) 5676 NODE_NAME_CASE(FMAX3) 5677 NODE_NAME_CASE(SMAX3) 5678 NODE_NAME_CASE(UMAX3) 5679 NODE_NAME_CASE(FMIN3) 5680 NODE_NAME_CASE(SMIN3) 5681 NODE_NAME_CASE(UMIN3) 5682 NODE_NAME_CASE(FMED3) 5683 NODE_NAME_CASE(SMED3) 5684 NODE_NAME_CASE(UMED3) 5685 NODE_NAME_CASE(FMAXIMUM3) 5686 NODE_NAME_CASE(FMINIMUM3) 5687 NODE_NAME_CASE(FDOT2) 5688 NODE_NAME_CASE(URECIP) 5689 NODE_NAME_CASE(DIV_SCALE) 5690 NODE_NAME_CASE(DIV_FMAS) 5691 NODE_NAME_CASE(DIV_FIXUP) 5692 NODE_NAME_CASE(FMAD_FTZ) 5693 NODE_NAME_CASE(RCP) 5694 NODE_NAME_CASE(RSQ) 5695 NODE_NAME_CASE(RCP_LEGACY) 5696 NODE_NAME_CASE(RCP_IFLAG) 5697 NODE_NAME_CASE(LOG) 5698 NODE_NAME_CASE(EXP) 5699 NODE_NAME_CASE(FMUL_LEGACY) 5700 NODE_NAME_CASE(RSQ_CLAMP) 5701 NODE_NAME_CASE(FP_CLASS) 5702 NODE_NAME_CASE(DOT4) 5703 NODE_NAME_CASE(CARRY) 5704 NODE_NAME_CASE(BORROW) 5705 NODE_NAME_CASE(BFE_U32) 5706 
NODE_NAME_CASE(BFE_I32) 5707 NODE_NAME_CASE(BFI) 5708 NODE_NAME_CASE(BFM) 5709 NODE_NAME_CASE(FFBH_U32) 5710 NODE_NAME_CASE(FFBH_I32) 5711 NODE_NAME_CASE(FFBL_B32) 5712 NODE_NAME_CASE(MUL_U24) 5713 NODE_NAME_CASE(MUL_I24) 5714 NODE_NAME_CASE(MULHI_U24) 5715 NODE_NAME_CASE(MULHI_I24) 5716 NODE_NAME_CASE(MAD_U24) 5717 NODE_NAME_CASE(MAD_I24) 5718 NODE_NAME_CASE(MAD_I64_I32) 5719 NODE_NAME_CASE(MAD_U64_U32) 5720 NODE_NAME_CASE(PERM) 5721 NODE_NAME_CASE(TEXTURE_FETCH) 5722 NODE_NAME_CASE(R600_EXPORT) 5723 NODE_NAME_CASE(CONST_ADDRESS) 5724 NODE_NAME_CASE(REGISTER_LOAD) 5725 NODE_NAME_CASE(REGISTER_STORE) 5726 NODE_NAME_CASE(CVT_F32_UBYTE0) 5727 NODE_NAME_CASE(CVT_F32_UBYTE1) 5728 NODE_NAME_CASE(CVT_F32_UBYTE2) 5729 NODE_NAME_CASE(CVT_F32_UBYTE3) 5730 NODE_NAME_CASE(CVT_PKRTZ_F16_F32) 5731 NODE_NAME_CASE(CVT_PKNORM_I16_F32) 5732 NODE_NAME_CASE(CVT_PKNORM_U16_F32) 5733 NODE_NAME_CASE(CVT_PK_I16_I32) 5734 NODE_NAME_CASE(CVT_PK_U16_U32) 5735 NODE_NAME_CASE(FP_TO_FP16) 5736 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) 5737 NODE_NAME_CASE(CONST_DATA_PTR) 5738 NODE_NAME_CASE(PC_ADD_REL_OFFSET) 5739 NODE_NAME_CASE(LDS) 5740 NODE_NAME_CASE(DUMMY_CHAIN) 5741 NODE_NAME_CASE(LOAD_D16_HI) 5742 NODE_NAME_CASE(LOAD_D16_LO) 5743 NODE_NAME_CASE(LOAD_D16_HI_I8) 5744 NODE_NAME_CASE(LOAD_D16_HI_U8) 5745 NODE_NAME_CASE(LOAD_D16_LO_I8) 5746 NODE_NAME_CASE(LOAD_D16_LO_U8) 5747 NODE_NAME_CASE(STORE_MSKOR) 5748 NODE_NAME_CASE(TBUFFER_STORE_FORMAT) 5749 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) 5750 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) 5751 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) 5752 NODE_NAME_CASE(DS_ORDERED_COUNT) 5753 NODE_NAME_CASE(ATOMIC_CMP_SWAP) 5754 NODE_NAME_CASE(BUFFER_LOAD) 5755 NODE_NAME_CASE(BUFFER_LOAD_UBYTE) 5756 NODE_NAME_CASE(BUFFER_LOAD_USHORT) 5757 NODE_NAME_CASE(BUFFER_LOAD_BYTE) 5758 NODE_NAME_CASE(BUFFER_LOAD_SHORT) 5759 NODE_NAME_CASE(BUFFER_LOAD_TFE) 5760 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE) 5761 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE) 5762 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE) 5763 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE) 5764 NODE_NAME_CASE(BUFFER_LOAD_FORMAT) 5765 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE) 5766 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) 5767 NODE_NAME_CASE(SBUFFER_LOAD) 5768 NODE_NAME_CASE(SBUFFER_LOAD_BYTE) 5769 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE) 5770 NODE_NAME_CASE(SBUFFER_LOAD_SHORT) 5771 NODE_NAME_CASE(SBUFFER_LOAD_USHORT) 5772 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA) 5773 NODE_NAME_CASE(BUFFER_STORE) 5774 NODE_NAME_CASE(BUFFER_STORE_BYTE) 5775 NODE_NAME_CASE(BUFFER_STORE_SHORT) 5776 NODE_NAME_CASE(BUFFER_STORE_FORMAT) 5777 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) 5778 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) 5779 NODE_NAME_CASE(BUFFER_ATOMIC_ADD) 5780 NODE_NAME_CASE(BUFFER_ATOMIC_SUB) 5781 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) 5782 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) 5783 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) 5784 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) 5785 NODE_NAME_CASE(BUFFER_ATOMIC_AND) 5786 NODE_NAME_CASE(BUFFER_ATOMIC_OR) 5787 NODE_NAME_CASE(BUFFER_ATOMIC_XOR) 5788 NODE_NAME_CASE(BUFFER_ATOMIC_INC) 5789 NODE_NAME_CASE(BUFFER_ATOMIC_DEC) 5790 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) 5791 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) 5792 NODE_NAME_CASE(BUFFER_ATOMIC_FADD) 5793 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) 5794 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) 5795 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) 5796 } 5797 return nullptr; 5798 } 5799 5800 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, 5801 SelectionDAG &DAG, int Enabled, 5802 int &RefinementSteps, 5803 bool &UseOneConstNR, 5804 bool Reciprocal) const 
{ 5805 EVT VT = Operand.getValueType(); 5806 5807 if (VT == MVT::f32) { 5808 RefinementSteps = 0; 5809 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); 5810 } 5811 5812 // TODO: There is also an f64 rsq instruction, but the documentation is less 5813 // clear on its precision. 5814 5815 return SDValue(); 5816 } 5817 5818 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, 5819 SelectionDAG &DAG, int Enabled, 5820 int &RefinementSteps) const { 5821 EVT VT = Operand.getValueType(); 5822 5823 if (VT == MVT::f32) { 5824 // Reciprocal, < 1 ulp error. 5825 // 5826 // This reciprocal approximation converges to < 0.5 ulp error with one 5827 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs). 5828 5829 RefinementSteps = 0; 5830 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); 5831 } 5832 5833 // TODO: There is also an f64 rcp instruction, but the documentation is less 5834 // clear on its precision. 5835 5836 return SDValue(); 5837 } 5838 5839 static unsigned workitemIntrinsicDim(unsigned ID) { 5840 switch (ID) { 5841 case Intrinsic::amdgcn_workitem_id_x: 5842 return 0; 5843 case Intrinsic::amdgcn_workitem_id_y: 5844 return 1; 5845 case Intrinsic::amdgcn_workitem_id_z: 5846 return 2; 5847 default: 5848 llvm_unreachable("not a workitem intrinsic"); 5849 } 5850 } 5851 5852 void AMDGPUTargetLowering::computeKnownBitsForTargetNode( 5853 const SDValue Op, KnownBits &Known, 5854 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { 5855 5856 Known.resetAll(); // Don't know anything. 5857 5858 unsigned Opc = Op.getOpcode(); 5859 5860 switch (Opc) { 5861 default: 5862 break; 5863 case AMDGPUISD::CARRY: 5864 case AMDGPUISD::BORROW: { 5865 Known.Zero = APInt::getHighBitsSet(32, 31); 5866 break; 5867 } 5868 5869 case AMDGPUISD::BFE_I32: 5870 case AMDGPUISD::BFE_U32: { 5871 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 5872 if (!CWidth) 5873 return; 5874 5875 uint32_t Width = CWidth->getZExtValue() & 0x1f; 5876 5877 if (Opc == AMDGPUISD::BFE_U32) 5878 Known.Zero = APInt::getHighBitsSet(32, 32 - Width); 5879 5880 break; 5881 } 5882 case AMDGPUISD::FP_TO_FP16: { 5883 unsigned BitWidth = Known.getBitWidth(); 5884 5885 // High bits are zero. 5886 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16); 5887 break; 5888 } 5889 case AMDGPUISD::MUL_U24: 5890 case AMDGPUISD::MUL_I24: { 5891 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 5892 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 5893 unsigned TrailZ = LHSKnown.countMinTrailingZeros() + 5894 RHSKnown.countMinTrailingZeros(); 5895 Known.Zero.setLowBits(std::min(TrailZ, 32u)); 5896 // Skip the extra check if all bits are known zero. 5897 if (TrailZ >= 32) 5898 break; 5899 5900 // Truncate to 24 bits.
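// MUL_U24 / MUL_I24 only read the low 24 bits of each operand, so known
// bits above bit 23 of the sources carry no information here. E.g.
// (unsigned case, illustrative) operands known to fit in 8 and 4 bits give
// a product that fits in 12 bits, and setBitsFrom(12) below then marks bits
// 12..31 as known zero.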
5901 LHSKnown = LHSKnown.trunc(24); 5902 RHSKnown = RHSKnown.trunc(24); 5903 5904 if (Opc == AMDGPUISD::MUL_I24) { 5905 unsigned LHSValBits = LHSKnown.countMaxSignificantBits(); 5906 unsigned RHSValBits = RHSKnown.countMaxSignificantBits(); 5907 unsigned MaxValBits = LHSValBits + RHSValBits; 5908 if (MaxValBits > 32) 5909 break; 5910 unsigned SignBits = 32 - MaxValBits + 1; 5911 bool LHSNegative = LHSKnown.isNegative(); 5912 bool LHSNonNegative = LHSKnown.isNonNegative(); 5913 bool LHSPositive = LHSKnown.isStrictlyPositive(); 5914 bool RHSNegative = RHSKnown.isNegative(); 5915 bool RHSNonNegative = RHSKnown.isNonNegative(); 5916 bool RHSPositive = RHSKnown.isStrictlyPositive(); 5917 5918 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative)) 5919 Known.Zero.setHighBits(SignBits); 5920 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative)) 5921 Known.One.setHighBits(SignBits); 5922 } else { 5923 unsigned LHSValBits = LHSKnown.countMaxActiveBits(); 5924 unsigned RHSValBits = RHSKnown.countMaxActiveBits(); 5925 unsigned MaxValBits = LHSValBits + RHSValBits; 5926 if (MaxValBits >= 32) 5927 break; 5928 Known.Zero.setBitsFrom(MaxValBits); 5929 } 5930 break; 5931 } 5932 case AMDGPUISD::PERM: { 5933 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 5934 if (!CMask) 5935 return; 5936 5937 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 5938 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 5939 unsigned Sel = CMask->getZExtValue(); 5940 5941 for (unsigned I = 0; I < 32; I += 8) { 5942 unsigned SelBits = Sel & 0xff; 5943 if (SelBits < 4) { 5944 SelBits *= 8; 5945 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 5946 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 5947 } else if (SelBits < 7) { 5948 SelBits = (SelBits & 3) * 8; 5949 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 5950 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 5951 } else if (SelBits == 0x0c) { 5952 Known.Zero |= 0xFFull << I; 5953 } else if (SelBits > 0x0c) { 5954 Known.One |= 0xFFull << I; 5955 } 5956 Sel >>= 8; 5957 } 5958 break; 5959 } 5960 case AMDGPUISD::BUFFER_LOAD_UBYTE: { 5961 Known.Zero.setHighBits(24); 5962 break; 5963 } 5964 case AMDGPUISD::BUFFER_LOAD_USHORT: { 5965 Known.Zero.setHighBits(16); 5966 break; 5967 } 5968 case AMDGPUISD::LDS: { 5969 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode()); 5970 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout()); 5971 5972 Known.Zero.setHighBits(16); 5973 Known.Zero.setLowBits(Log2(Alignment)); 5974 break; 5975 } 5976 case AMDGPUISD::SMIN3: 5977 case AMDGPUISD::SMAX3: 5978 case AMDGPUISD::SMED3: 5979 case AMDGPUISD::UMIN3: 5980 case AMDGPUISD::UMAX3: 5981 case AMDGPUISD::UMED3: { 5982 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); 5983 if (Known2.isUnknown()) 5984 break; 5985 5986 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 5987 if (Known1.isUnknown()) 5988 break; 5989 5990 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 5991 if (Known0.isUnknown()) 5992 break; 5993 5994 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling. 
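// The three-input min/max/med nodes always return one of their operands, so
// a bit is known in the result only if it agrees across all three inputs;
// e.g. (illustrative) if every operand is known to be even, bit 0 of the
// result is known zero. The intersections below encode exactly that.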
5995 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero; 5996 Known.One = Known0.One & Known1.One & Known2.One; 5997 break; 5998 } 5999 case ISD::INTRINSIC_WO_CHAIN: { 6000 unsigned IID = Op.getConstantOperandVal(0); 6001 switch (IID) { 6002 case Intrinsic::amdgcn_workitem_id_x: 6003 case Intrinsic::amdgcn_workitem_id_y: 6004 case Intrinsic::amdgcn_workitem_id_z: { 6005 unsigned MaxValue = Subtarget->getMaxWorkitemID( 6006 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID)); 6007 Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); 6008 break; 6009 } 6010 default: 6011 break; 6012 } 6013 } 6014 } 6015 } 6016 6017 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( 6018 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, 6019 unsigned Depth) const { 6020 switch (Op.getOpcode()) { 6021 case AMDGPUISD::BFE_I32: { 6022 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 6023 if (!Width) 6024 return 1; 6025 6026 unsigned SignBits = 32 - Width->getZExtValue() + 1; 6027 if (!isNullConstant(Op.getOperand(1))) 6028 return SignBits; 6029 6030 // TODO: Could probably figure something out with non-0 offsets. 6031 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 6032 return std::max(SignBits, Op0SignBits); 6033 } 6034 6035 case AMDGPUISD::BFE_U32: { 6036 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 6037 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; 6038 } 6039 6040 case AMDGPUISD::CARRY: 6041 case AMDGPUISD::BORROW: 6042 return 31; 6043 case AMDGPUISD::BUFFER_LOAD_BYTE: 6044 return 25; 6045 case AMDGPUISD::BUFFER_LOAD_SHORT: 6046 return 17; 6047 case AMDGPUISD::BUFFER_LOAD_UBYTE: 6048 return 24; 6049 case AMDGPUISD::BUFFER_LOAD_USHORT: 6050 return 16; 6051 case AMDGPUISD::FP_TO_FP16: 6052 return 16; 6053 case AMDGPUISD::SMIN3: 6054 case AMDGPUISD::SMAX3: 6055 case AMDGPUISD::SMED3: 6056 case AMDGPUISD::UMIN3: 6057 case AMDGPUISD::UMAX3: 6058 case AMDGPUISD::UMED3: { 6059 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1); 6060 if (Tmp2 == 1) 6061 return 1; // Early out. 6062 6063 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1); 6064 if (Tmp1 == 1) 6065 return 1; // Early out. 6066 6067 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 6068 if (Tmp0 == 1) 6069 return 1; // Early out. 6070 6071 return std::min({Tmp0, Tmp1, Tmp2}); 6072 } 6073 default: 6074 return 1; 6075 } 6076 } 6077 6078 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr( 6079 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, 6080 const MachineRegisterInfo &MRI, unsigned Depth) const { 6081 const MachineInstr *MI = MRI.getVRegDef(R); 6082 if (!MI) 6083 return 1; 6084 6085 // TODO: Check range metadata on MMO. 
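// The constants below mirror the SelectionDAG cases above: a sign-extending
// byte load has 32 - 8 + 1 = 25 known sign bits and a sign-extending short
// load has 32 - 16 + 1 = 17, while the zero-extending forms produce 24 and
// 16 known leading zeros, which count as that many sign bits.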
6086 switch (MI->getOpcode()) { 6087 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 6088 return 25; 6089 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 6090 return 17; 6091 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 6092 return 24; 6093 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 6094 return 16; 6095 case AMDGPU::G_AMDGPU_SMED3: 6096 case AMDGPU::G_AMDGPU_UMED3: { 6097 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); 6098 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1); 6099 if (Tmp2 == 1) 6100 return 1; 6101 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1); 6102 if (Tmp1 == 1) 6103 return 1; 6104 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1); 6105 if (Tmp0 == 1) 6106 return 1; 6107 return std::min({Tmp0, Tmp1, Tmp2}); 6108 } 6109 default: 6110 return 1; 6111 } 6112 } 6113 6114 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode( 6115 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN, 6116 unsigned Depth) const { 6117 unsigned Opcode = Op.getOpcode(); 6118 switch (Opcode) { 6119 case AMDGPUISD::FMIN_LEGACY: 6120 case AMDGPUISD::FMAX_LEGACY: { 6121 if (SNaN) 6122 return true; 6123 6124 // TODO: Can check no nans on one of the operands for each one, but which 6125 // one? 6126 return false; 6127 } 6128 case AMDGPUISD::FMUL_LEGACY: 6129 case AMDGPUISD::CVT_PKRTZ_F16_F32: { 6130 if (SNaN) 6131 return true; 6132 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 6133 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 6134 } 6135 case AMDGPUISD::FMED3: 6136 case AMDGPUISD::FMIN3: 6137 case AMDGPUISD::FMAX3: 6138 case AMDGPUISD::FMINIMUM3: 6139 case AMDGPUISD::FMAXIMUM3: 6140 case AMDGPUISD::FMAD_FTZ: { 6141 if (SNaN) 6142 return true; 6143 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 6144 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 6145 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 6146 } 6147 case AMDGPUISD::CVT_F32_UBYTE0: 6148 case AMDGPUISD::CVT_F32_UBYTE1: 6149 case AMDGPUISD::CVT_F32_UBYTE2: 6150 case AMDGPUISD::CVT_F32_UBYTE3: 6151 return true; 6152 6153 case AMDGPUISD::RCP: 6154 case AMDGPUISD::RSQ: 6155 case AMDGPUISD::RCP_LEGACY: 6156 case AMDGPUISD::RSQ_CLAMP: { 6157 if (SNaN) 6158 return true; 6159 6160 // TODO: Need is known positive check. 6161 return false; 6162 } 6163 case ISD::FLDEXP: 6164 case AMDGPUISD::FRACT: { 6165 if (SNaN) 6166 return true; 6167 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); 6168 } 6169 case AMDGPUISD::DIV_SCALE: 6170 case AMDGPUISD::DIV_FMAS: 6171 case AMDGPUISD::DIV_FIXUP: 6172 // TODO: Refine on operands. 
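// Illustrative note: these division helper nodes may create a quiet NaN
// from non-NaN inputs (e.g. for a 0/0 division), so only the absence of a
// signaling NaN in the result can be guaranteed.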
6173 return SNaN; 6174 case AMDGPUISD::SIN_HW: 6175 case AMDGPUISD::COS_HW: { 6176 // TODO: Need check for infinity 6177 return SNaN; 6178 } 6179 case ISD::INTRINSIC_WO_CHAIN: { 6180 unsigned IntrinsicID = Op.getConstantOperandVal(0); 6181 // TODO: Handle more intrinsics 6182 switch (IntrinsicID) { 6183 case Intrinsic::amdgcn_cubeid: 6184 case Intrinsic::amdgcn_cvt_off_f32_i4: 6185 return true; 6186 6187 case Intrinsic::amdgcn_frexp_mant: { 6188 if (SNaN) 6189 return true; 6190 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 6191 } 6192 case Intrinsic::amdgcn_cvt_pkrtz: { 6193 if (SNaN) 6194 return true; 6195 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 6196 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 6197 } 6198 case Intrinsic::amdgcn_rcp: 6199 case Intrinsic::amdgcn_rsq: 6200 case Intrinsic::amdgcn_rcp_legacy: 6201 case Intrinsic::amdgcn_rsq_legacy: 6202 case Intrinsic::amdgcn_rsq_clamp: 6203 case Intrinsic::amdgcn_tanh: { 6204 if (SNaN) 6205 return true; 6206 6207 // TODO: Need is known positive check. 6208 return false; 6209 } 6210 case Intrinsic::amdgcn_trig_preop: 6211 case Intrinsic::amdgcn_fdot2: 6212 // TODO: Refine on operand 6213 return SNaN; 6214 case Intrinsic::amdgcn_fma_legacy: 6215 if (SNaN) 6216 return true; 6217 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 6218 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) && 6219 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1); 6220 default: 6221 return false; 6222 } 6223 } 6224 default: 6225 return false; 6226 } 6227 } 6228 6229 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, 6230 Register N0, Register N1) const { 6231 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks 6232 } 6233