//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
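// hasSourceMods answers whether an instruction supports neg/abs source
// modifiers on its inputs, i.e. whether a user of an fneg result could absorb
// the negation for free. It conservatively returns false for anything that is
// not a plain FP ALU operation: memory accesses, copies, bitcasts, inline asm,
// and a handful of intrinsics.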
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    unsigned IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding anyway, so
  // a source modifier is truly free for them. For users that would otherwise
  // fit a smaller encoding, folding in a source modifier forces VOP3 and
  // increases code size. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// The negated forms of 0.0 and 1.0 / (2.0 * pi) do not have inline immediates,
// so there is an additional cost to negate these constants.
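// isConstantCostlierToNegate reports whether \p Reg holds such a constant (or
// a splat of one): +0.0, or 1/(2*pi) on subtargets where it is an inline
// immediate.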
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}

static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_FMAXIMUM:
    return AMDGPU::G_FMINIMUM;
  case AMDGPU::G_FMINIMUM:
    return AMDGPU::G_FMAXIMUM;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    unsigned IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Replace register in operand with a register holding negated value.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Replace either register in operands with a register holding negated value.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate appropriate operands so that resulting value of MatchInfo is
  // negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    unsigned IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
    // but replaceRegWith will replace defs as well. It is easier to replace one
    // def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for the other uses of the old
    // MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}

bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

  return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
         isFPExtFromF16OrConst(MRI, Src2);
}

void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  Builder.setInstrAndDebugLoc(MI);

  // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
  // sources.
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

  LLT Ty = MRI.getType(Src0);
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}