//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "AMDGPU.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600Subtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

using namespace llvm;

#include "R600GenCallingConv.inc"

R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
                                       const R600Subtarget &STI)
    : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) {
  addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass);
  addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass);
  addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass);
  addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
  }

  // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
  // We need to include these since trunc STORES to PRIVATE need
  // special handling to accommodate RMW
  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);

  // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
  setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);

  // Set condition code actions
  setCondCodeAction(ISD::SETO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::FCEIL, MVT::f64, Custom);
  setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
  setOperationAction(ISD::FRINT, MVT::f64, Custom);
  setOperationAction(ISD::FFLOOR, MVT::f64, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // ADD, SUB overflow.
  // TODO: turn these into Legal?
  if (Subtarget->hasCARRY())
    setOperationAction(ISD::UADDO, MVT::i32, Custom);

  if (Subtarget->hasBORROW())
    setOperationAction(ISD::USUBO, MVT::i32, Custom);

  // Expand sign extension of vectors
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  if (!Subtarget->hasFMA()) {
    setOperationAction(ISD::FMA, MVT::f32, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
  }

  // FIXME: May need no denormals check
  setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
  // need it for R600.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  // LLVM will expand these to atomic_cmp_swap(0)
  // and atomic_swap, respectively.
  setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);

  // We need to custom lower some of the intrinsics
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  setSchedulingPreference(Sched::Source);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
  setTargetDAGCombine(ISD::LOAD);
}

static inline bool isEOP(MachineBasicBlock::iterator I) {
  if (std::next(I) == I->getParent()->end())
    return false;
  return std::next(I)->getOpcode() == R600::RETURN;
}

MachineBasicBlock *
R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = MI;
  const R600InstrInfo *TII = Subtarget->getInstrInfo();

  switch (MI.getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
    if (TII->isLDSRetInstr(MI.getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
      // LDS_1A2D support and remove this special case.
      if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
          MI.getOpcode() == R600::LDS_CMPST_RET)
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(R600::getLDSNoRetOp(MI.getOpcode())));
      for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
        NewMI.add(MI.getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;

  case R600::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, R600::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case R600::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, R600::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case R600::MASK_WRITE: {
    Register maskedRegister = MI.getOperand(0).getReg();
    assert(maskedRegister.isVirtual());
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case R600::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
                                                            .getFPImm()
                                                            ->getValueAPF()
                                                            .bitcastToAPInt()
                                                            .getZExtValue());
    break;

  case R600::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
                     MI.getOperand(1).getImm());
    break;

  case R600::MOV_IMM_GLOBAL_ADDR: {
    //TODO: Perhaps combine this instruction with the next if possible
    auto MIB = TII->buildDefaultInstruction(
        *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X);
    int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal);
    //TODO: Ugh this is rather ugly
    MIB->getOperand(Idx) = MI.getOperand(1);
    break;
  }

  case R600::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST);
    TII->setImmOperand(*NewMI, R600::OpName::src0_sel,
                       MI.getOperand(1).getImm());
    break;
  }

  case R600::RAT_WRITE_CACHELESS_32_eg:
  case R600::RAT_WRITE_CACHELESS_64_eg:
  case R600::RAT_WRITE_CACHELESS_128_eg:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1))
        .addImm(isEOP(I)); // Set End of program bit
    break;

  case R600::RAT_STORE_TYPED_eg:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1))
        .add(MI.getOperand(2))
        .addImm(isEOP(I)); // Set End of program bit
    break;

  case R600::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP))
        .add(MI.getOperand(0));
    break;

  case R600::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
                R600::PREDICATE_BIT)
            .add(MI.getOperand(1))
            .addImm(R600::PRED_SETNE)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
        .add(MI.getOperand(0))
        .addReg(R600::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case R600::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
                R600::PREDICATE_BIT)
            .add(MI.getOperand(1))
            .addImm(R600::PRED_SETNE_INT)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
        .add(MI.getOperand(0))
        .addReg(R600::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case R600::EG_ExportSwz:
  case R600::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI.getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == R600::EG_ExportSwz ||
          NextExportInst->getOpcode() == R600::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = isEOP(I);
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1))
        .add(MI.getOperand(2))
        .add(MI.getOperand(3))
        .add(MI.getOperand(4))
        .add(MI.getOperand(5))
        .add(MI.getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case R600::RETURN: {
    return BB;
  }
  }

  MI.eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
  case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
  case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case Intrinsic::r600_store_swizzle: {
      SDLoc DL(Op);
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, DL, MVT::i32), // SWZ_X
        DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
        DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
        DAG.getConstant(3, DL, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    case Intrinsic::r600_tex:
    case Intrinsic::r600_texc: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case Intrinsic::r600_tex:
        TextureOp = 0;
        break;
      case Intrinsic::r600_texc:
        TextureOp = 1;
        break;
      default:
        llvm_unreachable("unhandled texture operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, DL, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i32),
        DAG.getConstant(2, DL, MVT::i32),
        DAG.getConstant(3, DL, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i32),
        DAG.getConstant(2, DL, MVT::i32),
        DAG.getConstant(3, DL, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case Intrinsic::r600_dot4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(0, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(0, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(1, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(1, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(2, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(2, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(3, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(3, DL, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_implicitarg_ptr: {
      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
      uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT);
      return DAG.getConstant(ByteOffset, DL, PtrVT);
    }
    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
    case Intrinsic::amdgcn_workgroup_id_x:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
    case Intrinsic::amdgcn_workgroup_id_y:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
    case Intrinsic::amdgcn_workgroup_id_z:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
    case Intrinsic::amdgcn_workitem_id_x:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
    case Intrinsic::amdgcn_workitem_id_y:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
    case Intrinsic::amdgcn_workitem_id_z:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T0_Z, VT);

    case Intrinsic::r600_recipsqrt_ieee:
      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));

    case Intrinsic::r600_recipsqrt_clamped:
      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
    default:
      return Op;
    }

    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
      return;
    }
    // Since we don't care about out of bounds values we can use FP_TO_SINT for
    // uints too. The DAGLegalizer code for uint considers some extra cases
    // which are not necessary here.
    LLVM_FALLTHROUGH;
  case ISD::FP_TO_SINT: {
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
      return;
    }

    SDValue Result;
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    LowerUDIVREM64(Op, DAG, Results);
    break;
  }
  }
}

SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {
  SDLoc DL(Vector);
  EVT VecVT = Vector.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  SmallVector<SDValue, 8> Args;

  for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
                               DAG.getVectorIdxConstant(i, DL)));
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
}

SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Index = Op.getOperand(1);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vector, Index);
}

SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
                               Vector, Value, Index);
  return vectorToVerticalVector(DAG, Insert);
}

SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                               SDValue Op,
                                               SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  const DataLayout &DL = DAG.getDataLayout();
  const GlobalValue *GV = GSD->getGlobal();
  MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);

  SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
  return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1. and 1.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Should this propagate fast-math-flags?
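  // 0.15915494309 is approximately 1/(2*pi); together with the 0.5 bias below
  // it lets FRACT reduce the argument to a single period.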
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
      DAG.getNode(ISD::FADD, DL, VT,
        DAG.getNode(ISD::FMUL, DL, VT, Arg,
          DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
        DAG.getConstantFP(0.5, DL, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
      DAG.getNode(ISD::FADD, DL, VT, FractPart,
        DAG.getConstantFP(-0.5, DL, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
                     DAG.getConstantFP(numbers::pif, DL, MVT::f32));
}

SDValue R600TargetLowering::LowerShiftParts(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDValue Lo, Hi;
  expandShiftParts(Op.getNode(), Lo, Hi, DAG);
  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}

SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
                                          unsigned mainop, unsigned ovf) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);

  SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
  // Extend sign.
  OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
                    DAG.getValueType(MVT::i1));

  SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
}

SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(
      ISD::SETCC,
      DL,
      MVT::i1,
      Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
      DAG.getCondCode(ISD::SETEQ));
}

SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(
      ISD::SETCC,
      DL,
      MVT::i1,
      Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
      DAG.getCondCode(ISD::SETEQ));
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   const SDLoc &DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)));
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->isExactlyValue(1.0);
  }
  return isAllOnesConstant(Op);
}

bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->getValueAPF().isZero();
  }
  return isNullConstant(Op);
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    ISD::CondCode InverseCC = ISD::getSetCCInverse(CCOpcode, CompareVT);
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT);
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch(StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  SDLoc DL(Ptr);
  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, DL, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Store);
  //TODO: Who creates the i8 stores?
  assert(Store->isTruncatingStore()
         || Store->getValue().getValueType() == MVT::i8);
  assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);

  SDValue Mask;
  if (Store->getMemoryVT() == MVT::i8) {
    assert(Store->getAlignment() >= 1);
    Mask = DAG.getConstant(0xff, DL, MVT::i32);
  } else if (Store->getMemoryVT() == MVT::i16) {
    assert(Store->getAlignment() >= 2);
    Mask = DAG.getConstant(0xffff, DL, MVT::i32);
  } else {
    llvm_unreachable("Unsupported private trunc store");
  }

  SDValue OldChain = Store->getChain();
  bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN);
  // Skip dummy
  SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain;
  SDValue BasePtr = Store->getBasePtr();
  SDValue Offset = Store->getOffset();
  EVT MemVT = Store->getMemoryVT();

  SDValue LoadPtr = BasePtr;
  if (!Offset.isUndef()) {
    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
  }

  // Get dword location
  // TODO: this should be eliminated by the future SHR ptr, 2
  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                            DAG.getConstant(0xfffffffc, DL, MVT::i32));

  // Load dword
  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS);
  SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);

  Chain = Dst.getValue(1);

  // Get offset in dword
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                                DAG.getConstant(0x3, DL, MVT::i32));

  // Convert byte offset to bit shift
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  // TODO: Contrary to the name of the function,
  // it also handles sub i32 non-truncating stores (like i1)
  SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                  Store->getValue());

  // Mask the value to the right type
  SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);

  // Shift the value in place
  SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                     MaskedValue, ShiftAmt);

  // Shift the mask in place
  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);

  // Invert the mask. NOTE: if we had native ROL instructions we could
  // use inverted mask
  DstMask = DAG.getNOT(DL, DstMask, MVT::i32);

  // Cleanup the target bits
  Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);

  // Add the new bits
  SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);

  // Store dword
  // TODO: Can we be smarter about MachinePointerInfo?
  SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, PtrInfo);

  // If we are part of expanded vector, make our neighbors depend on this store
  if (VectorTrunc) {
    // Make all other vector elements depend on this store
    Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore);
    DAG.ReplaceAllUsesOfValueWith(OldChain, Chain);
  }
  return NewStore;
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  unsigned AS = StoreNode->getAddressSpace();

  SDValue Chain = StoreNode->getChain();
  SDValue Ptr = StoreNode->getBasePtr();
  SDValue Value = StoreNode->getValue();

  EVT VT = Value.getValueType();
  EVT MemVT = StoreNode->getMemoryVT();
  EVT PtrVT = Ptr.getValueType();

  SDLoc DL(Op);

  const bool TruncatingStore = StoreNode->isTruncatingStore();

  // Neither LOCAL nor PRIVATE can do vectors at the moment
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS ||
       TruncatingStore) &&
      VT.isVector()) {
    if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
      // Add an extra level of chain to isolate this vector
      SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
      // TODO: can the chain be replaced without creating a new store?
      SDValue NewStore = DAG.getTruncStore(
          NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(),
          MemVT, StoreNode->getAlignment(),
          StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo());
      StoreNode = cast<StoreSDNode>(NewStore);
    }

    return scalarizeVectorStore(StoreNode, DAG);
  }

  Align Alignment = StoreNode->getAlign();
  if (Alignment < MemVT.getStoreSize() &&
      !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
                                      StoreNode->getMemOperand()->getFlags(),
                                      nullptr)) {
    return expandUnalignedStore(StoreNode, DAG);
  }

  SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
                                  DAG.getConstant(2, DL, PtrVT));

  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
    // It is beneficial to create MSKOR here instead of combiner to avoid
    // artificial dependencies introduced by RMW
    if (TruncatingStore) {
      assert(VT.bitsLE(MVT::i32));
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        assert(StoreNode->getAlignment() >= 2);
        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
      }

      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr,
                                      DAG.getConstant(0x00000003, DL, PtrVT));
      SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                     DAG.getConstant(3, DL, VT));

      // Put the mask in correct place
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);

      // Put the value bits in correct place
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);

      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);

      if (StoreNode->isIndexed()) {
        llvm_unreachable("Indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
    return SDValue();

  if (MemVT.bitsLT(MVT::i32))
    return lowerPrivateTruncStore(StoreNode, DAG);

  // Standard i32+ store, tag it with DWORDADDR to note that the address
  // has been shifted
  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
    return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
  }

  // Tagged i32+ stores will be matched by patterns
  return SDValue();
}

// Return 512 + (kc_bank << 12)
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  assert(Load->getAlignment() >= MemVT.getStoreSize());

  SDValue BasePtr = Load->getBasePtr();
  SDValue Chain = Load->getChain();
  SDValue Offset = Load->getOffset();

  SDValue LoadPtr = BasePtr;
  if (!Offset.isUndef()) {
    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
  }

  // Get dword location
  // NOTE: this should be eliminated by the future SHR ptr, 2
  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                            DAG.getConstant(0xfffffffc, DL, MVT::i32));

  // Load dword
  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS);
  SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);

  // Get offset within the register.
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));

  // Bit offset of target byte (byteIdx * 8).
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  // Shift to the right.
  SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);

  // Eliminate the upper bits by setting them to ...
  EVT MemEltVT = MemVT.getScalarType();

  if (ExtType == ISD::SEXTLOAD) { // ... ones.
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
    Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
  } else { // ... or zeros.
    Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
  }

  SDValue Ops[] = {
    Ret,
    Read.getValue(1) // This should be our output chain
  };

  return DAG.getMergeValues(Ops, DL);
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  unsigned AS = LoadNode->getAddressSpace();
  EVT MemVT = LoadNode->getMemoryVT();
  ISD::LoadExtType ExtType = LoadNode->getExtensionType();

  if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
      ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
    return lowerPrivateExtLoad(Op, DAG);
  }

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = LoadNode->getChain();
  SDValue Ptr = LoadNode->getBasePtr();

  if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
      VT.isVector()) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LoadNode, DAG);
    return DAG.getMergeValues(Ops, DL);
  }

  // This is still used for explicit load from addrspace(8)
  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG);
    } else {
      //TODO: Does this even work?
      // non-constant ptr can't be folded, keeps it as a v4f32 load
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                      DAG.getConstant(4, DL, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, DL, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However SEXT loads from other address spaces are not supported, so
  // we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue NewLoad = DAG.getExtLoad(
        ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
        LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
    SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
                              DAG.getValueType(MemVT));

    SDValue MergedValues[2] = { Res, Chain };
    return DAG.getMergeValues(MergedValues, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // DWORDADDR ISD marks already shifted address
  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
    assert(VT == MVT::i32);
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
    return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
  }
  return SDValue();
}

SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Jump = Op.getOperand(2);

  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
                     Chain, Jump, Cond);
}

SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
                                            SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const R600FrameLowering *TFL = Subtarget->getFrameLowering();

  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);

  unsigned FrameIndex = FIN->getIndex();
  Register IgnoredFrameReg;
  StackOffset Offset =
      TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
  return DAG.getConstant(Offset.getFixed() * 4 * TFL->getStackWidth(MF),
                         SDLoc(Op), Op.getValueType());
}

CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) const {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_R600;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  SmallVector<ISD::InputArg, 8> LocalIns;

  if (AMDGPU::isShader(CallConv)) {
    CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
  } else {
    analyzeFormalArgumentsCompute(CCInfo, Ins);
  }

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    const ISD::InputArg &In = Ins[i];
    EVT VT = In.VT;
    EVT MemVT = VA.getLocVT();
    if (!VT.isVector() && MemVT.isVector()) {
      // Get load source type if scalarized.
      MemVT = MemVT.getVectorElementType();
    }

    if (AMDGPU::isShader(CallConv)) {
      Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. It attempts to create this sextload, but it ends up
    // being invalid. Somehow this seems to work with i64 arguments, but breaks
    // for <1 x i64>.

    // The first 36 bytes of the input buffer contains information about
    // thread group and global sizes.
    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
      // FIXME: This should really check the extload type, but the handling of
      // extload vector parameters seems to be broken.

      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      Ext = ISD::SEXTLOAD;
    }

    // Compute the offset from the value.
    // XXX - I think PartOffset should give you this, but it seems to give the
    // size of the register which isn't useful.

    unsigned PartOffset = VA.getLocMemOffset();
    unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset);

    MachinePointerInfo PtrInfo(AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getLoad(
        ISD::UNINDEXED, Ext, VT, DL, Chain,
        DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
        PtrInfo,
        MemVT, Alignment, MachineMemOperand::MONonTemporal |
                              MachineMemOperand::MODereferenceable |
                              MachineMemOperand::MOInvariant);

    InVals.push_back(Arg);
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                           EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                          const SelectionDAG &DAG) const {
  // Local and Private addresses do not handle vectors. Limit to i32
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS)) {
    return (MemVT.getSizeInBits() <= 32);
  }
  return true;
}

bool R600TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
    bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  if (!VT.isSimple() || VT == MVT::Other)
    return false;

  if (VT.bitsLT(MVT::i32))
    return false;

  // TODO: This is a rough estimate.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Alignment >= Align(4);
}

static SDValue CompactSwizzlableVector(
    SelectionDAG &DAG, SDValue VectorEntry,
    DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(RemapSwizzle.empty());

  SDLoc DL(VectorEntry);
  EVT EltTy = VectorEntry.getValueType().getVectorElementType();

  SDValue NewBldVec[4];
  for (unsigned i = 0; i < 4; i++)
    NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
                               DAG.getIntPtrConstant(i, DL));

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].isUndef())
      // We mask write here to teach later passes that the ith element of this
      // vector is undef. Thus we can use it to reduce 128-bit register usage,
      // break false dependencies and additionally make assembly easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].isUndef())
      continue;

    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
                            NewBldVec);
}

static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(RemapSwizzle.empty());

  SDLoc DL(VectorEntry);
  EVT EltTy = VectorEntry.getValueType().getVectorElementType();

  SDValue NewBldVec[4];
  bool isUnmovable[4] = {false, false, false, false};
  for (unsigned i = 0; i < 4; i++)
    NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
                               DAG.getIntPtrConstant(i, DL));

  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
                            NewBldVec);
}

SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],

SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
                                            SelectionDAG &DAG,
                                            const SDLoc &DL) const {
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
  }

  return BuildVector;
}
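
// constBufferLoad turns a dword-aligned, non-extending i32 (or i32 vector)
// load from a constant buffer into per-channel CONST_ADDRESS reads placed
// 4 bytes apart, then extracts element 0 again for scalar loads. Loads that
// are narrower than 32 bits or under-aligned are left untouched by returning
// an empty SDValue.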

SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block,
                                            SelectionDAG &DAG) const {
  SDLoc DL(LoadNode);
  EVT VT = LoadNode->getValueType(0);
  SDValue Chain = LoadNode->getChain();
  SDValue Ptr = LoadNode->getBasePtr();
  assert(isa<ConstantSDNode>(Ptr));

  // TODO: Support smaller loads.
  if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 ||
      !ISD::isNON_EXTLoad(LoadNode))
    return SDValue();

  if (LoadNode->getAlignment() < 4)
    return SDValue();

  int ConstantBlock = ConstantAddressBlock(Block);

  SDValue Slots[4];
  for (unsigned i = 0; i < 4; i++) {
    // We want the constant position encoded with the following formula:
    //   (((512 + (kc_bank << 12) + const_index) << 2) + chan)
    // where const_index is Ptr, computed by LLVM using an alignment of 16.
    // Thus we add ((512 + (kc_bank << 12)) + chan) * 4 here and then divide
    // by 4 at the ISel step.
    SDValue NewPtr = DAG.getNode(
        ISD::ADD, DL, Ptr.getValueType(), Ptr,
        DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
    Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
  }
  EVT NewVT = MVT::v4i32;
  unsigned NumElements = 4;
  if (VT.isVector()) {
    NewVT = VT;
    NumElements = VT.getVectorNumElements();
  }
  SDValue Result =
      DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
  if (!VT.isVector()) {
    Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                         DAG.getConstant(0, DL, MVT::i32));
  }
  SDValue MergedValues[2] = {
    Result,
    Chain
  };
  return DAG.getMergeValues(MergedValues, DL);
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, DL, MVT::i32), // True
                       DAG.getConstant(0, DL, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC
  }

  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
  //   => build_vector elt0, ... , NewEltIdx, ... , eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.isUndef())
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit.
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted.
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can
    // essentially be converted to a BUILD_VECTOR). Fill in the Ops vector
    // with the vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.isUndef()) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element.
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector.
    return DAG.getBuildVector(VT, DL, Ops);
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
        (Arg.getOperand(0).getValueType().getVectorNumElements() ==
         Arg.getValueType().getVectorNumElements())) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }

  case ISD::SELECT_CC: {
    // Try common optimizations first.
    if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //   selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //   selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC, LHS.getOperand(0).getValueType());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(DL,
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::R600_EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
    return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
  }
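
  // TEXTURE_FETCH gets the same swizzle optimization as R600_EXPORT above;
  // the node just carries more operands, with the four swizzle selects
  // sitting at operand indices 2-5 (hence &NewArgs[2] below).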
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
  }

  case ISD::LOAD: {
    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
    SDValue Ptr = LoadNode->getBasePtr();
    if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS &&
        isa<ConstantSDNode>(Ptr))
      return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG);
    break;
  }

  default: break;
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
                                     SDValue &Src, SDValue &Neg, SDValue &Abs,
                                     SDValue &Sel, SDValue &Imm,
                                     SelectionDAG &DAG) const {
  const R600InstrInfo *TII = Subtarget->getInstrInfo();
  if (!Src.isMachineOpcode())
    return false;

  switch (Src.getMachineOpcode()) {
  case R600::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    return true;
  case R600::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    return true;
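  // CONST_COPY: fold the constant-buffer read directly into the consuming ALU
  // instruction, but only if the resulting set of constant selects (the ones
  // already used by the other source operands plus this one) still satisfies
  // TII->fitsConstReadLimitations() below.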
  case R600::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values.
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0),
      TII->getOperandIdx(Opcode, R600::OpName::src1),
      TII->getOperandIdx(Opcode, R600::OpName::src2),
      TII->getOperandIdx(Opcode, R600::OpName::src0_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (int OtherSrcIdx : SrcIndices) {
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
              dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == R600::ALU_CONST) {
          ConstantSDNode *Cst
            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(R600::ALU_CONST, MVT::f32);
    return true;
  }
  case R600::MOV_IMM_GLOBAL_ADDR:
    // Check if the Imm slot is used. Taken from below.
    if (cast<ConstantSDNode>(Imm)->getZExtValue())
      return false;
    Imm = Src.getOperand(0);
    Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32);
    return true;
  case R600::MOV_IMM_I32:
  case R600::MOV_IMM_F32: {
    unsigned ImmReg = R600::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == R600::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = R600::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = R600::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = R600::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = R600::ZERO;
      } else if (Value == 1) {
        ImmReg = R600::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == R600::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = cast<ConstantSDNode>(Imm);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}
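
// PostISelFolding walks the just-selected machine node's source operands and
// uses FoldOperand above to absorb FNEG/FABS/CONST_COPY/MOV_IMM producers into
// the instruction's neg/abs/sel/literal operand slots; the first successful
// fold rebuilds the node with the updated operand list.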

/// Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII = Subtarget->getInstrInfo();
  if (!Node->isMachineOpcode())
    return Node;

  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());

  if (Opcode == R600::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == R600::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0),
      TII->getOperandIdx(Opcode, R600::OpName::src1),
      TII->getOperandIdx(Opcode, R600::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg),
      TII->getOperandIdx(Opcode, R600::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}