1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Custom DAG lowering for R600 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "R600ISelLowering.h" 15 #include "AMDGPU.h" 16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 17 #include "R600Defines.h" 18 #include "R600InstrInfo.h" 19 #include "R600MachineFunctionInfo.h" 20 #include "R600Subtarget.h" 21 #include "llvm/IR/IntrinsicsAMDGPU.h" 22 #include "llvm/IR/IntrinsicsR600.h" 23 24 using namespace llvm; 25 26 #include "R600GenCallingConv.inc" 27 28 R600TargetLowering::R600TargetLowering(const TargetMachine &TM, 29 const R600Subtarget &STI) 30 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) { 31 addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass); 32 addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass); 33 addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass); 34 addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass); 35 addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass); 36 addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass); 37 38 setBooleanContents(ZeroOrNegativeOneBooleanContent); 39 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 40 41 computeRegisterProperties(Subtarget->getRegisterInfo()); 42 43 // Legalize loads and stores to the private address space. 44 setOperationAction(ISD::LOAD, MVT::i32, Custom); 45 setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 46 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 47 48 // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address 49 // spaces, so it is custom lowered to handle those where it isn't. 50 for (MVT VT : MVT::integer_valuetypes()) { 51 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 52 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); 53 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); 54 55 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 56 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); 57 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); 58 59 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 60 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); 61 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); 62 } 63 64 // Workaround for LegalizeDAG asserting on expansion of i1 vector loads. 
65 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand); 66 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); 67 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); 68 69 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand); 70 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); 71 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); 72 73 setOperationAction(ISD::STORE, MVT::i8, Custom); 74 setOperationAction(ISD::STORE, MVT::i32, Custom); 75 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 76 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 77 78 setTruncStoreAction(MVT::i32, MVT::i8, Custom); 79 setTruncStoreAction(MVT::i32, MVT::i16, Custom); 80 // We need to include these since trunc STORES to PRIVATE need 81 // special handling to accommodate RMW 82 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); 83 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom); 84 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom); 85 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom); 86 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom); 87 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); 88 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); 89 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom); 90 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom); 91 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom); 92 93 // Workaround for LegalizeDAG asserting on expansion of i1 vector stores. 94 setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand); 95 setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand); 96 97 // Set condition code actions 98 setCondCodeAction(ISD::SETO, MVT::f32, Expand); 99 setCondCodeAction(ISD::SETUO, MVT::f32, Expand); 100 setCondCodeAction(ISD::SETLT, MVT::f32, Expand); 101 setCondCodeAction(ISD::SETLE, MVT::f32, Expand); 102 setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); 103 setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); 104 setCondCodeAction(ISD::SETONE, MVT::f32, Expand); 105 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); 106 setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); 107 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); 108 setCondCodeAction(ISD::SETULT, MVT::f32, Expand); 109 setCondCodeAction(ISD::SETULE, MVT::f32, Expand); 110 111 setCondCodeAction(ISD::SETLE, MVT::i32, Expand); 112 setCondCodeAction(ISD::SETLT, MVT::i32, Expand); 113 setCondCodeAction(ISD::SETULE, MVT::i32, Expand); 114 setCondCodeAction(ISD::SETULT, MVT::i32, Expand); 115 116 setOperationAction(ISD::FCOS, MVT::f32, Custom); 117 setOperationAction(ISD::FSIN, MVT::f32, Custom); 118 119 setOperationAction(ISD::SETCC, MVT::v4i32, Expand); 120 setOperationAction(ISD::SETCC, MVT::v2i32, Expand); 121 122 setOperationAction(ISD::BR_CC, MVT::i32, Expand); 123 setOperationAction(ISD::BR_CC, MVT::f32, Expand); 124 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 125 126 setOperationAction(ISD::FSUB, MVT::f32, Expand); 127 128 setOperationAction(ISD::FCEIL, MVT::f64, Custom); 129 setOperationAction(ISD::FTRUNC, MVT::f64, Custom); 130 setOperationAction(ISD::FRINT, MVT::f64, Custom); 131 setOperationAction(ISD::FFLOOR, MVT::f64, Custom); 132 133 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 134 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 135 136 setOperationAction(ISD::SETCC, MVT::i32, Expand); 137 setOperationAction(ISD::SETCC, MVT::f32, Expand); 138 setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); 139 setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom); 140 setOperationAction(ISD::FP_TO_SINT, MVT::i64, 
Custom); 141 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 142 143 setOperationAction(ISD::SELECT, MVT::i32, Expand); 144 setOperationAction(ISD::SELECT, MVT::f32, Expand); 145 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 146 setOperationAction(ISD::SELECT, MVT::v4i32, Expand); 147 148 // ADD, SUB overflow. 149 // TODO: turn these into Legal? 150 if (Subtarget->hasCARRY()) 151 setOperationAction(ISD::UADDO, MVT::i32, Custom); 152 153 if (Subtarget->hasBORROW()) 154 setOperationAction(ISD::USUBO, MVT::i32, Custom); 155 156 // Expand sign extension of vectors 157 if (!Subtarget->hasBFE()) 158 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 159 160 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); 161 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); 162 163 if (!Subtarget->hasBFE()) 164 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 165 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); 166 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); 167 168 if (!Subtarget->hasBFE()) 169 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 170 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); 171 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); 172 173 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 174 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); 175 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); 176 177 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); 178 179 setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 180 181 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); 182 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); 183 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 184 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 185 186 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); 187 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); 188 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 189 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 190 191 // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 192 // to be Legal/Custom in order to avoid library calls. 193 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 194 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 195 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 196 197 if (!Subtarget->hasFMA()) { 198 setOperationAction(ISD::FMA, MVT::f32, Expand); 199 setOperationAction(ISD::FMA, MVT::f64, Expand); 200 } 201 202 // FIXME: May need no denormals check 203 setOperationAction(ISD::FMAD, MVT::f32, Legal); 204 205 if (!Subtarget->hasBFI()) { 206 // fcopysign can be done in a single instruction with BFI. 207 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 208 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 209 } 210 211 if (!Subtarget->hasBCNT(32)) 212 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 213 214 if (!Subtarget->hasBCNT(64)) 215 setOperationAction(ISD::CTPOP, MVT::i64, Expand); 216 217 if (Subtarget->hasFFBH()) 218 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); 219 220 if (Subtarget->hasFFBL()) 221 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); 222 223 // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we 224 // need it for R600. 
225 if (Subtarget->hasBFE()) 226 setHasExtractBitsInsn(true); 227 228 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 229 230 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 231 for (MVT VT : ScalarIntVTs) { 232 setOperationAction(ISD::ADDC, VT, Expand); 233 setOperationAction(ISD::SUBC, VT, Expand); 234 setOperationAction(ISD::ADDE, VT, Expand); 235 setOperationAction(ISD::SUBE, VT, Expand); 236 } 237 238 // LLVM will expand these to atomic_cmp_swap(0) 239 // and atomic_swap, respectively. 240 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); 241 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); 242 243 // We need to custom lower some of the intrinsics 244 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 245 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 246 247 setSchedulingPreference(Sched::Source); 248 249 setTargetDAGCombine(ISD::FP_ROUND); 250 setTargetDAGCombine(ISD::FP_TO_SINT); 251 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 252 setTargetDAGCombine(ISD::SELECT_CC); 253 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 254 setTargetDAGCombine(ISD::LOAD); 255 } 256 257 static inline bool isEOP(MachineBasicBlock::iterator I) { 258 if (std::next(I) == I->getParent()->end()) 259 return false; 260 return std::next(I)->getOpcode() == R600::RETURN; 261 } 262 263 MachineBasicBlock * 264 R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 265 MachineBasicBlock *BB) const { 266 MachineFunction *MF = BB->getParent(); 267 MachineRegisterInfo &MRI = MF->getRegInfo(); 268 MachineBasicBlock::iterator I = MI; 269 const R600InstrInfo *TII = Subtarget->getInstrInfo(); 270 271 switch (MI.getOpcode()) { 272 default: 273 // Replace LDS_*_RET instruction that don't have any uses with the 274 // equivalent LDS_*_NORET instruction. 275 if (TII->isLDSRetInstr(MI.getOpcode())) { 276 int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst); 277 assert(DstIdx != -1); 278 MachineInstrBuilder NewMI; 279 // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add 280 // LDS_1A2D support and remove this special case. 
281 if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) || 282 MI.getOpcode() == R600::LDS_CMPST_RET) 283 return BB; 284 285 NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), 286 TII->get(R600::getLDSNoRetOp(MI.getOpcode()))); 287 for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { 288 NewMI.add(MI.getOperand(i)); 289 } 290 } else { 291 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 292 } 293 break; 294 295 case R600::FABS_R600: { 296 MachineInstr *NewMI = TII->buildDefaultInstruction( 297 *BB, I, R600::MOV, MI.getOperand(0).getReg(), 298 MI.getOperand(1).getReg()); 299 TII->addFlag(*NewMI, 0, MO_FLAG_ABS); 300 break; 301 } 302 303 case R600::FNEG_R600: { 304 MachineInstr *NewMI = TII->buildDefaultInstruction( 305 *BB, I, R600::MOV, MI.getOperand(0).getReg(), 306 MI.getOperand(1).getReg()); 307 TII->addFlag(*NewMI, 0, MO_FLAG_NEG); 308 break; 309 } 310 311 case R600::MASK_WRITE: { 312 Register maskedRegister = MI.getOperand(0).getReg(); 313 assert(maskedRegister.isVirtual()); 314 MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); 315 TII->addFlag(*defInstr, 0, MO_FLAG_MASK); 316 break; 317 } 318 319 case R600::MOV_IMM_F32: 320 TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1) 321 .getFPImm() 322 ->getValueAPF() 323 .bitcastToAPInt() 324 .getZExtValue()); 325 break; 326 327 case R600::MOV_IMM_I32: 328 TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), 329 MI.getOperand(1).getImm()); 330 break; 331 332 case R600::MOV_IMM_GLOBAL_ADDR: { 333 //TODO: Perhaps combine this instruction with the next if possible 334 auto MIB = TII->buildDefaultInstruction( 335 *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X); 336 int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal); 337 //TODO: Ugh this is rather ugly 338 MIB->getOperand(Idx) = MI.getOperand(1); 339 break; 340 } 341 342 case R600::CONST_COPY: { 343 MachineInstr *NewMI = TII->buildDefaultInstruction( 344 *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST); 345 TII->setImmOperand(*NewMI, R600::OpName::src0_sel, 346 MI.getOperand(1).getImm()); 347 break; 348 } 349 350 case R600::RAT_WRITE_CACHELESS_32_eg: 351 case R600::RAT_WRITE_CACHELESS_64_eg: 352 case R600::RAT_WRITE_CACHELESS_128_eg: 353 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) 354 .add(MI.getOperand(0)) 355 .add(MI.getOperand(1)) 356 .addImm(isEOP(I)); // Set End of program bit 357 break; 358 359 case R600::RAT_STORE_TYPED_eg: 360 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) 361 .add(MI.getOperand(0)) 362 .add(MI.getOperand(1)) 363 .add(MI.getOperand(2)) 364 .addImm(isEOP(I)); // Set End of program bit 365 break; 366 367 case R600::BRANCH: 368 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP)) 369 .add(MI.getOperand(0)); 370 break; 371 372 case R600::BRANCH_COND_f32: { 373 MachineInstr *NewMI = 374 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), 375 R600::PREDICATE_BIT) 376 .add(MI.getOperand(1)) 377 .addImm(R600::PRED_SETNE) 378 .addImm(0); // Flags 379 TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); 380 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) 381 .add(MI.getOperand(0)) 382 .addReg(R600::PREDICATE_BIT, RegState::Kill); 383 break; 384 } 385 386 case R600::BRANCH_COND_i32: { 387 MachineInstr *NewMI = 388 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), 389 R600::PREDICATE_BIT) 390 .add(MI.getOperand(1)) 391 .addImm(R600::PRED_SETNE_INT) 392 .addImm(0); // Flags 393 TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); 394 BuildMI(*BB, I, 
BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) 395 .add(MI.getOperand(0)) 396 .addReg(R600::PREDICATE_BIT, RegState::Kill); 397 break; 398 } 399 400 case R600::EG_ExportSwz: 401 case R600::R600_ExportSwz: { 402 // Instruction is left unmodified if its not the last one of its type 403 bool isLastInstructionOfItsType = true; 404 unsigned InstExportType = MI.getOperand(1).getImm(); 405 for (MachineBasicBlock::iterator NextExportInst = std::next(I), 406 EndBlock = BB->end(); NextExportInst != EndBlock; 407 NextExportInst = std::next(NextExportInst)) { 408 if (NextExportInst->getOpcode() == R600::EG_ExportSwz || 409 NextExportInst->getOpcode() == R600::R600_ExportSwz) { 410 unsigned CurrentInstExportType = NextExportInst->getOperand(1) 411 .getImm(); 412 if (CurrentInstExportType == InstExportType) { 413 isLastInstructionOfItsType = false; 414 break; 415 } 416 } 417 } 418 bool EOP = isEOP(I); 419 if (!EOP && !isLastInstructionOfItsType) 420 return BB; 421 unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 84 : 40; 422 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) 423 .add(MI.getOperand(0)) 424 .add(MI.getOperand(1)) 425 .add(MI.getOperand(2)) 426 .add(MI.getOperand(3)) 427 .add(MI.getOperand(4)) 428 .add(MI.getOperand(5)) 429 .add(MI.getOperand(6)) 430 .addImm(CfInst) 431 .addImm(EOP); 432 break; 433 } 434 case R600::RETURN: { 435 return BB; 436 } 437 } 438 439 MI.eraseFromParent(); 440 return BB; 441 } 442 443 //===----------------------------------------------------------------------===// 444 // Custom DAG Lowering Operations 445 //===----------------------------------------------------------------------===// 446 447 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 448 MachineFunction &MF = DAG.getMachineFunction(); 449 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 450 switch (Op.getOpcode()) { 451 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 452 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 453 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 454 case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); 455 case ISD::SRA_PARTS: 456 case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); 457 case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); 458 case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); 459 case ISD::FCOS: 460 case ISD::FSIN: return LowerTrig(Op, DAG); 461 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 462 case ISD::STORE: return LowerSTORE(Op, DAG); 463 case ISD::LOAD: { 464 SDValue Result = LowerLOAD(Op, DAG); 465 assert((!Result.getNode() || 466 Result.getNode()->getNumValues() == 2) && 467 "Load should return a value and a chain"); 468 return Result; 469 } 470 471 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 472 case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); 473 case ISD::FrameIndex: return lowerFrameIndex(Op, DAG); 474 case ISD::INTRINSIC_VOID: { 475 SDValue Chain = Op.getOperand(0); 476 unsigned IntrinsicID = 477 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 478 switch (IntrinsicID) { 479 case Intrinsic::r600_store_swizzle: { 480 SDLoc DL(Op); 481 const SDValue Args[8] = { 482 Chain, 483 Op.getOperand(2), // Export Value 484 Op.getOperand(3), // ArrayBase 485 Op.getOperand(4), // Type 486 DAG.getConstant(0, DL, MVT::i32), // SWZ_X 487 DAG.getConstant(1, DL, MVT::i32), // SWZ_Y 488 DAG.getConstant(2, DL, MVT::i32), // SWZ_Z 489 DAG.getConstant(3, DL, 
MVT::i32) // SWZ_W 490 }; 491 return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args); 492 } 493 494 // default for switch(IntrinsicID) 495 default: break; 496 } 497 // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) 498 break; 499 } 500 case ISD::INTRINSIC_WO_CHAIN: { 501 unsigned IntrinsicID = 502 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 503 EVT VT = Op.getValueType(); 504 SDLoc DL(Op); 505 switch (IntrinsicID) { 506 case Intrinsic::r600_tex: 507 case Intrinsic::r600_texc: { 508 unsigned TextureOp; 509 switch (IntrinsicID) { 510 case Intrinsic::r600_tex: 511 TextureOp = 0; 512 break; 513 case Intrinsic::r600_texc: 514 TextureOp = 1; 515 break; 516 default: 517 llvm_unreachable("unhandled texture operation"); 518 } 519 520 SDValue TexArgs[19] = { 521 DAG.getConstant(TextureOp, DL, MVT::i32), 522 Op.getOperand(1), 523 DAG.getConstant(0, DL, MVT::i32), 524 DAG.getConstant(1, DL, MVT::i32), 525 DAG.getConstant(2, DL, MVT::i32), 526 DAG.getConstant(3, DL, MVT::i32), 527 Op.getOperand(2), 528 Op.getOperand(3), 529 Op.getOperand(4), 530 DAG.getConstant(0, DL, MVT::i32), 531 DAG.getConstant(1, DL, MVT::i32), 532 DAG.getConstant(2, DL, MVT::i32), 533 DAG.getConstant(3, DL, MVT::i32), 534 Op.getOperand(5), 535 Op.getOperand(6), 536 Op.getOperand(7), 537 Op.getOperand(8), 538 Op.getOperand(9), 539 Op.getOperand(10) 540 }; 541 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); 542 } 543 case Intrinsic::r600_dot4: { 544 SDValue Args[8] = { 545 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 546 DAG.getConstant(0, DL, MVT::i32)), 547 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 548 DAG.getConstant(0, DL, MVT::i32)), 549 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 550 DAG.getConstant(1, DL, MVT::i32)), 551 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 552 DAG.getConstant(1, DL, MVT::i32)), 553 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 554 DAG.getConstant(2, DL, MVT::i32)), 555 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 556 DAG.getConstant(2, DL, MVT::i32)), 557 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 558 DAG.getConstant(3, DL, MVT::i32)), 559 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 560 DAG.getConstant(3, DL, MVT::i32)) 561 }; 562 return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); 563 } 564 565 case Intrinsic::r600_implicitarg_ptr: { 566 MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS); 567 uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT); 568 return DAG.getConstant(ByteOffset, DL, PtrVT); 569 } 570 case Intrinsic::r600_read_ngroups_x: 571 return LowerImplicitParameter(DAG, VT, DL, 0); 572 case Intrinsic::r600_read_ngroups_y: 573 return LowerImplicitParameter(DAG, VT, DL, 1); 574 case Intrinsic::r600_read_ngroups_z: 575 return LowerImplicitParameter(DAG, VT, DL, 2); 576 case Intrinsic::r600_read_global_size_x: 577 return LowerImplicitParameter(DAG, VT, DL, 3); 578 case Intrinsic::r600_read_global_size_y: 579 return LowerImplicitParameter(DAG, VT, DL, 4); 580 case Intrinsic::r600_read_global_size_z: 581 return LowerImplicitParameter(DAG, VT, DL, 5); 582 case Intrinsic::r600_read_local_size_x: 583 return LowerImplicitParameter(DAG, VT, DL, 6); 584 case Intrinsic::r600_read_local_size_y: 585 return LowerImplicitParameter(DAG, VT, DL, 7); 586 case Intrinsic::r600_read_local_size_z: 587 return 
LowerImplicitParameter(DAG, VT, DL, 8); 588 589 case Intrinsic::r600_read_tgid_x: 590 case Intrinsic::amdgcn_workgroup_id_x: 591 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 592 R600::T1_X, VT); 593 case Intrinsic::r600_read_tgid_y: 594 case Intrinsic::amdgcn_workgroup_id_y: 595 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 596 R600::T1_Y, VT); 597 case Intrinsic::r600_read_tgid_z: 598 case Intrinsic::amdgcn_workgroup_id_z: 599 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 600 R600::T1_Z, VT); 601 case Intrinsic::r600_read_tidig_x: 602 case Intrinsic::amdgcn_workitem_id_x: 603 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 604 R600::T0_X, VT); 605 case Intrinsic::r600_read_tidig_y: 606 case Intrinsic::amdgcn_workitem_id_y: 607 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 608 R600::T0_Y, VT); 609 case Intrinsic::r600_read_tidig_z: 610 case Intrinsic::amdgcn_workitem_id_z: 611 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 612 R600::T0_Z, VT); 613 614 case Intrinsic::r600_recipsqrt_ieee: 615 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 616 617 case Intrinsic::r600_recipsqrt_clamped: 618 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 619 default: 620 return Op; 621 } 622 623 // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) 624 break; 625 } 626 } // end switch(Op.getOpcode()) 627 return SDValue(); 628 } 629 630 void R600TargetLowering::ReplaceNodeResults(SDNode *N, 631 SmallVectorImpl<SDValue> &Results, 632 SelectionDAG &DAG) const { 633 switch (N->getOpcode()) { 634 default: 635 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); 636 return; 637 case ISD::FP_TO_UINT: 638 if (N->getValueType(0) == MVT::i1) { 639 Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG)); 640 return; 641 } 642 // Since we don't care about out of bounds values we can use FP_TO_SINT for 643 // uints too. The DAGLegalizer code for uint considers some extra cases 644 // which are not necessary here. 
645 LLVM_FALLTHROUGH; 646 case ISD::FP_TO_SINT: { 647 if (N->getValueType(0) == MVT::i1) { 648 Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG)); 649 return; 650 } 651 652 SDValue Result; 653 if (expandFP_TO_SINT(N, Result, DAG)) 654 Results.push_back(Result); 655 return; 656 } 657 case ISD::SDIVREM: { 658 SDValue Op = SDValue(N, 1); 659 SDValue RES = LowerSDIVREM(Op, DAG); 660 Results.push_back(RES); 661 Results.push_back(RES.getValue(1)); 662 break; 663 } 664 case ISD::UDIVREM: { 665 SDValue Op = SDValue(N, 0); 666 LowerUDIVREM64(Op, DAG, Results); 667 break; 668 } 669 } 670 } 671 672 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, 673 SDValue Vector) const { 674 SDLoc DL(Vector); 675 EVT VecVT = Vector.getValueType(); 676 EVT EltVT = VecVT.getVectorElementType(); 677 SmallVector<SDValue, 8> Args; 678 679 for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) { 680 Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, 681 DAG.getVectorIdxConstant(i, DL))); 682 } 683 684 return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); 685 } 686 687 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 688 SelectionDAG &DAG) const { 689 SDLoc DL(Op); 690 SDValue Vector = Op.getOperand(0); 691 SDValue Index = Op.getOperand(1); 692 693 if (isa<ConstantSDNode>(Index) || 694 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 695 return Op; 696 697 Vector = vectorToVerticalVector(DAG, Vector); 698 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), 699 Vector, Index); 700 } 701 702 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 703 SelectionDAG &DAG) const { 704 SDLoc DL(Op); 705 SDValue Vector = Op.getOperand(0); 706 SDValue Value = Op.getOperand(1); 707 SDValue Index = Op.getOperand(2); 708 709 if (isa<ConstantSDNode>(Index) || 710 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 711 return Op; 712 713 Vector = vectorToVerticalVector(DAG, Vector); 714 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), 715 Vector, Value, Index); 716 return vectorToVerticalVector(DAG, Insert); 717 } 718 719 SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 720 SDValue Op, 721 SelectionDAG &DAG) const { 722 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 723 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) 724 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 725 726 const DataLayout &DL = DAG.getDataLayout(); 727 const GlobalValue *GV = GSD->getGlobal(); 728 MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); 729 730 SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT); 731 return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA); 732 } 733 734 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 735 // On hw >= R700, COS/SIN input must be between -1. and 1. 736 // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) 737 EVT VT = Op.getValueType(); 738 SDValue Arg = Op.getOperand(0); 739 SDLoc DL(Op); 740 741 // TODO: Should this propagate fast-math-flags? 
742 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 743 DAG.getNode(ISD::FADD, DL, VT, 744 DAG.getNode(ISD::FMUL, DL, VT, Arg, 745 DAG.getConstantFP(0.15915494309, DL, MVT::f32)), 746 DAG.getConstantFP(0.5, DL, MVT::f32))); 747 unsigned TrigNode; 748 switch (Op.getOpcode()) { 749 case ISD::FCOS: 750 TrigNode = AMDGPUISD::COS_HW; 751 break; 752 case ISD::FSIN: 753 TrigNode = AMDGPUISD::SIN_HW; 754 break; 755 default: 756 llvm_unreachable("Wrong trig opcode"); 757 } 758 SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, 759 DAG.getNode(ISD::FADD, DL, VT, FractPart, 760 DAG.getConstantFP(-0.5, DL, MVT::f32))); 761 if (Gen >= AMDGPUSubtarget::R700) 762 return TrigVal; 763 // On R600 hw, COS/SIN input must be between -Pi and Pi. 764 return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, 765 DAG.getConstantFP(numbers::pif, DL, MVT::f32)); 766 } 767 768 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { 769 SDLoc DL(Op); 770 EVT VT = Op.getValueType(); 771 772 SDValue Lo = Op.getOperand(0); 773 SDValue Hi = Op.getOperand(1); 774 SDValue Shift = Op.getOperand(2); 775 SDValue Zero = DAG.getConstant(0, DL, VT); 776 SDValue One = DAG.getConstant(1, DL, VT); 777 778 SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); 779 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); 780 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); 781 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); 782 783 // The dance around Width1 is necessary for 0 special case. 784 // Without it the CompShift might be 32, producing incorrect results in 785 // Overflow. So we do the shift in two steps, the alternative is to 786 // add a conditional to filter the special case. 787 788 SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); 789 Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); 790 791 SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); 792 HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); 793 SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); 794 795 SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); 796 SDValue LoBig = Zero; 797 798 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); 799 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); 800 801 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); 802 } 803 804 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { 805 SDLoc DL(Op); 806 EVT VT = Op.getValueType(); 807 808 SDValue Lo = Op.getOperand(0); 809 SDValue Hi = Op.getOperand(1); 810 SDValue Shift = Op.getOperand(2); 811 SDValue Zero = DAG.getConstant(0, DL, VT); 812 SDValue One = DAG.getConstant(1, DL, VT); 813 814 const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; 815 816 SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); 817 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); 818 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); 819 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); 820 821 // The dance around Width1 is necessary for 0 special case. 822 // Without it the CompShift might be 32, producing incorrect results in 823 // Overflow. So we do the shift in two steps, the alternative is to 824 // add a conditional to filter the special case. 825 826 SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); 827 Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); 828 829 SDValue HiSmall = DAG.getNode(SRA ? 
ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); 830 SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); 831 LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); 832 833 SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); 834 SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; 835 836 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); 837 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); 838 839 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); 840 } 841 842 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, 843 unsigned mainop, unsigned ovf) const { 844 SDLoc DL(Op); 845 EVT VT = Op.getValueType(); 846 847 SDValue Lo = Op.getOperand(0); 848 SDValue Hi = Op.getOperand(1); 849 850 SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); 851 // Extend sign. 852 OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, 853 DAG.getValueType(MVT::i1)); 854 855 SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); 856 857 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); 858 } 859 860 SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { 861 SDLoc DL(Op); 862 return DAG.getNode( 863 ISD::SETCC, 864 DL, 865 MVT::i1, 866 Op, DAG.getConstantFP(1.0f, DL, MVT::f32), 867 DAG.getCondCode(ISD::SETEQ)); 868 } 869 870 SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { 871 SDLoc DL(Op); 872 return DAG.getNode( 873 ISD::SETCC, 874 DL, 875 MVT::i1, 876 Op, DAG.getConstantFP(-1.0f, DL, MVT::f32), 877 DAG.getCondCode(ISD::SETEQ)); 878 } 879 880 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, 881 const SDLoc &DL, 882 unsigned DwordOffset) const { 883 unsigned ByteOffset = DwordOffset * 4; 884 PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), 885 AMDGPUAS::PARAM_I_ADDRESS); 886 887 // We shouldn't be using an offset wider than 16-bits for implicit parameters. 
888 assert(isInt<16>(ByteOffset)); 889 890 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 891 DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR 892 MachinePointerInfo(ConstantPointerNull::get(PtrType))); 893 } 894 895 bool R600TargetLowering::isZero(SDValue Op) const { 896 if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { 897 return Cst->isNullValue(); 898 } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){ 899 return CstFP->isZero(); 900 } else { 901 return false; 902 } 903 } 904 905 bool R600TargetLowering::isHWTrueValue(SDValue Op) const { 906 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 907 return CFP->isExactlyValue(1.0); 908 } 909 return isAllOnesConstant(Op); 910 } 911 912 bool R600TargetLowering::isHWFalseValue(SDValue Op) const { 913 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 914 return CFP->getValueAPF().isZero(); 915 } 916 return isNullConstant(Op); 917 } 918 919 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 920 SDLoc DL(Op); 921 EVT VT = Op.getValueType(); 922 923 SDValue LHS = Op.getOperand(0); 924 SDValue RHS = Op.getOperand(1); 925 SDValue True = Op.getOperand(2); 926 SDValue False = Op.getOperand(3); 927 SDValue CC = Op.getOperand(4); 928 SDValue Temp; 929 930 if (VT == MVT::f32) { 931 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); 932 SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); 933 if (MinMax) 934 return MinMax; 935 } 936 937 // LHS and RHS are guaranteed to be the same value type 938 EVT CompareVT = LHS.getValueType(); 939 940 // Check if we can lower this to a native operation. 941 942 // Try to lower to a SET* instruction: 943 // 944 // SET* can match the following patterns: 945 // 946 // select_cc f32, f32, -1, 0, cc_supported 947 // select_cc f32, f32, 1.0f, 0.0f, cc_supported 948 // select_cc i32, i32, -1, 0, cc_supported 949 // 950 951 // Move hardware True/False values to the correct operand. 952 if (isHWTrueValue(False) && isHWFalseValue(True)) { 953 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 954 ISD::CondCode InverseCC = ISD::getSetCCInverse(CCOpcode, CompareVT); 955 if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { 956 std::swap(False, True); 957 CC = DAG.getCondCode(InverseCC); 958 } else { 959 ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); 960 if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { 961 std::swap(False, True); 962 std::swap(LHS, RHS); 963 CC = DAG.getCondCode(SwapInvCC); 964 } 965 } 966 } 967 968 if (isHWTrueValue(True) && isHWFalseValue(False) && 969 (CompareVT == VT || VT == MVT::i32)) { 970 // This can be matched by a SET* instruction. 
971 return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); 972 } 973 974 // Try to lower to a CND* instruction: 975 // 976 // CND* can match the following patterns: 977 // 978 // select_cc f32, 0.0, f32, f32, cc_supported 979 // select_cc f32, 0.0, i32, i32, cc_supported 980 // select_cc i32, 0, f32, f32, cc_supported 981 // select_cc i32, 0, i32, i32, cc_supported 982 // 983 984 // Try to move the zero value to the RHS 985 if (isZero(LHS)) { 986 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 987 // Try swapping the operands 988 ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode); 989 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { 990 std::swap(LHS, RHS); 991 CC = DAG.getCondCode(CCSwapped); 992 } else { 993 // Try inverting the conditon and then swapping the operands 994 ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT); 995 CCSwapped = ISD::getSetCCSwappedOperands(CCInv); 996 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { 997 std::swap(True, False); 998 std::swap(LHS, RHS); 999 CC = DAG.getCondCode(CCSwapped); 1000 } 1001 } 1002 } 1003 if (isZero(RHS)) { 1004 SDValue Cond = LHS; 1005 SDValue Zero = RHS; 1006 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1007 if (CompareVT != VT) { 1008 // Bitcast True / False to the correct types. This will end up being 1009 // a nop, but it allows us to define only a single pattern in the 1010 // .TD files for each CND* instruction rather than having to have 1011 // one pattern for integer True/False and one for fp True/False 1012 True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); 1013 False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); 1014 } 1015 1016 switch (CCOpcode) { 1017 case ISD::SETONE: 1018 case ISD::SETUNE: 1019 case ISD::SETNE: 1020 CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT); 1021 Temp = True; 1022 True = False; 1023 False = Temp; 1024 break; 1025 default: 1026 break; 1027 } 1028 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, 1029 Cond, Zero, 1030 True, False, 1031 DAG.getCondCode(CCOpcode)); 1032 return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); 1033 } 1034 1035 // If we make it this for it means we have no native instructions to handle 1036 // this SELECT_CC, so we must lower it. 1037 SDValue HWTrue, HWFalse; 1038 1039 if (CompareVT == MVT::f32) { 1040 HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); 1041 HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); 1042 } else if (CompareVT == MVT::i32) { 1043 HWTrue = DAG.getConstant(-1, DL, CompareVT); 1044 HWFalse = DAG.getConstant(0, DL, CompareVT); 1045 } 1046 else { 1047 llvm_unreachable("Unhandled value type in LowerSELECT_CC"); 1048 } 1049 1050 // Lower this unsupported SELECT_CC into a combination of two supported 1051 // SELECT_CC operations. 1052 SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); 1053 1054 return DAG.getNode(ISD::SELECT_CC, DL, VT, 1055 Cond, HWFalse, 1056 True, False, 1057 DAG.getCondCode(ISD::SETNE)); 1058 } 1059 1060 /// LLVM generates byte-addressed pointers. For indirect addressing, we need to 1061 /// convert these pointers to a register index. Each register holds 1062 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the 1063 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used 1064 /// for indirect addressing. 
1065 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, 1066 unsigned StackWidth, 1067 SelectionDAG &DAG) const { 1068 unsigned SRLPad; 1069 switch(StackWidth) { 1070 case 1: 1071 SRLPad = 2; 1072 break; 1073 case 2: 1074 SRLPad = 3; 1075 break; 1076 case 4: 1077 SRLPad = 4; 1078 break; 1079 default: llvm_unreachable("Invalid stack width"); 1080 } 1081 1082 SDLoc DL(Ptr); 1083 return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, 1084 DAG.getConstant(SRLPad, DL, MVT::i32)); 1085 } 1086 1087 void R600TargetLowering::getStackAddress(unsigned StackWidth, 1088 unsigned ElemIdx, 1089 unsigned &Channel, 1090 unsigned &PtrIncr) const { 1091 switch (StackWidth) { 1092 default: 1093 case 1: 1094 Channel = 0; 1095 if (ElemIdx > 0) { 1096 PtrIncr = 1; 1097 } else { 1098 PtrIncr = 0; 1099 } 1100 break; 1101 case 2: 1102 Channel = ElemIdx % 2; 1103 if (ElemIdx == 2) { 1104 PtrIncr = 1; 1105 } else { 1106 PtrIncr = 0; 1107 } 1108 break; 1109 case 4: 1110 Channel = ElemIdx; 1111 PtrIncr = 0; 1112 break; 1113 } 1114 } 1115 1116 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, 1117 SelectionDAG &DAG) const { 1118 SDLoc DL(Store); 1119 //TODO: Who creates the i8 stores? 1120 assert(Store->isTruncatingStore() 1121 || Store->getValue().getValueType() == MVT::i8); 1122 assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS); 1123 1124 SDValue Mask; 1125 if (Store->getMemoryVT() == MVT::i8) { 1126 assert(Store->getAlignment() >= 1); 1127 Mask = DAG.getConstant(0xff, DL, MVT::i32); 1128 } else if (Store->getMemoryVT() == MVT::i16) { 1129 assert(Store->getAlignment() >= 2); 1130 Mask = DAG.getConstant(0xffff, DL, MVT::i32); 1131 } else { 1132 llvm_unreachable("Unsupported private trunc store"); 1133 } 1134 1135 SDValue OldChain = Store->getChain(); 1136 bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN); 1137 // Skip dummy 1138 SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain; 1139 SDValue BasePtr = Store->getBasePtr(); 1140 SDValue Offset = Store->getOffset(); 1141 EVT MemVT = Store->getMemoryVT(); 1142 1143 SDValue LoadPtr = BasePtr; 1144 if (!Offset.isUndef()) { 1145 LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset); 1146 } 1147 1148 // Get dword location 1149 // TODO: this should be eliminated by the future SHR ptr, 2 1150 SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, 1151 DAG.getConstant(0xfffffffc, DL, MVT::i32)); 1152 1153 // Load dword 1154 // TODO: can we be smarter about machine pointer info? 
1155 MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS); 1156 SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); 1157 1158 Chain = Dst.getValue(1); 1159 1160 // Get offset in dword 1161 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, 1162 DAG.getConstant(0x3, DL, MVT::i32)); 1163 1164 // Convert byte offset to bit shift 1165 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 1166 DAG.getConstant(3, DL, MVT::i32)); 1167 1168 // TODO: Contrary to the name of the functiom, 1169 // it also handles sub i32 non-truncating stores (like i1) 1170 SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, 1171 Store->getValue()); 1172 1173 // Mask the value to the right type 1174 SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); 1175 1176 // Shift the value in place 1177 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, 1178 MaskedValue, ShiftAmt); 1179 1180 // Shift the mask in place 1181 SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt); 1182 1183 // Invert the mask. NOTE: if we had native ROL instructions we could 1184 // use inverted mask 1185 DstMask = DAG.getNOT(DL, DstMask, MVT::i32); 1186 1187 // Cleanup the target bits 1188 Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); 1189 1190 // Add the new bits 1191 SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); 1192 1193 // Store dword 1194 // TODO: Can we be smarter about MachinePointerInfo? 1195 SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, PtrInfo); 1196 1197 // If we are part of expanded vector, make our neighbors depend on this store 1198 if (VectorTrunc) { 1199 // Make all other vector elements depend on this store 1200 Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore); 1201 DAG.ReplaceAllUsesOfValueWith(OldChain, Chain); 1202 } 1203 return NewStore; 1204 } 1205 1206 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1207 StoreSDNode *StoreNode = cast<StoreSDNode>(Op); 1208 unsigned AS = StoreNode->getAddressSpace(); 1209 1210 SDValue Chain = StoreNode->getChain(); 1211 SDValue Ptr = StoreNode->getBasePtr(); 1212 SDValue Value = StoreNode->getValue(); 1213 1214 EVT VT = Value.getValueType(); 1215 EVT MemVT = StoreNode->getMemoryVT(); 1216 EVT PtrVT = Ptr.getValueType(); 1217 1218 SDLoc DL(Op); 1219 1220 const bool TruncatingStore = StoreNode->isTruncatingStore(); 1221 1222 // Neither LOCAL nor PRIVATE can do vectors at the moment 1223 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS || 1224 TruncatingStore) && 1225 VT.isVector()) { 1226 if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { 1227 // Add an extra level of chain to isolate this vector 1228 SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); 1229 // TODO: can the chain be replaced without creating a new store? 
1230 SDValue NewStore = DAG.getTruncStore( 1231 NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), 1232 MemVT, StoreNode->getAlignment(), 1233 StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo()); 1234 StoreNode = cast<StoreSDNode>(NewStore); 1235 } 1236 1237 return scalarizeVectorStore(StoreNode, DAG); 1238 } 1239 1240 Align Alignment = StoreNode->getAlign(); 1241 if (Alignment < MemVT.getStoreSize() && 1242 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(), 1243 StoreNode->getMemOperand()->getFlags(), 1244 nullptr)) { 1245 return expandUnalignedStore(StoreNode, DAG); 1246 } 1247 1248 SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr, 1249 DAG.getConstant(2, DL, PtrVT)); 1250 1251 if (AS == AMDGPUAS::GLOBAL_ADDRESS) { 1252 // It is beneficial to create MSKOR here instead of combiner to avoid 1253 // artificial dependencies introduced by RMW 1254 if (TruncatingStore) { 1255 assert(VT.bitsLE(MVT::i32)); 1256 SDValue MaskConstant; 1257 if (MemVT == MVT::i8) { 1258 MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); 1259 } else { 1260 assert(MemVT == MVT::i16); 1261 assert(StoreNode->getAlignment() >= 2); 1262 MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); 1263 } 1264 1265 SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr, 1266 DAG.getConstant(0x00000003, DL, PtrVT)); 1267 SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, 1268 DAG.getConstant(3, DL, VT)); 1269 1270 // Put the mask in correct place 1271 SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift); 1272 1273 // Put the value bits in correct place 1274 SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); 1275 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift); 1276 1277 // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 1278 // vector instead. 1279 SDValue Src[4] = { 1280 ShiftedValue, 1281 DAG.getConstant(0, DL, MVT::i32), 1282 DAG.getConstant(0, DL, MVT::i32), 1283 Mask 1284 }; 1285 SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src); 1286 SDValue Args[3] = { Chain, Input, DWordAddr }; 1287 return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, 1288 Op->getVTList(), Args, MemVT, 1289 StoreNode->getMemOperand()); 1290 } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) { 1291 // Convert pointer from byte address to dword address. 
1292 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); 1293 1294 if (StoreNode->isIndexed()) { 1295 llvm_unreachable("Indexed stores not supported yet"); 1296 } else { 1297 Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); 1298 } 1299 return Chain; 1300 } 1301 } 1302 1303 // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes 1304 if (AS != AMDGPUAS::PRIVATE_ADDRESS) 1305 return SDValue(); 1306 1307 if (MemVT.bitsLT(MVT::i32)) 1308 return lowerPrivateTruncStore(StoreNode, DAG); 1309 1310 // Standard i32+ store, tag it with DWORDADDR to note that the address 1311 // has been shifted 1312 if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { 1313 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); 1314 return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); 1315 } 1316 1317 // Tagged i32+ stores will be matched by patterns 1318 return SDValue(); 1319 } 1320 1321 // return (512 + (kc_bank << 12) 1322 static int 1323 ConstantAddressBlock(unsigned AddressSpace) { 1324 switch (AddressSpace) { 1325 case AMDGPUAS::CONSTANT_BUFFER_0: 1326 return 512; 1327 case AMDGPUAS::CONSTANT_BUFFER_1: 1328 return 512 + 4096; 1329 case AMDGPUAS::CONSTANT_BUFFER_2: 1330 return 512 + 4096 * 2; 1331 case AMDGPUAS::CONSTANT_BUFFER_3: 1332 return 512 + 4096 * 3; 1333 case AMDGPUAS::CONSTANT_BUFFER_4: 1334 return 512 + 4096 * 4; 1335 case AMDGPUAS::CONSTANT_BUFFER_5: 1336 return 512 + 4096 * 5; 1337 case AMDGPUAS::CONSTANT_BUFFER_6: 1338 return 512 + 4096 * 6; 1339 case AMDGPUAS::CONSTANT_BUFFER_7: 1340 return 512 + 4096 * 7; 1341 case AMDGPUAS::CONSTANT_BUFFER_8: 1342 return 512 + 4096 * 8; 1343 case AMDGPUAS::CONSTANT_BUFFER_9: 1344 return 512 + 4096 * 9; 1345 case AMDGPUAS::CONSTANT_BUFFER_10: 1346 return 512 + 4096 * 10; 1347 case AMDGPUAS::CONSTANT_BUFFER_11: 1348 return 512 + 4096 * 11; 1349 case AMDGPUAS::CONSTANT_BUFFER_12: 1350 return 512 + 4096 * 12; 1351 case AMDGPUAS::CONSTANT_BUFFER_13: 1352 return 512 + 4096 * 13; 1353 case AMDGPUAS::CONSTANT_BUFFER_14: 1354 return 512 + 4096 * 14; 1355 case AMDGPUAS::CONSTANT_BUFFER_15: 1356 return 512 + 4096 * 15; 1357 default: 1358 return -1; 1359 } 1360 } 1361 1362 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, 1363 SelectionDAG &DAG) const { 1364 SDLoc DL(Op); 1365 LoadSDNode *Load = cast<LoadSDNode>(Op); 1366 ISD::LoadExtType ExtType = Load->getExtensionType(); 1367 EVT MemVT = Load->getMemoryVT(); 1368 assert(Load->getAlignment() >= MemVT.getStoreSize()); 1369 1370 SDValue BasePtr = Load->getBasePtr(); 1371 SDValue Chain = Load->getChain(); 1372 SDValue Offset = Load->getOffset(); 1373 1374 SDValue LoadPtr = BasePtr; 1375 if (!Offset.isUndef()) { 1376 LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset); 1377 } 1378 1379 // Get dword location 1380 // NOTE: this should be eliminated by the future SHR ptr, 2 1381 SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, 1382 DAG.getConstant(0xfffffffc, DL, MVT::i32)); 1383 1384 // Load dword 1385 // TODO: can we be smarter about machine pointer info? 1386 MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS); 1387 SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); 1388 1389 // Get offset within the register. 1390 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, 1391 LoadPtr, DAG.getConstant(0x3, DL, MVT::i32)); 1392 1393 // Bit offset of target byte (byteIdx * 8). 1394 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 1395 DAG.getConstant(3, DL, MVT::i32)); 1396 1397 // Shift to the right. 
1398 SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt); 1399 1400 // Eliminate the upper bits by setting them to ... 1401 EVT MemEltVT = MemVT.getScalarType(); 1402 1403 if (ExtType == ISD::SEXTLOAD) { // ... ones. 1404 SDValue MemEltVTNode = DAG.getValueType(MemEltVT); 1405 Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode); 1406 } else { // ... or zeros. 1407 Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT); 1408 } 1409 1410 SDValue Ops[] = { 1411 Ret, 1412 Read.getValue(1) // This should be our output chain 1413 }; 1414 1415 return DAG.getMergeValues(Ops, DL); 1416 } 1417 1418 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 1419 LoadSDNode *LoadNode = cast<LoadSDNode>(Op); 1420 unsigned AS = LoadNode->getAddressSpace(); 1421 EVT MemVT = LoadNode->getMemoryVT(); 1422 ISD::LoadExtType ExtType = LoadNode->getExtensionType(); 1423 1424 if (AS == AMDGPUAS::PRIVATE_ADDRESS && 1425 ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) { 1426 return lowerPrivateExtLoad(Op, DAG); 1427 } 1428 1429 SDLoc DL(Op); 1430 EVT VT = Op.getValueType(); 1431 SDValue Chain = LoadNode->getChain(); 1432 SDValue Ptr = LoadNode->getBasePtr(); 1433 1434 if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 1435 LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && 1436 VT.isVector()) { 1437 SDValue Ops[2]; 1438 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LoadNode, DAG); 1439 return DAG.getMergeValues(Ops, DL); 1440 } 1441 1442 // This is still used for explicit load from addrspace(8) 1443 int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); 1444 if (ConstantBlock > -1 && 1445 ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || 1446 (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { 1447 SDValue Result; 1448 if (isa<Constant>(LoadNode->getMemOperand()->getValue()) || 1449 isa<ConstantSDNode>(Ptr)) { 1450 return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG); 1451 } else { 1452 //TODO: Does this even work? 1453 // non-constant ptr can't be folded, keeps it as a v4f32 load 1454 Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, 1455 DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, 1456 DAG.getConstant(4, DL, MVT::i32)), 1457 DAG.getConstant(LoadNode->getAddressSpace() - 1458 AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) 1459 ); 1460 } 1461 1462 if (!VT.isVector()) { 1463 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, 1464 DAG.getConstant(0, DL, MVT::i32)); 1465 } 1466 1467 SDValue MergedValues[2] = { 1468 Result, 1469 Chain 1470 }; 1471 return DAG.getMergeValues(MergedValues, DL); 1472 } 1473 1474 // For most operations returning SDValue() will result in the node being 1475 // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we 1476 // need to manually expand loads that may be legal in some address spaces and 1477 // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for 1478 // compute shaders, since the data is sign extended when it is uploaded to the 1479 // buffer. However SEXT loads from other address spaces are not supported, so 1480 // we need to expand them here. 
1481 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { 1482 assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); 1483 SDValue NewLoad = DAG.getExtLoad( 1484 ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT, 1485 LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags()); 1486 SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, 1487 DAG.getValueType(MemVT)); 1488 1489 SDValue MergedValues[2] = { Res, Chain }; 1490 return DAG.getMergeValues(MergedValues, DL); 1491 } 1492 1493 if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { 1494 return SDValue(); 1495 } 1496 1497 // DWORDADDR ISD marks already shifted address 1498 if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { 1499 assert(VT == MVT::i32); 1500 Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32)); 1501 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr); 1502 return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand()); 1503 } 1504 return SDValue(); 1505 } 1506 1507 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 1508 SDValue Chain = Op.getOperand(0); 1509 SDValue Cond = Op.getOperand(1); 1510 SDValue Jump = Op.getOperand(2); 1511 1512 return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), 1513 Chain, Jump, Cond); 1514 } 1515 1516 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, 1517 SelectionDAG &DAG) const { 1518 MachineFunction &MF = DAG.getMachineFunction(); 1519 const R600FrameLowering *TFL = Subtarget->getFrameLowering(); 1520 1521 FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); 1522 1523 unsigned FrameIndex = FIN->getIndex(); 1524 Register IgnoredFrameReg; 1525 StackOffset Offset = 1526 TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); 1527 return DAG.getConstant(Offset.getFixed() * 4 * TFL->getStackWidth(MF), 1528 SDLoc(Op), Op.getValueType()); 1529 } 1530 1531 CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1532 bool IsVarArg) const { 1533 switch (CC) { 1534 case CallingConv::AMDGPU_KERNEL: 1535 case CallingConv::SPIR_KERNEL: 1536 case CallingConv::C: 1537 case CallingConv::Fast: 1538 case CallingConv::Cold: 1539 llvm_unreachable("kernels should not be handled here"); 1540 case CallingConv::AMDGPU_VS: 1541 case CallingConv::AMDGPU_GS: 1542 case CallingConv::AMDGPU_PS: 1543 case CallingConv::AMDGPU_CS: 1544 case CallingConv::AMDGPU_HS: 1545 case CallingConv::AMDGPU_ES: 1546 case CallingConv::AMDGPU_LS: 1547 return CC_R600; 1548 default: 1549 report_fatal_error("Unsupported calling convention."); 1550 } 1551 } 1552 1553 /// XXX Only kernel functions are supported, so we can assume for now that 1554 /// every function is a kernel function, but in the future we should use 1555 /// separate calling conventions for kernel and non-kernel functions. 
1556 SDValue R600TargetLowering::LowerFormalArguments( 1557 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 1558 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 1559 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 1560 SmallVector<CCValAssign, 16> ArgLocs; 1561 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 1562 *DAG.getContext()); 1563 MachineFunction &MF = DAG.getMachineFunction(); 1564 SmallVector<ISD::InputArg, 8> LocalIns; 1565 1566 if (AMDGPU::isShader(CallConv)) { 1567 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 1568 } else { 1569 analyzeFormalArgumentsCompute(CCInfo, Ins); 1570 } 1571 1572 for (unsigned i = 0, e = Ins.size(); i < e; ++i) { 1573 CCValAssign &VA = ArgLocs[i]; 1574 const ISD::InputArg &In = Ins[i]; 1575 EVT VT = In.VT; 1576 EVT MemVT = VA.getLocVT(); 1577 if (!VT.isVector() && MemVT.isVector()) { 1578 // Get load source type if scalarized. 1579 MemVT = MemVT.getVectorElementType(); 1580 } 1581 1582 if (AMDGPU::isShader(CallConv)) { 1583 Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass); 1584 SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); 1585 InVals.push_back(Register); 1586 continue; 1587 } 1588 1589 // i64 isn't a legal type, so the register type used ends up as i32, which 1590 // isn't expected here. It attempts to create this sextload, but it ends up 1591 // being invalid. Somehow this seems to work with i64 arguments, but breaks 1592 // for <1 x i64>. 1593 1594 // The first 36 bytes of the input buffer contains information about 1595 // thread group and global sizes. 1596 ISD::LoadExtType Ext = ISD::NON_EXTLOAD; 1597 if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { 1598 // FIXME: This should really check the extload type, but the handling of 1599 // extload vector parameters seems to be broken. 1600 1601 // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 1602 Ext = ISD::SEXTLOAD; 1603 } 1604 1605 // Compute the offset from the value. 1606 // XXX - I think PartOffset should give you this, but it seems to give the 1607 // size of the register which isn't useful. 1608 1609 unsigned PartOffset = VA.getLocMemOffset(); 1610 unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset); 1611 1612 MachinePointerInfo PtrInfo(AMDGPUAS::PARAM_I_ADDRESS); 1613 SDValue Arg = DAG.getLoad( 1614 ISD::UNINDEXED, Ext, VT, DL, Chain, 1615 DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), 1616 PtrInfo, 1617 MemVT, Alignment, MachineMemOperand::MONonTemporal | 1618 MachineMemOperand::MODereferenceable | 1619 MachineMemOperand::MOInvariant); 1620 1621 InVals.push_back(Arg); 1622 } 1623 return Chain; 1624 } 1625 1626 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1627 EVT VT) const { 1628 if (!VT.isVector()) 1629 return MVT::i32; 1630 return VT.changeVectorElementTypeToInteger(); 1631 } 1632 1633 bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, 1634 const SelectionDAG &DAG) const { 1635 // Local and Private addresses do not handle vectors. 
// Limit to i32.
1636 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS)) {
1637 return (MemVT.getSizeInBits() <= 32);
1638 }
1639 return true;
1640 }
1641
1642 bool R600TargetLowering::allowsMisalignedMemoryAccesses(
1643 EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1644 bool *IsFast) const {
1645 if (IsFast)
1646 *IsFast = false;
1647
1648 if (!VT.isSimple() || VT == MVT::Other)
1649 return false;
1650
1651 if (VT.bitsLT(MVT::i32))
1652 return false;
1653
1654 // TODO: This is a rough estimate.
1655 if (IsFast)
1656 *IsFast = true;
1657
1658 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1659 }
1660
1661 static SDValue CompactSwizzlableVector(
1662 SelectionDAG &DAG, SDValue VectorEntry,
1663 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1664 assert(RemapSwizzle.empty());
1665
1666 SDLoc DL(VectorEntry);
1667 EVT EltTy = VectorEntry.getValueType().getVectorElementType();
1668
1669 SDValue NewBldVec[4];
1670 for (unsigned i = 0; i < 4; i++)
1671 NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
1672 DAG.getIntPtrConstant(i, DL));
1673
1674 for (unsigned i = 0; i < 4; i++) {
1675 if (NewBldVec[i].isUndef())
1676 // We mask the write here to teach later passes that the ith element of
1677 // this vector is undef. Thus we can use it to reduce 128-bit register usage,
1678 // break false dependencies and additionally make the assembly easier to read.
1679 RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1680 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1681 if (C->isZero()) {
1682 RemapSwizzle[i] = 4; // SEL_0
1683 NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1684 } else if (C->isExactlyValue(1.0)) {
1685 RemapSwizzle[i] = 5; // SEL_1
1686 NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1687 }
1688 }
1689
1690 if (NewBldVec[i].isUndef())
1691 continue;
1692
1693 for (unsigned j = 0; j < i; j++) {
1694 if (NewBldVec[i] == NewBldVec[j]) {
1695 NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1696 RemapSwizzle[i] = j;
1697 break;
1698 }
1699 }
1700 }
1701
1702 return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1703 NewBldVec);
1704 }
1705
1706 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1707 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1708 assert(RemapSwizzle.empty());
1709
1710 SDLoc DL(VectorEntry);
1711 EVT EltTy = VectorEntry.getValueType().getVectorElementType();
1712
1713 SDValue NewBldVec[4];
1714 bool isUnmovable[4] = {false, false, false, false};
1715 for (unsigned i = 0; i < 4; i++)
1716 NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
1717 DAG.getIntPtrConstant(i, DL));
1718
1719 for (unsigned i = 0; i < 4; i++) {
1720 RemapSwizzle[i] = i;
1721 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1722 unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1723 ->getZExtValue();
1724 if (i == Idx)
1725 isUnmovable[Idx] = true;
1726 }
1727 }
1728
1729 for (unsigned i = 0; i < 4; i++) {
1730 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1731 unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1732 ->getZExtValue();
1733 if (isUnmovable[Idx])
1734 continue;
1735 // Swap i and Idx
1736 std::swap(NewBldVec[Idx], NewBldVec[i]);
1737 std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1738 break;
1739 }
1740 }
1741
1742 return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1743 NewBldVec);
1744 }
1745
1746 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
1747
SelectionDAG &DAG, 1748 const SDLoc &DL) const { 1749 // Old -> New swizzle values 1750 DenseMap<unsigned, unsigned> SwizzleRemap; 1751 1752 BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); 1753 for (unsigned i = 0; i < 4; i++) { 1754 unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); 1755 if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) 1756 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); 1757 } 1758 1759 SwizzleRemap.clear(); 1760 BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); 1761 for (unsigned i = 0; i < 4; i++) { 1762 unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); 1763 if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) 1764 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); 1765 } 1766 1767 return BuildVector; 1768 } 1769 1770 SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block, 1771 SelectionDAG &DAG) const { 1772 SDLoc DL(LoadNode); 1773 EVT VT = LoadNode->getValueType(0); 1774 SDValue Chain = LoadNode->getChain(); 1775 SDValue Ptr = LoadNode->getBasePtr(); 1776 assert (isa<ConstantSDNode>(Ptr)); 1777 1778 //TODO: Support smaller loads 1779 if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode)) 1780 return SDValue(); 1781 1782 if (LoadNode->getAlignment() < 4) 1783 return SDValue(); 1784 1785 int ConstantBlock = ConstantAddressBlock(Block); 1786 1787 SDValue Slots[4]; 1788 for (unsigned i = 0; i < 4; i++) { 1789 // We want Const position encoded with the following formula : 1790 // (((512 + (kc_bank << 12) + const_index) << 2) + chan) 1791 // const_index is Ptr computed by llvm using an alignment of 16. 1792 // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and 1793 // then div by 4 at the ISel step 1794 SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, 1795 DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); 1796 Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); 1797 } 1798 EVT NewVT = MVT::v4i32; 1799 unsigned NumElements = 4; 1800 if (VT.isVector()) { 1801 NewVT = VT; 1802 NumElements = VT.getVectorNumElements(); 1803 } 1804 SDValue Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); 1805 if (!VT.isVector()) { 1806 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, 1807 DAG.getConstant(0, DL, MVT::i32)); 1808 } 1809 SDValue MergedValues[2] = { 1810 Result, 1811 Chain 1812 }; 1813 return DAG.getMergeValues(MergedValues, DL); 1814 } 1815 1816 //===----------------------------------------------------------------------===// 1817 // Custom DAG Optimizations 1818 //===----------------------------------------------------------------------===// 1819 1820 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, 1821 DAGCombinerInfo &DCI) const { 1822 SelectionDAG &DAG = DCI.DAG; 1823 SDLoc DL(N); 1824 1825 switch (N->getOpcode()) { 1826 // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) 1827 case ISD::FP_ROUND: { 1828 SDValue Arg = N->getOperand(0); 1829 if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { 1830 return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0), 1831 Arg.getOperand(0)); 1832 } 1833 break; 1834 } 1835 1836 // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> 1837 // (i32 select_cc f32, f32, -1, 0 cc) 1838 // 1839 // Mesa's GLSL frontend generates the above pattern a lot and we can lower 1840 // this to one of the SET*_DX10 instructions. 
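// The SET*_DX10 instructions compare two floats but return an integer mask
// (0 or -1) rather than 0.0/1.0, so the integer select_cc form produced below
// can be matched by them directly.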
1841 case ISD::FP_TO_SINT: { 1842 SDValue FNeg = N->getOperand(0); 1843 if (FNeg.getOpcode() != ISD::FNEG) { 1844 return SDValue(); 1845 } 1846 SDValue SelectCC = FNeg.getOperand(0); 1847 if (SelectCC.getOpcode() != ISD::SELECT_CC || 1848 SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS 1849 SelectCC.getOperand(2).getValueType() != MVT::f32 || // True 1850 !isHWTrueValue(SelectCC.getOperand(2)) || 1851 !isHWFalseValue(SelectCC.getOperand(3))) { 1852 return SDValue(); 1853 } 1854 1855 return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0), 1856 SelectCC.getOperand(0), // LHS 1857 SelectCC.getOperand(1), // RHS 1858 DAG.getConstant(-1, DL, MVT::i32), // True 1859 DAG.getConstant(0, DL, MVT::i32), // False 1860 SelectCC.getOperand(4)); // CC 1861 } 1862 1863 // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx 1864 // => build_vector elt0, ... , NewEltIdx, ... , eltN 1865 case ISD::INSERT_VECTOR_ELT: { 1866 SDValue InVec = N->getOperand(0); 1867 SDValue InVal = N->getOperand(1); 1868 SDValue EltNo = N->getOperand(2); 1869 1870 // If the inserted element is an UNDEF, just use the input vector. 1871 if (InVal.isUndef()) 1872 return InVec; 1873 1874 EVT VT = InVec.getValueType(); 1875 1876 // If we can't generate a legal BUILD_VECTOR, exit 1877 if (!isOperationLegal(ISD::BUILD_VECTOR, VT)) 1878 return SDValue(); 1879 1880 // Check that we know which element is being inserted 1881 if (!isa<ConstantSDNode>(EltNo)) 1882 return SDValue(); 1883 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 1884 1885 // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially 1886 // be converted to a BUILD_VECTOR). Fill in the Ops vector with the 1887 // vector elements. 1888 SmallVector<SDValue, 8> Ops; 1889 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 1890 Ops.append(InVec.getNode()->op_begin(), 1891 InVec.getNode()->op_end()); 1892 } else if (InVec.isUndef()) { 1893 unsigned NElts = VT.getVectorNumElements(); 1894 Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); 1895 } else { 1896 return SDValue(); 1897 } 1898 1899 // Insert the element 1900 if (Elt < Ops.size()) { 1901 // All the operands of BUILD_VECTOR must have the same type; 1902 // we enforce that here. 1903 EVT OpVT = Ops[0].getValueType(); 1904 if (InVal.getValueType() != OpVT) 1905 InVal = OpVT.bitsGT(InVal.getValueType()) ? 
1906 DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
1907 DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
1908 Ops[Elt] = InVal;
1909 }
1910
1911 // Return the new vector
1912 return DAG.getBuildVector(VT, DL, Ops);
1913 }
1914
1915 // Extract_vec (Build_vector) generated by custom lowering
1916 // also needs a custom combine here.
1917 case ISD::EXTRACT_VECTOR_ELT: {
1918 SDValue Arg = N->getOperand(0);
1919 if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1920 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1921 unsigned Element = Const->getZExtValue();
1922 return Arg->getOperand(Element);
1923 }
1924 }
1925 if (Arg.getOpcode() == ISD::BITCAST &&
1926 Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
1927 (Arg.getOperand(0).getValueType().getVectorNumElements() ==
1928 Arg.getValueType().getVectorNumElements())) {
1929 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1930 unsigned Element = Const->getZExtValue();
1931 return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
1932 Arg->getOperand(0).getOperand(Element));
1933 }
1934 }
1935 break;
1936 }
1937
1938 case ISD::SELECT_CC: {
1939 // Try common optimizations
1940 if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
1941 return Ret;
1942
1943 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1944 // selectcc x, y, a, b, inv(cc)
1945 //
1946 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1947 // selectcc x, y, a, b, cc
1948 SDValue LHS = N->getOperand(0);
1949 if (LHS.getOpcode() != ISD::SELECT_CC) {
1950 return SDValue();
1951 }
1952
1953 SDValue RHS = N->getOperand(1);
1954 SDValue True = N->getOperand(2);
1955 SDValue False = N->getOperand(3);
1956 ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1957
1958 if (LHS.getOperand(2).getNode() != True.getNode() ||
1959 LHS.getOperand(3).getNode() != False.getNode() ||
1960 RHS.getNode() != False.getNode()) {
1961 return SDValue();
1962 }
1963
1964 switch (NCC) {
1965 default: return SDValue();
1966 case ISD::SETNE: return LHS;
1967 case ISD::SETEQ: {
1968 ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1969 LHSCC = ISD::getSetCCInverse(LHSCC, LHS.getOperand(0).getValueType());
1970 if (DCI.isBeforeLegalizeOps() ||
1971 isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1972 return DAG.getSelectCC(DL,
1973 LHS.getOperand(0),
1974 LHS.getOperand(1),
1975 LHS.getOperand(2),
1976 LHS.getOperand(3),
1977 LHSCC);
1978 break;
1979 }
1980 }
1981 return SDValue();
1982 }
1983
1984 case AMDGPUISD::R600_EXPORT: {
1985 SDValue Arg = N->getOperand(1);
1986 if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1987 break;
1988
1989 SDValue NewArgs[8] = {
1990 N->getOperand(0), // Chain
1991 SDValue(),
1992 N->getOperand(2), // ArrayBase
1993 N->getOperand(3), // Type
1994 N->getOperand(4), // SWZ_X
1995 N->getOperand(5), // SWZ_Y
1996 N->getOperand(6), // SWZ_Z
1997 N->getOperand(7) // SWZ_W
1998 };
1999 NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2000 return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
2001 }
2002 case AMDGPUISD::TEXTURE_FETCH: {
2003 SDValue Arg = N->getOperand(1);
2004 if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2005 break;
2006
2007 SDValue NewArgs[19] = {
2008 N->getOperand(0),
2009 N->getOperand(1),
2010 N->getOperand(2),
2011 N->getOperand(3),
2012 N->getOperand(4),
2013 N->getOperand(5),
2014 N->getOperand(6),
2015 N->getOperand(7),
2016 N->getOperand(8),
2017 N->getOperand(9),
2018 N->getOperand(10),
2019 N->getOperand(11), 2020 N->getOperand(12), 2021 N->getOperand(13), 2022 N->getOperand(14), 2023 N->getOperand(15), 2024 N->getOperand(16), 2025 N->getOperand(17), 2026 N->getOperand(18), 2027 }; 2028 NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); 2029 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); 2030 } 2031 2032 case ISD::LOAD: { 2033 LoadSDNode *LoadNode = cast<LoadSDNode>(N); 2034 SDValue Ptr = LoadNode->getBasePtr(); 2035 if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS && 2036 isa<ConstantSDNode>(Ptr)) 2037 return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG); 2038 break; 2039 } 2040 2041 default: break; 2042 } 2043 2044 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 2045 } 2046 2047 bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, 2048 SDValue &Src, SDValue &Neg, SDValue &Abs, 2049 SDValue &Sel, SDValue &Imm, 2050 SelectionDAG &DAG) const { 2051 const R600InstrInfo *TII = Subtarget->getInstrInfo(); 2052 if (!Src.isMachineOpcode()) 2053 return false; 2054 2055 switch (Src.getMachineOpcode()) { 2056 case R600::FNEG_R600: 2057 if (!Neg.getNode()) 2058 return false; 2059 Src = Src.getOperand(0); 2060 Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); 2061 return true; 2062 case R600::FABS_R600: 2063 if (!Abs.getNode()) 2064 return false; 2065 Src = Src.getOperand(0); 2066 Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); 2067 return true; 2068 case R600::CONST_COPY: { 2069 unsigned Opcode = ParentNode->getMachineOpcode(); 2070 bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; 2071 2072 if (!Sel.getNode()) 2073 return false; 2074 2075 SDValue CstOffset = Src.getOperand(0); 2076 if (ParentNode->getValueType(0).isVector()) 2077 return false; 2078 2079 // Gather constants values 2080 int SrcIndices[] = { 2081 TII->getOperandIdx(Opcode, R600::OpName::src0), 2082 TII->getOperandIdx(Opcode, R600::OpName::src1), 2083 TII->getOperandIdx(Opcode, R600::OpName::src2), 2084 TII->getOperandIdx(Opcode, R600::OpName::src0_X), 2085 TII->getOperandIdx(Opcode, R600::OpName::src0_Y), 2086 TII->getOperandIdx(Opcode, R600::OpName::src0_Z), 2087 TII->getOperandIdx(Opcode, R600::OpName::src0_W), 2088 TII->getOperandIdx(Opcode, R600::OpName::src1_X), 2089 TII->getOperandIdx(Opcode, R600::OpName::src1_Y), 2090 TII->getOperandIdx(Opcode, R600::OpName::src1_Z), 2091 TII->getOperandIdx(Opcode, R600::OpName::src1_W) 2092 }; 2093 std::vector<unsigned> Consts; 2094 for (int OtherSrcIdx : SrcIndices) { 2095 int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); 2096 if (OtherSrcIdx < 0 || OtherSelIdx < 0) 2097 continue; 2098 if (HasDst) { 2099 OtherSrcIdx--; 2100 OtherSelIdx--; 2101 } 2102 if (RegisterSDNode *Reg = 2103 dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) { 2104 if (Reg->getReg() == R600::ALU_CONST) { 2105 ConstantSDNode *Cst 2106 = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx)); 2107 Consts.push_back(Cst->getZExtValue()); 2108 } 2109 } 2110 } 2111 2112 ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset); 2113 Consts.push_back(Cst->getZExtValue()); 2114 if (!TII->fitsConstReadLimitations(Consts)) { 2115 return false; 2116 } 2117 2118 Sel = CstOffset; 2119 Src = DAG.getRegister(R600::ALU_CONST, MVT::f32); 2120 return true; 2121 } 2122 case R600::MOV_IMM_GLOBAL_ADDR: 2123 // Check if the Imm slot is used. Taken from below. 
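// A non-zero value in the literal slot means another immediate is already
// folded into this instruction, so the global address cannot use it too.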
2124 if (cast<ConstantSDNode>(Imm)->getZExtValue()) 2125 return false; 2126 Imm = Src.getOperand(0); 2127 Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32); 2128 return true; 2129 case R600::MOV_IMM_I32: 2130 case R600::MOV_IMM_F32: { 2131 unsigned ImmReg = R600::ALU_LITERAL_X; 2132 uint64_t ImmValue = 0; 2133 2134 if (Src.getMachineOpcode() == R600::MOV_IMM_F32) { 2135 ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0)); 2136 float FloatValue = FPC->getValueAPF().convertToFloat(); 2137 if (FloatValue == 0.0) { 2138 ImmReg = R600::ZERO; 2139 } else if (FloatValue == 0.5) { 2140 ImmReg = R600::HALF; 2141 } else if (FloatValue == 1.0) { 2142 ImmReg = R600::ONE; 2143 } else { 2144 ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); 2145 } 2146 } else { 2147 ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0)); 2148 uint64_t Value = C->getZExtValue(); 2149 if (Value == 0) { 2150 ImmReg = R600::ZERO; 2151 } else if (Value == 1) { 2152 ImmReg = R600::ONE_INT; 2153 } else { 2154 ImmValue = Value; 2155 } 2156 } 2157 2158 // Check that we aren't already using an immediate. 2159 // XXX: It's possible for an instruction to have more than one 2160 // immediate operand, but this is not supported yet. 2161 if (ImmReg == R600::ALU_LITERAL_X) { 2162 if (!Imm.getNode()) 2163 return false; 2164 ConstantSDNode *C = cast<ConstantSDNode>(Imm); 2165 if (C->getZExtValue()) 2166 return false; 2167 Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); 2168 } 2169 Src = DAG.getRegister(ImmReg, MVT::i32); 2170 return true; 2171 } 2172 default: 2173 return false; 2174 } 2175 } 2176 2177 /// Fold the instructions after selecting them 2178 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, 2179 SelectionDAG &DAG) const { 2180 const R600InstrInfo *TII = Subtarget->getInstrInfo(); 2181 if (!Node->isMachineOpcode()) 2182 return Node; 2183 2184 unsigned Opcode = Node->getMachineOpcode(); 2185 SDValue FakeOp; 2186 2187 std::vector<SDValue> Ops(Node->op_begin(), Node->op_end()); 2188 2189 if (Opcode == R600::DOT_4) { 2190 int OperandIdx[] = { 2191 TII->getOperandIdx(Opcode, R600::OpName::src0_X), 2192 TII->getOperandIdx(Opcode, R600::OpName::src0_Y), 2193 TII->getOperandIdx(Opcode, R600::OpName::src0_Z), 2194 TII->getOperandIdx(Opcode, R600::OpName::src0_W), 2195 TII->getOperandIdx(Opcode, R600::OpName::src1_X), 2196 TII->getOperandIdx(Opcode, R600::OpName::src1_Y), 2197 TII->getOperandIdx(Opcode, R600::OpName::src1_Z), 2198 TII->getOperandIdx(Opcode, R600::OpName::src1_W) 2199 }; 2200 int NegIdx[] = { 2201 TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X), 2202 TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y), 2203 TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z), 2204 TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W), 2205 TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X), 2206 TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y), 2207 TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z), 2208 TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W) 2209 }; 2210 int AbsIdx[] = { 2211 TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X), 2212 TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y), 2213 TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z), 2214 TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W), 2215 TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X), 2216 TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y), 2217 TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z), 2218 TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W) 2219 }; 
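// Try to fold a modifier or constant into each of the eight DOT_4 source
// channels; rebuild the machine node as soon as one operand is folded.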
2220 for (unsigned i = 0; i < 8; i++) { 2221 if (OperandIdx[i] < 0) 2222 return Node; 2223 SDValue &Src = Ops[OperandIdx[i] - 1]; 2224 SDValue &Neg = Ops[NegIdx[i] - 1]; 2225 SDValue &Abs = Ops[AbsIdx[i] - 1]; 2226 bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; 2227 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); 2228 if (HasDst) 2229 SelIdx--; 2230 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; 2231 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) 2232 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 2233 } 2234 } else if (Opcode == R600::REG_SEQUENCE) { 2235 for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { 2236 SDValue &Src = Ops[i]; 2237 if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) 2238 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 2239 } 2240 } else { 2241 if (!TII->hasInstrModifiers(Opcode)) 2242 return Node; 2243 int OperandIdx[] = { 2244 TII->getOperandIdx(Opcode, R600::OpName::src0), 2245 TII->getOperandIdx(Opcode, R600::OpName::src1), 2246 TII->getOperandIdx(Opcode, R600::OpName::src2) 2247 }; 2248 int NegIdx[] = { 2249 TII->getOperandIdx(Opcode, R600::OpName::src0_neg), 2250 TII->getOperandIdx(Opcode, R600::OpName::src1_neg), 2251 TII->getOperandIdx(Opcode, R600::OpName::src2_neg) 2252 }; 2253 int AbsIdx[] = { 2254 TII->getOperandIdx(Opcode, R600::OpName::src0_abs), 2255 TII->getOperandIdx(Opcode, R600::OpName::src1_abs), 2256 -1 2257 }; 2258 for (unsigned i = 0; i < 3; i++) { 2259 if (OperandIdx[i] < 0) 2260 return Node; 2261 SDValue &Src = Ops[OperandIdx[i] - 1]; 2262 SDValue &Neg = Ops[NegIdx[i] - 1]; 2263 SDValue FakeAbs; 2264 SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; 2265 bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; 2266 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); 2267 int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal); 2268 if (HasDst) { 2269 SelIdx--; 2270 ImmIdx--; 2271 } 2272 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; 2273 SDValue &Imm = Ops[ImmIdx]; 2274 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) 2275 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 2276 } 2277 } 2278 2279 return Node; 2280 } 2281