//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

namespace {

struct OutgoingArgHandler : public CallLowering::ValueHandler {
  OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                     MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg);
    MIRBuilder.buildCopy(PhysReg, ValVReg);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
  }
};

} // end anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
    : CallLowering(&TLI) {
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = MIRBuilder.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  if (!Val) {
    MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
    return true;
  }

  Register VReg = VRegs[0];

  const Function &F = MF.getFunction();
  auto &DL = F.getParent()->getDataLayout();
  if (!AMDGPU::isShader(F.getCallingConv()))
    return false;

  const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
  SmallVector<EVT, 4> SplitVTs;
  SmallVector<uint64_t, 4> Offsets;
  ArgInfo OrigArg{VReg, Val->getType()};
  setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);

  SmallVector<ArgInfo, 8> SplitArgs;
  CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false);
  for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
    Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext());
    SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed});
  }
  auto RetInstr =
      MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG);
  OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn);
  if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
    return false;
  MIRBuilder.insertInstr(RetInstr);

  return true;
}

/// Build a pointer into the constant address space kernarg segment for the
/// kernel argument located \p Offset bytes from the segment base.
Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
                                               Type *ParamTy,
                                               uint64_t Offset) const {

  MachineFunction &MF = MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register DstReg = MRI.createGenericVirtualRegister(PtrType);
  Register KernArgSegmentPtr =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
  MIRBuilder.buildConstant(OffsetReg, Offset);

  MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);

  return DstReg;
}

/// Load a kernel argument of type \p ParamTy from \p Offset bytes into the
/// kernarg segment into \p DstReg.
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
                                        Type *ParamTy, uint64_t Offset,
                                        unsigned Align,
                                        Register DstReg) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
                                       MachineMemOperand::MONonTemporal |
                                       MachineMemOperand::MOInvariant,
                              TypeSize, Align);

  MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
}

/// Return the first SGPR_32 that has not yet been allocated by \p CCInfo.
static Register findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

// Allocate the workitem ID inputs passed in the first VGPRs of an entry
// function.
static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
                                           MachineFunction &MF,
                                           const SIRegisterInfo &TRI,
                                           SIMachineFunctionInfo &Info) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkItemIDX()) {
    Register Reg = AMDGPU::VGPR0;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDY()) {
    Register Reg = AMDGPU::VGPR1;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDZ()) {
    Register Reg = AMDGPU::VGPR2;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
  }
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &MIRBuilder,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
    MIRBuilder.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we
  // read these from the dispatch pointer.
}

static void allocateSystemSGPRs(CCState &CCInfo,
                                MachineFunction &MF,
                                SIMachineFunctionInfo &Info,
                                CallingConv::ID CallConv,
                                bool IsShader) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkGroupIDX()) {
    Register Reg = Info.addWorkGroupIDX();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDY()) {
    Register Reg = Info.addWorkGroupIDY();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDZ()) {
    unsigned Reg = Info.addWorkGroupIDZ();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupInfo()) {
    unsigned Reg = Info.addWorkGroupInfo();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &MIRBuilder, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info);

  unsigned i = 0;
  const unsigned KernArgBaseAlign = 16;
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    ArrayRef<Register> OrigArgRegs = VRegs[i];
    Register ArgReg =
        OrigArgRegs.size() == 1
            ? OrigArgRegs[0]
            : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
    unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
    ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
    lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder);
    ++i;
  }

  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &MIRBuilder, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs);

  // AMDGPU_GS and AMDGPU_HS are not supported yet.
  if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
      F.getCallingConv() == CallingConv::AMDGPU_HS)
    return false;

  MachineFunction &MF = MIRBuilder.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  bool IsShader = AMDGPU::isShader(F.getCallingConv());

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  if (Info->hasImplicitBufferPtr()) {
    unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  unsigned NumArgs = F.arg_size();
  Function::const_arg_iterator CurOrigArg = F.arg_begin();
  const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
  unsigned PSInputNum = 0;
  BitVector Skipped(NumArgs);
  for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
    EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType());

    // We can only handle simple value types at the moment.
    ISD::ArgFlagsTy Flags;
    assert(VRegs[i].size() == 1 && "Can't lower into more than one register");
    ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()};
    setArgFlags(OrigArg, i + 1, DL, F);
    Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));

    if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
        !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() &&
        PSInputNum <= 15) {
      if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) {
        Skipped.set(i);
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (!CurOrigArg->use_empty())
        Info->markPSInputEnabled(PSInputNum);

      ++PSInputNum;
    }

    CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
                                             /*IsVarArg=*/false);

    if (ValEVT.isVector()) {
      EVT ElemVT = ValEVT.getVectorElementType();
      if (!ValEVT.isSimple())
        return false;
      MVT ValVT = ElemVT.getSimpleVT();
      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full,
                          OrigArg.Flags, CCInfo);

      // CCAssignFn returns true when it fails to assign a location; fail if
      // we don't know how to handle this type.
      if (Res)
        return false;
    } else {
      if (!ValEVT.isSimple())
        return false;
      MVT ValVT = ValEVT.getSimpleVT();
      bool Res =
          AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);

      // Fail if we don't know how to handle this type.
      if (Res)
        return false;
    }
  }

  Function::const_arg_iterator Arg = F.arg_begin();

  if (F.getCallingConv() == CallingConv::AMDGPU_VS ||
      F.getCallingConv() == CallingConv::AMDGPU_PS) {
    for (unsigned i = 0, OrigArgIdx = 0;
         OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
      if (Skipped.test(OrigArgIdx))
        continue;
      assert(VRegs[OrigArgIdx].size() == 1 &&
             "Can't lower into more than 1 reg");
      CCValAssign &VA = ArgLocs[i++];
      MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]);
      MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
      MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg());
    }

    allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader);
    return true;
  }

  return false;
}