//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;

  llvm::FixedVectorType *
  getOptimalVectorMemoryType(llvm::FixedVectorType *T,
                             const LangOptions &Opt) const override {
    // We have legal instructions for 96-bit so 3x32 can be supported.
    // FIXME: This check should be a subtarget feature as technically SI doesn't
    // support it.
    if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96)
      return T;
    return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
  }
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
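    // Two 16-bit elements share one 32-bit register; an odd element count
    // rounds up (e.g. <3 x half> counts as 2 registers).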
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels, all parameters are passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?
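  // Note: a struct wrapping a single scalar is passed as that scalar, which
  // also lets a wrapped pointer pick up the HIP global-pointer coercion below.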

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type when used for indirect
  // arguments.
  if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
                                     RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
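      // For now, 33-64 bit aggregates are passed as a pair of i32s, matching
      // the 32-bit granularity of the register accounting above.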
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  bool supportsLibCall() const override { return false; }
  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getDeviceKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<DeviceKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<DeviceKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0, which results in
// incorrectly transformed IR. Therefore, instead of emitting null pointers in
// the private and local address spaces, a null pointer in the generic address
// space is emitted and cast to a pointer in the local or private address
// space.
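// For example, on amdgcn the private and local address spaces use a nonzero
// (all-ones) null value, so such nulls are emitted as an addrspacecast of a
// generic null pointer.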
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  // OpenCL assumes by default that atomic scopes are per-address space for
  // non-sequentially consistent operations.
  if (Scope >= SyncScope::OpenCLWorkGroup &&
      Scope <= SyncScope::OpenCLSubGroup &&
      Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
    CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
    const AtomicExpr *AE) const {
  auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
  auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);

  // OpenCL and old-style HIP consider atomics targeting thread-private memory
  // to be undefined.
  //
  // TODO: This is probably undefined for atomic load/store, but there's not
  // much direct codegen benefit to knowing this.
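  // When that guarantee holds, tag flat (generic) atomics with
  // !noalias.addrspace metadata covering [PRIVATE_ADDRESS, PRIVATE_ADDRESS + 1)
  // so the backend knows the pointer never refers to private memory.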
  if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
       (CmpX &&
        CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
      AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
    llvm::MDNode *ASRange = MDHelper.createRange(
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
    AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
  }

  if (!RMW)
    return;

  AtomicOptions AO = CGF.CGM.getAtomicOpts();
  llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
  if (!AO.getOption(clang::AtomicOptionKind::FineGrainedMemory))
    RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);
  if (!AO.getOption(clang::AtomicOptionKind::RemoteMemory))
    RMW->setMetadata("amdgpu.no.remote.memory", Empty);
  if (AO.getOption(clang::AtomicOptionKind::IgnoreDenormalMode) &&
      RMW->getOperation() == llvm::AtomicRMWInst::FAdd &&
      RMW->getType()->isFloatTy())
    RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_DeviceKernel));
}

/// Return IR struct type for rtinfo struct in rocm-device-libs used for device
/// enqueue.
///
/// ptr addrspace(1) kernel_object, i32 private_segment_size,
/// i32 group_segment_size
static llvm::StructType *
getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
                           llvm::Type *KernelDescriptorPtrTy) {
  llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
  return llvm::StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
                                  "block.runtime.handle.t");
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates a struct of the same type on the stack, stores the block literal
/// to it, and passes its pointer to the block invoke function. The kernel
/// has the "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }

  llvm::Module &Mod = CGF.CGM.getModule();
  const llvm::DataLayout &DL = Mod.getDataLayout();

  llvm::Twine Name = Invoke->getName() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);

  // The kernel itself can be internal; the runtime does not directly access
  // the kernel address, only the kernel descriptor.
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &Mod);
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
      C, llvm::PointerType::get(C, DL.getDefaultGlobalsAddressSpace()));
  llvm::Constant *RuntimeHandleInitializer =
      llvm::ConstantAggregateZero::get(HandleTy);

  llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";

  // The runtime needs access to the runtime handle as an external symbol. The
  // runtime handle will need to be made external later, in
  // AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
  // inside the runtime handle, and is not directly referenced.

  // TODO: We would initialize the first field by declaring F->getName() + ".kd"
  // to reference the kernel descriptor. The runtime wouldn't need to bother
  // setting it. We would need to have a final symbol name though.
  // TODO: Can we directly use an external symbol with getGlobalIdentifier?
  auto *RuntimeHandle = new llvm::GlobalVariable(
      Mod, HandleTy,
      /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
      /*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
      DL.getDefaultGlobalsAddressSpace(),
      /*isExternallyInitialized=*/true);

  llvm::MDNode *HandleAsMD =
      llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle));
  F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD);

  RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");

  CGF.CGM.addUsedGlobal(F);
  CGF.CGM.addUsedGlobal(RuntimeHandle);
  return RuntimeHandle;
}

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  auto Eval = [&](Expr *E) {
    return E->EvaluateKnownConstInt(getContext()).getExtValue();
  };
  if (FlatWGS) {
    Min = Eval(FlatWGS->getMin());
    Max = Eval(FlatWGS->getMax());
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
                Eval(ReqdWGS->getZDim());

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal to Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal to Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}