//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
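    // Two 16-bit elements share one 32-bit register, so e.g. a <3 x half>
    // vector counts as two registers.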
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?
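  // Single-element structs are coerced to their lone field, so a kernel
  // taking a struct that wraps one value gets the same ABI as one taking
  // that value directly.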
  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing
  // a new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore
  // we have to set it to false here. Other args of getDirect() are just
  // defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using
    // getExpand(), though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // the function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified.
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV &&
      !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in generic address space is emitted which is cast to a
// pointer in local or private address space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on the stack, stores the block literal
/// to it, and passes its pointer to the block invoke function. The kernel
/// has the "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}