//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
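    // Illustrative only (an editorial sketch of the intended counting, not
    // verified against every backend case): a <4 x i16> vector packs into
    // (4 + 1) / 2 == 2 registers, and a <3 x half> vector also rounds up to 2.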
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?
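
  // Illustrative sketch (editorial addition; the address-space numbers are the
  // conventional AMDGPU mapping, generic == 0 and global == 1, and are an
  // assumption here): for a HIP kernel such as
  //
  //   __global__ void k(int *p);
  //
  // the generic 'ptr' parameter type is coerced below to 'ptr addrspace(1)',
  // so the kernel argument is treated as a global-memory pointer.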

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing
  // a new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore
  // we have to set it to false here. Other args of getDirect() are just
  // defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // the function ABI.
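    // For example (a sketch of the expected lowering, not an exhaustive
    // description): an aggregate that no longer fits in the remaining register
    // budget is passed by reference as a pointer in the private address space
    // rather than being flattened into registers.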
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
}

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV &&
      !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in generic address space is emitted which is cast to a
// pointer in local or private address space.
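//
// For example (illustrative IR; the concrete address-space numbers, such as
// private == addrspace(5), are an assumption of this sketch): a null pointer
// to the private address space is emitted as
//   addrspacecast (ptr null to ptr addrspace(5))
// instead of 'ptr addrspace(5) null'.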
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same struct type on the stack, stores the block literal to
/// it, and passes its pointer to the block invoke function. The kernel has
/// the "enqueued-block" function attribute and kernel argument metadata.
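///
/// Roughly, the generated wrapper has this shape (an editorial sketch, not
/// the exact IR emitted below):
///
///   amdgpu_kernel void @<invoke-name>_kernel(<block literal struct> %b,
///                                            ptr addrspace(3) %lp0, ...) {
///     %block = alloca <block literal struct>
///     store <block literal struct> %b, ptr %block
///     call void @<invoke-name>(ptr %block, ptr addrspace(3) %lp0, ...)
///     ret void
///   }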
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}