//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
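    // For example, a <3 x half> vector has 16-bit elements and is counted as
    // (3 + 1) / 2 = 2 registers by the packing rule below.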
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or a pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?
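
  // For example, `struct S { int *p; };` is flattened to its single `int *`
  // element here, and under HIP that pointer is then coerced from the generic
  // address space to the global one.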
  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing
  // a new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore
  // we have to set it to false here. Other args of getDirect() are just
  // defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using
    // getExpand(), though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or a pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }
  }

  // Otherwise just do the default thing.
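  // Indirect arguments do not consume the VGPR budget, so only the direct
  // fallback lowering is charged against NumRegsLeft below.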
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return D->hasAttr<OpenCLKernelAttr>() ||
         (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
         (isa<VarDecl>(D) &&
          (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    unsigned Min = 0;
    unsigned Max = 0;
    if (FlatWGS) {
      Min = FlatWGS->getMin()
                ->EvaluateKnownConstInt(M.getContext())
                .getExtValue();
      Max = FlatWGS->getMax()
                ->EvaluateKnownConstInt(M.getContext())
                .getExtValue();
    }
    if (ReqdWGS && Min == 0 && Max == 0)
      Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

    if (Min != 0) {
      assert(Min <= Max && "Min must be less than or equal to Max");

      std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
    } else
      assert(Max == 0 && "Max must be zero");
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
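    // The resulting attribute value has the form "1,<max>", e.g. "1,256" for
    // an OpenCL kernel with no explicit bound.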
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>()) {
    unsigned Min =
        Attr->getMin()->EvaluateKnownConstInt(M.getContext()).getExtValue();
    unsigned Max = Attr->getMax() ? Attr->getMax()
                                        ->EvaluateKnownConstInt(M.getContext())
                                        .getExtValue()
                                  : 0;

    if (Min != 0) {
      assert((Max == 0 || Min <= Max) &&
             "Min must be less than or equal to Max");

      std::string AttrVal = llvm::utostr(Min);
      if (Max != 0)
        AttrVal = AttrVal + "," + llvm::utostr(Max);
      F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
    } else
      assert(Max == 0 && "Max must be zero");
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  const bool IsHIPKernel =
      M.getLangOpts().HIP && FD && FD->hasAttr<CUDAGlobalAttr>();

  // TODO: This should be moved to language-specific attributes instead.
  if (IsHIPKernel)
    F->addFnAttr("uniform-work-group-size", "true");

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in generic address space is emitted which is cast to a
// pointer in local or private address space.
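// For example, the AMDGPU private address space (addrspace(5)) uses a nonzero
// null value, so its null pointer is emitted as an addrspacecast of the
// generic null pointer to addrspace(5).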
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  assert(AddrSpace == LangAS::Default || isTargetAddressSpace(AddrSpace));
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (CGM.isTypeConstant(D->getType(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
    Name = "agent";
    break;
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates a struct of the same type on the stack, stores the block
/// literal to it, and passes its pointer to the block invoke function. The
/// kernel has the "enqueued-block" function attribute and kernel argument
/// metadata.
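///
/// Roughly, for a block invoke function `void @foo_block_invoke(ptr, ptr
/// addrspace(3))` (an illustrative name), the generated wrapper looks like:
///
///   define internal amdgpu_kernel void @foo_block_invoke_kernel(
///       <block literal struct> %literal, ptr addrspace(3) %p) {
///   entry:
///     %block = alloca <block literal struct>, addrspace(5)
///     store <block literal struct> %literal, ptr addrspace(5) %block
///     %cast = addrspacecast ptr addrspace(5) %block to ptr
///     call void @foo_block_invoke(ptr %cast, ptr addrspace(3) %p)
///     ret void
///   }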
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}