//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "CGBuiltin.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm;

namespace {

// Emit an intrinsic that is mangled on the types of both of its arguments.
static Value *
emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
                                       Intrinsic::ID IntrinsicID,
                                       Intrinsic::ID ConstrainedIntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
  if (CGF.Builder.getIsFPConstrained()) {
    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
                                       {Src0->getType(), Src1->getType()});
    return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
  }

  Function *F =
      CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

// If \p E is not a null pointer, insert an address space cast to match the
// return type of \p E if necessary.
Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
                             const CallExpr *E = nullptr) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
  if (!E)
    return Call;
  QualType BuiltinRetType = E->getType();
  auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
  if (RetTy == Call->getType())
    return Call;
  return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
}

Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
  return Call;
}
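// Illustrative IR for the two helpers above (a sketch; value names are
// placeholders, not what the builder actually assigns):
//   %dp = call align 4 dereferenceable(64) ptr addrspace(4)
//             @llvm.amdgcn.dispatch.ptr()
//   %ia = call align 8 dereferenceable(256) ptr addrspace(4)
//             @llvm.amdgcn.implicitarg.ptr()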
// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
/// Emit code based on Code Object ABI version.
/// COV_4    : Emit code to use dispatch ptr
/// COV_5+   : Emit code to use implicitarg ptr
/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
///            and use its value for COV_4 or COV_5+ approach. It is used for
///            compiling device libraries in an ABI-agnostic way.
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
  llvm::LoadInst *LD;

  auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;

  if (Cov == CodeObjectVersionKind::COV_None) {
    StringRef Name = "__oclc_ABI_version";
    auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
    if (!ABIVersionC)
      ABIVersionC = new llvm::GlobalVariable(
          CGF.CGM.getModule(), CGF.Int32Ty, false,
          llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));

    // This load will be eliminated by the IPSCCP because it is constant
    // weak_odr without externally_initialized. Either changing it to weak or
    // adding externally_initialized will keep the load.
    Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
                                                      CGF.CGM.getIntAlign());

    Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
        ABIVersion,
        llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));

    // Indexing the implicit kernarg segment.
    Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
        CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);

    // Indexing the HSA kernel_dispatch_packet struct.
    Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
        CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);

    auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
    LD = CGF.Builder.CreateLoad(
        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
  } else {
    Value *GEP = nullptr;
    if (Cov >= CodeObjectVersionKind::COV_5) {
      // Indexing the implicit kernarg segment.
      GEP = CGF.Builder.CreateConstGEP1_32(
          CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
    } else {
      // Indexing the HSA kernel_dispatch_packet struct.
      GEP = CGF.Builder.CreateConstGEP1_32(
          CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
    }
    LD = CGF.Builder.CreateLoad(
        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
  }

  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
  llvm::MDNode *RNode = MDHelper.createRange(
      APInt(16, 1), APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
  LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
  LD->setMetadata(llvm::LLVMContext::MD_noundef,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  return LD;
}

// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
  const unsigned XOffset = 12;
  auto *DP = EmitAMDGPUDispatchPtr(CGF);
  // Indexing the HSA kernel_dispatch_packet struct.
  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
  auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
  auto *LD = CGF.Builder.CreateLoad(
      Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));

  llvm::MDBuilder MDB(CGF.getLLVMContext());

  // Known non-zero.
  LD->setMetadata(llvm::LLVMContext::MD_range,
                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  return LD;
}
} // namespace
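// Illustrative lowering for EmitAMDGPUWorkGroupSize above (a sketch; value
// names are placeholders): __builtin_amdgcn_workgroup_size_x on code object
// v5 loads a 16-bit size from the implicit kernarg segment, roughly
//   %ia  = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
//   %gep = getelementptr i8, ptr addrspace(4) %ia, i32 12
//   %wgs = load i16, ptr addrspace(4) %gep, align 2, !range !0,
//          !invariant.load !1, !noundef !1
// where !0 is the range [1, getMaxOpenCLWorkGroupSize() + 1).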
// Generates the IR for __builtin_read_exec_*.
// Lowers the builtin to the amdgcn_ballot intrinsic.
static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
                                      llvm::Type *RegisterType,
                                      llvm::Type *ValueType, bool isExecHi) {
  CodeGen::CGBuilderTy &Builder = CGF.Builder;
  CodeGen::CodeGenModule &CGM = CGF.CGM;

  Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
  llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});

  if (isExecHi) {
    Value *Rt2 = Builder.CreateLShr(Call, 32);
    Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
    return Rt2;
  }

  return Call;
}
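// Illustrative lowering (a sketch; value names are placeholders):
// __builtin_amdgcn_read_exec_hi becomes roughly
//   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
//   %shift  = lshr i64 %ballot, 32
//   %hi     = trunc i64 %shift to i32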
// Emit an intrinsic that has 1 float or double operand, and 1 integer.
static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
                               const CallExpr *E,
                               unsigned IntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

// For processing memory ordering and memory scope arguments of various
// amdgcn builtins.
// \p Order takes a C++11 compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using the atomic C ABI, and writes
// it to \p AO. \p Scope takes a const char * and converts it into an
// AMDGCN-specific SyncScopeID and writes it to \p SSID.
void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
                                              llvm::AtomicOrdering &AO,
                                              llvm::SyncScope::ID &SSID) {
  int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();

  // Map C11/C++11 memory ordering to LLVM memory ordering
  assert(llvm::isValidAtomicOrderingCABI(ord));
  switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
  case llvm::AtomicOrderingCABI::acquire:
  case llvm::AtomicOrderingCABI::consume:
    AO = llvm::AtomicOrdering::Acquire;
    break;
  case llvm::AtomicOrderingCABI::release:
    AO = llvm::AtomicOrdering::Release;
    break;
  case llvm::AtomicOrderingCABI::acq_rel:
    AO = llvm::AtomicOrdering::AcquireRelease;
    break;
  case llvm::AtomicOrderingCABI::seq_cst:
    AO = llvm::AtomicOrdering::SequentiallyConsistent;
    break;
  case llvm::AtomicOrderingCABI::relaxed:
    AO = llvm::AtomicOrdering::Monotonic;
    break;
  }

  // Some of the atomic builtins take the scope as a string name.
  StringRef scp;
  if (llvm::getConstantStringInfo(Scope, scp)) {
    SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
    return;
  }

  // Older builtins had an enum argument for the memory scope.
  int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
  switch (scope) {
  case 0: // __MEMORY_SCOPE_SYSTEM
    SSID = llvm::SyncScope::System;
    break;
  case 1: // __MEMORY_SCOPE_DEVICE
    SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
    break;
  case 2: // __MEMORY_SCOPE_WRKGRP
    SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
    break;
  case 3: // __MEMORY_SCOPE_WVFRNT
    SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
    break;
  case 4: // __MEMORY_SCOPE_SINGLE
    SSID = llvm::SyncScope::SingleThread;
    break;
  default:
    SSID = llvm::SyncScope::System;
    break;
  }
}

llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
                                                          unsigned Idx,
                                                          const CallExpr *E) {
  llvm::Value *Arg = nullptr;
  if ((ICEArguments & (1 << Idx)) == 0) {
    Arg = EmitScalarExpr(E->getArg(Idx));
  } else {
    // If this is required to be a constant, constant fold it so that we
    // know that the generated intrinsic gets a ConstantInt.
    std::optional<llvm::APSInt> Result =
        E->getArg(Idx)->getIntegerConstantExpr(getContext());
    assert(Result && "Expected argument to be a constant");
    Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
  }
  return Arg;
}

void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
                                                     const CallExpr *E) {
  constexpr const char *Tag = "amdgpu-as";

  LLVMContext &Ctx = Inst->getContext();
  SmallVector<MMRAMetadata::TagT, 3> MMRAs;
  for (unsigned K = 2; K < E->getNumArgs(); ++K) {
    llvm::Value *V = EmitScalarExpr(E->getArg(K));
    StringRef AS;
    if (llvm::getConstantStringInfo(V, AS)) {
      MMRAs.push_back({Tag, AS});
      // TODO: Delete the resulting unused constant?
      continue;
    }
    CGM.Error(E->getExprLoc(),
              "expected an address space name as a string literal");
  }

  llvm::sort(MMRAs);
  MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
  Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}
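// Illustrative use of the two helpers above (a sketch):
//   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent", "local")
// becomes roughly
//   fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-as", !"local"}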
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
  llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
  llvm::SyncScope::ID SSID;
  switch (BuiltinID) {
  case AMDGPU::BI__builtin_amdgcn_div_scale:
  case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
    // argument.

    Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));

    llvm::Value *X = EmitScalarExpr(E->getArg(0));
    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
    llvm::Value *Z = EmitScalarExpr(E->getArg(2));

    llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
                                              X->getType());

    llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});

    llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
    llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);

    llvm::Type *RealFlagType = FlagOutPtr.getElementType();

    llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
    Builder.CreateStore(FlagExt, FlagOutPtr);
    return Result;
  }
  case AMDGPU::BI__builtin_amdgcn_div_fmas:
  case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));

    llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
                                         Src0->getType());
    llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
    return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
  }

  case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_ds_swizzle);
  case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
  case AMDGPU::BI__builtin_amdgcn_mov_dpp:
  case AMDGPU::BI__builtin_amdgcn_update_dpp: {
    llvm::SmallVector<llvm::Value *, 6> Args;
    // Find out if any arguments are required to be integer constant
    // expressions.
    unsigned ICEArguments = 0;
    ASTContext::GetBuiltinTypeError Error;
    getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
    assert(Error == ASTContext::GE_None && "Should not codegen an error");
    llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
    unsigned Size = DataTy->getPrimitiveSizeInBits();
    llvm::Type *IntTy =
        llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
    Function *F =
        CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
                             ? Intrinsic::amdgcn_mov_dpp8
                             : Intrinsic::amdgcn_update_dpp,
                         IntTy);
    assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
           E->getNumArgs() == 2);
    bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
    if (InsertOld)
      Args.push_back(llvm::PoisonValue::get(IntTy));
    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
      llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
      if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
          Size < 32) {
        if (!DataTy->isIntegerTy())
          V = Builder.CreateBitCast(
              V, llvm::IntegerType::get(Builder.getContext(), Size));
        V = Builder.CreateZExtOrBitCast(V, IntTy);
      }
      llvm::Type *ExpTy =
          F->getFunctionType()->getFunctionParamType(I + InsertOld);
      Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
    }
    Value *V = Builder.CreateCall(F, Args);
    if (Size < 32 && !DataTy->isIntegerTy())
      V = Builder.CreateTrunc(
          V, llvm::IntegerType::get(Builder.getContext(), Size));
    return Builder.CreateTruncOrBitCast(V, DataTy);
  }
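  // Note on the DPP lowering above (a sketch): __builtin_amdgcn_mov_dpp is
  // emitted as llvm.amdgcn.update.dpp with a poison "old" operand, and a
  // sub-32-bit payload such as half round-trips through i32:
  //   half -> bitcast i16 -> zext i32 -> @llvm.amdgcn.update.dpp.i32
  //        -> trunc i16 -> bitcast half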
  case AMDGPU::BI__builtin_amdgcn_permlane16:
  case AMDGPU::BI__builtin_amdgcn_permlanex16:
    return emitBuiltinWithOneOverloadedType<6>(
        *this, E,
        BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
            ? Intrinsic::amdgcn_permlane16
            : Intrinsic::amdgcn_permlanex16);
  case AMDGPU::BI__builtin_amdgcn_permlane64:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_permlane64);
  case AMDGPU::BI__builtin_amdgcn_readlane:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_readlane);
  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_readfirstlane);
  case AMDGPU::BI__builtin_amdgcn_div_fixup:
  case AMDGPU::BI__builtin_amdgcn_div_fixupf:
  case AMDGPU::BI__builtin_amdgcn_div_fixuph:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_div_fixup);
  case AMDGPU::BI__builtin_amdgcn_trig_preop:
  case AMDGPU::BI__builtin_amdgcn_trig_preopf:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
  case AMDGPU::BI__builtin_amdgcn_rcp:
  case AMDGPU::BI__builtin_amdgcn_rcpf:
  case AMDGPU::BI__builtin_amdgcn_rcph:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
  case AMDGPU::BI__builtin_amdgcn_sqrt:
  case AMDGPU::BI__builtin_amdgcn_sqrtf:
  case AMDGPU::BI__builtin_amdgcn_sqrth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_sqrt);
  case AMDGPU::BI__builtin_amdgcn_rsq:
  case AMDGPU::BI__builtin_amdgcn_rsqf:
  case AMDGPU::BI__builtin_amdgcn_rsqh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
  case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
  case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_rsq_clamp);
  case AMDGPU::BI__builtin_amdgcn_sinf:
  case AMDGPU::BI__builtin_amdgcn_sinh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
  case AMDGPU::BI__builtin_amdgcn_cosf:
  case AMDGPU::BI__builtin_amdgcn_cosh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
  case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
    return EmitAMDGPUDispatchPtr(*this, E);
  case AMDGPU::BI__builtin_amdgcn_logf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
  case AMDGPU::BI__builtin_amdgcn_exp2f:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_exp2);
  case AMDGPU::BI__builtin_amdgcn_log_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_log_clamp);
  case AMDGPU::BI__builtin_amdgcn_ldexp:
  case AMDGPU::BI__builtin_amdgcn_ldexpf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
    return Builder.CreateCall(F, {Src0, Src1});
  }
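  // Illustrative lowering (a sketch): __builtin_amdgcn_ldexpf(x, n) becomes
  //   %r = call float @llvm.ldexp.f32.i32(float %x, i32 %n)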
  case AMDGPU::BI__builtin_amdgcn_ldexph: {
    // The raw instruction has a different behavior for out of bounds exponent
    // values (implicit truncation instead of saturate to short_min/short_max).
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
    return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
  }
  case AMDGPU::BI__builtin_amdgcn_frexp_mant:
  case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
  case AMDGPU::BI__builtin_amdgcn_frexp_manth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_frexp_mant);
  case AMDGPU::BI__builtin_amdgcn_frexp_exp:
  case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                   { Builder.getInt32Ty(), Src0->getType() });
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                   { Builder.getInt16Ty(), Src0->getType() });
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_fract:
  case AMDGPU::BI__builtin_amdgcn_fractf:
  case AMDGPU::BI__builtin_amdgcn_fracth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_fract);
  case AMDGPU::BI__builtin_amdgcn_lerp:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_lerp);
  case AMDGPU::BI__builtin_amdgcn_ubfe:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_ubfe);
  case AMDGPU::BI__builtin_amdgcn_sbfe:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_sbfe);
  case AMDGPU::BI__builtin_amdgcn_ballot_w32:
  case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
    llvm::Type *ResultType = ConvertType(E->getType());
    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
    return Builder.CreateCall(F, { Src });
  }
  case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_tanh);
  case AMDGPU::BI__builtin_amdgcn_uicmp:
  case AMDGPU::BI__builtin_amdgcn_uicmpl:
  case AMDGPU::BI__builtin_amdgcn_sicmp:
  case AMDGPU::BI__builtin_amdgcn_sicmpl: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));

    // FIXME-GFX10: How should 32 bit mask be handled?
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
                                   { Builder.getInt64Ty(), Src0->getType() });
    return Builder.CreateCall(F, { Src0, Src1, Src2 });
  }
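  // Illustrative lowering (a sketch; value names are placeholders):
  // __builtin_amdgcn_uicmp(a, b, pred) on 32-bit sources becomes
  //   %r = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 %pred)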
  case AMDGPU::BI__builtin_amdgcn_fcmp:
  case AMDGPU::BI__builtin_amdgcn_fcmpf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));

    // FIXME-GFX10: How should 32 bit mask be handled?
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
                                   { Builder.getInt64Ty(), Src0->getType() });
    return Builder.CreateCall(F, { Src0, Src1, Src2 });
  }
  case AMDGPU::BI__builtin_amdgcn_class:
  case AMDGPU::BI__builtin_amdgcn_classf:
  case AMDGPU::BI__builtin_amdgcn_classh:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
  case AMDGPU::BI__builtin_amdgcn_fmed3f:
  case AMDGPU::BI__builtin_amdgcn_fmed3h:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_fmed3);
  case AMDGPU::BI__builtin_amdgcn_ds_append:
  case AMDGPU::BI__builtin_amdgcn_ds_consume: {
    Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
      Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
    return Builder.CreateCall(F, { Src0, Builder.getFalse() });
  }
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_global_load_tr_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
      IID = Intrinsic::amdgcn_global_load_tr_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_global_load_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_global_load_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_load_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_ds_load_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_load_tr8_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
      IID = Intrinsic::amdgcn_ds_load_tr16_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_read_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_read_tr8_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_ds_read_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
      IID = Intrinsic::amdgcn_ds_read_tr16_b64;
      break;
    }
    llvm::Type *LoadTy = ConvertType(E->getType());
    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
    llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
    return Builder.CreateCall(F, {Addr});
  }
  case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
    // Should this have asan instrumentation?
    return emitBuiltinWithOneOverloadedType<5>(*this, E,
                                               Intrinsic::amdgcn_load_to_lds);
  }
  case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
    Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                   {llvm::Type::getInt64Ty(getLLVMContext())});
    return Builder.CreateCall(F);
  }
  case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
    Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
                                   {llvm::Type::getInt64Ty(getLLVMContext())});
    llvm::Value *Env = EmitScalarExpr(E->getArg(0));
    return Builder.CreateCall(F, {Env});
  }
  case AMDGPU::BI__builtin_amdgcn_read_exec:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
    return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));

    // The builtins take these arguments as vec4 where the last element is
    // ignored. The intrinsic takes them as vec3.
    RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
                                            {0, 1, 2});
    RayDir =
        Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2});
    RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
                                                {0, 1, 2});

    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
                                   {NodePtr->getType(), RayDir->getType()});
    return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
                                  RayInverseDir, TextureDescr});
  }
  case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
      break;
    case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
      break;
    }
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));

    Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
    Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));

    llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);

    llvm::CallInst *CI = Builder.CreateCall(
        IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
                        Offset, TextureDescr});

    llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
    llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
    llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);

    Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
    Builder.CreateStore(RetRayDir, RetRayDirPtr);

    return RetVData;
  }
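  // Note on the BVH intersect-ray lowerings above (a sketch): the vec4
  // builtin operands are narrowed to the intrinsic's vec3 with a
  // <0, 1, 2> shuffle, e.g.
  //   %dir3 = shufflevector <4 x float> %dir, <4 x float> %dir,
  //                         <3 x i32> <i32 0, i32 1, i32 2>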

  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
      break;
    }

    SmallVector<Value *, 4> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));

    Function *F = CGM.getIntrinsic(IID);
    Value *Call = Builder.CreateCall(F, Args);
    Value *Rtn = Builder.CreateExtractValue(Call, 0);
    Value *A = Builder.CreateExtractValue(Call, 1);
    llvm::Type *RetTy = ConvertType(E->getType());
    Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
                                            (uint64_t)0);
    // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
    // <2 x i64>, so zext the second value.
    if (A->getType()->getPrimitiveSizeInBits() <
        RetTy->getScalarType()->getPrimitiveSizeInBits())
      A = Builder.CreateZExt(A, RetTy->getScalarType());

    return Builder.CreateInsertElement(I0, A, 1);
  }
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
    llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
    Function *F = CGM.getIntrinsic(
        BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
        {VT, VT});

    SmallVector<Value *, 9> Args;
    for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {

    // These operations perform a matrix multiplication and accumulation of
    // the form:
    //             D = A * B + C
    // We need to specify one type for matrices AB and one for matrices CD.
    // Sparse matrix operations can have different types for A and B as well as
    // an additional type for the sparsity index.
    // The destination type should be put before the types used for the source
    // operands.
    SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
    // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
    // There is no need for the variable opsel argument, so always set it to
    // "false".
    bool AppendFalseForOpselArg = false;
    unsigned BuiltinWMMAOp;

    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
      AppendFalseForOpselArg = true;
      [[fallthrough]];
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
      AppendFalseForOpselArg = true;
      [[fallthrough]];
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
      break;
    }

    SmallVector<Value *, 6> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));
    if (AppendFalseForOpselArg)
      Args.push_back(Builder.getFalse());

    SmallVector<llvm::Type *, 6> ArgTypes;
    for (auto ArgIdx : ArgsForMatchingMatrixTypes)
      ArgTypes.push_back(Args[ArgIdx]->getType());

    Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
    return Builder.CreateCall(F, Args);
  }
  // amdgcn workgroup size
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
    return EmitAMDGPUWorkGroupSize(*this, 0);
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
    return EmitAMDGPUWorkGroupSize(*this, 1);
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
    return EmitAMDGPUWorkGroupSize(*this, 2);

  // amdgcn grid size
  case AMDGPU::BI__builtin_amdgcn_grid_size_x:
    return EmitAMDGPUGridSize(*this, 0);
  case AMDGPU::BI__builtin_amdgcn_grid_size_y:
    return EmitAMDGPUGridSize(*this, 1);
  case AMDGPU::BI__builtin_amdgcn_grid_size_z:
    return EmitAMDGPUGridSize(*this, 2);

  // r600 intrinsics
  case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
  case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::r600_recipsqrt_ieee);
  case AMDGPU::BI__builtin_amdgcn_alignbit: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
    return Builder.CreateCall(F, { Src0, Src1, Src2 });
  }
  case AMDGPU::BI__builtin_amdgcn_fence: {
    ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
                            EmitScalarExpr(E->getArg(1)), AO, SSID);
    FenceInst *Fence = Builder.CreateFence(AO, SSID);
    if (E->getNumArgs() > 2)
      AddAMDGPUFenceAddressSpaceMMRA(Fence, E);
    return Fence;
  }
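  // Illustrative lowering for the atomicrmw cases below (a sketch; value
  // names are placeholders): __builtin_amdgcn_global_atomic_fadd_f32(p, v)
  // becomes roughly
  //   %old = atomicrmw fadd ptr addrspace(1) %p, float %v
  //              syncscope("agent") monotonic,
  //              !amdgpu.no.fine.grained.memory !0,
  //              !amdgpu.ignore.denormal.mode !0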
  case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
  case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
  case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
  case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
    llvm::AtomicRMWInst::BinOp BinOp;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
    case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
      BinOp = llvm::AtomicRMWInst::UIncWrap;
      break;
    case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
    case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
      BinOp = llvm::AtomicRMWInst::UDecWrap;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_faddf:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
      BinOp = llvm::AtomicRMWInst::FAdd;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
      BinOp = llvm::AtomicRMWInst::FMin;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
      BinOp = llvm::AtomicRMWInst::FMax;
      break;
    }

    Address Ptr = CheckAtomicAlignment(*this, E);
    Value *Val = EmitScalarExpr(E->getArg(1));
    llvm::Type *OrigTy = Val->getType();
    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();

    bool Volatile;

    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
      // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
      Volatile =
          cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
    } else {
      // Infer volatile from the passed type.
      Volatile =
          PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
    }

    if (E->getNumArgs() >= 4) {
      // Some of the builtins have explicit ordering and scope arguments.
      ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                              EmitScalarExpr(E->getArg(3)), AO, SSID);
    } else {
      // Most of the builtins do not have syncscope/order arguments. For DS
      // atomics the scope doesn't really matter, as they implicitly operate at
      // workgroup scope.
      //
      // The global/flat cases need to use agent scope to consistently produce
      // the native instruction instead of a cmpxchg expansion.
      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
      AO = AtomicOrdering::Monotonic;

      // The v2bf16 builtin uses i16 instead of a natural bfloat type.
      if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
          BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
          BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
        llvm::Type *V2BF16Ty = FixedVectorType::get(
            llvm::Type::getBFloatTy(Builder.getContext()), 2);
        Val = Builder.CreateBitCast(Val, V2BF16Ty);
      }
    }

    llvm::AtomicRMWInst *RMW =
        Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
    if (Volatile)
      RMW->setVolatile(true);

    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
      // instruction for flat and global operations.
      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);

      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
      // instruction, but this only matters for float fadd.
      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
    }

    return Builder.CreateBitCast(RMW, OrigTy);
  }
  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
    llvm::Type *ResultType = ConvertType(E->getType());
    // s_sendmsg_rtn is mangled using return type only.
    Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
    return Builder.CreateCall(F, {Arg});
  }
  case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
  case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
    // Because builtin types are limited, and the intrinsic uses a struct/pair
    // output, marshal the pair-of-i32 to <2 x i32>.
    Value *VDstOld = EmitScalarExpr(E->getArg(0));
    Value *VSrcOld = EmitScalarExpr(E->getArg(1));
    Value *FI = EmitScalarExpr(E->getArg(2));
    Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
    Function *F =
        CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
                             ? Intrinsic::amdgcn_permlane16_swap
                             : Intrinsic::amdgcn_permlane32_swap);
    llvm::CallInst *Call =
        Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});

    llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
    llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);

    llvm::Type *ResultType = ConvertType(E->getType());

    llvm::Value *Insert0 = Builder.CreateInsertElement(
        llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
    llvm::Value *AsVector =
        Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
    return AsVector;
  }
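  // Note on the permlane swap lowering above (a sketch; value names are
  // placeholders): the intrinsic returns a { i32, i32 } pair, which is
  // repacked into the builtin's <2 x i32> result:
  //   %pair = call { i32, i32 } @llvm.amdgcn.permlane16.swap(...)
  //   %e0   = extractvalue { i32, i32 } %pair, 0
  //   %e1   = extractvalue { i32, i32 } %pair, 1
  //   %v0   = insertelement <2 x i32> poison, i32 %e0, i64 0
  //   %res  = insertelement <2 x i32> %v0, i32 %e1, i64 1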
  case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
  case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
    return emitBuiltinWithOneOverloadedType<4>(*this, E,
                                               Intrinsic::amdgcn_bitop3);
  case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
    // TODO: LLVM has this overloaded to allow for fat pointers, but since
    // those haven't been plumbed through to Clang yet, default to creating the
    // resource type.
    SmallVector<Value *, 4> Args;
    for (unsigned I = 0; I < 4; ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    llvm::PointerType *RetTy = llvm::PointerType::get(
        Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE);
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
                                   {RetTy, Args[0]->getType()});
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
    return emitBuiltinWithOneOverloadedType<5>(
        *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
    llvm::Type *RetTy = nullptr;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
      RetTy = Int8Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
      RetTy = Int16Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
      RetTy = Int32Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
      break;
    }
    Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
    return Builder.CreateCall(
        F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
            EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
  }
  case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
    return emitBuiltinWithOneOverloadedType<2>(
        *this, E, Intrinsic::amdgcn_s_prefetch_data);
  case Builtin::BIlogbf:
  case Builtin::BI__builtin_logbf: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *FrExpFunc = CGM.getIntrinsic(
        Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
    CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
    Value *Exp = Builder.CreateExtractValue(FrExp, 1);
    Value *Add = Builder.CreateAdd(
        Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
    Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy());
    Value *Fabs =
        emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
    Value *FCmpONE = Builder.CreateFCmpONE(
        Fabs, ConstantFP::getInfinity(Builder.getFloatTy()));
    Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
    Value *FCmpOEQ =
        Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getFloatTy()));
    Value *Sel2 = Builder.CreateSelect(
        FCmpOEQ,
        ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true), Sel1);
    return Sel2;
  }
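  // A note on the logb lowerings here (a sketch of the reasoning): for
  // finite nonzero x, logb(x) == frexp(x).exp - 1, since frexp normalizes
  // the mantissa to [0.5, 1). The selects patch up the special cases:
  //   logb(+/-0)   -> -inf (second select)
  //   logb(+/-inf) -> +inf (|x| falls through the first select)
  //   logb(nan)    -> nan  (|x| is unordered with +inf)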
  case Builtin::BIlogb:
  case Builtin::BI__builtin_logb: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *FrExpFunc = CGM.getIntrinsic(
        Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
    CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
    Value *Exp = Builder.CreateExtractValue(FrExp, 1);
    Value *Add = Builder.CreateAdd(
        Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
    Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy());
    Value *Fabs =
        emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
    Value *FCmpONE = Builder.CreateFCmpONE(
        Fabs, ConstantFP::getInfinity(Builder.getDoubleTy()));
    Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
    Value *FCmpOEQ =
        Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getDoubleTy()));
    Value *Sel2 = Builder.CreateSelect(
        FCmpOEQ,
        ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true),
        Sel1);
    return Sel2;
  }
  case Builtin::BIscalbnf:
  case Builtin::BI__builtin_scalbnf:
  case Builtin::BIscalbn:
  case Builtin::BI__builtin_scalbn:
    return emitBinaryExpMaybeConstrainedFPBuiltin(
        *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp);
  default:
    return nullptr;
  }
}