//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit AMDGPU builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "CGBuiltin.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm;

namespace {

// Emit a two-operand FP builtin whose intrinsic is mangled on both source
// types, using the constrained variant when FP exceptions are enabled.
static Value *
emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
                                       Intrinsic::ID IntrinsicID,
                                       Intrinsic::ID ConstrainedIntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
  if (CGF.Builder.getIsFPConstrained()) {
    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
                                       {Src0->getType(), Src1->getType()});
    return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
  }

  Function *F =
      CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}
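
// A minimal sketch of the IR this helper produces, assuming IntrinsicID is
// Intrinsic::ldexp with float/i32 operands (illustrative only):
//   %r = call float @llvm.ldexp.f32.i32(float %a, i32 %b)
// Under strict FP semantics the constrained variant is called instead, with
// rounding/exception metadata filled in by CreateConstrainedFPCall, roughly:
//   %r = call float @llvm.experimental.constrained.ldexp.f32.i32(
//            float %a, i32 %b,
//            metadata !"round.dynamic", metadata !"fpexcept.strict")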

// If \p E is not a null pointer, insert an address space cast to match the
// return type of \p E if necessary.
Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
                             const CallExpr *E = nullptr) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
  if (!E)
    return Call;
  QualType BuiltinRetType = E->getType();
  auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
  if (RetTy == Call->getType())
    return Call;
  return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
}
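
// Illustrative result (a sketch, not emitted verbatim): the call carries the
// attributes added above and returns a constant-address-space pointer:
//   %dp = call align 4 dereferenceable(64) ptr addrspace(4)
//             @llvm.amdgcn.dispatch.ptr()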

Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
  return Call;
}

// \p Index is 0, 1, or 2 for the x, y, or z dimension, respectively.
// Emit code based on the code object ABI version:
// COV_4    : Emit code to use the dispatch ptr.
// COV_5+   : Emit code to use the implicitarg ptr.
// COV_NONE : Emit code to load the global variable "__oclc_ABI_version"
//            and select the COV_4 or COV_5+ approach based on its value.
//            This is used for compiling device libraries in an ABI-agnostic
//            way.
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
  llvm::LoadInst *LD;

  auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;

  if (Cov == CodeObjectVersionKind::COV_None) {
    StringRef Name = "__oclc_ABI_version";
    auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
    if (!ABIVersionC)
      ABIVersionC = new llvm::GlobalVariable(
          CGF.CGM.getModule(), CGF.Int32Ty, false,
          llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));

    // This load will be eliminated by IPSCCP because it is constant
    // weak_odr without externally_initialized. Either changing it to weak or
    // adding externally_initialized will keep the load.
    Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
                                                      CGF.CGM.getIntAlign());

    Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
        ABIVersion,
        llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));

    // Indexing the implicit kernarg segment.
    Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
        CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);

    // Indexing the HSA kernel_dispatch_packet struct.
    Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
        CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);

    auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
    LD = CGF.Builder.CreateLoad(
        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
  } else {
    Value *GEP = nullptr;
    if (Cov >= CodeObjectVersionKind::COV_5) {
      // Indexing the implicit kernarg segment.
      GEP = CGF.Builder.CreateConstGEP1_32(
          CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
    } else {
      // Indexing the HSA kernel_dispatch_packet struct.
      GEP = CGF.Builder.CreateConstGEP1_32(
          CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
    }
    LD = CGF.Builder.CreateLoad(
        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
  }

  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
  llvm::MDNode *RNode = MDHelper.createRange(
      APInt(16, 1),
      APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
  LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
  LD->setMetadata(llvm::LLVMContext::MD_noundef,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  return LD;
}
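
// Illustrative IR for Index == 0 on COV_5+ (a sketch; the !range upper bound
// comes from getMaxOpenCLWorkGroupSize(), e.g. 1025 for a 1024 limit):
//   %iap = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
//   %gep = getelementptr i8, ptr addrspace(4) %iap, i32 12
//   %ws  = load i16, ptr addrspace(4) %gep, align 2,
//          !range !{i16 1, i16 1025}, !invariant.load !{}, !noundef !{}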

// \p Index is 0, 1, or 2 for the x, y, or z dimension, respectively.
Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
  const unsigned XOffset = 12;
  auto *DP = EmitAMDGPUDispatchPtr(CGF);
  // Indexing the HSA kernel_dispatch_packet struct.
  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
  auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
  auto *LD = CGF.Builder.CreateLoad(
      Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));

  llvm::MDBuilder MDB(CGF.getLLVMContext());

  // Known non-zero.
  LD->setMetadata(llvm::LLVMContext::MD_range,
                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  return LD;
}
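
// Illustrative IR for Index == 0 (a sketch): the wrapping !range [1, 0)
// encodes "known non-zero":
//   %dp  = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
//   %gep = getelementptr i8, ptr addrspace(4) %dp, i32 12
//   %gs  = load i32, ptr addrspace(4) %gep, align 4,
//          !range !{i32 1, i32 0}, !invariant.load !{}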
} // namespace

// Generates the IR for __builtin_amdgcn_read_exec_*.
// Lowers the builtin to the amdgcn_ballot intrinsic.
static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
                                      llvm::Type *RegisterType,
                                      llvm::Type *ValueType, bool isExecHi) {
  CodeGen::CGBuilderTy &Builder = CGF.Builder;
  CodeGen::CodeGenModule &CGM = CGF.CGM;

  Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
  llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});

  if (isExecHi) {
    Value *Rt2 = Builder.CreateLShr(Call, 32);
    Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
    return Rt2;
  }

  return Call;
}
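
// Illustrative lowering of __builtin_amdgcn_read_exec_hi (a sketch):
//   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
//   %hi     = lshr i64 %ballot, 32
//   %res    = trunc i64 %hi to i32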

// Emit an intrinsic that takes one floating-point operand and one integer
// operand.
static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
                               const CallExpr *E,
                               unsigned IntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

// For processing the memory ordering and memory scope arguments of various
// amdgcn builtins.
// \p Order takes a C++11-compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using the atomic C ABI, writing
// the result to \p AO. \p Scope takes a const char * and converts it into an
// AMDGCN-specific SyncScope ID, writing the result to \p SSID.
void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
                                              llvm::AtomicOrdering &AO,
                                              llvm::SyncScope::ID &SSID) {
  int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();

  // Map C11/C++11 memory ordering to LLVM memory ordering.
  assert(llvm::isValidAtomicOrderingCABI(ord));
  switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
  case llvm::AtomicOrderingCABI::acquire:
  case llvm::AtomicOrderingCABI::consume:
    AO = llvm::AtomicOrdering::Acquire;
    break;
  case llvm::AtomicOrderingCABI::release:
    AO = llvm::AtomicOrdering::Release;
    break;
  case llvm::AtomicOrderingCABI::acq_rel:
    AO = llvm::AtomicOrdering::AcquireRelease;
    break;
  case llvm::AtomicOrderingCABI::seq_cst:
    AO = llvm::AtomicOrdering::SequentiallyConsistent;
    break;
  case llvm::AtomicOrderingCABI::relaxed:
    AO = llvm::AtomicOrdering::Monotonic;
    break;
  }

  // Some of the atomic builtins take the scope as a string name.
  StringRef scp;
  if (llvm::getConstantStringInfo(Scope, scp)) {
    SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
    return;
  }

  // Older builtins had an enum argument for the memory scope.
  int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
  switch (scope) {
  case 0: // __MEMORY_SCOPE_SYSTEM
    SSID = llvm::SyncScope::System;
    break;
  case 1: // __MEMORY_SCOPE_DEVICE
    SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
    break;
  case 2: // __MEMORY_SCOPE_WRKGRP
    SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
    break;
  case 3: // __MEMORY_SCOPE_WVFRNT
    SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
    break;
  case 4: // __MEMORY_SCOPE_SINGLE
    SSID = llvm::SyncScope::SingleThread;
    break;
  default:
    SSID = llvm::SyncScope::System;
    break;
  }
}
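
// For example, given __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"),
// this sets AO to AtomicOrdering::Acquire and SSID to the "workgroup" sync
// scope, so the caller emits: fence syncscope("workgroup") acquire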

llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
                                                          unsigned Idx,
                                                          const CallExpr *E) {
  llvm::Value *Arg = nullptr;
  if ((ICEArguments & (1 << Idx)) == 0) {
    Arg = EmitScalarExpr(E->getArg(Idx));
  } else {
    // If this is required to be a constant, constant fold it so that we
    // know that the generated intrinsic gets a ConstantInt.
    std::optional<llvm::APSInt> Result =
        E->getArg(Idx)->getIntegerConstantExpr(getContext());
    assert(Result && "Expected argument to be a constant");
    Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
  }
  return Arg;
}

void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
                                                     const CallExpr *E) {
  constexpr const char *Tag = "amdgpu-as";

  LLVMContext &Ctx = Inst->getContext();
  SmallVector<MMRAMetadata::TagT, 3> MMRAs;
  for (unsigned K = 2; K < E->getNumArgs(); ++K) {
    llvm::Value *V = EmitScalarExpr(E->getArg(K));
    StringRef AS;
    if (llvm::getConstantStringInfo(V, AS)) {
      MMRAs.push_back({Tag, AS});
      // TODO: Delete the resulting unused constant?
      continue;
    }
    CGM.Error(E->getExprLoc(),
              "expected an address space name as a string literal");
  }

  llvm::sort(MMRAs);
  MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
  Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}
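
// Illustrative (a sketch of the printed form): a fence builtin such as
// __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent", "local", "global")
// gets memory-model relaxation annotations roughly like:
//   fence syncscope("agent") release, !mmra !0
//   !0 = !{!1, !2}
//   !1 = !{!"amdgpu-as", !"global"}
//   !2 = !{!"amdgpu-as", !"local"}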

Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
  llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
  llvm::SyncScope::ID SSID;
  switch (BuiltinID) {
  case AMDGPU::BI__builtin_amdgcn_div_scale:
  case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
    // argument.

    Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));

    llvm::Value *X = EmitScalarExpr(E->getArg(0));
    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
    llvm::Value *Z = EmitScalarExpr(E->getArg(2));

    llvm::Function *Callee =
        CGM.getIntrinsic(Intrinsic::amdgcn_div_scale, X->getType());

    llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});

    llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
    llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);

    llvm::Type *RealFlagType = FlagOutPtr.getElementType();

    llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
    Builder.CreateStore(FlagExt, FlagOutPtr);
    return Result;
  }
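
  // Illustrative lowering for __builtin_amdgcn_div_scale(x, y, z, &flag)
  // (a sketch):
  //   %pair = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %x,
  //               float %y, i1 %z)
  //   %res  = extractvalue { float, i1 } %pair, 0
  //   %vcc  = extractvalue { float, i1 } %pair, 1
  //   %ext  = zext i1 %vcc to i32
  //   store i32 %ext, ptr %flag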
  case AMDGPU::BI__builtin_amdgcn_div_fmas:
  case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));

    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas, Src0->getType());
    llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
    return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
  }

  case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_ds_swizzle);
  case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
  case AMDGPU::BI__builtin_amdgcn_mov_dpp:
  case AMDGPU::BI__builtin_amdgcn_update_dpp: {
    llvm::SmallVector<llvm::Value *, 6> Args;
    // Find out if any arguments are required to be integer constant
    // expressions.
    unsigned ICEArguments = 0;
    ASTContext::GetBuiltinTypeError Error;
    getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
    assert(Error == ASTContext::GE_None && "Should not codegen an error");
    llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
    unsigned Size = DataTy->getPrimitiveSizeInBits();
    llvm::Type *IntTy =
        llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
    Function *F =
        CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
                             ? Intrinsic::amdgcn_mov_dpp8
                             : Intrinsic::amdgcn_update_dpp,
                         IntTy);
    assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
           E->getNumArgs() == 2);
    bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
    if (InsertOld)
      Args.push_back(llvm::PoisonValue::get(IntTy));
    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
      llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
      if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
          Size < 32) {
        if (!DataTy->isIntegerTy())
          V = Builder.CreateBitCast(
              V, llvm::IntegerType::get(Builder.getContext(), Size));
        V = Builder.CreateZExtOrBitCast(V, IntTy);
      }
      llvm::Type *ExpTy =
          F->getFunctionType()->getFunctionParamType(I + InsertOld);
      Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
    }
    Value *V = Builder.CreateCall(F, Args);
    if (Size < 32 && !DataTy->isIntegerTy())
      V = Builder.CreateTrunc(
          V, llvm::IntegerType::get(Builder.getContext(), Size));
    return Builder.CreateTruncOrBitCast(V, DataTy);
  }
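
  // Illustrative (a sketch): for a 16-bit data type, the data operands above
  // are widened to i32 (bitcast to iN first if needed, then zext), the i32
  // intrinsic is called, and the result is narrowed back, e.g.:
  //   %w = zext i16 %v to i32
  //   %d = call i32 @llvm.amdgcn.update.dpp.i32(i32 %old, i32 %w,
  //            i32 %ctrl, i32 %rowmask, i32 %bankmask, i1 %bound)
  //   %r = trunc i32 %d to i16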
  case AMDGPU::BI__builtin_amdgcn_permlane16:
  case AMDGPU::BI__builtin_amdgcn_permlanex16:
    return emitBuiltinWithOneOverloadedType<6>(
        *this, E,
        BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
            ? Intrinsic::amdgcn_permlane16
            : Intrinsic::amdgcn_permlanex16);
  case AMDGPU::BI__builtin_amdgcn_permlane64:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_permlane64);
  case AMDGPU::BI__builtin_amdgcn_readlane:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_readlane);
  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_readfirstlane);
  case AMDGPU::BI__builtin_amdgcn_div_fixup:
  case AMDGPU::BI__builtin_amdgcn_div_fixupf:
  case AMDGPU::BI__builtin_amdgcn_div_fixuph:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_div_fixup);
  case AMDGPU::BI__builtin_amdgcn_trig_preop:
  case AMDGPU::BI__builtin_amdgcn_trig_preopf:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
  case AMDGPU::BI__builtin_amdgcn_rcp:
  case AMDGPU::BI__builtin_amdgcn_rcpf:
  case AMDGPU::BI__builtin_amdgcn_rcph:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
  case AMDGPU::BI__builtin_amdgcn_sqrt:
  case AMDGPU::BI__builtin_amdgcn_sqrtf:
  case AMDGPU::BI__builtin_amdgcn_sqrth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_sqrt);
  case AMDGPU::BI__builtin_amdgcn_rsq:
  case AMDGPU::BI__builtin_amdgcn_rsqf:
  case AMDGPU::BI__builtin_amdgcn_rsqh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
  case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
  case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_rsq_clamp);
  case AMDGPU::BI__builtin_amdgcn_sinf:
  case AMDGPU::BI__builtin_amdgcn_sinh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
  case AMDGPU::BI__builtin_amdgcn_cosf:
  case AMDGPU::BI__builtin_amdgcn_cosh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
  case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
    return EmitAMDGPUDispatchPtr(*this, E);
  case AMDGPU::BI__builtin_amdgcn_logf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
  case AMDGPU::BI__builtin_amdgcn_exp2f:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_exp2);
  case AMDGPU::BI__builtin_amdgcn_log_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_log_clamp);
  case AMDGPU::BI__builtin_amdgcn_ldexp:
  case AMDGPU::BI__builtin_amdgcn_ldexpf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
    return Builder.CreateCall(F, {Src0, Src1});
  }
  case AMDGPU::BI__builtin_amdgcn_ldexph: {
    // The raw instruction has different behavior for out-of-bounds exponent
    // values (implicit truncation instead of saturating to short_min/
    // short_max).
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
    return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
  }
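
  // Illustrative (a sketch): the exponent is truncated to i16 so the builtin
  // matches the hardware's implicit truncation:
  //   %e = trunc i32 %exp to i16
  //   %r = call half @llvm.ldexp.f16.i16(half %x, i16 %e)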
  case AMDGPU::BI__builtin_amdgcn_frexp_mant:
  case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
  case AMDGPU::BI__builtin_amdgcn_frexp_manth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_frexp_mant);
  case AMDGPU::BI__builtin_amdgcn_frexp_exp:
  case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                   {Builder.getInt32Ty(), Src0->getType()});
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                   {Builder.getInt16Ty(), Src0->getType()});
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_fract:
  case AMDGPU::BI__builtin_amdgcn_fractf:
  case AMDGPU::BI__builtin_amdgcn_fracth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_fract);
  case AMDGPU::BI__builtin_amdgcn_lerp:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_lerp);
  case AMDGPU::BI__builtin_amdgcn_ubfe:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_ubfe);
  case AMDGPU::BI__builtin_amdgcn_sbfe:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_sbfe);
  case AMDGPU::BI__builtin_amdgcn_ballot_w32:
  case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
    llvm::Type *ResultType = ConvertType(E->getType());
    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {ResultType});
    return Builder.CreateCall(F, {Src});
  }
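
  // Illustrative (a sketch): __builtin_amdgcn_ballot_w64(cond) lowers to
  //   %mask = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)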
  case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_tanh);
  case AMDGPU::BI__builtin_amdgcn_uicmp:
  case AMDGPU::BI__builtin_amdgcn_uicmpl:
  case AMDGPU::BI__builtin_amdgcn_sicmp:
  case AMDGPU::BI__builtin_amdgcn_sicmpl: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));

    // FIXME-GFX10: How should the 32-bit mask be handled?
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
                                   {Builder.getInt64Ty(), Src0->getType()});
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_fcmp:
  case AMDGPU::BI__builtin_amdgcn_fcmpf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));

    // FIXME-GFX10: How should the 32-bit mask be handled?
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
                                   {Builder.getInt64Ty(), Src0->getType()});
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_class:
  case AMDGPU::BI__builtin_amdgcn_classf:
  case AMDGPU::BI__builtin_amdgcn_classh:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
  case AMDGPU::BI__builtin_amdgcn_fmed3f:
  case AMDGPU::BI__builtin_amdgcn_fmed3h:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_fmed3);
  case AMDGPU::BI__builtin_amdgcn_ds_append:
  case AMDGPU::BI__builtin_amdgcn_ds_consume: {
    Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append
                               ? Intrinsic::amdgcn_ds_append
                               : Intrinsic::amdgcn_ds_consume;
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrin, {Src0->getType()});
    return Builder.CreateCall(F, {Src0, Builder.getFalse()});
  }
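
  // Illustrative (a sketch; the trailing i1 false is the GDS flag):
  //   %r = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %p, i1 false)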
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_global_load_tr_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
      IID = Intrinsic::amdgcn_global_load_tr_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_global_load_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_global_load_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_load_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_ds_load_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_load_tr8_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
      IID = Intrinsic::amdgcn_ds_load_tr16_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_read_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_read_tr8_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_ds_read_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
      IID = Intrinsic::amdgcn_ds_read_tr16_b64;
      break;
    }
    llvm::Type *LoadTy = ConvertType(E->getType());
    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
    llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
    return Builder.CreateCall(F, {Addr});
  }
  case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
    // Should this have ASan instrumentation?
    return emitBuiltinWithOneOverloadedType<5>(*this, E,
                                               Intrinsic::amdgcn_load_to_lds);
  }
  case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
    Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                   {llvm::Type::getInt64Ty(getLLVMContext())});
    return Builder.CreateCall(F);
  }
  case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
    Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
                                   {llvm::Type::getInt64Ty(getLLVMContext())});
    llvm::Value *Env = EmitScalarExpr(E->getArg(0));
    return Builder.CreateCall(F, {Env});
  }
  case AMDGPU::BI__builtin_amdgcn_read_exec:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
    return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));

    // The builtins take these arguments as vec4 where the last element is
    // ignored. The intrinsic takes them as vec3.
    RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin, {0, 1, 2});
    RayDir = Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2});
    RayInverseDir =
        Builder.CreateShuffleVector(RayInverseDir, RayInverseDir, {0, 1, 2});

    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
                                   {NodePtr->getType(), RayDir->getType()});
    return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
                                  RayInverseDir, TextureDescr});
  }
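
  // Illustrative (a sketch): each vec4 -> vec3 narrowing above is a
  // shufflevector, e.g.:
  //   %dir3 = shufflevector <4 x float> %dir, <4 x float> %dir,
  //                         <3 x i32> <i32 0, i32 1, i32 2>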
  case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
      break;
    case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
      break;
    }
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));

    Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
    Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));

    llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);

    llvm::CallInst *CI = Builder.CreateCall(
        IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
                        Offset, TextureDescr});

    llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
    llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
    llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);

    Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
    Builder.CreateStore(RetRayDir, RetRayDirPtr);

    return RetVData;
  }

  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
      break;
    }

    SmallVector<Value *, 4> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));

    Function *F = CGM.getIntrinsic(IID);
    Value *Call = Builder.CreateCall(F, Args);
    Value *Rtn = Builder.CreateExtractValue(Call, 0);
    Value *A = Builder.CreateExtractValue(Call, 1);
    llvm::Type *RetTy = ConvertType(E->getType());
    Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
                                            (uint64_t)0);
    // ds_bvh_stack_push8_pop2_rtn returns {i64, i32}, but the builtin returns
    // <2 x i64>, so zero-extend the second value.
    if (A->getType()->getPrimitiveSizeInBits() <
        RetTy->getScalarType()->getPrimitiveSizeInBits())
      A = Builder.CreateZExt(A, RetTy->getScalarType());

    return Builder.CreateInsertElement(I0, A, 1);
  }
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
    llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
    Function *F = CGM.getIntrinsic(
        BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
        {VT, VT});

    SmallVector<Value *, 9> Args;
    for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
    // These operations perform a matrix multiplication and accumulation of
    // the form:
    //             D = A * B + C
    // We need to specify one type for the A/B matrices and one for the C/D
    // matrices. Sparse matrix operations can have different types for A and
    // B as well as an additional type for the sparsity index.
    // The destination type must come before the source operand types.
    SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
    // On GFX12, the intrinsics with a 16-bit accumulator use a packed layout,
    // so there is no need for the variable opsel argument; always set it to
    // "false".
    bool AppendFalseForOpselArg = false;
    unsigned BuiltinWMMAOp;

    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
      AppendFalseForOpselArg = true;
      [[fallthrough]];
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
      AppendFalseForOpselArg = true;
      [[fallthrough]];
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
      break;
    }

    SmallVector<Value *, 6> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));
    if (AppendFalseForOpselArg)
      Args.push_back(Builder.getFalse());

    SmallVector<llvm::Type *, 6> ArgTypes;
    for (auto ArgIdx : ArgsForMatchingMatrixTypes)
      ArgTypes.push_back(Args[ArgIdx]->getType());

    Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
    return Builder.CreateCall(F, Args);
  }
  // amdgcn workgroup size
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
    return EmitAMDGPUWorkGroupSize(*this, 0);
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
    return EmitAMDGPUWorkGroupSize(*this, 1);
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
    return EmitAMDGPUWorkGroupSize(*this, 2);

  // amdgcn grid size
  case AMDGPU::BI__builtin_amdgcn_grid_size_x:
    return EmitAMDGPUGridSize(*this, 0);
  case AMDGPU::BI__builtin_amdgcn_grid_size_y:
    return EmitAMDGPUGridSize(*this, 1);
  case AMDGPU::BI__builtin_amdgcn_grid_size_z:
    return EmitAMDGPUGridSize(*this, 2);

  // r600 intrinsics
  case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
  case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::r600_recipsqrt_ieee);
  case AMDGPU::BI__builtin_amdgcn_alignbit: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_fence: {
    ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
                            EmitScalarExpr(E->getArg(1)), AO, SSID);
    FenceInst *Fence = Builder.CreateFence(AO, SSID);
    if (E->getNumArgs() > 2)
      AddAMDGPUFenceAddressSpaceMMRA(Fence, E);
    return Fence;
  }
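
  // Illustrative (a sketch): __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent")
  // lowers to:
  //   fence syncscope("agent") seq_cst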
1028   case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1029   case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1030   case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1031   case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1032   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1033   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1034   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1035   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1036   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1037   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1038   case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1039   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1040   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1041   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1042   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1043   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1044   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1045   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1046   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1047   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1048   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1049   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1050   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
1051     llvm::AtomicRMWInst::BinOp BinOp;
1052     switch (BuiltinID) {
1053     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1054     case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1055       BinOp = llvm::AtomicRMWInst::UIncWrap;
1056       break;
1057     case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1058     case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1059       BinOp = llvm::AtomicRMWInst::UDecWrap;
1060       break;
1061     case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1062     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1063     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1064     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1065     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1066     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1067     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1068     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1069     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1070     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1071     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1072     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1073     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1074       BinOp = llvm::AtomicRMWInst::FAdd;
1075       break;
1076     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1077     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1078     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1079       BinOp = llvm::AtomicRMWInst::FMin;
1080       break;
1081     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1082     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
1083     case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1084       BinOp = llvm::AtomicRMWInst::FMax;
1085       break;
1086     }
1087 
1088     Address Ptr = CheckAtomicAlignment(*this, E);
1089     Value *Val = EmitScalarExpr(E->getArg(1));
1090     llvm::Type *OrigTy = Val->getType();
1091     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
1092 
1093     bool Volatile;
1094 
1095     if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
1096         BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
1097         BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
1098       // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
1099       Volatile =
1100           cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
1101     } else {
1102       // Infer volatile from the passed type.
1103       Volatile =
1104           PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1105     }
1106 
1107     if (E->getNumArgs() >= 4) {
1108       // Some of the builtins have explicit ordering and scope arguments.
1109       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
1110                               EmitScalarExpr(E->getArg(3)), AO, SSID);
1111     } else {
1112       // Most of the builtins do not have syncscope/order arguments. For DS
1113       // atomics the scope doesn't really matter, as they implicitly operate at
1114       // workgroup scope.
1115       //
1116       // The global/flat cases need to use agent scope to consistently produce
1117       // the native instruction instead of a cmpxchg expansion.
1118       SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1119       AO = AtomicOrdering::Monotonic;
1120 
1121       // The v2bf16 builtins use <2 x i16> instead of a natural bfloat vector type.
1122       if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
1123           BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
1124           BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1125         llvm::Type *V2BF16Ty = FixedVectorType::get(
1126             llvm::Type::getBFloatTy(Builder.getContext()), 2);
1127         Val = Builder.CreateBitCast(Val, V2BF16Ty);
1128       }
1129     }
1130 
1131     llvm::AtomicRMWInst *RMW =
1132         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1133     if (Volatile)
1134       RMW->setVolatile(true);
1135 
1136     unsigned AddrSpace = Ptr.getType()->getAddressSpace();
1137     if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
1138       // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
1139       // instruction for flat and global operations.
1140       llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
1141       RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
1142 
1143       // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
1144       // instruction, but this only matters for float fadd.
1145       if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
1146         RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
1147     }
1148 
1149     return Builder.CreateBitCast(RMW, OrigTy);
1150   }
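Taken together, the short form of these builtins (no ordering/scope arguments) produces a relaxed, agent-scoped atomicrmw carrying the metadata above. A sketch of what this path emits, with the expected IR in comments; the source-level builtin signatures (f32, and short2 standing in for v2bf16) are assumptions, since they are not restated in this file:

```cpp
typedef short short2 __attribute__((ext_vector_type(2)));
typedef __attribute__((address_space(1))) float gfloat;
typedef __attribute__((address_space(1))) short2 gshort2;

float add_f32(gfloat *p, float v) {
  // Expected IR (roughly):
  //   %r = atomicrmw fadd ptr addrspace(1) %p, float %v
  //            syncscope("agent") monotonic, align 4,
  //            !amdgpu.no.fine.grained.memory !0,
  //            !amdgpu.ignore.denormal.mode !0
  return __builtin_amdgcn_global_atomic_fadd_f32(p, v);
}

short2 add_v2bf16(gshort2 *p, short2 v) {
  // The <2 x i16> payload is bitcast to <2 x bfloat> before the atomicrmw,
  // and the result is bitcast back to the builtin's <2 x i16> return type.
  // Only the float fadd case gets the denormal-mode metadata.
  return __builtin_amdgcn_global_atomic_fadd_v2bf16(p, v);
}
```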
1151   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
1152   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
1153     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
1154     llvm::Type *ResultType = ConvertType(E->getType());
1155     // s_sendmsg_rtn is mangled using return type only.
1156     Function *F =
1157         CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
1158     return Builder.CreateCall(F, {Arg});
1159   }
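Because the intrinsic is overloaded on its result alone, the 32- and 64-bit builtins share this one code path. A sketch of the resulting mangling, assuming the message id is an immediate operand (the values below are illustrative):

```cpp
unsigned read_msg32() {
  // Lowers to: %r = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 128)
  return __builtin_amdgcn_s_sendmsg_rtn(128);
}

unsigned long long read_msg64() {
  // Lowers to: %r = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 128)
  return __builtin_amdgcn_s_sendmsg_rtnl(128);
}
```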
1160   case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
1161   case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
1162     // The intrinsic returns a struct/pair of i32s, which builtin types
1163     // cannot express directly, so marshal the pair into a <2 x i32> vector.
1164     Value *VDstOld = EmitScalarExpr(E->getArg(0));
1165     Value *VSrcOld = EmitScalarExpr(E->getArg(1));
1166     Value *FI = EmitScalarExpr(E->getArg(2));
1167     Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
1168     Function *F =
1169         CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
1170                              ? Intrinsic::amdgcn_permlane16_swap
1171                              : Intrinsic::amdgcn_permlane32_swap);
1172     llvm::CallInst *Call =
1173         Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});
1174 
1175     llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
1176     llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);
1177 
1178     llvm::Type *ResultType = ConvertType(E->getType());
1179 
1180     llvm::Value *Insert0 = Builder.CreateInsertElement(
1181         llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
1182     llvm::Value *AsVector =
1183         Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
1184     return AsVector;
1185   }
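The repacking pattern above is generic: extract both struct members, then build the vector up from poison. A self-contained sketch of the same marshaling (the helper name is illustrative, not from this file):

```cpp
#include <cstdint>
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

// Repack an aggregate {i32, i32} value into a <2 x i32> vector, mirroring
// the extractvalue/insertelement sequence emitted above.
static llvm::Value *packPairAsVector(llvm::IRBuilderBase &B,
                                     llvm::Value *Pair,
                                     llvm::FixedVectorType *VecTy) {
  llvm::Value *Lo = B.CreateExtractValue(Pair, 0);
  llvm::Value *Hi = B.CreateExtractValue(Pair, 1);
  llvm::Value *V =
      B.CreateInsertElement(llvm::PoisonValue::get(VecTy), Lo, UINT64_C(0));
  return B.CreateInsertElement(V, Hi, UINT64_C(1));
}
```

Starting from poison rather than undef or zero lets later passes drop the first insert's dependency entirely, which is why the listing does the same.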
1186   case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
1187   case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
1188     return emitBuiltinWithOneOverloadedType<4>(*this, E,
1189                                                Intrinsic::amdgcn_bitop3);
1190   case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
1191     // TODO: LLVM has this overloaded to allow for fat pointers, but since
1192     // those haven't been plumbed through to Clang yet, default to creating the
1193     // resource type.
1194     SmallVector<Value *, 4> Args;
1195     for (unsigned I = 0; I < 4; ++I)
1196       Args.push_back(EmitScalarExpr(E->getArg(I)));
1197     llvm::PointerType *RetTy = llvm::PointerType::get(
1198         Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE);
1199     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
1200                                    {RetTy, Args[0]->getType()});
1201     return Builder.CreateCall(F, Args);
1202   }
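At the source level this builtin yields Clang's `__amdgpu_buffer_rsrc_t`, which corresponds to the addrspace(8) pointer constructed here. A sketch of a call and its assumed lowering (the argument types are my reading of the intrinsic, not restated in this file):

```cpp
__amdgpu_buffer_rsrc_t make_rsrc(void *base, short stride, int num,
                                 int flags) {
  // Expected lowering (roughly), overloaded on the result and base pointer:
  //   %r = call ptr addrspace(8)
  //        @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %base, i16 %stride,
  //                                            i32 %num, i32 %flags)
  return __builtin_amdgcn_make_buffer_rsrc(base, stride, num, flags);
}
```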
1203   case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
1204   case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
1205   case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
1206   case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
1207   case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
1208   case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
1209     return emitBuiltinWithOneOverloadedType<5>(
1210         *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
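The store forms all funnel through one helper that overloads the intrinsic on the data operand (the first argument). A sketch for the b128 case, with assumed source-level types:

```cpp
typedef unsigned uint4 __attribute__((ext_vector_type(4)));

void store_b128(uint4 data, __amdgpu_buffer_rsrc_t rsrc, int off) {
  // -> call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(
  //        <4 x i32> %data, ptr addrspace(8) %rsrc,
  //        i32 %off, i32 0, i32 0)
  __builtin_amdgcn_raw_buffer_store_b128(data, rsrc, off, /*soffset=*/0,
                                         /*aux=*/0);
}
```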
1211   case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1212   case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1213   case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1214   case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1215   case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1216   case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
1217     llvm::Type *RetTy = nullptr;
1218     switch (BuiltinID) {
1219     case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1220       RetTy = Int8Ty;
1221       break;
1222     case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1223       RetTy = Int16Ty;
1224       break;
1225     case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1226       RetTy = Int32Ty;
1227       break;
1228     case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1229       RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
1230       break;
1231     case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1232       RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
1233       break;
1234     case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
1235       RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
1236       break;
1237     }
1238     Function *F =
1239         CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
1240     return Builder.CreateCall(
1241         F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
1242             EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
1243   }
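On the load side the overload is the return type chosen above, so the sub-dword forms come back as i8/i16 and the wide forms as <N x i32>. A sketch for b96, with assumed source-level types:

```cpp
typedef unsigned uint3 __attribute__((ext_vector_type(3)));

uint3 load_b96(__amdgpu_buffer_rsrc_t rsrc, int off) {
  // -> %r = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(
  //             ptr addrspace(8) %rsrc, i32 %off, i32 0, i32 0)
  return __builtin_amdgcn_raw_buffer_load_b96(rsrc, off, /*soffset=*/0,
                                              /*aux=*/0);
}
```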
1244   case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
1245     return emitBuiltinWithOneOverloadedType<2>(
1246         *this, E, Intrinsic::amdgcn_s_prefetch_data);
1247   case Builtin::BIlogbf:
1248   case Builtin::BI__builtin_logbf: {
1249     Value *Src0 = EmitScalarExpr(E->getArg(0));
1250     Function *FrExpFunc = CGM.getIntrinsic(
1251         Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1252     CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
1253     Value *Exp = Builder.CreateExtractValue(FrExp, 1);
1254     Value *Add = Builder.CreateAdd(
1255         Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
1256     Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy());
1257     Value *Fabs =
1258         emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1259     Value *FCmpONE = Builder.CreateFCmpONE(
1260         Fabs, ConstantFP::getInfinity(Builder.getFloatTy()));
1261     Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
1262     Value *FCmpOEQ =
1263         Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getFloatTy()));
1264     Value *Sel2 = Builder.CreateSelect(
1265         FCmpOEQ,
1266         ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true), Sel1);
1267     return Sel2;
1268   }
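The select chain implements logb in terms of frexp: frexp decomposes x = m * 2^e with 0.5 <= |m| < 1, so logb(x) == e - 1 for finite nonzero x; infinities and NaNs pass through fabs unchanged, and +/-0 maps to -inf. A scalar C++ reference for the same logic (a sketch of the semantics, not the emitted IR); the double-precision case that follows repeats this shape with double types:

```cpp
float logbf_ref(float x) {
  int e;
  (void)__builtin_frexpf(x, &e); // x == m * 2^e, 0.5 <= |m| < 1
  float ax = __builtin_fabsf(x);
  // FCmpONE(|x|, +inf) is an ordered compare, true only for finite |x|,
  // so inf and nan fall through as |x| itself (|nan| is still nan).
  float r = __builtin_isfinite(ax) ? (float)(e - 1) : ax;
  return (x == 0.0f) ? -__builtin_inff() : r; // logb(+/-0) == -inf
}
```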
1269   case Builtin::BIlogb:
1270   case Builtin::BI__builtin_logb: {
1271     Value *Src0 = EmitScalarExpr(E->getArg(0));
1272     Function *FrExpFunc = CGM.getIntrinsic(
1273         Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1274     CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
1275     Value *Exp = Builder.CreateExtractValue(FrExp, 1);
1276     Value *Add = Builder.CreateAdd(
1277         Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
1278     Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy());
1279     Value *Fabs =
1280         emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1281     Value *FCmpONE = Builder.CreateFCmpONE(
1282         Fabs, ConstantFP::getInfinity(Builder.getDoubleTy()));
1283     Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
1284     Value *FCmpOEQ =
1285         Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getDoubleTy()));
1286     Value *Sel2 = Builder.CreateSelect(
1287         FCmpOEQ,
1288         ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true),
1289         Sel1);
1290     return Sel2;
1291   }
1292   case Builtin::BIscalbnf:
1293   case Builtin::BI__builtin_scalbnf:
1294   case Builtin::BIscalbn:
1295   case Builtin::BI__builtin_scalbn:
1296     return emitBinaryExpMaybeConstrainedFPBuiltin(
1297         *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp);
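scalbn(x, n) scales by FLT_RADIX^n, and with FLT_RADIX == 2 (true for AMDGPU floating-point types) that is exactly ldexp, so both families map to the same intrinsic, with the constrained variant chosen under strict FP. A sketch of the lowering:

```cpp
float scale(float x, int n) {
  // Default FP environment:
  //   %r = call float @llvm.ldexp.f32.i32(float %x, i32 %n)
  // Under strict FP this becomes
  //   @llvm.experimental.constrained.ldexp.f32.i32 instead.
  return __builtin_scalbnf(x, n);
}
```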
1298   default:
1299     return nullptr;
1300   }
1301 }
1302