//===-------- NVPTX.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "CGBuiltin.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm;

namespace {
// Helper classes for mapping MMA builtins to a particular LLVM intrinsic
// variant.
struct NVPTXMmaLdstInfo {
  unsigned NumResults; // Number of elements to load/store
  // Intrinsic IDs for row/col variants. 0 if particular layout is unsupported.
  unsigned IID_col;
  unsigned IID_row;
};

#define MMA_INTR(geom_op_type, layout) \
  Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
#define MMA_LDST(n, geom_op_type) \
  { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }
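
// For illustration, MMA_LDST(8, m16n16k16_load_a_f16) expands to
//   {8, Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride,
//       Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride}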

static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
  switch (BuiltinID) {
  // FP MMA loads
  case NVPTX::BI__hmma_m16n16k16_ld_a:
    return MMA_LDST(8, m16n16k16_load_a_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_b:
    return MMA_LDST(8, m16n16k16_load_b_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
    return MMA_LDST(4, m16n16k16_load_c_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
    return MMA_LDST(8, m16n16k16_load_c_f32);
  case NVPTX::BI__hmma_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_b:
    return MMA_LDST(8, m32n8k16_load_b_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
    return MMA_LDST(4, m32n8k16_load_c_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
    return MMA_LDST(8, m32n8k16_load_c_f32);
  case NVPTX::BI__hmma_m8n32k16_ld_a:
    return MMA_LDST(8, m8n32k16_load_a_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
    return MMA_LDST(4, m8n32k16_load_c_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
    return MMA_LDST(8, m8n32k16_load_c_f32);

  // Integer MMA loads
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
    return MMA_LDST(2, m16n16k16_load_a_s8);
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
    return MMA_LDST(2, m16n16k16_load_a_u8);
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
    return MMA_LDST(2, m16n16k16_load_b_s8);
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
    return MMA_LDST(2, m16n16k16_load_b_u8);
  case NVPTX::BI__imma_m16n16k16_ld_c:
    return MMA_LDST(8, m16n16k16_load_c_s32);
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
    return MMA_LDST(4, m32n8k16_load_a_s8);
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
    return MMA_LDST(4, m32n8k16_load_a_u8);
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
    return MMA_LDST(1, m32n8k16_load_b_s8);
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
    return MMA_LDST(1, m32n8k16_load_b_u8);
  case NVPTX::BI__imma_m32n8k16_ld_c:
    return MMA_LDST(8, m32n8k16_load_c_s32);
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
    return MMA_LDST(1, m8n32k16_load_a_s8);
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
    return MMA_LDST(1, m8n32k16_load_a_u8);
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
    return MMA_LDST(4, m8n32k16_load_b_s8);
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
    return MMA_LDST(4, m8n32k16_load_b_u8);
  case NVPTX::BI__imma_m8n32k16_ld_c:
    return MMA_LDST(8, m8n32k16_load_c_s32);

  // Sub-integer MMA loads.
  // Only row/col layout is supported by A/B fragments.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
    return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
    return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_c:
    return MMA_LDST(2, m8n8k32_load_c_s32);
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
    return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
    return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
  case NVPTX::BI__bmma_m8n8k128_ld_c:
    return MMA_LDST(2, m8n8k128_load_c_s32);

  // Double MMA loads
  case NVPTX::BI__dmma_m8n8k4_ld_a:
    return MMA_LDST(1, m8n8k4_load_a_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_b:
    return MMA_LDST(1, m8n8k4_load_b_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_c:
    return MMA_LDST(2, m8n8k4_load_c_f64);

  // Alternate float MMA loads
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
    return MMA_LDST(4, m16n16k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
    return MMA_LDST(4, m16n16k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
    return MMA_LDST(2, m8n32k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
    return MMA_LDST(2, m32n8k16_load_b_bf16);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
    return MMA_LDST(4, m16n16k8_load_a_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
    return MMA_LDST(4, m16n16k8_load_b_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c:
    return MMA_LDST(8, m16n16k8_load_c_f32);

  // NOTE: We need to follow the inconsistent naming scheme used by NVCC.
  // Unlike PTX and LLVM IR where stores always use fragment D, NVCC builtins
  // always use fragment C for both loads and stores.
  // FP MMA stores.
  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
    return MMA_LDST(4, m16n16k16_store_d_f16);
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
    return MMA_LDST(8, m16n16k16_store_d_f32);
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
    return MMA_LDST(4, m32n8k16_store_d_f16);
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
    return MMA_LDST(8, m32n8k16_store_d_f32);
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
    return MMA_LDST(4, m8n32k16_store_d_f16);
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
    return MMA_LDST(8, m8n32k16_store_d_f32);

  // Integer and sub-integer MMA stores.
  // Another naming quirk: unlike other MMA builtins that use PTX types in the
  // name, integer loads/stores use LLVM's i32.
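  // e.g., __imma_m16n16k16_st_c_i32 ("i32" in the builtin name) maps to
  // nvvm_wmma_m16n16k16_store_d_s32 ("s32" in the intrinsic name).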
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
    return MMA_LDST(8, m16n16k16_store_d_s32);
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
    return MMA_LDST(8, m32n8k16_store_d_s32);
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
    return MMA_LDST(8, m8n32k16_store_d_s32);
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
    return MMA_LDST(2, m8n8k32_store_d_s32);
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
    return MMA_LDST(2, m8n8k128_store_d_s32);

  // Double MMA store
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
    return MMA_LDST(2, m8n8k4_store_d_f64);

  // Alternate float MMA store
  case NVPTX::BI__mma_m16n16k8_st_c_f32:
    return MMA_LDST(8, m16n16k8_store_d_f32);

  default:
    llvm_unreachable("Unknown MMA builtin");
  }
}
#undef MMA_LDST
#undef MMA_INTR

struct NVPTXMmaInfo {
  unsigned NumEltsA;
  unsigned NumEltsB;
  unsigned NumEltsC;
  unsigned NumEltsD;

  // Variants are ordered by layout-A/layout-B/satf, where 'row' has priority
  // over 'col' for layout. The index of non-satf variants is expected to match
  // the undocumented layout constants used by CUDA's mma.hpp.
  std::array<unsigned, 8> Variants;

  unsigned getMMAIntrinsic(int Layout, bool Satf) {
    unsigned Index = Layout + 4 * Satf;
    if (Index >= Variants.size())
      return 0;
    return Variants[Index];
  }
};

// Returns the NVPTXMmaInfo for the given builtin. Its getMMAIntrinsic() yields
// an intrinsic that matches Layout and Satf for valid combinations of the two,
// and 0 otherwise.
static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
  // clang-format off
#define MMA_VARIANTS(geom, type) \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type
#define MMA_SATF_VARIANTS(geom, type) \
      MMA_VARIANTS(geom, type), \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite
// Sub-integer MMA only supports row.col layout.
#define MMA_VARIANTS_I4(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \
      0, \
      0, \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      0, \
      0
// b1 MMA does not support .satfinite.
#define MMA_VARIANTS_B1_XOR(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_xor_popc_row_col_##type, \
      0, \
      0, \
      0, \
      0, \
      0, \
      0
#define MMA_VARIANTS_B1_AND(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_and_popc_row_col_##type, \
      0, \
      0, \
      0, \
      0, \
      0, \
      0
  // clang-format on
  switch (BuiltinID) {
  // FP MMA
  // Note that the 'type' argument of MMA_SATF_VARIANTS uses D_C notation,
  // while the NumEltsN fields of the return value are ordered as A,B,C,D.
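  // e.g., f32_f16 selects the variant whose D fragment is f32 and whose C
  // fragment is f16, so __hmma_m16n16k16_mma_f32f16 yields {8, 8, 4, 8}.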
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f32)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f32)}}};

  // Integer MMA
  case NVPTX::BI__imma_m16n16k16_mma_s8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, s8)}}};
  case NVPTX::BI__imma_m16n16k16_mma_u8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, u8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_s8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, s8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_u8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, u8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_s8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, s8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_u8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, u8)}}};

  // Sub-integer MMA
  case NVPTX::BI__imma_m8n8k32_mma_s4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, s4)}}};
  case NVPTX::BI__imma_m8n8k32_mma_u4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, u4)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_XOR(m8n8k128, b1)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_AND(m8n8k128, b1)}}};

  // Double MMA
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
    return {1, 1, 2, 2, {{MMA_VARIANTS(m8n8k4, f64)}}};

  // Alternate FP MMA
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
    return {2, 8, 8, 8, {{MMA_VARIANTS(m8n32k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
    return {8, 2, 8, 8, {{MMA_VARIANTS(m32n8k16, bf16)}}};
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k8, tf32)}}};
  default:
    llvm_unreachable("Unexpected builtin ID.");
  }
#undef MMA_VARIANTS
#undef MMA_SATF_VARIANTS
#undef MMA_VARIANTS_I4
#undef MMA_VARIANTS_B1_AND
#undef MMA_VARIANTS_B1_XOR
}
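
// For illustration, getNVPTXMmaInfo(NVPTX::BI__hmma_m16n16k16_mma_f32f32)
//   .getMMAIntrinsic(/*Layout=*/1, /*Satf=*/true)
// returns Variants[1 + 4 * 1], i.e.
// Intrinsic::nvvm_wmma_m16n16k16_mma_row_col_f32_f32_satfinite.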

static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
                      const CallExpr *E) {
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  QualType ArgType = E->getArg(0)->getType();
  clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
  return CGF.Builder.CreateCall(
      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
      {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
}

static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) {
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  QualType ArgType = E->getArg(0)->getType();
  clang::CharUnits AlignV = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());

  // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
  auto *ASC = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
  auto *LD = CGF.Builder.CreateAlignedLoad(ElemTy, ASC, AlignV.getAsAlign());
  MDNode *MD = MDNode::get(CGF.Builder.getContext(), {});
  LD->setMetadata(LLVMContext::MD_invariant_load, MD);

  return LD;
}
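
// For reference, MakeLdg above emits roughly the following IR for
// __nvvm_ldg_f (with the natural alignment of 4 for float):
//   %global = addrspacecast ptr %p to ptr addrspace(1)
//   %v = load float, ptr addrspace(1) %global, align 4, !invariant.load !0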

// Emits a call to a scoped (cta/sys) atomic intrinsic, overloaded on the
// pointee type and the pointer type.
static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
                               const CallExpr *E) {
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Type *ElemTy =
      CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
  return CGF.Builder.CreateCall(
      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
      {Ptr, CGF.EmitScalarExpr(E->getArg(1))});
}

// Emits a cp.async intrinsic call. When the builtin is invoked with a third
// argument, the three-operand IntrinsicIDS variant is selected instead of
// IntrinsicID.
static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
                          CodeGenFunction &CGF, const CallExpr *E,
                          int SrcSize) {
  return E->getNumArgs() == 3
             ? CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicIDS),
                                      {CGF.EmitScalarExpr(E->getArg(0)),
                                       CGF.EmitScalarExpr(E->getArg(1)),
                                       CGF.EmitScalarExpr(E->getArg(2))})
             : CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
                                      {CGF.EmitScalarExpr(E->getArg(0)),
                                       CGF.EmitScalarExpr(E->getArg(1))});
}

// Emits builtins that require native half type support, diagnosing when it is
// unavailable.
static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
                           const CallExpr *E, CodeGenFunction &CGF) {
  auto &C = CGF.CGM.getContext();
  if (!(C.getLangOpts().NativeHalfType ||
        !C.getTargetInfo().useFP16ConversionIntrinsics())) {
    CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) +
                                       " requires native half type support.");
    return nullptr;
  }

  if (BuiltinID == NVPTX::BI__nvvm_ldg_h ||
      BuiltinID == NVPTX::BI__nvvm_ldg_h2)
    return MakeLdg(CGF, E);

  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
    return MakeLdu(IntrinsicID, CGF, E);

  SmallVector<Value *, 16> Args;
  auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
  auto *FTy = F->getFunctionType();
  unsigned ICEArguments = 0;
  ASTContext::GetBuiltinTypeError Error;
  C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
  assert(Error == ASTContext::GE_None && "Should not codegen an error");
  for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
    assert((ICEArguments & (1 << i)) == 0);
    auto *ArgValue = CGF.EmitScalarExpr(E->getArg(i));
    auto *PTy = FTy->getParamType(i);
    if (PTy != ArgValue->getType())
      ArgValue = CGF.Builder.CreateBitCast(ArgValue, PTy);
    Args.push_back(ArgValue);
  }

  return CGF.Builder.CreateCall(F, Args);
}
} // namespace

Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
                                             const CallExpr *E) {
  switch (BuiltinID) {
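  // The gen_* binary atomics below lower to ordinary LLVM atomicrmw
  // instructions via MakeBinaryAtomicValue; e.g., __nvvm_atom_add_gen_i(p, v)
  // emits roughly: atomicrmw add ptr %p, i32 %v seq_cst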
  case NVPTX::BI__nvvm_atom_add_gen_i:
  case NVPTX::BI__nvvm_atom_add_gen_l:
  case NVPTX::BI__nvvm_atom_add_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);

  case NVPTX::BI__nvvm_atom_sub_gen_i:
  case NVPTX::BI__nvvm_atom_sub_gen_l:
  case NVPTX::BI__nvvm_atom_sub_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);

  case NVPTX::BI__nvvm_atom_and_gen_i:
  case NVPTX::BI__nvvm_atom_and_gen_l:
  case NVPTX::BI__nvvm_atom_and_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);

  case NVPTX::BI__nvvm_atom_or_gen_i:
  case NVPTX::BI__nvvm_atom_or_gen_l:
  case NVPTX::BI__nvvm_atom_or_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);

  case NVPTX::BI__nvvm_atom_xor_gen_i:
  case NVPTX::BI__nvvm_atom_xor_gen_l:
  case NVPTX::BI__nvvm_atom_xor_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);

  case NVPTX::BI__nvvm_atom_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_xchg_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);

  case NVPTX::BI__nvvm_atom_max_gen_i:
  case NVPTX::BI__nvvm_atom_max_gen_l:
  case NVPTX::BI__nvvm_atom_max_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);

  case NVPTX::BI__nvvm_atom_max_gen_ui:
  case NVPTX::BI__nvvm_atom_max_gen_ul:
  case NVPTX::BI__nvvm_atom_max_gen_ull:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);

  case NVPTX::BI__nvvm_atom_min_gen_i:
  case NVPTX::BI__nvvm_atom_min_gen_l:
  case NVPTX::BI__nvvm_atom_min_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);

  case NVPTX::BI__nvvm_atom_min_gen_ui:
  case NVPTX::BI__nvvm_atom_min_gen_ul:
  case NVPTX::BI__nvvm_atom_min_gen_ull:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);

  case NVPTX::BI__nvvm_atom_cas_gen_us:
  case NVPTX::BI__nvvm_atom_cas_gen_i:
  case NVPTX::BI__nvvm_atom_cas_gen_l:
  case NVPTX::BI__nvvm_atom_cas_gen_ll:
    // __nvvm_atom_cas_gen_* should return the old value rather than the
    // success flag.
    return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);

  case NVPTX::BI__nvvm_atom_add_gen_f:
  case NVPTX::BI__nvvm_atom_add_gen_d: {
    Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
    Value *Val = EmitScalarExpr(E->getArg(1));

    return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, DestAddr, Val,
                                   AtomicOrdering::SequentiallyConsistent);
  }

  case NVPTX::BI__nvvm_atom_inc_gen_ui:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E);

  case NVPTX::BI__nvvm_atom_dec_gen_ui:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E);

  case NVPTX::BI__nvvm_ldg_c:
  case NVPTX::BI__nvvm_ldg_sc:
  case NVPTX::BI__nvvm_ldg_c2:
  case NVPTX::BI__nvvm_ldg_sc2:
  case NVPTX::BI__nvvm_ldg_c4:
  case NVPTX::BI__nvvm_ldg_sc4:
  case NVPTX::BI__nvvm_ldg_s:
  case NVPTX::BI__nvvm_ldg_s2:
  case NVPTX::BI__nvvm_ldg_s4:
  case NVPTX::BI__nvvm_ldg_i:
  case NVPTX::BI__nvvm_ldg_i2:
  case NVPTX::BI__nvvm_ldg_i4:
  case NVPTX::BI__nvvm_ldg_l:
  case NVPTX::BI__nvvm_ldg_l2:
  case NVPTX::BI__nvvm_ldg_ll:
  case NVPTX::BI__nvvm_ldg_ll2:
  case NVPTX::BI__nvvm_ldg_uc:
  case NVPTX::BI__nvvm_ldg_uc2:
  case NVPTX::BI__nvvm_ldg_uc4:
  case NVPTX::BI__nvvm_ldg_us:
  case NVPTX::BI__nvvm_ldg_us2:
  case NVPTX::BI__nvvm_ldg_us4:
  case NVPTX::BI__nvvm_ldg_ui:
  case NVPTX::BI__nvvm_ldg_ui2:
  case NVPTX::BI__nvvm_ldg_ui4:
  case NVPTX::BI__nvvm_ldg_ul:
  case NVPTX::BI__nvvm_ldg_ul2:
  case NVPTX::BI__nvvm_ldg_ull:
  case NVPTX::BI__nvvm_ldg_ull2:
  case NVPTX::BI__nvvm_ldg_f:
  case NVPTX::BI__nvvm_ldg_f2:
  case NVPTX::BI__nvvm_ldg_f4:
  case NVPTX::BI__nvvm_ldg_d:
  case NVPTX::BI__nvvm_ldg_d2:
    // PTX Interoperability section 2.2: "For a vector with an even number of
    // elements, its alignment is set to number of elements times the alignment
    // of its member: n*alignof(t)."
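    // e.g., a __nvvm_ldg_f4 load is therefore emitted with align 16
    // (4 * alignof(float)).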
    return MakeLdg(*this, E);

  case NVPTX::BI__nvvm_ldu_c:
  case NVPTX::BI__nvvm_ldu_sc:
  case NVPTX::BI__nvvm_ldu_c2:
  case NVPTX::BI__nvvm_ldu_sc2:
  case NVPTX::BI__nvvm_ldu_c4:
  case NVPTX::BI__nvvm_ldu_sc4:
  case NVPTX::BI__nvvm_ldu_s:
  case NVPTX::BI__nvvm_ldu_s2:
  case NVPTX::BI__nvvm_ldu_s4:
  case NVPTX::BI__nvvm_ldu_i:
  case NVPTX::BI__nvvm_ldu_i2:
  case NVPTX::BI__nvvm_ldu_i4:
  case NVPTX::BI__nvvm_ldu_l:
  case NVPTX::BI__nvvm_ldu_l2:
  case NVPTX::BI__nvvm_ldu_ll:
  case NVPTX::BI__nvvm_ldu_ll2:
  case NVPTX::BI__nvvm_ldu_uc:
  case NVPTX::BI__nvvm_ldu_uc2:
  case NVPTX::BI__nvvm_ldu_uc4:
  case NVPTX::BI__nvvm_ldu_us:
  case NVPTX::BI__nvvm_ldu_us2:
  case NVPTX::BI__nvvm_ldu_us4:
  case NVPTX::BI__nvvm_ldu_ui:
  case NVPTX::BI__nvvm_ldu_ui2:
  case NVPTX::BI__nvvm_ldu_ui4:
  case NVPTX::BI__nvvm_ldu_ul:
  case NVPTX::BI__nvvm_ldu_ul2:
  case NVPTX::BI__nvvm_ldu_ull:
  case NVPTX::BI__nvvm_ldu_ull2:
    return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
  case NVPTX::BI__nvvm_ldu_f:
  case NVPTX::BI__nvvm_ldu_f2:
  case NVPTX::BI__nvvm_ldu_f4:
  case NVPTX::BI__nvvm_ldu_d:
  case NVPTX::BI__nvvm_ldu_d2:
    return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);

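  // Scoped (cta/sys) atomics lower to NVVM intrinsics overloaded on the
  // element and pointer types; e.g., __nvvm_atom_cta_add_gen_i(p, v) calls
  // the nvvm_atomic_add_gen_i_cta intrinsic instantiated for (i32, ptr).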
  case NVPTX::BI__nvvm_atom_cta_add_gen_i:
  case NVPTX::BI__nvvm_atom_cta_add_gen_l:
  case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_add_gen_i:
  case NVPTX::BI__nvvm_atom_sys_add_gen_l:
  case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_add_gen_f:
  case NVPTX::BI__nvvm_atom_cta_add_gen_d:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_add_gen_f:
  case NVPTX::BI__nvvm_atom_sys_add_gen_d:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_max_gen_i:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
  case NVPTX::BI__nvvm_atom_cta_max_gen_l:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_max_gen_i:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
  case NVPTX::BI__nvvm_atom_sys_max_gen_l:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_min_gen_i:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
  case NVPTX::BI__nvvm_atom_cta_min_gen_l:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_min_gen_i:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
  case NVPTX::BI__nvvm_atom_sys_min_gen_l:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_and_gen_i:
  case NVPTX::BI__nvvm_atom_cta_and_gen_l:
  case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_and_gen_i:
  case NVPTX::BI__nvvm_atom_sys_and_gen_l:
  case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_or_gen_i:
  case NVPTX::BI__nvvm_atom_cta_or_gen_l:
  case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_or_gen_i:
  case NVPTX::BI__nvvm_atom_sys_or_gen_l:
  case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
  case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
  case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
  case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
  case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_cas_gen_us:
  case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
  case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
  case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    llvm::Type *ElemTy =
        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_atomic_cas_gen_i_cta,
                         {ElemTy, Ptr->getType()}),
        {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
  }
  case NVPTX::BI__nvvm_atom_sys_cas_gen_us:
  case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
  case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
  case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    llvm::Type *ElemTy =
        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_atomic_cas_gen_i_sys,
                         {ElemTy, Ptr->getType()}),
        {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
  }
  case NVPTX::BI__nvvm_match_all_sync_i32p:
  case NVPTX::BI__nvvm_match_all_sync_i64p: {
    Value *Mask = EmitScalarExpr(E->getArg(0));
    Value *Val = EmitScalarExpr(E->getArg(1));
    Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
    Value *ResultPair = Builder.CreateCall(
        CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
                             ? Intrinsic::nvvm_match_all_sync_i32p
                             : Intrinsic::nvvm_match_all_sync_i64p),
        {Mask, Val});
    Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
                                     PredOutPtr.getElementType());
    Builder.CreateStore(Pred, PredOutPtr);
    return Builder.CreateExtractValue(ResultPair, 0);
  }

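  // MMA loads: each builtin below copies a matrix fragment from memory by
  // calling the matching wmma.load intrinsic and storing its NumResults
  // return values through the destination pointer; e.g.,
  // __hmma_m16n16k16_ld_a(dst, src, ldm, 0) calls
  // nvvm_wmma_m16n16k16_load_a_f16_row_stride(src, ldm) and stores 8 values.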
  // FP MMA loads
  case NVPTX::BI__hmma_m16n16k16_ld_a:
  case NVPTX::BI__hmma_m16n16k16_ld_b:
  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
  case NVPTX::BI__hmma_m32n8k16_ld_a:
  case NVPTX::BI__hmma_m32n8k16_ld_b:
  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
  case NVPTX::BI__hmma_m8n32k16_ld_a:
  case NVPTX::BI__hmma_m8n32k16_ld_b:
  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
  // Integer MMA loads.
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
  case NVPTX::BI__imma_m16n16k16_ld_c:
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
  case NVPTX::BI__imma_m32n8k16_ld_c:
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
  case NVPTX::BI__imma_m8n32k16_ld_c:
  // Sub-integer MMA loads.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
  case NVPTX::BI__imma_m8n8k32_ld_c:
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
  case NVPTX::BI__bmma_m8n8k128_ld_c:
  // Double MMA loads.
  case NVPTX::BI__dmma_m8n8k4_ld_a:
  case NVPTX::BI__dmma_m8n8k4_ld_b:
  case NVPTX::BI__dmma_m8n8k4_ld_c:
  // Alternate float MMA loads.
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c: {
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Value *Src = EmitScalarExpr(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    if (IID == 0)
      return nullptr;

    Value *Result =
        Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});

    // Save returned values.
    assert(II.NumResults);
    if (II.NumResults == 1) {
      Builder.CreateAlignedStore(Result, Dst.emitRawPointer(*this),
                                 CharUnits::fromQuantity(4));
    } else {
      for (unsigned i = 0; i < II.NumResults; ++i) {
        Builder.CreateAlignedStore(
            Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
                                  Dst.getElementType()),
            Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                              llvm::ConstantInt::get(IntTy, i)),
            CharUnits::fromQuantity(4));
      }
    }
    return Result;
  }

  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
  case NVPTX::BI__mma_m16n16k8_st_c_f32: {
    Value *Dst = EmitScalarExpr(E->getArg(0));
    Address Src = EmitPointerWithAlignment(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    if (IID == 0)
      return nullptr;
    Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
    llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
    SmallVector<Value *, 10> Values = {Dst};
    for (unsigned i = 0; i < II.NumResults; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          Src.getElementType(),
          Builder.CreateGEP(Src.getElementType(), Src.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, ParamType));
    }
    Values.push_back(Ldm);
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    return Result;
  }

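  // e.g., __hmma_m16n16k16_st_c_f16(dst, src, ldm, 0) loads 4 fragment values
  // from src and passes them to nvvm_wmma_m16n16k16_store_d_f16_row_stride.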

  // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
  // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
  case NVPTX::BI__imma_m16n16k16_mma_s8:
  case NVPTX::BI__imma_m16n16k16_mma_u8:
  case NVPTX::BI__imma_m32n8k16_mma_s8:
  case NVPTX::BI__imma_m32n8k16_mma_u8:
  case NVPTX::BI__imma_m8n32k16_mma_s8:
  case NVPTX::BI__imma_m8n32k16_mma_u8:
  case NVPTX::BI__imma_m8n8k32_mma_s4:
  case NVPTX::BI__imma_m8n8k32_mma_u4:
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32: {
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Address SrcA = EmitPointerWithAlignment(E->getArg(1));
    Address SrcB = EmitPointerWithAlignment(E->getArg(2));
    Address SrcC = EmitPointerWithAlignment(E->getArg(3));
    std::optional<llvm::APSInt> LayoutArg =
        E->getArg(4)->getIntegerConstantExpr(getContext());
    if (!LayoutArg)
      return nullptr;
    int Layout = LayoutArg->getSExtValue();
    if (Layout < 0 || Layout > 3)
      return nullptr;
    llvm::APSInt SatfArg;
    if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1 ||
        BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1)
      SatfArg = 0; // .b1 does not have satf argument.
    else if (std::optional<llvm::APSInt> OptSatfArg =
                 E->getArg(5)->getIntegerConstantExpr(getContext()))
      SatfArg = *OptSatfArg;
    else
      return nullptr;
    bool Satf = SatfArg.getSExtValue();
    NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
    unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
    if (IID == 0) // Unsupported combination of Layout/Satf.
      return nullptr;

    SmallVector<Value *, 24> Values;
    Function *Intrinsic = CGM.getIntrinsic(IID);
    llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
    // Load A
    for (unsigned i = 0; i < MI.NumEltsA; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcA.getElementType(),
          Builder.CreateGEP(SrcA.getElementType(), SrcA.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, AType));
    }
    // Load B
    llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
    for (unsigned i = 0; i < MI.NumEltsB; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcB.getElementType(),
          Builder.CreateGEP(SrcB.getElementType(), SrcB.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, BType));
    }
    // Load C
    llvm::Type *CType =
        Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
    for (unsigned i = 0; i < MI.NumEltsC; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcC.getElementType(),
          Builder.CreateGEP(SrcC.getElementType(), SrcC.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, CType));
    }
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    llvm::Type *DType = Dst.getElementType();
    for (unsigned i = 0; i < MI.NumEltsD; ++i)
      Builder.CreateAlignedStore(
          Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
          Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
    return Result;
  }
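  // e.g., __hmma_m16n16k16_mma_f32f32(d, a, b, c, /*layout=*/1, /*satf=*/0)
  // loads 8 A, 8 B, and 8 C operands, calls
  // nvvm_wmma_m16n16k16_mma_row_col_f32_f32, and stores its 8 results to d.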

  // The following builtins require half type support
  case NVPTX::BI__nvvm_ex2_approx_f16:
    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ex2_approx_f16x2:
    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fabs_f:
  case NVPTX::BI__nvvm_abs_bf16:
  case NVPTX::BI__nvvm_abs_bf16x2:
  case NVPTX::BI__nvvm_fabs_f16:
  case NVPTX::BI__nvvm_fabs_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_fabs_ftz_f:
  case NVPTX::BI__nvvm_fabs_ftz_f16:
  case NVPTX::BI__nvvm_fabs_ftz_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs_ftz,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_fabs_d:
    return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_ldg_h:
  case NVPTX::BI__nvvm_ldg_h2:
    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ldu_h:
  case NVPTX::BI__nvvm_ldu_h2:
    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
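  // The cp.async builtins below take (dst, src) and an optional third
  // argument; three-argument calls select the *_s intrinsic variant (see
  // MakeCpAsync above).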
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                       Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
                       4);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8,
                       Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E,
                       8);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16,
                       Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this, E,
                       16);
  case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16,
                       Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this, E,
                       16);
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank));
  case NVPTX::BI__nvvm_is_explicit_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_is_explicit_cluster));
  case NVPTX::BI__nvvm_isspacep_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_isspacep_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_mapa:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_mapa_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa_shared_cluster),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_getctarank:
    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::nvvm_getctarank),
                              EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_getctarank_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_cluster_arrive:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
  case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
  case NVPTX::BI__nvvm_barrier_cluster_wait:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
  case NVPTX::BI__nvvm_fence_sc_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
  case NVPTX::BI__nvvm_bar_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__syncthreads:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        Builder.getInt32(0));
  case NVPTX::BI__nvvm_barrier_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_sync_cnt:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  default:
    return nullptr;
  }
}