1 //===---------- X86.cpp - Emit LLVM Code for builtins ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This contains code to emit Builtin calls as LLVM code. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "CGBuiltin.h" 14 #include "clang/Basic/TargetBuiltins.h" 15 #include "llvm/IR/InlineAsm.h" 16 #include "llvm/IR/IntrinsicsX86.h" 17 #include "llvm/TargetParser/X86TargetParser.h" 18 19 using namespace clang; 20 using namespace CodeGen; 21 using namespace llvm; 22 23 static std::optional<CodeGenFunction::MSVCIntrin> 24 translateX86ToMsvcIntrin(unsigned BuiltinID) { 25 using MSVCIntrin = CodeGenFunction::MSVCIntrin; 26 switch (BuiltinID) { 27 default: 28 return std::nullopt; 29 case clang::X86::BI_BitScanForward: 30 case clang::X86::BI_BitScanForward64: 31 return MSVCIntrin::_BitScanForward; 32 case clang::X86::BI_BitScanReverse: 33 case clang::X86::BI_BitScanReverse64: 34 return MSVCIntrin::_BitScanReverse; 35 case clang::X86::BI_InterlockedAnd64: 36 return MSVCIntrin::_InterlockedAnd; 37 case clang::X86::BI_InterlockedCompareExchange128: 38 return MSVCIntrin::_InterlockedCompareExchange128; 39 case clang::X86::BI_InterlockedExchange64: 40 return MSVCIntrin::_InterlockedExchange; 41 case clang::X86::BI_InterlockedExchangeAdd64: 42 return MSVCIntrin::_InterlockedExchangeAdd; 43 case clang::X86::BI_InterlockedExchangeSub64: 44 return MSVCIntrin::_InterlockedExchangeSub; 45 case clang::X86::BI_InterlockedOr64: 46 return MSVCIntrin::_InterlockedOr; 47 case clang::X86::BI_InterlockedXor64: 48 return MSVCIntrin::_InterlockedXor; 49 case clang::X86::BI_InterlockedDecrement64: 50 return MSVCIntrin::_InterlockedDecrement; 51 case clang::X86::BI_InterlockedIncrement64: 52 return MSVCIntrin::_InterlockedIncrement; 53 } 54 llvm_unreachable("must return from switch"); 55 } 56 57 // Convert the mask from an integer type to a vector of i1. 58 static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask, 59 unsigned NumElts) { 60 61 auto *MaskTy = llvm::FixedVectorType::get( 62 CGF.Builder.getInt1Ty(), 63 cast<IntegerType>(Mask->getType())->getBitWidth()); 64 Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy); 65 66 // If we have less than 8 elements, then the starting mask was an i8 and 67 // we need to extract down to the right number of elements. 68 if (NumElts < 8) { 69 int Indices[4]; 70 for (unsigned i = 0; i != NumElts; ++i) 71 Indices[i] = i; 72 MaskVec = CGF.Builder.CreateShuffleVector( 73 MaskVec, MaskVec, ArrayRef(Indices, NumElts), "extract"); 74 } 75 return MaskVec; 76 } 77 78 static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops, 79 Align Alignment) { 80 Value *Ptr = Ops[0]; 81 82 Value *MaskVec = getMaskVecValue( 83 CGF, Ops[2], 84 cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements()); 85 86 return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec); 87 } 88 89 static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef<Value *> Ops, 90 Align Alignment) { 91 llvm::Type *Ty = Ops[1]->getType(); 92 Value *Ptr = Ops[0]; 93 94 Value *MaskVec = getMaskVecValue( 95 CGF, Ops[2], cast<llvm::FixedVectorType>(Ty)->getNumElements()); 96 97 return CGF.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, MaskVec, Ops[1]); 98 } 99 100 static Value *EmitX86ExpandLoad(CodeGenFunction &CGF, 101 ArrayRef<Value *> Ops) { 102 auto *ResultTy = cast<llvm::VectorType>(Ops[1]->getType()); 103 Value *Ptr = Ops[0]; 104 105 Value *MaskVec = getMaskVecValue( 106 CGF, Ops[2], cast<FixedVectorType>(ResultTy)->getNumElements()); 107 108 llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload, 109 ResultTy); 110 return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] }); 111 } 112 113 static Value *EmitX86CompressExpand(CodeGenFunction &CGF, 114 ArrayRef<Value *> Ops, 115 bool IsCompress) { 116 auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType()); 117 118 Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); 119 120 Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress 121 : Intrinsic::x86_avx512_mask_expand; 122 llvm::Function *F = CGF.CGM.getIntrinsic(IID, ResultTy); 123 return CGF.Builder.CreateCall(F, { Ops[0], Ops[1], MaskVec }); 124 } 125 126 static Value *EmitX86CompressStore(CodeGenFunction &CGF, 127 ArrayRef<Value *> Ops) { 128 auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType()); 129 Value *Ptr = Ops[0]; 130 131 Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); 132 133 llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore, 134 ResultTy); 135 return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec }); 136 } 137 138 static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc, 139 ArrayRef<Value *> Ops, 140 bool InvertLHS = false) { 141 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); 142 Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts); 143 Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts); 144 145 if (InvertLHS) 146 LHS = CGF.Builder.CreateNot(LHS); 147 148 return CGF.Builder.CreateBitCast(CGF.Builder.CreateBinOp(Opc, LHS, RHS), 149 Ops[0]->getType()); 150 } 151 152 static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1, 153 Value *Amt, bool IsRight) { 154 llvm::Type *Ty = Op0->getType(); 155 156 // Amount may be scalar immediate, in which case create a splat vector. 157 // Funnel shifts amounts are treated as modulo and types are all power-of-2 so 158 // we only care about the lowest log2 bits anyway. 159 if (Amt->getType() != Ty) { 160 unsigned NumElts = cast<llvm::FixedVectorType>(Ty)->getNumElements(); 161 Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false); 162 Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt); 163 } 164 165 unsigned IID = IsRight ? Intrinsic::fshr : Intrinsic::fshl; 166 Function *F = CGF.CGM.getIntrinsic(IID, Ty); 167 return CGF.Builder.CreateCall(F, {Op0, Op1, Amt}); 168 } 169 170 static Value *EmitX86vpcom(CodeGenFunction &CGF, ArrayRef<Value *> Ops, 171 bool IsSigned) { 172 Value *Op0 = Ops[0]; 173 Value *Op1 = Ops[1]; 174 llvm::Type *Ty = Op0->getType(); 175 uint64_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7; 176 177 CmpInst::Predicate Pred; 178 switch (Imm) { 179 case 0x0: 180 Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; 181 break; 182 case 0x1: 183 Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; 184 break; 185 case 0x2: 186 Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; 187 break; 188 case 0x3: 189 Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; 190 break; 191 case 0x4: 192 Pred = ICmpInst::ICMP_EQ; 193 break; 194 case 0x5: 195 Pred = ICmpInst::ICMP_NE; 196 break; 197 case 0x6: 198 return llvm::Constant::getNullValue(Ty); // FALSE 199 case 0x7: 200 return llvm::Constant::getAllOnesValue(Ty); // TRUE 201 default: 202 llvm_unreachable("Unexpected XOP vpcom/vpcomu predicate"); 203 } 204 205 Value *Cmp = CGF.Builder.CreateICmp(Pred, Op0, Op1); 206 Value *Res = CGF.Builder.CreateSExt(Cmp, Ty); 207 return Res; 208 } 209 210 static Value *EmitX86Select(CodeGenFunction &CGF, 211 Value *Mask, Value *Op0, Value *Op1) { 212 213 // If the mask is all ones just return first argument. 214 if (const auto *C = dyn_cast<Constant>(Mask)) 215 if (C->isAllOnesValue()) 216 return Op0; 217 218 Mask = getMaskVecValue( 219 CGF, Mask, cast<llvm::FixedVectorType>(Op0->getType())->getNumElements()); 220 221 return CGF.Builder.CreateSelect(Mask, Op0, Op1); 222 } 223 224 static Value *EmitX86ScalarSelect(CodeGenFunction &CGF, 225 Value *Mask, Value *Op0, Value *Op1) { 226 // If the mask is all ones just return first argument. 227 if (const auto *C = dyn_cast<Constant>(Mask)) 228 if (C->isAllOnesValue()) 229 return Op0; 230 231 auto *MaskTy = llvm::FixedVectorType::get( 232 CGF.Builder.getInt1Ty(), Mask->getType()->getIntegerBitWidth()); 233 Mask = CGF.Builder.CreateBitCast(Mask, MaskTy); 234 Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0); 235 return CGF.Builder.CreateSelect(Mask, Op0, Op1); 236 } 237 238 static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp, 239 unsigned NumElts, Value *MaskIn) { 240 if (MaskIn) { 241 const auto *C = dyn_cast<Constant>(MaskIn); 242 if (!C || !C->isAllOnesValue()) 243 Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts)); 244 } 245 246 if (NumElts < 8) { 247 int Indices[8]; 248 for (unsigned i = 0; i != NumElts; ++i) 249 Indices[i] = i; 250 for (unsigned i = NumElts; i != 8; ++i) 251 Indices[i] = i % NumElts + NumElts; 252 Cmp = CGF.Builder.CreateShuffleVector( 253 Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices); 254 } 255 256 return CGF.Builder.CreateBitCast(Cmp, 257 IntegerType::get(CGF.getLLVMContext(), 258 std::max(NumElts, 8U))); 259 } 260 261 static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC, 262 bool Signed, ArrayRef<Value *> Ops) { 263 assert((Ops.size() == 2 || Ops.size() == 4) && 264 "Unexpected number of arguments"); 265 unsigned NumElts = 266 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 267 Value *Cmp; 268 269 if (CC == 3) { 270 Cmp = Constant::getNullValue( 271 llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts)); 272 } else if (CC == 7) { 273 Cmp = Constant::getAllOnesValue( 274 llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts)); 275 } else { 276 ICmpInst::Predicate Pred; 277 switch (CC) { 278 default: llvm_unreachable("Unknown condition code"); 279 case 0: Pred = ICmpInst::ICMP_EQ; break; 280 case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break; 281 case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break; 282 case 4: Pred = ICmpInst::ICMP_NE; break; 283 case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break; 284 case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break; 285 } 286 Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]); 287 } 288 289 Value *MaskIn = nullptr; 290 if (Ops.size() == 4) 291 MaskIn = Ops[3]; 292 293 return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn); 294 } 295 296 static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) { 297 Value *Zero = Constant::getNullValue(In->getType()); 298 return EmitX86MaskedCompare(CGF, 1, true, { In, Zero }); 299 } 300 301 static Value *EmitX86ConvertIntToFp(CodeGenFunction &CGF, const CallExpr *E, 302 ArrayRef<Value *> Ops, bool IsSigned) { 303 unsigned Rnd = cast<llvm::ConstantInt>(Ops[3])->getZExtValue(); 304 llvm::Type *Ty = Ops[1]->getType(); 305 306 Value *Res; 307 if (Rnd != 4) { 308 Intrinsic::ID IID = IsSigned ? Intrinsic::x86_avx512_sitofp_round 309 : Intrinsic::x86_avx512_uitofp_round; 310 Function *F = CGF.CGM.getIntrinsic(IID, { Ty, Ops[0]->getType() }); 311 Res = CGF.Builder.CreateCall(F, { Ops[0], Ops[3] }); 312 } else { 313 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); 314 Res = IsSigned ? CGF.Builder.CreateSIToFP(Ops[0], Ty) 315 : CGF.Builder.CreateUIToFP(Ops[0], Ty); 316 } 317 318 return EmitX86Select(CGF, Ops[2], Res, Ops[1]); 319 } 320 321 // Lowers X86 FMA intrinsics to IR. 322 static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E, 323 ArrayRef<Value *> Ops, unsigned BuiltinID, 324 bool IsAddSub) { 325 326 bool Subtract = false; 327 Intrinsic::ID IID = Intrinsic::not_intrinsic; 328 switch (BuiltinID) { 329 default: break; 330 case clang::X86::BI__builtin_ia32_vfmsubph512_mask3: 331 Subtract = true; 332 [[fallthrough]]; 333 case clang::X86::BI__builtin_ia32_vfmaddph512_mask: 334 case clang::X86::BI__builtin_ia32_vfmaddph512_maskz: 335 case clang::X86::BI__builtin_ia32_vfmaddph512_mask3: 336 IID = Intrinsic::x86_avx512fp16_vfmadd_ph_512; 337 break; 338 case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3: 339 Subtract = true; 340 [[fallthrough]]; 341 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask: 342 case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz: 343 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3: 344 IID = Intrinsic::x86_avx512fp16_vfmaddsub_ph_512; 345 break; 346 case clang::X86::BI__builtin_ia32_vfmsubps512_mask3: 347 Subtract = true; 348 [[fallthrough]]; 349 case clang::X86::BI__builtin_ia32_vfmaddps512_mask: 350 case clang::X86::BI__builtin_ia32_vfmaddps512_maskz: 351 case clang::X86::BI__builtin_ia32_vfmaddps512_mask3: 352 IID = Intrinsic::x86_avx512_vfmadd_ps_512; break; 353 case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3: 354 Subtract = true; 355 [[fallthrough]]; 356 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask: 357 case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz: 358 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3: 359 IID = Intrinsic::x86_avx512_vfmadd_pd_512; break; 360 case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3: 361 Subtract = true; 362 [[fallthrough]]; 363 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask: 364 case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz: 365 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3: 366 IID = Intrinsic::x86_avx512_vfmaddsub_ps_512; 367 break; 368 case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3: 369 Subtract = true; 370 [[fallthrough]]; 371 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask: 372 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz: 373 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3: 374 IID = Intrinsic::x86_avx512_vfmaddsub_pd_512; 375 break; 376 } 377 378 Value *A = Ops[0]; 379 Value *B = Ops[1]; 380 Value *C = Ops[2]; 381 382 if (Subtract) 383 C = CGF.Builder.CreateFNeg(C); 384 385 Value *Res; 386 387 // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding). 388 if (IID != Intrinsic::not_intrinsic && 389 (cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4 || 390 IsAddSub)) { 391 Function *Intr = CGF.CGM.getIntrinsic(IID); 392 Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() }); 393 } else { 394 llvm::Type *Ty = A->getType(); 395 Function *FMA; 396 if (CGF.Builder.getIsFPConstrained()) { 397 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); 398 FMA = CGF.CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, Ty); 399 Res = CGF.Builder.CreateConstrainedFPCall(FMA, {A, B, C}); 400 } else { 401 FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty); 402 Res = CGF.Builder.CreateCall(FMA, {A, B, C}); 403 } 404 } 405 406 // Handle any required masking. 407 Value *MaskFalseVal = nullptr; 408 switch (BuiltinID) { 409 case clang::X86::BI__builtin_ia32_vfmaddph512_mask: 410 case clang::X86::BI__builtin_ia32_vfmaddps512_mask: 411 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask: 412 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask: 413 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask: 414 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask: 415 MaskFalseVal = Ops[0]; 416 break; 417 case clang::X86::BI__builtin_ia32_vfmaddph512_maskz: 418 case clang::X86::BI__builtin_ia32_vfmaddps512_maskz: 419 case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz: 420 case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz: 421 case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz: 422 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz: 423 MaskFalseVal = Constant::getNullValue(Ops[0]->getType()); 424 break; 425 case clang::X86::BI__builtin_ia32_vfmsubph512_mask3: 426 case clang::X86::BI__builtin_ia32_vfmaddph512_mask3: 427 case clang::X86::BI__builtin_ia32_vfmsubps512_mask3: 428 case clang::X86::BI__builtin_ia32_vfmaddps512_mask3: 429 case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3: 430 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3: 431 case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3: 432 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3: 433 case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3: 434 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3: 435 case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3: 436 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3: 437 MaskFalseVal = Ops[2]; 438 break; 439 } 440 441 if (MaskFalseVal) 442 return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal); 443 444 return Res; 445 } 446 447 static Value *EmitScalarFMAExpr(CodeGenFunction &CGF, const CallExpr *E, 448 MutableArrayRef<Value *> Ops, Value *Upper, 449 bool ZeroMask = false, unsigned PTIdx = 0, 450 bool NegAcc = false) { 451 unsigned Rnd = 4; 452 if (Ops.size() > 4) 453 Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue(); 454 455 if (NegAcc) 456 Ops[2] = CGF.Builder.CreateFNeg(Ops[2]); 457 458 Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0); 459 Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0); 460 Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0); 461 Value *Res; 462 if (Rnd != 4) { 463 Intrinsic::ID IID; 464 465 switch (Ops[0]->getType()->getPrimitiveSizeInBits()) { 466 case 16: 467 IID = Intrinsic::x86_avx512fp16_vfmadd_f16; 468 break; 469 case 32: 470 IID = Intrinsic::x86_avx512_vfmadd_f32; 471 break; 472 case 64: 473 IID = Intrinsic::x86_avx512_vfmadd_f64; 474 break; 475 default: 476 llvm_unreachable("Unexpected size"); 477 } 478 Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID), 479 {Ops[0], Ops[1], Ops[2], Ops[4]}); 480 } else if (CGF.Builder.getIsFPConstrained()) { 481 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); 482 Function *FMA = CGF.CGM.getIntrinsic( 483 Intrinsic::experimental_constrained_fma, Ops[0]->getType()); 484 Res = CGF.Builder.CreateConstrainedFPCall(FMA, Ops.slice(0, 3)); 485 } else { 486 Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType()); 487 Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3)); 488 } 489 // If we have more than 3 arguments, we need to do masking. 490 if (Ops.size() > 3) { 491 Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType()) 492 : Ops[PTIdx]; 493 494 // If we negated the accumulator and the its the PassThru value we need to 495 // bypass the negate. Conveniently Upper should be the same thing in this 496 // case. 497 if (NegAcc && PTIdx == 2) 498 PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0); 499 500 Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru); 501 } 502 return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0); 503 } 504 505 static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned, 506 ArrayRef<Value *> Ops) { 507 llvm::Type *Ty = Ops[0]->getType(); 508 // Arguments have a vXi32 type so cast to vXi64. 509 Ty = llvm::FixedVectorType::get(CGF.Int64Ty, 510 Ty->getPrimitiveSizeInBits() / 64); 511 Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty); 512 Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty); 513 514 if (IsSigned) { 515 // Shift left then arithmetic shift right. 516 Constant *ShiftAmt = ConstantInt::get(Ty, 32); 517 LHS = CGF.Builder.CreateShl(LHS, ShiftAmt); 518 LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt); 519 RHS = CGF.Builder.CreateShl(RHS, ShiftAmt); 520 RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt); 521 } else { 522 // Clear the upper bits. 523 Constant *Mask = ConstantInt::get(Ty, 0xffffffff); 524 LHS = CGF.Builder.CreateAnd(LHS, Mask); 525 RHS = CGF.Builder.CreateAnd(RHS, Mask); 526 } 527 528 return CGF.Builder.CreateMul(LHS, RHS); 529 } 530 531 // Emit a masked pternlog intrinsic. This only exists because the header has to 532 // use a macro and we aren't able to pass the input argument to a pternlog 533 // builtin and a select builtin without evaluating it twice. 534 static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask, 535 ArrayRef<Value *> Ops) { 536 llvm::Type *Ty = Ops[0]->getType(); 537 538 unsigned VecWidth = Ty->getPrimitiveSizeInBits(); 539 unsigned EltWidth = Ty->getScalarSizeInBits(); 540 Intrinsic::ID IID; 541 if (VecWidth == 128 && EltWidth == 32) 542 IID = Intrinsic::x86_avx512_pternlog_d_128; 543 else if (VecWidth == 256 && EltWidth == 32) 544 IID = Intrinsic::x86_avx512_pternlog_d_256; 545 else if (VecWidth == 512 && EltWidth == 32) 546 IID = Intrinsic::x86_avx512_pternlog_d_512; 547 else if (VecWidth == 128 && EltWidth == 64) 548 IID = Intrinsic::x86_avx512_pternlog_q_128; 549 else if (VecWidth == 256 && EltWidth == 64) 550 IID = Intrinsic::x86_avx512_pternlog_q_256; 551 else if (VecWidth == 512 && EltWidth == 64) 552 IID = Intrinsic::x86_avx512_pternlog_q_512; 553 else 554 llvm_unreachable("Unexpected intrinsic"); 555 556 Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID), 557 Ops.drop_back()); 558 Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0]; 559 return EmitX86Select(CGF, Ops[4], Ternlog, PassThru); 560 } 561 562 static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op, 563 llvm::Type *DstTy) { 564 unsigned NumberOfElements = 565 cast<llvm::FixedVectorType>(DstTy)->getNumElements(); 566 Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements); 567 return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2"); 568 } 569 570 Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) { 571 const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts(); 572 StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString(); 573 return EmitX86CpuIs(CPUStr); 574 } 575 576 // Convert F16 halfs to floats. 577 static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF, 578 ArrayRef<Value *> Ops, 579 llvm::Type *DstTy) { 580 assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) && 581 "Unknown cvtph2ps intrinsic"); 582 583 // If the SAE intrinsic doesn't use default rounding then we can't upgrade. 584 if (Ops.size() == 4 && cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != 4) { 585 Function *F = 586 CGF.CGM.getIntrinsic(Intrinsic::x86_avx512_mask_vcvtph2ps_512); 587 return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]}); 588 } 589 590 unsigned NumDstElts = cast<llvm::FixedVectorType>(DstTy)->getNumElements(); 591 Value *Src = Ops[0]; 592 593 // Extract the subvector. 594 if (NumDstElts != 595 cast<llvm::FixedVectorType>(Src->getType())->getNumElements()) { 596 assert(NumDstElts == 4 && "Unexpected vector size"); 597 Src = CGF.Builder.CreateShuffleVector(Src, {0, 1, 2, 3}); 598 } 599 600 // Bitcast from vXi16 to vXf16. 601 auto *HalfTy = llvm::FixedVectorType::get( 602 llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts); 603 Src = CGF.Builder.CreateBitCast(Src, HalfTy); 604 605 // Perform the fp-extension. 606 Value *Res = CGF.Builder.CreateFPExt(Src, DstTy, "cvtph2ps"); 607 608 if (Ops.size() >= 3) 609 Res = EmitX86Select(CGF, Ops[2], Res, Ops[1]); 610 return Res; 611 } 612 613 Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) { 614 615 llvm::Type *Int32Ty = Builder.getInt32Ty(); 616 617 // Matching the struct layout from the compiler-rt/libgcc structure that is 618 // filled in: 619 // unsigned int __cpu_vendor; 620 // unsigned int __cpu_type; 621 // unsigned int __cpu_subtype; 622 // unsigned int __cpu_features[1]; 623 llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, 624 llvm::ArrayType::get(Int32Ty, 1)); 625 626 // Grab the global __cpu_model. 627 llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model"); 628 cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true); 629 630 // Calculate the index needed to access the correct field based on the 631 // range. Also adjust the expected value. 632 auto [Index, Value] = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr) 633 #define X86_VENDOR(ENUM, STRING) \ 634 .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)}) 635 #define X86_CPU_TYPE_ALIAS(ENUM, ALIAS) \ 636 .Case(ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)}) 637 #define X86_CPU_TYPE(ENUM, STR) \ 638 .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)}) 639 #define X86_CPU_SUBTYPE_ALIAS(ENUM, ALIAS) \ 640 .Case(ALIAS, {2u, static_cast<unsigned>(llvm::X86::ENUM)}) 641 #define X86_CPU_SUBTYPE(ENUM, STR) \ 642 .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)}) 643 #include "llvm/TargetParser/X86TargetParser.def" 644 .Default({0, 0}); 645 assert(Value != 0 && "Invalid CPUStr passed to CpuIs"); 646 647 // Grab the appropriate field from __cpu_model. 648 llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0), 649 ConstantInt::get(Int32Ty, Index)}; 650 llvm::Value *CpuValue = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs); 651 CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue, 652 CharUnits::fromQuantity(4)); 653 654 // Check the value of the field against the requested value. 655 return Builder.CreateICmpEQ(CpuValue, 656 llvm::ConstantInt::get(Int32Ty, Value)); 657 } 658 659 Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) { 660 const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts(); 661 StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString(); 662 if (!getContext().getTargetInfo().validateCpuSupports(FeatureStr)) 663 return Builder.getFalse(); 664 return EmitX86CpuSupports(FeatureStr); 665 } 666 667 Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) { 668 return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs)); 669 } 670 671 llvm::Value * 672 CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) { 673 Value *Result = Builder.getTrue(); 674 if (FeatureMask[0] != 0) { 675 // Matching the struct layout from the compiler-rt/libgcc structure that is 676 // filled in: 677 // unsigned int __cpu_vendor; 678 // unsigned int __cpu_type; 679 // unsigned int __cpu_subtype; 680 // unsigned int __cpu_features[1]; 681 llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, 682 llvm::ArrayType::get(Int32Ty, 1)); 683 684 // Grab the global __cpu_model. 685 llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model"); 686 cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true); 687 688 // Grab the first (0th) element from the field __cpu_features off of the 689 // global in the struct STy. 690 Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3), 691 Builder.getInt32(0)}; 692 Value *CpuFeatures = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs); 693 Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures, 694 CharUnits::fromQuantity(4)); 695 696 // Check the value of the bit corresponding to the feature requested. 697 Value *Mask = Builder.getInt32(FeatureMask[0]); 698 Value *Bitset = Builder.CreateAnd(Features, Mask); 699 Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask); 700 Result = Builder.CreateAnd(Result, Cmp); 701 } 702 703 llvm::Type *ATy = llvm::ArrayType::get(Int32Ty, 3); 704 llvm::Constant *CpuFeatures2 = 705 CGM.CreateRuntimeVariable(ATy, "__cpu_features2"); 706 cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true); 707 for (int i = 1; i != 4; ++i) { 708 const uint32_t M = FeatureMask[i]; 709 if (!M) 710 continue; 711 Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)}; 712 Value *Features = Builder.CreateAlignedLoad( 713 Int32Ty, Builder.CreateInBoundsGEP(ATy, CpuFeatures2, Idxs), 714 CharUnits::fromQuantity(4)); 715 // Check the value of the bit corresponding to the feature requested. 716 Value *Mask = Builder.getInt32(M); 717 Value *Bitset = Builder.CreateAnd(Features, Mask); 718 Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask); 719 Result = Builder.CreateAnd(Result, Cmp); 720 } 721 722 return Result; 723 } 724 725 Value *CodeGenFunction::EmitX86CpuInit() { 726 llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, 727 /*Variadic*/ false); 728 llvm::FunctionCallee Func = 729 CGM.CreateRuntimeFunction(FTy, "__cpu_indicator_init"); 730 cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true); 731 cast<llvm::GlobalValue>(Func.getCallee()) 732 ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass); 733 return Builder.CreateCall(Func); 734 } 735 736 737 Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, 738 const CallExpr *E) { 739 if (BuiltinID == Builtin::BI__builtin_cpu_is) 740 return EmitX86CpuIs(E); 741 if (BuiltinID == Builtin::BI__builtin_cpu_supports) 742 return EmitX86CpuSupports(E); 743 if (BuiltinID == Builtin::BI__builtin_cpu_init) 744 return EmitX86CpuInit(); 745 746 // Handle MSVC intrinsics before argument evaluation to prevent double 747 // evaluation. 748 if (std::optional<MSVCIntrin> MsvcIntId = translateX86ToMsvcIntrin(BuiltinID)) 749 return EmitMSVCBuiltinExpr(*MsvcIntId, E); 750 751 SmallVector<Value*, 4> Ops; 752 bool IsMaskFCmp = false; 753 bool IsConjFMA = false; 754 755 // Find out if any arguments are required to be integer constant expressions. 756 unsigned ICEArguments = 0; 757 ASTContext::GetBuiltinTypeError Error; 758 getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); 759 assert(Error == ASTContext::GE_None && "Should not codegen an error"); 760 761 for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) { 762 Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E)); 763 } 764 765 // These exist so that the builtin that takes an immediate can be bounds 766 // checked by clang to avoid passing bad immediates to the backend. Since 767 // AVX has a larger immediate than SSE we would need separate builtins to 768 // do the different bounds checking. Rather than create a clang specific 769 // SSE only builtin, this implements eight separate builtins to match gcc 770 // implementation. 771 auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) { 772 Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm)); 773 llvm::Function *F = CGM.getIntrinsic(ID); 774 return Builder.CreateCall(F, Ops); 775 }; 776 777 // For the vector forms of FP comparisons, translate the builtins directly to 778 // IR. 779 // TODO: The builtins could be removed if the SSE header files used vector 780 // extension comparisons directly (vector ordered/unordered may need 781 // additional support via __builtin_isnan()). 782 auto getVectorFCmpIR = [this, &Ops, E](CmpInst::Predicate Pred, 783 bool IsSignaling) { 784 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); 785 Value *Cmp; 786 if (IsSignaling) 787 Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]); 788 else 789 Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]); 790 llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType()); 791 llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy); 792 Value *Sext = Builder.CreateSExt(Cmp, IntVecTy); 793 return Builder.CreateBitCast(Sext, FPVecTy); 794 }; 795 796 switch (BuiltinID) { 797 default: return nullptr; 798 case X86::BI_mm_prefetch: { 799 Value *Address = Ops[0]; 800 ConstantInt *C = cast<ConstantInt>(Ops[1]); 801 Value *RW = ConstantInt::get(Int32Ty, (C->getZExtValue() >> 2) & 0x1); 802 Value *Locality = ConstantInt::get(Int32Ty, C->getZExtValue() & 0x3); 803 Value *Data = ConstantInt::get(Int32Ty, 1); 804 Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType()); 805 return Builder.CreateCall(F, {Address, RW, Locality, Data}); 806 } 807 case X86::BI_m_prefetch: 808 case X86::BI_m_prefetchw: { 809 Value *Address = Ops[0]; 810 // The 'w' suffix implies write. 811 Value *RW = 812 ConstantInt::get(Int32Ty, BuiltinID == X86::BI_m_prefetchw ? 1 : 0); 813 Value *Locality = ConstantInt::get(Int32Ty, 0x3); 814 Value *Data = ConstantInt::get(Int32Ty, 1); 815 Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType()); 816 return Builder.CreateCall(F, {Address, RW, Locality, Data}); 817 } 818 case X86::BI_mm_clflush: { 819 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush), 820 Ops[0]); 821 } 822 case X86::BI_mm_lfence: { 823 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence)); 824 } 825 case X86::BI_mm_mfence: { 826 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence)); 827 } 828 case X86::BI_mm_sfence: { 829 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence)); 830 } 831 case X86::BI_mm_pause: { 832 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause)); 833 } 834 case X86::BI__rdtsc: { 835 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc)); 836 } 837 case X86::BI__builtin_ia32_rdtscp: { 838 Value *Call = Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtscp)); 839 Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1), 840 Ops[0]); 841 return Builder.CreateExtractValue(Call, 0); 842 } 843 case X86::BI__builtin_ia32_lzcnt_u16: 844 case X86::BI__builtin_ia32_lzcnt_u32: 845 case X86::BI__builtin_ia32_lzcnt_u64: { 846 Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType()); 847 return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)}); 848 } 849 case X86::BI__builtin_ia32_tzcnt_u16: 850 case X86::BI__builtin_ia32_tzcnt_u32: 851 case X86::BI__builtin_ia32_tzcnt_u64: { 852 Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType()); 853 return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)}); 854 } 855 case X86::BI__builtin_ia32_undef128: 856 case X86::BI__builtin_ia32_undef256: 857 case X86::BI__builtin_ia32_undef512: 858 // The x86 definition of "undef" is not the same as the LLVM definition 859 // (PR32176). We leave optimizing away an unnecessary zero constant to the 860 // IR optimizer and backend. 861 // TODO: If we had a "freeze" IR instruction to generate a fixed undef 862 // value, we should use that here instead of a zero. 863 return llvm::Constant::getNullValue(ConvertType(E->getType())); 864 case X86::BI__builtin_ia32_vec_ext_v4hi: 865 case X86::BI__builtin_ia32_vec_ext_v16qi: 866 case X86::BI__builtin_ia32_vec_ext_v8hi: 867 case X86::BI__builtin_ia32_vec_ext_v4si: 868 case X86::BI__builtin_ia32_vec_ext_v4sf: 869 case X86::BI__builtin_ia32_vec_ext_v2di: 870 case X86::BI__builtin_ia32_vec_ext_v32qi: 871 case X86::BI__builtin_ia32_vec_ext_v16hi: 872 case X86::BI__builtin_ia32_vec_ext_v8si: 873 case X86::BI__builtin_ia32_vec_ext_v4di: { 874 unsigned NumElts = 875 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 876 uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue(); 877 Index &= NumElts - 1; 878 // These builtins exist so we can ensure the index is an ICE and in range. 879 // Otherwise we could just do this in the header file. 880 return Builder.CreateExtractElement(Ops[0], Index); 881 } 882 case X86::BI__builtin_ia32_vec_set_v4hi: 883 case X86::BI__builtin_ia32_vec_set_v16qi: 884 case X86::BI__builtin_ia32_vec_set_v8hi: 885 case X86::BI__builtin_ia32_vec_set_v4si: 886 case X86::BI__builtin_ia32_vec_set_v2di: 887 case X86::BI__builtin_ia32_vec_set_v32qi: 888 case X86::BI__builtin_ia32_vec_set_v16hi: 889 case X86::BI__builtin_ia32_vec_set_v8si: 890 case X86::BI__builtin_ia32_vec_set_v4di: { 891 unsigned NumElts = 892 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 893 unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue(); 894 Index &= NumElts - 1; 895 // These builtins exist so we can ensure the index is an ICE and in range. 896 // Otherwise we could just do this in the header file. 897 return Builder.CreateInsertElement(Ops[0], Ops[1], Index); 898 } 899 case X86::BI_mm_setcsr: 900 case X86::BI__builtin_ia32_ldmxcsr: { 901 RawAddress Tmp = CreateMemTemp(E->getArg(0)->getType()); 902 Builder.CreateStore(Ops[0], Tmp); 903 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr), 904 Tmp.getPointer()); 905 } 906 case X86::BI_mm_getcsr: 907 case X86::BI__builtin_ia32_stmxcsr: { 908 RawAddress Tmp = CreateMemTemp(E->getType()); 909 Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr), 910 Tmp.getPointer()); 911 return Builder.CreateLoad(Tmp, "stmxcsr"); 912 } 913 case X86::BI__builtin_ia32_xsave: 914 case X86::BI__builtin_ia32_xsave64: 915 case X86::BI__builtin_ia32_xrstor: 916 case X86::BI__builtin_ia32_xrstor64: 917 case X86::BI__builtin_ia32_xsaveopt: 918 case X86::BI__builtin_ia32_xsaveopt64: 919 case X86::BI__builtin_ia32_xrstors: 920 case X86::BI__builtin_ia32_xrstors64: 921 case X86::BI__builtin_ia32_xsavec: 922 case X86::BI__builtin_ia32_xsavec64: 923 case X86::BI__builtin_ia32_xsaves: 924 case X86::BI__builtin_ia32_xsaves64: 925 case X86::BI__builtin_ia32_xsetbv: 926 case X86::BI_xsetbv: { 927 Intrinsic::ID ID; 928 #define INTRINSIC_X86_XSAVE_ID(NAME) \ 929 case X86::BI__builtin_ia32_##NAME: \ 930 ID = Intrinsic::x86_##NAME; \ 931 break 932 switch (BuiltinID) { 933 default: llvm_unreachable("Unsupported intrinsic!"); 934 INTRINSIC_X86_XSAVE_ID(xsave); 935 INTRINSIC_X86_XSAVE_ID(xsave64); 936 INTRINSIC_X86_XSAVE_ID(xrstor); 937 INTRINSIC_X86_XSAVE_ID(xrstor64); 938 INTRINSIC_X86_XSAVE_ID(xsaveopt); 939 INTRINSIC_X86_XSAVE_ID(xsaveopt64); 940 INTRINSIC_X86_XSAVE_ID(xrstors); 941 INTRINSIC_X86_XSAVE_ID(xrstors64); 942 INTRINSIC_X86_XSAVE_ID(xsavec); 943 INTRINSIC_X86_XSAVE_ID(xsavec64); 944 INTRINSIC_X86_XSAVE_ID(xsaves); 945 INTRINSIC_X86_XSAVE_ID(xsaves64); 946 INTRINSIC_X86_XSAVE_ID(xsetbv); 947 case X86::BI_xsetbv: 948 ID = Intrinsic::x86_xsetbv; 949 break; 950 } 951 #undef INTRINSIC_X86_XSAVE_ID 952 Value *Mhi = Builder.CreateTrunc( 953 Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, 32)), Int32Ty); 954 Value *Mlo = Builder.CreateTrunc(Ops[1], Int32Ty); 955 Ops[1] = Mhi; 956 Ops.push_back(Mlo); 957 return Builder.CreateCall(CGM.getIntrinsic(ID), Ops); 958 } 959 case X86::BI__builtin_ia32_xgetbv: 960 case X86::BI_xgetbv: 961 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_xgetbv), Ops); 962 case X86::BI__builtin_ia32_storedqudi128_mask: 963 case X86::BI__builtin_ia32_storedqusi128_mask: 964 case X86::BI__builtin_ia32_storedquhi128_mask: 965 case X86::BI__builtin_ia32_storedquqi128_mask: 966 case X86::BI__builtin_ia32_storeupd128_mask: 967 case X86::BI__builtin_ia32_storeups128_mask: 968 case X86::BI__builtin_ia32_storedqudi256_mask: 969 case X86::BI__builtin_ia32_storedqusi256_mask: 970 case X86::BI__builtin_ia32_storedquhi256_mask: 971 case X86::BI__builtin_ia32_storedquqi256_mask: 972 case X86::BI__builtin_ia32_storeupd256_mask: 973 case X86::BI__builtin_ia32_storeups256_mask: 974 case X86::BI__builtin_ia32_storedqudi512_mask: 975 case X86::BI__builtin_ia32_storedqusi512_mask: 976 case X86::BI__builtin_ia32_storedquhi512_mask: 977 case X86::BI__builtin_ia32_storedquqi512_mask: 978 case X86::BI__builtin_ia32_storeupd512_mask: 979 case X86::BI__builtin_ia32_storeups512_mask: 980 return EmitX86MaskedStore(*this, Ops, Align(1)); 981 982 case X86::BI__builtin_ia32_storesbf16128_mask: 983 case X86::BI__builtin_ia32_storesh128_mask: 984 case X86::BI__builtin_ia32_storess128_mask: 985 case X86::BI__builtin_ia32_storesd128_mask: 986 return EmitX86MaskedStore(*this, Ops, Align(1)); 987 988 case X86::BI__builtin_ia32_cvtmask2b128: 989 case X86::BI__builtin_ia32_cvtmask2b256: 990 case X86::BI__builtin_ia32_cvtmask2b512: 991 case X86::BI__builtin_ia32_cvtmask2w128: 992 case X86::BI__builtin_ia32_cvtmask2w256: 993 case X86::BI__builtin_ia32_cvtmask2w512: 994 case X86::BI__builtin_ia32_cvtmask2d128: 995 case X86::BI__builtin_ia32_cvtmask2d256: 996 case X86::BI__builtin_ia32_cvtmask2d512: 997 case X86::BI__builtin_ia32_cvtmask2q128: 998 case X86::BI__builtin_ia32_cvtmask2q256: 999 case X86::BI__builtin_ia32_cvtmask2q512: 1000 return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType())); 1001 1002 case X86::BI__builtin_ia32_cvtb2mask128: 1003 case X86::BI__builtin_ia32_cvtb2mask256: 1004 case X86::BI__builtin_ia32_cvtb2mask512: 1005 case X86::BI__builtin_ia32_cvtw2mask128: 1006 case X86::BI__builtin_ia32_cvtw2mask256: 1007 case X86::BI__builtin_ia32_cvtw2mask512: 1008 case X86::BI__builtin_ia32_cvtd2mask128: 1009 case X86::BI__builtin_ia32_cvtd2mask256: 1010 case X86::BI__builtin_ia32_cvtd2mask512: 1011 case X86::BI__builtin_ia32_cvtq2mask128: 1012 case X86::BI__builtin_ia32_cvtq2mask256: 1013 case X86::BI__builtin_ia32_cvtq2mask512: 1014 return EmitX86ConvertToMask(*this, Ops[0]); 1015 1016 case X86::BI__builtin_ia32_cvtdq2ps512_mask: 1017 case X86::BI__builtin_ia32_cvtqq2ps512_mask: 1018 case X86::BI__builtin_ia32_cvtqq2pd512_mask: 1019 case X86::BI__builtin_ia32_vcvtw2ph512_mask: 1020 case X86::BI__builtin_ia32_vcvtdq2ph512_mask: 1021 case X86::BI__builtin_ia32_vcvtqq2ph512_mask: 1022 return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true); 1023 case X86::BI__builtin_ia32_cvtudq2ps512_mask: 1024 case X86::BI__builtin_ia32_cvtuqq2ps512_mask: 1025 case X86::BI__builtin_ia32_cvtuqq2pd512_mask: 1026 case X86::BI__builtin_ia32_vcvtuw2ph512_mask: 1027 case X86::BI__builtin_ia32_vcvtudq2ph512_mask: 1028 case X86::BI__builtin_ia32_vcvtuqq2ph512_mask: 1029 return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false); 1030 1031 case X86::BI__builtin_ia32_vfmaddss3: 1032 case X86::BI__builtin_ia32_vfmaddsd3: 1033 case X86::BI__builtin_ia32_vfmaddsh3_mask: 1034 case X86::BI__builtin_ia32_vfmaddss3_mask: 1035 case X86::BI__builtin_ia32_vfmaddsd3_mask: 1036 return EmitScalarFMAExpr(*this, E, Ops, Ops[0]); 1037 case X86::BI__builtin_ia32_vfmaddss: 1038 case X86::BI__builtin_ia32_vfmaddsd: 1039 return EmitScalarFMAExpr(*this, E, Ops, 1040 Constant::getNullValue(Ops[0]->getType())); 1041 case X86::BI__builtin_ia32_vfmaddsh3_maskz: 1042 case X86::BI__builtin_ia32_vfmaddss3_maskz: 1043 case X86::BI__builtin_ia32_vfmaddsd3_maskz: 1044 return EmitScalarFMAExpr(*this, E, Ops, Ops[0], /*ZeroMask*/ true); 1045 case X86::BI__builtin_ia32_vfmaddsh3_mask3: 1046 case X86::BI__builtin_ia32_vfmaddss3_mask3: 1047 case X86::BI__builtin_ia32_vfmaddsd3_mask3: 1048 return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2); 1049 case X86::BI__builtin_ia32_vfmsubsh3_mask3: 1050 case X86::BI__builtin_ia32_vfmsubss3_mask3: 1051 case X86::BI__builtin_ia32_vfmsubsd3_mask3: 1052 return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2, 1053 /*NegAcc*/ true); 1054 case X86::BI__builtin_ia32_vfmaddph: 1055 case X86::BI__builtin_ia32_vfmaddps: 1056 case X86::BI__builtin_ia32_vfmaddpd: 1057 case X86::BI__builtin_ia32_vfmaddph256: 1058 case X86::BI__builtin_ia32_vfmaddps256: 1059 case X86::BI__builtin_ia32_vfmaddpd256: 1060 case X86::BI__builtin_ia32_vfmaddph512_mask: 1061 case X86::BI__builtin_ia32_vfmaddph512_maskz: 1062 case X86::BI__builtin_ia32_vfmaddph512_mask3: 1063 case X86::BI__builtin_ia32_vfmaddbf16128: 1064 case X86::BI__builtin_ia32_vfmaddbf16256: 1065 case X86::BI__builtin_ia32_vfmaddbf16512: 1066 case X86::BI__builtin_ia32_vfmaddps512_mask: 1067 case X86::BI__builtin_ia32_vfmaddps512_maskz: 1068 case X86::BI__builtin_ia32_vfmaddps512_mask3: 1069 case X86::BI__builtin_ia32_vfmsubps512_mask3: 1070 case X86::BI__builtin_ia32_vfmaddpd512_mask: 1071 case X86::BI__builtin_ia32_vfmaddpd512_maskz: 1072 case X86::BI__builtin_ia32_vfmaddpd512_mask3: 1073 case X86::BI__builtin_ia32_vfmsubpd512_mask3: 1074 case X86::BI__builtin_ia32_vfmsubph512_mask3: 1075 return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false); 1076 case X86::BI__builtin_ia32_vfmaddsubph512_mask: 1077 case X86::BI__builtin_ia32_vfmaddsubph512_maskz: 1078 case X86::BI__builtin_ia32_vfmaddsubph512_mask3: 1079 case X86::BI__builtin_ia32_vfmsubaddph512_mask3: 1080 case X86::BI__builtin_ia32_vfmaddsubps512_mask: 1081 case X86::BI__builtin_ia32_vfmaddsubps512_maskz: 1082 case X86::BI__builtin_ia32_vfmaddsubps512_mask3: 1083 case X86::BI__builtin_ia32_vfmsubaddps512_mask3: 1084 case X86::BI__builtin_ia32_vfmaddsubpd512_mask: 1085 case X86::BI__builtin_ia32_vfmaddsubpd512_maskz: 1086 case X86::BI__builtin_ia32_vfmaddsubpd512_mask3: 1087 case X86::BI__builtin_ia32_vfmsubaddpd512_mask3: 1088 return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ true); 1089 1090 case X86::BI__builtin_ia32_movdqa32store128_mask: 1091 case X86::BI__builtin_ia32_movdqa64store128_mask: 1092 case X86::BI__builtin_ia32_storeaps128_mask: 1093 case X86::BI__builtin_ia32_storeapd128_mask: 1094 case X86::BI__builtin_ia32_movdqa32store256_mask: 1095 case X86::BI__builtin_ia32_movdqa64store256_mask: 1096 case X86::BI__builtin_ia32_storeaps256_mask: 1097 case X86::BI__builtin_ia32_storeapd256_mask: 1098 case X86::BI__builtin_ia32_movdqa32store512_mask: 1099 case X86::BI__builtin_ia32_movdqa64store512_mask: 1100 case X86::BI__builtin_ia32_storeaps512_mask: 1101 case X86::BI__builtin_ia32_storeapd512_mask: 1102 return EmitX86MaskedStore( 1103 *this, Ops, 1104 getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign()); 1105 1106 case X86::BI__builtin_ia32_loadups128_mask: 1107 case X86::BI__builtin_ia32_loadups256_mask: 1108 case X86::BI__builtin_ia32_loadups512_mask: 1109 case X86::BI__builtin_ia32_loadupd128_mask: 1110 case X86::BI__builtin_ia32_loadupd256_mask: 1111 case X86::BI__builtin_ia32_loadupd512_mask: 1112 case X86::BI__builtin_ia32_loaddquqi128_mask: 1113 case X86::BI__builtin_ia32_loaddquqi256_mask: 1114 case X86::BI__builtin_ia32_loaddquqi512_mask: 1115 case X86::BI__builtin_ia32_loaddquhi128_mask: 1116 case X86::BI__builtin_ia32_loaddquhi256_mask: 1117 case X86::BI__builtin_ia32_loaddquhi512_mask: 1118 case X86::BI__builtin_ia32_loaddqusi128_mask: 1119 case X86::BI__builtin_ia32_loaddqusi256_mask: 1120 case X86::BI__builtin_ia32_loaddqusi512_mask: 1121 case X86::BI__builtin_ia32_loaddqudi128_mask: 1122 case X86::BI__builtin_ia32_loaddqudi256_mask: 1123 case X86::BI__builtin_ia32_loaddqudi512_mask: 1124 return EmitX86MaskedLoad(*this, Ops, Align(1)); 1125 1126 case X86::BI__builtin_ia32_loadsbf16128_mask: 1127 case X86::BI__builtin_ia32_loadsh128_mask: 1128 case X86::BI__builtin_ia32_loadss128_mask: 1129 case X86::BI__builtin_ia32_loadsd128_mask: 1130 return EmitX86MaskedLoad(*this, Ops, Align(1)); 1131 1132 case X86::BI__builtin_ia32_loadaps128_mask: 1133 case X86::BI__builtin_ia32_loadaps256_mask: 1134 case X86::BI__builtin_ia32_loadaps512_mask: 1135 case X86::BI__builtin_ia32_loadapd128_mask: 1136 case X86::BI__builtin_ia32_loadapd256_mask: 1137 case X86::BI__builtin_ia32_loadapd512_mask: 1138 case X86::BI__builtin_ia32_movdqa32load128_mask: 1139 case X86::BI__builtin_ia32_movdqa32load256_mask: 1140 case X86::BI__builtin_ia32_movdqa32load512_mask: 1141 case X86::BI__builtin_ia32_movdqa64load128_mask: 1142 case X86::BI__builtin_ia32_movdqa64load256_mask: 1143 case X86::BI__builtin_ia32_movdqa64load512_mask: 1144 return EmitX86MaskedLoad( 1145 *this, Ops, 1146 getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign()); 1147 1148 case X86::BI__builtin_ia32_expandloaddf128_mask: 1149 case X86::BI__builtin_ia32_expandloaddf256_mask: 1150 case X86::BI__builtin_ia32_expandloaddf512_mask: 1151 case X86::BI__builtin_ia32_expandloadsf128_mask: 1152 case X86::BI__builtin_ia32_expandloadsf256_mask: 1153 case X86::BI__builtin_ia32_expandloadsf512_mask: 1154 case X86::BI__builtin_ia32_expandloaddi128_mask: 1155 case X86::BI__builtin_ia32_expandloaddi256_mask: 1156 case X86::BI__builtin_ia32_expandloaddi512_mask: 1157 case X86::BI__builtin_ia32_expandloadsi128_mask: 1158 case X86::BI__builtin_ia32_expandloadsi256_mask: 1159 case X86::BI__builtin_ia32_expandloadsi512_mask: 1160 case X86::BI__builtin_ia32_expandloadhi128_mask: 1161 case X86::BI__builtin_ia32_expandloadhi256_mask: 1162 case X86::BI__builtin_ia32_expandloadhi512_mask: 1163 case X86::BI__builtin_ia32_expandloadqi128_mask: 1164 case X86::BI__builtin_ia32_expandloadqi256_mask: 1165 case X86::BI__builtin_ia32_expandloadqi512_mask: 1166 return EmitX86ExpandLoad(*this, Ops); 1167 1168 case X86::BI__builtin_ia32_compressstoredf128_mask: 1169 case X86::BI__builtin_ia32_compressstoredf256_mask: 1170 case X86::BI__builtin_ia32_compressstoredf512_mask: 1171 case X86::BI__builtin_ia32_compressstoresf128_mask: 1172 case X86::BI__builtin_ia32_compressstoresf256_mask: 1173 case X86::BI__builtin_ia32_compressstoresf512_mask: 1174 case X86::BI__builtin_ia32_compressstoredi128_mask: 1175 case X86::BI__builtin_ia32_compressstoredi256_mask: 1176 case X86::BI__builtin_ia32_compressstoredi512_mask: 1177 case X86::BI__builtin_ia32_compressstoresi128_mask: 1178 case X86::BI__builtin_ia32_compressstoresi256_mask: 1179 case X86::BI__builtin_ia32_compressstoresi512_mask: 1180 case X86::BI__builtin_ia32_compressstorehi128_mask: 1181 case X86::BI__builtin_ia32_compressstorehi256_mask: 1182 case X86::BI__builtin_ia32_compressstorehi512_mask: 1183 case X86::BI__builtin_ia32_compressstoreqi128_mask: 1184 case X86::BI__builtin_ia32_compressstoreqi256_mask: 1185 case X86::BI__builtin_ia32_compressstoreqi512_mask: 1186 return EmitX86CompressStore(*this, Ops); 1187 1188 case X86::BI__builtin_ia32_expanddf128_mask: 1189 case X86::BI__builtin_ia32_expanddf256_mask: 1190 case X86::BI__builtin_ia32_expanddf512_mask: 1191 case X86::BI__builtin_ia32_expandsf128_mask: 1192 case X86::BI__builtin_ia32_expandsf256_mask: 1193 case X86::BI__builtin_ia32_expandsf512_mask: 1194 case X86::BI__builtin_ia32_expanddi128_mask: 1195 case X86::BI__builtin_ia32_expanddi256_mask: 1196 case X86::BI__builtin_ia32_expanddi512_mask: 1197 case X86::BI__builtin_ia32_expandsi128_mask: 1198 case X86::BI__builtin_ia32_expandsi256_mask: 1199 case X86::BI__builtin_ia32_expandsi512_mask: 1200 case X86::BI__builtin_ia32_expandhi128_mask: 1201 case X86::BI__builtin_ia32_expandhi256_mask: 1202 case X86::BI__builtin_ia32_expandhi512_mask: 1203 case X86::BI__builtin_ia32_expandqi128_mask: 1204 case X86::BI__builtin_ia32_expandqi256_mask: 1205 case X86::BI__builtin_ia32_expandqi512_mask: 1206 return EmitX86CompressExpand(*this, Ops, /*IsCompress*/false); 1207 1208 case X86::BI__builtin_ia32_compressdf128_mask: 1209 case X86::BI__builtin_ia32_compressdf256_mask: 1210 case X86::BI__builtin_ia32_compressdf512_mask: 1211 case X86::BI__builtin_ia32_compresssf128_mask: 1212 case X86::BI__builtin_ia32_compresssf256_mask: 1213 case X86::BI__builtin_ia32_compresssf512_mask: 1214 case X86::BI__builtin_ia32_compressdi128_mask: 1215 case X86::BI__builtin_ia32_compressdi256_mask: 1216 case X86::BI__builtin_ia32_compressdi512_mask: 1217 case X86::BI__builtin_ia32_compresssi128_mask: 1218 case X86::BI__builtin_ia32_compresssi256_mask: 1219 case X86::BI__builtin_ia32_compresssi512_mask: 1220 case X86::BI__builtin_ia32_compresshi128_mask: 1221 case X86::BI__builtin_ia32_compresshi256_mask: 1222 case X86::BI__builtin_ia32_compresshi512_mask: 1223 case X86::BI__builtin_ia32_compressqi128_mask: 1224 case X86::BI__builtin_ia32_compressqi256_mask: 1225 case X86::BI__builtin_ia32_compressqi512_mask: 1226 return EmitX86CompressExpand(*this, Ops, /*IsCompress*/true); 1227 1228 case X86::BI__builtin_ia32_gather3div2df: 1229 case X86::BI__builtin_ia32_gather3div2di: 1230 case X86::BI__builtin_ia32_gather3div4df: 1231 case X86::BI__builtin_ia32_gather3div4di: 1232 case X86::BI__builtin_ia32_gather3div4sf: 1233 case X86::BI__builtin_ia32_gather3div4si: 1234 case X86::BI__builtin_ia32_gather3div8sf: 1235 case X86::BI__builtin_ia32_gather3div8si: 1236 case X86::BI__builtin_ia32_gather3siv2df: 1237 case X86::BI__builtin_ia32_gather3siv2di: 1238 case X86::BI__builtin_ia32_gather3siv4df: 1239 case X86::BI__builtin_ia32_gather3siv4di: 1240 case X86::BI__builtin_ia32_gather3siv4sf: 1241 case X86::BI__builtin_ia32_gather3siv4si: 1242 case X86::BI__builtin_ia32_gather3siv8sf: 1243 case X86::BI__builtin_ia32_gather3siv8si: 1244 case X86::BI__builtin_ia32_gathersiv8df: 1245 case X86::BI__builtin_ia32_gathersiv16sf: 1246 case X86::BI__builtin_ia32_gatherdiv8df: 1247 case X86::BI__builtin_ia32_gatherdiv16sf: 1248 case X86::BI__builtin_ia32_gathersiv8di: 1249 case X86::BI__builtin_ia32_gathersiv16si: 1250 case X86::BI__builtin_ia32_gatherdiv8di: 1251 case X86::BI__builtin_ia32_gatherdiv16si: { 1252 Intrinsic::ID IID; 1253 switch (BuiltinID) { 1254 default: llvm_unreachable("Unexpected builtin"); 1255 case X86::BI__builtin_ia32_gather3div2df: 1256 IID = Intrinsic::x86_avx512_mask_gather3div2_df; 1257 break; 1258 case X86::BI__builtin_ia32_gather3div2di: 1259 IID = Intrinsic::x86_avx512_mask_gather3div2_di; 1260 break; 1261 case X86::BI__builtin_ia32_gather3div4df: 1262 IID = Intrinsic::x86_avx512_mask_gather3div4_df; 1263 break; 1264 case X86::BI__builtin_ia32_gather3div4di: 1265 IID = Intrinsic::x86_avx512_mask_gather3div4_di; 1266 break; 1267 case X86::BI__builtin_ia32_gather3div4sf: 1268 IID = Intrinsic::x86_avx512_mask_gather3div4_sf; 1269 break; 1270 case X86::BI__builtin_ia32_gather3div4si: 1271 IID = Intrinsic::x86_avx512_mask_gather3div4_si; 1272 break; 1273 case X86::BI__builtin_ia32_gather3div8sf: 1274 IID = Intrinsic::x86_avx512_mask_gather3div8_sf; 1275 break; 1276 case X86::BI__builtin_ia32_gather3div8si: 1277 IID = Intrinsic::x86_avx512_mask_gather3div8_si; 1278 break; 1279 case X86::BI__builtin_ia32_gather3siv2df: 1280 IID = Intrinsic::x86_avx512_mask_gather3siv2_df; 1281 break; 1282 case X86::BI__builtin_ia32_gather3siv2di: 1283 IID = Intrinsic::x86_avx512_mask_gather3siv2_di; 1284 break; 1285 case X86::BI__builtin_ia32_gather3siv4df: 1286 IID = Intrinsic::x86_avx512_mask_gather3siv4_df; 1287 break; 1288 case X86::BI__builtin_ia32_gather3siv4di: 1289 IID = Intrinsic::x86_avx512_mask_gather3siv4_di; 1290 break; 1291 case X86::BI__builtin_ia32_gather3siv4sf: 1292 IID = Intrinsic::x86_avx512_mask_gather3siv4_sf; 1293 break; 1294 case X86::BI__builtin_ia32_gather3siv4si: 1295 IID = Intrinsic::x86_avx512_mask_gather3siv4_si; 1296 break; 1297 case X86::BI__builtin_ia32_gather3siv8sf: 1298 IID = Intrinsic::x86_avx512_mask_gather3siv8_sf; 1299 break; 1300 case X86::BI__builtin_ia32_gather3siv8si: 1301 IID = Intrinsic::x86_avx512_mask_gather3siv8_si; 1302 break; 1303 case X86::BI__builtin_ia32_gathersiv8df: 1304 IID = Intrinsic::x86_avx512_mask_gather_dpd_512; 1305 break; 1306 case X86::BI__builtin_ia32_gathersiv16sf: 1307 IID = Intrinsic::x86_avx512_mask_gather_dps_512; 1308 break; 1309 case X86::BI__builtin_ia32_gatherdiv8df: 1310 IID = Intrinsic::x86_avx512_mask_gather_qpd_512; 1311 break; 1312 case X86::BI__builtin_ia32_gatherdiv16sf: 1313 IID = Intrinsic::x86_avx512_mask_gather_qps_512; 1314 break; 1315 case X86::BI__builtin_ia32_gathersiv8di: 1316 IID = Intrinsic::x86_avx512_mask_gather_dpq_512; 1317 break; 1318 case X86::BI__builtin_ia32_gathersiv16si: 1319 IID = Intrinsic::x86_avx512_mask_gather_dpi_512; 1320 break; 1321 case X86::BI__builtin_ia32_gatherdiv8di: 1322 IID = Intrinsic::x86_avx512_mask_gather_qpq_512; 1323 break; 1324 case X86::BI__builtin_ia32_gatherdiv16si: 1325 IID = Intrinsic::x86_avx512_mask_gather_qpi_512; 1326 break; 1327 } 1328 1329 unsigned MinElts = std::min( 1330 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(), 1331 cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements()); 1332 Ops[3] = getMaskVecValue(*this, Ops[3], MinElts); 1333 Function *Intr = CGM.getIntrinsic(IID); 1334 return Builder.CreateCall(Intr, Ops); 1335 } 1336 1337 case X86::BI__builtin_ia32_scattersiv8df: 1338 case X86::BI__builtin_ia32_scattersiv16sf: 1339 case X86::BI__builtin_ia32_scatterdiv8df: 1340 case X86::BI__builtin_ia32_scatterdiv16sf: 1341 case X86::BI__builtin_ia32_scattersiv8di: 1342 case X86::BI__builtin_ia32_scattersiv16si: 1343 case X86::BI__builtin_ia32_scatterdiv8di: 1344 case X86::BI__builtin_ia32_scatterdiv16si: 1345 case X86::BI__builtin_ia32_scatterdiv2df: 1346 case X86::BI__builtin_ia32_scatterdiv2di: 1347 case X86::BI__builtin_ia32_scatterdiv4df: 1348 case X86::BI__builtin_ia32_scatterdiv4di: 1349 case X86::BI__builtin_ia32_scatterdiv4sf: 1350 case X86::BI__builtin_ia32_scatterdiv4si: 1351 case X86::BI__builtin_ia32_scatterdiv8sf: 1352 case X86::BI__builtin_ia32_scatterdiv8si: 1353 case X86::BI__builtin_ia32_scattersiv2df: 1354 case X86::BI__builtin_ia32_scattersiv2di: 1355 case X86::BI__builtin_ia32_scattersiv4df: 1356 case X86::BI__builtin_ia32_scattersiv4di: 1357 case X86::BI__builtin_ia32_scattersiv4sf: 1358 case X86::BI__builtin_ia32_scattersiv4si: 1359 case X86::BI__builtin_ia32_scattersiv8sf: 1360 case X86::BI__builtin_ia32_scattersiv8si: { 1361 Intrinsic::ID IID; 1362 switch (BuiltinID) { 1363 default: llvm_unreachable("Unexpected builtin"); 1364 case X86::BI__builtin_ia32_scattersiv8df: 1365 IID = Intrinsic::x86_avx512_mask_scatter_dpd_512; 1366 break; 1367 case X86::BI__builtin_ia32_scattersiv16sf: 1368 IID = Intrinsic::x86_avx512_mask_scatter_dps_512; 1369 break; 1370 case X86::BI__builtin_ia32_scatterdiv8df: 1371 IID = Intrinsic::x86_avx512_mask_scatter_qpd_512; 1372 break; 1373 case X86::BI__builtin_ia32_scatterdiv16sf: 1374 IID = Intrinsic::x86_avx512_mask_scatter_qps_512; 1375 break; 1376 case X86::BI__builtin_ia32_scattersiv8di: 1377 IID = Intrinsic::x86_avx512_mask_scatter_dpq_512; 1378 break; 1379 case X86::BI__builtin_ia32_scattersiv16si: 1380 IID = Intrinsic::x86_avx512_mask_scatter_dpi_512; 1381 break; 1382 case X86::BI__builtin_ia32_scatterdiv8di: 1383 IID = Intrinsic::x86_avx512_mask_scatter_qpq_512; 1384 break; 1385 case X86::BI__builtin_ia32_scatterdiv16si: 1386 IID = Intrinsic::x86_avx512_mask_scatter_qpi_512; 1387 break; 1388 case X86::BI__builtin_ia32_scatterdiv2df: 1389 IID = Intrinsic::x86_avx512_mask_scatterdiv2_df; 1390 break; 1391 case X86::BI__builtin_ia32_scatterdiv2di: 1392 IID = Intrinsic::x86_avx512_mask_scatterdiv2_di; 1393 break; 1394 case X86::BI__builtin_ia32_scatterdiv4df: 1395 IID = Intrinsic::x86_avx512_mask_scatterdiv4_df; 1396 break; 1397 case X86::BI__builtin_ia32_scatterdiv4di: 1398 IID = Intrinsic::x86_avx512_mask_scatterdiv4_di; 1399 break; 1400 case X86::BI__builtin_ia32_scatterdiv4sf: 1401 IID = Intrinsic::x86_avx512_mask_scatterdiv4_sf; 1402 break; 1403 case X86::BI__builtin_ia32_scatterdiv4si: 1404 IID = Intrinsic::x86_avx512_mask_scatterdiv4_si; 1405 break; 1406 case X86::BI__builtin_ia32_scatterdiv8sf: 1407 IID = Intrinsic::x86_avx512_mask_scatterdiv8_sf; 1408 break; 1409 case X86::BI__builtin_ia32_scatterdiv8si: 1410 IID = Intrinsic::x86_avx512_mask_scatterdiv8_si; 1411 break; 1412 case X86::BI__builtin_ia32_scattersiv2df: 1413 IID = Intrinsic::x86_avx512_mask_scattersiv2_df; 1414 break; 1415 case X86::BI__builtin_ia32_scattersiv2di: 1416 IID = Intrinsic::x86_avx512_mask_scattersiv2_di; 1417 break; 1418 case X86::BI__builtin_ia32_scattersiv4df: 1419 IID = Intrinsic::x86_avx512_mask_scattersiv4_df; 1420 break; 1421 case X86::BI__builtin_ia32_scattersiv4di: 1422 IID = Intrinsic::x86_avx512_mask_scattersiv4_di; 1423 break; 1424 case X86::BI__builtin_ia32_scattersiv4sf: 1425 IID = Intrinsic::x86_avx512_mask_scattersiv4_sf; 1426 break; 1427 case X86::BI__builtin_ia32_scattersiv4si: 1428 IID = Intrinsic::x86_avx512_mask_scattersiv4_si; 1429 break; 1430 case X86::BI__builtin_ia32_scattersiv8sf: 1431 IID = Intrinsic::x86_avx512_mask_scattersiv8_sf; 1432 break; 1433 case X86::BI__builtin_ia32_scattersiv8si: 1434 IID = Intrinsic::x86_avx512_mask_scattersiv8_si; 1435 break; 1436 } 1437 1438 unsigned MinElts = std::min( 1439 cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements(), 1440 cast<llvm::FixedVectorType>(Ops[3]->getType())->getNumElements()); 1441 Ops[1] = getMaskVecValue(*this, Ops[1], MinElts); 1442 Function *Intr = CGM.getIntrinsic(IID); 1443 return Builder.CreateCall(Intr, Ops); 1444 } 1445 1446 case X86::BI__builtin_ia32_vextractf128_pd256: 1447 case X86::BI__builtin_ia32_vextractf128_ps256: 1448 case X86::BI__builtin_ia32_vextractf128_si256: 1449 case X86::BI__builtin_ia32_extract128i256: 1450 case X86::BI__builtin_ia32_extractf64x4_mask: 1451 case X86::BI__builtin_ia32_extractf32x4_mask: 1452 case X86::BI__builtin_ia32_extracti64x4_mask: 1453 case X86::BI__builtin_ia32_extracti32x4_mask: 1454 case X86::BI__builtin_ia32_extractf32x8_mask: 1455 case X86::BI__builtin_ia32_extracti32x8_mask: 1456 case X86::BI__builtin_ia32_extractf32x4_256_mask: 1457 case X86::BI__builtin_ia32_extracti32x4_256_mask: 1458 case X86::BI__builtin_ia32_extractf64x2_256_mask: 1459 case X86::BI__builtin_ia32_extracti64x2_256_mask: 1460 case X86::BI__builtin_ia32_extractf64x2_512_mask: 1461 case X86::BI__builtin_ia32_extracti64x2_512_mask: { 1462 auto *DstTy = cast<llvm::FixedVectorType>(ConvertType(E->getType())); 1463 unsigned NumElts = DstTy->getNumElements(); 1464 unsigned SrcNumElts = 1465 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 1466 unsigned SubVectors = SrcNumElts / NumElts; 1467 unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue(); 1468 assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); 1469 Index &= SubVectors - 1; // Remove any extra bits. 1470 Index *= NumElts; 1471 1472 int Indices[16]; 1473 for (unsigned i = 0; i != NumElts; ++i) 1474 Indices[i] = i + Index; 1475 1476 Value *Res = Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts), 1477 "extract"); 1478 1479 if (Ops.size() == 4) 1480 Res = EmitX86Select(*this, Ops[3], Res, Ops[2]); 1481 1482 return Res; 1483 } 1484 case X86::BI__builtin_ia32_vinsertf128_pd256: 1485 case X86::BI__builtin_ia32_vinsertf128_ps256: 1486 case X86::BI__builtin_ia32_vinsertf128_si256: 1487 case X86::BI__builtin_ia32_insert128i256: 1488 case X86::BI__builtin_ia32_insertf64x4: 1489 case X86::BI__builtin_ia32_insertf32x4: 1490 case X86::BI__builtin_ia32_inserti64x4: 1491 case X86::BI__builtin_ia32_inserti32x4: 1492 case X86::BI__builtin_ia32_insertf32x8: 1493 case X86::BI__builtin_ia32_inserti32x8: 1494 case X86::BI__builtin_ia32_insertf32x4_256: 1495 case X86::BI__builtin_ia32_inserti32x4_256: 1496 case X86::BI__builtin_ia32_insertf64x2_256: 1497 case X86::BI__builtin_ia32_inserti64x2_256: 1498 case X86::BI__builtin_ia32_insertf64x2_512: 1499 case X86::BI__builtin_ia32_inserti64x2_512: { 1500 unsigned DstNumElts = 1501 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 1502 unsigned SrcNumElts = 1503 cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements(); 1504 unsigned SubVectors = DstNumElts / SrcNumElts; 1505 unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue(); 1506 assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); 1507 Index &= SubVectors - 1; // Remove any extra bits. 1508 Index *= SrcNumElts; 1509 1510 int Indices[16]; 1511 for (unsigned i = 0; i != DstNumElts; ++i) 1512 Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i; 1513 1514 Value *Op1 = Builder.CreateShuffleVector( 1515 Ops[1], ArrayRef(Indices, DstNumElts), "widen"); 1516 1517 for (unsigned i = 0; i != DstNumElts; ++i) { 1518 if (i >= Index && i < (Index + SrcNumElts)) 1519 Indices[i] = (i - Index) + DstNumElts; 1520 else 1521 Indices[i] = i; 1522 } 1523 1524 return Builder.CreateShuffleVector(Ops[0], Op1, 1525 ArrayRef(Indices, DstNumElts), "insert"); 1526 } 1527 case X86::BI__builtin_ia32_pmovqd512_mask: 1528 case X86::BI__builtin_ia32_pmovwb512_mask: { 1529 Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType()); 1530 return EmitX86Select(*this, Ops[2], Res, Ops[1]); 1531 } 1532 case X86::BI__builtin_ia32_pmovdb512_mask: 1533 case X86::BI__builtin_ia32_pmovdw512_mask: 1534 case X86::BI__builtin_ia32_pmovqw512_mask: { 1535 if (const auto *C = dyn_cast<Constant>(Ops[2])) 1536 if (C->isAllOnesValue()) 1537 return Builder.CreateTrunc(Ops[0], Ops[1]->getType()); 1538 1539 Intrinsic::ID IID; 1540 switch (BuiltinID) { 1541 default: llvm_unreachable("Unsupported intrinsic!"); 1542 case X86::BI__builtin_ia32_pmovdb512_mask: 1543 IID = Intrinsic::x86_avx512_mask_pmov_db_512; 1544 break; 1545 case X86::BI__builtin_ia32_pmovdw512_mask: 1546 IID = Intrinsic::x86_avx512_mask_pmov_dw_512; 1547 break; 1548 case X86::BI__builtin_ia32_pmovqw512_mask: 1549 IID = Intrinsic::x86_avx512_mask_pmov_qw_512; 1550 break; 1551 } 1552 1553 Function *Intr = CGM.getIntrinsic(IID); 1554 return Builder.CreateCall(Intr, Ops); 1555 } 1556 case X86::BI__builtin_ia32_pblendw128: 1557 case X86::BI__builtin_ia32_blendpd: 1558 case X86::BI__builtin_ia32_blendps: 1559 case X86::BI__builtin_ia32_blendpd256: 1560 case X86::BI__builtin_ia32_blendps256: 1561 case X86::BI__builtin_ia32_pblendw256: 1562 case X86::BI__builtin_ia32_pblendd128: 1563 case X86::BI__builtin_ia32_pblendd256: { 1564 unsigned NumElts = 1565 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 1566 unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); 1567 1568 int Indices[16]; 1569 // If there are more than 8 elements, the immediate is used twice so make 1570 // sure we handle that. 1571 for (unsigned i = 0; i != NumElts; ++i) 1572 Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i; 1573 1574 return Builder.CreateShuffleVector(Ops[0], Ops[1], 1575 ArrayRef(Indices, NumElts), "blend"); 1576 } 1577 case X86::BI__builtin_ia32_pshuflw: 1578 case X86::BI__builtin_ia32_pshuflw256: 1579 case X86::BI__builtin_ia32_pshuflw512: { 1580 uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); 1581 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); 1582 unsigned NumElts = Ty->getNumElements(); 1583 1584 // Splat the 8-bits of immediate 4 times to help the loop wrap around. 1585 Imm = (Imm & 0xff) * 0x01010101; 1586 1587 int Indices[32]; 1588 for (unsigned l = 0; l != NumElts; l += 8) { 1589 for (unsigned i = 0; i != 4; ++i) { 1590 Indices[l + i] = l + (Imm & 3); 1591 Imm >>= 2; 1592 } 1593 for (unsigned i = 4; i != 8; ++i) 1594 Indices[l + i] = l + i; 1595 } 1596 1597 return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts), 1598 "pshuflw"); 1599 } 1600 case X86::BI__builtin_ia32_pshufhw: 1601 case X86::BI__builtin_ia32_pshufhw256: 1602 case X86::BI__builtin_ia32_pshufhw512: { 1603 uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); 1604 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); 1605 unsigned NumElts = Ty->getNumElements(); 1606 1607 // Splat the 8-bits of immediate 4 times to help the loop wrap around. 1608 Imm = (Imm & 0xff) * 0x01010101; 1609 1610 int Indices[32]; 1611 for (unsigned l = 0; l != NumElts; l += 8) { 1612 for (unsigned i = 0; i != 4; ++i) 1613 Indices[l + i] = l + i; 1614 for (unsigned i = 4; i != 8; ++i) { 1615 Indices[l + i] = l + 4 + (Imm & 3); 1616 Imm >>= 2; 1617 } 1618 } 1619 1620 return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts), 1621 "pshufhw"); 1622 } 1623 case X86::BI__builtin_ia32_pshufd: 1624 case X86::BI__builtin_ia32_pshufd256: 1625 case X86::BI__builtin_ia32_pshufd512: 1626 case X86::BI__builtin_ia32_vpermilpd: 1627 case X86::BI__builtin_ia32_vpermilps: 1628 case X86::BI__builtin_ia32_vpermilpd256: 1629 case X86::BI__builtin_ia32_vpermilps256: 1630 case X86::BI__builtin_ia32_vpermilpd512: 1631 case X86::BI__builtin_ia32_vpermilps512: { 1632 uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); 1633 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); 1634 unsigned NumElts = Ty->getNumElements(); 1635 unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; 1636 unsigned NumLaneElts = NumElts / NumLanes; 1637 1638 // Splat the 8-bits of immediate 4 times to help the loop wrap around. 1639 Imm = (Imm & 0xff) * 0x01010101; 1640 1641 int Indices[16]; 1642 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 1643 for (unsigned i = 0; i != NumLaneElts; ++i) { 1644 Indices[i + l] = (Imm % NumLaneElts) + l; 1645 Imm /= NumLaneElts; 1646 } 1647 } 1648 1649 return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts), 1650 "permil"); 1651 } 1652 case X86::BI__builtin_ia32_shufpd: 1653 case X86::BI__builtin_ia32_shufpd256: 1654 case X86::BI__builtin_ia32_shufpd512: 1655 case X86::BI__builtin_ia32_shufps: 1656 case X86::BI__builtin_ia32_shufps256: 1657 case X86::BI__builtin_ia32_shufps512: { 1658 uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); 1659 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); 1660 unsigned NumElts = Ty->getNumElements(); 1661 unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; 1662 unsigned NumLaneElts = NumElts / NumLanes; 1663 1664 // Splat the 8-bits of immediate 4 times to help the loop wrap around. 1665 Imm = (Imm & 0xff) * 0x01010101; 1666 1667 int Indices[16]; 1668 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 1669 for (unsigned i = 0; i != NumLaneElts; ++i) { 1670 unsigned Index = Imm % NumLaneElts; 1671 Imm /= NumLaneElts; 1672 if (i >= (NumLaneElts / 2)) 1673 Index += NumElts; 1674 Indices[l + i] = l + Index; 1675 } 1676 } 1677 1678 return Builder.CreateShuffleVector(Ops[0], Ops[1], 1679 ArrayRef(Indices, NumElts), "shufp"); 1680 } 1681 case X86::BI__builtin_ia32_permdi256: 1682 case X86::BI__builtin_ia32_permdf256: 1683 case X86::BI__builtin_ia32_permdi512: 1684 case X86::BI__builtin_ia32_permdf512: { 1685 unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); 1686 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); 1687 unsigned NumElts = Ty->getNumElements(); 1688 1689 // These intrinsics operate on 256-bit lanes of four 64-bit elements. 1690 int Indices[8]; 1691 for (unsigned l = 0; l != NumElts; l += 4) 1692 for (unsigned i = 0; i != 4; ++i) 1693 Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3); 1694 1695 return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts), 1696 "perm"); 1697 } 1698 case X86::BI__builtin_ia32_palignr128: 1699 case X86::BI__builtin_ia32_palignr256: 1700 case X86::BI__builtin_ia32_palignr512: { 1701 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff; 1702 1703 unsigned NumElts = 1704 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 1705 assert(NumElts % 16 == 0); 1706 1707 // If palignr is shifting the pair of vectors more than the size of two 1708 // lanes, emit zero. 1709 if (ShiftVal >= 32) 1710 return llvm::Constant::getNullValue(ConvertType(E->getType())); 1711 1712 // If palignr is shifting the pair of input vectors more than one lane, 1713 // but less than two lanes, convert to shifting in zeroes. 1714 if (ShiftVal > 16) { 1715 ShiftVal -= 16; 1716 Ops[1] = Ops[0]; 1717 Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType()); 1718 } 1719 1720 int Indices[64]; 1721 // 256-bit palignr operates on 128-bit lanes so we need to handle that 1722 for (unsigned l = 0; l != NumElts; l += 16) { 1723 for (unsigned i = 0; i != 16; ++i) { 1724 unsigned Idx = ShiftVal + i; 1725 if (Idx >= 16) 1726 Idx += NumElts - 16; // End of lane, switch operand. 1727 Indices[l + i] = Idx + l; 1728 } 1729 } 1730 1731 return Builder.CreateShuffleVector(Ops[1], Ops[0], 1732 ArrayRef(Indices, NumElts), "palignr"); 1733 } 1734 case X86::BI__builtin_ia32_alignd128: 1735 case X86::BI__builtin_ia32_alignd256: 1736 case X86::BI__builtin_ia32_alignd512: 1737 case X86::BI__builtin_ia32_alignq128: 1738 case X86::BI__builtin_ia32_alignq256: 1739 case X86::BI__builtin_ia32_alignq512: { 1740 unsigned NumElts = 1741 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 1742 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff; 1743 1744 // Mask the shift amount to width of a vector. 1745 ShiftVal &= NumElts - 1; 1746 1747 int Indices[16]; 1748 for (unsigned i = 0; i != NumElts; ++i) 1749 Indices[i] = i + ShiftVal; 1750 1751 return Builder.CreateShuffleVector(Ops[1], Ops[0], 1752 ArrayRef(Indices, NumElts), "valign"); 1753 } 1754 case X86::BI__builtin_ia32_shuf_f32x4_256: 1755 case X86::BI__builtin_ia32_shuf_f64x2_256: 1756 case X86::BI__builtin_ia32_shuf_i32x4_256: 1757 case X86::BI__builtin_ia32_shuf_i64x2_256: 1758 case X86::BI__builtin_ia32_shuf_f32x4: 1759 case X86::BI__builtin_ia32_shuf_f64x2: 1760 case X86::BI__builtin_ia32_shuf_i32x4: 1761 case X86::BI__builtin_ia32_shuf_i64x2: { 1762 unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); 1763 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); 1764 unsigned NumElts = Ty->getNumElements(); 1765 unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2; 1766 unsigned NumLaneElts = NumElts / NumLanes; 1767 1768 int Indices[16]; 1769 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 1770 unsigned Index = (Imm % NumLanes) * NumLaneElts; 1771 Imm /= NumLanes; // Discard the bits we just used. 1772 if (l >= (NumElts / 2)) 1773 Index += NumElts; // Switch to other source. 1774 for (unsigned i = 0; i != NumLaneElts; ++i) { 1775 Indices[l + i] = Index + i; 1776 } 1777 } 1778 1779 return Builder.CreateShuffleVector(Ops[0], Ops[1], 1780 ArrayRef(Indices, NumElts), "shuf"); 1781 } 1782 1783 case X86::BI__builtin_ia32_vperm2f128_pd256: 1784 case X86::BI__builtin_ia32_vperm2f128_ps256: 1785 case X86::BI__builtin_ia32_vperm2f128_si256: 1786 case X86::BI__builtin_ia32_permti256: { 1787 unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); 1788 unsigned NumElts = 1789 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 1790 1791 // This takes a very simple approach since there are two lanes and a 1792 // shuffle can have 2 inputs. So we reserve the first input for the first 1793 // lane and the second input for the second lane. This may result in 1794 // duplicate sources, but this can be dealt with in the backend. 1795 1796 Value *OutOps[2]; 1797 int Indices[8]; 1798 for (unsigned l = 0; l != 2; ++l) { 1799 // Determine the source for this lane. 1800 if (Imm & (1 << ((l * 4) + 3))) 1801 OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType()); 1802 else if (Imm & (1 << ((l * 4) + 1))) 1803 OutOps[l] = Ops[1]; 1804 else 1805 OutOps[l] = Ops[0]; 1806 1807 for (unsigned i = 0; i != NumElts/2; ++i) { 1808 // Start with ith element of the source for this lane. 1809 unsigned Idx = (l * NumElts) + i; 1810 // If bit 0 of the immediate half is set, switch to the high half of 1811 // the source. 1812 if (Imm & (1 << (l * 4))) 1813 Idx += NumElts/2; 1814 Indices[(l * (NumElts/2)) + i] = Idx; 1815 } 1816 } 1817 1818 return Builder.CreateShuffleVector(OutOps[0], OutOps[1], 1819 ArrayRef(Indices, NumElts), "vperm"); 1820 } 1821 1822 case X86::BI__builtin_ia32_pslldqi128_byteshift: 1823 case X86::BI__builtin_ia32_pslldqi256_byteshift: 1824 case X86::BI__builtin_ia32_pslldqi512_byteshift: { 1825 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; 1826 auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType()); 1827 // Builtin type is vXi64 so multiply by 8 to get bytes. 1828 unsigned NumElts = ResultType->getNumElements() * 8; 1829 1830 // If pslldq is shifting the vector more than 15 bytes, emit zero. 1831 if (ShiftVal >= 16) 1832 return llvm::Constant::getNullValue(ResultType); 1833 1834 int Indices[64]; 1835 // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that 1836 for (unsigned l = 0; l != NumElts; l += 16) { 1837 for (unsigned i = 0; i != 16; ++i) { 1838 unsigned Idx = NumElts + i - ShiftVal; 1839 if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand. 1840 Indices[l + i] = Idx + l; 1841 } 1842 } 1843 1844 auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts); 1845 Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); 1846 Value *Zero = llvm::Constant::getNullValue(VecTy); 1847 Value *SV = Builder.CreateShuffleVector( 1848 Zero, Cast, ArrayRef(Indices, NumElts), "pslldq"); 1849 return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast"); 1850 } 1851 case X86::BI__builtin_ia32_psrldqi128_byteshift: 1852 case X86::BI__builtin_ia32_psrldqi256_byteshift: 1853 case X86::BI__builtin_ia32_psrldqi512_byteshift: { 1854 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; 1855 auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType()); 1856 // Builtin type is vXi64 so multiply by 8 to get bytes. 1857 unsigned NumElts = ResultType->getNumElements() * 8; 1858 1859 // If psrldq is shifting the vector more than 15 bytes, emit zero. 1860 if (ShiftVal >= 16) 1861 return llvm::Constant::getNullValue(ResultType); 1862 1863 int Indices[64]; 1864 // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that 1865 for (unsigned l = 0; l != NumElts; l += 16) { 1866 for (unsigned i = 0; i != 16; ++i) { 1867 unsigned Idx = i + ShiftVal; 1868 if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand. 1869 Indices[l + i] = Idx + l; 1870 } 1871 } 1872 1873 auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts); 1874 Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); 1875 Value *Zero = llvm::Constant::getNullValue(VecTy); 1876 Value *SV = Builder.CreateShuffleVector( 1877 Cast, Zero, ArrayRef(Indices, NumElts), "psrldq"); 1878 return Builder.CreateBitCast(SV, ResultType, "cast"); 1879 } 1880 case X86::BI__builtin_ia32_kshiftliqi: 1881 case X86::BI__builtin_ia32_kshiftlihi: 1882 case X86::BI__builtin_ia32_kshiftlisi: 1883 case X86::BI__builtin_ia32_kshiftlidi: { 1884 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; 1885 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); 1886 1887 if (ShiftVal >= NumElts) 1888 return llvm::Constant::getNullValue(Ops[0]->getType()); 1889 1890 Value *In = getMaskVecValue(*this, Ops[0], NumElts); 1891 1892 int Indices[64]; 1893 for (unsigned i = 0; i != NumElts; ++i) 1894 Indices[i] = NumElts + i - ShiftVal; 1895 1896 Value *Zero = llvm::Constant::getNullValue(In->getType()); 1897 Value *SV = Builder.CreateShuffleVector( 1898 Zero, In, ArrayRef(Indices, NumElts), "kshiftl"); 1899 return Builder.CreateBitCast(SV, Ops[0]->getType()); 1900 } 1901 case X86::BI__builtin_ia32_kshiftriqi: 1902 case X86::BI__builtin_ia32_kshiftrihi: 1903 case X86::BI__builtin_ia32_kshiftrisi: 1904 case X86::BI__builtin_ia32_kshiftridi: { 1905 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; 1906 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); 1907 1908 if (ShiftVal >= NumElts) 1909 return llvm::Constant::getNullValue(Ops[0]->getType()); 1910 1911 Value *In = getMaskVecValue(*this, Ops[0], NumElts); 1912 1913 int Indices[64]; 1914 for (unsigned i = 0; i != NumElts; ++i) 1915 Indices[i] = i + ShiftVal; 1916 1917 Value *Zero = llvm::Constant::getNullValue(In->getType()); 1918 Value *SV = Builder.CreateShuffleVector( 1919 In, Zero, ArrayRef(Indices, NumElts), "kshiftr"); 1920 return Builder.CreateBitCast(SV, Ops[0]->getType()); 1921 } 1922 case X86::BI__builtin_ia32_movnti: 1923 case X86::BI__builtin_ia32_movnti64: 1924 case X86::BI__builtin_ia32_movntsd: 1925 case X86::BI__builtin_ia32_movntss: { 1926 llvm::MDNode *Node = llvm::MDNode::get( 1927 getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1))); 1928 1929 Value *Ptr = Ops[0]; 1930 Value *Src = Ops[1]; 1931 1932 // Extract the 0'th element of the source vector. 1933 if (BuiltinID == X86::BI__builtin_ia32_movntsd || 1934 BuiltinID == X86::BI__builtin_ia32_movntss) 1935 Src = Builder.CreateExtractElement(Src, (uint64_t)0, "extract"); 1936 1937 // Unaligned nontemporal store of the scalar value. 1938 StoreInst *SI = Builder.CreateDefaultAlignedStore(Src, Ptr); 1939 SI->setMetadata(llvm::LLVMContext::MD_nontemporal, Node); 1940 SI->setAlignment(llvm::Align(1)); 1941 return SI; 1942 } 1943 // Rotate is a special case of funnel shift - 1st 2 args are the same. 1944 case X86::BI__builtin_ia32_vprotb: 1945 case X86::BI__builtin_ia32_vprotw: 1946 case X86::BI__builtin_ia32_vprotd: 1947 case X86::BI__builtin_ia32_vprotq: 1948 case X86::BI__builtin_ia32_vprotbi: 1949 case X86::BI__builtin_ia32_vprotwi: 1950 case X86::BI__builtin_ia32_vprotdi: 1951 case X86::BI__builtin_ia32_vprotqi: 1952 case X86::BI__builtin_ia32_prold128: 1953 case X86::BI__builtin_ia32_prold256: 1954 case X86::BI__builtin_ia32_prold512: 1955 case X86::BI__builtin_ia32_prolq128: 1956 case X86::BI__builtin_ia32_prolq256: 1957 case X86::BI__builtin_ia32_prolq512: 1958 case X86::BI__builtin_ia32_prolvd128: 1959 case X86::BI__builtin_ia32_prolvd256: 1960 case X86::BI__builtin_ia32_prolvd512: 1961 case X86::BI__builtin_ia32_prolvq128: 1962 case X86::BI__builtin_ia32_prolvq256: 1963 case X86::BI__builtin_ia32_prolvq512: 1964 return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], false); 1965 case X86::BI__builtin_ia32_prord128: 1966 case X86::BI__builtin_ia32_prord256: 1967 case X86::BI__builtin_ia32_prord512: 1968 case X86::BI__builtin_ia32_prorq128: 1969 case X86::BI__builtin_ia32_prorq256: 1970 case X86::BI__builtin_ia32_prorq512: 1971 case X86::BI__builtin_ia32_prorvd128: 1972 case X86::BI__builtin_ia32_prorvd256: 1973 case X86::BI__builtin_ia32_prorvd512: 1974 case X86::BI__builtin_ia32_prorvq128: 1975 case X86::BI__builtin_ia32_prorvq256: 1976 case X86::BI__builtin_ia32_prorvq512: 1977 return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], true); 1978 case X86::BI__builtin_ia32_selectb_128: 1979 case X86::BI__builtin_ia32_selectb_256: 1980 case X86::BI__builtin_ia32_selectb_512: 1981 case X86::BI__builtin_ia32_selectw_128: 1982 case X86::BI__builtin_ia32_selectw_256: 1983 case X86::BI__builtin_ia32_selectw_512: 1984 case X86::BI__builtin_ia32_selectd_128: 1985 case X86::BI__builtin_ia32_selectd_256: 1986 case X86::BI__builtin_ia32_selectd_512: 1987 case X86::BI__builtin_ia32_selectq_128: 1988 case X86::BI__builtin_ia32_selectq_256: 1989 case X86::BI__builtin_ia32_selectq_512: 1990 case X86::BI__builtin_ia32_selectph_128: 1991 case X86::BI__builtin_ia32_selectph_256: 1992 case X86::BI__builtin_ia32_selectph_512: 1993 case X86::BI__builtin_ia32_selectpbf_128: 1994 case X86::BI__builtin_ia32_selectpbf_256: 1995 case X86::BI__builtin_ia32_selectpbf_512: 1996 case X86::BI__builtin_ia32_selectps_128: 1997 case X86::BI__builtin_ia32_selectps_256: 1998 case X86::BI__builtin_ia32_selectps_512: 1999 case X86::BI__builtin_ia32_selectpd_128: 2000 case X86::BI__builtin_ia32_selectpd_256: 2001 case X86::BI__builtin_ia32_selectpd_512: 2002 return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]); 2003 case X86::BI__builtin_ia32_selectsh_128: 2004 case X86::BI__builtin_ia32_selectsbf_128: 2005 case X86::BI__builtin_ia32_selectss_128: 2006 case X86::BI__builtin_ia32_selectsd_128: { 2007 Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); 2008 Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0); 2009 A = EmitX86ScalarSelect(*this, Ops[0], A, B); 2010 return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0); 2011 } 2012 case X86::BI__builtin_ia32_cmpb128_mask: 2013 case X86::BI__builtin_ia32_cmpb256_mask: 2014 case X86::BI__builtin_ia32_cmpb512_mask: 2015 case X86::BI__builtin_ia32_cmpw128_mask: 2016 case X86::BI__builtin_ia32_cmpw256_mask: 2017 case X86::BI__builtin_ia32_cmpw512_mask: 2018 case X86::BI__builtin_ia32_cmpd128_mask: 2019 case X86::BI__builtin_ia32_cmpd256_mask: 2020 case X86::BI__builtin_ia32_cmpd512_mask: 2021 case X86::BI__builtin_ia32_cmpq128_mask: 2022 case X86::BI__builtin_ia32_cmpq256_mask: 2023 case X86::BI__builtin_ia32_cmpq512_mask: { 2024 unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7; 2025 return EmitX86MaskedCompare(*this, CC, true, Ops); 2026 } 2027 case X86::BI__builtin_ia32_ucmpb128_mask: 2028 case X86::BI__builtin_ia32_ucmpb256_mask: 2029 case X86::BI__builtin_ia32_ucmpb512_mask: 2030 case X86::BI__builtin_ia32_ucmpw128_mask: 2031 case X86::BI__builtin_ia32_ucmpw256_mask: 2032 case X86::BI__builtin_ia32_ucmpw512_mask: 2033 case X86::BI__builtin_ia32_ucmpd128_mask: 2034 case X86::BI__builtin_ia32_ucmpd256_mask: 2035 case X86::BI__builtin_ia32_ucmpd512_mask: 2036 case X86::BI__builtin_ia32_ucmpq128_mask: 2037 case X86::BI__builtin_ia32_ucmpq256_mask: 2038 case X86::BI__builtin_ia32_ucmpq512_mask: { 2039 unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7; 2040 return EmitX86MaskedCompare(*this, CC, false, Ops); 2041 } 2042 case X86::BI__builtin_ia32_vpcomb: 2043 case X86::BI__builtin_ia32_vpcomw: 2044 case X86::BI__builtin_ia32_vpcomd: 2045 case X86::BI__builtin_ia32_vpcomq: 2046 return EmitX86vpcom(*this, Ops, true); 2047 case X86::BI__builtin_ia32_vpcomub: 2048 case X86::BI__builtin_ia32_vpcomuw: 2049 case X86::BI__builtin_ia32_vpcomud: 2050 case X86::BI__builtin_ia32_vpcomuq: 2051 return EmitX86vpcom(*this, Ops, false); 2052 2053 case X86::BI__builtin_ia32_kortestcqi: 2054 case X86::BI__builtin_ia32_kortestchi: 2055 case X86::BI__builtin_ia32_kortestcsi: 2056 case X86::BI__builtin_ia32_kortestcdi: { 2057 Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops); 2058 Value *C = llvm::Constant::getAllOnesValue(Ops[0]->getType()); 2059 Value *Cmp = Builder.CreateICmpEQ(Or, C); 2060 return Builder.CreateZExt(Cmp, ConvertType(E->getType())); 2061 } 2062 case X86::BI__builtin_ia32_kortestzqi: 2063 case X86::BI__builtin_ia32_kortestzhi: 2064 case X86::BI__builtin_ia32_kortestzsi: 2065 case X86::BI__builtin_ia32_kortestzdi: { 2066 Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops); 2067 Value *C = llvm::Constant::getNullValue(Ops[0]->getType()); 2068 Value *Cmp = Builder.CreateICmpEQ(Or, C); 2069 return Builder.CreateZExt(Cmp, ConvertType(E->getType())); 2070 } 2071 2072 case X86::BI__builtin_ia32_ktestcqi: 2073 case X86::BI__builtin_ia32_ktestzqi: 2074 case X86::BI__builtin_ia32_ktestchi: 2075 case X86::BI__builtin_ia32_ktestzhi: 2076 case X86::BI__builtin_ia32_ktestcsi: 2077 case X86::BI__builtin_ia32_ktestzsi: 2078 case X86::BI__builtin_ia32_ktestcdi: 2079 case X86::BI__builtin_ia32_ktestzdi: { 2080 Intrinsic::ID IID; 2081 switch (BuiltinID) { 2082 default: llvm_unreachable("Unsupported intrinsic!"); 2083 case X86::BI__builtin_ia32_ktestcqi: 2084 IID = Intrinsic::x86_avx512_ktestc_b; 2085 break; 2086 case X86::BI__builtin_ia32_ktestzqi: 2087 IID = Intrinsic::x86_avx512_ktestz_b; 2088 break; 2089 case X86::BI__builtin_ia32_ktestchi: 2090 IID = Intrinsic::x86_avx512_ktestc_w; 2091 break; 2092 case X86::BI__builtin_ia32_ktestzhi: 2093 IID = Intrinsic::x86_avx512_ktestz_w; 2094 break; 2095 case X86::BI__builtin_ia32_ktestcsi: 2096 IID = Intrinsic::x86_avx512_ktestc_d; 2097 break; 2098 case X86::BI__builtin_ia32_ktestzsi: 2099 IID = Intrinsic::x86_avx512_ktestz_d; 2100 break; 2101 case X86::BI__builtin_ia32_ktestcdi: 2102 IID = Intrinsic::x86_avx512_ktestc_q; 2103 break; 2104 case X86::BI__builtin_ia32_ktestzdi: 2105 IID = Intrinsic::x86_avx512_ktestz_q; 2106 break; 2107 } 2108 2109 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); 2110 Value *LHS = getMaskVecValue(*this, Ops[0], NumElts); 2111 Value *RHS = getMaskVecValue(*this, Ops[1], NumElts); 2112 Function *Intr = CGM.getIntrinsic(IID); 2113 return Builder.CreateCall(Intr, {LHS, RHS}); 2114 } 2115 2116 case X86::BI__builtin_ia32_kaddqi: 2117 case X86::BI__builtin_ia32_kaddhi: 2118 case X86::BI__builtin_ia32_kaddsi: 2119 case X86::BI__builtin_ia32_kadddi: { 2120 Intrinsic::ID IID; 2121 switch (BuiltinID) { 2122 default: llvm_unreachable("Unsupported intrinsic!"); 2123 case X86::BI__builtin_ia32_kaddqi: 2124 IID = Intrinsic::x86_avx512_kadd_b; 2125 break; 2126 case X86::BI__builtin_ia32_kaddhi: 2127 IID = Intrinsic::x86_avx512_kadd_w; 2128 break; 2129 case X86::BI__builtin_ia32_kaddsi: 2130 IID = Intrinsic::x86_avx512_kadd_d; 2131 break; 2132 case X86::BI__builtin_ia32_kadddi: 2133 IID = Intrinsic::x86_avx512_kadd_q; 2134 break; 2135 } 2136 2137 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); 2138 Value *LHS = getMaskVecValue(*this, Ops[0], NumElts); 2139 Value *RHS = getMaskVecValue(*this, Ops[1], NumElts); 2140 Function *Intr = CGM.getIntrinsic(IID); 2141 Value *Res = Builder.CreateCall(Intr, {LHS, RHS}); 2142 return Builder.CreateBitCast(Res, Ops[0]->getType()); 2143 } 2144 case X86::BI__builtin_ia32_kandqi: 2145 case X86::BI__builtin_ia32_kandhi: 2146 case X86::BI__builtin_ia32_kandsi: 2147 case X86::BI__builtin_ia32_kanddi: 2148 return EmitX86MaskLogic(*this, Instruction::And, Ops); 2149 case X86::BI__builtin_ia32_kandnqi: 2150 case X86::BI__builtin_ia32_kandnhi: 2151 case X86::BI__builtin_ia32_kandnsi: 2152 case X86::BI__builtin_ia32_kandndi: 2153 return EmitX86MaskLogic(*this, Instruction::And, Ops, true); 2154 case X86::BI__builtin_ia32_korqi: 2155 case X86::BI__builtin_ia32_korhi: 2156 case X86::BI__builtin_ia32_korsi: 2157 case X86::BI__builtin_ia32_kordi: 2158 return EmitX86MaskLogic(*this, Instruction::Or, Ops); 2159 case X86::BI__builtin_ia32_kxnorqi: 2160 case X86::BI__builtin_ia32_kxnorhi: 2161 case X86::BI__builtin_ia32_kxnorsi: 2162 case X86::BI__builtin_ia32_kxnordi: 2163 return EmitX86MaskLogic(*this, Instruction::Xor, Ops, true); 2164 case X86::BI__builtin_ia32_kxorqi: 2165 case X86::BI__builtin_ia32_kxorhi: 2166 case X86::BI__builtin_ia32_kxorsi: 2167 case X86::BI__builtin_ia32_kxordi: 2168 return EmitX86MaskLogic(*this, Instruction::Xor, Ops); 2169 case X86::BI__builtin_ia32_knotqi: 2170 case X86::BI__builtin_ia32_knothi: 2171 case X86::BI__builtin_ia32_knotsi: 2172 case X86::BI__builtin_ia32_knotdi: { 2173 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); 2174 Value *Res = getMaskVecValue(*this, Ops[0], NumElts); 2175 return Builder.CreateBitCast(Builder.CreateNot(Res), 2176 Ops[0]->getType()); 2177 } 2178 case X86::BI__builtin_ia32_kmovb: 2179 case X86::BI__builtin_ia32_kmovw: 2180 case X86::BI__builtin_ia32_kmovd: 2181 case X86::BI__builtin_ia32_kmovq: { 2182 // Bitcast to vXi1 type and then back to integer. This gets the mask 2183 // register type into the IR, but might be optimized out depending on 2184 // what's around it. 2185 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); 2186 Value *Res = getMaskVecValue(*this, Ops[0], NumElts); 2187 return Builder.CreateBitCast(Res, Ops[0]->getType()); 2188 } 2189 2190 case X86::BI__builtin_ia32_kunpckdi: 2191 case X86::BI__builtin_ia32_kunpcksi: 2192 case X86::BI__builtin_ia32_kunpckhi: { 2193 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); 2194 Value *LHS = getMaskVecValue(*this, Ops[0], NumElts); 2195 Value *RHS = getMaskVecValue(*this, Ops[1], NumElts); 2196 int Indices[64]; 2197 for (unsigned i = 0; i != NumElts; ++i) 2198 Indices[i] = i; 2199 2200 // First extract half of each vector. This gives better codegen than 2201 // doing it in a single shuffle. 2202 LHS = Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2)); 2203 RHS = Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2)); 2204 // Concat the vectors. 2205 // NOTE: Operands are swapped to match the intrinsic definition. 2206 Value *Res = 2207 Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts)); 2208 return Builder.CreateBitCast(Res, Ops[0]->getType()); 2209 } 2210 2211 case X86::BI__builtin_ia32_vplzcntd_128: 2212 case X86::BI__builtin_ia32_vplzcntd_256: 2213 case X86::BI__builtin_ia32_vplzcntd_512: 2214 case X86::BI__builtin_ia32_vplzcntq_128: 2215 case X86::BI__builtin_ia32_vplzcntq_256: 2216 case X86::BI__builtin_ia32_vplzcntq_512: { 2217 Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType()); 2218 return Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)}); 2219 } 2220 case X86::BI__builtin_ia32_sqrtss: 2221 case X86::BI__builtin_ia32_sqrtsd: { 2222 Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0); 2223 Function *F; 2224 if (Builder.getIsFPConstrained()) { 2225 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); 2226 F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt, 2227 A->getType()); 2228 A = Builder.CreateConstrainedFPCall(F, {A}); 2229 } else { 2230 F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType()); 2231 A = Builder.CreateCall(F, {A}); 2232 } 2233 return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0); 2234 } 2235 case X86::BI__builtin_ia32_sqrtsh_round_mask: 2236 case X86::BI__builtin_ia32_sqrtsd_round_mask: 2237 case X86::BI__builtin_ia32_sqrtss_round_mask: { 2238 unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue(); 2239 // Support only if the rounding mode is 4 (AKA CUR_DIRECTION), 2240 // otherwise keep the intrinsic. 2241 if (CC != 4) { 2242 Intrinsic::ID IID; 2243 2244 switch (BuiltinID) { 2245 default: 2246 llvm_unreachable("Unsupported intrinsic!"); 2247 case X86::BI__builtin_ia32_sqrtsh_round_mask: 2248 IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh; 2249 break; 2250 case X86::BI__builtin_ia32_sqrtsd_round_mask: 2251 IID = Intrinsic::x86_avx512_mask_sqrt_sd; 2252 break; 2253 case X86::BI__builtin_ia32_sqrtss_round_mask: 2254 IID = Intrinsic::x86_avx512_mask_sqrt_ss; 2255 break; 2256 } 2257 return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); 2258 } 2259 Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); 2260 Function *F; 2261 if (Builder.getIsFPConstrained()) { 2262 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); 2263 F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt, 2264 A->getType()); 2265 A = Builder.CreateConstrainedFPCall(F, A); 2266 } else { 2267 F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType()); 2268 A = Builder.CreateCall(F, A); 2269 } 2270 Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0); 2271 A = EmitX86ScalarSelect(*this, Ops[3], A, Src); 2272 return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0); 2273 } 2274 case X86::BI__builtin_ia32_sqrtpd256: 2275 case X86::BI__builtin_ia32_sqrtpd: 2276 case X86::BI__builtin_ia32_sqrtps256: 2277 case X86::BI__builtin_ia32_sqrtps: 2278 case X86::BI__builtin_ia32_sqrtph256: 2279 case X86::BI__builtin_ia32_sqrtph: 2280 case X86::BI__builtin_ia32_sqrtph512: 2281 case X86::BI__builtin_ia32_vsqrtbf16256: 2282 case X86::BI__builtin_ia32_vsqrtbf16: 2283 case X86::BI__builtin_ia32_vsqrtbf16512: 2284 case X86::BI__builtin_ia32_sqrtps512: 2285 case X86::BI__builtin_ia32_sqrtpd512: { 2286 if (Ops.size() == 2) { 2287 unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); 2288 // Support only if the rounding mode is 4 (AKA CUR_DIRECTION), 2289 // otherwise keep the intrinsic. 2290 if (CC != 4) { 2291 Intrinsic::ID IID; 2292 2293 switch (BuiltinID) { 2294 default: 2295 llvm_unreachable("Unsupported intrinsic!"); 2296 case X86::BI__builtin_ia32_sqrtph512: 2297 IID = Intrinsic::x86_avx512fp16_sqrt_ph_512; 2298 break; 2299 case X86::BI__builtin_ia32_sqrtps512: 2300 IID = Intrinsic::x86_avx512_sqrt_ps_512; 2301 break; 2302 case X86::BI__builtin_ia32_sqrtpd512: 2303 IID = Intrinsic::x86_avx512_sqrt_pd_512; 2304 break; 2305 } 2306 return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); 2307 } 2308 } 2309 if (Builder.getIsFPConstrained()) { 2310 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); 2311 Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt, 2312 Ops[0]->getType()); 2313 return Builder.CreateConstrainedFPCall(F, Ops[0]); 2314 } else { 2315 Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType()); 2316 return Builder.CreateCall(F, Ops[0]); 2317 } 2318 } 2319 2320 case X86::BI__builtin_ia32_pmuludq128: 2321 case X86::BI__builtin_ia32_pmuludq256: 2322 case X86::BI__builtin_ia32_pmuludq512: 2323 return EmitX86Muldq(*this, /*IsSigned*/false, Ops); 2324 2325 case X86::BI__builtin_ia32_pmuldq128: 2326 case X86::BI__builtin_ia32_pmuldq256: 2327 case X86::BI__builtin_ia32_pmuldq512: 2328 return EmitX86Muldq(*this, /*IsSigned*/true, Ops); 2329 2330 case X86::BI__builtin_ia32_pternlogd512_mask: 2331 case X86::BI__builtin_ia32_pternlogq512_mask: 2332 case X86::BI__builtin_ia32_pternlogd128_mask: 2333 case X86::BI__builtin_ia32_pternlogd256_mask: 2334 case X86::BI__builtin_ia32_pternlogq128_mask: 2335 case X86::BI__builtin_ia32_pternlogq256_mask: 2336 return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops); 2337 2338 case X86::BI__builtin_ia32_pternlogd512_maskz: 2339 case X86::BI__builtin_ia32_pternlogq512_maskz: 2340 case X86::BI__builtin_ia32_pternlogd128_maskz: 2341 case X86::BI__builtin_ia32_pternlogd256_maskz: 2342 case X86::BI__builtin_ia32_pternlogq128_maskz: 2343 case X86::BI__builtin_ia32_pternlogq256_maskz: 2344 return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops); 2345 2346 case X86::BI__builtin_ia32_vpshldd128: 2347 case X86::BI__builtin_ia32_vpshldd256: 2348 case X86::BI__builtin_ia32_vpshldd512: 2349 case X86::BI__builtin_ia32_vpshldq128: 2350 case X86::BI__builtin_ia32_vpshldq256: 2351 case X86::BI__builtin_ia32_vpshldq512: 2352 case X86::BI__builtin_ia32_vpshldw128: 2353 case X86::BI__builtin_ia32_vpshldw256: 2354 case X86::BI__builtin_ia32_vpshldw512: 2355 return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false); 2356 2357 case X86::BI__builtin_ia32_vpshrdd128: 2358 case X86::BI__builtin_ia32_vpshrdd256: 2359 case X86::BI__builtin_ia32_vpshrdd512: 2360 case X86::BI__builtin_ia32_vpshrdq128: 2361 case X86::BI__builtin_ia32_vpshrdq256: 2362 case X86::BI__builtin_ia32_vpshrdq512: 2363 case X86::BI__builtin_ia32_vpshrdw128: 2364 case X86::BI__builtin_ia32_vpshrdw256: 2365 case X86::BI__builtin_ia32_vpshrdw512: 2366 // Ops 0 and 1 are swapped. 2367 return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true); 2368 2369 case X86::BI__builtin_ia32_vpshldvd128: 2370 case X86::BI__builtin_ia32_vpshldvd256: 2371 case X86::BI__builtin_ia32_vpshldvd512: 2372 case X86::BI__builtin_ia32_vpshldvq128: 2373 case X86::BI__builtin_ia32_vpshldvq256: 2374 case X86::BI__builtin_ia32_vpshldvq512: 2375 case X86::BI__builtin_ia32_vpshldvw128: 2376 case X86::BI__builtin_ia32_vpshldvw256: 2377 case X86::BI__builtin_ia32_vpshldvw512: 2378 return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false); 2379 2380 case X86::BI__builtin_ia32_vpshrdvd128: 2381 case X86::BI__builtin_ia32_vpshrdvd256: 2382 case X86::BI__builtin_ia32_vpshrdvd512: 2383 case X86::BI__builtin_ia32_vpshrdvq128: 2384 case X86::BI__builtin_ia32_vpshrdvq256: 2385 case X86::BI__builtin_ia32_vpshrdvq512: 2386 case X86::BI__builtin_ia32_vpshrdvw128: 2387 case X86::BI__builtin_ia32_vpshrdvw256: 2388 case X86::BI__builtin_ia32_vpshrdvw512: 2389 // Ops 0 and 1 are swapped. 2390 return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true); 2391 2392 // Reductions 2393 case X86::BI__builtin_ia32_reduce_fadd_pd512: 2394 case X86::BI__builtin_ia32_reduce_fadd_ps512: 2395 case X86::BI__builtin_ia32_reduce_fadd_ph512: 2396 case X86::BI__builtin_ia32_reduce_fadd_ph256: 2397 case X86::BI__builtin_ia32_reduce_fadd_ph128: { 2398 Function *F = 2399 CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType()); 2400 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); 2401 Builder.getFastMathFlags().setAllowReassoc(); 2402 return Builder.CreateCall(F, {Ops[0], Ops[1]}); 2403 } 2404 case X86::BI__builtin_ia32_reduce_fmul_pd512: 2405 case X86::BI__builtin_ia32_reduce_fmul_ps512: 2406 case X86::BI__builtin_ia32_reduce_fmul_ph512: 2407 case X86::BI__builtin_ia32_reduce_fmul_ph256: 2408 case X86::BI__builtin_ia32_reduce_fmul_ph128: { 2409 Function *F = 2410 CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType()); 2411 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); 2412 Builder.getFastMathFlags().setAllowReassoc(); 2413 return Builder.CreateCall(F, {Ops[0], Ops[1]}); 2414 } 2415 case X86::BI__builtin_ia32_reduce_fmax_pd512: 2416 case X86::BI__builtin_ia32_reduce_fmax_ps512: 2417 case X86::BI__builtin_ia32_reduce_fmax_ph512: 2418 case X86::BI__builtin_ia32_reduce_fmax_ph256: 2419 case X86::BI__builtin_ia32_reduce_fmax_ph128: { 2420 Function *F = 2421 CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType()); 2422 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); 2423 Builder.getFastMathFlags().setNoNaNs(); 2424 return Builder.CreateCall(F, {Ops[0]}); 2425 } 2426 case X86::BI__builtin_ia32_reduce_fmin_pd512: 2427 case X86::BI__builtin_ia32_reduce_fmin_ps512: 2428 case X86::BI__builtin_ia32_reduce_fmin_ph512: 2429 case X86::BI__builtin_ia32_reduce_fmin_ph256: 2430 case X86::BI__builtin_ia32_reduce_fmin_ph128: { 2431 Function *F = 2432 CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType()); 2433 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); 2434 Builder.getFastMathFlags().setNoNaNs(); 2435 return Builder.CreateCall(F, {Ops[0]}); 2436 } 2437 2438 case X86::BI__builtin_ia32_rdrand16_step: 2439 case X86::BI__builtin_ia32_rdrand32_step: 2440 case X86::BI__builtin_ia32_rdrand64_step: 2441 case X86::BI__builtin_ia32_rdseed16_step: 2442 case X86::BI__builtin_ia32_rdseed32_step: 2443 case X86::BI__builtin_ia32_rdseed64_step: { 2444 Intrinsic::ID ID; 2445 switch (BuiltinID) { 2446 default: llvm_unreachable("Unsupported intrinsic!"); 2447 case X86::BI__builtin_ia32_rdrand16_step: 2448 ID = Intrinsic::x86_rdrand_16; 2449 break; 2450 case X86::BI__builtin_ia32_rdrand32_step: 2451 ID = Intrinsic::x86_rdrand_32; 2452 break; 2453 case X86::BI__builtin_ia32_rdrand64_step: 2454 ID = Intrinsic::x86_rdrand_64; 2455 break; 2456 case X86::BI__builtin_ia32_rdseed16_step: 2457 ID = Intrinsic::x86_rdseed_16; 2458 break; 2459 case X86::BI__builtin_ia32_rdseed32_step: 2460 ID = Intrinsic::x86_rdseed_32; 2461 break; 2462 case X86::BI__builtin_ia32_rdseed64_step: 2463 ID = Intrinsic::x86_rdseed_64; 2464 break; 2465 } 2466 2467 Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID)); 2468 Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 0), 2469 Ops[0]); 2470 return Builder.CreateExtractValue(Call, 1); 2471 } 2472 case X86::BI__builtin_ia32_addcarryx_u32: 2473 case X86::BI__builtin_ia32_addcarryx_u64: 2474 case X86::BI__builtin_ia32_subborrow_u32: 2475 case X86::BI__builtin_ia32_subborrow_u64: { 2476 Intrinsic::ID IID; 2477 switch (BuiltinID) { 2478 default: llvm_unreachable("Unsupported intrinsic!"); 2479 case X86::BI__builtin_ia32_addcarryx_u32: 2480 IID = Intrinsic::x86_addcarry_32; 2481 break; 2482 case X86::BI__builtin_ia32_addcarryx_u64: 2483 IID = Intrinsic::x86_addcarry_64; 2484 break; 2485 case X86::BI__builtin_ia32_subborrow_u32: 2486 IID = Intrinsic::x86_subborrow_32; 2487 break; 2488 case X86::BI__builtin_ia32_subborrow_u64: 2489 IID = Intrinsic::x86_subborrow_64; 2490 break; 2491 } 2492 2493 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), 2494 { Ops[0], Ops[1], Ops[2] }); 2495 Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1), 2496 Ops[3]); 2497 return Builder.CreateExtractValue(Call, 0); 2498 } 2499 2500 case X86::BI__builtin_ia32_fpclassps128_mask: 2501 case X86::BI__builtin_ia32_fpclassps256_mask: 2502 case X86::BI__builtin_ia32_fpclassps512_mask: 2503 case X86::BI__builtin_ia32_vfpclassbf16128_mask: 2504 case X86::BI__builtin_ia32_vfpclassbf16256_mask: 2505 case X86::BI__builtin_ia32_vfpclassbf16512_mask: 2506 case X86::BI__builtin_ia32_fpclassph128_mask: 2507 case X86::BI__builtin_ia32_fpclassph256_mask: 2508 case X86::BI__builtin_ia32_fpclassph512_mask: 2509 case X86::BI__builtin_ia32_fpclasspd128_mask: 2510 case X86::BI__builtin_ia32_fpclasspd256_mask: 2511 case X86::BI__builtin_ia32_fpclasspd512_mask: { 2512 unsigned NumElts = 2513 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 2514 Value *MaskIn = Ops[2]; 2515 Ops.erase(&Ops[2]); 2516 2517 Intrinsic::ID ID; 2518 switch (BuiltinID) { 2519 default: llvm_unreachable("Unsupported intrinsic!"); 2520 case X86::BI__builtin_ia32_vfpclassbf16128_mask: 2521 ID = Intrinsic::x86_avx10_fpclass_bf16_128; 2522 break; 2523 case X86::BI__builtin_ia32_vfpclassbf16256_mask: 2524 ID = Intrinsic::x86_avx10_fpclass_bf16_256; 2525 break; 2526 case X86::BI__builtin_ia32_vfpclassbf16512_mask: 2527 ID = Intrinsic::x86_avx10_fpclass_bf16_512; 2528 break; 2529 case X86::BI__builtin_ia32_fpclassph128_mask: 2530 ID = Intrinsic::x86_avx512fp16_fpclass_ph_128; 2531 break; 2532 case X86::BI__builtin_ia32_fpclassph256_mask: 2533 ID = Intrinsic::x86_avx512fp16_fpclass_ph_256; 2534 break; 2535 case X86::BI__builtin_ia32_fpclassph512_mask: 2536 ID = Intrinsic::x86_avx512fp16_fpclass_ph_512; 2537 break; 2538 case X86::BI__builtin_ia32_fpclassps128_mask: 2539 ID = Intrinsic::x86_avx512_fpclass_ps_128; 2540 break; 2541 case X86::BI__builtin_ia32_fpclassps256_mask: 2542 ID = Intrinsic::x86_avx512_fpclass_ps_256; 2543 break; 2544 case X86::BI__builtin_ia32_fpclassps512_mask: 2545 ID = Intrinsic::x86_avx512_fpclass_ps_512; 2546 break; 2547 case X86::BI__builtin_ia32_fpclasspd128_mask: 2548 ID = Intrinsic::x86_avx512_fpclass_pd_128; 2549 break; 2550 case X86::BI__builtin_ia32_fpclasspd256_mask: 2551 ID = Intrinsic::x86_avx512_fpclass_pd_256; 2552 break; 2553 case X86::BI__builtin_ia32_fpclasspd512_mask: 2554 ID = Intrinsic::x86_avx512_fpclass_pd_512; 2555 break; 2556 } 2557 2558 Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops); 2559 return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn); 2560 } 2561 2562 case X86::BI__builtin_ia32_vp2intersect_q_512: 2563 case X86::BI__builtin_ia32_vp2intersect_q_256: 2564 case X86::BI__builtin_ia32_vp2intersect_q_128: 2565 case X86::BI__builtin_ia32_vp2intersect_d_512: 2566 case X86::BI__builtin_ia32_vp2intersect_d_256: 2567 case X86::BI__builtin_ia32_vp2intersect_d_128: { 2568 unsigned NumElts = 2569 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 2570 Intrinsic::ID ID; 2571 2572 switch (BuiltinID) { 2573 default: llvm_unreachable("Unsupported intrinsic!"); 2574 case X86::BI__builtin_ia32_vp2intersect_q_512: 2575 ID = Intrinsic::x86_avx512_vp2intersect_q_512; 2576 break; 2577 case X86::BI__builtin_ia32_vp2intersect_q_256: 2578 ID = Intrinsic::x86_avx512_vp2intersect_q_256; 2579 break; 2580 case X86::BI__builtin_ia32_vp2intersect_q_128: 2581 ID = Intrinsic::x86_avx512_vp2intersect_q_128; 2582 break; 2583 case X86::BI__builtin_ia32_vp2intersect_d_512: 2584 ID = Intrinsic::x86_avx512_vp2intersect_d_512; 2585 break; 2586 case X86::BI__builtin_ia32_vp2intersect_d_256: 2587 ID = Intrinsic::x86_avx512_vp2intersect_d_256; 2588 break; 2589 case X86::BI__builtin_ia32_vp2intersect_d_128: 2590 ID = Intrinsic::x86_avx512_vp2intersect_d_128; 2591 break; 2592 } 2593 2594 Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID), {Ops[0], Ops[1]}); 2595 Value *Result = Builder.CreateExtractValue(Call, 0); 2596 Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr); 2597 Builder.CreateDefaultAlignedStore(Result, Ops[2]); 2598 2599 Result = Builder.CreateExtractValue(Call, 1); 2600 Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr); 2601 return Builder.CreateDefaultAlignedStore(Result, Ops[3]); 2602 } 2603 2604 case X86::BI__builtin_ia32_vpmultishiftqb128: 2605 case X86::BI__builtin_ia32_vpmultishiftqb256: 2606 case X86::BI__builtin_ia32_vpmultishiftqb512: { 2607 Intrinsic::ID ID; 2608 switch (BuiltinID) { 2609 default: llvm_unreachable("Unsupported intrinsic!"); 2610 case X86::BI__builtin_ia32_vpmultishiftqb128: 2611 ID = Intrinsic::x86_avx512_pmultishift_qb_128; 2612 break; 2613 case X86::BI__builtin_ia32_vpmultishiftqb256: 2614 ID = Intrinsic::x86_avx512_pmultishift_qb_256; 2615 break; 2616 case X86::BI__builtin_ia32_vpmultishiftqb512: 2617 ID = Intrinsic::x86_avx512_pmultishift_qb_512; 2618 break; 2619 } 2620 2621 return Builder.CreateCall(CGM.getIntrinsic(ID), Ops); 2622 } 2623 2624 case X86::BI__builtin_ia32_vpshufbitqmb128_mask: 2625 case X86::BI__builtin_ia32_vpshufbitqmb256_mask: 2626 case X86::BI__builtin_ia32_vpshufbitqmb512_mask: { 2627 unsigned NumElts = 2628 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 2629 Value *MaskIn = Ops[2]; 2630 Ops.erase(&Ops[2]); 2631 2632 Intrinsic::ID ID; 2633 switch (BuiltinID) { 2634 default: llvm_unreachable("Unsupported intrinsic!"); 2635 case X86::BI__builtin_ia32_vpshufbitqmb128_mask: 2636 ID = Intrinsic::x86_avx512_vpshufbitqmb_128; 2637 break; 2638 case X86::BI__builtin_ia32_vpshufbitqmb256_mask: 2639 ID = Intrinsic::x86_avx512_vpshufbitqmb_256; 2640 break; 2641 case X86::BI__builtin_ia32_vpshufbitqmb512_mask: 2642 ID = Intrinsic::x86_avx512_vpshufbitqmb_512; 2643 break; 2644 } 2645 2646 Value *Shufbit = Builder.CreateCall(CGM.getIntrinsic(ID), Ops); 2647 return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn); 2648 } 2649 2650 // packed comparison intrinsics 2651 case X86::BI__builtin_ia32_cmpeqps: 2652 case X86::BI__builtin_ia32_cmpeqpd: 2653 return getVectorFCmpIR(CmpInst::FCMP_OEQ, /*IsSignaling*/false); 2654 case X86::BI__builtin_ia32_cmpltps: 2655 case X86::BI__builtin_ia32_cmpltpd: 2656 return getVectorFCmpIR(CmpInst::FCMP_OLT, /*IsSignaling*/true); 2657 case X86::BI__builtin_ia32_cmpleps: 2658 case X86::BI__builtin_ia32_cmplepd: 2659 return getVectorFCmpIR(CmpInst::FCMP_OLE, /*IsSignaling*/true); 2660 case X86::BI__builtin_ia32_cmpunordps: 2661 case X86::BI__builtin_ia32_cmpunordpd: 2662 return getVectorFCmpIR(CmpInst::FCMP_UNO, /*IsSignaling*/false); 2663 case X86::BI__builtin_ia32_cmpneqps: 2664 case X86::BI__builtin_ia32_cmpneqpd: 2665 return getVectorFCmpIR(CmpInst::FCMP_UNE, /*IsSignaling*/false); 2666 case X86::BI__builtin_ia32_cmpnltps: 2667 case X86::BI__builtin_ia32_cmpnltpd: 2668 return getVectorFCmpIR(CmpInst::FCMP_UGE, /*IsSignaling*/true); 2669 case X86::BI__builtin_ia32_cmpnleps: 2670 case X86::BI__builtin_ia32_cmpnlepd: 2671 return getVectorFCmpIR(CmpInst::FCMP_UGT, /*IsSignaling*/true); 2672 case X86::BI__builtin_ia32_cmpordps: 2673 case X86::BI__builtin_ia32_cmpordpd: 2674 return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false); 2675 case X86::BI__builtin_ia32_cmpph128_mask: 2676 case X86::BI__builtin_ia32_cmpph256_mask: 2677 case X86::BI__builtin_ia32_cmpph512_mask: 2678 case X86::BI__builtin_ia32_cmpps128_mask: 2679 case X86::BI__builtin_ia32_cmpps256_mask: 2680 case X86::BI__builtin_ia32_cmpps512_mask: 2681 case X86::BI__builtin_ia32_cmppd128_mask: 2682 case X86::BI__builtin_ia32_cmppd256_mask: 2683 case X86::BI__builtin_ia32_cmppd512_mask: 2684 case X86::BI__builtin_ia32_vcmpbf16512_mask: 2685 case X86::BI__builtin_ia32_vcmpbf16256_mask: 2686 case X86::BI__builtin_ia32_vcmpbf16128_mask: 2687 IsMaskFCmp = true; 2688 [[fallthrough]]; 2689 case X86::BI__builtin_ia32_cmpps: 2690 case X86::BI__builtin_ia32_cmpps256: 2691 case X86::BI__builtin_ia32_cmppd: 2692 case X86::BI__builtin_ia32_cmppd256: { 2693 // Lowering vector comparisons to fcmp instructions, while 2694 // ignoring signalling behaviour requested 2695 // ignoring rounding mode requested 2696 // This is only possible if fp-model is not strict and FENV_ACCESS is off. 2697 2698 // The third argument is the comparison condition, and integer in the 2699 // range [0, 31] 2700 unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f; 2701 2702 // Lowering to IR fcmp instruction. 2703 // Ignoring requested signaling behaviour, 2704 // e.g. both _CMP_GT_OS & _CMP_GT_OQ are translated to FCMP_OGT. 2705 FCmpInst::Predicate Pred; 2706 bool IsSignaling; 2707 // Predicates for 16-31 repeat the 0-15 predicates. Only the signalling 2708 // behavior is inverted. We'll handle that after the switch. 2709 switch (CC & 0xf) { 2710 case 0x00: Pred = FCmpInst::FCMP_OEQ; IsSignaling = false; break; 2711 case 0x01: Pred = FCmpInst::FCMP_OLT; IsSignaling = true; break; 2712 case 0x02: Pred = FCmpInst::FCMP_OLE; IsSignaling = true; break; 2713 case 0x03: Pred = FCmpInst::FCMP_UNO; IsSignaling = false; break; 2714 case 0x04: Pred = FCmpInst::FCMP_UNE; IsSignaling = false; break; 2715 case 0x05: Pred = FCmpInst::FCMP_UGE; IsSignaling = true; break; 2716 case 0x06: Pred = FCmpInst::FCMP_UGT; IsSignaling = true; break; 2717 case 0x07: Pred = FCmpInst::FCMP_ORD; IsSignaling = false; break; 2718 case 0x08: Pred = FCmpInst::FCMP_UEQ; IsSignaling = false; break; 2719 case 0x09: Pred = FCmpInst::FCMP_ULT; IsSignaling = true; break; 2720 case 0x0a: Pred = FCmpInst::FCMP_ULE; IsSignaling = true; break; 2721 case 0x0b: Pred = FCmpInst::FCMP_FALSE; IsSignaling = false; break; 2722 case 0x0c: Pred = FCmpInst::FCMP_ONE; IsSignaling = false; break; 2723 case 0x0d: Pred = FCmpInst::FCMP_OGE; IsSignaling = true; break; 2724 case 0x0e: Pred = FCmpInst::FCMP_OGT; IsSignaling = true; break; 2725 case 0x0f: Pred = FCmpInst::FCMP_TRUE; IsSignaling = false; break; 2726 default: llvm_unreachable("Unhandled CC"); 2727 } 2728 2729 // Invert the signalling behavior for 16-31. 2730 if (CC & 0x10) 2731 IsSignaling = !IsSignaling; 2732 2733 // If the predicate is true or false and we're using constrained intrinsics, 2734 // we don't have a compare intrinsic we can use. Just use the legacy X86 2735 // specific intrinsic. 2736 // If the intrinsic is mask enabled and we're using constrained intrinsics, 2737 // use the legacy X86 specific intrinsic. 2738 if (Builder.getIsFPConstrained() && 2739 (Pred == FCmpInst::FCMP_TRUE || Pred == FCmpInst::FCMP_FALSE || 2740 IsMaskFCmp)) { 2741 2742 Intrinsic::ID IID; 2743 switch (BuiltinID) { 2744 default: llvm_unreachable("Unexpected builtin"); 2745 case X86::BI__builtin_ia32_cmpps: 2746 IID = Intrinsic::x86_sse_cmp_ps; 2747 break; 2748 case X86::BI__builtin_ia32_cmpps256: 2749 IID = Intrinsic::x86_avx_cmp_ps_256; 2750 break; 2751 case X86::BI__builtin_ia32_cmppd: 2752 IID = Intrinsic::x86_sse2_cmp_pd; 2753 break; 2754 case X86::BI__builtin_ia32_cmppd256: 2755 IID = Intrinsic::x86_avx_cmp_pd_256; 2756 break; 2757 case X86::BI__builtin_ia32_cmpph128_mask: 2758 IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_128; 2759 break; 2760 case X86::BI__builtin_ia32_cmpph256_mask: 2761 IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_256; 2762 break; 2763 case X86::BI__builtin_ia32_cmpph512_mask: 2764 IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_512; 2765 break; 2766 case X86::BI__builtin_ia32_cmpps512_mask: 2767 IID = Intrinsic::x86_avx512_mask_cmp_ps_512; 2768 break; 2769 case X86::BI__builtin_ia32_cmppd512_mask: 2770 IID = Intrinsic::x86_avx512_mask_cmp_pd_512; 2771 break; 2772 case X86::BI__builtin_ia32_cmpps128_mask: 2773 IID = Intrinsic::x86_avx512_mask_cmp_ps_128; 2774 break; 2775 case X86::BI__builtin_ia32_cmpps256_mask: 2776 IID = Intrinsic::x86_avx512_mask_cmp_ps_256; 2777 break; 2778 case X86::BI__builtin_ia32_cmppd128_mask: 2779 IID = Intrinsic::x86_avx512_mask_cmp_pd_128; 2780 break; 2781 case X86::BI__builtin_ia32_cmppd256_mask: 2782 IID = Intrinsic::x86_avx512_mask_cmp_pd_256; 2783 break; 2784 } 2785 2786 Function *Intr = CGM.getIntrinsic(IID); 2787 if (IsMaskFCmp) { 2788 unsigned NumElts = 2789 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 2790 Ops[3] = getMaskVecValue(*this, Ops[3], NumElts); 2791 Value *Cmp = Builder.CreateCall(Intr, Ops); 2792 return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr); 2793 } 2794 2795 return Builder.CreateCall(Intr, Ops); 2796 } 2797 2798 // Builtins without the _mask suffix return a vector of integers 2799 // of the same width as the input vectors 2800 if (IsMaskFCmp) { 2801 // We ignore SAE if strict FP is disabled. We only keep precise 2802 // exception behavior under strict FP. 2803 // NOTE: If strict FP does ever go through here a CGFPOptionsRAII 2804 // object will be required. 2805 unsigned NumElts = 2806 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); 2807 Value *Cmp; 2808 if (IsSignaling) 2809 Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]); 2810 else 2811 Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]); 2812 return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]); 2813 } 2814 2815 return getVectorFCmpIR(Pred, IsSignaling); 2816 } 2817 2818 // SSE scalar comparison intrinsics 2819 case X86::BI__builtin_ia32_cmpeqss: 2820 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0); 2821 case X86::BI__builtin_ia32_cmpltss: 2822 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1); 2823 case X86::BI__builtin_ia32_cmpless: 2824 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2); 2825 case X86::BI__builtin_ia32_cmpunordss: 2826 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3); 2827 case X86::BI__builtin_ia32_cmpneqss: 2828 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4); 2829 case X86::BI__builtin_ia32_cmpnltss: 2830 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5); 2831 case X86::BI__builtin_ia32_cmpnless: 2832 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6); 2833 case X86::BI__builtin_ia32_cmpordss: 2834 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7); 2835 case X86::BI__builtin_ia32_cmpeqsd: 2836 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0); 2837 case X86::BI__builtin_ia32_cmpltsd: 2838 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1); 2839 case X86::BI__builtin_ia32_cmplesd: 2840 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2); 2841 case X86::BI__builtin_ia32_cmpunordsd: 2842 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3); 2843 case X86::BI__builtin_ia32_cmpneqsd: 2844 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4); 2845 case X86::BI__builtin_ia32_cmpnltsd: 2846 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5); 2847 case X86::BI__builtin_ia32_cmpnlesd: 2848 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6); 2849 case X86::BI__builtin_ia32_cmpordsd: 2850 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7); 2851 2852 // f16c half2float intrinsics 2853 case X86::BI__builtin_ia32_vcvtph2ps: 2854 case X86::BI__builtin_ia32_vcvtph2ps256: 2855 case X86::BI__builtin_ia32_vcvtph2ps_mask: 2856 case X86::BI__builtin_ia32_vcvtph2ps256_mask: 2857 case X86::BI__builtin_ia32_vcvtph2ps512_mask: { 2858 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); 2859 return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType())); 2860 } 2861 2862 // AVX512 bf16 intrinsics 2863 case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: { 2864 Ops[2] = getMaskVecValue( 2865 *this, Ops[2], 2866 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements()); 2867 Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128; 2868 return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); 2869 } 2870 case X86::BI__builtin_ia32_cvtsbf162ss_32: 2871 return Builder.CreateFPExt(Ops[0], Builder.getFloatTy()); 2872 2873 case X86::BI__builtin_ia32_cvtneps2bf16_256_mask: 2874 case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: { 2875 Intrinsic::ID IID; 2876 switch (BuiltinID) { 2877 default: llvm_unreachable("Unsupported intrinsic!"); 2878 case X86::BI__builtin_ia32_cvtneps2bf16_256_mask: 2879 IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_256; 2880 break; 2881 case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: 2882 IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_512; 2883 break; 2884 } 2885 Value *Res = Builder.CreateCall(CGM.getIntrinsic(IID), Ops[0]); 2886 return EmitX86Select(*this, Ops[2], Res, Ops[1]); 2887 } 2888 2889 case X86::BI__cpuid: 2890 case X86::BI__cpuidex: { 2891 Value *FuncId = EmitScalarExpr(E->getArg(1)); 2892 Value *SubFuncId = BuiltinID == X86::BI__cpuidex 2893 ? EmitScalarExpr(E->getArg(2)) 2894 : llvm::ConstantInt::get(Int32Ty, 0); 2895 2896 llvm::StructType *CpuidRetTy = 2897 llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, Int32Ty); 2898 llvm::FunctionType *FTy = 2899 llvm::FunctionType::get(CpuidRetTy, {Int32Ty, Int32Ty}, false); 2900 2901 StringRef Asm, Constraints; 2902 if (getTarget().getTriple().getArch() == llvm::Triple::x86) { 2903 Asm = "cpuid"; 2904 Constraints = "={ax},={bx},={cx},={dx},{ax},{cx}"; 2905 } else { 2906 // x86-64 uses %rbx as the base register, so preserve it. 2907 Asm = "xchgq %rbx, ${1:q}\n" 2908 "cpuid\n" 2909 "xchgq %rbx, ${1:q}"; 2910 Constraints = "={ax},=r,={cx},={dx},0,2"; 2911 } 2912 2913 llvm::InlineAsm *IA = llvm::InlineAsm::get(FTy, Asm, Constraints, 2914 /*hasSideEffects=*/false); 2915 Value *IACall = Builder.CreateCall(IA, {FuncId, SubFuncId}); 2916 Value *BasePtr = EmitScalarExpr(E->getArg(0)); 2917 Value *Store = nullptr; 2918 for (unsigned i = 0; i < 4; i++) { 2919 Value *Extracted = Builder.CreateExtractValue(IACall, i); 2920 Value *StorePtr = Builder.CreateConstInBoundsGEP1_32(Int32Ty, BasePtr, i); 2921 Store = Builder.CreateAlignedStore(Extracted, StorePtr, getIntAlign()); 2922 } 2923 2924 // Return the last store instruction to signal that we have emitted the 2925 // the intrinsic. 2926 return Store; 2927 } 2928 2929 case X86::BI__emul: 2930 case X86::BI__emulu: { 2931 llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64); 2932 bool isSigned = (BuiltinID == X86::BI__emul); 2933 Value *LHS = Builder.CreateIntCast(Ops[0], Int64Ty, isSigned); 2934 Value *RHS = Builder.CreateIntCast(Ops[1], Int64Ty, isSigned); 2935 return Builder.CreateMul(LHS, RHS, "", !isSigned, isSigned); 2936 } 2937 case X86::BI__mulh: 2938 case X86::BI__umulh: 2939 case X86::BI_mul128: 2940 case X86::BI_umul128: { 2941 llvm::Type *ResType = ConvertType(E->getType()); 2942 llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128); 2943 2944 bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128); 2945 Value *LHS = Builder.CreateIntCast(Ops[0], Int128Ty, IsSigned); 2946 Value *RHS = Builder.CreateIntCast(Ops[1], Int128Ty, IsSigned); 2947 2948 Value *MulResult, *HigherBits; 2949 if (IsSigned) { 2950 MulResult = Builder.CreateNSWMul(LHS, RHS); 2951 HigherBits = Builder.CreateAShr(MulResult, 64); 2952 } else { 2953 MulResult = Builder.CreateNUWMul(LHS, RHS); 2954 HigherBits = Builder.CreateLShr(MulResult, 64); 2955 } 2956 HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned); 2957 2958 if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh) 2959 return HigherBits; 2960 2961 Address HighBitsAddress = EmitPointerWithAlignment(E->getArg(2)); 2962 Builder.CreateStore(HigherBits, HighBitsAddress); 2963 return Builder.CreateIntCast(MulResult, ResType, IsSigned); 2964 } 2965 2966 case X86::BI__faststorefence: { 2967 return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, 2968 llvm::SyncScope::System); 2969 } 2970 case X86::BI__shiftleft128: 2971 case X86::BI__shiftright128: { 2972 llvm::Function *F = CGM.getIntrinsic( 2973 BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr, 2974 Int64Ty); 2975 // Flip low/high ops and zero-extend amount to matching type. 2976 // shiftleft128(Low, High, Amt) -> fshl(High, Low, Amt) 2977 // shiftright128(Low, High, Amt) -> fshr(High, Low, Amt) 2978 std::swap(Ops[0], Ops[1]); 2979 Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty); 2980 return Builder.CreateCall(F, Ops); 2981 } 2982 case X86::BI_ReadWriteBarrier: 2983 case X86::BI_ReadBarrier: 2984 case X86::BI_WriteBarrier: { 2985 return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, 2986 llvm::SyncScope::SingleThread); 2987 } 2988 2989 case X86::BI_AddressOfReturnAddress: { 2990 Function *F = 2991 CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy); 2992 return Builder.CreateCall(F); 2993 } 2994 case X86::BI__stosb: { 2995 // We treat __stosb as a volatile memset - it may not generate "rep stosb" 2996 // instruction, but it will create a memset that won't be optimized away. 2997 return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true); 2998 } 2999 // Corresponding to intrisics which will return 2 tiles (tile0_tile1). 3000 case X86::BI__builtin_ia32_t2rpntlvwz0_internal: 3001 case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: 3002 case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: 3003 case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: 3004 case X86::BI__builtin_ia32_t2rpntlvwz1_internal: 3005 case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: 3006 case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: 3007 case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: { 3008 Intrinsic::ID IID; 3009 switch (BuiltinID) { 3010 default: 3011 llvm_unreachable("Unsupported intrinsic!"); 3012 case X86::BI__builtin_ia32_t2rpntlvwz0_internal: 3013 IID = Intrinsic::x86_t2rpntlvwz0_internal; 3014 break; 3015 case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: 3016 IID = Intrinsic::x86_t2rpntlvwz0rs_internal; 3017 break; 3018 case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: 3019 IID = Intrinsic::x86_t2rpntlvwz0t1_internal; 3020 break; 3021 case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: 3022 IID = Intrinsic::x86_t2rpntlvwz0rst1_internal; 3023 break; 3024 case X86::BI__builtin_ia32_t2rpntlvwz1_internal: 3025 IID = Intrinsic::x86_t2rpntlvwz1_internal; 3026 break; 3027 case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: 3028 IID = Intrinsic::x86_t2rpntlvwz1rs_internal; 3029 break; 3030 case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: 3031 IID = Intrinsic::x86_t2rpntlvwz1t1_internal; 3032 break; 3033 case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: 3034 IID = Intrinsic::x86_t2rpntlvwz1rst1_internal; 3035 break; 3036 } 3037 3038 // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) 3039 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), 3040 {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]}); 3041 3042 auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>(); 3043 assert(PtrTy && "arg3 must be of pointer type"); 3044 QualType PtreeTy = PtrTy->getPointeeType(); 3045 llvm::Type *TyPtee = ConvertType(PtreeTy); 3046 3047 // Bitcast amx type (x86_amx) to vector type (256 x i32) 3048 // Then store tile0 into DstPtr0 3049 Value *T0 = Builder.CreateExtractValue(Call, 0); 3050 Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, 3051 {TyPtee}, {T0}); 3052 Builder.CreateDefaultAlignedStore(VecT0, Ops[3]); 3053 3054 // Then store tile1 into DstPtr1 3055 Value *T1 = Builder.CreateExtractValue(Call, 1); 3056 Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, 3057 {TyPtee}, {T1}); 3058 Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]); 3059 3060 // Note: Here we escape directly use x86_tilestored64_internal to store 3061 // the results due to it can't make sure the Mem written scope. This may 3062 // cause shapes reloads after first amx intrinsic, which current amx reg- 3063 // ister allocation has no ability to handle it. 3064 3065 return Store; 3066 } 3067 case X86::BI__ud2: 3068 // llvm.trap makes a ud2a instruction on x86. 3069 return EmitTrapCall(Intrinsic::trap); 3070 case X86::BI__int2c: { 3071 // This syscall signals a driver assertion failure in x86 NT kernels. 3072 llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false); 3073 llvm::InlineAsm *IA = 3074 llvm::InlineAsm::get(FTy, "int $$0x2c", "", /*hasSideEffects=*/true); 3075 llvm::AttributeList NoReturnAttr = llvm::AttributeList::get( 3076 getLLVMContext(), llvm::AttributeList::FunctionIndex, 3077 llvm::Attribute::NoReturn); 3078 llvm::CallInst *CI = Builder.CreateCall(IA); 3079 CI->setAttributes(NoReturnAttr); 3080 return CI; 3081 } 3082 case X86::BI__readfsbyte: 3083 case X86::BI__readfsword: 3084 case X86::BI__readfsdword: 3085 case X86::BI__readfsqword: { 3086 llvm::Type *IntTy = ConvertType(E->getType()); 3087 Value *Ptr = Builder.CreateIntToPtr( 3088 Ops[0], llvm::PointerType::get(getLLVMContext(), 257)); 3089 LoadInst *Load = Builder.CreateAlignedLoad( 3090 IntTy, Ptr, getContext().getTypeAlignInChars(E->getType())); 3091 Load->setVolatile(true); 3092 return Load; 3093 } 3094 case X86::BI__readgsbyte: 3095 case X86::BI__readgsword: 3096 case X86::BI__readgsdword: 3097 case X86::BI__readgsqword: { 3098 llvm::Type *IntTy = ConvertType(E->getType()); 3099 Value *Ptr = Builder.CreateIntToPtr( 3100 Ops[0], llvm::PointerType::get(getLLVMContext(), 256)); 3101 LoadInst *Load = Builder.CreateAlignedLoad( 3102 IntTy, Ptr, getContext().getTypeAlignInChars(E->getType())); 3103 Load->setVolatile(true); 3104 return Load; 3105 } 3106 case X86::BI__builtin_ia32_encodekey128_u32: { 3107 Intrinsic::ID IID = Intrinsic::x86_encodekey128; 3108 3109 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1]}); 3110 3111 for (int i = 0; i < 3; ++i) { 3112 Value *Extract = Builder.CreateExtractValue(Call, i + 1); 3113 Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[2], i * 16); 3114 Builder.CreateAlignedStore(Extract, Ptr, Align(1)); 3115 } 3116 3117 return Builder.CreateExtractValue(Call, 0); 3118 } 3119 case X86::BI__builtin_ia32_encodekey256_u32: { 3120 Intrinsic::ID IID = Intrinsic::x86_encodekey256; 3121 3122 Value *Call = 3123 Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1], Ops[2]}); 3124 3125 for (int i = 0; i < 4; ++i) { 3126 Value *Extract = Builder.CreateExtractValue(Call, i + 1); 3127 Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[3], i * 16); 3128 Builder.CreateAlignedStore(Extract, Ptr, Align(1)); 3129 } 3130 3131 return Builder.CreateExtractValue(Call, 0); 3132 } 3133 case X86::BI__builtin_ia32_aesenc128kl_u8: 3134 case X86::BI__builtin_ia32_aesdec128kl_u8: 3135 case X86::BI__builtin_ia32_aesenc256kl_u8: 3136 case X86::BI__builtin_ia32_aesdec256kl_u8: { 3137 Intrinsic::ID IID; 3138 StringRef BlockName; 3139 switch (BuiltinID) { 3140 default: 3141 llvm_unreachable("Unexpected builtin"); 3142 case X86::BI__builtin_ia32_aesenc128kl_u8: 3143 IID = Intrinsic::x86_aesenc128kl; 3144 BlockName = "aesenc128kl"; 3145 break; 3146 case X86::BI__builtin_ia32_aesdec128kl_u8: 3147 IID = Intrinsic::x86_aesdec128kl; 3148 BlockName = "aesdec128kl"; 3149 break; 3150 case X86::BI__builtin_ia32_aesenc256kl_u8: 3151 IID = Intrinsic::x86_aesenc256kl; 3152 BlockName = "aesenc256kl"; 3153 break; 3154 case X86::BI__builtin_ia32_aesdec256kl_u8: 3155 IID = Intrinsic::x86_aesdec256kl; 3156 BlockName = "aesdec256kl"; 3157 break; 3158 } 3159 3160 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[1], Ops[2]}); 3161 3162 BasicBlock *NoError = 3163 createBasicBlock(BlockName + "_no_error", this->CurFn); 3164 BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn); 3165 BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn); 3166 3167 Value *Ret = Builder.CreateExtractValue(Call, 0); 3168 Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty()); 3169 Value *Out = Builder.CreateExtractValue(Call, 1); 3170 Builder.CreateCondBr(Succ, NoError, Error); 3171 3172 Builder.SetInsertPoint(NoError); 3173 Builder.CreateDefaultAlignedStore(Out, Ops[0]); 3174 Builder.CreateBr(End); 3175 3176 Builder.SetInsertPoint(Error); 3177 Constant *Zero = llvm::Constant::getNullValue(Out->getType()); 3178 Builder.CreateDefaultAlignedStore(Zero, Ops[0]); 3179 Builder.CreateBr(End); 3180 3181 Builder.SetInsertPoint(End); 3182 return Builder.CreateExtractValue(Call, 0); 3183 } 3184 case X86::BI__builtin_ia32_aesencwide128kl_u8: 3185 case X86::BI__builtin_ia32_aesdecwide128kl_u8: 3186 case X86::BI__builtin_ia32_aesencwide256kl_u8: 3187 case X86::BI__builtin_ia32_aesdecwide256kl_u8: { 3188 Intrinsic::ID IID; 3189 StringRef BlockName; 3190 switch (BuiltinID) { 3191 case X86::BI__builtin_ia32_aesencwide128kl_u8: 3192 IID = Intrinsic::x86_aesencwide128kl; 3193 BlockName = "aesencwide128kl"; 3194 break; 3195 case X86::BI__builtin_ia32_aesdecwide128kl_u8: 3196 IID = Intrinsic::x86_aesdecwide128kl; 3197 BlockName = "aesdecwide128kl"; 3198 break; 3199 case X86::BI__builtin_ia32_aesencwide256kl_u8: 3200 IID = Intrinsic::x86_aesencwide256kl; 3201 BlockName = "aesencwide256kl"; 3202 break; 3203 case X86::BI__builtin_ia32_aesdecwide256kl_u8: 3204 IID = Intrinsic::x86_aesdecwide256kl; 3205 BlockName = "aesdecwide256kl"; 3206 break; 3207 } 3208 3209 llvm::Type *Ty = FixedVectorType::get(Builder.getInt64Ty(), 2); 3210 Value *InOps[9]; 3211 InOps[0] = Ops[2]; 3212 for (int i = 0; i != 8; ++i) { 3213 Value *Ptr = Builder.CreateConstGEP1_32(Ty, Ops[1], i); 3214 InOps[i + 1] = Builder.CreateAlignedLoad(Ty, Ptr, Align(16)); 3215 } 3216 3217 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), InOps); 3218 3219 BasicBlock *NoError = 3220 createBasicBlock(BlockName + "_no_error", this->CurFn); 3221 BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn); 3222 BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn); 3223 3224 Value *Ret = Builder.CreateExtractValue(Call, 0); 3225 Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty()); 3226 Builder.CreateCondBr(Succ, NoError, Error); 3227 3228 Builder.SetInsertPoint(NoError); 3229 for (int i = 0; i != 8; ++i) { 3230 Value *Extract = Builder.CreateExtractValue(Call, i + 1); 3231 Value *Ptr = Builder.CreateConstGEP1_32(Extract->getType(), Ops[0], i); 3232 Builder.CreateAlignedStore(Extract, Ptr, Align(16)); 3233 } 3234 Builder.CreateBr(End); 3235 3236 Builder.SetInsertPoint(Error); 3237 for (int i = 0; i != 8; ++i) { 3238 Value *Out = Builder.CreateExtractValue(Call, i + 1); 3239 Constant *Zero = llvm::Constant::getNullValue(Out->getType()); 3240 Value *Ptr = Builder.CreateConstGEP1_32(Out->getType(), Ops[0], i); 3241 Builder.CreateAlignedStore(Zero, Ptr, Align(16)); 3242 } 3243 Builder.CreateBr(End); 3244 3245 Builder.SetInsertPoint(End); 3246 return Builder.CreateExtractValue(Call, 0); 3247 } 3248 case X86::BI__builtin_ia32_vfcmaddcph512_mask: 3249 IsConjFMA = true; 3250 [[fallthrough]]; 3251 case X86::BI__builtin_ia32_vfmaddcph512_mask: { 3252 Intrinsic::ID IID = IsConjFMA 3253 ? Intrinsic::x86_avx512fp16_mask_vfcmadd_cph_512 3254 : Intrinsic::x86_avx512fp16_mask_vfmadd_cph_512; 3255 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops); 3256 return EmitX86Select(*this, Ops[3], Call, Ops[0]); 3257 } 3258 case X86::BI__builtin_ia32_vfcmaddcsh_round_mask: 3259 IsConjFMA = true; 3260 [[fallthrough]]; 3261 case X86::BI__builtin_ia32_vfmaddcsh_round_mask: { 3262 Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh 3263 : Intrinsic::x86_avx512fp16_mask_vfmadd_csh; 3264 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops); 3265 Value *And = Builder.CreateAnd(Ops[3], llvm::ConstantInt::get(Int8Ty, 1)); 3266 return EmitX86Select(*this, And, Call, Ops[0]); 3267 } 3268 case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3: 3269 IsConjFMA = true; 3270 [[fallthrough]]; 3271 case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: { 3272 Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh 3273 : Intrinsic::x86_avx512fp16_mask_vfmadd_csh; 3274 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops); 3275 static constexpr int Mask[] = {0, 5, 6, 7}; 3276 return Builder.CreateShuffleVector(Call, Ops[2], Mask); 3277 } 3278 case X86::BI__builtin_ia32_prefetchi: 3279 return Builder.CreateCall( 3280 CGM.getIntrinsic(Intrinsic::prefetch, Ops[0]->getType()), 3281 {Ops[0], llvm::ConstantInt::get(Int32Ty, 0), Ops[1], 3282 llvm::ConstantInt::get(Int32Ty, 0)}); 3283 } 3284 } 3285