xref: /freebsd/contrib/llvm-project/clang/lib/CodeGen/TargetBuiltins/X86.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===---------- X86.cpp - Emit LLVM Code for builtins ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This contains code to emit Builtin calls as LLVM code.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "CGBuiltin.h"
14 #include "clang/Basic/TargetBuiltins.h"
15 #include "llvm/IR/InlineAsm.h"
16 #include "llvm/IR/IntrinsicsX86.h"
17 #include "llvm/TargetParser/X86TargetParser.h"
18 
19 using namespace clang;
20 using namespace CodeGen;
21 using namespace llvm;
22 
23 static std::optional<CodeGenFunction::MSVCIntrin>
translateX86ToMsvcIntrin(unsigned BuiltinID)24 translateX86ToMsvcIntrin(unsigned BuiltinID) {
25   using MSVCIntrin = CodeGenFunction::MSVCIntrin;
26   switch (BuiltinID) {
27   default:
28     return std::nullopt;
29   case clang::X86::BI_BitScanForward:
30   case clang::X86::BI_BitScanForward64:
31     return MSVCIntrin::_BitScanForward;
32   case clang::X86::BI_BitScanReverse:
33   case clang::X86::BI_BitScanReverse64:
34     return MSVCIntrin::_BitScanReverse;
35   case clang::X86::BI_InterlockedAnd64:
36     return MSVCIntrin::_InterlockedAnd;
37   case clang::X86::BI_InterlockedCompareExchange128:
38     return MSVCIntrin::_InterlockedCompareExchange128;
39   case clang::X86::BI_InterlockedExchange64:
40     return MSVCIntrin::_InterlockedExchange;
41   case clang::X86::BI_InterlockedExchangeAdd64:
42     return MSVCIntrin::_InterlockedExchangeAdd;
43   case clang::X86::BI_InterlockedExchangeSub64:
44     return MSVCIntrin::_InterlockedExchangeSub;
45   case clang::X86::BI_InterlockedOr64:
46     return MSVCIntrin::_InterlockedOr;
47   case clang::X86::BI_InterlockedXor64:
48     return MSVCIntrin::_InterlockedXor;
49   case clang::X86::BI_InterlockedDecrement64:
50     return MSVCIntrin::_InterlockedDecrement;
51   case clang::X86::BI_InterlockedIncrement64:
52     return MSVCIntrin::_InterlockedIncrement;
53   }
54   llvm_unreachable("must return from switch");
55 }
56 
57 // Convert the mask from an integer type to a vector of i1.
getMaskVecValue(CodeGenFunction & CGF,Value * Mask,unsigned NumElts)58 static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
59                               unsigned NumElts) {
60 
61   auto *MaskTy = llvm::FixedVectorType::get(
62       CGF.Builder.getInt1Ty(),
63       cast<IntegerType>(Mask->getType())->getBitWidth());
64   Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy);
65 
66   // If we have less than 8 elements, then the starting mask was an i8 and
67   // we need to extract down to the right number of elements.
68   if (NumElts < 8) {
69     int Indices[4];
70     for (unsigned i = 0; i != NumElts; ++i)
71       Indices[i] = i;
72     MaskVec = CGF.Builder.CreateShuffleVector(
73         MaskVec, MaskVec, ArrayRef(Indices, NumElts), "extract");
74   }
75   return MaskVec;
76 }
77 
EmitX86MaskedStore(CodeGenFunction & CGF,ArrayRef<Value * > Ops,Align Alignment)78 static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
79                                  Align Alignment) {
80   Value *Ptr = Ops[0];
81 
82   Value *MaskVec = getMaskVecValue(
83       CGF, Ops[2],
84       cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements());
85 
86   return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec);
87 }
88 
EmitX86MaskedLoad(CodeGenFunction & CGF,ArrayRef<Value * > Ops,Align Alignment)89 static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
90                                 Align Alignment) {
91   llvm::Type *Ty = Ops[1]->getType();
92   Value *Ptr = Ops[0];
93 
94   Value *MaskVec = getMaskVecValue(
95       CGF, Ops[2], cast<llvm::FixedVectorType>(Ty)->getNumElements());
96 
97   return CGF.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, MaskVec, Ops[1]);
98 }
99 
EmitX86ExpandLoad(CodeGenFunction & CGF,ArrayRef<Value * > Ops)100 static Value *EmitX86ExpandLoad(CodeGenFunction &CGF,
101                                 ArrayRef<Value *> Ops) {
102   auto *ResultTy = cast<llvm::VectorType>(Ops[1]->getType());
103   Value *Ptr = Ops[0];
104 
105   Value *MaskVec = getMaskVecValue(
106       CGF, Ops[2], cast<FixedVectorType>(ResultTy)->getNumElements());
107 
108   llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload,
109                                            ResultTy);
110   return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] });
111 }
112 
EmitX86CompressExpand(CodeGenFunction & CGF,ArrayRef<Value * > Ops,bool IsCompress)113 static Value *EmitX86CompressExpand(CodeGenFunction &CGF,
114                                     ArrayRef<Value *> Ops,
115                                     bool IsCompress) {
116   auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
117 
118   Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
119 
120   Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress
121                                  : Intrinsic::x86_avx512_mask_expand;
122   llvm::Function *F = CGF.CGM.getIntrinsic(IID, ResultTy);
123   return CGF.Builder.CreateCall(F, { Ops[0], Ops[1], MaskVec });
124 }
125 
EmitX86CompressStore(CodeGenFunction & CGF,ArrayRef<Value * > Ops)126 static Value *EmitX86CompressStore(CodeGenFunction &CGF,
127                                    ArrayRef<Value *> Ops) {
128   auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
129   Value *Ptr = Ops[0];
130 
131   Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
132 
133   llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore,
134                                            ResultTy);
135   return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec });
136 }
137 
EmitX86MaskLogic(CodeGenFunction & CGF,Instruction::BinaryOps Opc,ArrayRef<Value * > Ops,bool InvertLHS=false)138 static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
139                               ArrayRef<Value *> Ops,
140                               bool InvertLHS = false) {
141   unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
142   Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts);
143   Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts);
144 
145   if (InvertLHS)
146     LHS = CGF.Builder.CreateNot(LHS);
147 
148   return CGF.Builder.CreateBitCast(CGF.Builder.CreateBinOp(Opc, LHS, RHS),
149                                    Ops[0]->getType());
150 }
151 
EmitX86FunnelShift(CodeGenFunction & CGF,Value * Op0,Value * Op1,Value * Amt,bool IsRight)152 static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1,
153                                  Value *Amt, bool IsRight) {
154   llvm::Type *Ty = Op0->getType();
155 
156   // Amount may be scalar immediate, in which case create a splat vector.
157   // Funnel shifts amounts are treated as modulo and types are all power-of-2 so
158   // we only care about the lowest log2 bits anyway.
159   if (Amt->getType() != Ty) {
160     unsigned NumElts = cast<llvm::FixedVectorType>(Ty)->getNumElements();
161     Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false);
162     Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt);
163   }
164 
165   unsigned IID = IsRight ? Intrinsic::fshr : Intrinsic::fshl;
166   Function *F = CGF.CGM.getIntrinsic(IID, Ty);
167   return CGF.Builder.CreateCall(F, {Op0, Op1, Amt});
168 }
169 
EmitX86vpcom(CodeGenFunction & CGF,ArrayRef<Value * > Ops,bool IsSigned)170 static Value *EmitX86vpcom(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
171                            bool IsSigned) {
172   Value *Op0 = Ops[0];
173   Value *Op1 = Ops[1];
174   llvm::Type *Ty = Op0->getType();
175   uint64_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
176 
177   CmpInst::Predicate Pred;
178   switch (Imm) {
179   case 0x0:
180     Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
181     break;
182   case 0x1:
183     Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
184     break;
185   case 0x2:
186     Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
187     break;
188   case 0x3:
189     Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
190     break;
191   case 0x4:
192     Pred = ICmpInst::ICMP_EQ;
193     break;
194   case 0x5:
195     Pred = ICmpInst::ICMP_NE;
196     break;
197   case 0x6:
198     return llvm::Constant::getNullValue(Ty); // FALSE
199   case 0x7:
200     return llvm::Constant::getAllOnesValue(Ty); // TRUE
201   default:
202     llvm_unreachable("Unexpected XOP vpcom/vpcomu predicate");
203   }
204 
205   Value *Cmp = CGF.Builder.CreateICmp(Pred, Op0, Op1);
206   Value *Res = CGF.Builder.CreateSExt(Cmp, Ty);
207   return Res;
208 }
209 
EmitX86Select(CodeGenFunction & CGF,Value * Mask,Value * Op0,Value * Op1)210 static Value *EmitX86Select(CodeGenFunction &CGF,
211                             Value *Mask, Value *Op0, Value *Op1) {
212 
213   // If the mask is all ones just return first argument.
214   if (const auto *C = dyn_cast<Constant>(Mask))
215     if (C->isAllOnesValue())
216       return Op0;
217 
218   Mask = getMaskVecValue(
219       CGF, Mask, cast<llvm::FixedVectorType>(Op0->getType())->getNumElements());
220 
221   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
222 }
223 
EmitX86ScalarSelect(CodeGenFunction & CGF,Value * Mask,Value * Op0,Value * Op1)224 static Value *EmitX86ScalarSelect(CodeGenFunction &CGF,
225                                   Value *Mask, Value *Op0, Value *Op1) {
226   // If the mask is all ones just return first argument.
227   if (const auto *C = dyn_cast<Constant>(Mask))
228     if (C->isAllOnesValue())
229       return Op0;
230 
231   auto *MaskTy = llvm::FixedVectorType::get(
232       CGF.Builder.getInt1Ty(), Mask->getType()->getIntegerBitWidth());
233   Mask = CGF.Builder.CreateBitCast(Mask, MaskTy);
234   Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0);
235   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
236 }
237 
EmitX86MaskedCompareResult(CodeGenFunction & CGF,Value * Cmp,unsigned NumElts,Value * MaskIn)238 static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp,
239                                          unsigned NumElts, Value *MaskIn) {
240   if (MaskIn) {
241     const auto *C = dyn_cast<Constant>(MaskIn);
242     if (!C || !C->isAllOnesValue())
243       Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts));
244   }
245 
246   if (NumElts < 8) {
247     int Indices[8];
248     for (unsigned i = 0; i != NumElts; ++i)
249       Indices[i] = i;
250     for (unsigned i = NumElts; i != 8; ++i)
251       Indices[i] = i % NumElts + NumElts;
252     Cmp = CGF.Builder.CreateShuffleVector(
253         Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
254   }
255 
256   return CGF.Builder.CreateBitCast(Cmp,
257                                    IntegerType::get(CGF.getLLVMContext(),
258                                                     std::max(NumElts, 8U)));
259 }
260 
EmitX86MaskedCompare(CodeGenFunction & CGF,unsigned CC,bool Signed,ArrayRef<Value * > Ops)261 static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
262                                    bool Signed, ArrayRef<Value *> Ops) {
263   assert((Ops.size() == 2 || Ops.size() == 4) &&
264          "Unexpected number of arguments");
265   unsigned NumElts =
266       cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
267   Value *Cmp;
268 
269   if (CC == 3) {
270     Cmp = Constant::getNullValue(
271         llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
272   } else if (CC == 7) {
273     Cmp = Constant::getAllOnesValue(
274         llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
275   } else {
276     ICmpInst::Predicate Pred;
277     switch (CC) {
278     default: llvm_unreachable("Unknown condition code");
279     case 0: Pred = ICmpInst::ICMP_EQ;  break;
280     case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
281     case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
282     case 4: Pred = ICmpInst::ICMP_NE;  break;
283     case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
284     case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
285     }
286     Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
287   }
288 
289   Value *MaskIn = nullptr;
290   if (Ops.size() == 4)
291     MaskIn = Ops[3];
292 
293   return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn);
294 }
295 
EmitX86ConvertToMask(CodeGenFunction & CGF,Value * In)296 static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
297   Value *Zero = Constant::getNullValue(In->getType());
298   return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
299 }
300 
EmitX86ConvertIntToFp(CodeGenFunction & CGF,const CallExpr * E,ArrayRef<Value * > Ops,bool IsSigned)301 static Value *EmitX86ConvertIntToFp(CodeGenFunction &CGF, const CallExpr *E,
302                                     ArrayRef<Value *> Ops, bool IsSigned) {
303   unsigned Rnd = cast<llvm::ConstantInt>(Ops[3])->getZExtValue();
304   llvm::Type *Ty = Ops[1]->getType();
305 
306   Value *Res;
307   if (Rnd != 4) {
308     Intrinsic::ID IID = IsSigned ? Intrinsic::x86_avx512_sitofp_round
309                                  : Intrinsic::x86_avx512_uitofp_round;
310     Function *F = CGF.CGM.getIntrinsic(IID, { Ty, Ops[0]->getType() });
311     Res = CGF.Builder.CreateCall(F, { Ops[0], Ops[3] });
312   } else {
313     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
314     Res = IsSigned ? CGF.Builder.CreateSIToFP(Ops[0], Ty)
315                    : CGF.Builder.CreateUIToFP(Ops[0], Ty);
316   }
317 
318   return EmitX86Select(CGF, Ops[2], Res, Ops[1]);
319 }
320 
321 // Lowers X86 FMA intrinsics to IR.
EmitX86FMAExpr(CodeGenFunction & CGF,const CallExpr * E,ArrayRef<Value * > Ops,unsigned BuiltinID,bool IsAddSub)322 static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
323                              ArrayRef<Value *> Ops, unsigned BuiltinID,
324                              bool IsAddSub) {
325 
326   bool Subtract = false;
327   Intrinsic::ID IID = Intrinsic::not_intrinsic;
328   switch (BuiltinID) {
329   default: break;
330   case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
331     Subtract = true;
332     [[fallthrough]];
333   case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
334   case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
335   case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
336     IID = Intrinsic::x86_avx512fp16_vfmadd_ph_512;
337     break;
338   case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
339     Subtract = true;
340     [[fallthrough]];
341   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
342   case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
343   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
344     IID = Intrinsic::x86_avx512fp16_vfmaddsub_ph_512;
345     break;
346   case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
347     Subtract = true;
348     [[fallthrough]];
349   case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
350   case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
351   case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
352     IID = Intrinsic::x86_avx512_vfmadd_ps_512; break;
353   case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
354     Subtract = true;
355     [[fallthrough]];
356   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
357   case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
358   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
359     IID = Intrinsic::x86_avx512_vfmadd_pd_512; break;
360   case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
361     Subtract = true;
362     [[fallthrough]];
363   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
364   case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
365   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
366     IID = Intrinsic::x86_avx512_vfmaddsub_ps_512;
367     break;
368   case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
369     Subtract = true;
370     [[fallthrough]];
371   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
372   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
373   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
374     IID = Intrinsic::x86_avx512_vfmaddsub_pd_512;
375     break;
376   }
377 
378   Value *A = Ops[0];
379   Value *B = Ops[1];
380   Value *C = Ops[2];
381 
382   if (Subtract)
383     C = CGF.Builder.CreateFNeg(C);
384 
385   Value *Res;
386 
387   // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding).
388   if (IID != Intrinsic::not_intrinsic &&
389       (cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4 ||
390        IsAddSub)) {
391     Function *Intr = CGF.CGM.getIntrinsic(IID);
392     Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() });
393   } else {
394     llvm::Type *Ty = A->getType();
395     Function *FMA;
396     if (CGF.Builder.getIsFPConstrained()) {
397       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
398       FMA = CGF.CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, Ty);
399       Res = CGF.Builder.CreateConstrainedFPCall(FMA, {A, B, C});
400     } else {
401       FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
402       Res = CGF.Builder.CreateCall(FMA, {A, B, C});
403     }
404   }
405 
406   // Handle any required masking.
407   Value *MaskFalseVal = nullptr;
408   switch (BuiltinID) {
409   case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
410   case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
411   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
412   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
413   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
414   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
415     MaskFalseVal = Ops[0];
416     break;
417   case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
418   case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
419   case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
420   case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
421   case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
422   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
423     MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
424     break;
425   case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
426   case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
427   case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
428   case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
429   case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
430   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
431   case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
432   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
433   case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
434   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
435   case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
436   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
437     MaskFalseVal = Ops[2];
438     break;
439   }
440 
441   if (MaskFalseVal)
442     return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal);
443 
444   return Res;
445 }
446 
EmitScalarFMAExpr(CodeGenFunction & CGF,const CallExpr * E,MutableArrayRef<Value * > Ops,Value * Upper,bool ZeroMask=false,unsigned PTIdx=0,bool NegAcc=false)447 static Value *EmitScalarFMAExpr(CodeGenFunction &CGF, const CallExpr *E,
448                                 MutableArrayRef<Value *> Ops, Value *Upper,
449                                 bool ZeroMask = false, unsigned PTIdx = 0,
450                                 bool NegAcc = false) {
451   unsigned Rnd = 4;
452   if (Ops.size() > 4)
453     Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
454 
455   if (NegAcc)
456     Ops[2] = CGF.Builder.CreateFNeg(Ops[2]);
457 
458   Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0);
459   Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0);
460   Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0);
461   Value *Res;
462   if (Rnd != 4) {
463     Intrinsic::ID IID;
464 
465     switch (Ops[0]->getType()->getPrimitiveSizeInBits()) {
466     case 16:
467       IID = Intrinsic::x86_avx512fp16_vfmadd_f16;
468       break;
469     case 32:
470       IID = Intrinsic::x86_avx512_vfmadd_f32;
471       break;
472     case 64:
473       IID = Intrinsic::x86_avx512_vfmadd_f64;
474       break;
475     default:
476       llvm_unreachable("Unexpected size");
477     }
478     Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
479                                  {Ops[0], Ops[1], Ops[2], Ops[4]});
480   } else if (CGF.Builder.getIsFPConstrained()) {
481     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
482     Function *FMA = CGF.CGM.getIntrinsic(
483         Intrinsic::experimental_constrained_fma, Ops[0]->getType());
484     Res = CGF.Builder.CreateConstrainedFPCall(FMA, Ops.slice(0, 3));
485   } else {
486     Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType());
487     Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3));
488   }
489   // If we have more than 3 arguments, we need to do masking.
490   if (Ops.size() > 3) {
491     Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType())
492                                : Ops[PTIdx];
493 
494     // If we negated the accumulator and the its the PassThru value we need to
495     // bypass the negate. Conveniently Upper should be the same thing in this
496     // case.
497     if (NegAcc && PTIdx == 2)
498       PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0);
499 
500     Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru);
501   }
502   return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0);
503 }
504 
EmitX86Muldq(CodeGenFunction & CGF,bool IsSigned,ArrayRef<Value * > Ops)505 static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
506                            ArrayRef<Value *> Ops) {
507   llvm::Type *Ty = Ops[0]->getType();
508   // Arguments have a vXi32 type so cast to vXi64.
509   Ty = llvm::FixedVectorType::get(CGF.Int64Ty,
510                                   Ty->getPrimitiveSizeInBits() / 64);
511   Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty);
512   Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty);
513 
514   if (IsSigned) {
515     // Shift left then arithmetic shift right.
516     Constant *ShiftAmt = ConstantInt::get(Ty, 32);
517     LHS = CGF.Builder.CreateShl(LHS, ShiftAmt);
518     LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt);
519     RHS = CGF.Builder.CreateShl(RHS, ShiftAmt);
520     RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt);
521   } else {
522     // Clear the upper bits.
523     Constant *Mask = ConstantInt::get(Ty, 0xffffffff);
524     LHS = CGF.Builder.CreateAnd(LHS, Mask);
525     RHS = CGF.Builder.CreateAnd(RHS, Mask);
526   }
527 
528   return CGF.Builder.CreateMul(LHS, RHS);
529 }
530 
531 // Emit a masked pternlog intrinsic. This only exists because the header has to
532 // use a macro and we aren't able to pass the input argument to a pternlog
533 // builtin and a select builtin without evaluating it twice.
EmitX86Ternlog(CodeGenFunction & CGF,bool ZeroMask,ArrayRef<Value * > Ops)534 static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask,
535                              ArrayRef<Value *> Ops) {
536   llvm::Type *Ty = Ops[0]->getType();
537 
538   unsigned VecWidth = Ty->getPrimitiveSizeInBits();
539   unsigned EltWidth = Ty->getScalarSizeInBits();
540   Intrinsic::ID IID;
541   if (VecWidth == 128 && EltWidth == 32)
542     IID = Intrinsic::x86_avx512_pternlog_d_128;
543   else if (VecWidth == 256 && EltWidth == 32)
544     IID = Intrinsic::x86_avx512_pternlog_d_256;
545   else if (VecWidth == 512 && EltWidth == 32)
546     IID = Intrinsic::x86_avx512_pternlog_d_512;
547   else if (VecWidth == 128 && EltWidth == 64)
548     IID = Intrinsic::x86_avx512_pternlog_q_128;
549   else if (VecWidth == 256 && EltWidth == 64)
550     IID = Intrinsic::x86_avx512_pternlog_q_256;
551   else if (VecWidth == 512 && EltWidth == 64)
552     IID = Intrinsic::x86_avx512_pternlog_q_512;
553   else
554     llvm_unreachable("Unexpected intrinsic");
555 
556   Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
557                                           Ops.drop_back());
558   Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0];
559   return EmitX86Select(CGF, Ops[4], Ternlog, PassThru);
560 }
561 
EmitX86SExtMask(CodeGenFunction & CGF,Value * Op,llvm::Type * DstTy)562 static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
563                               llvm::Type *DstTy) {
564   unsigned NumberOfElements =
565       cast<llvm::FixedVectorType>(DstTy)->getNumElements();
566   Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements);
567   return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
568 }
569 
EmitX86CpuIs(const CallExpr * E)570 Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
571   const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
572   StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
573   return EmitX86CpuIs(CPUStr);
574 }
575 
576 // Convert F16 halfs to floats.
EmitX86CvtF16ToFloatExpr(CodeGenFunction & CGF,ArrayRef<Value * > Ops,llvm::Type * DstTy)577 static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
578                                        ArrayRef<Value *> Ops,
579                                        llvm::Type *DstTy) {
580   assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) &&
581          "Unknown cvtph2ps intrinsic");
582 
583   // If the SAE intrinsic doesn't use default rounding then we can't upgrade.
584   if (Ops.size() == 4 && cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != 4) {
585     Function *F =
586         CGF.CGM.getIntrinsic(Intrinsic::x86_avx512_mask_vcvtph2ps_512);
587     return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]});
588   }
589 
590   unsigned NumDstElts = cast<llvm::FixedVectorType>(DstTy)->getNumElements();
591   Value *Src = Ops[0];
592 
593   // Extract the subvector.
594   if (NumDstElts !=
595       cast<llvm::FixedVectorType>(Src->getType())->getNumElements()) {
596     assert(NumDstElts == 4 && "Unexpected vector size");
597     Src = CGF.Builder.CreateShuffleVector(Src, {0, 1, 2, 3});
598   }
599 
600   // Bitcast from vXi16 to vXf16.
601   auto *HalfTy = llvm::FixedVectorType::get(
602       llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts);
603   Src = CGF.Builder.CreateBitCast(Src, HalfTy);
604 
605   // Perform the fp-extension.
606   Value *Res = CGF.Builder.CreateFPExt(Src, DstTy, "cvtph2ps");
607 
608   if (Ops.size() >= 3)
609     Res = EmitX86Select(CGF, Ops[2], Res, Ops[1]);
610   return Res;
611 }
612 
EmitX86CpuIs(StringRef CPUStr)613 Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {
614 
615   llvm::Type *Int32Ty = Builder.getInt32Ty();
616 
617   // Matching the struct layout from the compiler-rt/libgcc structure that is
618   // filled in:
619   // unsigned int __cpu_vendor;
620   // unsigned int __cpu_type;
621   // unsigned int __cpu_subtype;
622   // unsigned int __cpu_features[1];
623   llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
624                                           llvm::ArrayType::get(Int32Ty, 1));
625 
626   // Grab the global __cpu_model.
627   llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
628   cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
629 
630   // Calculate the index needed to access the correct field based on the
631   // range. Also adjust the expected value.
632   auto [Index, Value] = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr)
633 #define X86_VENDOR(ENUM, STRING)                                               \
634   .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)})
635 #define X86_CPU_TYPE_ALIAS(ENUM, ALIAS)                                        \
636   .Case(ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
637 #define X86_CPU_TYPE(ENUM, STR)                                                \
638   .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
639 #define X86_CPU_SUBTYPE_ALIAS(ENUM, ALIAS)                                     \
640   .Case(ALIAS, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
641 #define X86_CPU_SUBTYPE(ENUM, STR)                                             \
642   .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
643 #include "llvm/TargetParser/X86TargetParser.def"
644                                .Default({0, 0});
645   assert(Value != 0 && "Invalid CPUStr passed to CpuIs");
646 
647   // Grab the appropriate field from __cpu_model.
648   llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
649                          ConstantInt::get(Int32Ty, Index)};
650   llvm::Value *CpuValue = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs);
651   CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue,
652                                        CharUnits::fromQuantity(4));
653 
654   // Check the value of the field against the requested value.
655   return Builder.CreateICmpEQ(CpuValue,
656                                   llvm::ConstantInt::get(Int32Ty, Value));
657 }
658 
EmitX86CpuSupports(const CallExpr * E)659 Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
660   const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts();
661   StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString();
662   if (!getContext().getTargetInfo().validateCpuSupports(FeatureStr))
663     return Builder.getFalse();
664   return EmitX86CpuSupports(FeatureStr);
665 }
666 
EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs)667 Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
668   return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs));
669 }
670 
671 llvm::Value *
EmitX86CpuSupports(std::array<uint32_t,4> FeatureMask)672 CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
673   Value *Result = Builder.getTrue();
674   if (FeatureMask[0] != 0) {
675     // Matching the struct layout from the compiler-rt/libgcc structure that is
676     // filled in:
677     // unsigned int __cpu_vendor;
678     // unsigned int __cpu_type;
679     // unsigned int __cpu_subtype;
680     // unsigned int __cpu_features[1];
681     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
682                                             llvm::ArrayType::get(Int32Ty, 1));
683 
684     // Grab the global __cpu_model.
685     llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
686     cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
687 
688     // Grab the first (0th) element from the field __cpu_features off of the
689     // global in the struct STy.
690     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3),
691                      Builder.getInt32(0)};
692     Value *CpuFeatures = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs);
693     Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures,
694                                                 CharUnits::fromQuantity(4));
695 
696     // Check the value of the bit corresponding to the feature requested.
697     Value *Mask = Builder.getInt32(FeatureMask[0]);
698     Value *Bitset = Builder.CreateAnd(Features, Mask);
699     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
700     Result = Builder.CreateAnd(Result, Cmp);
701   }
702 
703   llvm::Type *ATy = llvm::ArrayType::get(Int32Ty, 3);
704   llvm::Constant *CpuFeatures2 =
705       CGM.CreateRuntimeVariable(ATy, "__cpu_features2");
706   cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
707   for (int i = 1; i != 4; ++i) {
708     const uint32_t M = FeatureMask[i];
709     if (!M)
710       continue;
711     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)};
712     Value *Features = Builder.CreateAlignedLoad(
713         Int32Ty, Builder.CreateInBoundsGEP(ATy, CpuFeatures2, Idxs),
714         CharUnits::fromQuantity(4));
715     // Check the value of the bit corresponding to the feature requested.
716     Value *Mask = Builder.getInt32(M);
717     Value *Bitset = Builder.CreateAnd(Features, Mask);
718     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
719     Result = Builder.CreateAnd(Result, Cmp);
720   }
721 
722   return Result;
723 }
724 
EmitX86CpuInit()725 Value *CodeGenFunction::EmitX86CpuInit() {
726   llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy,
727                                                     /*Variadic*/ false);
728   llvm::FunctionCallee Func =
729       CGM.CreateRuntimeFunction(FTy, "__cpu_indicator_init");
730   cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
731   cast<llvm::GlobalValue>(Func.getCallee())
732       ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
733   return Builder.CreateCall(Func);
734 }
735 
736 
EmitX86BuiltinExpr(unsigned BuiltinID,const CallExpr * E)737 Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
738                                            const CallExpr *E) {
739   if (BuiltinID == Builtin::BI__builtin_cpu_is)
740     return EmitX86CpuIs(E);
741   if (BuiltinID == Builtin::BI__builtin_cpu_supports)
742     return EmitX86CpuSupports(E);
743   if (BuiltinID == Builtin::BI__builtin_cpu_init)
744     return EmitX86CpuInit();
745 
746   // Handle MSVC intrinsics before argument evaluation to prevent double
747   // evaluation.
748   if (std::optional<MSVCIntrin> MsvcIntId = translateX86ToMsvcIntrin(BuiltinID))
749     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
750 
751   SmallVector<Value*, 4> Ops;
752   bool IsMaskFCmp = false;
753   bool IsConjFMA = false;
754 
755   // Find out if any arguments are required to be integer constant expressions.
756   unsigned ICEArguments = 0;
757   ASTContext::GetBuiltinTypeError Error;
758   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
759   assert(Error == ASTContext::GE_None && "Should not codegen an error");
760 
761   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
762     Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
763   }
764 
765   // These exist so that the builtin that takes an immediate can be bounds
766   // checked by clang to avoid passing bad immediates to the backend. Since
767   // AVX has a larger immediate than SSE we would need separate builtins to
768   // do the different bounds checking. Rather than create a clang specific
769   // SSE only builtin, this implements eight separate builtins to match gcc
770   // implementation.
771   auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
772     Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
773     llvm::Function *F = CGM.getIntrinsic(ID);
774     return Builder.CreateCall(F, Ops);
775   };
776 
777   // For the vector forms of FP comparisons, translate the builtins directly to
778   // IR.
779   // TODO: The builtins could be removed if the SSE header files used vector
780   // extension comparisons directly (vector ordered/unordered may need
781   // additional support via __builtin_isnan()).
782   auto getVectorFCmpIR = [this, &Ops, E](CmpInst::Predicate Pred,
783                                          bool IsSignaling) {
784     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
785     Value *Cmp;
786     if (IsSignaling)
787       Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
788     else
789       Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
790     llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType());
791     llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy);
792     Value *Sext = Builder.CreateSExt(Cmp, IntVecTy);
793     return Builder.CreateBitCast(Sext, FPVecTy);
794   };
795 
796   switch (BuiltinID) {
797   default: return nullptr;
798   case X86::BI_mm_prefetch: {
799     Value *Address = Ops[0];
800     ConstantInt *C = cast<ConstantInt>(Ops[1]);
801     Value *RW = ConstantInt::get(Int32Ty, (C->getZExtValue() >> 2) & 0x1);
802     Value *Locality = ConstantInt::get(Int32Ty, C->getZExtValue() & 0x3);
803     Value *Data = ConstantInt::get(Int32Ty, 1);
804     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
805     return Builder.CreateCall(F, {Address, RW, Locality, Data});
806   }
807   case X86::BI_m_prefetch:
808   case X86::BI_m_prefetchw: {
809     Value *Address = Ops[0];
810     // The 'w' suffix implies write.
811     Value *RW =
812         ConstantInt::get(Int32Ty, BuiltinID == X86::BI_m_prefetchw ? 1 : 0);
813     Value *Locality = ConstantInt::get(Int32Ty, 0x3);
814     Value *Data = ConstantInt::get(Int32Ty, 1);
815     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
816     return Builder.CreateCall(F, {Address, RW, Locality, Data});
817   }
818   case X86::BI_mm_clflush: {
819     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush),
820                               Ops[0]);
821   }
822   case X86::BI_mm_lfence: {
823     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence));
824   }
825   case X86::BI_mm_mfence: {
826     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence));
827   }
828   case X86::BI_mm_sfence: {
829     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence));
830   }
831   case X86::BI_mm_pause: {
832     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause));
833   }
834   case X86::BI__rdtsc: {
835     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc));
836   }
837   case X86::BI__builtin_ia32_rdtscp: {
838     Value *Call = Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtscp));
839     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
840                                       Ops[0]);
841     return Builder.CreateExtractValue(Call, 0);
842   }
843   case X86::BI__builtin_ia32_lzcnt_u16:
844   case X86::BI__builtin_ia32_lzcnt_u32:
845   case X86::BI__builtin_ia32_lzcnt_u64: {
846     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
847     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
848   }
849   case X86::BI__builtin_ia32_tzcnt_u16:
850   case X86::BI__builtin_ia32_tzcnt_u32:
851   case X86::BI__builtin_ia32_tzcnt_u64: {
852     Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
853     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
854   }
855   case X86::BI__builtin_ia32_undef128:
856   case X86::BI__builtin_ia32_undef256:
857   case X86::BI__builtin_ia32_undef512:
858     // The x86 definition of "undef" is not the same as the LLVM definition
859     // (PR32176). We leave optimizing away an unnecessary zero constant to the
860     // IR optimizer and backend.
861     // TODO: If we had a "freeze" IR instruction to generate a fixed undef
862     // value, we should use that here instead of a zero.
863     return llvm::Constant::getNullValue(ConvertType(E->getType()));
864   case X86::BI__builtin_ia32_vec_ext_v4hi:
865   case X86::BI__builtin_ia32_vec_ext_v16qi:
866   case X86::BI__builtin_ia32_vec_ext_v8hi:
867   case X86::BI__builtin_ia32_vec_ext_v4si:
868   case X86::BI__builtin_ia32_vec_ext_v4sf:
869   case X86::BI__builtin_ia32_vec_ext_v2di:
870   case X86::BI__builtin_ia32_vec_ext_v32qi:
871   case X86::BI__builtin_ia32_vec_ext_v16hi:
872   case X86::BI__builtin_ia32_vec_ext_v8si:
873   case X86::BI__builtin_ia32_vec_ext_v4di: {
874     unsigned NumElts =
875         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
876     uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue();
877     Index &= NumElts - 1;
878     // These builtins exist so we can ensure the index is an ICE and in range.
879     // Otherwise we could just do this in the header file.
880     return Builder.CreateExtractElement(Ops[0], Index);
881   }
882   case X86::BI__builtin_ia32_vec_set_v4hi:
883   case X86::BI__builtin_ia32_vec_set_v16qi:
884   case X86::BI__builtin_ia32_vec_set_v8hi:
885   case X86::BI__builtin_ia32_vec_set_v4si:
886   case X86::BI__builtin_ia32_vec_set_v2di:
887   case X86::BI__builtin_ia32_vec_set_v32qi:
888   case X86::BI__builtin_ia32_vec_set_v16hi:
889   case X86::BI__builtin_ia32_vec_set_v8si:
890   case X86::BI__builtin_ia32_vec_set_v4di: {
891     unsigned NumElts =
892         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
893     unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
894     Index &= NumElts - 1;
895     // These builtins exist so we can ensure the index is an ICE and in range.
896     // Otherwise we could just do this in the header file.
897     return Builder.CreateInsertElement(Ops[0], Ops[1], Index);
898   }
899   case X86::BI_mm_setcsr:
900   case X86::BI__builtin_ia32_ldmxcsr: {
901     RawAddress Tmp = CreateMemTemp(E->getArg(0)->getType());
902     Builder.CreateStore(Ops[0], Tmp);
903     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
904                               Tmp.getPointer());
905   }
906   case X86::BI_mm_getcsr:
907   case X86::BI__builtin_ia32_stmxcsr: {
908     RawAddress Tmp = CreateMemTemp(E->getType());
909     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
910                        Tmp.getPointer());
911     return Builder.CreateLoad(Tmp, "stmxcsr");
912   }
913   case X86::BI__builtin_ia32_xsave:
914   case X86::BI__builtin_ia32_xsave64:
915   case X86::BI__builtin_ia32_xrstor:
916   case X86::BI__builtin_ia32_xrstor64:
917   case X86::BI__builtin_ia32_xsaveopt:
918   case X86::BI__builtin_ia32_xsaveopt64:
919   case X86::BI__builtin_ia32_xrstors:
920   case X86::BI__builtin_ia32_xrstors64:
921   case X86::BI__builtin_ia32_xsavec:
922   case X86::BI__builtin_ia32_xsavec64:
923   case X86::BI__builtin_ia32_xsaves:
924   case X86::BI__builtin_ia32_xsaves64:
925   case X86::BI__builtin_ia32_xsetbv:
926   case X86::BI_xsetbv: {
927     Intrinsic::ID ID;
928 #define INTRINSIC_X86_XSAVE_ID(NAME) \
929     case X86::BI__builtin_ia32_##NAME: \
930       ID = Intrinsic::x86_##NAME; \
931       break
932     switch (BuiltinID) {
933     default: llvm_unreachable("Unsupported intrinsic!");
934     INTRINSIC_X86_XSAVE_ID(xsave);
935     INTRINSIC_X86_XSAVE_ID(xsave64);
936     INTRINSIC_X86_XSAVE_ID(xrstor);
937     INTRINSIC_X86_XSAVE_ID(xrstor64);
938     INTRINSIC_X86_XSAVE_ID(xsaveopt);
939     INTRINSIC_X86_XSAVE_ID(xsaveopt64);
940     INTRINSIC_X86_XSAVE_ID(xrstors);
941     INTRINSIC_X86_XSAVE_ID(xrstors64);
942     INTRINSIC_X86_XSAVE_ID(xsavec);
943     INTRINSIC_X86_XSAVE_ID(xsavec64);
944     INTRINSIC_X86_XSAVE_ID(xsaves);
945     INTRINSIC_X86_XSAVE_ID(xsaves64);
946     INTRINSIC_X86_XSAVE_ID(xsetbv);
947     case X86::BI_xsetbv:
948       ID = Intrinsic::x86_xsetbv;
949       break;
950     }
951 #undef INTRINSIC_X86_XSAVE_ID
952     Value *Mhi = Builder.CreateTrunc(
953       Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, 32)), Int32Ty);
954     Value *Mlo = Builder.CreateTrunc(Ops[1], Int32Ty);
955     Ops[1] = Mhi;
956     Ops.push_back(Mlo);
957     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
958   }
959   case X86::BI__builtin_ia32_xgetbv:
960   case X86::BI_xgetbv:
961     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_xgetbv), Ops);
962   case X86::BI__builtin_ia32_storedqudi128_mask:
963   case X86::BI__builtin_ia32_storedqusi128_mask:
964   case X86::BI__builtin_ia32_storedquhi128_mask:
965   case X86::BI__builtin_ia32_storedquqi128_mask:
966   case X86::BI__builtin_ia32_storeupd128_mask:
967   case X86::BI__builtin_ia32_storeups128_mask:
968   case X86::BI__builtin_ia32_storedqudi256_mask:
969   case X86::BI__builtin_ia32_storedqusi256_mask:
970   case X86::BI__builtin_ia32_storedquhi256_mask:
971   case X86::BI__builtin_ia32_storedquqi256_mask:
972   case X86::BI__builtin_ia32_storeupd256_mask:
973   case X86::BI__builtin_ia32_storeups256_mask:
974   case X86::BI__builtin_ia32_storedqudi512_mask:
975   case X86::BI__builtin_ia32_storedqusi512_mask:
976   case X86::BI__builtin_ia32_storedquhi512_mask:
977   case X86::BI__builtin_ia32_storedquqi512_mask:
978   case X86::BI__builtin_ia32_storeupd512_mask:
979   case X86::BI__builtin_ia32_storeups512_mask:
980     return EmitX86MaskedStore(*this, Ops, Align(1));
981 
982   case X86::BI__builtin_ia32_storesbf16128_mask:
983   case X86::BI__builtin_ia32_storesh128_mask:
984   case X86::BI__builtin_ia32_storess128_mask:
985   case X86::BI__builtin_ia32_storesd128_mask:
986     return EmitX86MaskedStore(*this, Ops, Align(1));
987 
988   case X86::BI__builtin_ia32_cvtmask2b128:
989   case X86::BI__builtin_ia32_cvtmask2b256:
990   case X86::BI__builtin_ia32_cvtmask2b512:
991   case X86::BI__builtin_ia32_cvtmask2w128:
992   case X86::BI__builtin_ia32_cvtmask2w256:
993   case X86::BI__builtin_ia32_cvtmask2w512:
994   case X86::BI__builtin_ia32_cvtmask2d128:
995   case X86::BI__builtin_ia32_cvtmask2d256:
996   case X86::BI__builtin_ia32_cvtmask2d512:
997   case X86::BI__builtin_ia32_cvtmask2q128:
998   case X86::BI__builtin_ia32_cvtmask2q256:
999   case X86::BI__builtin_ia32_cvtmask2q512:
1000     return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));
1001 
1002   case X86::BI__builtin_ia32_cvtb2mask128:
1003   case X86::BI__builtin_ia32_cvtb2mask256:
1004   case X86::BI__builtin_ia32_cvtb2mask512:
1005   case X86::BI__builtin_ia32_cvtw2mask128:
1006   case X86::BI__builtin_ia32_cvtw2mask256:
1007   case X86::BI__builtin_ia32_cvtw2mask512:
1008   case X86::BI__builtin_ia32_cvtd2mask128:
1009   case X86::BI__builtin_ia32_cvtd2mask256:
1010   case X86::BI__builtin_ia32_cvtd2mask512:
1011   case X86::BI__builtin_ia32_cvtq2mask128:
1012   case X86::BI__builtin_ia32_cvtq2mask256:
1013   case X86::BI__builtin_ia32_cvtq2mask512:
1014     return EmitX86ConvertToMask(*this, Ops[0]);
1015 
1016   case X86::BI__builtin_ia32_cvtdq2ps512_mask:
1017   case X86::BI__builtin_ia32_cvtqq2ps512_mask:
1018   case X86::BI__builtin_ia32_cvtqq2pd512_mask:
1019   case X86::BI__builtin_ia32_vcvtw2ph512_mask:
1020   case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
1021   case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
1022     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true);
1023   case X86::BI__builtin_ia32_cvtudq2ps512_mask:
1024   case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
1025   case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
1026   case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
1027   case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
1028   case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
1029     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
1030 
1031   case X86::BI__builtin_ia32_vfmaddss3:
1032   case X86::BI__builtin_ia32_vfmaddsd3:
1033   case X86::BI__builtin_ia32_vfmaddsh3_mask:
1034   case X86::BI__builtin_ia32_vfmaddss3_mask:
1035   case X86::BI__builtin_ia32_vfmaddsd3_mask:
1036     return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
1037   case X86::BI__builtin_ia32_vfmaddss:
1038   case X86::BI__builtin_ia32_vfmaddsd:
1039     return EmitScalarFMAExpr(*this, E, Ops,
1040                              Constant::getNullValue(Ops[0]->getType()));
1041   case X86::BI__builtin_ia32_vfmaddsh3_maskz:
1042   case X86::BI__builtin_ia32_vfmaddss3_maskz:
1043   case X86::BI__builtin_ia32_vfmaddsd3_maskz:
1044     return EmitScalarFMAExpr(*this, E, Ops, Ops[0], /*ZeroMask*/ true);
1045   case X86::BI__builtin_ia32_vfmaddsh3_mask3:
1046   case X86::BI__builtin_ia32_vfmaddss3_mask3:
1047   case X86::BI__builtin_ia32_vfmaddsd3_mask3:
1048     return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2);
1049   case X86::BI__builtin_ia32_vfmsubsh3_mask3:
1050   case X86::BI__builtin_ia32_vfmsubss3_mask3:
1051   case X86::BI__builtin_ia32_vfmsubsd3_mask3:
1052     return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2,
1053                              /*NegAcc*/ true);
1054   case X86::BI__builtin_ia32_vfmaddph:
1055   case X86::BI__builtin_ia32_vfmaddps:
1056   case X86::BI__builtin_ia32_vfmaddpd:
1057   case X86::BI__builtin_ia32_vfmaddph256:
1058   case X86::BI__builtin_ia32_vfmaddps256:
1059   case X86::BI__builtin_ia32_vfmaddpd256:
1060   case X86::BI__builtin_ia32_vfmaddph512_mask:
1061   case X86::BI__builtin_ia32_vfmaddph512_maskz:
1062   case X86::BI__builtin_ia32_vfmaddph512_mask3:
1063   case X86::BI__builtin_ia32_vfmaddbf16128:
1064   case X86::BI__builtin_ia32_vfmaddbf16256:
1065   case X86::BI__builtin_ia32_vfmaddbf16512:
1066   case X86::BI__builtin_ia32_vfmaddps512_mask:
1067   case X86::BI__builtin_ia32_vfmaddps512_maskz:
1068   case X86::BI__builtin_ia32_vfmaddps512_mask3:
1069   case X86::BI__builtin_ia32_vfmsubps512_mask3:
1070   case X86::BI__builtin_ia32_vfmaddpd512_mask:
1071   case X86::BI__builtin_ia32_vfmaddpd512_maskz:
1072   case X86::BI__builtin_ia32_vfmaddpd512_mask3:
1073   case X86::BI__builtin_ia32_vfmsubpd512_mask3:
1074   case X86::BI__builtin_ia32_vfmsubph512_mask3:
1075     return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false);
1076   case X86::BI__builtin_ia32_vfmaddsubph512_mask:
1077   case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
1078   case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
1079   case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
1080   case X86::BI__builtin_ia32_vfmaddsubps512_mask:
1081   case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
1082   case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
1083   case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
1084   case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
1085   case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
1086   case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
1087   case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
1088     return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ true);
1089 
1090   case X86::BI__builtin_ia32_movdqa32store128_mask:
1091   case X86::BI__builtin_ia32_movdqa64store128_mask:
1092   case X86::BI__builtin_ia32_storeaps128_mask:
1093   case X86::BI__builtin_ia32_storeapd128_mask:
1094   case X86::BI__builtin_ia32_movdqa32store256_mask:
1095   case X86::BI__builtin_ia32_movdqa64store256_mask:
1096   case X86::BI__builtin_ia32_storeaps256_mask:
1097   case X86::BI__builtin_ia32_storeapd256_mask:
1098   case X86::BI__builtin_ia32_movdqa32store512_mask:
1099   case X86::BI__builtin_ia32_movdqa64store512_mask:
1100   case X86::BI__builtin_ia32_storeaps512_mask:
1101   case X86::BI__builtin_ia32_storeapd512_mask:
1102     return EmitX86MaskedStore(
1103         *this, Ops,
1104         getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
1105 
1106   case X86::BI__builtin_ia32_loadups128_mask:
1107   case X86::BI__builtin_ia32_loadups256_mask:
1108   case X86::BI__builtin_ia32_loadups512_mask:
1109   case X86::BI__builtin_ia32_loadupd128_mask:
1110   case X86::BI__builtin_ia32_loadupd256_mask:
1111   case X86::BI__builtin_ia32_loadupd512_mask:
1112   case X86::BI__builtin_ia32_loaddquqi128_mask:
1113   case X86::BI__builtin_ia32_loaddquqi256_mask:
1114   case X86::BI__builtin_ia32_loaddquqi512_mask:
1115   case X86::BI__builtin_ia32_loaddquhi128_mask:
1116   case X86::BI__builtin_ia32_loaddquhi256_mask:
1117   case X86::BI__builtin_ia32_loaddquhi512_mask:
1118   case X86::BI__builtin_ia32_loaddqusi128_mask:
1119   case X86::BI__builtin_ia32_loaddqusi256_mask:
1120   case X86::BI__builtin_ia32_loaddqusi512_mask:
1121   case X86::BI__builtin_ia32_loaddqudi128_mask:
1122   case X86::BI__builtin_ia32_loaddqudi256_mask:
1123   case X86::BI__builtin_ia32_loaddqudi512_mask:
1124     return EmitX86MaskedLoad(*this, Ops, Align(1));
1125 
1126   case X86::BI__builtin_ia32_loadsbf16128_mask:
1127   case X86::BI__builtin_ia32_loadsh128_mask:
1128   case X86::BI__builtin_ia32_loadss128_mask:
1129   case X86::BI__builtin_ia32_loadsd128_mask:
1130     return EmitX86MaskedLoad(*this, Ops, Align(1));
1131 
1132   case X86::BI__builtin_ia32_loadaps128_mask:
1133   case X86::BI__builtin_ia32_loadaps256_mask:
1134   case X86::BI__builtin_ia32_loadaps512_mask:
1135   case X86::BI__builtin_ia32_loadapd128_mask:
1136   case X86::BI__builtin_ia32_loadapd256_mask:
1137   case X86::BI__builtin_ia32_loadapd512_mask:
1138   case X86::BI__builtin_ia32_movdqa32load128_mask:
1139   case X86::BI__builtin_ia32_movdqa32load256_mask:
1140   case X86::BI__builtin_ia32_movdqa32load512_mask:
1141   case X86::BI__builtin_ia32_movdqa64load128_mask:
1142   case X86::BI__builtin_ia32_movdqa64load256_mask:
1143   case X86::BI__builtin_ia32_movdqa64load512_mask:
1144     return EmitX86MaskedLoad(
1145         *this, Ops,
1146         getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
1147 
1148   case X86::BI__builtin_ia32_expandloaddf128_mask:
1149   case X86::BI__builtin_ia32_expandloaddf256_mask:
1150   case X86::BI__builtin_ia32_expandloaddf512_mask:
1151   case X86::BI__builtin_ia32_expandloadsf128_mask:
1152   case X86::BI__builtin_ia32_expandloadsf256_mask:
1153   case X86::BI__builtin_ia32_expandloadsf512_mask:
1154   case X86::BI__builtin_ia32_expandloaddi128_mask:
1155   case X86::BI__builtin_ia32_expandloaddi256_mask:
1156   case X86::BI__builtin_ia32_expandloaddi512_mask:
1157   case X86::BI__builtin_ia32_expandloadsi128_mask:
1158   case X86::BI__builtin_ia32_expandloadsi256_mask:
1159   case X86::BI__builtin_ia32_expandloadsi512_mask:
1160   case X86::BI__builtin_ia32_expandloadhi128_mask:
1161   case X86::BI__builtin_ia32_expandloadhi256_mask:
1162   case X86::BI__builtin_ia32_expandloadhi512_mask:
1163   case X86::BI__builtin_ia32_expandloadqi128_mask:
1164   case X86::BI__builtin_ia32_expandloadqi256_mask:
1165   case X86::BI__builtin_ia32_expandloadqi512_mask:
1166     return EmitX86ExpandLoad(*this, Ops);
1167 
1168   case X86::BI__builtin_ia32_compressstoredf128_mask:
1169   case X86::BI__builtin_ia32_compressstoredf256_mask:
1170   case X86::BI__builtin_ia32_compressstoredf512_mask:
1171   case X86::BI__builtin_ia32_compressstoresf128_mask:
1172   case X86::BI__builtin_ia32_compressstoresf256_mask:
1173   case X86::BI__builtin_ia32_compressstoresf512_mask:
1174   case X86::BI__builtin_ia32_compressstoredi128_mask:
1175   case X86::BI__builtin_ia32_compressstoredi256_mask:
1176   case X86::BI__builtin_ia32_compressstoredi512_mask:
1177   case X86::BI__builtin_ia32_compressstoresi128_mask:
1178   case X86::BI__builtin_ia32_compressstoresi256_mask:
1179   case X86::BI__builtin_ia32_compressstoresi512_mask:
1180   case X86::BI__builtin_ia32_compressstorehi128_mask:
1181   case X86::BI__builtin_ia32_compressstorehi256_mask:
1182   case X86::BI__builtin_ia32_compressstorehi512_mask:
1183   case X86::BI__builtin_ia32_compressstoreqi128_mask:
1184   case X86::BI__builtin_ia32_compressstoreqi256_mask:
1185   case X86::BI__builtin_ia32_compressstoreqi512_mask:
1186     return EmitX86CompressStore(*this, Ops);
1187 
1188   case X86::BI__builtin_ia32_expanddf128_mask:
1189   case X86::BI__builtin_ia32_expanddf256_mask:
1190   case X86::BI__builtin_ia32_expanddf512_mask:
1191   case X86::BI__builtin_ia32_expandsf128_mask:
1192   case X86::BI__builtin_ia32_expandsf256_mask:
1193   case X86::BI__builtin_ia32_expandsf512_mask:
1194   case X86::BI__builtin_ia32_expanddi128_mask:
1195   case X86::BI__builtin_ia32_expanddi256_mask:
1196   case X86::BI__builtin_ia32_expanddi512_mask:
1197   case X86::BI__builtin_ia32_expandsi128_mask:
1198   case X86::BI__builtin_ia32_expandsi256_mask:
1199   case X86::BI__builtin_ia32_expandsi512_mask:
1200   case X86::BI__builtin_ia32_expandhi128_mask:
1201   case X86::BI__builtin_ia32_expandhi256_mask:
1202   case X86::BI__builtin_ia32_expandhi512_mask:
1203   case X86::BI__builtin_ia32_expandqi128_mask:
1204   case X86::BI__builtin_ia32_expandqi256_mask:
1205   case X86::BI__builtin_ia32_expandqi512_mask:
1206     return EmitX86CompressExpand(*this, Ops, /*IsCompress*/false);
1207 
1208   case X86::BI__builtin_ia32_compressdf128_mask:
1209   case X86::BI__builtin_ia32_compressdf256_mask:
1210   case X86::BI__builtin_ia32_compressdf512_mask:
1211   case X86::BI__builtin_ia32_compresssf128_mask:
1212   case X86::BI__builtin_ia32_compresssf256_mask:
1213   case X86::BI__builtin_ia32_compresssf512_mask:
1214   case X86::BI__builtin_ia32_compressdi128_mask:
1215   case X86::BI__builtin_ia32_compressdi256_mask:
1216   case X86::BI__builtin_ia32_compressdi512_mask:
1217   case X86::BI__builtin_ia32_compresssi128_mask:
1218   case X86::BI__builtin_ia32_compresssi256_mask:
1219   case X86::BI__builtin_ia32_compresssi512_mask:
1220   case X86::BI__builtin_ia32_compresshi128_mask:
1221   case X86::BI__builtin_ia32_compresshi256_mask:
1222   case X86::BI__builtin_ia32_compresshi512_mask:
1223   case X86::BI__builtin_ia32_compressqi128_mask:
1224   case X86::BI__builtin_ia32_compressqi256_mask:
1225   case X86::BI__builtin_ia32_compressqi512_mask:
1226     return EmitX86CompressExpand(*this, Ops, /*IsCompress*/true);
1227 
1228   case X86::BI__builtin_ia32_gather3div2df:
1229   case X86::BI__builtin_ia32_gather3div2di:
1230   case X86::BI__builtin_ia32_gather3div4df:
1231   case X86::BI__builtin_ia32_gather3div4di:
1232   case X86::BI__builtin_ia32_gather3div4sf:
1233   case X86::BI__builtin_ia32_gather3div4si:
1234   case X86::BI__builtin_ia32_gather3div8sf:
1235   case X86::BI__builtin_ia32_gather3div8si:
1236   case X86::BI__builtin_ia32_gather3siv2df:
1237   case X86::BI__builtin_ia32_gather3siv2di:
1238   case X86::BI__builtin_ia32_gather3siv4df:
1239   case X86::BI__builtin_ia32_gather3siv4di:
1240   case X86::BI__builtin_ia32_gather3siv4sf:
1241   case X86::BI__builtin_ia32_gather3siv4si:
1242   case X86::BI__builtin_ia32_gather3siv8sf:
1243   case X86::BI__builtin_ia32_gather3siv8si:
1244   case X86::BI__builtin_ia32_gathersiv8df:
1245   case X86::BI__builtin_ia32_gathersiv16sf:
1246   case X86::BI__builtin_ia32_gatherdiv8df:
1247   case X86::BI__builtin_ia32_gatherdiv16sf:
1248   case X86::BI__builtin_ia32_gathersiv8di:
1249   case X86::BI__builtin_ia32_gathersiv16si:
1250   case X86::BI__builtin_ia32_gatherdiv8di:
1251   case X86::BI__builtin_ia32_gatherdiv16si: {
1252     Intrinsic::ID IID;
1253     switch (BuiltinID) {
1254     default: llvm_unreachable("Unexpected builtin");
1255     case X86::BI__builtin_ia32_gather3div2df:
1256       IID = Intrinsic::x86_avx512_mask_gather3div2_df;
1257       break;
1258     case X86::BI__builtin_ia32_gather3div2di:
1259       IID = Intrinsic::x86_avx512_mask_gather3div2_di;
1260       break;
1261     case X86::BI__builtin_ia32_gather3div4df:
1262       IID = Intrinsic::x86_avx512_mask_gather3div4_df;
1263       break;
1264     case X86::BI__builtin_ia32_gather3div4di:
1265       IID = Intrinsic::x86_avx512_mask_gather3div4_di;
1266       break;
1267     case X86::BI__builtin_ia32_gather3div4sf:
1268       IID = Intrinsic::x86_avx512_mask_gather3div4_sf;
1269       break;
1270     case X86::BI__builtin_ia32_gather3div4si:
1271       IID = Intrinsic::x86_avx512_mask_gather3div4_si;
1272       break;
1273     case X86::BI__builtin_ia32_gather3div8sf:
1274       IID = Intrinsic::x86_avx512_mask_gather3div8_sf;
1275       break;
1276     case X86::BI__builtin_ia32_gather3div8si:
1277       IID = Intrinsic::x86_avx512_mask_gather3div8_si;
1278       break;
1279     case X86::BI__builtin_ia32_gather3siv2df:
1280       IID = Intrinsic::x86_avx512_mask_gather3siv2_df;
1281       break;
1282     case X86::BI__builtin_ia32_gather3siv2di:
1283       IID = Intrinsic::x86_avx512_mask_gather3siv2_di;
1284       break;
1285     case X86::BI__builtin_ia32_gather3siv4df:
1286       IID = Intrinsic::x86_avx512_mask_gather3siv4_df;
1287       break;
1288     case X86::BI__builtin_ia32_gather3siv4di:
1289       IID = Intrinsic::x86_avx512_mask_gather3siv4_di;
1290       break;
1291     case X86::BI__builtin_ia32_gather3siv4sf:
1292       IID = Intrinsic::x86_avx512_mask_gather3siv4_sf;
1293       break;
1294     case X86::BI__builtin_ia32_gather3siv4si:
1295       IID = Intrinsic::x86_avx512_mask_gather3siv4_si;
1296       break;
1297     case X86::BI__builtin_ia32_gather3siv8sf:
1298       IID = Intrinsic::x86_avx512_mask_gather3siv8_sf;
1299       break;
1300     case X86::BI__builtin_ia32_gather3siv8si:
1301       IID = Intrinsic::x86_avx512_mask_gather3siv8_si;
1302       break;
1303     case X86::BI__builtin_ia32_gathersiv8df:
1304       IID = Intrinsic::x86_avx512_mask_gather_dpd_512;
1305       break;
1306     case X86::BI__builtin_ia32_gathersiv16sf:
1307       IID = Intrinsic::x86_avx512_mask_gather_dps_512;
1308       break;
1309     case X86::BI__builtin_ia32_gatherdiv8df:
1310       IID = Intrinsic::x86_avx512_mask_gather_qpd_512;
1311       break;
1312     case X86::BI__builtin_ia32_gatherdiv16sf:
1313       IID = Intrinsic::x86_avx512_mask_gather_qps_512;
1314       break;
1315     case X86::BI__builtin_ia32_gathersiv8di:
1316       IID = Intrinsic::x86_avx512_mask_gather_dpq_512;
1317       break;
1318     case X86::BI__builtin_ia32_gathersiv16si:
1319       IID = Intrinsic::x86_avx512_mask_gather_dpi_512;
1320       break;
1321     case X86::BI__builtin_ia32_gatherdiv8di:
1322       IID = Intrinsic::x86_avx512_mask_gather_qpq_512;
1323       break;
1324     case X86::BI__builtin_ia32_gatherdiv16si:
1325       IID = Intrinsic::x86_avx512_mask_gather_qpi_512;
1326       break;
1327     }
1328 
1329     unsigned MinElts = std::min(
1330         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(),
1331         cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements());
1332     Ops[3] = getMaskVecValue(*this, Ops[3], MinElts);
1333     Function *Intr = CGM.getIntrinsic(IID);
1334     return Builder.CreateCall(Intr, Ops);
1335   }
1336 
  case X86::BI__builtin_ia32_scattersiv8df:
  case X86::BI__builtin_ia32_scattersiv16sf:
  case X86::BI__builtin_ia32_scatterdiv8df:
  case X86::BI__builtin_ia32_scatterdiv16sf:
  case X86::BI__builtin_ia32_scattersiv8di:
  case X86::BI__builtin_ia32_scattersiv16si:
  case X86::BI__builtin_ia32_scatterdiv8di:
  case X86::BI__builtin_ia32_scatterdiv16si:
  case X86::BI__builtin_ia32_scatterdiv2df:
  case X86::BI__builtin_ia32_scatterdiv2di:
  case X86::BI__builtin_ia32_scatterdiv4df:
  case X86::BI__builtin_ia32_scatterdiv4di:
  case X86::BI__builtin_ia32_scatterdiv4sf:
  case X86::BI__builtin_ia32_scatterdiv4si:
  case X86::BI__builtin_ia32_scatterdiv8sf:
  case X86::BI__builtin_ia32_scatterdiv8si:
  case X86::BI__builtin_ia32_scattersiv2df:
  case X86::BI__builtin_ia32_scattersiv2di:
  case X86::BI__builtin_ia32_scattersiv4df:
  case X86::BI__builtin_ia32_scattersiv4di:
  case X86::BI__builtin_ia32_scattersiv4sf:
  case X86::BI__builtin_ia32_scattersiv4si:
  case X86::BI__builtin_ia32_scattersiv8sf:
  case X86::BI__builtin_ia32_scattersiv8si: {
    // AVX-512 scatter builtins. Map the builtin to the corresponding
    // x86_avx512_mask_scatter* intrinsic, normalize the mask operand, and
    // emit a direct intrinsic call.
    Intrinsic::ID IID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unexpected builtin");
    case X86::BI__builtin_ia32_scattersiv8df:
      IID = Intrinsic::x86_avx512_mask_scatter_dpd_512;
      break;
    case X86::BI__builtin_ia32_scattersiv16sf:
      IID = Intrinsic::x86_avx512_mask_scatter_dps_512;
      break;
    case X86::BI__builtin_ia32_scatterdiv8df:
      IID = Intrinsic::x86_avx512_mask_scatter_qpd_512;
      break;
    case X86::BI__builtin_ia32_scatterdiv16sf:
      IID = Intrinsic::x86_avx512_mask_scatter_qps_512;
      break;
    case X86::BI__builtin_ia32_scattersiv8di:
      IID = Intrinsic::x86_avx512_mask_scatter_dpq_512;
      break;
    case X86::BI__builtin_ia32_scattersiv16si:
      IID = Intrinsic::x86_avx512_mask_scatter_dpi_512;
      break;
    case X86::BI__builtin_ia32_scatterdiv8di:
      IID = Intrinsic::x86_avx512_mask_scatter_qpq_512;
      break;
    case X86::BI__builtin_ia32_scatterdiv16si:
      IID = Intrinsic::x86_avx512_mask_scatter_qpi_512;
      break;
    case X86::BI__builtin_ia32_scatterdiv2df:
      IID = Intrinsic::x86_avx512_mask_scatterdiv2_df;
      break;
    case X86::BI__builtin_ia32_scatterdiv2di:
      IID = Intrinsic::x86_avx512_mask_scatterdiv2_di;
      break;
    case X86::BI__builtin_ia32_scatterdiv4df:
      IID = Intrinsic::x86_avx512_mask_scatterdiv4_df;
      break;
    case X86::BI__builtin_ia32_scatterdiv4di:
      IID = Intrinsic::x86_avx512_mask_scatterdiv4_di;
      break;
    case X86::BI__builtin_ia32_scatterdiv4sf:
      IID = Intrinsic::x86_avx512_mask_scatterdiv4_sf;
      break;
    case X86::BI__builtin_ia32_scatterdiv4si:
      IID = Intrinsic::x86_avx512_mask_scatterdiv4_si;
      break;
    case X86::BI__builtin_ia32_scatterdiv8sf:
      IID = Intrinsic::x86_avx512_mask_scatterdiv8_sf;
      break;
    case X86::BI__builtin_ia32_scatterdiv8si:
      IID = Intrinsic::x86_avx512_mask_scatterdiv8_si;
      break;
    case X86::BI__builtin_ia32_scattersiv2df:
      IID = Intrinsic::x86_avx512_mask_scattersiv2_df;
      break;
    case X86::BI__builtin_ia32_scattersiv2di:
      IID = Intrinsic::x86_avx512_mask_scattersiv2_di;
      break;
    case X86::BI__builtin_ia32_scattersiv4df:
      IID = Intrinsic::x86_avx512_mask_scattersiv4_df;
      break;
    case X86::BI__builtin_ia32_scattersiv4di:
      IID = Intrinsic::x86_avx512_mask_scattersiv4_di;
      break;
    case X86::BI__builtin_ia32_scattersiv4sf:
      IID = Intrinsic::x86_avx512_mask_scattersiv4_sf;
      break;
    case X86::BI__builtin_ia32_scattersiv4si:
      IID = Intrinsic::x86_avx512_mask_scattersiv4_si;
      break;
    case X86::BI__builtin_ia32_scattersiv8sf:
      IID = Intrinsic::x86_avx512_mask_scattersiv8_sf;
      break;
    case X86::BI__builtin_ia32_scattersiv8si:
      IID = Intrinsic::x86_avx512_mask_scattersiv8_si;
      break;
    }

    // The i1 mask vector must be as wide as the narrower of the two vector
    // operands (Ops[2] and Ops[3]); widen/bitcast the integer mask in Ops[1]
    // accordingly.
    unsigned MinElts = std::min(
        cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements(),
        cast<llvm::FixedVectorType>(Ops[3]->getType())->getNumElements());
    Ops[1] = getMaskVecValue(*this, Ops[1], MinElts);
    Function *Intr = CGM.getIntrinsic(IID);
    return Builder.CreateCall(Intr, Ops);
  }
1445 
  case X86::BI__builtin_ia32_vextractf128_pd256:
  case X86::BI__builtin_ia32_vextractf128_ps256:
  case X86::BI__builtin_ia32_vextractf128_si256:
  case X86::BI__builtin_ia32_extract128i256:
  case X86::BI__builtin_ia32_extractf64x4_mask:
  case X86::BI__builtin_ia32_extractf32x4_mask:
  case X86::BI__builtin_ia32_extracti64x4_mask:
  case X86::BI__builtin_ia32_extracti32x4_mask:
  case X86::BI__builtin_ia32_extractf32x8_mask:
  case X86::BI__builtin_ia32_extracti32x8_mask:
  case X86::BI__builtin_ia32_extractf32x4_256_mask:
  case X86::BI__builtin_ia32_extracti32x4_256_mask:
  case X86::BI__builtin_ia32_extractf64x2_256_mask:
  case X86::BI__builtin_ia32_extracti64x2_256_mask:
  case X86::BI__builtin_ia32_extractf64x2_512_mask:
  case X86::BI__builtin_ia32_extracti64x2_512_mask: {
    // Subvector extract: lower to a shufflevector that selects NumElts
    // consecutive elements starting at the (masked) subvector index.
    auto *DstTy = cast<llvm::FixedVectorType>(ConvertType(E->getType()));
    unsigned NumElts = DstTy->getNumElements();
    unsigned SrcNumElts =
        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
    unsigned SubVectors = SrcNumElts / NumElts;
    unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue();
    assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
    Index &= SubVectors - 1; // Remove any extra bits.
    Index *= NumElts;        // Convert subvector index to element index.

    int Indices[16];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = i + Index;

    Value *Res = Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
                                             "extract");

    // The *_mask variants carry two extra operands: passthrough (Ops[2]) and
    // mask (Ops[3]); apply them with a select.
    if (Ops.size() == 4)
      Res = EmitX86Select(*this, Ops[3], Res, Ops[2]);

    return Res;
  }
  case X86::BI__builtin_ia32_vinsertf128_pd256:
  case X86::BI__builtin_ia32_vinsertf128_ps256:
  case X86::BI__builtin_ia32_vinsertf128_si256:
  case X86::BI__builtin_ia32_insert128i256:
  case X86::BI__builtin_ia32_insertf64x4:
  case X86::BI__builtin_ia32_insertf32x4:
  case X86::BI__builtin_ia32_inserti64x4:
  case X86::BI__builtin_ia32_inserti32x4:
  case X86::BI__builtin_ia32_insertf32x8:
  case X86::BI__builtin_ia32_inserti32x8:
  case X86::BI__builtin_ia32_insertf32x4_256:
  case X86::BI__builtin_ia32_inserti32x4_256:
  case X86::BI__builtin_ia32_insertf64x2_256:
  case X86::BI__builtin_ia32_inserti64x2_256:
  case X86::BI__builtin_ia32_insertf64x2_512:
  case X86::BI__builtin_ia32_inserti64x2_512: {
    // Subvector insert: widen the small source (Ops[1]) to the destination
    // width, then blend it into Ops[0] at the (masked) subvector index with a
    // second shuffle.
    unsigned DstNumElts =
        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
    unsigned SrcNumElts =
        cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements();
    unsigned SubVectors = DstNumElts / SrcNumElts;
    unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
    assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
    Index &= SubVectors - 1; // Remove any extra bits.
    Index *= SrcNumElts;     // Convert subvector index to element index.

    // First shuffle: widen Ops[1] to DstNumElts elements. Elements past
    // SrcNumElts just wrap; their values are irrelevant because the second
    // shuffle only reads the first SrcNumElts lanes of the widened vector.
    int Indices[16];
    for (unsigned i = 0; i != DstNumElts; ++i)
      Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;

    Value *Op1 = Builder.CreateShuffleVector(
        Ops[1], ArrayRef(Indices, DstNumElts), "widen");

    // Second shuffle: take Ops[0] everywhere except the inserted window
    // [Index, Index + SrcNumElts), which comes from the widened source
    // (operand indices DstNumElts..).
    for (unsigned i = 0; i != DstNumElts; ++i) {
      if (i >= Index && i < (Index + SrcNumElts))
        Indices[i] = (i - Index) + DstNumElts;
      else
        Indices[i] = i;
    }

    return Builder.CreateShuffleVector(Ops[0], Op1,
                                       ArrayRef(Indices, DstNumElts), "insert");
  }
  case X86::BI__builtin_ia32_pmovqd512_mask:
  case X86::BI__builtin_ia32_pmovwb512_mask: {
    // Masked truncating moves that can always be expressed as a plain trunc
    // followed by a masked select against the passthrough (Ops[1]).
    Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType());
    return EmitX86Select(*this, Ops[2], Res, Ops[1]);
  }
  case X86::BI__builtin_ia32_pmovdb512_mask:
  case X86::BI__builtin_ia32_pmovdw512_mask:
  case X86::BI__builtin_ia32_pmovqw512_mask: {
    // With an all-ones mask the passthrough is dead, so emit a plain trunc.
    if (const auto *C = dyn_cast<Constant>(Ops[2]))
      if (C->isAllOnesValue())
        return Builder.CreateTrunc(Ops[0], Ops[1]->getType());

    // Otherwise fall back to the target intrinsic, which handles the masked
    // merge itself.
    Intrinsic::ID IID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_pmovdb512_mask:
      IID = Intrinsic::x86_avx512_mask_pmov_db_512;
      break;
    case X86::BI__builtin_ia32_pmovdw512_mask:
      IID = Intrinsic::x86_avx512_mask_pmov_dw_512;
      break;
    case X86::BI__builtin_ia32_pmovqw512_mask:
      IID = Intrinsic::x86_avx512_mask_pmov_qw_512;
      break;
    }

    Function *Intr = CGM.getIntrinsic(IID);
    return Builder.CreateCall(Intr, Ops);
  }
  case X86::BI__builtin_ia32_pblendw128:
  case X86::BI__builtin_ia32_blendpd:
  case X86::BI__builtin_ia32_blendps:
  case X86::BI__builtin_ia32_blendpd256:
  case X86::BI__builtin_ia32_blendps256:
  case X86::BI__builtin_ia32_pblendw256:
  case X86::BI__builtin_ia32_pblendd128:
  case X86::BI__builtin_ia32_pblendd256: {
    // Immediate blends: bit i of the immediate selects element i from Ops[1]
    // (set) or Ops[0] (clear). Lower to a two-source shufflevector.
    unsigned NumElts =
        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();

    int Indices[16];
    // If there are more than 8 elements, the immediate is used twice so make
    // sure we handle that.
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;

    return Builder.CreateShuffleVector(Ops[0], Ops[1],
                                       ArrayRef(Indices, NumElts), "blend");
  }
  case X86::BI__builtin_ia32_pshuflw:
  case X86::BI__builtin_ia32_pshuflw256:
  case X86::BI__builtin_ia32_pshuflw512: {
    // pshuflw: within each 128-bit lane (8 x i16), shuffle the low 4 words by
    // the 2-bit fields of the immediate; the high 4 words pass through.
    uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
    unsigned NumElts = Ty->getNumElements();

    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
    Imm = (Imm & 0xff) * 0x01010101;

    int Indices[32];
    for (unsigned l = 0; l != NumElts; l += 8) {
      for (unsigned i = 0; i != 4; ++i) {
        Indices[l + i] = l + (Imm & 3);
        Imm >>= 2;
      }
      // High half of the lane is unchanged (identity indices).
      for (unsigned i = 4; i != 8; ++i)
        Indices[l + i] = l + i;
    }

    return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
                                       "pshuflw");
  }
  case X86::BI__builtin_ia32_pshufhw:
  case X86::BI__builtin_ia32_pshufhw256:
  case X86::BI__builtin_ia32_pshufhw512: {
    // pshufhw: mirror image of pshuflw — the low 4 words of each 128-bit lane
    // pass through and the high 4 are shuffled by the immediate.
    uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
    unsigned NumElts = Ty->getNumElements();

    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
    Imm = (Imm & 0xff) * 0x01010101;

    int Indices[32];
    for (unsigned l = 0; l != NumElts; l += 8) {
      // Low half of the lane is unchanged (identity indices).
      for (unsigned i = 0; i != 4; ++i)
        Indices[l + i] = l + i;
      for (unsigned i = 4; i != 8; ++i) {
        Indices[l + i] = l + 4 + (Imm & 3);
        Imm >>= 2;
      }
    }

    return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
                                       "pshufhw");
  }
  case X86::BI__builtin_ia32_pshufd:
  case X86::BI__builtin_ia32_pshufd256:
  case X86::BI__builtin_ia32_pshufd512:
  case X86::BI__builtin_ia32_vpermilpd:
  case X86::BI__builtin_ia32_vpermilps:
  case X86::BI__builtin_ia32_vpermilpd256:
  case X86::BI__builtin_ia32_vpermilps256:
  case X86::BI__builtin_ia32_vpermilpd512:
  case X86::BI__builtin_ia32_vpermilps512: {
    // Per-128-bit-lane element permute controlled by an 8-bit immediate; the
    // same immediate is reapplied to every lane.
    uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
    unsigned NumElts = Ty->getNumElements();
    unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
    unsigned NumLaneElts = NumElts / NumLanes;

    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
    Imm = (Imm & 0xff) * 0x01010101;

    int Indices[16];
    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
      for (unsigned i = 0; i != NumLaneElts; ++i) {
        // Consume log2(NumLaneElts) bits of the immediate per element and
        // bias the selected index into the current lane.
        Indices[i + l] = (Imm % NumLaneElts) + l;
        Imm /= NumLaneElts;
      }
    }

    return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
                                       "permil");
  }
  case X86::BI__builtin_ia32_shufpd:
  case X86::BI__builtin_ia32_shufpd256:
  case X86::BI__builtin_ia32_shufpd512:
  case X86::BI__builtin_ia32_shufps:
  case X86::BI__builtin_ia32_shufps256:
  case X86::BI__builtin_ia32_shufps512: {
    // shufps/shufpd: per 128-bit lane, the low half of the result selects
    // from Ops[0] and the high half from Ops[1], with element choice driven
    // by the immediate.
    uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
    unsigned NumElts = Ty->getNumElements();
    unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
    unsigned NumLaneElts = NumElts / NumLanes;

    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
    Imm = (Imm & 0xff) * 0x01010101;

    int Indices[16];
    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
      for (unsigned i = 0; i != NumLaneElts; ++i) {
        unsigned Index = Imm % NumLaneElts;
        Imm /= NumLaneElts;
        // Upper half of each result lane reads the second shuffle operand.
        if (i >= (NumLaneElts / 2))
          Index += NumElts;
        Indices[l + i] = l + Index;
      }
    }

    return Builder.CreateShuffleVector(Ops[0], Ops[1],
                                       ArrayRef(Indices, NumElts), "shufp");
  }
  case X86::BI__builtin_ia32_permdi256:
  case X86::BI__builtin_ia32_permdf256:
  case X86::BI__builtin_ia32_permdi512:
  case X86::BI__builtin_ia32_permdf512: {
    // vpermq/vpermpd: permute 64-bit elements within each 256-bit lane using
    // 2-bit fields of the immediate.
    unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
    unsigned NumElts = Ty->getNumElements();

    // These intrinsics operate on 256-bit lanes of four 64-bit elements.
    int Indices[8];
    for (unsigned l = 0; l != NumElts; l += 4)
      for (unsigned i = 0; i != 4; ++i)
        Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3);

    return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
                                       "perm");
  }
  case X86::BI__builtin_ia32_palignr128:
  case X86::BI__builtin_ia32_palignr256:
  case X86::BI__builtin_ia32_palignr512: {
    // palignr: per 128-bit lane, right-shift the byte concatenation of the
    // two sources by ShiftVal bytes. Lowered to a shufflevector over the
    // (Ops[1], Ops[0]) pair.
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;

    unsigned NumElts =
        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
    assert(NumElts % 16 == 0);

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if (ShiftVal >= 32)
      return llvm::Constant::getNullValue(ConvertType(E->getType()));

    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    if (ShiftVal > 16) {
      ShiftVal -= 16;
      Ops[1] = Ops[0];
      Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType());
    }

    int Indices[64];
    // 256-bit palignr operates on 128-bit lanes so we need to handle that
    for (unsigned l = 0; l != NumElts; l += 16) {
      for (unsigned i = 0; i != 16; ++i) {
        unsigned Idx = ShiftVal + i;
        if (Idx >= 16)
          Idx += NumElts - 16; // End of lane, switch operand.
        Indices[l + i] = Idx + l;
      }
    }

    return Builder.CreateShuffleVector(Ops[1], Ops[0],
                                       ArrayRef(Indices, NumElts), "palignr");
  }
  case X86::BI__builtin_ia32_alignd128:
  case X86::BI__builtin_ia32_alignd256:
  case X86::BI__builtin_ia32_alignd512:
  case X86::BI__builtin_ia32_alignq128:
  case X86::BI__builtin_ia32_alignq256:
  case X86::BI__builtin_ia32_alignq512: {
    // valignd/valignq: whole-vector (not per-lane) element-granular right
    // shift of the concatenation (Ops[0]:Ops[1]).
    unsigned NumElts =
        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;

    // Mask the shift amount to width of a vector.
    ShiftVal &= NumElts - 1;

    int Indices[16];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = i + ShiftVal;

    return Builder.CreateShuffleVector(Ops[1], Ops[0],
                                       ArrayRef(Indices, NumElts), "valign");
  }
  case X86::BI__builtin_ia32_shuf_f32x4_256:
  case X86::BI__builtin_ia32_shuf_f64x2_256:
  case X86::BI__builtin_ia32_shuf_i32x4_256:
  case X86::BI__builtin_ia32_shuf_i64x2_256:
  case X86::BI__builtin_ia32_shuf_f32x4:
  case X86::BI__builtin_ia32_shuf_f64x2:
  case X86::BI__builtin_ia32_shuf_i32x4:
  case X86::BI__builtin_ia32_shuf_i64x2: {
    // vshuf[fi]32x4/64x2: build the result from whole 128-bit lanes; the low
    // half of the result takes lanes from Ops[0], the high half from Ops[1],
    // with the lane number per slot selected by the immediate.
    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
    unsigned NumElts = Ty->getNumElements();
    unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2;
    unsigned NumLaneElts = NumElts / NumLanes;

    int Indices[16];
    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
      unsigned Index = (Imm % NumLanes) * NumLaneElts;
      Imm /= NumLanes; // Discard the bits we just used.
      if (l >= (NumElts / 2))
        Index += NumElts; // Switch to other source.
      for (unsigned i = 0; i != NumLaneElts; ++i) {
        Indices[l + i] = Index + i;
      }
    }

    return Builder.CreateShuffleVector(Ops[0], Ops[1],
                                       ArrayRef(Indices, NumElts), "shuf");
  }
1782 
  case X86::BI__builtin_ia32_vperm2f128_pd256:
  case X86::BI__builtin_ia32_vperm2f128_ps256:
  case X86::BI__builtin_ia32_vperm2f128_si256:
  case X86::BI__builtin_ia32_permti256: {
    // vperm2f128/vperm2i128: each 4-bit half of the immediate controls one
    // 128-bit result lane (source select, high/low half select, zero bit).
    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
    unsigned NumElts =
        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();

    // This takes a very simple approach since there are two lanes and a
    // shuffle can have 2 inputs. So we reserve the first input for the first
    // lane and the second input for the second lane. This may result in
    // duplicate sources, but this can be dealt with in the backend.

    Value *OutOps[2];
    int Indices[8];
    for (unsigned l = 0; l != 2; ++l) {
      // Determine the source for this lane: bit 3 of the nibble zeroes the
      // lane, bit 1 picks Ops[1], otherwise Ops[0].
      if (Imm & (1 << ((l * 4) + 3)))
        OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
      else if (Imm & (1 << ((l * 4) + 1)))
        OutOps[l] = Ops[1];
      else
        OutOps[l] = Ops[0];

      for (unsigned i = 0; i != NumElts/2; ++i) {
        // Start with ith element of the source for this lane.
        unsigned Idx = (l * NumElts) + i;
        // If bit 0 of the immediate half is set, switch to the high half of
        // the source.
        if (Imm & (1 << (l * 4)))
          Idx += NumElts/2;
        Indices[(l * (NumElts/2)) + i] = Idx;
      }
    }

    return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
                                       ArrayRef(Indices, NumElts), "vperm");
  }
1821 
  case X86::BI__builtin_ia32_pslldqi128_byteshift:
  case X86::BI__builtin_ia32_pslldqi256_byteshift:
  case X86::BI__builtin_ia32_pslldqi512_byteshift: {
    // pslldq: byte-granular left shift within each 128-bit lane, shifting in
    // zeroes. Implemented by bitcasting to vXi8 and shuffling against zero.
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
    auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
    // Builtin type is vXi64 so multiply by 8 to get bytes.
    unsigned NumElts = ResultType->getNumElements() * 8;

    // If pslldq is shifting the vector more than 15 bytes, emit zero.
    if (ShiftVal >= 16)
      return llvm::Constant::getNullValue(ResultType);

    int Indices[64];
    // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that
    for (unsigned l = 0; l != NumElts; l += 16) {
      for (unsigned i = 0; i != 16; ++i) {
        // Shuffle operands are (Zero, Cast): indices < NumElts read zeroes,
        // indices >= NumElts read the input bytes.
        unsigned Idx = NumElts + i - ShiftVal;
        if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
        Indices[l + i] = Idx + l;
      }
    }

    auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
    Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
    Value *Zero = llvm::Constant::getNullValue(VecTy);
    Value *SV = Builder.CreateShuffleVector(
        Zero, Cast, ArrayRef(Indices, NumElts), "pslldq");
    return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast");
  }
  case X86::BI__builtin_ia32_psrldqi128_byteshift:
  case X86::BI__builtin_ia32_psrldqi256_byteshift:
  case X86::BI__builtin_ia32_psrldqi512_byteshift: {
    // psrldq: byte-granular right shift within each 128-bit lane, shifting in
    // zeroes. Implemented by bitcasting to vXi8 and shuffling against zero.
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
    auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
    // Builtin type is vXi64 so multiply by 8 to get bytes.
    unsigned NumElts = ResultType->getNumElements() * 8;

    // If psrldq is shifting the vector more than 15 bytes, emit zero.
    if (ShiftVal >= 16)
      return llvm::Constant::getNullValue(ResultType);

    int Indices[64];
    // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that
    for (unsigned l = 0; l != NumElts; l += 16) {
      for (unsigned i = 0; i != 16; ++i) {
        // Shuffle operands are (Cast, Zero): indices >= NumElts read zeroes.
        unsigned Idx = i + ShiftVal;
        if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
        Indices[l + i] = Idx + l;
      }
    }

    auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
    Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
    Value *Zero = llvm::Constant::getNullValue(VecTy);
    Value *SV = Builder.CreateShuffleVector(
        Cast, Zero, ArrayRef(Indices, NumElts), "psrldq");
    return Builder.CreateBitCast(SV, ResultType, "cast");
  }
  case X86::BI__builtin_ia32_kshiftliqi:
  case X86::BI__builtin_ia32_kshiftlihi:
  case X86::BI__builtin_ia32_kshiftlisi:
  case X86::BI__builtin_ia32_kshiftlidi: {
    // kshiftl: shift an AVX-512 mask register left by ShiftVal bits. The mask
    // is modeled as an <N x i1> vector, so the shift becomes a shuffle that
    // pulls zero bits in from a zero vector.
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();

    // Shifting the whole mask width (or more) out yields zero.
    if (ShiftVal >= NumElts)
      return llvm::Constant::getNullValue(Ops[0]->getType());

    Value *In = getMaskVecValue(*this, Ops[0], NumElts);

    int Indices[64];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = NumElts + i - ShiftVal;

    Value *Zero = llvm::Constant::getNullValue(In->getType());
    Value *SV = Builder.CreateShuffleVector(
        Zero, In, ArrayRef(Indices, NumElts), "kshiftl");
    // Bitcast the <N x i1> result back to the integer mask type.
    return Builder.CreateBitCast(SV, Ops[0]->getType());
  }
  case X86::BI__builtin_ia32_kshiftriqi:
  case X86::BI__builtin_ia32_kshiftrihi:
  case X86::BI__builtin_ia32_kshiftrisi:
  case X86::BI__builtin_ia32_kshiftridi: {
    // kshiftr: same scheme as kshiftl above, but shifting right — zeroes come
    // in at the top (operand order in the shuffle is swapped).
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();

    if (ShiftVal >= NumElts)
      return llvm::Constant::getNullValue(Ops[0]->getType());

    Value *In = getMaskVecValue(*this, Ops[0], NumElts);

    int Indices[64];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = i + ShiftVal;

    Value *Zero = llvm::Constant::getNullValue(In->getType());
    Value *SV = Builder.CreateShuffleVector(
        In, Zero, ArrayRef(Indices, NumElts), "kshiftr");
    return Builder.CreateBitCast(SV, Ops[0]->getType());
  }
  case X86::BI__builtin_ia32_movnti:
  case X86::BI__builtin_ia32_movnti64:
  case X86::BI__builtin_ia32_movntsd:
  case X86::BI__builtin_ia32_movntss: {
    // Non-temporal scalar stores: emit a regular store tagged with
    // !nontemporal metadata so the backend selects the movnt* instruction.
    llvm::MDNode *Node = llvm::MDNode::get(
        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));

    Value *Ptr = Ops[0];
    Value *Src = Ops[1];

    // Extract the 0'th element of the source vector.
    if (BuiltinID == X86::BI__builtin_ia32_movntsd ||
        BuiltinID == X86::BI__builtin_ia32_movntss)
      Src = Builder.CreateExtractElement(Src, (uint64_t)0, "extract");

    // Unaligned nontemporal store of the scalar value.
    StoreInst *SI = Builder.CreateDefaultAlignedStore(Src, Ptr);
    SI->setMetadata(llvm::LLVMContext::MD_nontemporal, Node);
    SI->setAlignment(llvm::Align(1));
    return SI;
  }
  // Rotate is a special case of funnel shift - 1st 2 args are the same.
  case X86::BI__builtin_ia32_vprotb:
  case X86::BI__builtin_ia32_vprotw:
  case X86::BI__builtin_ia32_vprotd:
  case X86::BI__builtin_ia32_vprotq:
  case X86::BI__builtin_ia32_vprotbi:
  case X86::BI__builtin_ia32_vprotwi:
  case X86::BI__builtin_ia32_vprotdi:
  case X86::BI__builtin_ia32_vprotqi:
  case X86::BI__builtin_ia32_prold128:
  case X86::BI__builtin_ia32_prold256:
  case X86::BI__builtin_ia32_prold512:
  case X86::BI__builtin_ia32_prolq128:
  case X86::BI__builtin_ia32_prolq256:
  case X86::BI__builtin_ia32_prolq512:
  case X86::BI__builtin_ia32_prolvd128:
  case X86::BI__builtin_ia32_prolvd256:
  case X86::BI__builtin_ia32_prolvd512:
  case X86::BI__builtin_ia32_prolvq128:
  case X86::BI__builtin_ia32_prolvq256:
  case X86::BI__builtin_ia32_prolvq512:
    // Rotate left: funnel-shift-left with both value operands the same.
    return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], false);
  case X86::BI__builtin_ia32_prord128:
  case X86::BI__builtin_ia32_prord256:
  case X86::BI__builtin_ia32_prord512:
  case X86::BI__builtin_ia32_prorq128:
  case X86::BI__builtin_ia32_prorq256:
  case X86::BI__builtin_ia32_prorq512:
  case X86::BI__builtin_ia32_prorvd128:
  case X86::BI__builtin_ia32_prorvd256:
  case X86::BI__builtin_ia32_prorvd512:
  case X86::BI__builtin_ia32_prorvq128:
  case X86::BI__builtin_ia32_prorvq256:
  case X86::BI__builtin_ia32_prorvq512:
    // Rotate right: same, with the IsRight flag set.
    return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], true);
  case X86::BI__builtin_ia32_selectb_128:
  case X86::BI__builtin_ia32_selectb_256:
  case X86::BI__builtin_ia32_selectb_512:
  case X86::BI__builtin_ia32_selectw_128:
  case X86::BI__builtin_ia32_selectw_256:
  case X86::BI__builtin_ia32_selectw_512:
  case X86::BI__builtin_ia32_selectd_128:
  case X86::BI__builtin_ia32_selectd_256:
  case X86::BI__builtin_ia32_selectd_512:
  case X86::BI__builtin_ia32_selectq_128:
  case X86::BI__builtin_ia32_selectq_256:
  case X86::BI__builtin_ia32_selectq_512:
  case X86::BI__builtin_ia32_selectph_128:
  case X86::BI__builtin_ia32_selectph_256:
  case X86::BI__builtin_ia32_selectph_512:
  case X86::BI__builtin_ia32_selectpbf_128:
  case X86::BI__builtin_ia32_selectpbf_256:
  case X86::BI__builtin_ia32_selectpbf_512:
  case X86::BI__builtin_ia32_selectps_128:
  case X86::BI__builtin_ia32_selectps_256:
  case X86::BI__builtin_ia32_selectps_512:
  case X86::BI__builtin_ia32_selectpd_128:
  case X86::BI__builtin_ia32_selectpd_256:
  case X86::BI__builtin_ia32_selectpd_512:
    // Mask-driven vector select: mask Ops[0] picks elementwise between
    // Ops[1] and Ops[2].
    return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
  case X86::BI__builtin_ia32_selectsh_128:
  case X86::BI__builtin_ia32_selectsbf_128:
  case X86::BI__builtin_ia32_selectss_128:
  case X86::BI__builtin_ia32_selectsd_128: {
    // Scalar variants: select only applies to element 0; the rest of the
    // result vector is taken unchanged from Ops[1].
    Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
    Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
    A = EmitX86ScalarSelect(*this, Ops[0], A, B);
    return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0);
  }
  case X86::BI__builtin_ia32_cmpb128_mask:
  case X86::BI__builtin_ia32_cmpb256_mask:
  case X86::BI__builtin_ia32_cmpb512_mask:
  case X86::BI__builtin_ia32_cmpw128_mask:
  case X86::BI__builtin_ia32_cmpw256_mask:
  case X86::BI__builtin_ia32_cmpw512_mask:
  case X86::BI__builtin_ia32_cmpd128_mask:
  case X86::BI__builtin_ia32_cmpd256_mask:
  case X86::BI__builtin_ia32_cmpd512_mask:
  case X86::BI__builtin_ia32_cmpq128_mask:
  case X86::BI__builtin_ia32_cmpq256_mask:
  case X86::BI__builtin_ia32_cmpq512_mask: {
    // Signed integer compare producing a mask; the low 3 bits of the
    // immediate encode the comparison predicate.
    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
    return EmitX86MaskedCompare(*this, CC, true, Ops);
  }
  case X86::BI__builtin_ia32_ucmpb128_mask:
  case X86::BI__builtin_ia32_ucmpb256_mask:
  case X86::BI__builtin_ia32_ucmpb512_mask:
  case X86::BI__builtin_ia32_ucmpw128_mask:
  case X86::BI__builtin_ia32_ucmpw256_mask:
  case X86::BI__builtin_ia32_ucmpw512_mask:
  case X86::BI__builtin_ia32_ucmpd128_mask:
  case X86::BI__builtin_ia32_ucmpd256_mask:
  case X86::BI__builtin_ia32_ucmpd512_mask:
  case X86::BI__builtin_ia32_ucmpq128_mask:
  case X86::BI__builtin_ia32_ucmpq256_mask:
  case X86::BI__builtin_ia32_ucmpq512_mask: {
    // Unsigned flavor of the masked compare above.
    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
    return EmitX86MaskedCompare(*this, CC, false, Ops);
  }
  case X86::BI__builtin_ia32_vpcomb:
  case X86::BI__builtin_ia32_vpcomw:
  case X86::BI__builtin_ia32_vpcomd:
  case X86::BI__builtin_ia32_vpcomq:
    // XOP vpcom: signed vector compare with immediate predicate.
    return EmitX86vpcom(*this, Ops, true);
  case X86::BI__builtin_ia32_vpcomub:
  case X86::BI__builtin_ia32_vpcomuw:
  case X86::BI__builtin_ia32_vpcomud:
  case X86::BI__builtin_ia32_vpcomuq:
    // XOP vpcomu: unsigned counterpart.
    return EmitX86vpcom(*this, Ops, false);
2052 
  case X86::BI__builtin_ia32_kortestcqi:
  case X86::BI__builtin_ia32_kortestchi:
  case X86::BI__builtin_ia32_kortestcsi:
  case X86::BI__builtin_ia32_kortestcdi: {
    // kortestc: OR the two masks and report 1 iff the result is all-ones
    // (carry flag semantics of kortest).
    Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
    Value *C = llvm::Constant::getAllOnesValue(Ops[0]->getType());
    Value *Cmp = Builder.CreateICmpEQ(Or, C);
    return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
  }
  case X86::BI__builtin_ia32_kortestzqi:
  case X86::BI__builtin_ia32_kortestzhi:
  case X86::BI__builtin_ia32_kortestzsi:
  case X86::BI__builtin_ia32_kortestzdi: {
    // kortestz: OR the two masks and report 1 iff the result is zero
    // (zero flag semantics of kortest).
    Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
    Value *C = llvm::Constant::getNullValue(Ops[0]->getType());
    Value *Cmp = Builder.CreateICmpEQ(Or, C);
    return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
  }
2071 
  case X86::BI__builtin_ia32_ktestcqi:
  case X86::BI__builtin_ia32_ktestzqi:
  case X86::BI__builtin_ia32_ktestchi:
  case X86::BI__builtin_ia32_ktestzhi:
  case X86::BI__builtin_ia32_ktestcsi:
  case X86::BI__builtin_ia32_ktestzsi:
  case X86::BI__builtin_ia32_ktestcdi:
  case X86::BI__builtin_ia32_ktestzdi: {
    // ktest: no generic-IR lowering here; select the matching target
    // intrinsic and pass the masks as <N x i1> vectors.
    Intrinsic::ID IID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_ktestcqi:
      IID = Intrinsic::x86_avx512_ktestc_b;
      break;
    case X86::BI__builtin_ia32_ktestzqi:
      IID = Intrinsic::x86_avx512_ktestz_b;
      break;
    case X86::BI__builtin_ia32_ktestchi:
      IID = Intrinsic::x86_avx512_ktestc_w;
      break;
    case X86::BI__builtin_ia32_ktestzhi:
      IID = Intrinsic::x86_avx512_ktestz_w;
      break;
    case X86::BI__builtin_ia32_ktestcsi:
      IID = Intrinsic::x86_avx512_ktestc_d;
      break;
    case X86::BI__builtin_ia32_ktestzsi:
      IID = Intrinsic::x86_avx512_ktestz_d;
      break;
    case X86::BI__builtin_ia32_ktestcdi:
      IID = Intrinsic::x86_avx512_ktestc_q;
      break;
    case X86::BI__builtin_ia32_ktestzdi:
      IID = Intrinsic::x86_avx512_ktestz_q;
      break;
    }

    // Mask width equals the integer bit width of the builtin's mask type.
    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
    Function *Intr = CGM.getIntrinsic(IID);
    return Builder.CreateCall(Intr, {LHS, RHS});
  }
2115 
  // kadd: lower to the width-specific AVX512 kadd intrinsic on <N x i1>
  // operands, then bitcast the mask-vector result back to the original
  // integer mask type.
  case X86::BI__builtin_ia32_kaddqi:
  case X86::BI__builtin_ia32_kaddhi:
  case X86::BI__builtin_ia32_kaddsi:
  case X86::BI__builtin_ia32_kadddi: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_kaddqi:
      IID = Intrinsic::x86_avx512_kadd_b;
      break;
    case X86::BI__builtin_ia32_kaddhi:
      IID = Intrinsic::x86_avx512_kadd_w;
      break;
    case X86::BI__builtin_ia32_kaddsi:
      IID = Intrinsic::x86_avx512_kadd_d;
      break;
    case X86::BI__builtin_ia32_kadddi:
      IID = Intrinsic::x86_avx512_kadd_q;
      break;
    }

    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
    Function *Intr = CGM.getIntrinsic(IID);
    Value *Res = Builder.CreateCall(Intr, {LHS, RHS});
    return Builder.CreateBitCast(Res, Ops[0]->getType());
  }
2144   case X86::BI__builtin_ia32_kandqi:
2145   case X86::BI__builtin_ia32_kandhi:
2146   case X86::BI__builtin_ia32_kandsi:
2147   case X86::BI__builtin_ia32_kanddi:
2148     return EmitX86MaskLogic(*this, Instruction::And, Ops);
2149   case X86::BI__builtin_ia32_kandnqi:
2150   case X86::BI__builtin_ia32_kandnhi:
2151   case X86::BI__builtin_ia32_kandnsi:
2152   case X86::BI__builtin_ia32_kandndi:
2153     return EmitX86MaskLogic(*this, Instruction::And, Ops, true);
2154   case X86::BI__builtin_ia32_korqi:
2155   case X86::BI__builtin_ia32_korhi:
2156   case X86::BI__builtin_ia32_korsi:
2157   case X86::BI__builtin_ia32_kordi:
2158     return EmitX86MaskLogic(*this, Instruction::Or, Ops);
2159   case X86::BI__builtin_ia32_kxnorqi:
2160   case X86::BI__builtin_ia32_kxnorhi:
2161   case X86::BI__builtin_ia32_kxnorsi:
2162   case X86::BI__builtin_ia32_kxnordi:
2163     return EmitX86MaskLogic(*this, Instruction::Xor, Ops, true);
2164   case X86::BI__builtin_ia32_kxorqi:
2165   case X86::BI__builtin_ia32_kxorhi:
2166   case X86::BI__builtin_ia32_kxorsi:
2167   case X86::BI__builtin_ia32_kxordi:
2168     return EmitX86MaskLogic(*this, Instruction::Xor,  Ops);
2169   case X86::BI__builtin_ia32_knotqi:
2170   case X86::BI__builtin_ia32_knothi:
2171   case X86::BI__builtin_ia32_knotsi:
2172   case X86::BI__builtin_ia32_knotdi: {
2173     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2174     Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
2175     return Builder.CreateBitCast(Builder.CreateNot(Res),
2176                                  Ops[0]->getType());
2177   }
2178   case X86::BI__builtin_ia32_kmovb:
2179   case X86::BI__builtin_ia32_kmovw:
2180   case X86::BI__builtin_ia32_kmovd:
2181   case X86::BI__builtin_ia32_kmovq: {
2182     // Bitcast to vXi1 type and then back to integer. This gets the mask
2183     // register type into the IR, but might be optimized out depending on
2184     // what's around it.
2185     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2186     Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
2187     return Builder.CreateBitCast(Res, Ops[0]->getType());
2188   }
2189 
  // kunpck: concatenate the low halves of two masks. The widest mask handled
  // here is 64 bits, so a fixed 64-entry identity index array suffices.
  case X86::BI__builtin_ia32_kunpckdi:
  case X86::BI__builtin_ia32_kunpcksi:
  case X86::BI__builtin_ia32_kunpckhi: {
    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
    int Indices[64];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = i;

    // First extract half of each vector. This gives better codegen than
    // doing it in a single shuffle.
    LHS = Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2));
    RHS = Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2));
    // Concat the vectors.
    // NOTE: Operands are swapped to match the intrinsic definition.
    Value *Res =
        Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts));
    return Builder.CreateBitCast(Res, Ops[0]->getType());
  }

2211   case X86::BI__builtin_ia32_vplzcntd_128:
2212   case X86::BI__builtin_ia32_vplzcntd_256:
2213   case X86::BI__builtin_ia32_vplzcntd_512:
2214   case X86::BI__builtin_ia32_vplzcntq_128:
2215   case X86::BI__builtin_ia32_vplzcntq_256:
2216   case X86::BI__builtin_ia32_vplzcntq_512: {
2217     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
2218     return Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)});
2219   }
  // Scalar sqrt: take element 0, sqrt it (via the constrained intrinsic when
  // strict FP is active), and re-insert it into the source vector.
  case X86::BI__builtin_ia32_sqrtss:
  case X86::BI__builtin_ia32_sqrtsd: {
    Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
    Function *F;
    if (Builder.getIsFPConstrained()) {
      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
      F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
                           A->getType());
      A = Builder.CreateConstrainedFPCall(F, {A});
    } else {
      F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
      A = Builder.CreateCall(F, {A});
    }
    return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
  }
  // Masked scalar sqrt with a rounding-mode immediate in Ops[4].
  case X86::BI__builtin_ia32_sqrtsh_round_mask:
  case X86::BI__builtin_ia32_sqrtsd_round_mask:
  case X86::BI__builtin_ia32_sqrtss_round_mask: {
    unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
    // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
    // otherwise keep the intrinsic.
    if (CC != 4) {
      Intrinsic::ID IID;

      switch (BuiltinID) {
      default:
        llvm_unreachable("Unsupported intrinsic!");
      case X86::BI__builtin_ia32_sqrtsh_round_mask:
        IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh;
        break;
      case X86::BI__builtin_ia32_sqrtsd_round_mask:
        IID = Intrinsic::x86_avx512_mask_sqrt_sd;
        break;
      case X86::BI__builtin_ia32_sqrtss_round_mask:
        IID = Intrinsic::x86_avx512_mask_sqrt_ss;
        break;
      }
      return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
    }
    // CUR_DIRECTION: sqrt element 0 of Ops[1], select against the
    // passthrough element from Ops[2] under mask Ops[3], and insert the
    // result into Ops[0].
    Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
    Function *F;
    if (Builder.getIsFPConstrained()) {
      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
      F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
                           A->getType());
      A = Builder.CreateConstrainedFPCall(F, A);
    } else {
      F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
      A = Builder.CreateCall(F, A);
    }
    Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
    A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
    return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
  }
  // Packed sqrt. The 512-bit forms carry a rounding-mode immediate as a
  // second operand (Ops.size() == 2); anything other than CUR_DIRECTION must
  // stay a target intrinsic.
  case X86::BI__builtin_ia32_sqrtpd256:
  case X86::BI__builtin_ia32_sqrtpd:
  case X86::BI__builtin_ia32_sqrtps256:
  case X86::BI__builtin_ia32_sqrtps:
  case X86::BI__builtin_ia32_sqrtph256:
  case X86::BI__builtin_ia32_sqrtph:
  case X86::BI__builtin_ia32_sqrtph512:
  case X86::BI__builtin_ia32_vsqrtbf16256:
  case X86::BI__builtin_ia32_vsqrtbf16:
  case X86::BI__builtin_ia32_vsqrtbf16512:
  case X86::BI__builtin_ia32_sqrtps512:
  case X86::BI__builtin_ia32_sqrtpd512: {
    if (Ops.size() == 2) {
      unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
      // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
      // otherwise keep the intrinsic.
      if (CC != 4) {
        Intrinsic::ID IID;

        switch (BuiltinID) {
        default:
          llvm_unreachable("Unsupported intrinsic!");
        case X86::BI__builtin_ia32_sqrtph512:
          IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
          break;
        case X86::BI__builtin_ia32_sqrtps512:
          IID = Intrinsic::x86_avx512_sqrt_ps_512;
          break;
        case X86::BI__builtin_ia32_sqrtpd512:
          IID = Intrinsic::x86_avx512_sqrt_pd_512;
          break;
        }
        return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
      }
    }
    // Otherwise emit the generic (possibly constrained) sqrt intrinsic on
    // the whole vector.
    if (Builder.getIsFPConstrained()) {
      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
                                     Ops[0]->getType());
      return Builder.CreateConstrainedFPCall(F, Ops[0]);
    } else {
      Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType());
      return Builder.CreateCall(F, Ops[0]);
    }
  }

2319 
  // 32x32->64 multiplies of the even elements (pmuludq/pmuldq).
  case X86::BI__builtin_ia32_pmuludq128:
  case X86::BI__builtin_ia32_pmuludq256:
  case X86::BI__builtin_ia32_pmuludq512:
    return EmitX86Muldq(*this, /*IsSigned*/false, Ops);

  case X86::BI__builtin_ia32_pmuldq128:
  case X86::BI__builtin_ia32_pmuldq256:
  case X86::BI__builtin_ia32_pmuldq512:
    return EmitX86Muldq(*this, /*IsSigned*/true, Ops);

  // Ternary logic (vpternlog); the _mask forms merge with the source, the
  // _maskz forms zero masked-off elements.
  case X86::BI__builtin_ia32_pternlogd512_mask:
  case X86::BI__builtin_ia32_pternlogq512_mask:
  case X86::BI__builtin_ia32_pternlogd128_mask:
  case X86::BI__builtin_ia32_pternlogd256_mask:
  case X86::BI__builtin_ia32_pternlogq128_mask:
  case X86::BI__builtin_ia32_pternlogq256_mask:
    return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops);

  case X86::BI__builtin_ia32_pternlogd512_maskz:
  case X86::BI__builtin_ia32_pternlogq512_maskz:
  case X86::BI__builtin_ia32_pternlogd128_maskz:
  case X86::BI__builtin_ia32_pternlogd256_maskz:
  case X86::BI__builtin_ia32_pternlogq128_maskz:
  case X86::BI__builtin_ia32_pternlogq256_maskz:
    return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops);

2345 
  // Concat-and-shift (VBMI2): lowered to generic funnel shifts. The last
  // argument of EmitX86FunnelShift selects right (true) vs. left (false).
  case X86::BI__builtin_ia32_vpshldd128:
  case X86::BI__builtin_ia32_vpshldd256:
  case X86::BI__builtin_ia32_vpshldd512:
  case X86::BI__builtin_ia32_vpshldq128:
  case X86::BI__builtin_ia32_vpshldq256:
  case X86::BI__builtin_ia32_vpshldq512:
  case X86::BI__builtin_ia32_vpshldw128:
  case X86::BI__builtin_ia32_vpshldw256:
  case X86::BI__builtin_ia32_vpshldw512:
    return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);

  case X86::BI__builtin_ia32_vpshrdd128:
  case X86::BI__builtin_ia32_vpshrdd256:
  case X86::BI__builtin_ia32_vpshrdd512:
  case X86::BI__builtin_ia32_vpshrdq128:
  case X86::BI__builtin_ia32_vpshrdq256:
  case X86::BI__builtin_ia32_vpshrdq512:
  case X86::BI__builtin_ia32_vpshrdw128:
  case X86::BI__builtin_ia32_vpshrdw256:
  case X86::BI__builtin_ia32_vpshrdw512:
    // Ops 0 and 1 are swapped.
    return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);

  // Variable-count versions of the same operations.
  case X86::BI__builtin_ia32_vpshldvd128:
  case X86::BI__builtin_ia32_vpshldvd256:
  case X86::BI__builtin_ia32_vpshldvd512:
  case X86::BI__builtin_ia32_vpshldvq128:
  case X86::BI__builtin_ia32_vpshldvq256:
  case X86::BI__builtin_ia32_vpshldvq512:
  case X86::BI__builtin_ia32_vpshldvw128:
  case X86::BI__builtin_ia32_vpshldvw256:
  case X86::BI__builtin_ia32_vpshldvw512:
    return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);

  case X86::BI__builtin_ia32_vpshrdvd128:
  case X86::BI__builtin_ia32_vpshrdvd256:
  case X86::BI__builtin_ia32_vpshrdvd512:
  case X86::BI__builtin_ia32_vpshrdvq128:
  case X86::BI__builtin_ia32_vpshrdvq256:
  case X86::BI__builtin_ia32_vpshrdvq512:
  case X86::BI__builtin_ia32_vpshrdvw128:
  case X86::BI__builtin_ia32_vpshrdvw256:
  case X86::BI__builtin_ia32_vpshrdvw512:
    // Ops 0 and 1 are swapped.
    return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);

  // Reductions
  // fadd/fmul reductions require the reassoc fast-math flag so the backend
  // may reduce in tree order; the flag is scoped by the FMF guard.
  case X86::BI__builtin_ia32_reduce_fadd_pd512:
  case X86::BI__builtin_ia32_reduce_fadd_ps512:
  case X86::BI__builtin_ia32_reduce_fadd_ph512:
  case X86::BI__builtin_ia32_reduce_fadd_ph256:
  case X86::BI__builtin_ia32_reduce_fadd_ph128: {
    Function *F =
        CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
    Builder.getFastMathFlags().setAllowReassoc();
    return Builder.CreateCall(F, {Ops[0], Ops[1]});
  }
  case X86::BI__builtin_ia32_reduce_fmul_pd512:
  case X86::BI__builtin_ia32_reduce_fmul_ps512:
  case X86::BI__builtin_ia32_reduce_fmul_ph512:
  case X86::BI__builtin_ia32_reduce_fmul_ph256:
  case X86::BI__builtin_ia32_reduce_fmul_ph128: {
    Function *F =
        CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
    Builder.getFastMathFlags().setAllowReassoc();
    return Builder.CreateCall(F, {Ops[0], Ops[1]});
  }
  // fmax/fmin reductions are emitted with the nnan flag set.
  case X86::BI__builtin_ia32_reduce_fmax_pd512:
  case X86::BI__builtin_ia32_reduce_fmax_ps512:
  case X86::BI__builtin_ia32_reduce_fmax_ph512:
  case X86::BI__builtin_ia32_reduce_fmax_ph256:
  case X86::BI__builtin_ia32_reduce_fmax_ph128: {
    Function *F =
        CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType());
    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
    Builder.getFastMathFlags().setNoNaNs();
    return Builder.CreateCall(F, {Ops[0]});
  }
  case X86::BI__builtin_ia32_reduce_fmin_pd512:
  case X86::BI__builtin_ia32_reduce_fmin_ps512:
  case X86::BI__builtin_ia32_reduce_fmin_ph512:
  case X86::BI__builtin_ia32_reduce_fmin_ph256:
  case X86::BI__builtin_ia32_reduce_fmin_ph128: {
    Function *F =
        CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType());
    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
    Builder.getFastMathFlags().setNoNaNs();
    return Builder.CreateCall(F, {Ops[0]});
  }

2437 
  // rdrand/rdseed "step" builtins: the intrinsic returns a two-element
  // struct; element 0 (the random value) is stored through the pointer in
  // Ops[0], and element 1 (the success flag) is returned to the caller.
  case X86::BI__builtin_ia32_rdrand16_step:
  case X86::BI__builtin_ia32_rdrand32_step:
  case X86::BI__builtin_ia32_rdrand64_step:
  case X86::BI__builtin_ia32_rdseed16_step:
  case X86::BI__builtin_ia32_rdseed32_step:
  case X86::BI__builtin_ia32_rdseed64_step: {
    Intrinsic::ID ID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_rdrand16_step:
      ID = Intrinsic::x86_rdrand_16;
      break;
    case X86::BI__builtin_ia32_rdrand32_step:
      ID = Intrinsic::x86_rdrand_32;
      break;
    case X86::BI__builtin_ia32_rdrand64_step:
      ID = Intrinsic::x86_rdrand_64;
      break;
    case X86::BI__builtin_ia32_rdseed16_step:
      ID = Intrinsic::x86_rdseed_16;
      break;
    case X86::BI__builtin_ia32_rdseed32_step:
      ID = Intrinsic::x86_rdseed_32;
      break;
    case X86::BI__builtin_ia32_rdseed64_step:
      ID = Intrinsic::x86_rdseed_64;
      break;
    }

    Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID));
    Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 0),
                                      Ops[0]);
    return Builder.CreateExtractValue(Call, 1);
  }
  // addcarryx/subborrow: the intrinsic takes (carry-in, a, b) and returns a
  // two-element struct; element 1 (the sum/difference) is stored through the
  // pointer in Ops[3], and element 0 (the carry/borrow-out) is returned.
  case X86::BI__builtin_ia32_addcarryx_u32:
  case X86::BI__builtin_ia32_addcarryx_u64:
  case X86::BI__builtin_ia32_subborrow_u32:
  case X86::BI__builtin_ia32_subborrow_u64: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_addcarryx_u32:
      IID = Intrinsic::x86_addcarry_32;
      break;
    case X86::BI__builtin_ia32_addcarryx_u64:
      IID = Intrinsic::x86_addcarry_64;
      break;
    case X86::BI__builtin_ia32_subborrow_u32:
      IID = Intrinsic::x86_subborrow_32;
      break;
    case X86::BI__builtin_ia32_subborrow_u64:
      IID = Intrinsic::x86_subborrow_64;
      break;
    }

    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
                                     { Ops[0], Ops[1], Ops[2] });
    Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
                                      Ops[3]);
    return Builder.CreateExtractValue(Call, 0);
  }

2499 
  // fpclass: classify vector elements. The mask operand (Ops[2]) is removed
  // from the call's operand list and applied afterwards to the intrinsic's
  // compare result via EmitX86MaskedCompareResult.
  case X86::BI__builtin_ia32_fpclassps128_mask:
  case X86::BI__builtin_ia32_fpclassps256_mask:
  case X86::BI__builtin_ia32_fpclassps512_mask:
  case X86::BI__builtin_ia32_vfpclassbf16128_mask:
  case X86::BI__builtin_ia32_vfpclassbf16256_mask:
  case X86::BI__builtin_ia32_vfpclassbf16512_mask:
  case X86::BI__builtin_ia32_fpclassph128_mask:
  case X86::BI__builtin_ia32_fpclassph256_mask:
  case X86::BI__builtin_ia32_fpclassph512_mask:
  case X86::BI__builtin_ia32_fpclasspd128_mask:
  case X86::BI__builtin_ia32_fpclasspd256_mask:
  case X86::BI__builtin_ia32_fpclasspd512_mask: {
    unsigned NumElts =
        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
    Value *MaskIn = Ops[2];
    Ops.erase(&Ops[2]);

    Intrinsic::ID ID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_vfpclassbf16128_mask:
      ID = Intrinsic::x86_avx10_fpclass_bf16_128;
      break;
    case X86::BI__builtin_ia32_vfpclassbf16256_mask:
      ID = Intrinsic::x86_avx10_fpclass_bf16_256;
      break;
    case X86::BI__builtin_ia32_vfpclassbf16512_mask:
      ID = Intrinsic::x86_avx10_fpclass_bf16_512;
      break;
    case X86::BI__builtin_ia32_fpclassph128_mask:
      ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
      break;
    case X86::BI__builtin_ia32_fpclassph256_mask:
      ID = Intrinsic::x86_avx512fp16_fpclass_ph_256;
      break;
    case X86::BI__builtin_ia32_fpclassph512_mask:
      ID = Intrinsic::x86_avx512fp16_fpclass_ph_512;
      break;
    case X86::BI__builtin_ia32_fpclassps128_mask:
      ID = Intrinsic::x86_avx512_fpclass_ps_128;
      break;
    case X86::BI__builtin_ia32_fpclassps256_mask:
      ID = Intrinsic::x86_avx512_fpclass_ps_256;
      break;
    case X86::BI__builtin_ia32_fpclassps512_mask:
      ID = Intrinsic::x86_avx512_fpclass_ps_512;
      break;
    case X86::BI__builtin_ia32_fpclasspd128_mask:
      ID = Intrinsic::x86_avx512_fpclass_pd_128;
      break;
    case X86::BI__builtin_ia32_fpclasspd256_mask:
      ID = Intrinsic::x86_avx512_fpclass_pd_256;
      break;
    case X86::BI__builtin_ia32_fpclasspd512_mask:
      ID = Intrinsic::x86_avx512_fpclass_pd_512;
      break;
    }

    Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
    return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn);
  }

2561 
  // vp2intersect: the intrinsic produces two mask results; each is widened
  // via EmitX86MaskedCompareResult and stored through the output pointers
  // Ops[2] and Ops[3]. The builtin itself has no value result.
  case X86::BI__builtin_ia32_vp2intersect_q_512:
  case X86::BI__builtin_ia32_vp2intersect_q_256:
  case X86::BI__builtin_ia32_vp2intersect_q_128:
  case X86::BI__builtin_ia32_vp2intersect_d_512:
  case X86::BI__builtin_ia32_vp2intersect_d_256:
  case X86::BI__builtin_ia32_vp2intersect_d_128: {
    unsigned NumElts =
        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
    Intrinsic::ID ID;

    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_vp2intersect_q_512:
      ID = Intrinsic::x86_avx512_vp2intersect_q_512;
      break;
    case X86::BI__builtin_ia32_vp2intersect_q_256:
      ID = Intrinsic::x86_avx512_vp2intersect_q_256;
      break;
    case X86::BI__builtin_ia32_vp2intersect_q_128:
      ID = Intrinsic::x86_avx512_vp2intersect_q_128;
      break;
    case X86::BI__builtin_ia32_vp2intersect_d_512:
      ID = Intrinsic::x86_avx512_vp2intersect_d_512;
      break;
    case X86::BI__builtin_ia32_vp2intersect_d_256:
      ID = Intrinsic::x86_avx512_vp2intersect_d_256;
      break;
    case X86::BI__builtin_ia32_vp2intersect_d_128:
      ID = Intrinsic::x86_avx512_vp2intersect_d_128;
      break;
    }

    Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID), {Ops[0], Ops[1]});
    Value *Result = Builder.CreateExtractValue(Call, 0);
    Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
    Builder.CreateDefaultAlignedStore(Result, Ops[2]);

    Result = Builder.CreateExtractValue(Call, 1);
    Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
    return Builder.CreateDefaultAlignedStore(Result, Ops[3]);
  }

2603 
  // vpmultishiftqb: no generic lowering; pass straight through to the
  // width-specific AVX512 intrinsic.
  case X86::BI__builtin_ia32_vpmultishiftqb128:
  case X86::BI__builtin_ia32_vpmultishiftqb256:
  case X86::BI__builtin_ia32_vpmultishiftqb512: {
    Intrinsic::ID ID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_vpmultishiftqb128:
      ID = Intrinsic::x86_avx512_pmultishift_qb_128;
      break;
    case X86::BI__builtin_ia32_vpmultishiftqb256:
      ID = Intrinsic::x86_avx512_pmultishift_qb_256;
      break;
    case X86::BI__builtin_ia32_vpmultishiftqb512:
      ID = Intrinsic::x86_avx512_pmultishift_qb_512;
      break;
    }

    return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
  }

2623 
  // vpshufbitqmb: like the fpclass cases above, strip the mask operand from
  // the call and apply it to the intrinsic's result afterwards.
  case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
  case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
  case X86::BI__builtin_ia32_vpshufbitqmb512_mask: {
    unsigned NumElts =
        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
    Value *MaskIn = Ops[2];
    Ops.erase(&Ops[2]);

    Intrinsic::ID ID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
      ID = Intrinsic::x86_avx512_vpshufbitqmb_128;
      break;
    case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
      ID = Intrinsic::x86_avx512_vpshufbitqmb_256;
      break;
    case X86::BI__builtin_ia32_vpshufbitqmb512_mask:
      ID = Intrinsic::x86_avx512_vpshufbitqmb_512;
      break;
    }

    Value *Shufbit = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
    return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn);
  }

2649 
  // packed comparison intrinsics
  // Each fixed-predicate SSE compare maps to an IR fcmp with the matching
  // predicate; IsSignaling distinguishes quiet (EQ/NEQ/UNORD/ORD) from
  // signaling (LT/LE/NLT/NLE) comparisons.
  case X86::BI__builtin_ia32_cmpeqps:
  case X86::BI__builtin_ia32_cmpeqpd:
    return getVectorFCmpIR(CmpInst::FCMP_OEQ, /*IsSignaling*/false);
  case X86::BI__builtin_ia32_cmpltps:
  case X86::BI__builtin_ia32_cmpltpd:
    return getVectorFCmpIR(CmpInst::FCMP_OLT, /*IsSignaling*/true);
  case X86::BI__builtin_ia32_cmpleps:
  case X86::BI__builtin_ia32_cmplepd:
    return getVectorFCmpIR(CmpInst::FCMP_OLE, /*IsSignaling*/true);
  case X86::BI__builtin_ia32_cmpunordps:
  case X86::BI__builtin_ia32_cmpunordpd:
    return getVectorFCmpIR(CmpInst::FCMP_UNO, /*IsSignaling*/false);
  case X86::BI__builtin_ia32_cmpneqps:
  case X86::BI__builtin_ia32_cmpneqpd:
    return getVectorFCmpIR(CmpInst::FCMP_UNE, /*IsSignaling*/false);
  case X86::BI__builtin_ia32_cmpnltps:
  case X86::BI__builtin_ia32_cmpnltpd:
    return getVectorFCmpIR(CmpInst::FCMP_UGE, /*IsSignaling*/true);
  case X86::BI__builtin_ia32_cmpnleps:
  case X86::BI__builtin_ia32_cmpnlepd:
    return getVectorFCmpIR(CmpInst::FCMP_UGT, /*IsSignaling*/true);
  case X86::BI__builtin_ia32_cmpordps:
  case X86::BI__builtin_ia32_cmpordpd:
    return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false);
  // Compares with an 8-bit predicate immediate. The _mask forms return a
  // mask value; the plain forms return a vector of integers the same width
  // as the inputs.
  case X86::BI__builtin_ia32_cmpph128_mask:
  case X86::BI__builtin_ia32_cmpph256_mask:
  case X86::BI__builtin_ia32_cmpph512_mask:
  case X86::BI__builtin_ia32_cmpps128_mask:
  case X86::BI__builtin_ia32_cmpps256_mask:
  case X86::BI__builtin_ia32_cmpps512_mask:
  case X86::BI__builtin_ia32_cmppd128_mask:
  case X86::BI__builtin_ia32_cmppd256_mask:
  case X86::BI__builtin_ia32_cmppd512_mask:
  case X86::BI__builtin_ia32_vcmpbf16512_mask:
  case X86::BI__builtin_ia32_vcmpbf16256_mask:
  case X86::BI__builtin_ia32_vcmpbf16128_mask:
    IsMaskFCmp = true;
    [[fallthrough]];
  case X86::BI__builtin_ia32_cmpps:
  case X86::BI__builtin_ia32_cmpps256:
  case X86::BI__builtin_ia32_cmppd:
  case X86::BI__builtin_ia32_cmppd256: {
    // Lowering vector comparisons to fcmp instructions, while
    // ignoring signalling behaviour requested
    // ignoring rounding mode requested
    // This is only possible if fp-model is not strict and FENV_ACCESS is off.

    // The third argument is the comparison condition, an integer in the
    // range [0, 31].
    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f;

    // Lowering to IR fcmp instruction.
    // Ignoring requested signaling behaviour,
    // e.g. both _CMP_GT_OS & _CMP_GT_OQ are translated to FCMP_OGT.
    FCmpInst::Predicate Pred;
    bool IsSignaling;
    // Predicates for 16-31 repeat the 0-15 predicates. Only the signalling
    // behavior is inverted. We'll handle that after the switch.
    switch (CC & 0xf) {
    case 0x00: Pred = FCmpInst::FCMP_OEQ;   IsSignaling = false; break;
    case 0x01: Pred = FCmpInst::FCMP_OLT;   IsSignaling = true;  break;
    case 0x02: Pred = FCmpInst::FCMP_OLE;   IsSignaling = true;  break;
    case 0x03: Pred = FCmpInst::FCMP_UNO;   IsSignaling = false; break;
    case 0x04: Pred = FCmpInst::FCMP_UNE;   IsSignaling = false; break;
    case 0x05: Pred = FCmpInst::FCMP_UGE;   IsSignaling = true;  break;
    case 0x06: Pred = FCmpInst::FCMP_UGT;   IsSignaling = true;  break;
    case 0x07: Pred = FCmpInst::FCMP_ORD;   IsSignaling = false; break;
    case 0x08: Pred = FCmpInst::FCMP_UEQ;   IsSignaling = false; break;
    case 0x09: Pred = FCmpInst::FCMP_ULT;   IsSignaling = true;  break;
    case 0x0a: Pred = FCmpInst::FCMP_ULE;   IsSignaling = true;  break;
    case 0x0b: Pred = FCmpInst::FCMP_FALSE; IsSignaling = false; break;
    case 0x0c: Pred = FCmpInst::FCMP_ONE;   IsSignaling = false; break;
    case 0x0d: Pred = FCmpInst::FCMP_OGE;   IsSignaling = true;  break;
    case 0x0e: Pred = FCmpInst::FCMP_OGT;   IsSignaling = true;  break;
    case 0x0f: Pred = FCmpInst::FCMP_TRUE;  IsSignaling = false; break;
    default: llvm_unreachable("Unhandled CC");
    }

    // Invert the signalling behavior for 16-31.
    if (CC & 0x10)
      IsSignaling = !IsSignaling;

    // If the predicate is true or false and we're using constrained intrinsics,
    // we don't have a compare intrinsic we can use. Just use the legacy X86
    // specific intrinsic.
    // If the intrinsic is mask enabled and we're using constrained intrinsics,
    // use the legacy X86 specific intrinsic.
    if (Builder.getIsFPConstrained() &&
        (Pred == FCmpInst::FCMP_TRUE || Pred == FCmpInst::FCMP_FALSE ||
         IsMaskFCmp)) {

      Intrinsic::ID IID;
      switch (BuiltinID) {
      default: llvm_unreachable("Unexpected builtin");
      case X86::BI__builtin_ia32_cmpps:
        IID = Intrinsic::x86_sse_cmp_ps;
        break;
      case X86::BI__builtin_ia32_cmpps256:
        IID = Intrinsic::x86_avx_cmp_ps_256;
        break;
      case X86::BI__builtin_ia32_cmppd:
        IID = Intrinsic::x86_sse2_cmp_pd;
        break;
      case X86::BI__builtin_ia32_cmppd256:
        IID = Intrinsic::x86_avx_cmp_pd_256;
        break;
      case X86::BI__builtin_ia32_cmpph128_mask:
        IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_128;
        break;
      case X86::BI__builtin_ia32_cmpph256_mask:
        IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_256;
        break;
      case X86::BI__builtin_ia32_cmpph512_mask:
        IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_512;
        break;
      case X86::BI__builtin_ia32_cmpps512_mask:
        IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
        break;
      case X86::BI__builtin_ia32_cmppd512_mask:
        IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
        break;
      case X86::BI__builtin_ia32_cmpps128_mask:
        IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
        break;
      case X86::BI__builtin_ia32_cmpps256_mask:
        IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
        break;
      case X86::BI__builtin_ia32_cmppd128_mask:
        IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
        break;
      case X86::BI__builtin_ia32_cmppd256_mask:
        IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
        break;
      }

      Function *Intr = CGM.getIntrinsic(IID);
      if (IsMaskFCmp) {
        unsigned NumElts =
            cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
        Ops[3] = getMaskVecValue(*this, Ops[3], NumElts);
        Value *Cmp = Builder.CreateCall(Intr, Ops);
        return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr);
      }

      return Builder.CreateCall(Intr, Ops);
    }

    // Builtins without the _mask suffix return a vector of integers
    // of the same width as the input vectors
    if (IsMaskFCmp) {
      // We ignore SAE if strict FP is disabled. We only keep precise
      // exception behavior under strict FP.
      // NOTE: If strict FP does ever go through here a CGFPOptionsRAII
      // object will be required.
      unsigned NumElts =
          cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
      Value *Cmp;
      if (IsSignaling)
        Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
      else
        Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
      return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]);
    }

    return getVectorFCmpIR(Pred, IsSignaling);
  }

2817 
  // SSE scalar comparison intrinsics
  // Each maps directly onto llvm.x86.sse.cmp.ss / llvm.x86.sse2.cmp.sd with
  // the immediate selecting the hardware predicate encoding:
  //   0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ, 5=NLT, 6=NLE, 7=ORD.
  case X86::BI__builtin_ia32_cmpeqss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
  case X86::BI__builtin_ia32_cmpltss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
  case X86::BI__builtin_ia32_cmpless:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
  case X86::BI__builtin_ia32_cmpunordss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
  case X86::BI__builtin_ia32_cmpneqss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
  case X86::BI__builtin_ia32_cmpnltss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
  case X86::BI__builtin_ia32_cmpnless:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
  case X86::BI__builtin_ia32_cmpordss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
  case X86::BI__builtin_ia32_cmpeqsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
  case X86::BI__builtin_ia32_cmpltsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
  case X86::BI__builtin_ia32_cmplesd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
  case X86::BI__builtin_ia32_cmpunordsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
  case X86::BI__builtin_ia32_cmpneqsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
  case X86::BI__builtin_ia32_cmpnltsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
  case X86::BI__builtin_ia32_cmpnlesd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
  case X86::BI__builtin_ia32_cmpordsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
2851 
  // f16c half2float intrinsics
  case X86::BI__builtin_ia32_vcvtph2ps:
  case X86::BI__builtin_ia32_vcvtph2ps256:
  case X86::BI__builtin_ia32_vcvtph2ps_mask:
  case X86::BI__builtin_ia32_vcvtph2ps256_mask:
  case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
    // Apply the expression's pragma-controlled FP options (via RAII) for the
    // duration of emitting the half->float conversion.
    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
    return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType()));
  }
2861 
  // AVX512 bf16 intrinsics
  case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
    // Convert the integer mask operand to a vector mask with one bit per
    // source vector element, then call the masked intrinsic directly.
    Ops[2] = getMaskVecValue(
        *this, Ops[2],
        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements());
    Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128;
    return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
  }
  case X86::BI__builtin_ia32_cvtsbf162ss_32:
    // bf16 -> float is an exact, value-preserving extension.
    return Builder.CreateFPExt(Ops[0], Builder.getFloatTy());
2872 
  case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
  case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
    // These masked builtins lower to the *unmasked* conversion intrinsic;
    // masking is applied afterwards as a select against the passthru
    // operand (Ops[1]) under the mask (Ops[2]).
    Intrinsic::ID IID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
      IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_256;
      break;
    case X86::BI__builtin_ia32_cvtneps2bf16_512_mask:
      IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_512;
      break;
    }
    Value *Res = Builder.CreateCall(CGM.getIntrinsic(IID), Ops[0]);
    return EmitX86Select(*this, Ops[2], Res, Ops[1]);
  }
2888 
  case X86::BI__cpuid:
  case X86::BI__cpuidex: {
    // __cpuid(int info[4], int leaf) / __cpuidex(int info[4], int leaf,
    // int subleaf): execute CPUID and store EAX..EDX into info[0..3].
    Value *FuncId = EmitScalarExpr(E->getArg(1));
    // __cpuid has no subleaf argument; CPUID is executed with ECX = 0.
    Value *SubFuncId = BuiltinID == X86::BI__cpuidex
                           ? EmitScalarExpr(E->getArg(2))
                           : llvm::ConstantInt::get(Int32Ty, 0);

    // The inline asm returns the four output registers as a struct of i32s.
    llvm::StructType *CpuidRetTy =
        llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, Int32Ty);
    llvm::FunctionType *FTy =
        llvm::FunctionType::get(CpuidRetTy, {Int32Ty, Int32Ty}, false);

    StringRef Asm, Constraints;
    if (getTarget().getTriple().getArch() == llvm::Triple::x86) {
      Asm = "cpuid";
      Constraints = "={ax},={bx},={cx},={dx},{ax},{cx}";
    } else {
      // x86-64 uses %rbx as the base register, so preserve it.
      Asm = "xchgq %rbx, ${1:q}\n"
            "cpuid\n"
            "xchgq %rbx, ${1:q}";
      Constraints = "={ax},=r,={cx},={dx},0,2";
    }

    llvm::InlineAsm *IA = llvm::InlineAsm::get(FTy, Asm, Constraints,
                                               /*hasSideEffects=*/false);
    Value *IACall = Builder.CreateCall(IA, {FuncId, SubFuncId});
    Value *BasePtr = EmitScalarExpr(E->getArg(0));
    Value *Store = nullptr;
    // Unpack the four result registers into info[0..3].
    for (unsigned i = 0; i < 4; i++) {
      Value *Extracted = Builder.CreateExtractValue(IACall, i);
      Value *StorePtr = Builder.CreateConstInBoundsGEP1_32(Int32Ty, BasePtr, i);
      Store = Builder.CreateAlignedStore(Extracted, StorePtr, getIntAlign());
    }

    // Return the last store instruction to signal that we have emitted
    // the intrinsic.
    return Store;
  }
2928 
2929   case X86::BI__emul:
2930   case X86::BI__emulu: {
2931     llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64);
2932     bool isSigned = (BuiltinID == X86::BI__emul);
2933     Value *LHS = Builder.CreateIntCast(Ops[0], Int64Ty, isSigned);
2934     Value *RHS = Builder.CreateIntCast(Ops[1], Int64Ty, isSigned);
2935     return Builder.CreateMul(LHS, RHS, "", !isSigned, isSigned);
2936   }
2937   case X86::BI__mulh:
2938   case X86::BI__umulh:
2939   case X86::BI_mul128:
2940   case X86::BI_umul128: {
2941     llvm::Type *ResType = ConvertType(E->getType());
2942     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
2943 
2944     bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128);
2945     Value *LHS = Builder.CreateIntCast(Ops[0], Int128Ty, IsSigned);
2946     Value *RHS = Builder.CreateIntCast(Ops[1], Int128Ty, IsSigned);
2947 
2948     Value *MulResult, *HigherBits;
2949     if (IsSigned) {
2950       MulResult = Builder.CreateNSWMul(LHS, RHS);
2951       HigherBits = Builder.CreateAShr(MulResult, 64);
2952     } else {
2953       MulResult = Builder.CreateNUWMul(LHS, RHS);
2954       HigherBits = Builder.CreateLShr(MulResult, 64);
2955     }
2956     HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
2957 
2958     if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh)
2959       return HigherBits;
2960 
2961     Address HighBitsAddress = EmitPointerWithAlignment(E->getArg(2));
2962     Builder.CreateStore(HigherBits, HighBitsAddress);
2963     return Builder.CreateIntCast(MulResult, ResType, IsSigned);
2964   }
2965 
  case X86::BI__faststorefence: {
    // System-scoped sequentially-consistent fence.
    return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
                               llvm::SyncScope::System);
  }
  case X86::BI__shiftleft128:
  case X86::BI__shiftright128: {
    // 128-bit funnel shift composed from two 64-bit halves.
    llvm::Function *F = CGM.getIntrinsic(
        BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr,
        Int64Ty);
    // Flip low/high ops and zero-extend amount to matching type.
    // shiftleft128(Low, High, Amt) -> fshl(High, Low, Amt)
    // shiftright128(Low, High, Amt) -> fshr(High, Low, Amt)
    std::swap(Ops[0], Ops[1]);
    Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
    return Builder.CreateCall(F, Ops);
  }
  case X86::BI_ReadWriteBarrier:
  case X86::BI_ReadBarrier:
  case X86::BI_WriteBarrier: {
    // MSVC compiler barriers: a single-thread-scoped fence is sufficient
    // since they only constrain reordering within the current thread.
    return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
                               llvm::SyncScope::SingleThread);
  }
2988 
  case X86::BI_AddressOfReturnAddress: {
    // Lowers to llvm.addressofreturnaddress: the address of the slot
    // holding the caller's return address.
    Function *F =
        CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
    return Builder.CreateCall(F);
  }
  case X86::BI__stosb: {
    // We treat __stosb as a volatile memset - it may not generate "rep stosb"
    // instruction, but it will create a memset that won't be optimized away.
    return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
  }
  // Corresponding to intrinsics which will return 2 tiles (tile0_tile1).
  case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
  case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
  case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
  case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
  case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
  case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
  case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
    // AMX paired-tile loads: the intrinsic yields two tiles, which are cast
    // to vectors and stored out through the two destination pointers.
    Intrinsic::ID IID;
    switch (BuiltinID) {
    default:
      llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
      IID = Intrinsic::x86_t2rpntlvwz0_internal;
      break;
    case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
      IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
      break;
    case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
      IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
      break;
    case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
      IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
      break;
    case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
      IID = Intrinsic::x86_t2rpntlvwz1_internal;
      break;
    case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
      IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
      break;
    case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
      IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
      break;
    case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
      IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
      break;
    }

    // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
                                     {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});

    // The element type of the stores comes from the pointee type of the
    // third builtin argument (DstPtr0).
    auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
    assert(PtrTy && "arg3 must be of pointer type");
    QualType PtreeTy = PtrTy->getPointeeType();
    llvm::Type *TyPtee = ConvertType(PtreeTy);

    // Bitcast amx type (x86_amx) to vector type (256 x i32)
    // Then store tile0 into DstPtr0
    Value *T0 = Builder.CreateExtractValue(Call, 0);
    Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
                                           {TyPtee}, {T0});
    Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);

    // Then store tile1 into DstPtr1
    Value *T1 = Builder.CreateExtractValue(Call, 1);
    Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
                                           {TyPtee}, {T1});
    Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);

    // Note: we deliberately avoid using x86_tilestored64_internal to store
    // the results, because it cannot express the scope of the memory it
    // writes. That could cause shape reloads after the first AMX intrinsic,
    // which the current AMX register allocation cannot handle.

    return Store;
  }
  case X86::BI__ud2:
    // llvm.trap makes a ud2a instruction on x86.
    return EmitTrapCall(Intrinsic::trap);
  case X86::BI__int2c: {
    // This syscall signals a driver assertion failure in x86 NT kernels.
    // Emit it as side-effecting inline asm marked noreturn.
    llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
    llvm::InlineAsm *IA =
        llvm::InlineAsm::get(FTy, "int $$0x2c", "", /*hasSideEffects=*/true);
    llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
        getLLVMContext(), llvm::AttributeList::FunctionIndex,
        llvm::Attribute::NoReturn);
    llvm::CallInst *CI = Builder.CreateCall(IA);
    CI->setAttributes(NoReturnAttr);
    return CI;
  }
  case X86::BI__readfsbyte:
  case X86::BI__readfsword:
  case X86::BI__readfsdword:
  case X86::BI__readfsqword: {
    // Volatile load of the requested width relative to the FS segment base;
    // address space 257 is LLVM's x86 FS-relative address space.
    llvm::Type *IntTy = ConvertType(E->getType());
    Value *Ptr = Builder.CreateIntToPtr(
        Ops[0], llvm::PointerType::get(getLLVMContext(), 257));
    LoadInst *Load = Builder.CreateAlignedLoad(
        IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
    // Volatile so the segment read is never elided or reordered away.
    Load->setVolatile(true);
    return Load;
  }
  case X86::BI__readgsbyte:
  case X86::BI__readgsword:
  case X86::BI__readgsdword:
  case X86::BI__readgsqword: {
    // Volatile load of the requested width relative to the GS segment base;
    // address space 256 is LLVM's x86 GS-relative address space.
    llvm::Type *IntTy = ConvertType(E->getType());
    Value *Ptr = Builder.CreateIntToPtr(
        Ops[0], llvm::PointerType::get(getLLVMContext(), 256));
    LoadInst *Load = Builder.CreateAlignedLoad(
        IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
    // Volatile so the segment read is never elided or reordered away.
    Load->setVolatile(true);
    return Load;
  }
  case X86::BI__builtin_ia32_encodekey128_u32: {
    Intrinsic::ID IID = Intrinsic::x86_encodekey128;

    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1]});

    // Store result elements 1..3 (the key handle chunks) to the output
    // buffer at 16-byte offsets; the buffer has no alignment guarantee.
    for (int i = 0; i < 3; ++i) {
      Value *Extract = Builder.CreateExtractValue(Call, i + 1);
      Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[2], i * 16);
      Builder.CreateAlignedStore(Extract, Ptr, Align(1));
    }

    // The builtin's u32 result is element 0 of the intrinsic's return.
    return Builder.CreateExtractValue(Call, 0);
  }
  case X86::BI__builtin_ia32_encodekey256_u32: {
    Intrinsic::ID IID = Intrinsic::x86_encodekey256;

    Value *Call =
        Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1], Ops[2]});

    // Store result elements 1..4 (the key handle chunks) to the output
    // buffer at 16-byte offsets; the buffer has no alignment guarantee.
    for (int i = 0; i < 4; ++i) {
      Value *Extract = Builder.CreateExtractValue(Call, i + 1);
      Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[3], i * 16);
      Builder.CreateAlignedStore(Extract, Ptr, Align(1));
    }

    // The builtin's u32 result is element 0 of the intrinsic's return.
    return Builder.CreateExtractValue(Call, 0);
  }
  case X86::BI__builtin_ia32_aesenc128kl_u8:
  case X86::BI__builtin_ia32_aesdec128kl_u8:
  case X86::BI__builtin_ia32_aesenc256kl_u8:
  case X86::BI__builtin_ia32_aesdec256kl_u8: {
    // Key Locker single-block AES encrypt/decrypt. The intrinsic returns
    // {status, data}; on success the data block is stored to Ops[0], on
    // failure zeros are stored instead. The status is returned either way.
    Intrinsic::ID IID;
    StringRef BlockName;
    switch (BuiltinID) {
    default:
      llvm_unreachable("Unexpected builtin");
    case X86::BI__builtin_ia32_aesenc128kl_u8:
      IID = Intrinsic::x86_aesenc128kl;
      BlockName = "aesenc128kl";
      break;
    case X86::BI__builtin_ia32_aesdec128kl_u8:
      IID = Intrinsic::x86_aesdec128kl;
      BlockName = "aesdec128kl";
      break;
    case X86::BI__builtin_ia32_aesenc256kl_u8:
      IID = Intrinsic::x86_aesenc256kl;
      BlockName = "aesenc256kl";
      break;
    case X86::BI__builtin_ia32_aesdec256kl_u8:
      IID = Intrinsic::x86_aesdec256kl;
      BlockName = "aesdec256kl";
      break;
    }

    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[1], Ops[2]});

    // Branch on the low bit of the status to pick which value is stored.
    BasicBlock *NoError =
        createBasicBlock(BlockName + "_no_error", this->CurFn);
    BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
    BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);

    Value *Ret = Builder.CreateExtractValue(Call, 0);
    Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
    Value *Out = Builder.CreateExtractValue(Call, 1);
    Builder.CreateCondBr(Succ, NoError, Error);

    // Success path: store the produced data block.
    Builder.SetInsertPoint(NoError);
    Builder.CreateDefaultAlignedStore(Out, Ops[0]);
    Builder.CreateBr(End);

    // Failure path: store zeros so the output is never left undefined.
    Builder.SetInsertPoint(Error);
    Constant *Zero = llvm::Constant::getNullValue(Out->getType());
    Builder.CreateDefaultAlignedStore(Zero, Ops[0]);
    Builder.CreateBr(End);

    Builder.SetInsertPoint(End);
    return Builder.CreateExtractValue(Call, 0);
  }
3184   case X86::BI__builtin_ia32_aesencwide128kl_u8:
3185   case X86::BI__builtin_ia32_aesdecwide128kl_u8:
3186   case X86::BI__builtin_ia32_aesencwide256kl_u8:
3187   case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
3188     Intrinsic::ID IID;
3189     StringRef BlockName;
3190     switch (BuiltinID) {
3191     case X86::BI__builtin_ia32_aesencwide128kl_u8:
3192       IID = Intrinsic::x86_aesencwide128kl;
3193       BlockName = "aesencwide128kl";
3194       break;
3195     case X86::BI__builtin_ia32_aesdecwide128kl_u8:
3196       IID = Intrinsic::x86_aesdecwide128kl;
3197       BlockName = "aesdecwide128kl";
3198       break;
3199     case X86::BI__builtin_ia32_aesencwide256kl_u8:
3200       IID = Intrinsic::x86_aesencwide256kl;
3201       BlockName = "aesencwide256kl";
3202       break;
3203     case X86::BI__builtin_ia32_aesdecwide256kl_u8:
3204       IID = Intrinsic::x86_aesdecwide256kl;
3205       BlockName = "aesdecwide256kl";
3206       break;
3207     }
3208 
3209     llvm::Type *Ty = FixedVectorType::get(Builder.getInt64Ty(), 2);
3210     Value *InOps[9];
3211     InOps[0] = Ops[2];
3212     for (int i = 0; i != 8; ++i) {
3213       Value *Ptr = Builder.CreateConstGEP1_32(Ty, Ops[1], i);
3214       InOps[i + 1] = Builder.CreateAlignedLoad(Ty, Ptr, Align(16));
3215     }
3216 
3217     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), InOps);
3218 
3219     BasicBlock *NoError =
3220         createBasicBlock(BlockName + "_no_error", this->CurFn);
3221     BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
3222     BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
3223 
3224     Value *Ret = Builder.CreateExtractValue(Call, 0);
3225     Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
3226     Builder.CreateCondBr(Succ, NoError, Error);
3227 
3228     Builder.SetInsertPoint(NoError);
3229     for (int i = 0; i != 8; ++i) {
3230       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
3231       Value *Ptr = Builder.CreateConstGEP1_32(Extract->getType(), Ops[0], i);
3232       Builder.CreateAlignedStore(Extract, Ptr, Align(16));
3233     }
3234     Builder.CreateBr(End);
3235 
3236     Builder.SetInsertPoint(Error);
3237     for (int i = 0; i != 8; ++i) {
3238       Value *Out = Builder.CreateExtractValue(Call, i + 1);
3239       Constant *Zero = llvm::Constant::getNullValue(Out->getType());
3240       Value *Ptr = Builder.CreateConstGEP1_32(Out->getType(), Ops[0], i);
3241       Builder.CreateAlignedStore(Zero, Ptr, Align(16));
3242     }
3243     Builder.CreateBr(End);
3244 
3245     Builder.SetInsertPoint(End);
3246     return Builder.CreateExtractValue(Call, 0);
3247   }
  case X86::BI__builtin_ia32_vfcmaddcph512_mask:
    IsConjFMA = true;
    [[fallthrough]];
  case X86::BI__builtin_ia32_vfmaddcph512_mask: {
    // Complex FMA (conjugated when IsConjFMA). The intrinsic result is
    // blended with the passthru operand Ops[0] under mask Ops[3].
    Intrinsic::ID IID = IsConjFMA
                            ? Intrinsic::x86_avx512fp16_mask_vfcmadd_cph_512
                            : Intrinsic::x86_avx512fp16_mask_vfmadd_cph_512;
    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
    return EmitX86Select(*this, Ops[3], Call, Ops[0]);
  }
  case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
    IsConjFMA = true;
    [[fallthrough]];
  case X86::BI__builtin_ia32_vfmaddcsh_round_mask: {
    Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
                                  : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
    // Scalar form: only bit 0 of the mask participates in the select.
    Value *And = Builder.CreateAnd(Ops[3], llvm::ConstantInt::get(Int8Ty, 1));
    return EmitX86Select(*this, And, Call, Ops[0]);
  }
  case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
    IsConjFMA = true;
    [[fallthrough]];
  case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: {
    Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
                                  : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
    // mask3 form: lane 0 comes from the computation, the remaining lanes
    // from Ops[2] (shuffle indices >= 4 select the second operand).
    static constexpr int Mask[] = {0, 5, 6, 7};
    return Builder.CreateShuffleVector(Call, Ops[2], Mask);
  }
  case X86::BI__builtin_ia32_prefetchi:
    // llvm.prefetch(ptr, rw=0 (read), locality=Ops[1],
    //               cache type=0 (instruction cache)).
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::prefetch, Ops[0]->getType()),
        {Ops[0], llvm::ConstantInt::get(Int32Ty, 0), Ops[1],
         llvm::ConstantInt::get(Int32Ty, 0)});
3283   }
3284 }
3285