//===---------- ARM.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "ABIInfo.h"
#include "CGBuiltin.h"
#include "CGDebugInfo.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsBPF.h"
#include "llvm/TargetParser/AArch64TargetParser.h"

#include <numeric>

using namespace clang;
using namespace CodeGen;
using namespace llvm;

static std::optional<CodeGenFunction::MSVCIntrin>
translateAarch64ToMsvcIntrin(unsigned BuiltinID) {
  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
  switch (BuiltinID) {
  default:
    return std::nullopt;
  case clang::AArch64::BI_BitScanForward:
  case clang::AArch64::BI_BitScanForward64:
    return MSVCIntrin::_BitScanForward;
  case clang::AArch64::BI_BitScanReverse:
  case clang::AArch64::BI_BitScanReverse64:
    return MSVCIntrin::_BitScanReverse;
  case clang::AArch64::BI_InterlockedAnd64:
    return MSVCIntrin::_InterlockedAnd;
  case clang::AArch64::BI_InterlockedExchange64:
    return MSVCIntrin::_InterlockedExchange;
  case clang::AArch64::BI_InterlockedExchangeAdd64:
    return MSVCIntrin::_InterlockedExchangeAdd;
  case clang::AArch64::BI_InterlockedExchangeSub64:
    return MSVCIntrin::_InterlockedExchangeSub;
  case clang::AArch64::BI_InterlockedOr64:
    return MSVCIntrin::_InterlockedOr;
  case clang::AArch64::BI_InterlockedXor64:
    return MSVCIntrin::_InterlockedXor;
  case clang::AArch64::BI_InterlockedDecrement64:
    return MSVCIntrin::_InterlockedDecrement;
  case clang::AArch64::BI_InterlockedIncrement64:
    return MSVCIntrin::_InterlockedIncrement;
  case clang::AArch64::BI_InterlockedExchangeAdd8_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd16_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd64_acq:
    return MSVCIntrin::_InterlockedExchangeAdd_acq;
  case clang::AArch64::BI_InterlockedExchangeAdd8_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd16_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd64_rel:
    return MSVCIntrin::_InterlockedExchangeAdd_rel;
  case clang::AArch64::BI_InterlockedExchangeAdd8_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd16_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd64_nf:
    return MSVCIntrin::_InterlockedExchangeAdd_nf;
  case clang::AArch64::BI_InterlockedExchange8_acq:
  case clang::AArch64::BI_InterlockedExchange16_acq:
  case clang::AArch64::BI_InterlockedExchange_acq:
  case clang::AArch64::BI_InterlockedExchange64_acq:
  case clang::AArch64::BI_InterlockedExchangePointer_acq:
    return MSVCIntrin::_InterlockedExchange_acq;
  case clang::AArch64::BI_InterlockedExchange8_rel:
  case clang::AArch64::BI_InterlockedExchange16_rel:
  case clang::AArch64::BI_InterlockedExchange_rel:
  case clang::AArch64::BI_InterlockedExchange64_rel:
  case clang::AArch64::BI_InterlockedExchangePointer_rel:
    return MSVCIntrin::_InterlockedExchange_rel;
  case clang::AArch64::BI_InterlockedExchange8_nf:
  case clang::AArch64::BI_InterlockedExchange16_nf:
  case clang::AArch64::BI_InterlockedExchange_nf:
  case clang::AArch64::BI_InterlockedExchange64_nf:
  case clang::AArch64::BI_InterlockedExchangePointer_nf:
    return MSVCIntrin::_InterlockedExchange_nf;
  case clang::AArch64::BI_InterlockedCompareExchange8_acq:
  case clang::AArch64::BI_InterlockedCompareExchange16_acq:
  case clang::AArch64::BI_InterlockedCompareExchange_acq:
  case clang::AArch64::BI_InterlockedCompareExchange64_acq:
  case clang::AArch64::BI_InterlockedCompareExchangePointer_acq:
    return MSVCIntrin::_InterlockedCompareExchange_acq;
  case clang::AArch64::BI_InterlockedCompareExchange8_rel:
  case clang::AArch64::BI_InterlockedCompareExchange16_rel:
  case clang::AArch64::BI_InterlockedCompareExchange_rel:
  case clang::AArch64::BI_InterlockedCompareExchange64_rel:
  case clang::AArch64::BI_InterlockedCompareExchangePointer_rel:
    return MSVCIntrin::_InterlockedCompareExchange_rel;
  case clang::AArch64::BI_InterlockedCompareExchange8_nf:
  case clang::AArch64::BI_InterlockedCompareExchange16_nf:
  case clang::AArch64::BI_InterlockedCompareExchange_nf:
  case clang::AArch64::BI_InterlockedCompareExchange64_nf:
    return MSVCIntrin::_InterlockedCompareExchange_nf;
  case clang::AArch64::BI_InterlockedCompareExchange128:
    return MSVCIntrin::_InterlockedCompareExchange128;
  case clang::AArch64::BI_InterlockedCompareExchange128_acq:
    return MSVCIntrin::_InterlockedCompareExchange128_acq;
  case clang::AArch64::BI_InterlockedCompareExchange128_nf:
    return MSVCIntrin::_InterlockedCompareExchange128_nf;
  case clang::AArch64::BI_InterlockedCompareExchange128_rel:
    return MSVCIntrin::_InterlockedCompareExchange128_rel;
  case clang::AArch64::BI_InterlockedOr8_acq:
  case clang::AArch64::BI_InterlockedOr16_acq:
  case clang::AArch64::BI_InterlockedOr_acq:
  case clang::AArch64::BI_InterlockedOr64_acq:
    return MSVCIntrin::_InterlockedOr_acq;
  case clang::AArch64::BI_InterlockedOr8_rel:
  case clang::AArch64::BI_InterlockedOr16_rel:
  case clang::AArch64::BI_InterlockedOr_rel:
  case clang::AArch64::BI_InterlockedOr64_rel:
    return MSVCIntrin::_InterlockedOr_rel;
  case clang::AArch64::BI_InterlockedOr8_nf:
  case clang::AArch64::BI_InterlockedOr16_nf:
  case clang::AArch64::BI_InterlockedOr_nf:
  case clang::AArch64::BI_InterlockedOr64_nf:
    return MSVCIntrin::_InterlockedOr_nf;
  case clang::AArch64::BI_InterlockedXor8_acq:
  case clang::AArch64::BI_InterlockedXor16_acq:
  case clang::AArch64::BI_InterlockedXor_acq:
  case clang::AArch64::BI_InterlockedXor64_acq:
    return MSVCIntrin::_InterlockedXor_acq;
  case clang::AArch64::BI_InterlockedXor8_rel:
  case clang::AArch64::BI_InterlockedXor16_rel:
  case clang::AArch64::BI_InterlockedXor_rel:
  case clang::AArch64::BI_InterlockedXor64_rel:
    return MSVCIntrin::_InterlockedXor_rel;
  case clang::AArch64::BI_InterlockedXor8_nf:
  case clang::AArch64::BI_InterlockedXor16_nf:
  case clang::AArch64::BI_InterlockedXor_nf:
  case clang::AArch64::BI_InterlockedXor64_nf:
    return MSVCIntrin::_InterlockedXor_nf;
  case clang::AArch64::BI_InterlockedAnd8_acq:
  case clang::AArch64::BI_InterlockedAnd16_acq:
  case clang::AArch64::BI_InterlockedAnd_acq:
  case clang::AArch64::BI_InterlockedAnd64_acq:
    return MSVCIntrin::_InterlockedAnd_acq;
  case clang::AArch64::BI_InterlockedAnd8_rel:
  case clang::AArch64::BI_InterlockedAnd16_rel:
  case clang::AArch64::BI_InterlockedAnd_rel:
  case clang::AArch64::BI_InterlockedAnd64_rel:
    return MSVCIntrin::_InterlockedAnd_rel;
  case clang::AArch64::BI_InterlockedAnd8_nf:
  case clang::AArch64::BI_InterlockedAnd16_nf:
  case clang::AArch64::BI_InterlockedAnd_nf:
  case clang::AArch64::BI_InterlockedAnd64_nf:
    return MSVCIntrin::_InterlockedAnd_nf;
  case clang::AArch64::BI_InterlockedIncrement16_acq:
  case clang::AArch64::BI_InterlockedIncrement_acq:
  case clang::AArch64::BI_InterlockedIncrement64_acq:
    return MSVCIntrin::_InterlockedIncrement_acq;
  case clang::AArch64::BI_InterlockedIncrement16_rel:
  case clang::AArch64::BI_InterlockedIncrement_rel:
  case clang::AArch64::BI_InterlockedIncrement64_rel:
    return MSVCIntrin::_InterlockedIncrement_rel;
  case clang::AArch64::BI_InterlockedIncrement16_nf:
  case clang::AArch64::BI_InterlockedIncrement_nf:
  case clang::AArch64::BI_InterlockedIncrement64_nf:
    return MSVCIntrin::_InterlockedIncrement_nf;
  case clang::AArch64::BI_InterlockedDecrement16_acq:
  case clang::AArch64::BI_InterlockedDecrement_acq:
  case clang::AArch64::BI_InterlockedDecrement64_acq:
    return MSVCIntrin::_InterlockedDecrement_acq;
  case clang::AArch64::BI_InterlockedDecrement16_rel:
  case clang::AArch64::BI_InterlockedDecrement_rel:
  case clang::AArch64::BI_InterlockedDecrement64_rel:
    return MSVCIntrin::_InterlockedDecrement_rel;
  case clang::AArch64::BI_InterlockedDecrement16_nf:
  case clang::AArch64::BI_InterlockedDecrement_nf:
  case clang::AArch64::BI_InterlockedDecrement64_nf:
    return MSVCIntrin::_InterlockedDecrement_nf;
  }
  llvm_unreachable("must return from switch");
}
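
// A typical use of this mapping (a sketch; the actual dispatch lives in the
// AArch64 builtin emitter): translate the target builtin ID and defer to the
// shared MSVC intrinsic lowering, which recovers the operation width from the
// call's argument types rather than from the enum value.
//   if (std::optional<CodeGenFunction::MSVCIntrin> MsvcIntId =
//           translateAarch64ToMsvcIntrin(BuiltinID))
//     return EmitMSVCBuiltinExpr(*MsvcIntId, E);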

static std::optional<CodeGenFunction::MSVCIntrin>
translateArmToMsvcIntrin(unsigned BuiltinID) {
  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
  switch (BuiltinID) {
  default:
    return std::nullopt;
  case clang::ARM::BI_BitScanForward:
  case clang::ARM::BI_BitScanForward64:
    return MSVCIntrin::_BitScanForward;
  case clang::ARM::BI_BitScanReverse:
  case clang::ARM::BI_BitScanReverse64:
    return MSVCIntrin::_BitScanReverse;
  case clang::ARM::BI_InterlockedAnd64:
    return MSVCIntrin::_InterlockedAnd;
  case clang::ARM::BI_InterlockedExchange64:
    return MSVCIntrin::_InterlockedExchange;
  case clang::ARM::BI_InterlockedExchangeAdd64:
    return MSVCIntrin::_InterlockedExchangeAdd;
  case clang::ARM::BI_InterlockedExchangeSub64:
    return MSVCIntrin::_InterlockedExchangeSub;
  case clang::ARM::BI_InterlockedOr64:
    return MSVCIntrin::_InterlockedOr;
  case clang::ARM::BI_InterlockedXor64:
    return MSVCIntrin::_InterlockedXor;
  case clang::ARM::BI_InterlockedDecrement64:
    return MSVCIntrin::_InterlockedDecrement;
  case clang::ARM::BI_InterlockedIncrement64:
    return MSVCIntrin::_InterlockedIncrement;
  case clang::ARM::BI_InterlockedExchangeAdd8_acq:
  case clang::ARM::BI_InterlockedExchangeAdd16_acq:
  case clang::ARM::BI_InterlockedExchangeAdd_acq:
  case clang::ARM::BI_InterlockedExchangeAdd64_acq:
    return MSVCIntrin::_InterlockedExchangeAdd_acq;
  case clang::ARM::BI_InterlockedExchangeAdd8_rel:
  case clang::ARM::BI_InterlockedExchangeAdd16_rel:
  case clang::ARM::BI_InterlockedExchangeAdd_rel:
  case clang::ARM::BI_InterlockedExchangeAdd64_rel:
    return MSVCIntrin::_InterlockedExchangeAdd_rel;
  case clang::ARM::BI_InterlockedExchangeAdd8_nf:
  case clang::ARM::BI_InterlockedExchangeAdd16_nf:
  case clang::ARM::BI_InterlockedExchangeAdd_nf:
  case clang::ARM::BI_InterlockedExchangeAdd64_nf:
    return MSVCIntrin::_InterlockedExchangeAdd_nf;
  case clang::ARM::BI_InterlockedExchange8_acq:
  case clang::ARM::BI_InterlockedExchange16_acq:
  case clang::ARM::BI_InterlockedExchange_acq:
  case clang::ARM::BI_InterlockedExchange64_acq:
  case clang::ARM::BI_InterlockedExchangePointer_acq:
    return MSVCIntrin::_InterlockedExchange_acq;
  case clang::ARM::BI_InterlockedExchange8_rel:
  case clang::ARM::BI_InterlockedExchange16_rel:
  case clang::ARM::BI_InterlockedExchange_rel:
  case clang::ARM::BI_InterlockedExchange64_rel:
  case clang::ARM::BI_InterlockedExchangePointer_rel:
    return MSVCIntrin::_InterlockedExchange_rel;
  case clang::ARM::BI_InterlockedExchange8_nf:
  case clang::ARM::BI_InterlockedExchange16_nf:
  case clang::ARM::BI_InterlockedExchange_nf:
  case clang::ARM::BI_InterlockedExchange64_nf:
  case clang::ARM::BI_InterlockedExchangePointer_nf:
    return MSVCIntrin::_InterlockedExchange_nf;
  case clang::ARM::BI_InterlockedCompareExchange8_acq:
  case clang::ARM::BI_InterlockedCompareExchange16_acq:
  case clang::ARM::BI_InterlockedCompareExchange_acq:
  case clang::ARM::BI_InterlockedCompareExchange64_acq:
  case clang::ARM::BI_InterlockedCompareExchangePointer_acq:
    return MSVCIntrin::_InterlockedCompareExchange_acq;
  case clang::ARM::BI_InterlockedCompareExchange8_rel:
  case clang::ARM::BI_InterlockedCompareExchange16_rel:
  case clang::ARM::BI_InterlockedCompareExchange_rel:
  case clang::ARM::BI_InterlockedCompareExchange64_rel:
  case clang::ARM::BI_InterlockedCompareExchangePointer_rel:
    return MSVCIntrin::_InterlockedCompareExchange_rel;
  case clang::ARM::BI_InterlockedCompareExchange8_nf:
  case clang::ARM::BI_InterlockedCompareExchange16_nf:
  case clang::ARM::BI_InterlockedCompareExchange_nf:
  case clang::ARM::BI_InterlockedCompareExchange64_nf:
    return MSVCIntrin::_InterlockedCompareExchange_nf;
  case clang::ARM::BI_InterlockedOr8_acq:
  case clang::ARM::BI_InterlockedOr16_acq:
  case clang::ARM::BI_InterlockedOr_acq:
  case clang::ARM::BI_InterlockedOr64_acq:
    return MSVCIntrin::_InterlockedOr_acq;
  case clang::ARM::BI_InterlockedOr8_rel:
  case clang::ARM::BI_InterlockedOr16_rel:
  case clang::ARM::BI_InterlockedOr_rel:
  case clang::ARM::BI_InterlockedOr64_rel:
    return MSVCIntrin::_InterlockedOr_rel;
  case clang::ARM::BI_InterlockedOr8_nf:
  case clang::ARM::BI_InterlockedOr16_nf:
  case clang::ARM::BI_InterlockedOr_nf:
  case clang::ARM::BI_InterlockedOr64_nf:
    return MSVCIntrin::_InterlockedOr_nf;
  case clang::ARM::BI_InterlockedXor8_acq:
  case clang::ARM::BI_InterlockedXor16_acq:
  case clang::ARM::BI_InterlockedXor_acq:
  case clang::ARM::BI_InterlockedXor64_acq:
    return MSVCIntrin::_InterlockedXor_acq;
  case clang::ARM::BI_InterlockedXor8_rel:
  case clang::ARM::BI_InterlockedXor16_rel:
  case clang::ARM::BI_InterlockedXor_rel:
  case clang::ARM::BI_InterlockedXor64_rel:
    return MSVCIntrin::_InterlockedXor_rel;
  case clang::ARM::BI_InterlockedXor8_nf:
  case clang::ARM::BI_InterlockedXor16_nf:
  case clang::ARM::BI_InterlockedXor_nf:
  case clang::ARM::BI_InterlockedXor64_nf:
    return MSVCIntrin::_InterlockedXor_nf;
  case clang::ARM::BI_InterlockedAnd8_acq:
  case clang::ARM::BI_InterlockedAnd16_acq:
  case clang::ARM::BI_InterlockedAnd_acq:
  case clang::ARM::BI_InterlockedAnd64_acq:
    return MSVCIntrin::_InterlockedAnd_acq;
  case clang::ARM::BI_InterlockedAnd8_rel:
  case clang::ARM::BI_InterlockedAnd16_rel:
  case clang::ARM::BI_InterlockedAnd_rel:
  case clang::ARM::BI_InterlockedAnd64_rel:
    return MSVCIntrin::_InterlockedAnd_rel;
  case clang::ARM::BI_InterlockedAnd8_nf:
  case clang::ARM::BI_InterlockedAnd16_nf:
  case clang::ARM::BI_InterlockedAnd_nf:
  case clang::ARM::BI_InterlockedAnd64_nf:
    return MSVCIntrin::_InterlockedAnd_nf;
  case clang::ARM::BI_InterlockedIncrement16_acq:
  case clang::ARM::BI_InterlockedIncrement_acq:
  case clang::ARM::BI_InterlockedIncrement64_acq:
    return MSVCIntrin::_InterlockedIncrement_acq;
  case clang::ARM::BI_InterlockedIncrement16_rel:
  case clang::ARM::BI_InterlockedIncrement_rel:
  case clang::ARM::BI_InterlockedIncrement64_rel:
    return MSVCIntrin::_InterlockedIncrement_rel;
  case clang::ARM::BI_InterlockedIncrement16_nf:
  case clang::ARM::BI_InterlockedIncrement_nf:
  case clang::ARM::BI_InterlockedIncrement64_nf:
    return MSVCIntrin::_InterlockedIncrement_nf;
  case clang::ARM::BI_InterlockedDecrement16_acq:
  case clang::ARM::BI_InterlockedDecrement_acq:
  case clang::ARM::BI_InterlockedDecrement64_acq:
    return MSVCIntrin::_InterlockedDecrement_acq;
  case clang::ARM::BI_InterlockedDecrement16_rel:
  case clang::ARM::BI_InterlockedDecrement_rel:
  case clang::ARM::BI_InterlockedDecrement64_rel:
    return MSVCIntrin::_InterlockedDecrement_rel;
  case clang::ARM::BI_InterlockedDecrement16_nf:
  case clang::ARM::BI_InterlockedDecrement_nf:
  case clang::ARM::BI_InterlockedDecrement64_nf:
    return MSVCIntrin::_InterlockedDecrement_nf;
  }
  llvm_unreachable("must return from switch");
}

// Emit an intrinsic where all operands are of the same type as the result.
// Depending on mode, this may be a constrained floating-point intrinsic.
static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
                                                unsigned IntrinsicID,
                                                unsigned ConstrainedIntrinsicID,
                                                llvm::Type *Ty,
                                                ArrayRef<Value *> Args) {
  Function *F;
  if (CGF.Builder.getIsFPConstrained())
    F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Ty);
  else
    F = CGF.CGM.getIntrinsic(IntrinsicID, Ty);

  if (CGF.Builder.getIsFPConstrained())
    return CGF.Builder.CreateConstrainedFPCall(F, Args);
  else
    return CGF.Builder.CreateCall(F, Args);
}
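
// For instance, a caller lowering a square-root builtin might pair
// Intrinsic::sqrt with Intrinsic::experimental_constrained_sqrt so the strict
// variant is selected automatically under constrained FP modes (a sketch, not
// tied to any particular builtin in this file):
//   emitCallMaybeConstrainedFPBuiltin(CGF, Intrinsic::sqrt,
//                                     Intrinsic::experimental_constrained_sqrt,
//                                     Ty, {Op});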

static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF,
                                          NeonTypeFlags TypeFlags,
                                          bool HasLegalHalfType = true,
                                          bool V1Ty = false,
                                          bool AllowBFloatArgsAndRet = true) {
  int IsQuad = TypeFlags.isQuad();
  switch (TypeFlags.getEltType()) {
  case NeonTypeFlags::Int8:
  case NeonTypeFlags::Poly8:
  case NeonTypeFlags::MFloat8:
    return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
  case NeonTypeFlags::Int16:
  case NeonTypeFlags::Poly16:
    return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::BFloat16:
    if (AllowBFloatArgsAndRet)
      return llvm::FixedVectorType::get(CGF->BFloatTy, V1Ty ? 1 : (4 << IsQuad));
    else
      return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::Float16:
    if (HasLegalHalfType)
      return llvm::FixedVectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
    else
      return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::Int32:
    return llvm::FixedVectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
  case NeonTypeFlags::Int64:
  case NeonTypeFlags::Poly64:
    return llvm::FixedVectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
  case NeonTypeFlags::Poly128:
    // FIXME: i128 and f128 are not fully supported in Clang and LLVM; much of
    // the i128/f128 API is missing. Until it exists, we represent poly128 as
    // v16i8 and rely on pattern matching.
    return llvm::FixedVectorType::get(CGF->Int8Ty, 16);
  case NeonTypeFlags::Float32:
    return llvm::FixedVectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
  case NeonTypeFlags::Float64:
    return llvm::FixedVectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
  }
  llvm_unreachable("Unknown vector element type!");
}
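
// For reference, a few of the resulting types (assuming !V1Ty):
//   Int8:    <8 x i8>   or <16 x i8> (quad)
//   Int32:   <2 x i32>  or <4 x i32> (quad)
//   Float16: <4 x half> or <8 x half>, degrading to the equivalent i16
//            vectors when the target has no legal half type.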

static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
                                          NeonTypeFlags IntTypeFlags) {
  int IsQuad = IntTypeFlags.isQuad();
  switch (IntTypeFlags.getEltType()) {
  case NeonTypeFlags::Int16:
    return llvm::FixedVectorType::get(CGF->HalfTy, (4 << IsQuad));
  case NeonTypeFlags::Int32:
    return llvm::FixedVectorType::get(CGF->FloatTy, (2 << IsQuad));
  case NeonTypeFlags::Int64:
    return llvm::FixedVectorType::get(CGF->DoubleTy, (1 << IsQuad));
  default:
    llvm_unreachable("Type can't be converted to floating-point!");
  }
}
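
// That is, each integer element width selects the equally wide float type
// (Int16 -> half, Int32 -> float, Int64 -> double) with the lane count
// preserved; e.g. a quad Int32 flag yields <4 x float>.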

Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C,
                                      const ElementCount &Count) {
  Value *SV = llvm::ConstantVector::getSplat(Count, C);
  return Builder.CreateShuffleVector(V, V, SV, "lane");
}

Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
  ElementCount EC = cast<llvm::VectorType>(V->getType())->getElementCount();
  return EmitNeonSplat(V, C, EC);
}
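
// e.g. splatting lane 1 of a <4 x i32> value emits (roughly)
//   %lane = shufflevector <4 x i32> %v, <4 x i32> %v,
//                         <4 x i32> <i32 1, i32 1, i32 1, i32 1>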

Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
                                     const char *name,
                                     unsigned shift, bool rightshift) {
  unsigned j = 0;
  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
       ai != ae; ++ai, ++j) {
    // Constrained FP intrinsics carry trailing metadata operands (rounding
    // mode, exception behavior); leave those untouched.
    if (F->isConstrainedFPIntrinsic())
      if (ai->getType()->isMetadataTy())
        continue;
    // The operand at index 'shift' (if nonzero) is an immediate shift amount
    // and is materialized as a constant vector; every other operand is
    // bitcast to the parameter type the intrinsic expects.
    if (shift > 0 && shift == j)
      Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
    else
      Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
  }

  if (F->isConstrainedFPIntrinsic())
    return Builder.CreateConstrainedFPCall(F, Ops, name);
  else
    return Builder.CreateCall(F, Ops, name);
}

Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
                                        ArrayRef<llvm::Type *> Tys,
                                        SmallVectorImpl<Value *> &Ops,
                                        const CallExpr *E, const char *name) {
  // The last argument of the builtin carries the FP8 mode value; write it to
  // the FPMR register before emitting the intrinsic proper.
  llvm::Value *FPM =
      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
  Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
  return EmitNeonCall(CGM.getIntrinsic(IID, Tys), Ops, name);
}

llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {

  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                             RetTy->getPrimitiveSizeInBits();
  llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
                       Ops[1]->getType()};
  if (ExtendLaneArg) {
    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                        uint64_t(0));
  }
  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
}
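
// ElemCount above derives the result vector length from bit widths alone:
// e.g. a 64-bit wide Ops[0] with a 32-bit RetTy yields a <2 x float> result
// type (a sketch of the arithmetic, not a list of the legal shapes).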

llvm::Value *CodeGenFunction::EmitFP8NeonFMLACall(
    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {

  if (ExtendLaneArg) {
    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                        uint64_t(0));
  }
  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                             RetTy->getPrimitiveSizeInBits();
  return EmitFP8NeonCall(IID, {llvm::FixedVectorType::get(RetTy, ElemCount)},
                         Ops, E, name);
}

Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
                                            bool neg) {
  int SV = cast<ConstantInt>(V)->getSExtValue();
  return ConstantInt::get(Ty, neg ? -SV : SV);
}

Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
                                           llvm::Type *Ty1, bool Extract,
                                           SmallVectorImpl<llvm::Value *> &Ops,
                                           const CallExpr *E,
                                           const char *name) {
  llvm::Type *Tys[] = {Ty0, Ty1};
  if (Extract) {
    // Op[0] is mfloat8x16_t, but the intrinsic converts only the lower part of
    // the vector.
    Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
    Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], uint64_t(0));
  }
  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
}

// Right-shift a vector by a constant.
Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
                                          llvm::Type *Ty, bool usgn,
                                          const char *name) {
  llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);

  int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
  int EltSize = VTy->getScalarSizeInBits();

  Vec = Builder.CreateBitCast(Vec, Ty);

  // lshr/ashr are undefined when the shift amount is equal to the vector
  // element size.
  if (ShiftAmt == EltSize) {
    if (usgn) {
      // Right-shifting an unsigned value by its size yields 0.
      return llvm::ConstantAggregateZero::get(VTy);
    } else {
      // Right-shifting a signed value by its size is equivalent
      // to a shift of size-1.
      --ShiftAmt;
      Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
    }
  }

  Shift = EmitNeonShiftVector(Shift, Ty, false);
  if (usgn)
    return Builder.CreateLShr(Vec, Shift, name);
  else
    return Builder.CreateAShr(Vec, Shift, name);
}
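
// Example: a signed right shift of <2 x i64> by the full element width (64)
// is clamped to 63, matching the "shift by element size" semantics of the
// NEON intrinsics, while the unsigned form folds directly to zero.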

enum {
  AddRetType = (1 << 0),
  Add1ArgType = (1 << 1),
  Add2ArgTypes = (1 << 2),

  VectorizeRetType = (1 << 3),
  VectorizeArgTypes = (1 << 4),

  InventFloatType = (1 << 5),
  UnsignedAlts = (1 << 6),

  Use64BitVectors = (1 << 7),
  Use128BitVectors = (1 << 8),

  Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
  VectorRet = AddRetType | VectorizeRetType,
  VectorRetGetArgs01 =
      AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
  FpCmpzModifiers =
      AddRetType | VectorizeRetType | Add1ArgType | InventFloatType
};
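
// As an illustration of how the flags compose: FpCmpzModifiers asks for the
// intrinsic's (vectorized) return type plus one argument type, and
// InventFloatType substitutes a float type of matching width; that is the
// shape needed by the floating-point compare-against-zero builtins.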

namespace {
struct ARMVectorIntrinsicInfo {
  const char *NameHint;
  unsigned BuiltinID;
  unsigned LLVMIntrinsic;
  unsigned AltLLVMIntrinsic;
  uint64_t TypeModifier;

  bool operator<(unsigned RHSBuiltinID) const {
    return BuiltinID < RHSBuiltinID;
  }
  bool operator<(const ARMVectorIntrinsicInfo &TE) const {
    return BuiltinID < TE.BuiltinID;
  }
};
} // end anonymous namespace
#define NEONMAP0(NameBase) \
  { #NameBase, NEON::BI__builtin_neon_ ## NameBase, 0, 0, 0 }

#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON::BI__builtin_neon_ ## NameBase, \
      Intrinsic::LLVMIntrinsic, 0, TypeModifier }

#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON::BI__builtin_neon_ ## NameBase, \
      Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
      TypeModifier }
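
// For example, NEONMAP1(vabs_v, arm_neon_vabs, 0) expands to
//   { "vabs_v", NEON::BI__builtin_neon_vabs_v, Intrinsic::arm_neon_vabs, 0, 0 }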

static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap[] = {
  NEONMAP1(__a32_vcvt_bf16_f32, arm_neon_vcvtfp2bf, 0),
  NEONMAP0(splat_lane_v),
  NEONMAP0(splat_laneq_v),
  NEONMAP0(splatq_lane_v),
  NEONMAP0(splatq_laneq_v),
  NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
  NEONMAP1(vabs_v, arm_neon_vabs, 0),
  NEONMAP1(vabsq_v, arm_neon_vabs, 0),
  NEONMAP0(vadd_v),
  NEONMAP0(vaddhn_v),
  NEONMAP0(vaddq_v),
  NEONMAP1(vaesdq_u8, arm_neon_aesd, 0),
  NEONMAP1(vaeseq_u8, arm_neon_aese, 0),
  NEONMAP1(vaesimcq_u8, arm_neon_aesimc, 0),
  NEONMAP1(vaesmcq_u8, arm_neon_aesmc, 0),
  NEONMAP1(vbfdot_f32, arm_neon_bfdot, 0),
  NEONMAP1(vbfdotq_f32, arm_neon_bfdot, 0),
  NEONMAP1(vbfmlalbq_f32, arm_neon_bfmlalb, 0),
  NEONMAP1(vbfmlaltq_f32, arm_neon_bfmlalt, 0),
  NEONMAP1(vbfmmlaq_f32, arm_neon_bfmmla, 0),
  NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
  NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
  NEONMAP1(vcadd_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcadd_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f64, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f64, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcage_v, arm_neon_vacge, 0),
  NEONMAP1(vcageq_v, arm_neon_vacge, 0),
  NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
  NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
  NEONMAP1(vcale_v, arm_neon_vacge, 0),
  NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
  NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
  NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
  NEONMAP0(vceqz_v),
  NEONMAP0(vceqzq_v),
  NEONMAP0(vcgez_v),
  NEONMAP0(vcgezq_v),
  NEONMAP0(vcgtz_v),
  NEONMAP0(vcgtzq_v),
  NEONMAP0(vclez_v),
  NEONMAP0(vclezq_v),
  NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
  NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
  NEONMAP0(vcltz_v),
  NEONMAP0(vcltzq_v),
  NEONMAP1(vclz_v, ctlz, Add1ArgType),
  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
  NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
  NEONMAP0(vcvt_f16_s16),
  NEONMAP0(vcvt_f16_u16),
  NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
  NEONMAP0(vcvt_f32_v),
  NEONMAP1(vcvt_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvt_s16_f16),
  NEONMAP0(vcvt_s32_v),
  NEONMAP0(vcvt_s64_v),
  NEONMAP0(vcvt_u16_f16),
  NEONMAP0(vcvt_u32_v),
  NEONMAP0(vcvt_u64_v),
  NEONMAP1(vcvta_s16_f16, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_u16_f16, arm_neon_vcvtau, 0),
  NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_s16_f16, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_u16_f16, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0),
  NEONMAP1(vcvtm_s16_f16, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_u16_f16, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_s16_f16, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_u16_f16, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtn_s16_f16, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_u16_f16, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_s16_f16, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_u16_f16, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtp_s16_f16, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_u16_f16, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_s16_f16, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_u16_f16, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
  NEONMAP0(vcvtq_f16_s16),
  NEONMAP0(vcvtq_f16_u16),
  NEONMAP0(vcvtq_f32_v),
  NEONMAP1(vcvtq_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvtq_s16_f16),
  NEONMAP0(vcvtq_s32_v),
  NEONMAP0(vcvtq_s64_v),
  NEONMAP0(vcvtq_u16_f16),
  NEONMAP0(vcvtq_u32_v),
  NEONMAP0(vcvtq_u64_v),
  NEONMAP1(vdot_s32, arm_neon_sdot, 0),
  NEONMAP1(vdot_u32, arm_neon_udot, 0),
  NEONMAP1(vdotq_s32, arm_neon_sdot, 0),
  NEONMAP1(vdotq_u32, arm_neon_udot, 0),
  NEONMAP0(vext_v),
  NEONMAP0(vextq_v),
  NEONMAP0(vfma_v),
  NEONMAP0(vfmaq_v),
  NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
  NEONMAP0(vld1_dup_v),
  NEONMAP1(vld1_v, arm_neon_vld1, 0),
  NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0),
  NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0),
  NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0),
  NEONMAP0(vld1q_dup_v),
  NEONMAP1(vld1q_v, arm_neon_vld1, 0),
  NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0),
  NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0),
  NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0),
  NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0),
  NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
  NEONMAP1(vld2_v, arm_neon_vld2, 0),
  NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0),
  NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
  NEONMAP1(vld2q_v, arm_neon_vld2, 0),
  NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0),
  NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
  NEONMAP1(vld3_v, arm_neon_vld3, 0),
  NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0),
  NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
  NEONMAP1(vld3q_v, arm_neon_vld3, 0),
  NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0),
  NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
  NEONMAP1(vld4_v, arm_neon_vld4, 0),
  NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0),
  NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
  NEONMAP1(vld4q_v, arm_neon_vld4, 0),
  NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP1(vmaxnm_v, arm_neon_vmaxnm, Add1ArgType),
  NEONMAP1(vmaxnmq_v, arm_neon_vmaxnm, Add1ArgType),
  NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType),
  NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType),
  NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vmmlaq_s32, arm_neon_smmla, 0),
  NEONMAP1(vmmlaq_u32, arm_neon_ummla, 0),
  NEONMAP0(vmovl_v),
  NEONMAP0(vmovn_v),
  NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
  NEONMAP0(vmull_v),
  NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
  NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
  NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
  NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
  NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
  NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
  NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
  NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
  NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
  NEONMAP2(vqadd_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqaddq_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, arm_neon_vqdmull, sadd_sat, 0),
  NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, ssub_sat, 0),
  NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
  NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
  NEONMAP1(vqrdmlah_s16, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlah_s32, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s16, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s32, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlsh_s16, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlsh_s32, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s16, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s32, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
  NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
  NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0),
  NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0),
  NEONMAP2(vqsub_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqsubq_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
  NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
  NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
  NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
  NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
  NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
  NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
  NEONMAP1(vrnd_v, trunc, Add1ArgType),
  NEONMAP1(vrnda_v, round, Add1ArgType),
  NEONMAP1(vrndaq_v, round, Add1ArgType),
  NEONMAP0(vrndi_v),
  NEONMAP0(vrndiq_v),
  NEONMAP1(vrndm_v, floor, Add1ArgType),
  NEONMAP1(vrndmq_v, floor, Add1ArgType),
  NEONMAP1(vrndn_v, roundeven, Add1ArgType),
  NEONMAP1(vrndnq_v, roundeven, Add1ArgType),
  NEONMAP1(vrndp_v, ceil, Add1ArgType),
  NEONMAP1(vrndpq_v, ceil, Add1ArgType),
  NEONMAP1(vrndq_v, trunc, Add1ArgType),
  NEONMAP1(vrndx_v, rint, Add1ArgType),
  NEONMAP1(vrndxq_v, rint, Add1ArgType),
  NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshr_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
  NEONMAP2(vrshrq_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
  NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
  NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
  NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
  NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
  NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
  NEONMAP1(vsha1su0q_u32, arm_neon_sha1su0, 0),
  NEONMAP1(vsha1su1q_u32, arm_neon_sha1su1, 0),
  NEONMAP1(vsha256h2q_u32, arm_neon_sha256h2, 0),
  NEONMAP1(vsha256hq_u32, arm_neon_sha256h, 0),
  NEONMAP1(vsha256su0q_u32, arm_neon_sha256su0, 0),
  NEONMAP1(vsha256su1q_u32, arm_neon_sha256su1, 0),
  NEONMAP0(vshl_n_v),
  NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshll_n_v),
  NEONMAP0(vshlq_n_v),
  NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshr_n_v),
  NEONMAP0(vshrn_n_v),
  NEONMAP0(vshrq_n_v),
  NEONMAP1(vst1_v, arm_neon_vst1, 0),
  NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0),
  NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0),
  NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0),
  NEONMAP1(vst1q_v, arm_neon_vst1, 0),
  NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0),
  NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0),
  NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0),
  NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
  NEONMAP1(vst2_v, arm_neon_vst2, 0),
  NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
  NEONMAP1(vst2q_v, arm_neon_vst2, 0),
  NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
  NEONMAP1(vst3_v, arm_neon_vst3, 0),
  NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
  NEONMAP1(vst3q_v, arm_neon_vst3, 0),
  NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
  NEONMAP1(vst4_v, arm_neon_vst4, 0),
  NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
  NEONMAP1(vst4q_v, arm_neon_vst4, 0),
  NEONMAP0(vsubhn_v),
  NEONMAP0(vtrn_v),
  NEONMAP0(vtrnq_v),
  NEONMAP0(vtst_v),
  NEONMAP0(vtstq_v),
  NEONMAP1(vusdot_s32, arm_neon_usdot, 0),
  NEONMAP1(vusdotq_s32, arm_neon_usdot, 0),
  NEONMAP1(vusmmlaq_s32, arm_neon_usmmla, 0),
  NEONMAP0(vuzp_v),
  NEONMAP0(vuzpq_v),
  NEONMAP0(vzip_v),
  NEONMAP0(vzipq_v)
};
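
// Note: these tables are looked up by binary search on BuiltinID (hence the
// operator< overloads on ARMVectorIntrinsicInfo above), so the entries must
// remain sorted by builtin ID.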

static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
  NEONMAP0(splat_lane_v),
  NEONMAP0(splat_laneq_v),
  NEONMAP0(splatq_lane_v),
  NEONMAP0(splatq_laneq_v),
  NEONMAP1(vabs_v, aarch64_neon_abs, 0),
  NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
  NEONMAP0(vadd_v),
  NEONMAP0(vaddhn_v),
  NEONMAP0(vaddq_p128),
  NEONMAP0(vaddq_v),
  NEONMAP1(vaesdq_u8, aarch64_crypto_aesd, 0),
  NEONMAP1(vaeseq_u8, aarch64_crypto_aese, 0),
  NEONMAP1(vaesimcq_u8, aarch64_crypto_aesimc, 0),
  NEONMAP1(vaesmcq_u8, aarch64_crypto_aesmc, 0),
  NEONMAP2(vbcaxq_s16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_s32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_s64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_s8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP1(vbfdot_f32, aarch64_neon_bfdot, 0),
  NEONMAP1(vbfdotq_f32, aarch64_neon_bfdot, 0),
  NEONMAP1(vbfmlalbq_f32, aarch64_neon_bfmlalb, 0),
  NEONMAP1(vbfmlaltq_f32, aarch64_neon_bfmlalt, 0),
  NEONMAP1(vbfmmlaq_f32, aarch64_neon_bfmmla, 0),
  NEONMAP1(vcadd_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcadd_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f64, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f64, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcage_v, aarch64_neon_facge, 0),
  NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
  NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcale_v, aarch64_neon_facge, 0),
  NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
  NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
  NEONMAP0(vceqz_v),
  NEONMAP0(vceqzq_v),
  NEONMAP0(vcgez_v),
  NEONMAP0(vcgezq_v),
  NEONMAP0(vcgtz_v),
  NEONMAP0(vcgtzq_v),
  NEONMAP0(vclez_v),
  NEONMAP0(vclezq_v),
  NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
  NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
  NEONMAP0(vcltz_v),
  NEONMAP0(vcltzq_v),
  NEONMAP1(vclz_v, ctlz, Add1ArgType),
  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
  NEONMAP1(vcmla_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmla_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmla_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmla_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmla_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmla_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmla_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmla_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmlaq_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmlaq_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmlaq_f64, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmlaq_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmlaq_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmlaq_rot180_f64, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmlaq_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmlaq_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmlaq_rot270_f64, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmlaq_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmlaq_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmlaq_rot90_f64, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
  NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
  NEONMAP0(vcvt_f16_s16),
  NEONMAP0(vcvt_f16_u16),
  NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
  NEONMAP0(vcvt_f32_v),
  NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvtq_f16_s16),
  NEONMAP0(vcvtq_f16_u16),
  NEONMAP0(vcvtq_f32_v),
  NEONMAP0(vcvtq_high_bf16_f32),
  NEONMAP0(vcvtq_low_bf16_f32),
  NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
  NEONMAP1(vdot_s32, aarch64_neon_sdot, 0),
  NEONMAP1(vdot_u32, aarch64_neon_udot, 0),
  NEONMAP1(vdotq_s32, aarch64_neon_sdot, 0),
  NEONMAP1(vdotq_u32, aarch64_neon_udot, 0),
  NEONMAP2(veor3q_s16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_s32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_s64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_s8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP0(vext_v),
  NEONMAP0(vextq_v),
  NEONMAP0(vfma_v),
  NEONMAP0(vfmaq_v),
  NEONMAP1(vfmlal_high_f16, aarch64_neon_fmlal2, 0),
  NEONMAP1(vfmlal_low_f16, aarch64_neon_fmlal, 0),
  NEONMAP1(vfmlalq_high_f16, aarch64_neon_fmlal2, 0),
  NEONMAP1(vfmlalq_low_f16, aarch64_neon_fmlal, 0),
  NEONMAP1(vfmlsl_high_f16, aarch64_neon_fmlsl2, 0),
  NEONMAP1(vfmlsl_low_f16, aarch64_neon_fmlsl, 0),
  NEONMAP1(vfmlslq_high_f16, aarch64_neon_fmlsl2, 0),
  NEONMAP1(vfmlslq_low_f16, aarch64_neon_fmlsl, 0),
  NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
  NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
  NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
  NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
  NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
  NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
  NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
  NEONMAP1(vmmlaq_s32, aarch64_neon_smmla, 0),
  NEONMAP1(vmmlaq_u32, aarch64_neon_ummla, 0),
  NEONMAP0(vmovl_v),
  NEONMAP0(vmovn_v),
  NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
  NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
  NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
  NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
  NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
  NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
  NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
  NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
  NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
  NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
  NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
  NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
  NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
  NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqrdmlah_s16, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlah_s32, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s16, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s32, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlsh_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlsh_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
  NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
  NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
  NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
  NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
  NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
  NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
  NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
  NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
  NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
  NEONMAP1(vrax1q_u64, aarch64_crypto_rax1, 0),
  NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
  NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
  NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
  NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
  NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
  NEONMAP1(vrnd32x_f32, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32x_f64, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32xq_f32, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32xq_f64, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32z_f32, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd32z_f64, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd32zq_f32, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd32zq_f64, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd64x_f32, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64x_f64, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64xq_f32, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64xq_f64, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64z_f32, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP1(vrnd64z_f64, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP1(vrnd64zq_f32, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP1(vrnd64zq_f64, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP0(vrndi_v),
  NEONMAP0(vrndiq_v),
  NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
  NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
  NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
  NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
  NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
  NEONMAP1(vsha1su0q_u32, aarch64_crypto_sha1su0, 0),
  NEONMAP1(vsha1su1q_u32, aarch64_crypto_sha1su1, 0),
  NEONMAP1(vsha256h2q_u32, aarch64_crypto_sha256h2, 0),
  NEONMAP1(vsha256hq_u32, aarch64_crypto_sha256h, 0),
  NEONMAP1(vsha256su0q_u32, aarch64_crypto_sha256su0, 0),
  NEONMAP1(vsha256su1q_u32, aarch64_crypto_sha256su1, 0),
  NEONMAP1(vsha512h2q_u64, aarch64_crypto_sha512h2, 0),
  NEONMAP1(vsha512hq_u64, aarch64_crypto_sha512h, 0),
  NEONMAP1(vsha512su0q_u64, aarch64_crypto_sha512su0, 0),
  NEONMAP1(vsha512su1q_u64, aarch64_crypto_sha512su1, 0),
  NEONMAP0(vshl_n_v),
  NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshll_n_v),
  NEONMAP0(vshlq_n_v),
  NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshr_n_v),
  NEONMAP0(vshrn_n_v),
  NEONMAP0(vshrq_n_v),
  NEONMAP1(vsm3partw1q_u32, aarch64_crypto_sm3partw1, 0),
  NEONMAP1(vsm3partw2q_u32, aarch64_crypto_sm3partw2, 0),
  NEONMAP1(vsm3ss1q_u32, aarch64_crypto_sm3ss1, 0),
  NEONMAP1(vsm3tt1aq_u32, aarch64_crypto_sm3tt1a, 0),
  NEONMAP1(vsm3tt1bq_u32, aarch64_crypto_sm3tt1b, 0),
  NEONMAP1(vsm3tt2aq_u32, aarch64_crypto_sm3tt2a, 0),
  NEONMAP1(vsm3tt2bq_u32, aarch64_crypto_sm3tt2b, 0),
  NEONMAP1(vsm4ekeyq_u32, aarch64_crypto_sm4ekey, 0),
  NEONMAP1(vsm4eq_u32, aarch64_crypto_sm4e, 0),
  NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
  NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
  NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
  NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
  NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
  NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
  NEONMAP0(vsubhn_v),
1179   NEONMAP0(vtst_v),
1180   NEONMAP0(vtstq_v),
1181   NEONMAP1(vusdot_s32, aarch64_neon_usdot, 0),
1182   NEONMAP1(vusdotq_s32, aarch64_neon_usdot, 0),
1183   NEONMAP1(vusmmlaq_s32, aarch64_neon_usmmla, 0),
1184   NEONMAP1(vxarq_u64, aarch64_crypto_xar, 0),
1185 };
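
     // For NEONMAP2 entries carrying UnsignedAlts, the first intrinsic is the
     // unsigned flavour and the second the signed one: EmitCommonNeonBuiltinExpr
     // below starts from LLVMIntrinsic and switches to AltLLVMIntrinsic when the
     // element type is signed. E.g. vqadd_v lowers to aarch64_neon_uqadd for
     // unsigned elements and to aarch64_neon_sqadd for signed ones.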
1186 
1187 static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
1188   NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType),
1189   NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType),
1190   NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType),
1191   NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
1192   NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
1193   NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
1194   NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
1195   NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
1196   NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
1197   NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
1198   NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
1199   NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
1200   NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
1201   NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
1202   NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
1203   NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
1204   NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
1205   NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
1206   NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
1207   NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
1208   NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
1209   NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
1210   NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
1211   NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
1212   NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
1213   NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
1214   NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
1215   NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
1216   NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
1217   NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
1218   NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
1219   NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
1220   NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
1221   NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
1222   NEONMAP0(vcvth_bf16_f32),
1223   NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
1224   NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
1225   NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
1226   NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
1227   NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
1228   NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
1229   NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
1230   NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
1231   NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
1232   NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
1233   NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
1234   NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
1235   NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
1236   NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
1237   NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
1238   NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
1239   NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
1240   NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
1241   NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
1242   NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
1243   NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
1244   NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
1245   NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1246   NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
1247   NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
1248   NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1249   NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1250   NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
1251   NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
1252   NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
1253   NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
1254   NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
1255   NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
1256   NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
1257   NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
1258   NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
1259   NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
1260   NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
1261   NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
1262   NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
1263   NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
1264   NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
1265   NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
1266   NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
1267   NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
1268   NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
1269   NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1270   NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1271   NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
1272   NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
1273   NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
1274   NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
1275   NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
1276   NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType),
1277   NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
1278   NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType),
1279   NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
1280   NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
1281   NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType),
1282   NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType),
1283   NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
1284   NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
1285   NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType),
1286   NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType),
1287   NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
1288   NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType),
1289   NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors),
1290   NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0),
1291   NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
1292   NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
1293   NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
1294   NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
1295   NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
1296   NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
1297   NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
1298   NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
1299   NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
1300   NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
1301   NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
1302   NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
1303   NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
1304   NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah, Vectorize1ArgType | Use64BitVectors),
1305   NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType),
1306   NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh, Vectorize1ArgType | Use64BitVectors),
1307   NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
1308   NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
1309   NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
1310   NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
1311   NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
1312   NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType),
1313   NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType),
1314   NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
1315   NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
1316   NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType),
1317   NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType),
1318   NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType),
1319   NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType),
1320   NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
1321   NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
1322   NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
1323   NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
1324   NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType),
1325   NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
1326   NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
1327   NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
1328   NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
1329   NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
1330   NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
1331   NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType),
1332   NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType),
1333   NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
1334   NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
1335   NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
1336   NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
1337   NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType),
1338   NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType),
1339   NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType),
1340   NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType),
1341   NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
1342   NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
1343   NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType),
1344   NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType),
1345   NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType),
1346   NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
1347   NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
1348   NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
1349   NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
1350   NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType),
1351   NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
1352   NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
1353   NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
1354   NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
1355   NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType),
1356   NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType),
1357   NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
1358   NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
1359   NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType),
1360   NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType),
1361   NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType),
1362   NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType),
1363   NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType),
1364   NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType),
1365   NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType),
1366   NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType),
1367   NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType),
1368   NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType),
1369   NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType),
1370   NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType),
1371   NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0),
1372   NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0),
1373   NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0),
1374   NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0),
1375   NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType),
1376   NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType),
1377   NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType),
1378   NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType),
1379   NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
1380   NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType),
1381   NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
1382   NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType),
1383   NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType),
1384   NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType),
1385   NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
1386   NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
1387   NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
1388   NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
1389   // FP16 scalar intrinsics go here.
1390   NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
1391   NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
1392   NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
1393   NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
1394   NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
1395   NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
1396   NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
1397   NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
1398   NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
1399   NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
1400   NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
1401   NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
1402   NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
1403   NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
1404   NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
1405   NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
1406   NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
1407   NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
1408   NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
1409   NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
1410   NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
1411   NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
1412   NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
1413   NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
1414   NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
1415   NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
1416   NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
1417   NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
1418   NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
1419   NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
1420   NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
1421   NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
1422   NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
1423   NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
1424 };
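
     // The scalar ("SISD") entries above lean on the type-modifier flags:
     // Vectorize1ArgType | Use64BitVectors widens a scalar operand into lane 0
     // of a 64-bit vector before the call, and VectorRet asks for a vectorized
     // return type whose lane 0 is extracted again afterwards (see
     // LookupNeonLLVMIntrinsic and EmitCommonNeonSISDBuiltinExpr below).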
1425 
1426 // Some intrinsics are equivalent for codegen.
1427 static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
1428   { NEON::BI__builtin_neon_splat_lane_bf16, NEON::BI__builtin_neon_splat_lane_v, },
1429   { NEON::BI__builtin_neon_splat_laneq_bf16, NEON::BI__builtin_neon_splat_laneq_v, },
1430   { NEON::BI__builtin_neon_splatq_lane_bf16, NEON::BI__builtin_neon_splatq_lane_v, },
1431   { NEON::BI__builtin_neon_splatq_laneq_bf16, NEON::BI__builtin_neon_splatq_laneq_v, },
1432   { NEON::BI__builtin_neon_vabd_f16, NEON::BI__builtin_neon_vabd_v, },
1433   { NEON::BI__builtin_neon_vabdq_f16, NEON::BI__builtin_neon_vabdq_v, },
1434   { NEON::BI__builtin_neon_vabs_f16, NEON::BI__builtin_neon_vabs_v, },
1435   { NEON::BI__builtin_neon_vabsq_f16, NEON::BI__builtin_neon_vabsq_v, },
1436   { NEON::BI__builtin_neon_vcage_f16, NEON::BI__builtin_neon_vcage_v, },
1437   { NEON::BI__builtin_neon_vcageq_f16, NEON::BI__builtin_neon_vcageq_v, },
1438   { NEON::BI__builtin_neon_vcagt_f16, NEON::BI__builtin_neon_vcagt_v, },
1439   { NEON::BI__builtin_neon_vcagtq_f16, NEON::BI__builtin_neon_vcagtq_v, },
1440   { NEON::BI__builtin_neon_vcale_f16, NEON::BI__builtin_neon_vcale_v, },
1441   { NEON::BI__builtin_neon_vcaleq_f16, NEON::BI__builtin_neon_vcaleq_v, },
1442   { NEON::BI__builtin_neon_vcalt_f16, NEON::BI__builtin_neon_vcalt_v, },
1443   { NEON::BI__builtin_neon_vcaltq_f16, NEON::BI__builtin_neon_vcaltq_v, },
1444   { NEON::BI__builtin_neon_vceqz_f16, NEON::BI__builtin_neon_vceqz_v, },
1445   { NEON::BI__builtin_neon_vceqzq_f16, NEON::BI__builtin_neon_vceqzq_v, },
1446   { NEON::BI__builtin_neon_vcgez_f16, NEON::BI__builtin_neon_vcgez_v, },
1447   { NEON::BI__builtin_neon_vcgezq_f16, NEON::BI__builtin_neon_vcgezq_v, },
1448   { NEON::BI__builtin_neon_vcgtz_f16, NEON::BI__builtin_neon_vcgtz_v, },
1449   { NEON::BI__builtin_neon_vcgtzq_f16, NEON::BI__builtin_neon_vcgtzq_v, },
1450   { NEON::BI__builtin_neon_vclez_f16, NEON::BI__builtin_neon_vclez_v, },
1451   { NEON::BI__builtin_neon_vclezq_f16, NEON::BI__builtin_neon_vclezq_v, },
1452   { NEON::BI__builtin_neon_vcltz_f16, NEON::BI__builtin_neon_vcltz_v, },
1453   { NEON::BI__builtin_neon_vcltzq_f16, NEON::BI__builtin_neon_vcltzq_v, },
1454   { NEON::BI__builtin_neon_vfma_f16, NEON::BI__builtin_neon_vfma_v, },
1455   { NEON::BI__builtin_neon_vfma_lane_f16, NEON::BI__builtin_neon_vfma_lane_v, },
1456   { NEON::BI__builtin_neon_vfma_laneq_f16, NEON::BI__builtin_neon_vfma_laneq_v, },
1457   { NEON::BI__builtin_neon_vfmaq_f16, NEON::BI__builtin_neon_vfmaq_v, },
1458   { NEON::BI__builtin_neon_vfmaq_lane_f16, NEON::BI__builtin_neon_vfmaq_lane_v, },
1459   { NEON::BI__builtin_neon_vfmaq_laneq_f16, NEON::BI__builtin_neon_vfmaq_laneq_v, },
1460   { NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v },
1461   { NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v },
1462   { NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v },
1463   { NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v },
1464   { NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v },
1465   { NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v },
1466   { NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v },
1467   { NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v },
1468   { NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v },
1469   { NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v },
1470   { NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v },
1471   { NEON::BI__builtin_neon_vld1q_lane_bf16, NEON::BI__builtin_neon_vld1q_lane_v },
1472   { NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v },
1473   { NEON::BI__builtin_neon_vld2_dup_bf16, NEON::BI__builtin_neon_vld2_dup_v },
1474   { NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v },
1475   { NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v },
1476   { NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v },
1477   { NEON::BI__builtin_neon_vld2q_lane_bf16, NEON::BI__builtin_neon_vld2q_lane_v },
1478   { NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v },
1479   { NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v },
1480   { NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v },
1481   { NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v },
1482   { NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v },
1483   { NEON::BI__builtin_neon_vld3q_lane_bf16, NEON::BI__builtin_neon_vld3q_lane_v },
1484   { NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v },
1485   { NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v },
1486   { NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v },
1487   { NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v },
1488   { NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v },
1489   { NEON::BI__builtin_neon_vld4q_lane_bf16, NEON::BI__builtin_neon_vld4q_lane_v },
1490   { NEON::BI__builtin_neon_vmax_f16, NEON::BI__builtin_neon_vmax_v, },
1491   { NEON::BI__builtin_neon_vmaxnm_f16, NEON::BI__builtin_neon_vmaxnm_v, },
1492   { NEON::BI__builtin_neon_vmaxnmq_f16, NEON::BI__builtin_neon_vmaxnmq_v, },
1493   { NEON::BI__builtin_neon_vmaxq_f16, NEON::BI__builtin_neon_vmaxq_v, },
1494   { NEON::BI__builtin_neon_vmin_f16, NEON::BI__builtin_neon_vmin_v, },
1495   { NEON::BI__builtin_neon_vminnm_f16, NEON::BI__builtin_neon_vminnm_v, },
1496   { NEON::BI__builtin_neon_vminnmq_f16, NEON::BI__builtin_neon_vminnmq_v, },
1497   { NEON::BI__builtin_neon_vminq_f16, NEON::BI__builtin_neon_vminq_v, },
1498   { NEON::BI__builtin_neon_vmulx_f16, NEON::BI__builtin_neon_vmulx_v, },
1499   { NEON::BI__builtin_neon_vmulxq_f16, NEON::BI__builtin_neon_vmulxq_v, },
1500   { NEON::BI__builtin_neon_vpadd_f16, NEON::BI__builtin_neon_vpadd_v, },
1501   { NEON::BI__builtin_neon_vpaddq_f16, NEON::BI__builtin_neon_vpaddq_v, },
1502   { NEON::BI__builtin_neon_vpmax_f16, NEON::BI__builtin_neon_vpmax_v, },
1503   { NEON::BI__builtin_neon_vpmaxnm_f16, NEON::BI__builtin_neon_vpmaxnm_v, },
1504   { NEON::BI__builtin_neon_vpmaxnmq_f16, NEON::BI__builtin_neon_vpmaxnmq_v, },
1505   { NEON::BI__builtin_neon_vpmaxq_f16, NEON::BI__builtin_neon_vpmaxq_v, },
1506   { NEON::BI__builtin_neon_vpmin_f16, NEON::BI__builtin_neon_vpmin_v, },
1507   { NEON::BI__builtin_neon_vpminnm_f16, NEON::BI__builtin_neon_vpminnm_v, },
1508   { NEON::BI__builtin_neon_vpminnmq_f16, NEON::BI__builtin_neon_vpminnmq_v, },
1509   { NEON::BI__builtin_neon_vpminq_f16, NEON::BI__builtin_neon_vpminq_v, },
1510   { NEON::BI__builtin_neon_vrecpe_f16, NEON::BI__builtin_neon_vrecpe_v, },
1511   { NEON::BI__builtin_neon_vrecpeq_f16, NEON::BI__builtin_neon_vrecpeq_v, },
1512   { NEON::BI__builtin_neon_vrecps_f16, NEON::BI__builtin_neon_vrecps_v, },
1513   { NEON::BI__builtin_neon_vrecpsq_f16, NEON::BI__builtin_neon_vrecpsq_v, },
1514   { NEON::BI__builtin_neon_vrnd_f16, NEON::BI__builtin_neon_vrnd_v, },
1515   { NEON::BI__builtin_neon_vrnda_f16, NEON::BI__builtin_neon_vrnda_v, },
1516   { NEON::BI__builtin_neon_vrndaq_f16, NEON::BI__builtin_neon_vrndaq_v, },
1517   { NEON::BI__builtin_neon_vrndi_f16, NEON::BI__builtin_neon_vrndi_v, },
1518   { NEON::BI__builtin_neon_vrndiq_f16, NEON::BI__builtin_neon_vrndiq_v, },
1519   { NEON::BI__builtin_neon_vrndm_f16, NEON::BI__builtin_neon_vrndm_v, },
1520   { NEON::BI__builtin_neon_vrndmq_f16, NEON::BI__builtin_neon_vrndmq_v, },
1521   { NEON::BI__builtin_neon_vrndn_f16, NEON::BI__builtin_neon_vrndn_v, },
1522   { NEON::BI__builtin_neon_vrndnq_f16, NEON::BI__builtin_neon_vrndnq_v, },
1523   { NEON::BI__builtin_neon_vrndp_f16, NEON::BI__builtin_neon_vrndp_v, },
1524   { NEON::BI__builtin_neon_vrndpq_f16, NEON::BI__builtin_neon_vrndpq_v, },
1525   { NEON::BI__builtin_neon_vrndq_f16, NEON::BI__builtin_neon_vrndq_v, },
1526   { NEON::BI__builtin_neon_vrndx_f16, NEON::BI__builtin_neon_vrndx_v, },
1527   { NEON::BI__builtin_neon_vrndxq_f16, NEON::BI__builtin_neon_vrndxq_v, },
1528   { NEON::BI__builtin_neon_vrsqrte_f16, NEON::BI__builtin_neon_vrsqrte_v, },
1529   { NEON::BI__builtin_neon_vrsqrteq_f16, NEON::BI__builtin_neon_vrsqrteq_v, },
1530   { NEON::BI__builtin_neon_vrsqrts_f16, NEON::BI__builtin_neon_vrsqrts_v, },
1531   { NEON::BI__builtin_neon_vrsqrtsq_f16, NEON::BI__builtin_neon_vrsqrtsq_v, },
1532   { NEON::BI__builtin_neon_vsqrt_f16, NEON::BI__builtin_neon_vsqrt_v, },
1533   { NEON::BI__builtin_neon_vsqrtq_f16, NEON::BI__builtin_neon_vsqrtq_v, },
1534   { NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v },
1535   { NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v },
1536   { NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v },
1537   { NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v },
1538   { NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v },
1539   { NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v },
1540   { NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v },
1541   { NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v },
1542   { NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v },
1543   { NEON::BI__builtin_neon_vst1q_lane_bf16, NEON::BI__builtin_neon_vst1q_lane_v },
1544   { NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v },
1545   { NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v },
1546   { NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v },
1547   { NEON::BI__builtin_neon_vst2q_lane_bf16, NEON::BI__builtin_neon_vst2q_lane_v },
1548   { NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v },
1549   { NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v },
1550   { NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v },
1551   { NEON::BI__builtin_neon_vst3q_lane_bf16, NEON::BI__builtin_neon_vst3q_lane_v },
1552   { NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v },
1553   { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v },
1554   { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v },
1555   { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v },
1556   // The mangling rules cause us to have one ID for each type for vldap1(q)_lane
1557   // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an
1558   // arbitrary one to be handled as the canonical variant.
1559   { NEON::BI__builtin_neon_vldap1_lane_u64, NEON::BI__builtin_neon_vldap1_lane_s64 },
1560   { NEON::BI__builtin_neon_vldap1_lane_f64, NEON::BI__builtin_neon_vldap1_lane_s64 },
1561   { NEON::BI__builtin_neon_vldap1_lane_p64, NEON::BI__builtin_neon_vldap1_lane_s64 },
1562   { NEON::BI__builtin_neon_vldap1q_lane_u64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
1563   { NEON::BI__builtin_neon_vldap1q_lane_f64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
1564   { NEON::BI__builtin_neon_vldap1q_lane_p64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
1565   { NEON::BI__builtin_neon_vstl1_lane_u64, NEON::BI__builtin_neon_vstl1_lane_s64 },
1566   { NEON::BI__builtin_neon_vstl1_lane_f64, NEON::BI__builtin_neon_vstl1_lane_s64 },
1567   { NEON::BI__builtin_neon_vstl1_lane_p64, NEON::BI__builtin_neon_vstl1_lane_s64 },
1568   { NEON::BI__builtin_neon_vstl1q_lane_u64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
1569   { NEON::BI__builtin_neon_vstl1q_lane_f64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
1570   { NEON::BI__builtin_neon_vstl1q_lane_p64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
1571 };
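
     // Before the main table search, a builtin ID found on the left-hand side
     // here is rewritten to the canonical ID on the right, so only the
     // canonical variant needs entries in the intrinsic maps above.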
1572 
1573 #undef NEONMAP0
1574 #undef NEONMAP1
1575 #undef NEONMAP2
1576 
1577 #define SVEMAP1(NameBase, LLVMIntrinsic, TypeModifier)                         \
1578   {                                                                            \
1579     #NameBase, SVE::BI__builtin_sve_##NameBase, Intrinsic::LLVMIntrinsic, 0,   \
1580         TypeModifier                                                           \
1581   }
1582 
1583 #define SVEMAP2(NameBase, TypeModifier)                                        \
1584   { #NameBase, SVE::BI__builtin_sve_##NameBase, 0, 0, TypeModifier }
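
     // For reference, a hypothetical entry SVEMAP1(svfoo, aarch64_sve_foo, M)
     // would expand to
     //   { "svfoo", SVE::BI__builtin_sve_svfoo, Intrinsic::aarch64_sve_foo, 0, M }
     // while SVEMAP2(svfoo, M) leaves the intrinsic slot zero, for builtins that
     // get custom handling in codegen.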
1585 static const ARMVectorIntrinsicInfo AArch64SVEIntrinsicMap[] = {
1586 #define GET_SVE_LLVM_INTRINSIC_MAP
1587 #include "clang/Basic/arm_sve_builtin_cg.inc"
1588 #include "clang/Basic/BuiltinsAArch64NeonSVEBridge_cg.def"
1589 #undef GET_SVE_LLVM_INTRINSIC_MAP
1590 };
1591 
1592 #undef SVEMAP1
1593 #undef SVEMAP2
1594 
1595 #define SMEMAP1(NameBase, LLVMIntrinsic, TypeModifier)                         \
1596   {                                                                            \
1597     #NameBase, SME::BI__builtin_sme_##NameBase, Intrinsic::LLVMIntrinsic, 0,   \
1598         TypeModifier                                                           \
1599   }
1600 
1601 #define SMEMAP2(NameBase, TypeModifier)                                        \
1602   { #NameBase, SME::BI__builtin_sme_##NameBase, 0, 0, TypeModifier }
1603 static const ARMVectorIntrinsicInfo AArch64SMEIntrinsicMap[] = {
1604 #define GET_SME_LLVM_INTRINSIC_MAP
1605 #include "clang/Basic/arm_sme_builtin_cg.inc"
1606 #undef GET_SME_LLVM_INTRINSIC_MAP
1607 };
1608 
1609 #undef SMEMAP1
1610 #undef SMEMAP2
1611 
1612 static bool NEONSIMDIntrinsicsProvenSorted = false;
1613 
1614 static bool AArch64SIMDIntrinsicsProvenSorted = false;
1615 static bool AArch64SISDIntrinsicsProvenSorted = false;
1616 static bool AArch64SVEIntrinsicsProvenSorted = false;
1617 static bool AArch64SMEIntrinsicsProvenSorted = false;
1618 
1619 static const ARMVectorIntrinsicInfo *
1620 findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> IntrinsicMap,
1621                             unsigned BuiltinID, bool &MapProvenSorted) {
1622 
1623 #ifndef NDEBUG
1624   if (!MapProvenSorted) {
1625     assert(llvm::is_sorted(IntrinsicMap));
1626     MapProvenSorted = true;
1627   }
1628 #endif
1629 
1630   const ARMVectorIntrinsicInfo *Builtin =
1631       llvm::lower_bound(IntrinsicMap, BuiltinID);
1632 
1633   if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
1634     return Builtin;
1635 
1636   return nullptr;
1637 }
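
     // Each map must stay sorted by BuiltinID for the lower_bound above; the
     // *ProvenSorted flags merely cache the debug-only is_sorted assertion so
     // every table is checked at most once per process.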
1638 
1639 Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
1640                                                    unsigned Modifier,
1641                                                    llvm::Type *ArgType,
1642                                                    const CallExpr *E) {
1643   int VectorSize = 0;
1644   if (Modifier & Use64BitVectors)
1645     VectorSize = 64;
1646   else if (Modifier & Use128BitVectors)
1647     VectorSize = 128;
1648 
1649   // Return type.
1650   SmallVector<llvm::Type *, 3> Tys;
1651   if (Modifier & AddRetType) {
1652     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
1653     if (Modifier & VectorizeRetType)
1654       Ty = llvm::FixedVectorType::get(
1655           Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);
1656 
1657     Tys.push_back(Ty);
1658   }
1659 
1660   // Arguments.
1661   if (Modifier & VectorizeArgTypes) {
1662     int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
1663     ArgType = llvm::FixedVectorType::get(ArgType, Elts);
1664   }
1665 
1666   if (Modifier & (Add1ArgType | Add2ArgTypes))
1667     Tys.push_back(ArgType);
1668 
1669   if (Modifier & Add2ArgTypes)
1670     Tys.push_back(ArgType);
1671 
1672   if (Modifier & InventFloatType)
1673     Tys.push_back(FloatTy);
1674 
1675   return CGM.getIntrinsic(IntrinsicID, Tys);
1676 }
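
     // Worked example: with Modifier = Vectorize1ArgType | Use64BitVectors and
     // an i16 ArgType, VectorSize is 64, so Elts = 64 / 16 = 4 and the
     // intrinsic is overloaded on <4 x i16>.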
1677 
1678 static Value *EmitCommonNeonSISDBuiltinExpr(
1679     CodeGenFunction &CGF, const ARMVectorIntrinsicInfo &SISDInfo,
1680     SmallVectorImpl<Value *> &Ops, const CallExpr *E) {
1681   unsigned BuiltinID = SISDInfo.BuiltinID;
1682   unsigned Int = SISDInfo.LLVMIntrinsic;
1683   unsigned Modifier = SISDInfo.TypeModifier;
1684   const char *s = SISDInfo.NameHint;
1685 
1686   switch (BuiltinID) {
1687   case NEON::BI__builtin_neon_vcled_s64:
1688   case NEON::BI__builtin_neon_vcled_u64:
1689   case NEON::BI__builtin_neon_vcles_f32:
1690   case NEON::BI__builtin_neon_vcled_f64:
1691   case NEON::BI__builtin_neon_vcltd_s64:
1692   case NEON::BI__builtin_neon_vcltd_u64:
1693   case NEON::BI__builtin_neon_vclts_f32:
1694   case NEON::BI__builtin_neon_vcltd_f64:
1695   case NEON::BI__builtin_neon_vcales_f32:
1696   case NEON::BI__builtin_neon_vcaled_f64:
1697   case NEON::BI__builtin_neon_vcalts_f32:
1698   case NEON::BI__builtin_neon_vcaltd_f64:
1699     // Only one direction of comparisons actually exists: cmle is a cmge
1700     // with swapped operands. The table gives us the right intrinsic, but we
1701     // still need to do the swap.
1702     std::swap(Ops[0], Ops[1]);
1703     break;
1704   }
1705 
1706   assert(Int && "Generic code assumes a valid intrinsic");
1707 
1708   // Determine the type(s) of this overloaded AArch64 intrinsic.
1709   const Expr *Arg = E->getArg(0);
1710   llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
1711   Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E);
1712 
1713   int j = 0;
1714   ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0);
1715   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
1716        ai != ae; ++ai, ++j) {
1717     llvm::Type *ArgTy = ai->getType();
1718     if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
1719              ArgTy->getPrimitiveSizeInBits())
1720       continue;
1721 
1722     assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy());
1723     // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate
1724     // it before inserting.
1725     Ops[j] = CGF.Builder.CreateTruncOrBitCast(
1726         Ops[j], cast<llvm::VectorType>(ArgTy)->getElementType());
1727     Ops[j] =
1728         CGF.Builder.CreateInsertElement(PoisonValue::get(ArgTy), Ops[j], C0);
1729   }
1730 
1731   Value *Result = CGF.EmitNeonCall(F, Ops, s);
1732   llvm::Type *ResultType = CGF.ConvertType(E->getType());
1733   if (ResultType->getPrimitiveSizeInBits().getFixedValue() <
1734       Result->getType()->getPrimitiveSizeInBits().getFixedValue())
1735     return CGF.Builder.CreateExtractElement(Result, C0);
1736 
1737   return CGF.Builder.CreateBitCast(Result, ResultType, s);
1738 }
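
     // A sketch of the IR this produces for a widened scalar op such as
     // vqaddh_s16 (illustrative shape, not verbatim compiler output):
     //   %l = insertelement <4 x i16> poison, i16 %a, i64 0
     //   %r = insertelement <4 x i16> poison, i16 %b, i64 0
     //   %v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %l, <4 x i16> %r)
     //   %s = extractelement <4 x i16> %v, i64 0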
1739 
1740 Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
1741     unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
1742     const char *NameHint, unsigned Modifier, const CallExpr *E,
1743     SmallVectorImpl<llvm::Value *> &Ops, Address PtrOp0, Address PtrOp1,
1744     llvm::Triple::ArchType Arch) {
1745   // Get the last argument, which specifies the vector type.
1746   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
1747   std::optional<llvm::APSInt> NeonTypeConst =
1748       Arg->getIntegerConstantExpr(getContext());
1749   if (!NeonTypeConst)
1750     return nullptr;
1751 
1752   // Determine the type of this overloaded NEON intrinsic.
1753   NeonTypeFlags Type(NeonTypeConst->getZExtValue());
1754   const bool Usgn = Type.isUnsigned();
1755   const bool Quad = Type.isQuad();
1756   const bool Floating = Type.isFloatingPoint();
1757   const bool HasLegalHalfType = getTarget().hasLegalHalfType();
1758   const bool AllowBFloatArgsAndRet =
1759       getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
1760 
1761   llvm::FixedVectorType *VTy =
1762       GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet);
1763   llvm::Type *Ty = VTy;
1764   if (!Ty)
1765     return nullptr;
1766 
1767   auto getAlignmentValue32 = [&](Address addr) -> Value* {
1768     return Builder.getInt32(addr.getAlignment().getQuantity());
1769   };
1770 
1771   unsigned Int = LLVMIntrinsic;
1772   if ((Modifier & UnsignedAlts) && !Usgn)
1773     Int = AltLLVMIntrinsic;
1774 
1775   switch (BuiltinID) {
1776   default: break;
1777   case NEON::BI__builtin_neon_splat_lane_v:
1778   case NEON::BI__builtin_neon_splat_laneq_v:
1779   case NEON::BI__builtin_neon_splatq_lane_v:
1780   case NEON::BI__builtin_neon_splatq_laneq_v: {
1781     auto NumElements = VTy->getElementCount();
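         // splatq_lane_v broadcasts a lane of a 64-bit vector into a 128-bit
         // result, so the element count doubles; splat_laneq_v selects a lane
         // of a 128-bit vector for a 64-bit result, so it halves.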
1782     if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v)
1783       NumElements = NumElements * 2;
1784     if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v)
1785       NumElements = NumElements.divideCoefficientBy(2);
1786 
1787     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
1788     return EmitNeonSplat(Ops[0], cast<ConstantInt>(Ops[1]), NumElements);
1789   }
1790   case NEON::BI__builtin_neon_vpadd_v:
1791   case NEON::BI__builtin_neon_vpaddq_v:
1792     // We don't allow fp/int overloading of intrinsics.
1793     if (VTy->getElementType()->isFloatingPointTy() &&
1794         Int == Intrinsic::aarch64_neon_addp)
1795       Int = Intrinsic::aarch64_neon_faddp;
1796     break;
1797   case NEON::BI__builtin_neon_vabs_v:
1798   case NEON::BI__builtin_neon_vabsq_v:
1799     if (VTy->getElementType()->isFloatingPointTy())
1800       return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
1801     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
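       // vadd on polynomial types: polynomial addition is carry-less (addition
       // in GF(2)), hence the xor below; non-polynomial vadd is expected to be
       // emitted as a plain add and not reach this builtin.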
1802   case NEON::BI__builtin_neon_vadd_v:
1803   case NEON::BI__builtin_neon_vaddq_v: {
1804     llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, Quad ? 16 : 8);
1805     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
1806     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
1807     Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
1808     return Builder.CreateBitCast(Ops[0], Ty);
1809   }
1810   case NEON::BI__builtin_neon_vaddhn_v: {
1811     llvm::FixedVectorType *SrcTy =
1812         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
1813 
1814     // %sum = add <4 x i32> %lhs, %rhs
1815     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
1816     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
1817     Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");
1818 
1819     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
1820     Constant *ShiftAmt =
1821         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
1822     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");
1823 
1824     // %res = trunc <4 x i32> %high to <4 x i16>
1825     return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
1826   }
1827   case NEON::BI__builtin_neon_vcale_v:
1828   case NEON::BI__builtin_neon_vcaleq_v:
1829   case NEON::BI__builtin_neon_vcalt_v:
1830   case NEON::BI__builtin_neon_vcaltq_v:
1831     std::swap(Ops[0], Ops[1]);
1832     [[fallthrough]];
1833   case NEON::BI__builtin_neon_vcage_v:
1834   case NEON::BI__builtin_neon_vcageq_v:
1835   case NEON::BI__builtin_neon_vcagt_v:
1836   case NEON::BI__builtin_neon_vcagtq_v: {
1837     llvm::Type *Ty;
1838     switch (VTy->getScalarSizeInBits()) {
1839     default: llvm_unreachable("unexpected type");
1840     case 32:
1841       Ty = FloatTy;
1842       break;
1843     case 64:
1844       Ty = DoubleTy;
1845       break;
1846     case 16:
1847       Ty = HalfTy;
1848       break;
1849     }
1850     auto *VecFlt = llvm::FixedVectorType::get(Ty, VTy->getNumElements());
1851     llvm::Type *Tys[] = { VTy, VecFlt };
1852     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
1853     return EmitNeonCall(F, Ops, NameHint);
1854   }
1855   case NEON::BI__builtin_neon_vceqz_v:
1856   case NEON::BI__builtin_neon_vceqzq_v:
1857     return EmitAArch64CompareBuiltinExpr(
1858         Ops[0], Ty, Floating ? ICmpInst::FCMP_OEQ : ICmpInst::ICMP_EQ, "vceqz");
1859   case NEON::BI__builtin_neon_vcgez_v:
1860   case NEON::BI__builtin_neon_vcgezq_v:
1861     return EmitAArch64CompareBuiltinExpr(
1862         Ops[0], Ty, Floating ? ICmpInst::FCMP_OGE : ICmpInst::ICMP_SGE,
1863         "vcgez");
1864   case NEON::BI__builtin_neon_vclez_v:
1865   case NEON::BI__builtin_neon_vclezq_v:
1866     return EmitAArch64CompareBuiltinExpr(
1867         Ops[0], Ty, Floating ? ICmpInst::FCMP_OLE : ICmpInst::ICMP_SLE,
1868         "vclez");
1869   case NEON::BI__builtin_neon_vcgtz_v:
1870   case NEON::BI__builtin_neon_vcgtzq_v:
1871     return EmitAArch64CompareBuiltinExpr(
1872         Ops[0], Ty, Floating ? ICmpInst::FCMP_OGT : ICmpInst::ICMP_SGT,
1873         "vcgtz");
1874   case NEON::BI__builtin_neon_vcltz_v:
1875   case NEON::BI__builtin_neon_vcltzq_v:
1876     return EmitAArch64CompareBuiltinExpr(
1877         Ops[0], Ty, Floating ? ICmpInst::FCMP_OLT : ICmpInst::ICMP_SLT,
1878         "vcltz");
1879   case NEON::BI__builtin_neon_vclz_v:
1880   case NEON::BI__builtin_neon_vclzq_v:
1881     // We generate a target-independent intrinsic, which needs a second
1882     // argument indicating whether or not clz of zero is undefined; on ARM it isn't.
1883     Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
1884     break;
1885   case NEON::BI__builtin_neon_vcvt_f32_v:
1886   case NEON::BI__builtin_neon_vcvtq_f32_v:
1887     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
1888     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad),
1889                      HasLegalHalfType);
1890     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
1891                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
1892   case NEON::BI__builtin_neon_vcvt_f16_s16:
1893   case NEON::BI__builtin_neon_vcvt_f16_u16:
1894   case NEON::BI__builtin_neon_vcvtq_f16_s16:
1895   case NEON::BI__builtin_neon_vcvtq_f16_u16:
1896     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
1897     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad),
1898                      HasLegalHalfType);
1899     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
1900                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
1901   case NEON::BI__builtin_neon_vcvt_n_f16_s16:
1902   case NEON::BI__builtin_neon_vcvt_n_f16_u16:
1903   case NEON::BI__builtin_neon_vcvtq_n_f16_s16:
1904   case NEON::BI__builtin_neon_vcvtq_n_f16_u16: {
1905     llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
1906     Function *F = CGM.getIntrinsic(Int, Tys);
1907     return EmitNeonCall(F, Ops, "vcvt_n");
1908   }
1909   case NEON::BI__builtin_neon_vcvt_n_f32_v:
1910   case NEON::BI__builtin_neon_vcvt_n_f64_v:
1911   case NEON::BI__builtin_neon_vcvtq_n_f32_v:
1912   case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
1913     llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
1914     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
1915     Function *F = CGM.getIntrinsic(Int, Tys);
1916     return EmitNeonCall(F, Ops, "vcvt_n");
1917   }
1918   case NEON::BI__builtin_neon_vcvt_n_s16_f16:
1919   case NEON::BI__builtin_neon_vcvt_n_s32_v:
1920   case NEON::BI__builtin_neon_vcvt_n_u16_f16:
1921   case NEON::BI__builtin_neon_vcvt_n_u32_v:
1922   case NEON::BI__builtin_neon_vcvt_n_s64_v:
1923   case NEON::BI__builtin_neon_vcvt_n_u64_v:
1924   case NEON::BI__builtin_neon_vcvtq_n_s16_f16:
1925   case NEON::BI__builtin_neon_vcvtq_n_s32_v:
1926   case NEON::BI__builtin_neon_vcvtq_n_u16_f16:
1927   case NEON::BI__builtin_neon_vcvtq_n_u32_v:
1928   case NEON::BI__builtin_neon_vcvtq_n_s64_v:
1929   case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
1930     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
1931     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
1932     return EmitNeonCall(F, Ops, "vcvt_n");
1933   }
1934   case NEON::BI__builtin_neon_vcvt_s32_v:
1935   case NEON::BI__builtin_neon_vcvt_u32_v:
1936   case NEON::BI__builtin_neon_vcvt_s64_v:
1937   case NEON::BI__builtin_neon_vcvt_u64_v:
1938   case NEON::BI__builtin_neon_vcvt_s16_f16:
1939   case NEON::BI__builtin_neon_vcvt_u16_f16:
1940   case NEON::BI__builtin_neon_vcvtq_s32_v:
1941   case NEON::BI__builtin_neon_vcvtq_u32_v:
1942   case NEON::BI__builtin_neon_vcvtq_s64_v:
1943   case NEON::BI__builtin_neon_vcvtq_u64_v:
1944   case NEON::BI__builtin_neon_vcvtq_s16_f16:
1945   case NEON::BI__builtin_neon_vcvtq_u16_f16: {
1946     Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
1947     return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
1948                 : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
1949   }
1950   case NEON::BI__builtin_neon_vcvta_s16_f16:
1951   case NEON::BI__builtin_neon_vcvta_s32_v:
1952   case NEON::BI__builtin_neon_vcvta_s64_v:
1953   case NEON::BI__builtin_neon_vcvta_u16_f16:
1954   case NEON::BI__builtin_neon_vcvta_u32_v:
1955   case NEON::BI__builtin_neon_vcvta_u64_v:
1956   case NEON::BI__builtin_neon_vcvtaq_s16_f16:
1957   case NEON::BI__builtin_neon_vcvtaq_s32_v:
1958   case NEON::BI__builtin_neon_vcvtaq_s64_v:
1959   case NEON::BI__builtin_neon_vcvtaq_u16_f16:
1960   case NEON::BI__builtin_neon_vcvtaq_u32_v:
1961   case NEON::BI__builtin_neon_vcvtaq_u64_v:
1962   case NEON::BI__builtin_neon_vcvtn_s16_f16:
1963   case NEON::BI__builtin_neon_vcvtn_s32_v:
1964   case NEON::BI__builtin_neon_vcvtn_s64_v:
1965   case NEON::BI__builtin_neon_vcvtn_u16_f16:
1966   case NEON::BI__builtin_neon_vcvtn_u32_v:
1967   case NEON::BI__builtin_neon_vcvtn_u64_v:
1968   case NEON::BI__builtin_neon_vcvtnq_s16_f16:
1969   case NEON::BI__builtin_neon_vcvtnq_s32_v:
1970   case NEON::BI__builtin_neon_vcvtnq_s64_v:
1971   case NEON::BI__builtin_neon_vcvtnq_u16_f16:
1972   case NEON::BI__builtin_neon_vcvtnq_u32_v:
1973   case NEON::BI__builtin_neon_vcvtnq_u64_v:
1974   case NEON::BI__builtin_neon_vcvtp_s16_f16:
1975   case NEON::BI__builtin_neon_vcvtp_s32_v:
1976   case NEON::BI__builtin_neon_vcvtp_s64_v:
1977   case NEON::BI__builtin_neon_vcvtp_u16_f16:
1978   case NEON::BI__builtin_neon_vcvtp_u32_v:
1979   case NEON::BI__builtin_neon_vcvtp_u64_v:
1980   case NEON::BI__builtin_neon_vcvtpq_s16_f16:
1981   case NEON::BI__builtin_neon_vcvtpq_s32_v:
1982   case NEON::BI__builtin_neon_vcvtpq_s64_v:
1983   case NEON::BI__builtin_neon_vcvtpq_u16_f16:
1984   case NEON::BI__builtin_neon_vcvtpq_u32_v:
1985   case NEON::BI__builtin_neon_vcvtpq_u64_v:
1986   case NEON::BI__builtin_neon_vcvtm_s16_f16:
1987   case NEON::BI__builtin_neon_vcvtm_s32_v:
1988   case NEON::BI__builtin_neon_vcvtm_s64_v:
1989   case NEON::BI__builtin_neon_vcvtm_u16_f16:
1990   case NEON::BI__builtin_neon_vcvtm_u32_v:
1991   case NEON::BI__builtin_neon_vcvtm_u64_v:
1992   case NEON::BI__builtin_neon_vcvtmq_s16_f16:
1993   case NEON::BI__builtin_neon_vcvtmq_s32_v:
1994   case NEON::BI__builtin_neon_vcvtmq_s64_v:
1995   case NEON::BI__builtin_neon_vcvtmq_u16_f16:
1996   case NEON::BI__builtin_neon_vcvtmq_u32_v:
1997   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
1998     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
1999     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
2000   }
2001   case NEON::BI__builtin_neon_vcvtx_f32_v: {
2002     llvm::Type *Tys[2] = { VTy->getTruncatedElementVectorType(VTy), Ty };
2003     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
2004 
2005   }
2006   case NEON::BI__builtin_neon_vext_v:
2007   case NEON::BI__builtin_neon_vextq_v: {
2008     int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
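         // e.g. CV == 3 with <8 x i8> operands yields indices <3 .. 10>: the
         // last five lanes of Ops[0] followed by the first three of Ops[1].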
2009     SmallVector<int, 16> Indices;
2010     for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
2011       Indices.push_back(i+CV);
2012 
2013     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
2014     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
2015     return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext");
2016   }
2017   case NEON::BI__builtin_neon_vfma_v:
2018   case NEON::BI__builtin_neon_vfmaq_v: {
2019     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
2020     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
2021     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
2022 
2023     // The NEON intrinsic puts the accumulator first, unlike the LLVM fma.
2024     return emitCallMaybeConstrainedFPBuiltin(
2025         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
2026         {Ops[1], Ops[2], Ops[0]});
2027   }
2028   case NEON::BI__builtin_neon_vld1_v:
2029   case NEON::BI__builtin_neon_vld1q_v: {
2030     llvm::Type *Tys[] = {Ty, Int8PtrTy};
2031     Ops.push_back(getAlignmentValue32(PtrOp0));
2032     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1");
2033   }
2034   case NEON::BI__builtin_neon_vld1_x2_v:
2035   case NEON::BI__builtin_neon_vld1q_x2_v:
2036   case NEON::BI__builtin_neon_vld1_x3_v:
2037   case NEON::BI__builtin_neon_vld1q_x3_v:
2038   case NEON::BI__builtin_neon_vld1_x4_v:
2039   case NEON::BI__builtin_neon_vld1q_x4_v: {
2040     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
2041     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
2042     Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
2043     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
2044   }
2045   case NEON::BI__builtin_neon_vld2_v:
2046   case NEON::BI__builtin_neon_vld2q_v:
2047   case NEON::BI__builtin_neon_vld3_v:
2048   case NEON::BI__builtin_neon_vld3q_v:
2049   case NEON::BI__builtin_neon_vld4_v:
2050   case NEON::BI__builtin_neon_vld4q_v:
2051   case NEON::BI__builtin_neon_vld2_dup_v:
2052   case NEON::BI__builtin_neon_vld2q_dup_v:
2053   case NEON::BI__builtin_neon_vld3_dup_v:
2054   case NEON::BI__builtin_neon_vld3q_dup_v:
2055   case NEON::BI__builtin_neon_vld4_dup_v:
2056   case NEON::BI__builtin_neon_vld4q_dup_v: {
2057     llvm::Type *Tys[] = {Ty, Int8PtrTy};
2058     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
2059     Value *Align = getAlignmentValue32(PtrOp1);
2060     Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, NameHint);
2061     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
2062   }
2063   case NEON::BI__builtin_neon_vld1_dup_v:
2064   case NEON::BI__builtin_neon_vld1q_dup_v: {
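         // Load one scalar, insert it into lane 0 of a poison vector, then
         // broadcast it with EmitNeonSplat. Illustrative IR shape (a sketch):
         //   %e = load i16, ptr %p
         //   %v = insertelement <4 x i16> poison, i16 %e, i64 0
         //   %s = shufflevector <4 x i16> %v, <4 x i16> poison,
         //                      <4 x i32> zeroinitializer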
2065     Value *V = PoisonValue::get(Ty);
2066     PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
2067     LoadInst *Ld = Builder.CreateLoad(PtrOp0);
2068     llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
2069     Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
2070     return EmitNeonSplat(Ops[0], CI);
2071   }
2072   case NEON::BI__builtin_neon_vld2_lane_v:
2073   case NEON::BI__builtin_neon_vld2q_lane_v:
2074   case NEON::BI__builtin_neon_vld3_lane_v:
2075   case NEON::BI__builtin_neon_vld3q_lane_v:
2076   case NEON::BI__builtin_neon_vld4_lane_v:
2077   case NEON::BI__builtin_neon_vld4q_lane_v: {
2078     llvm::Type *Tys[] = {Ty, Int8PtrTy};
2079     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
2080     for (unsigned I = 2; I < Ops.size() - 1; ++I)
2081       Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
2082     Ops.push_back(getAlignmentValue32(PtrOp1));
2083     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), NameHint);
2084     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
2085   }
2086   case NEON::BI__builtin_neon_vmovl_v: {
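         // Widen, e.g. <8 x i8> -> <8 x i16>, using zext for unsigned element
         // types and sext for signed ones.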
2087     llvm::FixedVectorType *DTy =
2088         llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
2089     Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
2090     if (Usgn)
2091       return Builder.CreateZExt(Ops[0], Ty, "vmovl");
2092     return Builder.CreateSExt(Ops[0], Ty, "vmovl");
2093   }
2094   case NEON::BI__builtin_neon_vmovn_v: {
2095     llvm::FixedVectorType *QTy =
2096         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
2097     Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
2098     return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
2099   }
2100   case NEON::BI__builtin_neon_vmull_v:
2101     // FIXME: the integer vmull operations could be emitted in terms of pure
2102     // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
2103     // hoisting the exts outside loops. Until GlobalISel comes along and can
2104     // see through such movement, this leads to bad CodeGen, so we need an
2105     // intrinsic for now.
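     // A sketch of the pure-IR alternative for, e.g., vmull_s16(a, b):
     //   %va = sext <4 x i16> %a to <4 x i32>
     //   %vb = sext <4 x i16> %b to <4 x i32>
     //   %r  = mul <4 x i32> %va, %vb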
2106     Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
2107     Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
2108     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
2109   case NEON::BI__builtin_neon_vpadal_v:
2110   case NEON::BI__builtin_neon_vpadalq_v: {
2111     // The source operand type has twice as many elements of half the size.
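     // e.g. a <4 x i32> result/accumulator pairs with an <8 x i16> source.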
2112     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
2113     llvm::Type *EltTy =
2114         llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
2115     auto *NarrowTy =
2116         llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
2117     llvm::Type *Tys[2] = { Ty, NarrowTy };
2118     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
2119   }
2120   case NEON::BI__builtin_neon_vpaddl_v:
2121   case NEON::BI__builtin_neon_vpaddlq_v: {
2122     // The source operand type has twice as many elements of half the size.
2123     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
2124     llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
2125     auto *NarrowTy =
2126         llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
2127     llvm::Type *Tys[2] = { Ty, NarrowTy };
2128     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
2129   }
2130   case NEON::BI__builtin_neon_vqdmlal_v:
2131   case NEON::BI__builtin_neon_vqdmlsl_v: {
2132     SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
2133     Ops[1] =
2134         EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), MulOps, "vqdmlal");
2135     Ops.resize(2);
2136     return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
2137   }
2138   case NEON::BI__builtin_neon_vqdmulhq_lane_v:
2139   case NEON::BI__builtin_neon_vqdmulh_lane_v:
2140   case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
2141   case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
2142     auto *RTy = cast<llvm::FixedVectorType>(Ty);
2143     if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v ||
2144         BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v)
2145       RTy = llvm::FixedVectorType::get(RTy->getElementType(),
2146                                        RTy->getNumElements() * 2);
2147     llvm::Type *Tys[2] = {
2148         RTy, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
2149                                              /*isQuad*/ false))};
2150     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
2151   }
2152   case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
2153   case NEON::BI__builtin_neon_vqdmulh_laneq_v:
2154   case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
2155   case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
2156     llvm::Type *Tys[2] = {
2157         Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
2158                                             /*isQuad*/ true))};
2159     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
2160   }
2161   case NEON::BI__builtin_neon_vqshl_n_v:
2162   case NEON::BI__builtin_neon_vqshlq_n_v:
2163     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
2164                         1, false);
2165   case NEON::BI__builtin_neon_vqshlu_n_v:
2166   case NEON::BI__builtin_neon_vqshluq_n_v:
2167     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n",
2168                         1, false);
2169   case NEON::BI__builtin_neon_vrecpe_v:
2170   case NEON::BI__builtin_neon_vrecpeq_v:
2171   case NEON::BI__builtin_neon_vrsqrte_v:
2172   case NEON::BI__builtin_neon_vrsqrteq_v:
2173     Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
2174     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
2175   case NEON::BI__builtin_neon_vrndi_v:
2176   case NEON::BI__builtin_neon_vrndiq_v:
2177     Int = Builder.getIsFPConstrained()
2178               ? Intrinsic::experimental_constrained_nearbyint
2179               : Intrinsic::nearbyint;
2180     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
2181   case NEON::BI__builtin_neon_vrshr_n_v:
2182   case NEON::BI__builtin_neon_vrshrq_n_v:
2183     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n",
2184                         1, true);
2185   case NEON::BI__builtin_neon_vsha512hq_u64:
2186   case NEON::BI__builtin_neon_vsha512h2q_u64:
2187   case NEON::BI__builtin_neon_vsha512su0q_u64:
2188   case NEON::BI__builtin_neon_vsha512su1q_u64: {
2189     Function *F = CGM.getIntrinsic(Int);
2190     return EmitNeonCall(F, Ops, "");
2191   }
2192   case NEON::BI__builtin_neon_vshl_n_v:
2193   case NEON::BI__builtin_neon_vshlq_n_v:
2194     Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
2195     return Builder.CreateShl(Builder.CreateBitCast(Ops[0], Ty), Ops[1],
2196                              "vshl_n");
2197   case NEON::BI__builtin_neon_vshll_n_v: {
2198     llvm::FixedVectorType *SrcTy =
2199         llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
2200     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
2201     if (Usgn)
2202       Ops[0] = Builder.CreateZExt(Ops[0], VTy);
2203     else
2204       Ops[0] = Builder.CreateSExt(Ops[0], VTy);
2205     Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
2206     return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
2207   }
2208   case NEON::BI__builtin_neon_vshrn_n_v: {
2209     llvm::FixedVectorType *SrcTy =
2210         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
2211     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
2212     Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false);
2213     if (Usgn)
2214       Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]);
2215     else
2216       Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]);
2217     return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n");
2218   }
2219   case NEON::BI__builtin_neon_vshr_n_v:
2220   case NEON::BI__builtin_neon_vshrq_n_v:
2221     return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n");
2222   case NEON::BI__builtin_neon_vst1_v:
2223   case NEON::BI__builtin_neon_vst1q_v:
2224   case NEON::BI__builtin_neon_vst2_v:
2225   case NEON::BI__builtin_neon_vst2q_v:
2226   case NEON::BI__builtin_neon_vst3_v:
2227   case NEON::BI__builtin_neon_vst3q_v:
2228   case NEON::BI__builtin_neon_vst4_v:
2229   case NEON::BI__builtin_neon_vst4q_v:
2230   case NEON::BI__builtin_neon_vst2_lane_v:
2231   case NEON::BI__builtin_neon_vst2q_lane_v:
2232   case NEON::BI__builtin_neon_vst3_lane_v:
2233   case NEON::BI__builtin_neon_vst3q_lane_v:
2234   case NEON::BI__builtin_neon_vst4_lane_v:
2235   case NEON::BI__builtin_neon_vst4q_lane_v: {
2236     llvm::Type *Tys[] = {Int8PtrTy, Ty};
2237     Ops.push_back(getAlignmentValue32(PtrOp0));
2238     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "");
2239   }
2240   case NEON::BI__builtin_neon_vsm3partw1q_u32:
2241   case NEON::BI__builtin_neon_vsm3partw2q_u32:
2242   case NEON::BI__builtin_neon_vsm3ss1q_u32:
2243   case NEON::BI__builtin_neon_vsm4ekeyq_u32:
2244   case NEON::BI__builtin_neon_vsm4eq_u32: {
2245     Function *F = CGM.getIntrinsic(Int);
2246     return EmitNeonCall(F, Ops, "");
2247   }
2248   case NEON::BI__builtin_neon_vsm3tt1aq_u32:
2249   case NEON::BI__builtin_neon_vsm3tt1bq_u32:
2250   case NEON::BI__builtin_neon_vsm3tt2aq_u32:
2251   case NEON::BI__builtin_neon_vsm3tt2bq_u32: {
2252     Function *F = CGM.getIntrinsic(Int);
2253     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
2254     return EmitNeonCall(F, Ops, "");
2255   }
2256   case NEON::BI__builtin_neon_vst1_x2_v:
2257   case NEON::BI__builtin_neon_vst1q_x2_v:
2258   case NEON::BI__builtin_neon_vst1_x3_v:
2259   case NEON::BI__builtin_neon_vst1q_x3_v:
2260   case NEON::BI__builtin_neon_vst1_x4_v:
2261   case NEON::BI__builtin_neon_vst1q_x4_v: {
2262     // TODO: Currently in AArch32 mode the pointer operand comes first, whereas
2263     // in AArch64 it comes last. We may want to standardize on one or the other.
2264     if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be ||
2265         Arch == llvm::Triple::aarch64_32) {
2266       llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
2267       std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
2268       return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
2269     }
2270     llvm::Type *Tys[2] = {UnqualPtrTy, VTy};
2271     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
2272   }
2273   case NEON::BI__builtin_neon_vsubhn_v: {
2274     llvm::FixedVectorType *SrcTy =
2275         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
2276 
2277     // %diff = sub <4 x i32> %lhs, %rhs
2278     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
2279     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
2280     Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn");
2281 
2282     // %high = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
2283     Constant *ShiftAmt =
2284         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
2285     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn");
2286 
2287     // %res = trunc <4 x i32> %high to <4 x i16>
2288     return Builder.CreateTrunc(Ops[0], VTy, "vsubhn");
2289   }
2290   case NEON::BI__builtin_neon_vtrn_v:
2291   case NEON::BI__builtin_neon_vtrnq_v: {
2292     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
2293     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
2294     Value *SV = nullptr;
2295 
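     // vtrn transposes 2x2 blocks: the vi==0 result interleaves the even
     // lanes of the two inputs and vi==1 the odd lanes; e.g. for 4-lane
     // vectors the shuffle masks are <0,4,2,6> and <1,5,3,7>.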
2296     for (unsigned vi = 0; vi != 2; ++vi) {
2297       SmallVector<int, 16> Indices;
2298       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
2299         Indices.push_back(i+vi);
2300         Indices.push_back(i+e+vi);
2301       }
2302       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
2303       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
2304       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
2305     }
2306     return SV;
2307   }
2308   case NEON::BI__builtin_neon_vtst_v:
2309   case NEON::BI__builtin_neon_vtstq_v: {
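     // vtst: a lane becomes all-ones if (a & b) is nonzero in that lane,
     // otherwise all-zeros (a "test bits" compare).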
2310     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
2311     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
2312     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
2313     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
2314                                 ConstantAggregateZero::get(Ty));
2315     return Builder.CreateSExt(Ops[0], Ty, "vtst");
2316   }
2317   case NEON::BI__builtin_neon_vuzp_v:
2318   case NEON::BI__builtin_neon_vuzpq_v: {
2319     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
2320     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
2321     Value *SV = nullptr;
2322 
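     // vuzp de-interleaves: vi==0 gathers the even lanes and vi==1 the odd
     // lanes of the concatenated inputs; e.g. for 4-lane vectors the shuffle
     // masks are <0,2,4,6> and <1,3,5,7>.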
2323     for (unsigned vi = 0; vi != 2; ++vi) {
2324       SmallVector<int, 16> Indices;
2325       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
2326         Indices.push_back(2*i+vi);
2327 
2328       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
2329       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
2330       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
2331     }
2332     return SV;
2333   }
2334   case NEON::BI__builtin_neon_vxarq_u64: {
2335     Function *F = CGM.getIntrinsic(Int);
2336     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
2337     return EmitNeonCall(F, Ops, "");
2338   }
2339   case NEON::BI__builtin_neon_vzip_v:
2340   case NEON::BI__builtin_neon_vzipq_v: {
2341     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
2342     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
2343     Value *SV = nullptr;
2344 
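     // vzip interleaves the low (vi==0) or high (vi==1) halves of the two
     // inputs; e.g. for 4-lane vectors the shuffle masks are <0,4,1,5> and
     // <2,6,3,7>.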
2345     for (unsigned vi = 0; vi != 2; ++vi) {
2346       SmallVector<int, 16> Indices;
2347       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
2348         Indices.push_back((i + vi*e) >> 1);
2349         Indices.push_back(((i + vi*e) >> 1)+e);
2350       }
2351       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
2352       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
2353       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
2354     }
2355     return SV;
2356   }
2357   case NEON::BI__builtin_neon_vdot_s32:
2358   case NEON::BI__builtin_neon_vdot_u32:
2359   case NEON::BI__builtin_neon_vdotq_s32:
2360   case NEON::BI__builtin_neon_vdotq_u32: {
2361     auto *InputTy =
2362         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
2363     llvm::Type *Tys[2] = { Ty, InputTy };
2364     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
2365   }
2366   case NEON::BI__builtin_neon_vfmlal_low_f16:
2367   case NEON::BI__builtin_neon_vfmlalq_low_f16: {
2368     auto *InputTy =
2369         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
2370     llvm::Type *Tys[2] = { Ty, InputTy };
2371     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
2372   }
2373   case NEON::BI__builtin_neon_vfmlsl_low_f16:
2374   case NEON::BI__builtin_neon_vfmlslq_low_f16: {
2375     auto *InputTy =
2376         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
2377     llvm::Type *Tys[2] = { Ty, InputTy };
2378     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
2379   }
2380   case NEON::BI__builtin_neon_vfmlal_high_f16:
2381   case NEON::BI__builtin_neon_vfmlalq_high_f16: {
2382     auto *InputTy =
2383         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
2384     llvm::Type *Tys[2] = { Ty, InputTy };
2385     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
2386   }
2387   case NEON::BI__builtin_neon_vfmlsl_high_f16:
2388   case NEON::BI__builtin_neon_vfmlslq_high_f16: {
2389     auto *InputTy =
2390         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
2391     llvm::Type *Tys[2] = { Ty, InputTy };
2392     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
2393   }
2394   case NEON::BI__builtin_neon_vmmlaq_s32:
2395   case NEON::BI__builtin_neon_vmmlaq_u32: {
2396     auto *InputTy =
2397         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
2398     llvm::Type *Tys[2] = { Ty, InputTy };
2399     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vmmla");
2400   }
2401   case NEON::BI__builtin_neon_vusmmlaq_s32: {
2402     auto *InputTy =
2403         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
2404     llvm::Type *Tys[2] = { Ty, InputTy };
2405     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla");
2406   }
2407   case NEON::BI__builtin_neon_vusdot_s32:
2408   case NEON::BI__builtin_neon_vusdotq_s32: {
2409     auto *InputTy =
2410         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
2411     llvm::Type *Tys[2] = { Ty, InputTy };
2412     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
2413   }
2414   case NEON::BI__builtin_neon_vbfdot_f32:
2415   case NEON::BI__builtin_neon_vbfdotq_f32: {
2416     llvm::Type *InputTy =
2417         llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16);
2418     llvm::Type *Tys[2] = { Ty, InputTy };
2419     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
2420   }
2421   case NEON::BI__builtin_neon___a32_vcvt_bf16_f32: {
2422     llvm::Type *Tys[1] = { Ty };
2423     Function *F = CGM.getIntrinsic(Int, Tys);
2424     return EmitNeonCall(F, Ops, "vcvtfp2bf");
2425   }
2426 
2427   }
2428 
2429   assert(Int && "Expected valid intrinsic number");
2430 
2431   // Determine the type(s) of this overloaded NEON intrinsic.
2432   Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);
2433 
2434   Value *Result = EmitNeonCall(F, Ops, NameHint);
2435   llvm::Type *ResultType = ConvertType(E->getType());
2436   // Cast a one-element vector result back to the scalar type
2437   // expected by the builtin.
2438   return Builder.CreateBitCast(Result, ResultType, NameHint);
2439 }
2440 
2441 Value *
2442 CodeGenFunction::EmitAArch64CompareBuiltinExpr(Value *Op, llvm::Type *Ty,
2443                                                const CmpInst::Predicate Pred,
2444                                                const Twine &Name) {
2445 
2446   if (isa<FixedVectorType>(Ty)) {
2447     // Vector types are cast to i8 vectors. Recover original type.
2448     Op = Builder.CreateBitCast(Op, Ty);
2449   }
2450 
2451   if (CmpInst::isFPPredicate(Pred)) {
2452     if (Pred == CmpInst::FCMP_OEQ)
2453       Op = Builder.CreateFCmp(Pred, Op, Constant::getNullValue(Op->getType()));
2454     else
2455       Op = Builder.CreateFCmpS(Pred, Op, Constant::getNullValue(Op->getType()));
2456   } else {
2457     Op = Builder.CreateICmp(Pred, Op, Constant::getNullValue(Op->getType()));
2458   }
2459 
2460   llvm::Type *ResTy = Ty;
2461   if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
2462     ResTy = FixedVectorType::get(
2463         IntegerType::get(getLLVMContext(), VTy->getScalarSizeInBits()),
2464         VTy->getNumElements());
2465 
2466   return Builder.CreateSExt(Op, ResTy, Name);
2467 }
2468 
2469 static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
2470                                  Value *ExtOp, Value *IndexOp,
2471                                  llvm::Type *ResTy, unsigned IntID,
2472                                  const char *Name) {
2473   SmallVector<Value *, 2> TblOps;
2474   if (ExtOp)
2475     TblOps.push_back(ExtOp);
2476 
2477   // Build a vector containing sequential numbers like (0, 1, 2, ..., 15).
2478   SmallVector<int, 16> Indices;
2479   auto *TblTy = cast<llvm::FixedVectorType>(Ops[0]->getType());
2480   for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
2481     Indices.push_back(2*i);
2482     Indices.push_back(2*i+1);
2483   }
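     // Shuffling two n-element table vectors with these in-order indices
     // simply concatenates them into one 2n-element vector.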
2484 
2485   int PairPos = 0, End = Ops.size() - 1;
2486   while (PairPos < End) {
2487     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
2488                                                      Ops[PairPos+1], Indices,
2489                                                      Name));
2490     PairPos += 2;
2491   }
2492 
2493   // If there's an odd number of 64-bit lookup-table vectors, fill the high
2494   // 64 bits of the last 128-bit lookup table with zeros.
2495   if (PairPos == End) {
2496     Value *ZeroTbl = ConstantAggregateZero::get(TblTy);
2497     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
2498                                                      ZeroTbl, Indices, Name));
2499   }
2500 
2501   TblOps.push_back(IndexOp);
2502   Function *TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
2504 
2505   return CGF.EmitNeonCall(TblF, TblOps, Name);
2506 }
2507 
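     // Maps an ARM hint builtin to the immediate operand of the HINT
     // instruction: 0 = NOP, 1 = YIELD, 2 = WFE, 3 = WFI, 4 = SEV, 5 = SEVL.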
2508 Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
2509   unsigned Value;
2510   switch (BuiltinID) {
2511   default:
2512     return nullptr;
2513   case clang::ARM::BI__builtin_arm_nop:
2514     Value = 0;
2515     break;
2516   case clang::ARM::BI__builtin_arm_yield:
2517   case clang::ARM::BI__yield:
2518     Value = 1;
2519     break;
2520   case clang::ARM::BI__builtin_arm_wfe:
2521   case clang::ARM::BI__wfe:
2522     Value = 2;
2523     break;
2524   case clang::ARM::BI__builtin_arm_wfi:
2525   case clang::ARM::BI__wfi:
2526     Value = 3;
2527     break;
2528   case clang::ARM::BI__builtin_arm_sev:
2529   case clang::ARM::BI__sev:
2530     Value = 4;
2531     break;
2532   case clang::ARM::BI__builtin_arm_sevl:
2533   case clang::ARM::BI__sevl:
2534     Value = 5;
2535     break;
2536   }
2537 
2538   return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_hint),
2539                             llvm::ConstantInt::get(Int32Ty, Value));
2540 }
2541 
2542 enum SpecialRegisterAccessKind {
2543   NormalRead,
2544   VolatileRead,
2545   Write,
2546 };
2547 
2548 // Generates the IR for a read/write special register builtin. ValueType is
2549 // the type of the value that is to be written or read, and RegisterType is
2550 // the type of the register being written to or read from.
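     // For example, __builtin_arm_rsr("cpsr") is emitted (roughly) as a call
     // to llvm.read_volatile_register.i32 with metadata !{!"cpsr"} naming the
     // register; "cpsr" here is just an illustrative register name.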
2551 static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
2552                                          const CallExpr *E,
2553                                          llvm::Type *RegisterType,
2554                                          llvm::Type *ValueType,
2555                                          SpecialRegisterAccessKind AccessKind,
2556                                          StringRef SysReg = "") {
2557   // The read/write register intrinsics only support 32-, 64- and 128-bit operations.
2558   assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64) ||
2559           RegisterType->isIntegerTy(128)) &&
2560          "Unsupported size for register.");
2561 
2562   CodeGen::CGBuilderTy &Builder = CGF.Builder;
2563   CodeGen::CodeGenModule &CGM = CGF.CGM;
2564   LLVMContext &Context = CGM.getLLVMContext();
2565 
2566   if (SysReg.empty()) {
2567     const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts();
2568     SysReg = cast<clang::StringLiteral>(SysRegStrExpr)->getString();
2569   }
2570 
2571   llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysReg) };
2572   llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
2573   llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
2574 
2575   llvm::Type *Types[] = { RegisterType };
2576 
2577   bool MixedTypes = RegisterType->isIntegerTy(64) && ValueType->isIntegerTy(32);
2578   assert(!(RegisterType->isIntegerTy(32) && ValueType->isIntegerTy(64))
2579             && "Can't fit 64-bit value in 32-bit register");
2580 
2581   if (AccessKind != Write) {
2582     assert(AccessKind == NormalRead || AccessKind == VolatileRead);
2583     llvm::Function *F = CGM.getIntrinsic(
2584         AccessKind == VolatileRead ? Intrinsic::read_volatile_register
2585                                    : Intrinsic::read_register,
2586         Types);
2587     llvm::Value *Call = Builder.CreateCall(F, Metadata);
2588 
2589     if (MixedTypes)
2590       // Read into a 64-bit register, then truncate the result to 32 bits.
2591       return Builder.CreateTrunc(Call, ValueType);
2592 
2593     if (ValueType->isPointerTy())
2594       // Have i32/i64 result (Call) but want to return a VoidPtrTy (i8*).
2595       return Builder.CreateIntToPtr(Call, ValueType);
2596 
2597     return Call;
2598   }
2599 
2600   llvm::Function *F = CGM.getIntrinsic(Intrinsic::write_register, Types);
2601   llvm::Value *ArgValue = CGF.EmitScalarExpr(E->getArg(1));
2602   if (MixedTypes) {
2603     // Extend the 32-bit write value to 64 bits to pass to the write.
2604     ArgValue = Builder.CreateZExt(ArgValue, RegisterType);
2605     return Builder.CreateCall(F, { Metadata, ArgValue });
2606   }
2607 
2608   if (ValueType->isPointerTy()) {
2609     // Have a VoidPtrTy ArgValue but need to pass an i32/i64.
2610     ArgValue = Builder.CreatePtrToInt(ArgValue, RegisterType);
2611     return Builder.CreateCall(F, { Metadata, ArgValue });
2612   }
2613 
2614   return Builder.CreateCall(F, { Metadata, ArgValue });
2615 }
2616 
2617 /// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
2618 /// argument that specifies the vector type.
2619 static bool HasExtraNeonArgument(unsigned BuiltinID) {
2620   switch (BuiltinID) {
2621   default: break;
2622   case NEON::BI__builtin_neon_vget_lane_i8:
2623   case NEON::BI__builtin_neon_vget_lane_i16:
2624   case NEON::BI__builtin_neon_vget_lane_bf16:
2625   case NEON::BI__builtin_neon_vget_lane_i32:
2626   case NEON::BI__builtin_neon_vget_lane_i64:
2627   case NEON::BI__builtin_neon_vget_lane_mf8:
2628   case NEON::BI__builtin_neon_vget_lane_f32:
2629   case NEON::BI__builtin_neon_vgetq_lane_i8:
2630   case NEON::BI__builtin_neon_vgetq_lane_i16:
2631   case NEON::BI__builtin_neon_vgetq_lane_bf16:
2632   case NEON::BI__builtin_neon_vgetq_lane_i32:
2633   case NEON::BI__builtin_neon_vgetq_lane_i64:
2634   case NEON::BI__builtin_neon_vgetq_lane_mf8:
2635   case NEON::BI__builtin_neon_vgetq_lane_f32:
2636   case NEON::BI__builtin_neon_vduph_lane_bf16:
2637   case NEON::BI__builtin_neon_vduph_laneq_bf16:
2638   case NEON::BI__builtin_neon_vset_lane_i8:
2639   case NEON::BI__builtin_neon_vset_lane_mf8:
2640   case NEON::BI__builtin_neon_vset_lane_i16:
2641   case NEON::BI__builtin_neon_vset_lane_bf16:
2642   case NEON::BI__builtin_neon_vset_lane_i32:
2643   case NEON::BI__builtin_neon_vset_lane_i64:
2644   case NEON::BI__builtin_neon_vset_lane_f32:
2645   case NEON::BI__builtin_neon_vsetq_lane_i8:
2646   case NEON::BI__builtin_neon_vsetq_lane_mf8:
2647   case NEON::BI__builtin_neon_vsetq_lane_i16:
2648   case NEON::BI__builtin_neon_vsetq_lane_bf16:
2649   case NEON::BI__builtin_neon_vsetq_lane_i32:
2650   case NEON::BI__builtin_neon_vsetq_lane_i64:
2651   case NEON::BI__builtin_neon_vsetq_lane_f32:
2652   case NEON::BI__builtin_neon_vsha1h_u32:
2653   case NEON::BI__builtin_neon_vsha1cq_u32:
2654   case NEON::BI__builtin_neon_vsha1pq_u32:
2655   case NEON::BI__builtin_neon_vsha1mq_u32:
2656   case NEON::BI__builtin_neon_vcvth_bf16_f32:
2657   case clang::ARM::BI_MoveToCoprocessor:
2658   case clang::ARM::BI_MoveToCoprocessor2:
2659     return false;
2660   }
2661   return true;
2662 }
2663 
2664 Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
2665                                            const CallExpr *E,
2666                                            ReturnValueSlot ReturnValue,
2667                                            llvm::Triple::ArchType Arch) {
2668   if (auto Hint = GetValueForARMHint(BuiltinID))
2669     return Hint;
2670 
2671   if (BuiltinID == clang::ARM::BI__emit) {
2672     bool IsThumb = getTarget().getTriple().getArch() == llvm::Triple::thumb;
2673     llvm::FunctionType *FTy =
2674         llvm::FunctionType::get(VoidTy, /*Variadic=*/false);
2675 
2676     Expr::EvalResult Result;
2677     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
2678       llvm_unreachable("Sema will ensure that the parameter is constant");
2679 
2680     llvm::APSInt Value = Result.Val.getInt();
2681     uint64_t ZExtValue = Value.zextOrTrunc(IsThumb ? 16 : 32).getZExtValue();
2682 
2683     llvm::InlineAsm *Emit =
2684         IsThumb ? InlineAsm::get(FTy, ".inst.n 0x" + utohexstr(ZExtValue), "",
2685                                  /*hasSideEffects=*/true)
2686                 : InlineAsm::get(FTy, ".inst 0x" + utohexstr(ZExtValue), "",
2687                                  /*hasSideEffects=*/true);
2688 
2689     return Builder.CreateCall(Emit);
2690   }
2691 
2692   if (BuiltinID == clang::ARM::BI__builtin_arm_dbg) {
2693     Value *Option = EmitScalarExpr(E->getArg(0));
2694     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_dbg), Option);
2695   }
2696 
2697   if (BuiltinID == clang::ARM::BI__builtin_arm_prefetch) {
2698     Value *Address = EmitScalarExpr(E->getArg(0));
2699     Value *RW      = EmitScalarExpr(E->getArg(1));
2700     Value *IsData  = EmitScalarExpr(E->getArg(2));
2701 
2702     // Locality is not supported on the ARM target.
2703     Value *Locality = llvm::ConstantInt::get(Int32Ty, 3);
2704 
2705     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
2706     return Builder.CreateCall(F, {Address, RW, Locality, IsData});
2707   }
2708 
2709   if (BuiltinID == clang::ARM::BI__builtin_arm_rbit) {
2710     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
2711     return Builder.CreateCall(
2712         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
2713   }
2714 
2715   if (BuiltinID == clang::ARM::BI__builtin_arm_clz ||
2716       BuiltinID == clang::ARM::BI__builtin_arm_clz64) {
2717     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
2718     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Arg->getType());
2719     Value *Res = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
2720     if (BuiltinID == clang::ARM::BI__builtin_arm_clz64)
2721       Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
2722     return Res;
2723   }
2724 
2726   if (BuiltinID == clang::ARM::BI__builtin_arm_cls) {
2727     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
2728     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls), Arg, "cls");
2729   }
2730   if (BuiltinID == clang::ARM::BI__builtin_arm_cls64) {
2731     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
2732     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls64), Arg,
2733                               "cls");
2734   }
2735 
2736   if (BuiltinID == clang::ARM::BI__clear_cache) {
2737     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
2738     const FunctionDecl *FD = E->getDirectCallee();
2739     Value *Ops[2];
2740     for (unsigned i = 0; i < 2; i++)
2741       Ops[i] = EmitScalarExpr(E->getArg(i));
2742     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
2743     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
2744     StringRef Name = FD->getName();
2745     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
2746   }
2747 
2748   if (BuiltinID == clang::ARM::BI__builtin_arm_mcrr ||
2749       BuiltinID == clang::ARM::BI__builtin_arm_mcrr2) {
2750     Function *F;
2751 
2752     switch (BuiltinID) {
2753     default: llvm_unreachable("unexpected builtin");
2754     case clang::ARM::BI__builtin_arm_mcrr:
2755       F = CGM.getIntrinsic(Intrinsic::arm_mcrr);
2756       break;
2757     case clang::ARM::BI__builtin_arm_mcrr2:
2758       F = CGM.getIntrinsic(Intrinsic::arm_mcrr2);
2759       break;
2760     }
2761 
2762     // The MCRR{2} instruction has 5 operands, but the
2763     // intrinsic has only 4, because Rt and Rt2 are
2764     // represented as a single unsigned 64-bit integer
2765     // in the intrinsic definition; internally, however,
2766     // the value is represented as two separate
2767     // 32-bit integers.
2768 
2769     Value *Coproc = EmitScalarExpr(E->getArg(0));
2770     Value *Opc1 = EmitScalarExpr(E->getArg(1));
2771     Value *RtAndRt2 = EmitScalarExpr(E->getArg(2));
2772     Value *CRm = EmitScalarExpr(E->getArg(3));
2773 
2774     Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
2775     Value *Rt = Builder.CreateTruncOrBitCast(RtAndRt2, Int32Ty);
2776     Value *Rt2 = Builder.CreateLShr(RtAndRt2, C1);
2777     Rt2 = Builder.CreateTruncOrBitCast(Rt2, Int32Ty);
2778 
2779     return Builder.CreateCall(F, {Coproc, Opc1, Rt, Rt2, CRm});
2780   }
2781 
2782   if (BuiltinID == clang::ARM::BI__builtin_arm_mrrc ||
2783       BuiltinID == clang::ARM::BI__builtin_arm_mrrc2) {
2784     Function *F;
2785 
2786     switch (BuiltinID) {
2787     default: llvm_unreachable("unexpected builtin");
2788     case clang::ARM::BI__builtin_arm_mrrc:
2789       F = CGM.getIntrinsic(Intrinsic::arm_mrrc);
2790       break;
2791     case clang::ARM::BI__builtin_arm_mrrc2:
2792       F = CGM.getIntrinsic(Intrinsic::arm_mrrc2);
2793       break;
2794     }
2795 
2796     Value *Coproc = EmitScalarExpr(E->getArg(0));
2797     Value *Opc1 = EmitScalarExpr(E->getArg(1));
2798     Value *CRm  = EmitScalarExpr(E->getArg(2));
2799     Value *RtAndRt2 = Builder.CreateCall(F, {Coproc, Opc1, CRm});
2800 
2801     // The builtin returns an unsigned 64-bit integer, which the
2802     // intrinsic represents as two 32-bit integers.
2803 
2804     Value *Rt = Builder.CreateExtractValue(RtAndRt2, 1);
2805     Value *Rt1 = Builder.CreateExtractValue(RtAndRt2, 0);
2806     Rt = Builder.CreateZExt(Rt, Int64Ty);
2807     Rt1 = Builder.CreateZExt(Rt1, Int64Ty);
2808 
2809     Value *ShiftCast = llvm::ConstantInt::get(Int64Ty, 32);
2810     RtAndRt2 = Builder.CreateShl(Rt, ShiftCast, "shl", true);
2811     RtAndRt2 = Builder.CreateOr(RtAndRt2, Rt1);
2812 
2813     return Builder.CreateBitCast(RtAndRt2, ConvertType(E->getType()));
2814   }
2815 
2816   if (BuiltinID == clang::ARM::BI__builtin_arm_ldrexd ||
2817       ((BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
2818         BuiltinID == clang::ARM::BI__builtin_arm_ldaex) &&
2819        getContext().getTypeSize(E->getType()) == 64) ||
2820       BuiltinID == clang::ARM::BI__ldrexd) {
2821     Function *F;
2822 
2823     switch (BuiltinID) {
2824     default: llvm_unreachable("unexpected builtin");
2825     case clang::ARM::BI__builtin_arm_ldaex:
2826       F = CGM.getIntrinsic(Intrinsic::arm_ldaexd);
2827       break;
2828     case clang::ARM::BI__builtin_arm_ldrexd:
2829     case clang::ARM::BI__builtin_arm_ldrex:
2830     case clang::ARM::BI__ldrexd:
2831       F = CGM.getIntrinsic(Intrinsic::arm_ldrexd);
2832       break;
2833     }
2834 
2835     Value *LdPtr = EmitScalarExpr(E->getArg(0));
2836     Value *Val = Builder.CreateCall(F, LdPtr, "ldrexd");
2837 
2838     Value *Val0 = Builder.CreateExtractValue(Val, 1);
2839     Value *Val1 = Builder.CreateExtractValue(Val, 0);
2840     Val0 = Builder.CreateZExt(Val0, Int64Ty);
2841     Val1 = Builder.CreateZExt(Val1, Int64Ty);
2842 
2843     Value *ShiftCst = llvm::ConstantInt::get(Int64Ty, 32);
2844     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
2845     Val = Builder.CreateOr(Val, Val1);
2846     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
2847   }
2848 
2849   if (BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
2850       BuiltinID == clang::ARM::BI__builtin_arm_ldaex) {
2851     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
2852 
2853     QualType Ty = E->getType();
2854     llvm::Type *RealResTy = ConvertType(Ty);
2855     llvm::Type *IntTy =
2856         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
2857 
2858     Function *F = CGM.getIntrinsic(
2859         BuiltinID == clang::ARM::BI__builtin_arm_ldaex ? Intrinsic::arm_ldaex
2860                                                        : Intrinsic::arm_ldrex,
2861         UnqualPtrTy);
2862     CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldrex");
2863     Val->addParamAttr(
2864         0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
2865 
2866     if (RealResTy->isPointerTy())
2867       return Builder.CreateIntToPtr(Val, RealResTy);
2868 
2869     llvm::Type *IntResTy = llvm::IntegerType::get(
2870         getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
2871     return Builder.CreateBitCast(Builder.CreateTruncOrBitCast(Val, IntResTy),
2872                                  RealResTy);
2874   }
2875 
2876   if (BuiltinID == clang::ARM::BI__builtin_arm_strexd ||
2877       ((BuiltinID == clang::ARM::BI__builtin_arm_stlex ||
2878         BuiltinID == clang::ARM::BI__builtin_arm_strex) &&
2879        getContext().getTypeSize(E->getArg(0)->getType()) == 64)) {
2880     Function *F = CGM.getIntrinsic(
2881         BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlexd
2882                                                        : Intrinsic::arm_strexd);
2883     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty);
2884 
2885     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
2886     Value *Val = EmitScalarExpr(E->getArg(0));
2887     Builder.CreateStore(Val, Tmp);
2888 
2889     Address LdPtr = Tmp.withElementType(STy);
2890     Val = Builder.CreateLoad(LdPtr);
2891 
2892     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
2893     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
2894     Value *StPtr = EmitScalarExpr(E->getArg(1));
2895     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "strexd");
2896   }
2897 
2898   if (BuiltinID == clang::ARM::BI__builtin_arm_strex ||
2899       BuiltinID == clang::ARM::BI__builtin_arm_stlex) {
2900     Value *StoreVal = EmitScalarExpr(E->getArg(0));
2901     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
2902 
2903     QualType Ty = E->getArg(0)->getType();
2904     llvm::Type *StoreTy =
2905         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
2906 
2907     if (StoreVal->getType()->isPointerTy())
2908       StoreVal = Builder.CreatePtrToInt(StoreVal, Int32Ty);
2909     else {
2910       llvm::Type *IntTy = llvm::IntegerType::get(
2911           getLLVMContext(),
2912           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
2913       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
2914       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int32Ty);
2915     }
2916 
2917     Function *F = CGM.getIntrinsic(
2918         BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlex
2919                                                        : Intrinsic::arm_strex,
2920         StoreAddr->getType());
2921 
2922     CallInst *CI = Builder.CreateCall(F, {StoreVal, StoreAddr}, "strex");
2923     CI->addParamAttr(
2924         1, Attribute::get(getLLVMContext(), Attribute::ElementType, StoreTy));
2925     return CI;
2926   }
2927 
2928   if (BuiltinID == clang::ARM::BI__builtin_arm_clrex) {
2929     Function *F = CGM.getIntrinsic(Intrinsic::arm_clrex);
2930     return Builder.CreateCall(F);
2931   }
2932 
2933   // CRC32
2934   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
2935   switch (BuiltinID) {
2936   case clang::ARM::BI__builtin_arm_crc32b:
2937     CRCIntrinsicID = Intrinsic::arm_crc32b; break;
2938   case clang::ARM::BI__builtin_arm_crc32cb:
2939     CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
2940   case clang::ARM::BI__builtin_arm_crc32h:
2941     CRCIntrinsicID = Intrinsic::arm_crc32h; break;
2942   case clang::ARM::BI__builtin_arm_crc32ch:
2943     CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
2944   case clang::ARM::BI__builtin_arm_crc32w:
2945   case clang::ARM::BI__builtin_arm_crc32d:
2946     CRCIntrinsicID = Intrinsic::arm_crc32w; break;
2947   case clang::ARM::BI__builtin_arm_crc32cw:
2948   case clang::ARM::BI__builtin_arm_crc32cd:
2949     CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
2950   }
2951 
2952   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
2953     Value *Arg0 = EmitScalarExpr(E->getArg(0));
2954     Value *Arg1 = EmitScalarExpr(E->getArg(1));
2955 
2956     // crc32{c,}d intrinsics are implemented as two calls to crc32{c,}w
2957     // intrinsics, hence we need different codegen for these cases.
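         // e.g. crc32d(a, x) becomes crc32w(crc32w(a, lo32(x)), hi32(x)).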
2958     if (BuiltinID == clang::ARM::BI__builtin_arm_crc32d ||
2959         BuiltinID == clang::ARM::BI__builtin_arm_crc32cd) {
2960       Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
2961       Value *Arg1a = Builder.CreateTruncOrBitCast(Arg1, Int32Ty);
2962       Value *Arg1b = Builder.CreateLShr(Arg1, C1);
2963       Arg1b = Builder.CreateTruncOrBitCast(Arg1b, Int32Ty);
2964 
2965       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
2966       Value *Res = Builder.CreateCall(F, {Arg0, Arg1a});
2967       return Builder.CreateCall(F, {Res, Arg1b});
2968     } else {
2969       Arg1 = Builder.CreateZExtOrBitCast(Arg1, Int32Ty);
2970 
2971       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
2972       return Builder.CreateCall(F, {Arg0, Arg1});
2973     }
2974   }
2975 
2976   if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
2977       BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
2978       BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
2979       BuiltinID == clang::ARM::BI__builtin_arm_wsr ||
2980       BuiltinID == clang::ARM::BI__builtin_arm_wsr64 ||
2981       BuiltinID == clang::ARM::BI__builtin_arm_wsrp) {
2982 
2983     SpecialRegisterAccessKind AccessKind = Write;
2984     if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
2985         BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
2986         BuiltinID == clang::ARM::BI__builtin_arm_rsrp)
2987       AccessKind = VolatileRead;
2988 
2989     bool IsPointerBuiltin = BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
2990                             BuiltinID == clang::ARM::BI__builtin_arm_wsrp;
2991 
2992     bool Is64Bit = BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
2993                    BuiltinID == clang::ARM::BI__builtin_arm_wsr64;
2994 
2995     llvm::Type *ValueType;
2996     llvm::Type *RegisterType;
2997     if (IsPointerBuiltin) {
2998       ValueType = VoidPtrTy;
2999       RegisterType = Int32Ty;
3000     } else if (Is64Bit) {
3001       ValueType = RegisterType = Int64Ty;
3002     } else {
3003       ValueType = RegisterType = Int32Ty;
3004     }
3005 
3006     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
3007                                       AccessKind);
3008   }
3009 
3010   if (BuiltinID == ARM::BI__builtin_sponentry) {
3011     llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy);
3012     return Builder.CreateCall(F);
3013   }
3014 
3015   // Handle MSVC intrinsics before argument evaluation to prevent double
3016   // evaluation.
3017   if (std::optional<MSVCIntrin> MsvcIntId = translateArmToMsvcIntrin(BuiltinID))
3018     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
3019 
3020   // Deal with MVE builtins
3021   if (Value *Result = EmitARMMVEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
3022     return Result;
3023   // Handle CDE builtins
3024   if (Value *Result = EmitARMCDEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
3025     return Result;
3026 
3027   // Some intrinsics are equivalent; if they are, use the base intrinsic ID.
3028   auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) {
3029     return P.first == BuiltinID;
3030   });
3031   if (It != end(NEONEquivalentIntrinsicMap))
3032     BuiltinID = It->second;
3033 
3034   // Find out if any arguments are required to be integer constant
3035   // expressions.
3036   unsigned ICEArguments = 0;
3037   ASTContext::GetBuiltinTypeError Error;
3038   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
3039   assert(Error == ASTContext::GE_None && "Should not codegen an error");
3040 
3041   auto getAlignmentValue32 = [&](Address addr) -> Value* {
3042     return Builder.getInt32(addr.getAlignment().getQuantity());
3043   };
3044 
3045   Address PtrOp0 = Address::invalid();
3046   Address PtrOp1 = Address::invalid();
3047   SmallVector<Value*, 4> Ops;
3048   bool HasExtraArg = HasExtraNeonArgument(BuiltinID);
3049   unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
3050   for (unsigned i = 0, e = NumArgs; i != e; i++) {
3051     if (i == 0) {
3052       switch (BuiltinID) {
3053       case NEON::BI__builtin_neon_vld1_v:
3054       case NEON::BI__builtin_neon_vld1q_v:
3055       case NEON::BI__builtin_neon_vld1q_lane_v:
3056       case NEON::BI__builtin_neon_vld1_lane_v:
3057       case NEON::BI__builtin_neon_vld1_dup_v:
3058       case NEON::BI__builtin_neon_vld1q_dup_v:
3059       case NEON::BI__builtin_neon_vst1_v:
3060       case NEON::BI__builtin_neon_vst1q_v:
3061       case NEON::BI__builtin_neon_vst1q_lane_v:
3062       case NEON::BI__builtin_neon_vst1_lane_v:
3063       case NEON::BI__builtin_neon_vst2_v:
3064       case NEON::BI__builtin_neon_vst2q_v:
3065       case NEON::BI__builtin_neon_vst2_lane_v:
3066       case NEON::BI__builtin_neon_vst2q_lane_v:
3067       case NEON::BI__builtin_neon_vst3_v:
3068       case NEON::BI__builtin_neon_vst3q_v:
3069       case NEON::BI__builtin_neon_vst3_lane_v:
3070       case NEON::BI__builtin_neon_vst3q_lane_v:
3071       case NEON::BI__builtin_neon_vst4_v:
3072       case NEON::BI__builtin_neon_vst4q_v:
3073       case NEON::BI__builtin_neon_vst4_lane_v:
3074       case NEON::BI__builtin_neon_vst4q_lane_v:
3075         // Get the alignment for the argument in addition to the value;
3076         // we'll use it later.
3077         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
3078         Ops.push_back(PtrOp0.emitRawPointer(*this));
3079         continue;
3080       }
3081     }
3082     if (i == 1) {
3083       switch (BuiltinID) {
3084       case NEON::BI__builtin_neon_vld2_v:
3085       case NEON::BI__builtin_neon_vld2q_v:
3086       case NEON::BI__builtin_neon_vld3_v:
3087       case NEON::BI__builtin_neon_vld3q_v:
3088       case NEON::BI__builtin_neon_vld4_v:
3089       case NEON::BI__builtin_neon_vld4q_v:
3090       case NEON::BI__builtin_neon_vld2_lane_v:
3091       case NEON::BI__builtin_neon_vld2q_lane_v:
3092       case NEON::BI__builtin_neon_vld3_lane_v:
3093       case NEON::BI__builtin_neon_vld3q_lane_v:
3094       case NEON::BI__builtin_neon_vld4_lane_v:
3095       case NEON::BI__builtin_neon_vld4q_lane_v:
3096       case NEON::BI__builtin_neon_vld2_dup_v:
3097       case NEON::BI__builtin_neon_vld2q_dup_v:
3098       case NEON::BI__builtin_neon_vld3_dup_v:
3099       case NEON::BI__builtin_neon_vld3q_dup_v:
3100       case NEON::BI__builtin_neon_vld4_dup_v:
3101       case NEON::BI__builtin_neon_vld4q_dup_v:
3102         // Get the alignment for the argument in addition to the value;
3103         // we'll use it later.
3104         PtrOp1 = EmitPointerWithAlignment(E->getArg(1));
3105         Ops.push_back(PtrOp1.emitRawPointer(*this));
3106         continue;
3107       }
3108     }
3109 
3110     Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
3111   }
3112 
3113   switch (BuiltinID) {
3114   default: break;
3115 
3116   case NEON::BI__builtin_neon_vget_lane_i8:
3117   case NEON::BI__builtin_neon_vget_lane_i16:
3118   case NEON::BI__builtin_neon_vget_lane_i32:
3119   case NEON::BI__builtin_neon_vget_lane_i64:
3120   case NEON::BI__builtin_neon_vget_lane_bf16:
3121   case NEON::BI__builtin_neon_vget_lane_f32:
3122   case NEON::BI__builtin_neon_vgetq_lane_i8:
3123   case NEON::BI__builtin_neon_vgetq_lane_i16:
3124   case NEON::BI__builtin_neon_vgetq_lane_i32:
3125   case NEON::BI__builtin_neon_vgetq_lane_i64:
3126   case NEON::BI__builtin_neon_vgetq_lane_bf16:
3127   case NEON::BI__builtin_neon_vgetq_lane_f32:
3128   case NEON::BI__builtin_neon_vduph_lane_bf16:
3129   case NEON::BI__builtin_neon_vduph_laneq_bf16:
3130     return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");
3131 
3132   case NEON::BI__builtin_neon_vrndns_f32: {
3133     Value *Arg = EmitScalarExpr(E->getArg(0));
3134     llvm::Type *Tys[] = {Arg->getType()};
3135     Function *F = CGM.getIntrinsic(Intrinsic::roundeven, Tys);
3136     return Builder.CreateCall(F, {Arg}, "vrndn");
       }
3137 
3138   case NEON::BI__builtin_neon_vset_lane_i8:
3139   case NEON::BI__builtin_neon_vset_lane_i16:
3140   case NEON::BI__builtin_neon_vset_lane_i32:
3141   case NEON::BI__builtin_neon_vset_lane_i64:
3142   case NEON::BI__builtin_neon_vset_lane_bf16:
3143   case NEON::BI__builtin_neon_vset_lane_f32:
3144   case NEON::BI__builtin_neon_vsetq_lane_i8:
3145   case NEON::BI__builtin_neon_vsetq_lane_i16:
3146   case NEON::BI__builtin_neon_vsetq_lane_i32:
3147   case NEON::BI__builtin_neon_vsetq_lane_i64:
3148   case NEON::BI__builtin_neon_vsetq_lane_bf16:
3149   case NEON::BI__builtin_neon_vsetq_lane_f32:
3150     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
3151 
3152   case NEON::BI__builtin_neon_vsha1h_u32:
3153     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
3154                         "vsha1h");
3155   case NEON::BI__builtin_neon_vsha1cq_u32:
3156     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
3157                         "vsha1c");
3158   case NEON::BI__builtin_neon_vsha1pq_u32:
3159     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
3160                         "vsha1p");
3161   case NEON::BI__builtin_neon_vsha1mq_u32:
3162     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
3163                         "vsha1m");
3164 
3165   case NEON::BI__builtin_neon_vcvth_bf16_f32: {
3166     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf), Ops,
3167                         "vcvtbfp2bf");
3168   }
3169 
3170   // The ARM _MoveToCoprocessor builtins put the input register value as
3171   // the first argument, but the LLVM intrinsic expects it as the third one.
3172   case clang::ARM::BI_MoveToCoprocessor:
3173   case clang::ARM::BI_MoveToCoprocessor2: {
3174     Function *F = CGM.getIntrinsic(BuiltinID == clang::ARM::BI_MoveToCoprocessor
3175                                        ? Intrinsic::arm_mcr
3176                                        : Intrinsic::arm_mcr2);
3177     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
3178                                   Ops[3], Ops[4], Ops[5]});
3179   }
3180   }
3181 
3182   // Get the last argument, which specifies the vector type.
3183   assert(HasExtraArg);
3184   const Expr *Arg = E->getArg(E->getNumArgs()-1);
3185   std::optional<llvm::APSInt> Result =
3186       Arg->getIntegerConstantExpr(getContext());
3187   if (!Result)
3188     return nullptr;
3189 
3190   if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f ||
3191       BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_d) {
3192     // Determine the overloaded type of this builtin.
3193     llvm::Type *Ty;
3194     if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f)
3195       Ty = FloatTy;
3196     else
3197       Ty = DoubleTy;
3198 
3199     // Determine whether this is an unsigned conversion or not.
3200     bool usgn = Result->getZExtValue() == 1;
3201     unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;
3202 
3203     // Call the appropriate intrinsic.
3204     Function *F = CGM.getIntrinsic(Int, Ty);
3205     return Builder.CreateCall(F, Ops, "vcvtr");
3206   }
3207 
3208   // Determine the type of this overloaded NEON intrinsic.
3209   NeonTypeFlags Type = Result->getZExtValue();
3210   bool usgn = Type.isUnsigned();
3211   bool rightShift = false;
3212 
3213   llvm::FixedVectorType *VTy =
3214       GetNeonType(this, Type, getTarget().hasLegalHalfType(), false,
3215                   getTarget().hasBFloat16Type());
3216   llvm::Type *Ty = VTy;
3217   if (!Ty)
3218     return nullptr;
3219 
3220   // Many NEON builtins have identical semantics and uses in ARM and
3221   // AArch64. Emit these in a single function.
3222   auto IntrinsicMap = ArrayRef(ARMSIMDIntrinsicMap);
3223   const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
3224       IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
3225   if (Builtin)
3226     return EmitCommonNeonBuiltinExpr(
3227         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
3228         Builtin->NameHint, Builtin->TypeModifier, E, Ops, PtrOp0, PtrOp1, Arch);
3229 
3230   unsigned Int;
3231   switch (BuiltinID) {
3232   default: return nullptr;
3233   case NEON::BI__builtin_neon_vld1q_lane_v:
3234     // Handle 64-bit integer elements as a special case.  Use shuffles of
3235     // one-element vectors to avoid poor code for i64 in the backend.
3236     if (VTy->getElementType()->isIntegerTy(64)) {
3237       // Extract the other lane.
3238       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3239       int Lane = cast<ConstantInt>(Ops[2])->getZExtValue();
3240       Value *SV = llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1-Lane));
3241       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
3242       // Load the value as a one-element vector.
3243       Ty = llvm::FixedVectorType::get(VTy->getElementType(), 1);
3244       llvm::Type *Tys[] = {Ty, Int8PtrTy};
3245       Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Tys);
3246       Value *Align = getAlignmentValue32(PtrOp0);
3247       Value *Ld = Builder.CreateCall(F, {Ops[0], Align});
3248       // Combine them.
3249       int Indices[] = {1 - Lane, Lane};
3250       return Builder.CreateShuffleVector(Ops[1], Ld, Indices, "vld1q_lane");
3251     }
3252     [[fallthrough]];
3253   case NEON::BI__builtin_neon_vld1_lane_v: {
3254     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3255     PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
3256     Value *Ld = Builder.CreateLoad(PtrOp0);
3257     return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
3258   }
3259   case NEON::BI__builtin_neon_vqrshrn_n_v:
3260     Int =
3261       usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
3262     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n",
3263                         1, true);
3264   case NEON::BI__builtin_neon_vqrshrun_n_v:
3265     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
3266                         Ops, "vqrshrun_n", 1, true);
3267   case NEON::BI__builtin_neon_vqshrn_n_v:
3268     Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
3269     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n",
3270                         1, true);
3271   case NEON::BI__builtin_neon_vqshrun_n_v:
3272     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
3273                         Ops, "vqshrun_n", 1, true);
3274   case NEON::BI__builtin_neon_vrecpe_v:
3275   case NEON::BI__builtin_neon_vrecpeq_v:
3276     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
3277                         Ops, "vrecpe");
3278   case NEON::BI__builtin_neon_vrshrn_n_v:
3279     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
3280                         Ops, "vrshrn_n", 1, true);
3281   case NEON::BI__builtin_neon_vrsra_n_v:
3282   case NEON::BI__builtin_neon_vrsraq_n_v:
3283     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3284     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3285     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
3286     Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
3287     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Ty), {Ops[1], Ops[2]});
3288     return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
3289   case NEON::BI__builtin_neon_vsri_n_v:
3290   case NEON::BI__builtin_neon_vsriq_n_v:
3291     rightShift = true;
3292     [[fallthrough]];
3293   case NEON::BI__builtin_neon_vsli_n_v:
3294   case NEON::BI__builtin_neon_vsliq_n_v:
3295     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
3296     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
3297                         Ops, "vsli_n");
3298   case NEON::BI__builtin_neon_vsra_n_v:
3299   case NEON::BI__builtin_neon_vsraq_n_v:
3300     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3301     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
3302     return Builder.CreateAdd(Ops[0], Ops[1]);
3303   case NEON::BI__builtin_neon_vst1q_lane_v:
3304     // Handle 64-bit integer elements as a special case.  Use a shuffle to get
3305     // a one-element vector and avoid poor code for i64 in the backend.
3306     if (VTy->getElementType()->isIntegerTy(64)) {
3307       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3308       Value *SV = llvm::ConstantVector::get(cast<llvm::Constant>(Ops[2]));
3309       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
3310       Ops[2] = getAlignmentValue32(PtrOp0);
3311       llvm::Type *Tys[] = {Int8PtrTy, Ops[1]->getType()};
3312       return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1,
3313                                                  Tys), Ops);
3314     }
3315     [[fallthrough]];
3316   case NEON::BI__builtin_neon_vst1_lane_v: {
3317     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3318     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
3319     return Builder.CreateStore(Ops[1],
3320                                PtrOp0.withElementType(Ops[1]->getType()));
3321   }
3322   case NEON::BI__builtin_neon_vtbl1_v:
3323     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1),
3324                         Ops, "vtbl1");
3325   case NEON::BI__builtin_neon_vtbl2_v:
3326     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2),
3327                         Ops, "vtbl2");
3328   case NEON::BI__builtin_neon_vtbl3_v:
3329     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3),
3330                         Ops, "vtbl3");
3331   case NEON::BI__builtin_neon_vtbl4_v:
3332     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4),
3333                         Ops, "vtbl4");
3334   case NEON::BI__builtin_neon_vtbx1_v:
3335     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1),
3336                         Ops, "vtbx1");
3337   case NEON::BI__builtin_neon_vtbx2_v:
3338     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2),
3339                         Ops, "vtbx2");
3340   case NEON::BI__builtin_neon_vtbx3_v:
3341     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3),
3342                         Ops, "vtbx3");
3343   case NEON::BI__builtin_neon_vtbx4_v:
3344     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4),
3345                         Ops, "vtbx4");
3346   }
3347 }
3348 
3349 template<typename Integer>
3350 static Integer GetIntegerConstantValue(const Expr *E, ASTContext &Context) {
3351   return E->getIntegerConstantExpr(Context)->getExtValue();
3352 }
3353 
3354 static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V,
3355                                      llvm::Type *T, bool Unsigned) {
3356   // Helper function called by Tablegen-constructed ARM MVE builtin codegen,
3357   // which finds it convenient to specify signed/unsigned as a boolean flag.
3358   return Unsigned ? Builder.CreateZExt(V, T) : Builder.CreateSExt(V, T);
3359 }
3360 
3361 static llvm::Value *MVEImmediateShr(CGBuilderTy &Builder, llvm::Value *V,
3362                                     uint32_t Shift, bool Unsigned) {
3363   // MVE helper function for integer shift right. This must handle signed vs
3364   // unsigned, and also deal specially with the case where the shift count is
3365   // equal to the lane size. In LLVM IR, an LShr with that parameter would be
3366   // undefined behavior, but in MVE it's legal, so we must convert it to code
3367   // that is not undefined in IR.
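       // For illustration: a lane-size MVE shift such as vshrq_n on 32-bit
       // lanes with a shift count of 32 therefore becomes an AShr by 31 when
       // signed, and folds to a zero vector when unsigned.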
3368   unsigned LaneBits = cast<llvm::VectorType>(V->getType())
3369                           ->getElementType()
3370                           ->getPrimitiveSizeInBits();
3371   if (Shift == LaneBits) {
3372     // An unsigned shift of the full lane size always generates zero, so we can
3373     // simply emit a zero vector. A signed shift of the full lane size does the
3374     // same thing as shifting by one bit fewer.
3375     if (Unsigned)
3376       return llvm::Constant::getNullValue(V->getType());
3377     else
3378       --Shift;
3379   }
3380   return Unsigned ? Builder.CreateLShr(V, Shift) : Builder.CreateAShr(V, Shift);
3381 }
3382 
3383 static llvm::Value *ARMMVEVectorSplat(CGBuilderTy &Builder, llvm::Value *V) {
3384   // MVE-specific helper function for a vector splat, which infers the element
3385   // count of the output vector by knowing that MVE vectors are all 128 bits
3386   // wide.
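       // E.g. an i16 scalar is splatted to an <8 x i16> vector and an i32
       // scalar to a <4 x i32> vector.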
3387   unsigned Elements = 128 / V->getType()->getPrimitiveSizeInBits();
3388   return Builder.CreateVectorSplat(Elements, V);
3389 }
3390 
3391 static llvm::Value *ARMMVEVectorReinterpret(CGBuilderTy &Builder,
3392                                             CodeGenFunction *CGF,
3393                                             llvm::Value *V,
3394                                             llvm::Type *DestType) {
3395   // Convert one MVE vector type into another by reinterpreting its in-register
3396   // format.
3397   //
3398   // On little-endian targets this is identical to a bitcast (which
3399   // reinterprets the memory format). But on big-endian targets the two are
3400   // not necessarily the same, because the register and memory formats map to
3401   // each other differently depending on the lane size.
3402   //
3403   // We generate a bitcast whenever we can (if we're little-endian, or if the
3404   // lane sizes are the same anyway). Otherwise we fall back to an IR intrinsic
3405   // that performs the different kind of reinterpretation.
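       // E.g. reinterpreting <16 x i8> as <4 x i32> on big-endian goes through
       // @llvm.arm.mve.vreinterpretq, while <4 x i32> to <4 x float> (same
       // lane size) stays a plain bitcast.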
3406   if (CGF->getTarget().isBigEndian() &&
3407       V->getType()->getScalarSizeInBits() != DestType->getScalarSizeInBits()) {
3408     return Builder.CreateCall(
3409         CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vreinterpretq,
3410                               {DestType, V->getType()}),
3411         V);
3412   } else {
3413     return Builder.CreateBitCast(V, DestType);
3414   }
3415 }
3416 
3417 static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V, bool Odd) {
3418   // Make a shufflevector that extracts every other element of a vector (evens
3419   // or odds, as desired).
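       // E.g. for an 8-element input, Odd == false selects lanes {0,2,4,6}
       // and Odd == true selects lanes {1,3,5,7}.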
3420   SmallVector<int, 16> Indices;
3421   unsigned InputElements =
3422       cast<llvm::FixedVectorType>(V->getType())->getNumElements();
3423   for (unsigned i = 0; i < InputElements; i += 2)
3424     Indices.push_back(i + Odd);
3425   return Builder.CreateShuffleVector(V, Indices);
3426 }
3427 
3428 static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
3429                               llvm::Value *V1) {
3430   // Make a shufflevector that interleaves two vectors element by element.
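       // E.g. zipping 4-element vectors {a0,a1,a2,a3} and {b0,b1,b2,b3} uses
       // indices {0,4,1,5,2,6,3,7}, producing {a0,b0,a1,b1,a2,b2,a3,b3}.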
3431   assert(V0->getType() == V1->getType() && "Can't zip different vector types");
3432   SmallVector<int, 16> Indices;
3433   unsigned InputElements =
3434       cast<llvm::FixedVectorType>(V0->getType())->getNumElements();
3435   for (unsigned i = 0; i < InputElements; i++) {
3436     Indices.push_back(i);
3437     Indices.push_back(i + InputElements);
3438   }
3439   return Builder.CreateShuffleVector(V0, V1, Indices);
3440 }
3441 
3442 template<unsigned HighBit, unsigned OtherBits>
3443 static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
3444   // MVE-specific helper function to make a vector splat of a constant such as
3445   // UINT_MAX or INT_MIN, in which all bits below the highest one are equal.
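       // E.g. with 16-bit lanes, <HighBit=1, OtherBits=0> splats 0x8000
       // (INT16_MIN) and <HighBit=0, OtherBits=1> splats 0x7fff (INT16_MAX).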
3446   llvm::Type *T = cast<llvm::VectorType>(VT)->getElementType();
3447   unsigned LaneBits = T->getPrimitiveSizeInBits();
3448   uint32_t Value = HighBit << (LaneBits - 1);
3449   if (OtherBits)
3450     Value |= (1UL << (LaneBits - 1)) - 1;
3451   llvm::Value *Lane = llvm::ConstantInt::get(T, Value);
3452   return ARMMVEVectorSplat(Builder, Lane);
3453 }
3454 
3455 static llvm::Value *ARMMVEVectorElementReverse(CGBuilderTy &Builder,
3456                                                llvm::Value *V,
3457                                                unsigned ReverseWidth) {
3458   // MVE-specific helper function which reverses the elements of a
3459   // vector within every (ReverseWidth)-bit collection of lanes.
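       // E.g. with 8-bit lanes and ReverseWidth == 32, Mask is 3 and the
       // indices are {3,2,1,0, 7,6,5,4, ...}, i.e. a byte reversal within
       // each 32-bit group.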
3460   SmallVector<int, 16> Indices;
3461   unsigned LaneSize = V->getType()->getScalarSizeInBits();
3462   unsigned Elements = 128 / LaneSize;
3463   unsigned Mask = ReverseWidth / LaneSize - 1;
3464   for (unsigned i = 0; i < Elements; i++)
3465     Indices.push_back(i ^ Mask);
3466   return Builder.CreateShuffleVector(V, Indices);
3467 }
3468 
3469 Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
3470                                               const CallExpr *E,
3471                                               ReturnValueSlot ReturnValue,
3472                                               llvm::Triple::ArchType Arch) {
3473   enum class CustomCodeGen { VLD24, VST24 } CustomCodeGenType;
3474   Intrinsic::ID IRIntr;
3475   unsigned NumVectors;
3476 
3477   // Code autogenerated by Tablegen will handle all the simple builtins.
3478   switch (BuiltinID) {
3479     #include "clang/Basic/arm_mve_builtin_cg.inc"
3480 
3481     // If we didn't match an MVE builtin id at all, go back to the
3482     // main EmitARMBuiltinExpr.
3483   default:
3484     return nullptr;
3485   }
3486 
3487   // Anything that breaks from that switch is an MVE builtin that
3488   // needs handwritten code to generate.
3489 
3490   switch (CustomCodeGenType) {
3491 
3492   case CustomCodeGen::VLD24: {
3493     llvm::SmallVector<Value *, 4> Ops;
3494     llvm::SmallVector<llvm::Type *, 4> Tys;
3495 
3496     auto MvecCType = E->getType();
3497     auto MvecLType = ConvertType(MvecCType);
3498     assert(MvecLType->isStructTy() &&
3499            "Return type for vld[24]q should be a struct");
3500     assert(MvecLType->getStructNumElements() == 1 &&
3501            "Return-type struct for vld[24]q should have one element");
3502     auto MvecLTypeInner = MvecLType->getStructElementType(0);
3503     assert(MvecLTypeInner->isArrayTy() &&
3504            "Return-type struct for vld[24]q should contain an array");
3505     assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
3506            "Array member of return-type struct vld[24]q has wrong length");
3507     auto VecLType = MvecLTypeInner->getArrayElementType();
3508 
3509     Tys.push_back(VecLType);
3510 
3511     auto Addr = E->getArg(0);
3512     Ops.push_back(EmitScalarExpr(Addr));
3513     Tys.push_back(ConvertType(Addr->getType()));
3514 
3515     Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
3516     Value *LoadResult = Builder.CreateCall(F, Ops);
3517     Value *MvecOut = PoisonValue::get(MvecLType);
3518     for (unsigned i = 0; i < NumVectors; ++i) {
3519       Value *Vec = Builder.CreateExtractValue(LoadResult, i);
3520       MvecOut = Builder.CreateInsertValue(MvecOut, Vec, {0, i});
3521     }
3522 
3523     if (ReturnValue.isNull())
3524       return MvecOut;
3525     else
3526       return Builder.CreateStore(MvecOut, ReturnValue.getAddress());
3527   }
3528 
3529   case CustomCodeGen::VST24: {
3530     llvm::SmallVector<Value *, 4> Ops;
3531     llvm::SmallVector<llvm::Type *, 4> Tys;
3532 
3533     auto Addr = E->getArg(0);
3534     Ops.push_back(EmitScalarExpr(Addr));
3535     Tys.push_back(ConvertType(Addr->getType()));
3536 
3537     auto MvecCType = E->getArg(1)->getType();
3538     auto MvecLType = ConvertType(MvecCType);
3539   assert(MvecLType->isStructTy() && "Data type for vst[24]q should be a struct");
3540   assert(MvecLType->getStructNumElements() == 1 &&
3541          "Data-type struct for vst[24]q should have one element");
3542   auto MvecLTypeInner = MvecLType->getStructElementType(0);
3543   assert(MvecLTypeInner->isArrayTy() &&
3544          "Data-type struct for vst[24]q should contain an array");
3545   assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
3546          "Array member of data-type struct for vst[24]q has wrong length");
3547     auto VecLType = MvecLTypeInner->getArrayElementType();
3548 
3549     Tys.push_back(VecLType);
3550 
3551     AggValueSlot MvecSlot = CreateAggTemp(MvecCType);
3552     EmitAggExpr(E->getArg(1), MvecSlot);
3553     auto Mvec = Builder.CreateLoad(MvecSlot.getAddress());
3554     for (unsigned i = 0; i < NumVectors; i++)
3555       Ops.push_back(Builder.CreateExtractValue(Mvec, {0, i}));
3556 
3557     Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
3558     Value *ToReturn = nullptr;
3559     for (unsigned i = 0; i < NumVectors; i++) {
3560       Ops.push_back(llvm::ConstantInt::get(Int32Ty, i));
3561       ToReturn = Builder.CreateCall(F, Ops);
3562       Ops.pop_back();
3563     }
3564     return ToReturn;
3565   }
3566   }
3567   llvm_unreachable("unknown custom codegen type.");
3568 }
3569 
3570 Value *CodeGenFunction::EmitARMCDEBuiltinExpr(unsigned BuiltinID,
3571                                               const CallExpr *E,
3572                                               ReturnValueSlot ReturnValue,
3573                                               llvm::Triple::ArchType Arch) {
3574   switch (BuiltinID) {
3575   default:
3576     return nullptr;
3577 #include "clang/Basic/arm_cde_builtin_cg.inc"
3578   }
3579 }
3580 
3581 static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
3582                                         const CallExpr *E,
3583                                         SmallVectorImpl<Value *> &Ops,
3584                                         llvm::Triple::ArchType Arch) {
3585   unsigned int Int = 0;
3586   const char *s = nullptr;
3587 
3588   switch (BuiltinID) {
3589   default:
3590     return nullptr;
3591   case NEON::BI__builtin_neon_vtbl1_v:
3592   case NEON::BI__builtin_neon_vqtbl1_v:
3593   case NEON::BI__builtin_neon_vqtbl1q_v:
3594   case NEON::BI__builtin_neon_vtbl2_v:
3595   case NEON::BI__builtin_neon_vqtbl2_v:
3596   case NEON::BI__builtin_neon_vqtbl2q_v:
3597   case NEON::BI__builtin_neon_vtbl3_v:
3598   case NEON::BI__builtin_neon_vqtbl3_v:
3599   case NEON::BI__builtin_neon_vqtbl3q_v:
3600   case NEON::BI__builtin_neon_vtbl4_v:
3601   case NEON::BI__builtin_neon_vqtbl4_v:
3602   case NEON::BI__builtin_neon_vqtbl4q_v:
3603     break;
3604   case NEON::BI__builtin_neon_vtbx1_v:
3605   case NEON::BI__builtin_neon_vqtbx1_v:
3606   case NEON::BI__builtin_neon_vqtbx1q_v:
3607   case NEON::BI__builtin_neon_vtbx2_v:
3608   case NEON::BI__builtin_neon_vqtbx2_v:
3609   case NEON::BI__builtin_neon_vqtbx2q_v:
3610   case NEON::BI__builtin_neon_vtbx3_v:
3611   case NEON::BI__builtin_neon_vqtbx3_v:
3612   case NEON::BI__builtin_neon_vqtbx3q_v:
3613   case NEON::BI__builtin_neon_vtbx4_v:
3614   case NEON::BI__builtin_neon_vqtbx4_v:
3615   case NEON::BI__builtin_neon_vqtbx4q_v:
3616     break;
3617   }
3618 
3619   assert(E->getNumArgs() >= 3);
3620 
3621   // Get the last argument, which specifies the vector type.
3622   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
3623   std::optional<llvm::APSInt> Result =
3624       Arg->getIntegerConstantExpr(CGF.getContext());
3625   if (!Result)
3626     return nullptr;
3627 
3628   // Determine the type of this overloaded NEON intrinsic.
3629   NeonTypeFlags Type = Result->getZExtValue();
3630   llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type);
3631   if (!Ty)
3632     return nullptr;
3633 
3634   CodeGen::CGBuilderTy &Builder = CGF.Builder;
3635 
3636   // AArch64 scalar builtins are not overloaded; they do not have an extra
3637   // argument that specifies the vector type, so we handle each case here.
3638   switch (BuiltinID) {
3639   case NEON::BI__builtin_neon_vtbl1_v: {
3640     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 1), nullptr, Ops[1],
3641                               Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
3642   }
3643   case NEON::BI__builtin_neon_vtbl2_v: {
3644     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 2), nullptr, Ops[2],
3645                               Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
3646   }
3647   case NEON::BI__builtin_neon_vtbl3_v: {
3648     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 3), nullptr, Ops[3],
3649                               Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
3650   }
3651   case NEON::BI__builtin_neon_vtbl4_v: {
3652     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 4), nullptr, Ops[4],
3653                               Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
3654   }
3655   case NEON::BI__builtin_neon_vtbx1_v: {
3656     Value *TblRes =
3657         packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 1), nullptr, Ops[2], Ty,
3658                            Intrinsic::aarch64_neon_tbl1, "vtbl1");
3659 
3660     llvm::Constant *EightV = ConstantInt::get(Ty, 8);
3661     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
3662     CmpRes = Builder.CreateSExt(CmpRes, Ty);
3663 
3664     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
3665     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
3666     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
3667   }
3668   case NEON::BI__builtin_neon_vtbx2_v: {
3669     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 2), Ops[0], Ops[3],
3670                               Ty, Intrinsic::aarch64_neon_tbx1, "vtbx1");
3671   }
3672   case NEON::BI__builtin_neon_vtbx3_v: {
3673     Value *TblRes =
3674         packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 3), nullptr, Ops[4], Ty,
3675                            Intrinsic::aarch64_neon_tbl2, "vtbl2");
3676 
3677     llvm::Constant *TwentyFourV = ConstantInt::get(Ty, 24);
3678     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
3679                                        TwentyFourV);
3680     CmpRes = Builder.CreateSExt(CmpRes, Ty);
3681 
3682     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
3683     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
3684     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
3685   }
3686   case NEON::BI__builtin_neon_vtbx4_v: {
3687     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 4), Ops[0], Ops[5],
3688                               Ty, Intrinsic::aarch64_neon_tbx2, "vtbx2");
3689   }
3690   case NEON::BI__builtin_neon_vqtbl1_v:
3691   case NEON::BI__builtin_neon_vqtbl1q_v:
3692     Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break;
3693   case NEON::BI__builtin_neon_vqtbl2_v:
3694   case NEON::BI__builtin_neon_vqtbl2q_v:
3695     Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break;
3696   case NEON::BI__builtin_neon_vqtbl3_v:
3697   case NEON::BI__builtin_neon_vqtbl3q_v:
3698     Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break;
3699   case NEON::BI__builtin_neon_vqtbl4_v:
3700   case NEON::BI__builtin_neon_vqtbl4q_v:
3701     Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break;
3702   case NEON::BI__builtin_neon_vqtbx1_v:
3703   case NEON::BI__builtin_neon_vqtbx1q_v:
3704     Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break;
3705   case NEON::BI__builtin_neon_vqtbx2_v:
3706   case NEON::BI__builtin_neon_vqtbx2q_v:
3707     Int = Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break;
3708   case NEON::BI__builtin_neon_vqtbx3_v:
3709   case NEON::BI__builtin_neon_vqtbx3q_v:
3710     Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break;
3711   case NEON::BI__builtin_neon_vqtbx4_v:
3712   case NEON::BI__builtin_neon_vqtbx4q_v:
3713     Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break;
3715   }
3716 
3717   if (!Int)
3718     return nullptr;
3719 
3720   Function *F = CGF.CGM.getIntrinsic(Int, Ty);
3721   return CGF.EmitNeonCall(F, Ops, s);
3722 }
3723 
3724 Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
3725   auto *VTy = llvm::FixedVectorType::get(Int16Ty, 4);
3726   Op = Builder.CreateBitCast(Op, Int16Ty);
3727   Value *V = PoisonValue::get(VTy);
3728   llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
3729   Op = Builder.CreateInsertElement(V, Op, CI);
3730   return Op;
3731 }
3732 
3733 /// SVEBuiltinMemEltTy - Returns the memory element type for this memory
3734 /// access builtin.  Only required if it can't be inferred from the base pointer
3735 /// operand.
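     /// E.g. an extending load such as svld1sb_s64 reads i8 elements from
     /// memory and sign-extends them to i64 lanes, so its memory element type
     /// is Int8Ty rather than Int64Ty.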
3736 llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(const SVETypeFlags &TypeFlags) {
3737   switch (TypeFlags.getMemEltType()) {
3738   case SVETypeFlags::MemEltTyDefault:
3739     return getEltType(TypeFlags);
3740   case SVETypeFlags::MemEltTyInt8:
3741     return Builder.getInt8Ty();
3742   case SVETypeFlags::MemEltTyInt16:
3743     return Builder.getInt16Ty();
3744   case SVETypeFlags::MemEltTyInt32:
3745     return Builder.getInt32Ty();
3746   case SVETypeFlags::MemEltTyInt64:
3747     return Builder.getInt64Ty();
3748   }
3749   llvm_unreachable("Unknown MemEltType");
3750 }
3751 
3752 llvm::Type *CodeGenFunction::getEltType(const SVETypeFlags &TypeFlags) {
3753   switch (TypeFlags.getEltType()) {
3754   default:
3755     llvm_unreachable("Invalid SVETypeFlag!");
3756 
3757   case SVETypeFlags::EltTyMFloat8:
3758   case SVETypeFlags::EltTyInt8:
3759     return Builder.getInt8Ty();
3760   case SVETypeFlags::EltTyInt16:
3761     return Builder.getInt16Ty();
3762   case SVETypeFlags::EltTyInt32:
3763     return Builder.getInt32Ty();
3764   case SVETypeFlags::EltTyInt64:
3765     return Builder.getInt64Ty();
3766   case SVETypeFlags::EltTyInt128:
3767     return Builder.getInt128Ty();
3768 
3769   case SVETypeFlags::EltTyFloat16:
3770     return Builder.getHalfTy();
3771   case SVETypeFlags::EltTyFloat32:
3772     return Builder.getFloatTy();
3773   case SVETypeFlags::EltTyFloat64:
3774     return Builder.getDoubleTy();
3775 
3776   case SVETypeFlags::EltTyBFloat16:
3777     return Builder.getBFloatTy();
3778 
3779   case SVETypeFlags::EltTyBool8:
3780   case SVETypeFlags::EltTyBool16:
3781   case SVETypeFlags::EltTyBool32:
3782   case SVETypeFlags::EltTyBool64:
3783     return Builder.getInt1Ty();
3784   }
3785 }
3786 
3787 // Return the llvm predicate vector type corresponding to the specified element
3788 // TypeFlags.
3789 llvm::ScalableVectorType *
3790 CodeGenFunction::getSVEPredType(const SVETypeFlags &TypeFlags) {
3791   switch (TypeFlags.getEltType()) {
3792   default: llvm_unreachable("Unhandled SVETypeFlag!");
3793 
3794   case SVETypeFlags::EltTyInt8:
3795     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
3796   case SVETypeFlags::EltTyInt16:
3797     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
3798   case SVETypeFlags::EltTyInt32:
3799     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
3800   case SVETypeFlags::EltTyInt64:
3801     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
3802 
3803   case SVETypeFlags::EltTyBFloat16:
3804     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
3805   case SVETypeFlags::EltTyFloat16:
3806     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
3807   case SVETypeFlags::EltTyFloat32:
3808     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
3809   case SVETypeFlags::EltTyFloat64:
3810     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
3811 
3812   case SVETypeFlags::EltTyBool8:
3813     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
3814   case SVETypeFlags::EltTyBool16:
3815     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
3816   case SVETypeFlags::EltTyBool32:
3817     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
3818   case SVETypeFlags::EltTyBool64:
3819     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
3820   }
3821 }
3822 
3823 // Return the llvm vector type corresponding to the specified element TypeFlags.
3824 llvm::ScalableVectorType *
3825 CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
3826   switch (TypeFlags.getEltType()) {
3827   default:
3828     llvm_unreachable("Invalid SVETypeFlag!");
3829 
3830   case SVETypeFlags::EltTyInt8:
3831     return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
3832   case SVETypeFlags::EltTyInt16:
3833     return llvm::ScalableVectorType::get(Builder.getInt16Ty(), 8);
3834   case SVETypeFlags::EltTyInt32:
3835     return llvm::ScalableVectorType::get(Builder.getInt32Ty(), 4);
3836   case SVETypeFlags::EltTyInt64:
3837     return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);
3838 
3839   case SVETypeFlags::EltTyMFloat8:
3840     return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
3841   case SVETypeFlags::EltTyFloat16:
3842     return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
3843   case SVETypeFlags::EltTyBFloat16:
3844     return llvm::ScalableVectorType::get(Builder.getBFloatTy(), 8);
3845   case SVETypeFlags::EltTyFloat32:
3846     return llvm::ScalableVectorType::get(Builder.getFloatTy(), 4);
3847   case SVETypeFlags::EltTyFloat64:
3848     return llvm::ScalableVectorType::get(Builder.getDoubleTy(), 2);
3849 
3850   case SVETypeFlags::EltTyBool8:
3851     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
3852   case SVETypeFlags::EltTyBool16:
3853     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
3854   case SVETypeFlags::EltTyBool32:
3855     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
3856   case SVETypeFlags::EltTyBool64:
3857     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
3858   }
3859 }
3860 
3861 llvm::Value *
3862 CodeGenFunction::EmitSVEAllTruePred(const SVETypeFlags &TypeFlags) {
3863   Function *Ptrue =
3864       CGM.getIntrinsic(Intrinsic::aarch64_sve_ptrue, getSVEPredType(TypeFlags));
3865   return Builder.CreateCall(Ptrue, {Builder.getInt32(/*SV_ALL*/ 31)});
3866 }
3867 
3868 constexpr unsigned SVEBitsPerBlock = 128;
3869 
3870 static llvm::ScalableVectorType *getSVEVectorForElementType(llvm::Type *EltTy) {
3871   unsigned NumElts = SVEBitsPerBlock / EltTy->getScalarSizeInBits();
3872   return llvm::ScalableVectorType::get(EltTy, NumElts);
3873 }
3874 
3875 // Reinterpret the input predicate so that it can be used to correctly isolate
3876 // the elements of the specified datatype.
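     // E.g. an svbool_t predicate arrives as <vscale x 16 x i1>; before it can
     // govern an operation on <vscale x 2 x i64> data it must be narrowed to
     // <vscale x 2 x i1> via @llvm.aarch64.sve.convert.from.svbool.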
3877 Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
3878                                              llvm::ScalableVectorType *VTy) {
3879 
3880   if (isa<TargetExtType>(Pred->getType()) &&
3881       cast<TargetExtType>(Pred->getType())->getName() == "aarch64.svcount")
3882     return Pred;
3883 
3884   auto *RTy = llvm::VectorType::get(IntegerType::get(getLLVMContext(), 1), VTy);
3885   if (Pred->getType() == RTy)
3886     return Pred;
3887 
3888   unsigned IntID;
3889   llvm::Type *IntrinsicTy;
3890   switch (VTy->getMinNumElements()) {
3891   default:
3892     llvm_unreachable("unsupported element count!");
3893   case 1:
3894   case 2:
3895   case 4:
3896   case 8:
3897     IntID = Intrinsic::aarch64_sve_convert_from_svbool;
3898     IntrinsicTy = RTy;
3899     break;
3900   case 16:
3901     IntID = Intrinsic::aarch64_sve_convert_to_svbool;
3902     IntrinsicTy = Pred->getType();
3903     break;
3904   }
3905 
3906   Function *F = CGM.getIntrinsic(IntID, IntrinsicTy);
3907   Value *C = Builder.CreateCall(F, Pred);
3908   assert(C->getType() == RTy && "Unexpected return type!");
3909   return C;
3910 }
3911 
3912 Value *CodeGenFunction::EmitSVEPredicateTupleCast(Value *PredTuple,
3913                                                   llvm::StructType *Ty) {
3914   if (PredTuple->getType() == Ty)
3915     return PredTuple;
3916 
3917   Value *Ret = llvm::PoisonValue::get(Ty);
3918   for (unsigned I = 0; I < Ty->getNumElements(); ++I) {
3919     Value *Pred = Builder.CreateExtractValue(PredTuple, I);
3920     Pred = EmitSVEPredicateCast(
3921         Pred, cast<llvm::ScalableVectorType>(Ty->getTypeAtIndex(I)));
3922     Ret = Builder.CreateInsertValue(Ret, Pred, I);
3923   }
3924 
3925   return Ret;
3926 }
3927 
3928 Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
3929                                           SmallVectorImpl<Value *> &Ops,
3930                                           unsigned IntID) {
3931   auto *ResultTy = getSVEType(TypeFlags);
3932   auto *OverloadedTy =
3933       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), ResultTy);
3934 
3935   Function *F = nullptr;
3936   if (Ops[1]->getType()->isVectorTy())
3937     // This is the "vector base, scalar offset" case. In order to uniquely
3938     // map this built-in to an LLVM IR intrinsic, we need both the return type
3939     // and the type of the vector base.
3940     F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[1]->getType()});
3941   else
3942     // This is the "scalar base, vector offset case". The type of the offset
3943     // is encoded in the name of the intrinsic. We only need to specify the
3944     // return type in order to uniquely map this built-in to an LLVM IR
3945     // intrinsic.
3946     F = CGM.getIntrinsic(IntID, OverloadedTy);
3947 
3948   // At the ACLE level there's only one predicate type, svbool_t, which is
3949   // mapped to <n x 16 x i1>. However, this might be incompatible with the
3950   // actual type being loaded. For example, when loading 64-bit elements the
3951   // predicate should be <n x 2 x i1> instead. At the IR level the type of
3952   // the predicate and the data being loaded must match. Cast to the type
3953   // expected by the intrinsic. The intrinsic itself should be defined in
3954   // a way that enforces relations between parameter types.
3955   Ops[0] = EmitSVEPredicateCast(
3956       Ops[0], cast<llvm::ScalableVectorType>(F->getArg(0)->getType()));
3957 
3958   // Pass 0 when the offset is missing. This can only be applied when using
3959   // the "vector base" addressing mode for which ACLE allows no offset. The
3960   // corresponding LLVM IR always requires an offset.
3961   if (Ops.size() == 2) {
3962     assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
3963     Ops.push_back(ConstantInt::get(Int64Ty, 0));
3964   }
3965 
3966   // For "vector base, scalar index" scale the index so that it becomes a
3967   // scalar offset.
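       // E.g. with 32-bit memory elements, an index of n becomes the byte
       // offset n << 2.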
3968   if (!TypeFlags.isByteIndexed() && Ops[1]->getType()->isVectorTy()) {
3969     unsigned BytesPerElt =
3970         OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
3971     Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
3972   }
3973 
3974   Value *Call = Builder.CreateCall(F, Ops);
3975 
3976   // The following sext/zext is only needed when ResultTy != OverloadedTy. In
3977   // other cases it's folded into a nop.
3978   return TypeFlags.isZExtReturn() ? Builder.CreateZExt(Call, ResultTy)
3979                                   : Builder.CreateSExt(Call, ResultTy);
3980 }
3981 
3982 Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags,
3983                                             SmallVectorImpl<Value *> &Ops,
3984                                             unsigned IntID) {
3985   auto *SrcDataTy = getSVEType(TypeFlags);
3986   auto *OverloadedTy =
3987       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), SrcDataTy);
3988 
3989   // In ACLE the source data is passed in the last argument, whereas in LLVM IR
3990   // it's the first argument. Move it accordingly.
3991   Ops.insert(Ops.begin(), Ops.pop_back_val());
3992 
3993   Function *F = nullptr;
3994   if (Ops[2]->getType()->isVectorTy())
3995     // This is the "vector base, scalar offset" case. In order to uniquely
3996     // map this built-in to an LLVM IR intrinsic, we need both the return type
3997     // and the type of the vector base.
3998     F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[2]->getType()});
3999   else
4000     // This is the "scalar base, vector offset case". The type of the offset
4001     // is encoded in the name of the intrinsic. We only need to specify the
4002     // return type in order to uniquely map this built-in to an LLVM IR
4003     // intrinsic.
4004     F = CGM.getIntrinsic(IntID, OverloadedTy);
4005 
4006   // Pass 0 when the offset is missing. This can only be applied when using
4007   // the "vector base" addressing mode for which ACLE allows no offset. The
4008   // corresponding LLVM IR always requires an offset.
4009   if (Ops.size() == 3) {
4010     assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
4011     Ops.push_back(ConstantInt::get(Int64Ty, 0));
4012   }
4013 
4014   // Truncation is needed when SrcDataTy != OverloadedTy. In other cases it's
4015   // folded into a nop.
4016   Ops[0] = Builder.CreateTrunc(Ops[0], OverloadedTy);
4017 
4018   // At the ACLE level there's only one predicate type, svbool_t, which is
4019   // mapped to <n x 16 x i1>. However, this might be incompatible with the
4020   // actual type being stored. For example, when storing 64-bit elements the
4021   // predicate should be <n x 2 x i1> instead. At the IR level the type of
4022   // the predicate and the data being stored must match. Cast to the type
4023   // expected by the intrinsic. The intrinsic itself should be defined in
4024   // a way that enforces relations between parameter types.
4025   Ops[1] = EmitSVEPredicateCast(
4026       Ops[1], cast<llvm::ScalableVectorType>(F->getArg(1)->getType()));
4027 
4028   // For "vector base, scalar index" scale the index so that it becomes a
4029   // scalar offset.
4030   if (!TypeFlags.isByteIndexed() && Ops[2]->getType()->isVectorTy()) {
4031     unsigned BytesPerElt =
4032         OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
4033     Ops[3] = Builder.CreateShl(Ops[3], Log2_32(BytesPerElt));
4034   }
4035 
4036   return Builder.CreateCall(F, Ops);
4037 }
4038 
4039 Value *CodeGenFunction::EmitSVEGatherPrefetch(const SVETypeFlags &TypeFlags,
4040                                               SmallVectorImpl<Value *> &Ops,
4041                                               unsigned IntID) {
4042   // The gather prefetches are overloaded on the vector input - this can either
4043   // be the vector of base addresses or vector of offsets.
4044   auto *OverloadedTy = dyn_cast<llvm::ScalableVectorType>(Ops[1]->getType());
4045   if (!OverloadedTy)
4046     OverloadedTy = cast<llvm::ScalableVectorType>(Ops[2]->getType());
4047 
4048   // Cast the predicate from svbool_t to the right number of elements.
4049   Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy);
4050 
4051   // vector + imm addressing modes
4052   if (Ops[1]->getType()->isVectorTy()) {
4053     if (Ops.size() == 3) {
4054       // Pass 0 for 'vector+imm' when the index is omitted.
4055       Ops.push_back(ConstantInt::get(Int64Ty, 0));
4056 
4057       // The sv_prfop is the last operand in the builtin and IR intrinsic.
4058       std::swap(Ops[2], Ops[3]);
4059     } else {
4060       // Index needs to be passed as scaled offset.
4061       llvm::Type *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
4062       unsigned BytesPerElt = MemEltTy->getPrimitiveSizeInBits() / 8;
4063       if (BytesPerElt > 1)
4064         Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
4065     }
4066   }
4067 
4068   Function *F = CGM.getIntrinsic(IntID, OverloadedTy);
4069   return Builder.CreateCall(F, Ops);
4070 }
4071 
4072 Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
4073                                           SmallVectorImpl<Value*> &Ops,
4074                                           unsigned IntID) {
4075   llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
4076   Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
4077   Value *BasePtr = Ops[1];
4078 
4079   // Does the load have an offset?
4080   if (Ops.size() > 2)
4081     BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);
4082 
4083   Function *F = CGM.getIntrinsic(IntID, {VTy});
4084   return Builder.CreateCall(F, {Predicate, BasePtr});
4085 }
4086 
4087 Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
4088                                            SmallVectorImpl<Value*> &Ops,
4089                                            unsigned IntID) {
4090   llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
4091 
4092   unsigned N;
4093   switch (IntID) {
4094   case Intrinsic::aarch64_sve_st2:
4095   case Intrinsic::aarch64_sve_st1_pn_x2:
4096   case Intrinsic::aarch64_sve_stnt1_pn_x2:
4097   case Intrinsic::aarch64_sve_st2q:
4098     N = 2;
4099     break;
4100   case Intrinsic::aarch64_sve_st3:
4101   case Intrinsic::aarch64_sve_st3q:
4102     N = 3;
4103     break;
4104   case Intrinsic::aarch64_sve_st4:
4105   case Intrinsic::aarch64_sve_st1_pn_x4:
4106   case Intrinsic::aarch64_sve_stnt1_pn_x4:
4107   case Intrinsic::aarch64_sve_st4q:
4108     N = 4;
4109     break;
4110   default:
4111     llvm_unreachable("unknown intrinsic!");
4112   }
4113 
4114   Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
4115   Value *BasePtr = Ops[1];
4116 
4117   // Does the store have an offset?
4118   if (Ops.size() > (2 + N))
4119     BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);
4120 
4121   // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we
4122   // need to break up the tuple vector.
4123   SmallVector<llvm::Value*, 5> Operands;
4124   for (unsigned I = Ops.size() - N; I < Ops.size(); ++I)
4125     Operands.push_back(Ops[I]);
4126   Operands.append({Predicate, BasePtr});
4127   Function *F = CGM.getIntrinsic(IntID, { VTy });
4128 
4129   return Builder.CreateCall(F, Operands);
4130 }
4131 
4132 // SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and
4133 // svpmullt_pair intrinsics, with the exception that their results are bitcast
4134 // to a wider type.
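     // E.g. svpmullb_u16 multiplies the even u8 lanes with the pair-wise
     // intrinsic (overloaded on the <vscale x 16 x i8> operand type) and then
     // reinterprets the result as <vscale x 8 x i16>.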
4135 Value *CodeGenFunction::EmitSVEPMull(const SVETypeFlags &TypeFlags,
4136                                      SmallVectorImpl<Value *> &Ops,
4137                                      unsigned BuiltinID) {
4138   // Splat scalar operand to vector (intrinsics with _n infix)
4139   if (TypeFlags.hasSplatOperand()) {
4140     unsigned OpNo = TypeFlags.getSplatOperand();
4141     Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
4142   }
4143 
4144   // The pair-wise function has a narrower overloaded type.
4145   Function *F = CGM.getIntrinsic(BuiltinID, Ops[0]->getType());
4146   Value *Call = Builder.CreateCall(F, {Ops[0], Ops[1]});
4147 
4148   // Now bitcast to the wider result type.
4149   llvm::ScalableVectorType *Ty = getSVEType(TypeFlags);
4150   return EmitSVEReinterpret(Call, Ty);
4151 }
4152 
4153 Value *CodeGenFunction::EmitSVEMovl(const SVETypeFlags &TypeFlags,
4154                                     ArrayRef<Value *> Ops, unsigned BuiltinID) {
4155   llvm::Type *OverloadedTy = getSVEType(TypeFlags);
4156   Function *F = CGM.getIntrinsic(BuiltinID, OverloadedTy);
4157   return Builder.CreateCall(F, {Ops[0], Builder.getInt32(0)});
4158 }
4159 
4160 Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
4161                                             SmallVectorImpl<Value *> &Ops,
4162                                             unsigned BuiltinID) {
4163   auto *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
4164   auto *VectorTy = getSVEVectorForElementType(MemEltTy);
4165   auto *MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
4166 
4167   Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
4168   Value *BasePtr = Ops[1];
4169 
4170   // Fold the index operand into the base pointer if it is not omitted.
4171   if (Ops.size() > 3)
4172     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
4173 
4174   Value *PrfOp = Ops.back();
4175 
4176   Function *F = CGM.getIntrinsic(BuiltinID, Predicate->getType());
4177   return Builder.CreateCall(F, {Predicate, BasePtr, PrfOp});
4178 }
4179 
4180 Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
4181                                           llvm::Type *ReturnTy,
4182                                           SmallVectorImpl<Value *> &Ops,
4183                                           unsigned IntrinsicID,
4184                                           bool IsZExtReturn) {
4185   QualType LangPTy = E->getArg(1)->getType();
4186   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
4187       LangPTy->castAs<PointerType>()->getPointeeType());
4188 
4189   // Mfloat8 types are stored as a vector, so extra work
4190   // is needed to extract the scalar element type.
4191   if (MemEltTy->isVectorTy()) {
4192     assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) &&
4193            "Only <1 x i8> expected");
4194     MemEltTy = cast<llvm::VectorType>(MemEltTy)->getElementType();
4195   }
4196 
4197   // The vector type that is returned may be different from the
4198   // eventual type loaded from memory.
4199   auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
4200   llvm::ScalableVectorType *MemoryTy = nullptr;
4201   llvm::ScalableVectorType *PredTy = nullptr;
4202   bool IsQuadLoad = false;
4203   switch (IntrinsicID) {
4204   case Intrinsic::aarch64_sve_ld1uwq:
4205   case Intrinsic::aarch64_sve_ld1udq:
4206     MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
4207     PredTy = llvm::ScalableVectorType::get(
4208         llvm::Type::getInt1Ty(getLLVMContext()), 1);
4209     IsQuadLoad = true;
4210     break;
4211   default:
4212     MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
4213     PredTy = MemoryTy;
4214     break;
4215   }
4216 
4217   Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
4218   Value *BasePtr = Ops[1];
4219 
4220   // Does the load have an offset?
4221   if (Ops.size() > 2)
4222     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
4223 
4224   Function *F = CGM.getIntrinsic(IntrinsicID, IsQuadLoad ? VectorTy : MemoryTy);
4225   auto *Load =
4226       cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
4227   auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
4228   CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);
4229 
4230   if (IsQuadLoad)
4231     return Load;
4232 
4233   return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
4234                       : Builder.CreateSExt(Load, VectorTy);
4235 }
4236 
4237 Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
4238                                            SmallVectorImpl<Value *> &Ops,
4239                                            unsigned IntrinsicID) {
4240   QualType LangPTy = E->getArg(1)->getType();
4241   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
4242       LangPTy->castAs<PointerType>()->getPointeeType());
4243 
4244   // Mfloat8 types are stored as a vector, so extra work
4245   // is needed to extract the scalar element type.
4246   if (MemEltTy->isVectorTy()) {
4247     assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) &&
4248            "Only <1 x i8> expected");
4249     MemEltTy = cast<llvm::VectorType>(MemEltTy)->getElementType();
4250   }
4251 
4252   // The vector type that is stored may be different from the
4253   // eventual type stored to memory.
4254   auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
4255   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
4256 
4257   auto PredTy = MemoryTy;
4258   auto AddrMemoryTy = MemoryTy;
4259   bool IsQuadStore = false;
4260 
4261   switch (IntrinsicID) {
4262   case Intrinsic::aarch64_sve_st1wq:
4263   case Intrinsic::aarch64_sve_st1dq:
4264     AddrMemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
4265     PredTy =
4266         llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
4267     IsQuadStore = true;
4268     break;
4269   default:
4270     break;
4271   }
4272   Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
4273   Value *BasePtr = Ops[1];
4274 
4275   // Does the store have an offset?
4276   if (Ops.size() == 4)
4277     BasePtr = Builder.CreateGEP(AddrMemoryTy, BasePtr, Ops[2]);
4278 
4279   // Last value is always the data
4280   Value *Val =
4281       IsQuadStore ? Ops.back() : Builder.CreateTrunc(Ops.back(), MemoryTy);
4282 
4283   Function *F =
4284       CGM.getIntrinsic(IntrinsicID, IsQuadStore ? VectorTy : MemoryTy);
4285   auto *Store =
4286       cast<llvm::Instruction>(Builder.CreateCall(F, {Val, Predicate, BasePtr}));
4287   auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
4288   CGM.DecorateInstructionWithTBAA(Store, TBAAInfo);
4289   return Store;
4290 }
4291 
4292 Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags,
4293                                       SmallVectorImpl<Value *> &Ops,
4294                                       unsigned IntID) {
4295   Ops[2] = EmitSVEPredicateCast(
4296       Ops[2], getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags)));
4297 
4298   SmallVector<Value *> NewOps;
4299   NewOps.push_back(Ops[2]);
4300 
4301   llvm::Value *BasePtr = Ops[3];
4302   llvm::Value *RealSlice = Ops[1];
4303   // If the intrinsic contains the vnum parameter, multiply it by the vector
4304   // size in bytes.
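       // E.g. with vnum == 2 the base pointer is advanced by 2 * cntsb()
       // bytes and the tile-slice index by 2.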
4305   if (Ops.size() == 5) {
4306     Function *StreamingVectorLength =
4307         CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
4308     llvm::Value *StreamingVectorLengthCall =
4309         Builder.CreateCall(StreamingVectorLength);
4310     llvm::Value *Mulvl =
4311         Builder.CreateMul(StreamingVectorLengthCall, Ops[4], "mulvl");
4312     // The type of the ptr parameter is void *, so use Int8Ty here.
4313     BasePtr = Builder.CreateGEP(Int8Ty, Ops[3], Mulvl);
4314     RealSlice = Builder.CreateZExt(RealSlice, Int64Ty);
4315     RealSlice = Builder.CreateAdd(RealSlice, Ops[4]);
4316     RealSlice = Builder.CreateTrunc(RealSlice, Int32Ty);
4317   }
4318   NewOps.push_back(BasePtr);
4319   NewOps.push_back(Ops[0]);
4320   NewOps.push_back(RealSlice);
4321   Function *F = CGM.getIntrinsic(IntID);
4322   return Builder.CreateCall(F, NewOps);
4323 }
4324 
4325 Value *CodeGenFunction::EmitSMEReadWrite(const SVETypeFlags &TypeFlags,
4326                                          SmallVectorImpl<Value *> &Ops,
4327                                          unsigned IntID) {
4328   auto *VecTy = getSVEType(TypeFlags);
4329   Function *F = CGM.getIntrinsic(IntID, VecTy);
4330   if (TypeFlags.isReadZA())
4331     Ops[1] = EmitSVEPredicateCast(Ops[1], VecTy);
4332   else if (TypeFlags.isWriteZA())
4333     Ops[2] = EmitSVEPredicateCast(Ops[2], VecTy);
4334   return Builder.CreateCall(F, Ops);
4335 }
4336 
4337 Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags,
4338                                     SmallVectorImpl<Value *> &Ops,
4339                                     unsigned IntID) {
4340   // The svzero_za() intrinsic zeros the entire ZA tile and has no parameters.
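       // (A mask of 255 == 0xff selects all eight 64-bit ZA tiles.)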
4341   if (Ops.size() == 0)
4342     Ops.push_back(llvm::ConstantInt::get(Int32Ty, 255));
4343   Function *F = CGM.getIntrinsic(IntID, {});
4344   return Builder.CreateCall(F, Ops);
4345 }
4346 
4347 Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags,
4348                                       SmallVectorImpl<Value *> &Ops,
4349                                       unsigned IntID) {
4350   if (Ops.size() == 2)
4351     Ops.push_back(Builder.getInt32(0));
4352   else
4353     Ops[2] = Builder.CreateIntCast(Ops[2], Int32Ty, true);
4354   Function *F = CGM.getIntrinsic(IntID, {});
4355   return Builder.CreateCall(F, Ops);
4356 }
4357 
4358   // Splat a scalar operand across a scalable vector type; this is the
4359   // entry point the ACLE codegen uses for splat operands (_n intrinsics).
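     // E.g. splatting an i32 scalar with no explicit vector type produces a
     // <vscale x 4 x i32> splat.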
4360 Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
4361   return Builder.CreateVectorSplat(
4362       cast<llvm::VectorType>(Ty)->getElementCount(), Scalar);
4363 }
4364 
4365 Value *CodeGenFunction::EmitSVEDupX(Value *Scalar) {
4366   if (auto *Ty = Scalar->getType(); Ty->isVectorTy()) {
4367 #ifndef NDEBUG
4368     auto *VecTy = cast<llvm::VectorType>(Ty);
4369     ElementCount EC = VecTy->getElementCount();
4370     assert(EC.isScalar() && VecTy->getElementType() == Int8Ty &&
4371            "Only <1 x i8> expected");
4372 #endif
4373     Scalar = Builder.CreateExtractElement(Scalar, uint64_t(0));
4374   }
4375   return EmitSVEDupX(Scalar, getSVEVectorForElementType(Scalar->getType()));
4376 }
4377 
4378 Value *CodeGenFunction::EmitSVEReinterpret(Value *Val, llvm::Type *Ty) {
4379   // FIXME: For big endian this needs an additional REV, or needs a separate
4380   // intrinsic that is code-generated as a no-op, because the LLVM bitcast
4381   // instruction is defined as 'bitwise' equivalent from memory point of
4382   // view (when storing/reloading), whereas the svreinterpret builtin
4383   // implements bitwise equivalent cast from register point of view.
4384   // LLVM CodeGen for a bitcast must add an explicit REV for big-endian.
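       // E.g. svreinterpret_s64_s8 becomes a single bitcast here, from
       // <vscale x 16 x i8> to <vscale x 2 x i64>, which is only equivalent to
       // the register-view reinterpretation on little-endian targets.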
4385 
4386   if (auto *StructTy = dyn_cast<StructType>(Ty)) {
4387     Value *Tuple = llvm::PoisonValue::get(Ty);
4388 
4389     for (unsigned I = 0; I < StructTy->getNumElements(); ++I) {
4390       Value *In = Builder.CreateExtractValue(Val, I);
4391       Value *Out = Builder.CreateBitCast(In, StructTy->getTypeAtIndex(I));
4392       Tuple = Builder.CreateInsertValue(Tuple, Out, I);
4393     }
4394 
4395     return Tuple;
4396   }
4397 
4398   return Builder.CreateBitCast(Val, Ty);
4399 }
4400 
4401 static void InsertExplicitZeroOperand(CGBuilderTy &Builder, llvm::Type *Ty,
4402                                       SmallVectorImpl<Value *> &Ops) {
4403   auto *SplatZero = Constant::getNullValue(Ty);
4404   Ops.insert(Ops.begin(), SplatZero);
4405 }
4406 
4407 static void InsertExplicitUndefOperand(CGBuilderTy &Builder, llvm::Type *Ty,
4408                                        SmallVectorImpl<Value *> &Ops) {
4409   auto *SplatUndef = UndefValue::get(Ty);
4410   Ops.insert(Ops.begin(), SplatUndef);
4411 }
4412 
4413 SmallVector<llvm::Type *, 2>
4414 CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
4415                                      llvm::Type *ResultType,
4416                                      ArrayRef<Value *> Ops) {
4417   if (TypeFlags.isOverloadNone())
4418     return {};
4419 
4420   llvm::Type *DefaultType = getSVEType(TypeFlags);
4421 
4422   if (TypeFlags.isOverloadWhileOrMultiVecCvt())
4423     return {DefaultType, Ops[1]->getType()};
4424 
4425   if (TypeFlags.isOverloadWhileRW())
4426     return {getSVEPredType(TypeFlags), Ops[0]->getType()};
4427 
4428   if (TypeFlags.isOverloadCvt())
4429     return {Ops[0]->getType(), Ops.back()->getType()};
4430 
4431   if (TypeFlags.isReductionQV() && !ResultType->isScalableTy() &&
4432       ResultType->isVectorTy())
4433     return {ResultType, Ops[1]->getType()};
4434 
4435   assert(TypeFlags.isOverloadDefault() && "Unexpected value for overloads");
4436   return {DefaultType};
4437 }
4438 
4439 Value *CodeGenFunction::EmitSVETupleSetOrGet(const SVETypeFlags &TypeFlags,
4440                                              ArrayRef<Value *> Ops) {
4441   assert((TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) &&
4442          "Expects TypeFlags.isTupleSet() or TypeFlags.isTupleGet()");
4443   unsigned Idx = cast<ConstantInt>(Ops[1])->getZExtValue();
4444 
4445   if (TypeFlags.isTupleSet())
4446     return Builder.CreateInsertValue(Ops[0], Ops[2], Idx);
4447   return Builder.CreateExtractValue(Ops[0], Idx);
4448 }
4449 
4450 Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags,
4451                                            llvm::Type *Ty,
4452                                            ArrayRef<Value *> Ops) {
4453   assert(TypeFlags.isTupleCreate() && "Expects TypeFlags.isTupleCreate()");
4454 
4455   Value *Tuple = llvm::PoisonValue::get(Ty);
4456   for (unsigned Idx = 0; Idx < Ops.size(); Idx++)
4457     Tuple = Builder.CreateInsertValue(Tuple, Ops[Idx], Idx);
4458 
4459   return Tuple;
4460 }
4461 
4462 void CodeGenFunction::GetAArch64SVEProcessedOperands(
4463     unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
4464     SVETypeFlags TypeFlags) {
4465   // Find out if any arguments are required to be integer constant expressions.
4466   unsigned ICEArguments = 0;
4467   ASTContext::GetBuiltinTypeError Error;
4468   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
4469   assert(Error == ASTContext::GE_None && "Should not codegen an error");
4470 
4471   // Tuple set/get only requires one insert/extract vector, which is
4472   // created by EmitSVETupleSetOrGet.
4473   bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
4474 
4475   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
4476     bool IsICE = ICEArguments & (1 << i);
4477     Value *Arg = EmitScalarExpr(E->getArg(i));
4478 
4479     if (IsICE) {
4480       // If this is required to be a constant, constant fold it so that we know
4481       // that the generated intrinsic gets a ConstantInt.
4482       std::optional<llvm::APSInt> Result =
4483           E->getArg(i)->getIntegerConstantExpr(getContext());
4484       assert(Result && "Expected argument to be a constant");
4485 
4486       // Immediates for SVE llvm intrinsics are always 32-bit.  We can safely
4487       // truncate because the immediate has been range checked and no valid
4488       // immediate requires more than a handful of bits.
4489       *Result = Result->extOrTrunc(32);
4490       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
4491       continue;
4492     }
4493 
4494     if (isa<StructType>(Arg->getType()) && !IsTupleGetOrSet) {
4495       for (unsigned I = 0; I < Arg->getType()->getStructNumElements(); ++I)
4496         Ops.push_back(Builder.CreateExtractValue(Arg, I));
4497 
4498       continue;
4499     }
4500 
4501     Ops.push_back(Arg);
4502   }
4503 }
4504 
4505 Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
4506                                                   const CallExpr *E) {
4507   llvm::Type *Ty = ConvertType(E->getType());
4508   if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
4509       BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64_x4) {
4510     Value *Val = EmitScalarExpr(E->getArg(0));
4511     return EmitSVEReinterpret(Val, Ty);
4512   }
4513 
4514   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID,
4515                                               AArch64SVEIntrinsicsProvenSorted);
4516 
4517   llvm::SmallVector<Value *, 4> Ops;
4518   SVETypeFlags TypeFlags(Builtin->TypeModifier);
4519   GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
4520 
4521   if (TypeFlags.isLoad())
4522     return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic,
4523                              TypeFlags.isZExtReturn());
4524   else if (TypeFlags.isStore())
4525     return EmitSVEMaskedStore(E, Ops, Builtin->LLVMIntrinsic);
4526   else if (TypeFlags.isGatherLoad())
4527     return EmitSVEGatherLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4528   else if (TypeFlags.isScatterStore())
4529     return EmitSVEScatterStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4530   else if (TypeFlags.isPrefetch())
4531     return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4532   else if (TypeFlags.isGatherPrefetch())
4533     return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4534   else if (TypeFlags.isStructLoad())
4535     return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4536   else if (TypeFlags.isStructStore())
4537     return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4538   else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet())
4539     return EmitSVETupleSetOrGet(TypeFlags, Ops);
4540   else if (TypeFlags.isTupleCreate())
4541     return EmitSVETupleCreate(TypeFlags, Ty, Ops);
4542   else if (TypeFlags.isUndef())
4543     return UndefValue::get(Ty);
4544   else if (Builtin->LLVMIntrinsic != 0) {
4545     // Emit a call to set FPMR for intrinsics that require it.
4546     if (TypeFlags.setsFPMR())
4547       Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
4548                          Ops.pop_back_val());
4549     if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp)
4550       InsertExplicitZeroOperand(Builder, Ty, Ops);
4551 
4552     if (TypeFlags.getMergeType() == SVETypeFlags::MergeAnyExp)
4553       InsertExplicitUndefOperand(Builder, Ty, Ops);
4554 
4555     // Some ACLE builtins leave out the argument to specify the predicate
4556     // pattern, which is expected to be expanded to an SV_ALL pattern.
4557     if (TypeFlags.isAppendSVALL())
4558       Ops.push_back(Builder.getInt32(/*SV_ALL*/ 31));
4559     if (TypeFlags.isInsertOp1SVALL())
4560       Ops.insert(&Ops[1], Builder.getInt32(/*SV_ALL*/ 31));
4561 
4562     // Predicates must match the main datatype.
4563     for (Value *&Op : Ops)
4564       if (auto PredTy = dyn_cast<llvm::VectorType>(Op->getType()))
4565         if (PredTy->getElementType()->isIntegerTy(1))
4566           Op = EmitSVEPredicateCast(Op, getSVEType(TypeFlags));
4567 
4568     // Splat scalar operand to vector (intrinsics with _n infix)
4569     if (TypeFlags.hasSplatOperand()) {
4570       unsigned OpNo = TypeFlags.getSplatOperand();
4571       Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
4572     }
4573 
4574     if (TypeFlags.isReverseCompare())
4575       std::swap(Ops[1], Ops[2]);
4576     else if (TypeFlags.isReverseUSDOT())
4577       std::swap(Ops[1], Ops[2]);
4578     else if (TypeFlags.isReverseMergeAnyBinOp() &&
4579              TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
4580       std::swap(Ops[1], Ops[2]);
4581     else if (TypeFlags.isReverseMergeAnyAccOp() &&
4582              TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
4583       std::swap(Ops[1], Ops[3]);
4584 
4585     // Predicated intrinsics with _z suffix need a select w/ zeroinitializer.
4586     if (TypeFlags.getMergeType() == SVETypeFlags::MergeZero) {
4587       llvm::Type *OpndTy = Ops[1]->getType();
4588       auto *SplatZero = Constant::getNullValue(OpndTy);
4589       Ops[1] = Builder.CreateSelect(Ops[0], Ops[1], SplatZero);
4590     }
4591 
4592     Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic,
4593                                    getSVEOverloadTypes(TypeFlags, Ty, Ops));
4594     Value *Call = Builder.CreateCall(F, Ops);
4595 
4596     if (Call->getType() == Ty)
4597       return Call;
4598 
4599     // Predicate results must be converted to svbool_t.
4600     if (auto PredTy = dyn_cast<llvm::ScalableVectorType>(Ty))
4601       return EmitSVEPredicateCast(Call, PredTy);
4602     if (auto PredTupleTy = dyn_cast<llvm::StructType>(Ty))
4603       return EmitSVEPredicateTupleCast(Call, PredTupleTy);
4604 
4605     llvm_unreachable("unsupported element count!");
4606   }
4607 
4608   switch (BuiltinID) {
4609   default:
4610     return nullptr;
4611 
4612   case SVE::BI__builtin_sve_svreinterpret_b: {
4613     auto SVCountTy =
4614         llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
4615     Function *CastFromSVCountF =
4616         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy);
4617     return Builder.CreateCall(CastFromSVCountF, Ops[0]);
4618   }
4619   case SVE::BI__builtin_sve_svreinterpret_c: {
4620     auto SVCountTy =
4621         llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
4622     Function *CastToSVCountF =
4623         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy);
4624     return Builder.CreateCall(CastToSVCountF, Ops[0]);
4625   }
4626 
4627   case SVE::BI__builtin_sve_svpsel_lane_b8:
4628   case SVE::BI__builtin_sve_svpsel_lane_b16:
4629   case SVE::BI__builtin_sve_svpsel_lane_b32:
4630   case SVE::BI__builtin_sve_svpsel_lane_b64:
4631   case SVE::BI__builtin_sve_svpsel_lane_c8:
4632   case SVE::BI__builtin_sve_svpsel_lane_c16:
4633   case SVE::BI__builtin_sve_svpsel_lane_c32:
4634   case SVE::BI__builtin_sve_svpsel_lane_c64: {
4635     bool IsSVCount = isa<TargetExtType>(Ops[0]->getType());
4636     assert((!IsSVCount || cast<TargetExtType>(Ops[0]->getType())->getName() ==
4637                               "aarch64.svcount") &&
4638            "Unexpected TargetExtType");
4639     auto SVCountTy =
4640         llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
4641     Function *CastFromSVCountF =
4642         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy);
4643     Function *CastToSVCountF =
4644         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy);
4645 
4646     auto OverloadedTy = getSVEType(SVETypeFlags(Builtin->TypeModifier));
4647     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_psel, OverloadedTy);
4648     llvm::Value *Ops0 =
4649         IsSVCount ? Builder.CreateCall(CastFromSVCountF, Ops[0]) : Ops[0];
4650     llvm::Value *Ops1 = EmitSVEPredicateCast(Ops[1], OverloadedTy);
4651     llvm::Value *PSel = Builder.CreateCall(F, {Ops0, Ops1, Ops[2]});
4652     return IsSVCount ? Builder.CreateCall(CastToSVCountF, PSel) : PSel;
4653   }
4654   case SVE::BI__builtin_sve_svmov_b_z: {
4655     // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op)
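    // (AND of op with itself is op, and the zeroing predication clears lanes
    // where pg is false, which is exactly a predicated MOV /Z.)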
4656     SVETypeFlags TypeFlags(Builtin->TypeModifier);
4657     llvm::Type* OverloadedTy = getSVEType(TypeFlags);
4658     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_and_z, OverloadedTy);
4659     return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[1]});
4660   }
4661 
4662   case SVE::BI__builtin_sve_svnot_b_z: {
4663     // svnot_b_z(pg, op) <=> sveor_b_z(pg, op, pg)
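    // (Within active lanes pg is true, so op ^ pg inverts op; inactive lanes
    // are zeroed by the predication.)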
4664     SVETypeFlags TypeFlags(Builtin->TypeModifier);
4665     llvm::Type* OverloadedTy = getSVEType(TypeFlags);
4666     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_eor_z, OverloadedTy);
4667     return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[0]});
4668   }
4669 
4670   case SVE::BI__builtin_sve_svmovlb_u16:
4671   case SVE::BI__builtin_sve_svmovlb_u32:
4672   case SVE::BI__builtin_sve_svmovlb_u64:
4673     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllb);
4674 
4675   case SVE::BI__builtin_sve_svmovlb_s16:
4676   case SVE::BI__builtin_sve_svmovlb_s32:
4677   case SVE::BI__builtin_sve_svmovlb_s64:
4678     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllb);
4679 
4680   case SVE::BI__builtin_sve_svmovlt_u16:
4681   case SVE::BI__builtin_sve_svmovlt_u32:
4682   case SVE::BI__builtin_sve_svmovlt_u64:
4683     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllt);
4684 
4685   case SVE::BI__builtin_sve_svmovlt_s16:
4686   case SVE::BI__builtin_sve_svmovlt_s32:
4687   case SVE::BI__builtin_sve_svmovlt_s64:
4688     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllt);
4689 
4690   case SVE::BI__builtin_sve_svpmullt_u16:
4691   case SVE::BI__builtin_sve_svpmullt_u64:
4692   case SVE::BI__builtin_sve_svpmullt_n_u16:
4693   case SVE::BI__builtin_sve_svpmullt_n_u64:
4694     return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullt_pair);
4695 
4696   case SVE::BI__builtin_sve_svpmullb_u16:
4697   case SVE::BI__builtin_sve_svpmullb_u64:
4698   case SVE::BI__builtin_sve_svpmullb_n_u16:
4699   case SVE::BI__builtin_sve_svpmullb_n_u64:
4700     return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullb_pair);
4701 
4702   case SVE::BI__builtin_sve_svdup_n_b8:
4703   case SVE::BI__builtin_sve_svdup_n_b16:
4704   case SVE::BI__builtin_sve_svdup_n_b32:
4705   case SVE::BI__builtin_sve_svdup_n_b64: {
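    // Materialize a predicate from a scalar bool: compare the scalar against
    // zero, splat the i1 result across the predicate type, then cast to the
    // svbool_t result type. Illustrative for svdup_n_b32(x):
    //   %c = icmp ne i32 %x, 0
    //   %p = splat of %c as <vscale x 4 x i1>, widened to <vscale x 16 x i1>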
4706     Value *CmpNE =
4707         Builder.CreateICmpNE(Ops[0], Constant::getNullValue(Ops[0]->getType()));
4708     llvm::ScalableVectorType *OverloadedTy = getSVEType(TypeFlags);
4709     Value *Dup = EmitSVEDupX(CmpNE, OverloadedTy);
4710     return EmitSVEPredicateCast(Dup, cast<llvm::ScalableVectorType>(Ty));
4711   }
4712 
4713   case SVE::BI__builtin_sve_svdupq_n_b8:
4714   case SVE::BI__builtin_sve_svdupq_n_b16:
4715   case SVE::BI__builtin_sve_svdupq_n_b32:
4716   case SVE::BI__builtin_sve_svdupq_n_b64:
4717   case SVE::BI__builtin_sve_svdupq_n_u8:
4718   case SVE::BI__builtin_sve_svdupq_n_s8:
4719   case SVE::BI__builtin_sve_svdupq_n_u64:
4720   case SVE::BI__builtin_sve_svdupq_n_f64:
4721   case SVE::BI__builtin_sve_svdupq_n_s64:
4722   case SVE::BI__builtin_sve_svdupq_n_u16:
4723   case SVE::BI__builtin_sve_svdupq_n_f16:
4724   case SVE::BI__builtin_sve_svdupq_n_bf16:
4725   case SVE::BI__builtin_sve_svdupq_n_s16:
4726   case SVE::BI__builtin_sve_svdupq_n_u32:
4727   case SVE::BI__builtin_sve_svdupq_n_f32:
4728   case SVE::BI__builtin_sve_svdupq_n_s32: {
4729     // These builtins are implemented by building a fixed-length vector from the
4730     // scalar operands and broadcasting its low quadword with the dupq_lane intrinsic.
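    // Illustrative lowering for svdupq_n_s32(a, b, c, d) (names hypothetical):
    //   %v   = <4 x i32> assembled from a..d
    //   %ins = llvm.vector.insert(<vscale x 4 x i32> poison, %v, i64 0)
    //   %res = llvm.aarch64.sve.dupq.lane(%ins, i64 0)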
4731     unsigned NumOpnds = Ops.size();
4732 
4733     bool IsBoolTy =
4734         cast<llvm::VectorType>(Ty)->getElementType()->isIntegerTy(1);
4735 
4736     // For svdupq_n_b* the element type is an integer of width 128/numelts,
4737     // so that the compare can use the width that is natural for the expected
4738     // number of predicate lanes.
4739     llvm::Type *EltTy = Ops[0]->getType();
4740     if (IsBoolTy)
4741       EltTy = IntegerType::get(getLLVMContext(), SVEBitsPerBlock / NumOpnds);
4742 
4743     SmallVector<llvm::Value *, 16> VecOps;
4744     for (unsigned I = 0; I < NumOpnds; ++I)
4745       VecOps.push_back(Builder.CreateZExt(Ops[I], EltTy));
4746     Value *Vec = BuildVector(VecOps);
4747 
4748     llvm::Type *OverloadedTy = getSVEVectorForElementType(EltTy);
4749     Value *InsertSubVec = Builder.CreateInsertVector(
4750         OverloadedTy, PoisonValue::get(OverloadedTy), Vec, uint64_t(0));
4751 
4752     Function *F =
4753         CGM.getIntrinsic(Intrinsic::aarch64_sve_dupq_lane, OverloadedTy);
4754     Value *DupQLane =
4755         Builder.CreateCall(F, {InsertSubVec, Builder.getInt64(0)});
4756 
4757     if (!IsBoolTy)
4758       return DupQLane;
4759 
4760     SVETypeFlags TypeFlags(Builtin->TypeModifier);
4761     Value *Pred = EmitSVEAllTruePred(TypeFlags);
4762 
4763     // For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
4764     F = CGM.getIntrinsic(NumOpnds == 2 ? Intrinsic::aarch64_sve_cmpne
4765                                        : Intrinsic::aarch64_sve_cmpne_wide,
4766                          OverloadedTy);
4767     Value *Call = Builder.CreateCall(
4768         F, {Pred, DupQLane, EmitSVEDupX(Builder.getInt64(0))});
4769     return EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
4770   }
4771 
4772   case SVE::BI__builtin_sve_svpfalse_b:
4773     return ConstantInt::getFalse(Ty);
4774 
4775   case SVE::BI__builtin_sve_svpfalse_c: {
4776     auto SVBoolTy = ScalableVectorType::get(Builder.getInt1Ty(), 16);
4777     Function *CastToSVCountF =
4778         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, Ty);
4779     return Builder.CreateCall(CastToSVCountF, ConstantInt::getFalse(SVBoolTy));
4780   }
4781 
4782   case SVE::BI__builtin_sve_svlen_bf16:
4783   case SVE::BI__builtin_sve_svlen_f16:
4784   case SVE::BI__builtin_sve_svlen_f32:
4785   case SVE::BI__builtin_sve_svlen_f64:
4786   case SVE::BI__builtin_sve_svlen_s8:
4787   case SVE::BI__builtin_sve_svlen_s16:
4788   case SVE::BI__builtin_sve_svlen_s32:
4789   case SVE::BI__builtin_sve_svlen_s64:
4790   case SVE::BI__builtin_sve_svlen_u8:
4791   case SVE::BI__builtin_sve_svlen_u16:
4792   case SVE::BI__builtin_sve_svlen_u32:
4793   case SVE::BI__builtin_sve_svlen_u64: {
4794     SVETypeFlags TF(Builtin->TypeModifier);
4795     return Builder.CreateElementCount(Ty, getSVEType(TF)->getElementCount());
4796   }
4797 
4798   case SVE::BI__builtin_sve_svtbl2_u8:
4799   case SVE::BI__builtin_sve_svtbl2_s8:
4800   case SVE::BI__builtin_sve_svtbl2_u16:
4801   case SVE::BI__builtin_sve_svtbl2_s16:
4802   case SVE::BI__builtin_sve_svtbl2_u32:
4803   case SVE::BI__builtin_sve_svtbl2_s32:
4804   case SVE::BI__builtin_sve_svtbl2_u64:
4805   case SVE::BI__builtin_sve_svtbl2_s64:
4806   case SVE::BI__builtin_sve_svtbl2_f16:
4807   case SVE::BI__builtin_sve_svtbl2_bf16:
4808   case SVE::BI__builtin_sve_svtbl2_f32:
4809   case SVE::BI__builtin_sve_svtbl2_f64: {
4810     SVETypeFlags TF(Builtin->TypeModifier);
4811     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, getSVEType(TF));
4812     return Builder.CreateCall(F, Ops);
4813   }
4814 
4815   case SVE::BI__builtin_sve_svset_neonq_s8:
4816   case SVE::BI__builtin_sve_svset_neonq_s16:
4817   case SVE::BI__builtin_sve_svset_neonq_s32:
4818   case SVE::BI__builtin_sve_svset_neonq_s64:
4819   case SVE::BI__builtin_sve_svset_neonq_u8:
4820   case SVE::BI__builtin_sve_svset_neonq_u16:
4821   case SVE::BI__builtin_sve_svset_neonq_u32:
4822   case SVE::BI__builtin_sve_svset_neonq_u64:
4823   case SVE::BI__builtin_sve_svset_neonq_f16:
4824   case SVE::BI__builtin_sve_svset_neonq_f32:
4825   case SVE::BI__builtin_sve_svset_neonq_f64:
4826   case SVE::BI__builtin_sve_svset_neonq_bf16: {
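    // The 128-bit NEON value (Ops[1]) is written into the low quadword of the
    // SVE operand (Ops[0]) via llvm.vector.insert at index 0.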
4827     return Builder.CreateInsertVector(Ty, Ops[0], Ops[1], uint64_t(0));
4828   }
4829 
4830   case SVE::BI__builtin_sve_svget_neonq_s8:
4831   case SVE::BI__builtin_sve_svget_neonq_s16:
4832   case SVE::BI__builtin_sve_svget_neonq_s32:
4833   case SVE::BI__builtin_sve_svget_neonq_s64:
4834   case SVE::BI__builtin_sve_svget_neonq_u8:
4835   case SVE::BI__builtin_sve_svget_neonq_u16:
4836   case SVE::BI__builtin_sve_svget_neonq_u32:
4837   case SVE::BI__builtin_sve_svget_neonq_u64:
4838   case SVE::BI__builtin_sve_svget_neonq_f16:
4839   case SVE::BI__builtin_sve_svget_neonq_f32:
4840   case SVE::BI__builtin_sve_svget_neonq_f64:
4841   case SVE::BI__builtin_sve_svget_neonq_bf16: {
4842     return Builder.CreateExtractVector(Ty, Ops[0], uint64_t(0));
4843   }
4844 
4845   case SVE::BI__builtin_sve_svdup_neonq_s8:
4846   case SVE::BI__builtin_sve_svdup_neonq_s16:
4847   case SVE::BI__builtin_sve_svdup_neonq_s32:
4848   case SVE::BI__builtin_sve_svdup_neonq_s64:
4849   case SVE::BI__builtin_sve_svdup_neonq_u8:
4850   case SVE::BI__builtin_sve_svdup_neonq_u16:
4851   case SVE::BI__builtin_sve_svdup_neonq_u32:
4852   case SVE::BI__builtin_sve_svdup_neonq_u64:
4853   case SVE::BI__builtin_sve_svdup_neonq_f16:
4854   case SVE::BI__builtin_sve_svdup_neonq_f32:
4855   case SVE::BI__builtin_sve_svdup_neonq_f64:
4856   case SVE::BI__builtin_sve_svdup_neonq_bf16: {
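    // Broadcast a 128-bit NEON value to every quadword of an SVE vector:
    // insert it at index 0 of a poison vector, then replicate quadword 0
    // with dupq_lane.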
4857     Value *Insert = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
4858                                                uint64_t(0));
4859     return Builder.CreateIntrinsic(Intrinsic::aarch64_sve_dupq_lane, {Ty},
4860                                    {Insert, Builder.getInt64(0)});
4861   }
4862   }
4863 
4864   // Should not happen.
4865   return nullptr;
4866 }
4867 
4868 static void swapCommutativeSMEOperands(unsigned BuiltinID,
4869                                        SmallVectorImpl<Value *> &Ops) {
4870   unsigned MultiVec;
4871   switch (BuiltinID) {
4872   default:
4873     return;
4874   case SME::BI__builtin_sme_svsumla_za32_s8_vg4x1:
4875     MultiVec = 1;
4876     break;
4877   case SME::BI__builtin_sme_svsumla_za32_s8_vg4x2:
4878   case SME::BI__builtin_sme_svsudot_za32_s8_vg1x2:
4879     MultiVec = 2;
4880     break;
4881   case SME::BI__builtin_sme_svsudot_za32_s8_vg1x4:
4882   case SME::BI__builtin_sme_svsumla_za32_s8_vg4x4:
4883     MultiVec = 4;
4884     break;
4885   }
4886 
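  // Ops[0] is the ZA slice; it is followed by MultiVec zn registers and then
  // MultiVec zm registers. Swapping the two groups lets, e.g., an svsudot
  // builtin be emitted with the corresponding usdot intrinsic, since
  // sudot(zn, zm) == usdot(zm, zn).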
4887   if (MultiVec > 0)
4888     for (unsigned I = 0; I < MultiVec; ++I)
4889       std::swap(Ops[I + 1], Ops[I + 1 + MultiVec]);
4890 }
4891 
4892 Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
4893                                                   const CallExpr *E) {
4894   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID,
4895                                               AArch64SMEIntrinsicsProvenSorted);
4896 
4897   llvm::SmallVector<Value *, 4> Ops;
4898   SVETypeFlags TypeFlags(Builtin->TypeModifier);
4899   GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
4900 
4901   if (TypeFlags.isLoad() || TypeFlags.isStore())
4902     return EmitSMELd1St1(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4903   else if (TypeFlags.isReadZA() || TypeFlags.isWriteZA())
4904     return EmitSMEReadWrite(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4905   else if (BuiltinID == SME::BI__builtin_sme_svzero_mask_za ||
4906            BuiltinID == SME::BI__builtin_sme_svzero_za)
4907     return EmitSMEZero(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4908   else if (BuiltinID == SME::BI__builtin_sme_svldr_vnum_za ||
4909            BuiltinID == SME::BI__builtin_sme_svstr_vnum_za ||
4910            BuiltinID == SME::BI__builtin_sme_svldr_za ||
4911            BuiltinID == SME::BI__builtin_sme_svstr_za)
4912     return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4913 
4914   // Emit a call to set FPMR for intrinsics that require it.
4915   if (TypeFlags.setsFPMR())
4916     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
4917                        Ops.pop_back_val());
4918   // Handle builtins which require their multi-vector operands to be swapped
4919   swapCommutativeSMEOperands(BuiltinID, Ops);
4920 
4921   // Should not happen!
4922   if (Builtin->LLVMIntrinsic == 0)
4923     return nullptr;
4924 
4925   if (BuiltinID == SME::BI__builtin_sme___arm_in_streaming_mode) {
4926     // If we already know the streaming mode, don't bother with the intrinsic
4927     // and emit a constant instead
4928     const auto *FD = cast<FunctionDecl>(CurFuncDecl);
4929     if (const auto *FPT = FD->getType()->getAs<FunctionProtoType>()) {
4930       unsigned SMEAttrs = FPT->getAArch64SMEAttributes();
4931       if (!(SMEAttrs & FunctionType::SME_PStateSMCompatibleMask)) {
4932         bool IsStreaming = SMEAttrs & FunctionType::SME_PStateSMEnabledMask;
4933         return ConstantInt::getBool(Builder.getContext(), IsStreaming);
4934       }
4935     }
4936   }
4937 
4938   // Predicates must match the main datatype.
4939   for (Value *&Op : Ops)
4940     if (auto PredTy = dyn_cast<llvm::VectorType>(Op->getType()))
4941       if (PredTy->getElementType()->isIntegerTy(1))
4942         Op = EmitSVEPredicateCast(Op, getSVEType(TypeFlags));
4943 
4944   Function *F =
4945       TypeFlags.isOverloadNone()
4946           ? CGM.getIntrinsic(Builtin->LLVMIntrinsic)
4947           : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)});
4948 
4949   return Builder.CreateCall(F, Ops);
4950 }
4951 
4952 /// Helper for the read/write/add/inc X18 builtins: read the X18 register and
4953 /// return it as an i8 pointer.
4954 Value *readX18AsPtr(CodeGenFunction &CGF) {
4955   LLVMContext &Context = CGF.CGM.getLLVMContext();
4956   llvm::Metadata *Ops[] = {llvm::MDString::get(Context, "x18")};
4957   llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
4958   llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
4959   llvm::Function *F =
4960       CGF.CGM.getIntrinsic(Intrinsic::read_register, {CGF.Int64Ty});
4961   llvm::Value *X18 = CGF.Builder.CreateCall(F, Metadata);
4962   return CGF.Builder.CreateIntToPtr(X18, CGF.Int8PtrTy);
4963 }
4964 
4965 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
4966                                                const CallExpr *E,
4967                                                llvm::Triple::ArchType Arch) {
4968   if (BuiltinID >= clang::AArch64::FirstSVEBuiltin &&
4969       BuiltinID <= clang::AArch64::LastSVEBuiltin)
4970     return EmitAArch64SVEBuiltinExpr(BuiltinID, E);
4971 
4972   if (BuiltinID >= clang::AArch64::FirstSMEBuiltin &&
4973       BuiltinID <= clang::AArch64::LastSMEBuiltin)
4974     return EmitAArch64SMEBuiltinExpr(BuiltinID, E);
4975 
4976   if (BuiltinID == Builtin::BI__builtin_cpu_supports)
4977     return EmitAArch64CpuSupports(E);
4978 
4979   unsigned HintID = static_cast<unsigned>(-1);
4980   switch (BuiltinID) {
4981   default: break;
4982   case clang::AArch64::BI__builtin_arm_nop:
4983     HintID = 0;
4984     break;
4985   case clang::AArch64::BI__builtin_arm_yield:
4986   case clang::AArch64::BI__yield:
4987     HintID = 1;
4988     break;
4989   case clang::AArch64::BI__builtin_arm_wfe:
4990   case clang::AArch64::BI__wfe:
4991     HintID = 2;
4992     break;
4993   case clang::AArch64::BI__builtin_arm_wfi:
4994   case clang::AArch64::BI__wfi:
4995     HintID = 3;
4996     break;
4997   case clang::AArch64::BI__builtin_arm_sev:
4998   case clang::AArch64::BI__sev:
4999     HintID = 4;
5000     break;
5001   case clang::AArch64::BI__builtin_arm_sevl:
5002   case clang::AArch64::BI__sevl:
5003     HintID = 5;
5004     break;
5005   }
5006 
5007   if (HintID != static_cast<unsigned>(-1)) {
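  // The IDs above match the immediate of the AArch64 HINT instruction, so
  // e.g. __wfe() lowers to llvm.aarch64.hint(2).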
5008     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hint);
5009     return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID));
5010   }
5011 
5012   if (BuiltinID == clang::AArch64::BI__builtin_arm_trap) {
5013     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_break);
5014     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5015     return Builder.CreateCall(F, Builder.CreateZExt(Arg, CGM.Int32Ty));
5016   }
5017 
5018   if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) {
5019     // Create call to __arm_sme_state and store the results to the two pointers.
5020     CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction(
5021         llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {},
5022                                 false),
5023         "__arm_sme_state"));
5024     auto Attrs = AttributeList().addFnAttribute(getLLVMContext(),
5025                                                 "aarch64_pstate_sm_compatible");
5026     CI->setAttributes(Attrs);
5027     CI->setCallingConv(
5028         llvm::CallingConv::
5029             AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2);
5030     Builder.CreateStore(Builder.CreateExtractValue(CI, 0),
5031                         EmitPointerWithAlignment(E->getArg(0)));
5032     return Builder.CreateStore(Builder.CreateExtractValue(CI, 1),
5033                                EmitPointerWithAlignment(E->getArg(1)));
5034   }
5035 
5036   if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) {
5037     assert((getContext().getTypeSize(E->getType()) == 32) &&
5038            "rbit of unusual size!");
5039     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5040     return Builder.CreateCall(
5041         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
5042   }
5043   if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit64) {
5044     assert((getContext().getTypeSize(E->getType()) == 64) &&
5045            "rbit of unusual size!");
5046     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5047     return Builder.CreateCall(
5048         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
5049   }
5050 
5051   if (BuiltinID == clang::AArch64::BI__builtin_arm_clz ||
5052       BuiltinID == clang::AArch64::BI__builtin_arm_clz64) {
5053     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5054     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Arg->getType());
5055     Value *Res = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
5056     if (BuiltinID == clang::AArch64::BI__builtin_arm_clz64)
5057       Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
5058     return Res;
5059   }
5060 
5061   if (BuiltinID == clang::AArch64::BI__builtin_arm_cls) {
5062     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5063     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls), Arg,
5064                               "cls");
5065   }
5066   if (BuiltinID == clang::AArch64::BI__builtin_arm_cls64) {
5067     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5068     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls64), Arg,
5069                               "cls");
5070   }
5071 
5072   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32zf ||
5073       BuiltinID == clang::AArch64::BI__builtin_arm_rint32z) {
5074     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5075     llvm::Type *Ty = Arg->getType();
5076     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32z, Ty),
5077                               Arg, "frint32z");
5078   }
5079 
5080   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64zf ||
5081       BuiltinID == clang::AArch64::BI__builtin_arm_rint64z) {
5082     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5083     llvm::Type *Ty = Arg->getType();
5084     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64z, Ty),
5085                               Arg, "frint64z");
5086   }
5087 
5088   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32xf ||
5089       BuiltinID == clang::AArch64::BI__builtin_arm_rint32x) {
5090     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5091     llvm::Type *Ty = Arg->getType();
5092     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32x, Ty),
5093                               Arg, "frint32x");
5094   }
5095 
5096   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64xf ||
5097       BuiltinID == clang::AArch64::BI__builtin_arm_rint64x) {
5098     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5099     llvm::Type *Ty = Arg->getType();
5100     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64x, Ty),
5101                               Arg, "frint64x");
5102   }
5103 
5104   if (BuiltinID == clang::AArch64::BI__builtin_arm_jcvt) {
5105     assert((getContext().getTypeSize(E->getType()) == 32) &&
5106            "__jcvt of unusual size!");
5107     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5108     return Builder.CreateCall(
5109         CGM.getIntrinsic(Intrinsic::aarch64_fjcvtzs), Arg);
5110   }
5111 
5112   if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b ||
5113       BuiltinID == clang::AArch64::BI__builtin_arm_st64b ||
5114       BuiltinID == clang::AArch64::BI__builtin_arm_st64bv ||
5115       BuiltinID == clang::AArch64::BI__builtin_arm_st64bv0) {
5116     llvm::Value *MemAddr = EmitScalarExpr(E->getArg(0));
5117     llvm::Value *ValPtr = EmitScalarExpr(E->getArg(1));
5118 
5119     if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b) {
5120       // Load from the address via an LLVM intrinsic, receiving a
5121       // tuple of 8 i64 words, and store each one to ValPtr.
5122       Function *F = CGM.getIntrinsic(Intrinsic::aarch64_ld64b);
5123       llvm::Value *Val = Builder.CreateCall(F, MemAddr);
5124       llvm::Value *ToRet;
5125       for (size_t i = 0; i < 8; i++) {
5126         llvm::Value *ValOffsetPtr =
5127             Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i));
5128         Address Addr =
5129             Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(8));
5130         ToRet = Builder.CreateStore(Builder.CreateExtractValue(Val, i), Addr);
5131       }
5132       return ToRet;
5133     } else {
5134       // Load 8 i64 words from ValPtr, and store them to the address
5135       // via an LLVM intrinsic.
5136       SmallVector<llvm::Value *, 9> Args;
5137       Args.push_back(MemAddr);
5138       for (size_t i = 0; i < 8; i++) {
5139         llvm::Value *ValOffsetPtr =
5140             Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i));
5141         Address Addr =
5142             Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(8));
5143         Args.push_back(Builder.CreateLoad(Addr));
5144       }
5145 
5146       auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_st64b
5147                        ? Intrinsic::aarch64_st64b
5148                    : BuiltinID == clang::AArch64::BI__builtin_arm_st64bv
5149                        ? Intrinsic::aarch64_st64bv
5150                        : Intrinsic::aarch64_st64bv0);
5151       Function *F = CGM.getIntrinsic(Intr);
5152       return Builder.CreateCall(F, Args);
5153     }
5154   }
5155 
5156   if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
5157       BuiltinID == clang::AArch64::BI__builtin_arm_rndrrs) {
5158 
5159     auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_rndr
5160                      ? Intrinsic::aarch64_rndr
5161                      : Intrinsic::aarch64_rndrrs);
5162     Function *F = CGM.getIntrinsic(Intr);
5163     llvm::Value *Val = Builder.CreateCall(F);
5164     Value *RandomValue = Builder.CreateExtractValue(Val, 0);
5165     Value *Status = Builder.CreateExtractValue(Val, 1);
5166 
5167     Address MemAddress = EmitPointerWithAlignment(E->getArg(0));
5168     Builder.CreateStore(RandomValue, MemAddress);
5169     Status = Builder.CreateZExt(Status, Int32Ty);
5170     return Status;
5171   }
5172 
5173   if (BuiltinID == clang::AArch64::BI__clear_cache) {
5174     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
5175     const FunctionDecl *FD = E->getDirectCallee();
5176     Value *Ops[2];
5177     for (unsigned i = 0; i < 2; i++)
5178       Ops[i] = EmitScalarExpr(E->getArg(i));
5179     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
5180     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
5181     StringRef Name = FD->getName();
5182     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
5183   }
5184 
5185   if ((BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
5186        BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) &&
5187       getContext().getTypeSize(E->getType()) == 128) {
5188     Function *F =
5189         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
5190                              ? Intrinsic::aarch64_ldaxp
5191                              : Intrinsic::aarch64_ldxp);
5192 
5193     Value *LdPtr = EmitScalarExpr(E->getArg(0));
5194     Value *Val = Builder.CreateCall(F, LdPtr, "ldxp");
5195 
5196     Value *Val0 = Builder.CreateExtractValue(Val, 1);
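    // The intrinsic returns the pair as two i64s; element 1 becomes the high
    // half of the i128 result: (Val0 << 64) | Val1.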
5197     Value *Val1 = Builder.CreateExtractValue(Val, 0);
5198     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
5199     Val0 = Builder.CreateZExt(Val0, Int128Ty);
5200     Val1 = Builder.CreateZExt(Val1, Int128Ty);
5201 
5202     Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64);
5203     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
5204     Val = Builder.CreateOr(Val, Val1);
5205     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
5206   } else if (BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
5207              BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) {
5208     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
5209 
5210     QualType Ty = E->getType();
5211     llvm::Type *RealResTy = ConvertType(Ty);
5212     llvm::Type *IntTy =
5213         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
5214 
5215     Function *F =
5216         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
5217                              ? Intrinsic::aarch64_ldaxr
5218                              : Intrinsic::aarch64_ldxr,
5219                          UnqualPtrTy);
5220     CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldxr");
5221     Val->addParamAttr(
5222         0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
5223 
5224     if (RealResTy->isPointerTy())
5225       return Builder.CreateIntToPtr(Val, RealResTy);
5226 
5227     llvm::Type *IntResTy = llvm::IntegerType::get(
5228         getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
5229     return Builder.CreateBitCast(Builder.CreateTruncOrBitCast(Val, IntResTy),
5230                                  RealResTy);
5231   }
5232 
5233   if ((BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
5234        BuiltinID == clang::AArch64::BI__builtin_arm_stlex) &&
5235       getContext().getTypeSize(E->getArg(0)->getType()) == 128) {
5236     Function *F =
5237         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_stlex
5238                              ? Intrinsic::aarch64_stlxp
5239                              : Intrinsic::aarch64_stxp);
5240     llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty);
5241 
5242     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
5243     EmitAnyExprToMem(E->getArg(0), Tmp, Qualifiers(), /*init*/ true);
5244 
5245     Tmp = Tmp.withElementType(STy);
5246     llvm::Value *Val = Builder.CreateLoad(Tmp);
5247 
5248     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
5249     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
5250     Value *StPtr = EmitScalarExpr(E->getArg(1));
5251     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "stxp");
5252   }
5253 
5254   if (BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
5255       BuiltinID == clang::AArch64::BI__builtin_arm_stlex) {
5256     Value *StoreVal = EmitScalarExpr(E->getArg(0));
5257     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
5258 
5259     QualType Ty = E->getArg(0)->getType();
5260     llvm::Type *StoreTy =
5261         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
5262 
5263     if (StoreVal->getType()->isPointerTy())
5264       StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty);
5265     else {
5266       llvm::Type *IntTy = llvm::IntegerType::get(
5267           getLLVMContext(),
5268           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
5269       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
5270       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty);
5271     }
5272 
5273     Function *F =
5274         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_stlex
5275                              ? Intrinsic::aarch64_stlxr
5276                              : Intrinsic::aarch64_stxr,
5277                          StoreAddr->getType());
5278     CallInst *CI = Builder.CreateCall(F, {StoreVal, StoreAddr}, "stxr");
5279     CI->addParamAttr(
5280         1, Attribute::get(getLLVMContext(), Attribute::ElementType, StoreTy));
5281     return CI;
5282   }
5283 
5284   if (BuiltinID == clang::AArch64::BI__getReg) {
5285     Expr::EvalResult Result;
5286     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
5287       llvm_unreachable("Sema will ensure that the parameter is constant");
5288 
5289     llvm::APSInt Value = Result.Val.getInt();
5290     LLVMContext &Context = CGM.getLLVMContext();
5291     std::string Reg = Value == 31 ? "sp" : "x" + toString(Value, 10);
5292 
5293     llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Reg)};
5294     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
5295     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
5296 
5297     llvm::Function *F =
5298         CGM.getIntrinsic(Intrinsic::read_register, {Int64Ty});
5299     return Builder.CreateCall(F, Metadata);
5300   }
5301 
5302   if (BuiltinID == clang::AArch64::BI__break) {
5303     Expr::EvalResult Result;
5304     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
5305       llvm_unreachable("Sema will ensure that the parameter is constant");
5306 
5307     llvm::Function *F = CGM.getIntrinsic(Intrinsic::aarch64_break);
5308     return Builder.CreateCall(F, {EmitScalarExpr(E->getArg(0))});
5309   }
5310 
5311   if (BuiltinID == clang::AArch64::BI__builtin_arm_clrex) {
5312     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_clrex);
5313     return Builder.CreateCall(F);
5314   }
5315 
5316   if (BuiltinID == clang::AArch64::BI_ReadWriteBarrier)
5317     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
5318                                llvm::SyncScope::SingleThread);
5319 
5320   // CRC32
5321   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
5322   switch (BuiltinID) {
5323   case clang::AArch64::BI__builtin_arm_crc32b:
5324     CRCIntrinsicID = Intrinsic::aarch64_crc32b; break;
5325   case clang::AArch64::BI__builtin_arm_crc32cb:
5326     CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break;
5327   case clang::AArch64::BI__builtin_arm_crc32h:
5328     CRCIntrinsicID = Intrinsic::aarch64_crc32h; break;
5329   case clang::AArch64::BI__builtin_arm_crc32ch:
5330     CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break;
5331   case clang::AArch64::BI__builtin_arm_crc32w:
5332     CRCIntrinsicID = Intrinsic::aarch64_crc32w; break;
5333   case clang::AArch64::BI__builtin_arm_crc32cw:
5334     CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break;
5335   case clang::AArch64::BI__builtin_arm_crc32d:
5336     CRCIntrinsicID = Intrinsic::aarch64_crc32x; break;
5337   case clang::AArch64::BI__builtin_arm_crc32cd:
5338     CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break;
5339   }
5340 
5341   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
5342     Value *Arg0 = EmitScalarExpr(E->getArg(0));
5343     Value *Arg1 = EmitScalarExpr(E->getArg(1));
5344     Function *F = CGM.getIntrinsic(CRCIntrinsicID);
5345 
5346     llvm::Type *DataTy = F->getFunctionType()->getParamType(1);
5347     Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy);
5348 
5349     return Builder.CreateCall(F, {Arg0, Arg1});
5350   }
5351 
5352   // Memory Operations (MOPS)
5353   if (BuiltinID == AArch64::BI__builtin_arm_mops_memset_tag) {
5354     Value *Dst = EmitScalarExpr(E->getArg(0));
5355     Value *Val = EmitScalarExpr(E->getArg(1));
5356     Value *Size = EmitScalarExpr(E->getArg(2));
5357     Val = Builder.CreateTrunc(Val, Int8Ty);
5358     Size = Builder.CreateIntCast(Size, Int64Ty, false);
5359     return Builder.CreateCall(
5360         CGM.getIntrinsic(Intrinsic::aarch64_mops_memset_tag), {Dst, Val, Size});
5361   }
5362 
5363   // Memory Tagging Extensions (MTE) Intrinsics
5364   Intrinsic::ID MTEIntrinsicID = Intrinsic::not_intrinsic;
5365   switch (BuiltinID) {
5366   case clang::AArch64::BI__builtin_arm_irg:
5367     MTEIntrinsicID = Intrinsic::aarch64_irg; break;
5368   case clang::AArch64::BI__builtin_arm_addg:
5369     MTEIntrinsicID = Intrinsic::aarch64_addg; break;
5370   case clang::AArch64::BI__builtin_arm_gmi:
5371     MTEIntrinsicID = Intrinsic::aarch64_gmi; break;
5372   case clang::AArch64::BI__builtin_arm_ldg:
5373     MTEIntrinsicID = Intrinsic::aarch64_ldg; break;
5374   case clang::AArch64::BI__builtin_arm_stg:
5375     MTEIntrinsicID = Intrinsic::aarch64_stg; break;
5376   case clang::AArch64::BI__builtin_arm_subp:
5377     MTEIntrinsicID = Intrinsic::aarch64_subp; break;
5378   }
5379 
5380   if (MTEIntrinsicID != Intrinsic::not_intrinsic) {
5381     if (MTEIntrinsicID == Intrinsic::aarch64_irg) {
5382       Value *Pointer = EmitScalarExpr(E->getArg(0));
5383       Value *Mask = EmitScalarExpr(E->getArg(1));
5384 
5385       Mask = Builder.CreateZExt(Mask, Int64Ty);
5386       return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID),
5387                                 {Pointer, Mask});
5388     }
5389     if (MTEIntrinsicID == Intrinsic::aarch64_addg) {
5390       Value *Pointer = EmitScalarExpr(E->getArg(0));
5391       Value *TagOffset = EmitScalarExpr(E->getArg(1));
5392 
5393       TagOffset = Builder.CreateZExt(TagOffset, Int64Ty);
5394       return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID),
5395                                 {Pointer, TagOffset});
5396     }
5397     if (MTEIntrinsicID == Intrinsic::aarch64_gmi) {
5398       Value *Pointer = EmitScalarExpr(E->getArg(0));
5399       Value *ExcludedMask = EmitScalarExpr(E->getArg(1));
5400 
5401       ExcludedMask = Builder.CreateZExt(ExcludedMask, Int64Ty);
5402       return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID),
5403                                 {Pointer, ExcludedMask});
5404     }
5405     // Although it is possible to supply a different return
5406     // address (first arg) to this intrinsic, for now we set the
5407     // return address to be the same as the input address.
5408     if (MTEIntrinsicID == Intrinsic::aarch64_ldg) {
5409       Value *TagAddress = EmitScalarExpr(E->getArg(0));
5410       return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID),
5411                                 {TagAddress, TagAddress});
5412     }
5413     // Although it is possible to supply a different tag (to set)
5414     // to this intrinsic (as first arg), for now we supply the tag
5415     // that is in the input address arg (the common use case).
5416     if (MTEIntrinsicID == Intrinsic::aarch64_stg) {
5417       Value *TagAddress = EmitScalarExpr(E->getArg(0));
5418       return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID),
5419                                 {TagAddress, TagAddress});
5420     }
5421     if (MTEIntrinsicID == Intrinsic::aarch64_subp) {
5422       Value *PointerA = EmitScalarExpr(E->getArg(0));
5423       Value *PointerB = EmitScalarExpr(E->getArg(1));
5424       return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID),
5425                                 {PointerA, PointerB});
5426     }
5427   }
5428 
5429   if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
5430       BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
5431       BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
5432       BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
5433       BuiltinID == clang::AArch64::BI__builtin_arm_wsr ||
5434       BuiltinID == clang::AArch64::BI__builtin_arm_wsr64 ||
5435       BuiltinID == clang::AArch64::BI__builtin_arm_wsr128 ||
5436       BuiltinID == clang::AArch64::BI__builtin_arm_wsrp) {
5437 
5438     SpecialRegisterAccessKind AccessKind = Write;
5439     if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
5440         BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
5441         BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
5442         BuiltinID == clang::AArch64::BI__builtin_arm_rsrp)
5443       AccessKind = VolatileRead;
5444 
5445     bool IsPointerBuiltin = BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
5446                             BuiltinID == clang::AArch64::BI__builtin_arm_wsrp;
5447 
5448     bool Is32Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
5449                    BuiltinID == clang::AArch64::BI__builtin_arm_wsr;
5450 
5451     bool Is128Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
5452                     BuiltinID == clang::AArch64::BI__builtin_arm_wsr128;
5453 
5454     llvm::Type *ValueType;
5455     llvm::Type *RegisterType = Int64Ty;
5456     if (Is32Bit) {
5457       ValueType = Int32Ty;
5458     } else if (Is128Bit) {
5459       llvm::Type *Int128Ty =
5460           llvm::IntegerType::getInt128Ty(CGM.getLLVMContext());
5461       ValueType = Int128Ty;
5462       RegisterType = Int128Ty;
5463     } else if (IsPointerBuiltin) {
5464       ValueType = VoidPtrTy;
5465     } else {
5466       ValueType = Int64Ty;
5467     }
5468 
5469     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
5470                                       AccessKind);
5471   }
5472 
5473   if (BuiltinID == clang::AArch64::BI_ReadStatusReg ||
5474       BuiltinID == clang::AArch64::BI_WriteStatusReg ||
5475       BuiltinID == clang::AArch64::BI__sys) {
5476     LLVMContext &Context = CGM.getLLVMContext();
5477 
5478     unsigned SysReg =
5479       E->getArg(0)->EvaluateKnownConstInt(getContext()).getZExtValue();
5480 
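    // The packed MSVC register constant encodes op1 in bits [13:11], CRn in
    // [10:7], CRm in [6:3] and op2 in [2:0]; op0 carries an implied high bit,
    // so only its low bit is encoded (bit 14). __sys always uses op0 = 1.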
5481     std::string SysRegStr;
5482     unsigned SysRegOp0 = (BuiltinID == clang::AArch64::BI_ReadStatusReg ||
5483                           BuiltinID == clang::AArch64::BI_WriteStatusReg)
5484                              ? ((1 << 1) | ((SysReg >> 14) & 1))
5485                              : 1;
5486     llvm::raw_string_ostream(SysRegStr)
5487         << SysRegOp0 << ":" << ((SysReg >> 11) & 7) << ":"
5488         << ((SysReg >> 7) & 15) << ":" << ((SysReg >> 3) & 15) << ":"
5489         << (SysReg & 7);
5490 
5491     llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysRegStr) };
5492     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
5493     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
5494 
5495     llvm::Type *RegisterType = Int64Ty;
5496     llvm::Type *Types[] = { RegisterType };
5497 
5498     if (BuiltinID == clang::AArch64::BI_ReadStatusReg) {
5499       llvm::Function *F = CGM.getIntrinsic(Intrinsic::read_register, Types);
5500 
5501       return Builder.CreateCall(F, Metadata);
5502     }
5503 
5504     llvm::Function *F = CGM.getIntrinsic(Intrinsic::write_register, Types);
5505     llvm::Value *ArgValue = EmitScalarExpr(E->getArg(1));
5506     llvm::Value *Result = Builder.CreateCall(F, {Metadata, ArgValue});
5507     if (BuiltinID == clang::AArch64::BI__sys) {
5508       // Return 0 for convenience, even though MSVC returns some other undefined
5509       // value.
5510       Result = ConstantInt::get(Builder.getInt32Ty(), 0);
5511     }
5512     return Result;
5513   }
5514 
5515   if (BuiltinID == clang::AArch64::BI_AddressOfReturnAddress) {
5516     llvm::Function *F =
5517         CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
5518     return Builder.CreateCall(F);
5519   }
5520 
5521   if (BuiltinID == clang::AArch64::BI__builtin_sponentry) {
5522     llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy);
5523     return Builder.CreateCall(F);
5524   }
5525 
5526   if (BuiltinID == clang::AArch64::BI__mulh ||
5527       BuiltinID == clang::AArch64::BI__umulh) {
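    // __mulh/__umulh return the high 64 bits of the full 128-bit product:
    // extend both operands to i128, multiply, shift right by 64, truncate.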
5528     llvm::Type *ResType = ConvertType(E->getType());
5529     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
5530 
5531     bool IsSigned = BuiltinID == clang::AArch64::BI__mulh;
5532     Value *LHS =
5533         Builder.CreateIntCast(EmitScalarExpr(E->getArg(0)), Int128Ty, IsSigned);
5534     Value *RHS =
5535         Builder.CreateIntCast(EmitScalarExpr(E->getArg(1)), Int128Ty, IsSigned);
5536 
5537     Value *MulResult, *HigherBits;
5538     if (IsSigned) {
5539       MulResult = Builder.CreateNSWMul(LHS, RHS);
5540       HigherBits = Builder.CreateAShr(MulResult, 64);
5541     } else {
5542       MulResult = Builder.CreateNUWMul(LHS, RHS);
5543       HigherBits = Builder.CreateLShr(MulResult, 64);
5544     }
5545     HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
5546 
5547     return HigherBits;
5548   }
5549 
5550   if (BuiltinID == AArch64::BI__writex18byte ||
5551       BuiltinID == AArch64::BI__writex18word ||
5552       BuiltinID == AArch64::BI__writex18dword ||
5553       BuiltinID == AArch64::BI__writex18qword) {
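    // These store a value at a byte offset from the platform register x18,
    // e.g. __writex18byte(0x28, V) stores the byte V at [x18 + 0x28].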
5554     // Process the args first
5555     Value *OffsetArg = EmitScalarExpr(E->getArg(0));
5556     Value *DataArg = EmitScalarExpr(E->getArg(1));
5557 
5558     // Read x18 as i8*
5559     llvm::Value *X18 = readX18AsPtr(*this);
5560 
5561     // Store val at x18 + offset
5562     Value *Offset = Builder.CreateZExt(OffsetArg, Int64Ty);
5563     Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset);
5564     StoreInst *Store =
5565         Builder.CreateAlignedStore(DataArg, Ptr, CharUnits::One());
5566     return Store;
5567   }
5568 
5569   if (BuiltinID == AArch64::BI__readx18byte ||
5570       BuiltinID == AArch64::BI__readx18word ||
5571       BuiltinID == AArch64::BI__readx18dword ||
5572       BuiltinID == AArch64::BI__readx18qword) {
5573     // Process the args first
5574     Value *OffsetArg = EmitScalarExpr(E->getArg(0));
5575 
5576     // Read x18 as i8*
5577     llvm::Value *X18 = readX18AsPtr(*this);
5578 
5579     // Load x18 + offset
5580     Value *Offset = Builder.CreateZExt(OffsetArg, Int64Ty);
5581     Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset);
5582     llvm::Type *IntTy = ConvertType(E->getType());
5583     LoadInst *Load = Builder.CreateAlignedLoad(IntTy, Ptr, CharUnits::One());
5584     return Load;
5585   }
5586 
5587   if (BuiltinID == AArch64::BI__addx18byte ||
5588       BuiltinID == AArch64::BI__addx18word ||
5589       BuiltinID == AArch64::BI__addx18dword ||
5590       BuiltinID == AArch64::BI__addx18qword ||
5591       BuiltinID == AArch64::BI__incx18byte ||
5592       BuiltinID == AArch64::BI__incx18word ||
5593       BuiltinID == AArch64::BI__incx18dword ||
5594       BuiltinID == AArch64::BI__incx18qword) {
5595     llvm::Type *IntTy;
5596     bool isIncrement;
5597     switch (BuiltinID) {
5598     case AArch64::BI__incx18byte:
5599       IntTy = Int8Ty;
5600       isIncrement = true;
5601       break;
5602     case AArch64::BI__incx18word:
5603       IntTy = Int16Ty;
5604       isIncrement = true;
5605       break;
5606     case AArch64::BI__incx18dword:
5607       IntTy = Int32Ty;
5608       isIncrement = true;
5609       break;
5610     case AArch64::BI__incx18qword:
5611       IntTy = Int64Ty;
5612       isIncrement = true;
5613       break;
5614     default:
5615       IntTy = ConvertType(E->getArg(1)->getType());
5616       isIncrement = false;
5617       break;
5618     }
5619     // Process the args first
5620     Value *OffsetArg = EmitScalarExpr(E->getArg(0));
5621     Value *ValToAdd =
5622         isIncrement ? ConstantInt::get(IntTy, 1) : EmitScalarExpr(E->getArg(1));
5623 
5624     // Read x18 as i8*
5625     llvm::Value *X18 = readX18AsPtr(*this);
5626 
5627     // Load x18 + offset
5628     Value *Offset = Builder.CreateZExt(OffsetArg, Int64Ty);
5629     Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset);
5630     LoadInst *Load = Builder.CreateAlignedLoad(IntTy, Ptr, CharUnits::One());
5631 
5632     // Add values
5633     Value *AddResult = Builder.CreateAdd(Load, ValToAdd);
5634 
5635     // Store val at x18 + offset
5636     StoreInst *Store =
5637         Builder.CreateAlignedStore(AddResult, Ptr, CharUnits::One());
5638     return Store;
5639   }
5640 
5641   if (BuiltinID == AArch64::BI_CopyDoubleFromInt64 ||
5642       BuiltinID == AArch64::BI_CopyFloatFromInt32 ||
5643       BuiltinID == AArch64::BI_CopyInt32FromFloat ||
5644       BuiltinID == AArch64::BI_CopyInt64FromDouble) {
5645     Value *Arg = EmitScalarExpr(E->getArg(0));
5646     llvm::Type *RetTy = ConvertType(E->getType());
5647     return Builder.CreateBitCast(Arg, RetTy);
5648   }
5649 
5650   if (BuiltinID == AArch64::BI_CountLeadingOnes ||
5651       BuiltinID == AArch64::BI_CountLeadingOnes64 ||
5652       BuiltinID == AArch64::BI_CountLeadingZeros ||
5653       BuiltinID == AArch64::BI_CountLeadingZeros64) {
5654     Value *Arg = EmitScalarExpr(E->getArg(0));
5655     llvm::Type *ArgType = Arg->getType();
5656 
5657     if (BuiltinID == AArch64::BI_CountLeadingOnes ||
5658         BuiltinID == AArch64::BI_CountLeadingOnes64)
5659       Arg = Builder.CreateXor(Arg, Constant::getAllOnesValue(ArgType));
5660 
5661     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
5662     Value *Result = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
5663 
5664     if (BuiltinID == AArch64::BI_CountLeadingOnes64 ||
5665         BuiltinID == AArch64::BI_CountLeadingZeros64)
5666       Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
5667     return Result;
5668   }
5669 
5670   if (BuiltinID == AArch64::BI_CountLeadingSigns ||
5671       BuiltinID == AArch64::BI_CountLeadingSigns64) {
5672     Value *Arg = EmitScalarExpr(E->getArg(0));
5673 
5674     Function *F = (BuiltinID == AArch64::BI_CountLeadingSigns)
5675                       ? CGM.getIntrinsic(Intrinsic::aarch64_cls)
5676                       : CGM.getIntrinsic(Intrinsic::aarch64_cls64);
5677 
5678     Value *Result = Builder.CreateCall(F, Arg, "cls");
5679     if (BuiltinID == AArch64::BI_CountLeadingSigns64)
5680       Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
5681     return Result;
5682   }
5683 
5684   if (BuiltinID == AArch64::BI_CountOneBits ||
5685       BuiltinID == AArch64::BI_CountOneBits64) {
5686     Value *ArgValue = EmitScalarExpr(E->getArg(0));
5687     llvm::Type *ArgType = ArgValue->getType();
5688     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
5689 
5690     Value *Result = Builder.CreateCall(F, ArgValue);
5691     if (BuiltinID == AArch64::BI_CountOneBits64)
5692       Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
5693     return Result;
5694   }
5695 
5696   if (BuiltinID == AArch64::BI__prefetch) {
5697     Value *Address = EmitScalarExpr(E->getArg(0));
5698     Value *RW = llvm::ConstantInt::get(Int32Ty, 0);
5699     Value *Locality = ConstantInt::get(Int32Ty, 3);
5700     Value *Data = llvm::ConstantInt::get(Int32Ty, 1);
5701     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
5702     return Builder.CreateCall(F, {Address, RW, Locality, Data});
5703   }
5704 
5705   if (BuiltinID == AArch64::BI__hlt) {
5706     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hlt);
5707     Builder.CreateCall(F, {EmitScalarExpr(E->getArg(0))});
5708 
5709     // Return 0 for convenience, even though MSVC returns some other undefined
5710     // value.
5711     return ConstantInt::get(Builder.getInt32Ty(), 0);
5712   }
5713 
5714   if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
5715     return Builder.CreateFPTrunc(
5716         Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
5717                               Builder.getFloatTy()),
5718         Builder.getBFloatTy());
5719 
5720   // Handle MSVC intrinsics before argument evaluation to prevent double
5721   // evaluation.
5722   if (std::optional<MSVCIntrin> MsvcIntId =
5723           translateAarch64ToMsvcIntrin(BuiltinID))
5724     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
5725 
5726   // Some intrinsics are equivalent; if so, use the base intrinsic ID.
5727   auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) {
5728     return P.first == BuiltinID;
5729   });
5730   if (It != end(NEONEquivalentIntrinsicMap))
5731     BuiltinID = It->second;
5732 
5733   // Find out if any arguments are required to be integer constant
5734   // expressions.
5735   unsigned ICEArguments = 0;
5736   ASTContext::GetBuiltinTypeError Error;
5737   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
5738   assert(Error == ASTContext::GE_None && "Should not codegen an error");
5739 
5740   llvm::SmallVector<Value*, 4> Ops;
5741   Address PtrOp0 = Address::invalid();
5742   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
5743     if (i == 0) {
5744       switch (BuiltinID) {
5745       case NEON::BI__builtin_neon_vld1_v:
5746       case NEON::BI__builtin_neon_vld1q_v:
5747       case NEON::BI__builtin_neon_vld1_dup_v:
5748       case NEON::BI__builtin_neon_vld1q_dup_v:
5749       case NEON::BI__builtin_neon_vld1_lane_v:
5750       case NEON::BI__builtin_neon_vld1q_lane_v:
5751       case NEON::BI__builtin_neon_vst1_v:
5752       case NEON::BI__builtin_neon_vst1q_v:
5753       case NEON::BI__builtin_neon_vst1_lane_v:
5754       case NEON::BI__builtin_neon_vst1q_lane_v:
5755       case NEON::BI__builtin_neon_vldap1_lane_s64:
5756       case NEON::BI__builtin_neon_vldap1q_lane_s64:
5757       case NEON::BI__builtin_neon_vstl1_lane_s64:
5758       case NEON::BI__builtin_neon_vstl1q_lane_s64:
5759         // Get the alignment for the argument in addition to the value;
5760         // we'll use it later.
5761         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
5762         Ops.push_back(PtrOp0.emitRawPointer(*this));
5763         continue;
5764       }
5765     }
5766     Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
5767   }
5768 
5769   auto SISDMap = ArrayRef(AArch64SISDIntrinsicMap);
5770   const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
5771       SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted);
5772 
5773   if (Builtin) {
5774     Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1)));
5775     Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E);
5776     assert(Result && "SISD intrinsic should have been handled");
5777     return Result;
5778   }
5779 
5780   const Expr *Arg = E->getArg(E->getNumArgs()-1);
5781   NeonTypeFlags Type(0);
5782   if (std::optional<llvm::APSInt> Result =
5783           Arg->getIntegerConstantExpr(getContext()))
5784     // Determine the type of this overloaded NEON intrinsic.
5785     Type = NeonTypeFlags(Result->getZExtValue());
5786 
5787   bool usgn = Type.isUnsigned();
5788   bool quad = Type.isQuad();
5789 
5790   // Handle non-overloaded intrinsics first.
5791   switch (BuiltinID) {
5792   default: break;
5793   case NEON::BI__builtin_neon_vabsh_f16:
5794     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5795     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs");
5796   case NEON::BI__builtin_neon_vaddq_p128: {
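    // Polynomial addition over GF(2) is carry-less, so poly128 addition is
    // simply XOR.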
5797     llvm::Type *Ty = GetNeonType(this, NeonTypeFlags::Poly128);
5798     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5799     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5800     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5801     Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
5802     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
5803     return Builder.CreateBitCast(Ops[0], Int128Ty);
5804   }
5805   case NEON::BI__builtin_neon_vldrq_p128: {
5806     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
5807     Value *Ptr = EmitScalarExpr(E->getArg(0));
5808     return Builder.CreateAlignedLoad(Int128Ty, Ptr,
5809                                      CharUnits::fromQuantity(16));
5810   }
5811   case NEON::BI__builtin_neon_vstrq_p128: {
5812     Value *Ptr = Ops[0];
5813     return Builder.CreateDefaultAlignedStore(EmitScalarExpr(E->getArg(1)), Ptr);
5814   }
5815   case NEON::BI__builtin_neon_vcvts_f32_u32:
5816   case NEON::BI__builtin_neon_vcvtd_f64_u64:
5817     usgn = true;
5818     [[fallthrough]];
5819   case NEON::BI__builtin_neon_vcvts_f32_s32:
5820   case NEON::BI__builtin_neon_vcvtd_f64_s64: {
5821     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5822     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
5823     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
5824     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
5825     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
5826     if (usgn)
5827       return Builder.CreateUIToFP(Ops[0], FTy);
5828     return Builder.CreateSIToFP(Ops[0], FTy);
5829   }
5830   case NEON::BI__builtin_neon_vcvth_f16_u16:
5831   case NEON::BI__builtin_neon_vcvth_f16_u32:
5832   case NEON::BI__builtin_neon_vcvth_f16_u64:
5833     usgn = true;
5834     [[fallthrough]];
5835   case NEON::BI__builtin_neon_vcvth_f16_s16:
5836   case NEON::BI__builtin_neon_vcvth_f16_s32:
5837   case NEON::BI__builtin_neon_vcvth_f16_s64: {
5838     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5839     llvm::Type *FTy = HalfTy;
5840     llvm::Type *InTy;
5841     if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
5842       InTy = Int64Ty;
5843     else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
5844       InTy = Int32Ty;
5845     else
5846       InTy = Int16Ty;
5847     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
5848     if (usgn)
5849       return Builder.CreateUIToFP(Ops[0], FTy);
5850     return Builder.CreateSIToFP(Ops[0], FTy);
5851   }
5852   case NEON::BI__builtin_neon_vcvtah_u16_f16:
5853   case NEON::BI__builtin_neon_vcvtmh_u16_f16:
5854   case NEON::BI__builtin_neon_vcvtnh_u16_f16:
5855   case NEON::BI__builtin_neon_vcvtph_u16_f16:
5856   case NEON::BI__builtin_neon_vcvth_u16_f16:
5857   case NEON::BI__builtin_neon_vcvtah_s16_f16:
5858   case NEON::BI__builtin_neon_vcvtmh_s16_f16:
5859   case NEON::BI__builtin_neon_vcvtnh_s16_f16:
5860   case NEON::BI__builtin_neon_vcvtph_s16_f16:
5861   case NEON::BI__builtin_neon_vcvth_s16_f16: {
5862     unsigned Int;
5863     llvm::Type* InTy = Int32Ty;
5864     llvm::Type* FTy  = HalfTy;
5865     llvm::Type *Tys[2] = {InTy, FTy};
5866     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5867     switch (BuiltinID) {
5868     default: llvm_unreachable("missing builtin ID in switch!");
5869     case NEON::BI__builtin_neon_vcvtah_u16_f16:
5870       Int = Intrinsic::aarch64_neon_fcvtau; break;
5871     case NEON::BI__builtin_neon_vcvtmh_u16_f16:
5872       Int = Intrinsic::aarch64_neon_fcvtmu; break;
5873     case NEON::BI__builtin_neon_vcvtnh_u16_f16:
5874       Int = Intrinsic::aarch64_neon_fcvtnu; break;
5875     case NEON::BI__builtin_neon_vcvtph_u16_f16:
5876       Int = Intrinsic::aarch64_neon_fcvtpu; break;
5877     case NEON::BI__builtin_neon_vcvth_u16_f16:
5878       Int = Intrinsic::aarch64_neon_fcvtzu; break;
5879     case NEON::BI__builtin_neon_vcvtah_s16_f16:
5880       Int = Intrinsic::aarch64_neon_fcvtas; break;
5881     case NEON::BI__builtin_neon_vcvtmh_s16_f16:
5882       Int = Intrinsic::aarch64_neon_fcvtms; break;
5883     case NEON::BI__builtin_neon_vcvtnh_s16_f16:
5884       Int = Intrinsic::aarch64_neon_fcvtns; break;
5885     case NEON::BI__builtin_neon_vcvtph_s16_f16:
5886       Int = Intrinsic::aarch64_neon_fcvtps; break;
5887     case NEON::BI__builtin_neon_vcvth_s16_f16:
5888       Int = Intrinsic::aarch64_neon_fcvtzs; break;
5889     }
5890     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt");
5891     return Builder.CreateTrunc(Ops[0], Int16Ty);
5892   }
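       // Scalar f16 absolute compares. Only facge/facgt exist, so the
       // 'le'/'lt' forms are emitted by swapping the operands.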
5893   case NEON::BI__builtin_neon_vcaleh_f16:
5894   case NEON::BI__builtin_neon_vcalth_f16:
5895   case NEON::BI__builtin_neon_vcageh_f16:
5896   case NEON::BI__builtin_neon_vcagth_f16: {
5897     unsigned Int;
5898     llvm::Type *InTy = Int32Ty;
5899     llvm::Type *FTy = HalfTy;
5900     llvm::Type *Tys[2] = {InTy, FTy};
5901     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5902     switch (BuiltinID) {
5903     default: llvm_unreachable("missing builtin ID in switch!");
5904     case NEON::BI__builtin_neon_vcageh_f16:
5905       Int = Intrinsic::aarch64_neon_facge; break;
5906     case NEON::BI__builtin_neon_vcagth_f16:
5907       Int = Intrinsic::aarch64_neon_facgt; break;
5908     case NEON::BI__builtin_neon_vcaleh_f16:
5909       Int = Intrinsic::aarch64_neon_facge; std::swap(Ops[0], Ops[1]); break;
5910     case NEON::BI__builtin_neon_vcalth_f16:
5911       Int = Intrinsic::aarch64_neon_facgt; std::swap(Ops[0], Ops[1]); break;
5912     }
5913     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "facg");
5914     return Builder.CreateTrunc(Ops[0], Int16Ty);
5915   }
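       // Scalar f16 <-> fixed-point conversions; the '_n' operand gives the
       // number of fractional bits, e.g. with N == 3 the f16 value is scaled
       // by 2^3 before the saturating conversion to integer.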
5916   case NEON::BI__builtin_neon_vcvth_n_s16_f16:
5917   case NEON::BI__builtin_neon_vcvth_n_u16_f16: {
5918     unsigned Int;
5919     llvm::Type *InTy = Int32Ty;
5920     llvm::Type *FTy = HalfTy;
5921     llvm::Type *Tys[2] = {InTy, FTy};
5922     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5923     switch (BuiltinID) {
5924     default: llvm_unreachable("missing builtin ID in switch!");
5925     case NEON::BI__builtin_neon_vcvth_n_s16_f16:
5926       Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break;
5927     case NEON::BI__builtin_neon_vcvth_n_u16_f16:
5928       Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break;
5929     }
5930     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
5931     return Builder.CreateTrunc(Ops[0], Int16Ty);
5932   }
5933   case NEON::BI__builtin_neon_vcvth_n_f16_s16:
5934   case NEON::BI__builtin_neon_vcvth_n_f16_u16: {
5935     unsigned Int;
5936     llvm::Type *FTy = HalfTy;
5937     llvm::Type *InTy = Int32Ty;
5938     llvm::Type *Tys[2] = {FTy, InTy};
5939     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5940     switch (BuiltinID) {
5941     default: llvm_unreachable("missing builtin ID in switch!");
5942     case NEON::BI__builtin_neon_vcvth_n_f16_s16:
5943       Int = Intrinsic::aarch64_neon_vcvtfxs2fp;
5944       Ops[0] = Builder.CreateSExt(Ops[0], InTy, "sext");
5945       break;
5946     case NEON::BI__builtin_neon_vcvth_n_f16_u16:
5947       Int = Intrinsic::aarch64_neon_vcvtfxu2fp;
5948       Ops[0] = Builder.CreateZExt(Ops[0], InTy);
5949       break;
5950     }
5951     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
5952   }
5953   case NEON::BI__builtin_neon_vpaddd_s64: {
5954     auto *Ty = llvm::FixedVectorType::get(Int64Ty, 2);
5955     Value *Vec = EmitScalarExpr(E->getArg(0));
5956     // The vector is v2i64, so make sure it's bitcast to that.
5957     Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
5958     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
5959     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
5960     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
5961     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
5962     // Pairwise addition of a v2i64 into a scalar i64.
5963     return Builder.CreateAdd(Op0, Op1, "vpaddd");
5964   }
5965   case NEON::BI__builtin_neon_vpaddd_f64: {
5966     auto *Ty = llvm::FixedVectorType::get(DoubleTy, 2);
5967     Value *Vec = EmitScalarExpr(E->getArg(0));
5968     // The vector is v2f64, so make sure it's bitcast to that.
5969     Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
5970     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
5971     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
5972     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
5973     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
5974     // Pairwise addition of a v2f64 into a scalar f64.
5975     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
5976   }
5977   case NEON::BI__builtin_neon_vpadds_f32: {
5978     auto *Ty = llvm::FixedVectorType::get(FloatTy, 2);
5979     Value *Vec = EmitScalarExpr(E->getArg(0));
5980     // The vector is v2f32, so make sure it's bitcast to that.
5981     Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
5982     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
5983     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
5984     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
5985     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
5986     // Pairwise addition of a v2f32 into a scalar f32.
5987     return Builder.CreateFAdd(Op0, Op1, "vpadds");
5988   }
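       // Scalar compare-against-zero builtins. Integer forms use signed
       // predicates, FP forms use ordered predicates, and the i1 result is
       // sign-extended into an all-ones/all-zeros mask.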
5989   case NEON::BI__builtin_neon_vceqzd_s64:
5990     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5991     return EmitAArch64CompareBuiltinExpr(
5992         Ops[0], ConvertType(E->getCallReturnType(getContext())),
5993         ICmpInst::ICMP_EQ, "vceqz");
5994   case NEON::BI__builtin_neon_vceqzd_f64:
5995   case NEON::BI__builtin_neon_vceqzs_f32:
5996   case NEON::BI__builtin_neon_vceqzh_f16:
5997     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5998     return EmitAArch64CompareBuiltinExpr(
5999         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6000         ICmpInst::FCMP_OEQ, "vceqz");
6001   case NEON::BI__builtin_neon_vcgezd_s64:
6002     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6003     return EmitAArch64CompareBuiltinExpr(
6004         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6005         ICmpInst::ICMP_SGE, "vcgez");
6006   case NEON::BI__builtin_neon_vcgezd_f64:
6007   case NEON::BI__builtin_neon_vcgezs_f32:
6008   case NEON::BI__builtin_neon_vcgezh_f16:
6009     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6010     return EmitAArch64CompareBuiltinExpr(
6011         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6012         ICmpInst::FCMP_OGE, "vcgez");
6013   case NEON::BI__builtin_neon_vclezd_s64:
6014     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6015     return EmitAArch64CompareBuiltinExpr(
6016         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6017         ICmpInst::ICMP_SLE, "vclez");
6018   case NEON::BI__builtin_neon_vclezd_f64:
6019   case NEON::BI__builtin_neon_vclezs_f32:
6020   case NEON::BI__builtin_neon_vclezh_f16:
6021     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6022     return EmitAArch64CompareBuiltinExpr(
6023         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6024         ICmpInst::FCMP_OLE, "vclez");
6025   case NEON::BI__builtin_neon_vcgtzd_s64:
6026     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6027     return EmitAArch64CompareBuiltinExpr(
6028         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6029         ICmpInst::ICMP_SGT, "vcgtz");
6030   case NEON::BI__builtin_neon_vcgtzd_f64:
6031   case NEON::BI__builtin_neon_vcgtzs_f32:
6032   case NEON::BI__builtin_neon_vcgtzh_f16:
6033     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6034     return EmitAArch64CompareBuiltinExpr(
6035         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6036         ICmpInst::FCMP_OGT, "vcgtz");
6037   case NEON::BI__builtin_neon_vcltzd_s64:
6038     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6039     return EmitAArch64CompareBuiltinExpr(
6040         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6041         ICmpInst::ICMP_SLT, "vcltz");
6042 
6043   case NEON::BI__builtin_neon_vcltzd_f64:
6044   case NEON::BI__builtin_neon_vcltzs_f32:
6045   case NEON::BI__builtin_neon_vcltzh_f16:
6046     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6047     return EmitAArch64CompareBuiltinExpr(
6048         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6049         ICmpInst::FCMP_OLT, "vcltz");
6050 
6051   case NEON::BI__builtin_neon_vceqzd_u64: {
6052     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6053     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
6054     Ops[0] =
6055         Builder.CreateICmpEQ(Ops[0], llvm::Constant::getNullValue(Int64Ty));
6056     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqzd");
6057   }
6058   case NEON::BI__builtin_neon_vceqd_f64:
6059   case NEON::BI__builtin_neon_vcled_f64:
6060   case NEON::BI__builtin_neon_vcltd_f64:
6061   case NEON::BI__builtin_neon_vcged_f64:
6062   case NEON::BI__builtin_neon_vcgtd_f64: {
6063     llvm::CmpInst::Predicate P;
6064     switch (BuiltinID) {
6065     default: llvm_unreachable("missing builtin ID in switch!");
6066     case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
6067     case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
6068     case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
6069     case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
6070     case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
6071     }
6072     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6073     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
6074     Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
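         // Equality uses a quiet compare; the ordering predicates use a
         // signaling compare (FCmpS), since IEEE ordered <, <=, >, >= raise
         // Invalid on any NaN operand while == does so only for sNaNs.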
6075     if (P == llvm::FCmpInst::FCMP_OEQ)
6076       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
6077     else
6078       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
6079     return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd");
6080   }
6081   case NEON::BI__builtin_neon_vceqs_f32:
6082   case NEON::BI__builtin_neon_vcles_f32:
6083   case NEON::BI__builtin_neon_vclts_f32:
6084   case NEON::BI__builtin_neon_vcges_f32:
6085   case NEON::BI__builtin_neon_vcgts_f32: {
6086     llvm::CmpInst::Predicate P;
6087     switch (BuiltinID) {
6088     default: llvm_unreachable("missing builtin ID in switch!");
6089     case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
6090     case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
6091     case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
6092     case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
6093     case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
6094     }
6095     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6096     Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
6097     Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy);
6098     if (P == llvm::FCmpInst::FCMP_OEQ)
6099       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
6100     else
6101       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
6102     return Builder.CreateSExt(Ops[0], Int32Ty, "vcmps");
6103   }
6104   case NEON::BI__builtin_neon_vceqh_f16:
6105   case NEON::BI__builtin_neon_vcleh_f16:
6106   case NEON::BI__builtin_neon_vclth_f16:
6107   case NEON::BI__builtin_neon_vcgeh_f16:
6108   case NEON::BI__builtin_neon_vcgth_f16: {
6109     llvm::CmpInst::Predicate P;
6110     switch (BuiltinID) {
6111     default: llvm_unreachable("missing builtin ID in switch!");
6112     case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
6113     case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
6114     case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
6115     case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
6116     case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
6117     }
6118     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6119     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
6120     Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy);
6121     if (P == llvm::FCmpInst::FCMP_OEQ)
6122       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
6123     else
6124       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
6125     return Builder.CreateSExt(Ops[0], Int16Ty, "vcmph");
6126   }
6127   case NEON::BI__builtin_neon_vceqd_s64:
6128   case NEON::BI__builtin_neon_vceqd_u64:
6129   case NEON::BI__builtin_neon_vcgtd_s64:
6130   case NEON::BI__builtin_neon_vcgtd_u64:
6131   case NEON::BI__builtin_neon_vcltd_s64:
6132   case NEON::BI__builtin_neon_vcltd_u64:
6133   case NEON::BI__builtin_neon_vcged_u64:
6134   case NEON::BI__builtin_neon_vcged_s64:
6135   case NEON::BI__builtin_neon_vcled_u64:
6136   case NEON::BI__builtin_neon_vcled_s64: {
6137     llvm::CmpInst::Predicate P;
6138     switch (BuiltinID) {
6139     default: llvm_unreachable("missing builtin ID in switch!");
6140     case NEON::BI__builtin_neon_vceqd_s64:
6141     case NEON::BI__builtin_neon_vceqd_u64: P = llvm::ICmpInst::ICMP_EQ; break;
6142     case NEON::BI__builtin_neon_vcgtd_s64: P = llvm::ICmpInst::ICMP_SGT; break;
6143     case NEON::BI__builtin_neon_vcgtd_u64: P = llvm::ICmpInst::ICMP_UGT; break;
6144     case NEON::BI__builtin_neon_vcltd_s64: P = llvm::ICmpInst::ICMP_SLT; break;
6145     case NEON::BI__builtin_neon_vcltd_u64: P = llvm::ICmpInst::ICMP_ULT; break;
6146     case NEON::BI__builtin_neon_vcged_u64: P = llvm::ICmpInst::ICMP_UGE; break;
6147     case NEON::BI__builtin_neon_vcged_s64: P = llvm::ICmpInst::ICMP_SGE; break;
6148     case NEON::BI__builtin_neon_vcled_u64: P = llvm::ICmpInst::ICMP_ULE; break;
6149     case NEON::BI__builtin_neon_vcled_s64: P = llvm::ICmpInst::ICMP_SLE; break;
6150     }
6151     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6152     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
6153     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
6154     Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]);
6155     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd");
6156   }
6157   case NEON::BI__builtin_neon_vtstd_s64:
6158   case NEON::BI__builtin_neon_vtstd_u64: {
6159     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6160     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
6161     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
6162     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
6163     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
6164                                 llvm::Constant::getNullValue(Int64Ty));
6165     return Builder.CreateSExt(Ops[0], Int64Ty, "vtstd");
6166   }
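       // vset_lane family: insert the scalar operand into the given lane of
       // the vector operand, e.g. vset_lane_i32(s, v, 1) returns v with lane
       // 1 replaced by s.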
6167   case NEON::BI__builtin_neon_vset_lane_i8:
6168   case NEON::BI__builtin_neon_vset_lane_i16:
6169   case NEON::BI__builtin_neon_vset_lane_i32:
6170   case NEON::BI__builtin_neon_vset_lane_i64:
6171   case NEON::BI__builtin_neon_vset_lane_bf16:
6172   case NEON::BI__builtin_neon_vset_lane_f32:
6173   case NEON::BI__builtin_neon_vsetq_lane_i8:
6174   case NEON::BI__builtin_neon_vsetq_lane_i16:
6175   case NEON::BI__builtin_neon_vsetq_lane_i32:
6176   case NEON::BI__builtin_neon_vsetq_lane_i64:
6177   case NEON::BI__builtin_neon_vsetq_lane_bf16:
6178   case NEON::BI__builtin_neon_vsetq_lane_f32:
6179     Ops.push_back(EmitScalarExpr(E->getArg(2)));
6180     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
6181   case NEON::BI__builtin_neon_vset_lane_f64:
6182     // The vector type needs a cast for the v1f64 variant.
6183     Ops[1] =
6184         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 1));
6185     Ops.push_back(EmitScalarExpr(E->getArg(2)));
6186     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
6187   case NEON::BI__builtin_neon_vset_lane_mf8:
6188   case NEON::BI__builtin_neon_vsetq_lane_mf8:
6189     Ops.push_back(EmitScalarExpr(E->getArg(2)));
6190     // The mfloat8 operand needs a bitcast to the i8 scalar type before
6191     // the insert.
6191     Ops[0] =
6192         Builder.CreateBitCast(Ops[0], llvm::Type::getInt8Ty(getLLVMContext()));
6193     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
6194   case NEON::BI__builtin_neon_vsetq_lane_f64:
6195     // The vector type needs a cast for the v2f64 variant.
6196     Ops[1] =
6197         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 2));
6198     Ops.push_back(EmitScalarExpr(E->getArg(2)));
6199     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
6200 
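       // vget_lane / vdup*-to-scalar family: bitcast the input to the
       // concrete vector type, then extract the requested lane.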
6201   case NEON::BI__builtin_neon_vget_lane_i8:
6202   case NEON::BI__builtin_neon_vdupb_lane_i8:
6203     Ops[0] =
6204         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 8));
6205     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6206                                         "vget_lane");
6207   case NEON::BI__builtin_neon_vgetq_lane_i8:
6208   case NEON::BI__builtin_neon_vdupb_laneq_i8:
6209     Ops[0] =
6210         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 16));
6211     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6212                                         "vgetq_lane");
6213   case NEON::BI__builtin_neon_vget_lane_mf8:
6214   case NEON::BI__builtin_neon_vdupb_lane_mf8:
6215   case NEON::BI__builtin_neon_vgetq_lane_mf8:
6216   case NEON::BI__builtin_neon_vdupb_laneq_mf8:
6217     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6218                                         "vget_lane");
6219   case NEON::BI__builtin_neon_vget_lane_i16:
6220   case NEON::BI__builtin_neon_vduph_lane_i16:
6221     Ops[0] =
6222         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 4));
6223     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6224                                         "vget_lane");
6225   case NEON::BI__builtin_neon_vgetq_lane_i16:
6226   case NEON::BI__builtin_neon_vduph_laneq_i16:
6227     Ops[0] =
6228         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 8));
6229     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6230                                         "vgetq_lane");
6231   case NEON::BI__builtin_neon_vget_lane_i32:
6232   case NEON::BI__builtin_neon_vdups_lane_i32:
6233     Ops[0] =
6234         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 2));
6235     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6236                                         "vget_lane");
6237   case NEON::BI__builtin_neon_vdups_lane_f32:
6238     Ops[0] =
6239         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
6240     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6241                                         "vdups_lane");
6242   case NEON::BI__builtin_neon_vgetq_lane_i32:
6243   case NEON::BI__builtin_neon_vdups_laneq_i32:
6244     Ops[0] =
6245         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
6246     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6247                                         "vgetq_lane");
6248   case NEON::BI__builtin_neon_vget_lane_i64:
6249   case NEON::BI__builtin_neon_vdupd_lane_i64:
6250     Ops[0] =
6251         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 1));
6252     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6253                                         "vget_lane");
6254   case NEON::BI__builtin_neon_vdupd_lane_f64:
6255     Ops[0] =
6256         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
6257     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6258                                         "vdupd_lane");
6259   case NEON::BI__builtin_neon_vgetq_lane_i64:
6260   case NEON::BI__builtin_neon_vdupd_laneq_i64:
6261     Ops[0] =
6262         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
6263     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6264                                         "vgetq_lane");
6265   case NEON::BI__builtin_neon_vget_lane_f32:
6266     Ops[0] =
6267         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
6268     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6269                                         "vget_lane");
6270   case NEON::BI__builtin_neon_vget_lane_f64:
6271     Ops[0] =
6272         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
6273     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6274                                         "vget_lane");
6275   case NEON::BI__builtin_neon_vgetq_lane_f32:
6276   case NEON::BI__builtin_neon_vdups_laneq_f32:
6277     Ops[0] =
6278         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 4));
6279     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6280                                         "vgetq_lane");
6281   case NEON::BI__builtin_neon_vgetq_lane_f64:
6282   case NEON::BI__builtin_neon_vdupd_laneq_f64:
6283     Ops[0] =
6284         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 2));
6285     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6286                                         "vgetq_lane");
6287   case NEON::BI__builtin_neon_vaddh_f16:
6288     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6289     return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
6290   case NEON::BI__builtin_neon_vsubh_f16:
6291     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6292     return Builder.CreateFSub(Ops[0], Ops[1], "vsubh");
6293   case NEON::BI__builtin_neon_vmulh_f16:
6294     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6295     return Builder.CreateFMul(Ops[0], Ops[1], "vmulh");
6296   case NEON::BI__builtin_neon_vdivh_f16:
6297     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6298     return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
6299   case NEON::BI__builtin_neon_vfmah_f16:
6300     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
6301     return emitCallMaybeConstrainedFPBuiltin(
6302         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
6303         {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
6304   case NEON::BI__builtin_neon_vfmsh_f16: {
6305     Value *Neg = Builder.CreateFNeg(EmitScalarExpr(E->getArg(1)), "vfmsh");
6306 
6307     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
6308     return emitCallMaybeConstrainedFPBuiltin(
6309         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
6310         {Neg, EmitScalarExpr(E->getArg(2)), Ops[0]});
6311   }
6312   case NEON::BI__builtin_neon_vaddd_s64:
6313   case NEON::BI__builtin_neon_vaddd_u64:
6314     return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
6315   case NEON::BI__builtin_neon_vsubd_s64:
6316   case NEON::BI__builtin_neon_vsubd_u64:
6317     return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd");
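       // Scalar saturating doubling multiply accumulate/subtract: widen the
       // i16 operands into vectors, form sqdmull (v4i32), take lane 0, then
       // do a scalar saturating add (vqdmlal) or sub (vqdmlsl) with the
       // accumulator.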
6318   case NEON::BI__builtin_neon_vqdmlalh_s16:
6319   case NEON::BI__builtin_neon_vqdmlslh_s16: {
6320     SmallVector<Value *, 2> ProductOps;
6321     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
6322     ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
6323     auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
6324     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
6325                           ProductOps, "vqdmlXl");
6326     Constant *CI = ConstantInt::get(SizeTy, 0);
6327     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
6328 
6329     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
6330                                         ? Intrinsic::aarch64_neon_sqadd
6331                                         : Intrinsic::aarch64_neon_sqsub;
6332     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl");
6333   }
6334   case NEON::BI__builtin_neon_vqshlud_n_s64: {
6335     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6336     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
6337     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqshlu, Int64Ty),
6338                         Ops, "vqshlu_n");
6339   }
6340   case NEON::BI__builtin_neon_vqshld_n_u64:
6341   case NEON::BI__builtin_neon_vqshld_n_s64: {
6342     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
6343                                    ? Intrinsic::aarch64_neon_uqshl
6344                                    : Intrinsic::aarch64_neon_sqshl;
6345     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6346     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
6347     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vqshl_n");
6348   }
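       // A rounding shift right by N is emitted as the rounding shift left
       // intrinsic ([su]rshl) with a negated shift amount.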
6349   case NEON::BI__builtin_neon_vrshrd_n_u64:
6350   case NEON::BI__builtin_neon_vrshrd_n_s64: {
6351     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
6352                                    ? Intrinsic::aarch64_neon_urshl
6353                                    : Intrinsic::aarch64_neon_srshl;
6354     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6355     int SV = cast<ConstantInt>(Ops[1])->getSExtValue();
6356     Ops[1] = ConstantInt::get(Int64Ty, -SV);
6357     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vrshr_n");
6358   }
6359   case NEON::BI__builtin_neon_vrsrad_n_u64:
6360   case NEON::BI__builtin_neon_vrsrad_n_s64: {
6361     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
6362                                    ? Intrinsic::aarch64_neon_urshl
6363                                    : Intrinsic::aarch64_neon_srshl;
6364     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
6365     Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2))));
6366     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Int64Ty),
6367                                 {Ops[1], Builder.CreateSExt(Ops[2], Int64Ty)});
6368     return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty));
6369   }
6370   case NEON::BI__builtin_neon_vshld_n_s64:
6371   case NEON::BI__builtin_neon_vshld_n_u64: {
6372     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
6373     return Builder.CreateShl(
6374         Ops[0], ConstantInt::get(Int64Ty, Amt->getZExtValue()), "shld_n");
6375   }
6376   case NEON::BI__builtin_neon_vshrd_n_s64: {
6377     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
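         // An ashr by 64 is poison in LLVM IR, while the ACLE defines the
         // shift to replicate the sign bit; clamping the amount to 63 gives
         // that behavior.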
6378     return Builder.CreateAShr(
6379         Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
6380                                                    Amt->getZExtValue())),
6381         "shrd_n");
6382   }
6383   case NEON::BI__builtin_neon_vshrd_n_u64: {
6384     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
6385     uint64_t ShiftAmt = Amt->getZExtValue();
6386     // Right-shifting an unsigned value by its size yields 0.
6387     if (ShiftAmt == 64)
6388       return ConstantInt::get(Int64Ty, 0);
6389     return Builder.CreateLShr(Ops[0], ConstantInt::get(Int64Ty, ShiftAmt),
6390                               "shrd_n");
6391   }
6392   case NEON::BI__builtin_neon_vsrad_n_s64: {
6393     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
6394     Ops[1] = Builder.CreateAShr(
6395         Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
6396                                                    Amt->getZExtValue())),
6397         "shrd_n");
6398     return Builder.CreateAdd(Ops[0], Ops[1]);
6399   }
6400   case NEON::BI__builtin_neon_vsrad_n_u64: {
6401     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
6402     uint64_t ShiftAmt = Amt->getZExtValue();
6403     // Right-shifting an unsigned value by its size yields 0.
6404     // As Op + 0 = Op, return Ops[0] directly.
6405     if (ShiftAmt == 64)
6406       return Ops[0];
6407     Ops[1] = Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, ShiftAmt),
6408                                 "shrd_n");
6409     return Builder.CreateAdd(Ops[0], Ops[1]);
6410   }
6411   case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
6412   case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
6413   case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
6414   case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
6415     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
6416                                           "lane");
6417     SmallVector<Value *, 2> ProductOps;
6418     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
6419     ProductOps.push_back(vectorWrapScalar16(Ops[2]));
6420     auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
6421     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
6422                           ProductOps, "vqdmlXl");
6423     Constant *CI = ConstantInt::get(SizeTy, 0);
6424     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
6425     Ops.pop_back();
6426 
6427     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
6428                        BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
6429                           ? Intrinsic::aarch64_neon_sqadd
6430                           : Intrinsic::aarch64_neon_sqsub;
6431     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl");
6432   }
6433   case NEON::BI__builtin_neon_vqdmlals_s32:
6434   case NEON::BI__builtin_neon_vqdmlsls_s32: {
6435     SmallVector<Value *, 2> ProductOps;
6436     ProductOps.push_back(Ops[1]);
6437     ProductOps.push_back(EmitScalarExpr(E->getArg(2)));
6438     Ops[1] =
6439         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
6440                      ProductOps, "vqdmlXl");
6441 
6442     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
6443                                         ? Intrinsic::aarch64_neon_sqadd
6444                                         : Intrinsic::aarch64_neon_sqsub;
6445     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl");
6446   }
6447   case NEON::BI__builtin_neon_vqdmlals_lane_s32:
6448   case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
6449   case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
6450   case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
6451     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
6452                                           "lane");
6453     SmallVector<Value *, 2> ProductOps;
6454     ProductOps.push_back(Ops[1]);
6455     ProductOps.push_back(Ops[2]);
6456     Ops[1] =
6457         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
6458                      ProductOps, "vqdmlXl");
6459     Ops.pop_back();
6460 
6461     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
6462                        BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
6463                           ? Intrinsic::aarch64_neon_sqadd
6464                           : Intrinsic::aarch64_neon_sqsub;
6465     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl");
6466   }
6467   case NEON::BI__builtin_neon_vget_lane_bf16:
6468   case NEON::BI__builtin_neon_vduph_lane_bf16:
6469   case NEON::BI__builtin_neon_vduph_lane_f16: {
6470     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6471                                         "vget_lane");
6472   }
6473   case NEON::BI__builtin_neon_vgetq_lane_bf16:
6474   case NEON::BI__builtin_neon_vduph_laneq_bf16:
6475   case NEON::BI__builtin_neon_vduph_laneq_f16: {
6476     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6477                                         "vgetq_lane");
6478   }
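       // f32 -> bf16 conversions lower to fptrunc. The 'low' variant zeroes
       // the high half of the v8bf16 result; the 'high' variant keeps the low
       // half of the existing vector and writes the truncated values into the
       // high half.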
6479   case NEON::BI__builtin_neon_vcvt_bf16_f32: {
6480     llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
6481     llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
6482     return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
6483   }
6484   case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
6485     SmallVector<int, 16> ConcatMask(8);
6486     std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
6487     llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
6488     llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
6489     llvm::Value *Trunc =
6490         Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
6491     return Builder.CreateShuffleVector(
6492         Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
6493   }
6494   case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
6495     SmallVector<int, 16> ConcatMask(8);
6496     std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
6497     SmallVector<int, 16> LoMask(4);
6498     std::iota(LoMask.begin(), LoMask.end(), 0);
6499     llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
6500     llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
6501     llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8);
6502     llvm::Value *Inactive = Builder.CreateShuffleVector(
6503         Builder.CreateBitCast(Ops[0], V8BF16), LoMask);
6504     llvm::Value *Trunc =
6505         Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16);
6506     return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
6507   }
6508 
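       // MSVC's _InterlockedAdd returns the new value, but atomicrmw add
       // yields the value the memory held before the operation, hence the
       // extra add of Val below.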
6509   case clang::AArch64::BI_InterlockedAdd:
6510   case clang::AArch64::BI_InterlockedAdd_acq:
6511   case clang::AArch64::BI_InterlockedAdd_rel:
6512   case clang::AArch64::BI_InterlockedAdd_nf:
6513   case clang::AArch64::BI_InterlockedAdd64:
6514   case clang::AArch64::BI_InterlockedAdd64_acq:
6515   case clang::AArch64::BI_InterlockedAdd64_rel:
6516   case clang::AArch64::BI_InterlockedAdd64_nf: {
6517     Address DestAddr = CheckAtomicAlignment(*this, E);
6518     Value *Val = EmitScalarExpr(E->getArg(1));
6519     llvm::AtomicOrdering Ordering;
6520     switch (BuiltinID) {
6521     case clang::AArch64::BI_InterlockedAdd:
6522     case clang::AArch64::BI_InterlockedAdd64:
6523       Ordering = llvm::AtomicOrdering::SequentiallyConsistent;
6524       break;
6525     case clang::AArch64::BI_InterlockedAdd_acq:
6526     case clang::AArch64::BI_InterlockedAdd64_acq:
6527       Ordering = llvm::AtomicOrdering::Acquire;
6528       break;
6529     case clang::AArch64::BI_InterlockedAdd_rel:
6530     case clang::AArch64::BI_InterlockedAdd64_rel:
6531       Ordering = llvm::AtomicOrdering::Release;
6532       break;
6533     case clang::AArch64::BI_InterlockedAdd_nf:
6534     case clang::AArch64::BI_InterlockedAdd64_nf:
6535       Ordering = llvm::AtomicOrdering::Monotonic;
6536       break;
6537     default:
6538       llvm_unreachable("missing builtin ID in switch!");
6539     }
6540     AtomicRMWInst *RMWI =
6541         Builder.CreateAtomicRMW(AtomicRMWInst::Add, DestAddr, Val, Ordering);
6542     return Builder.CreateAdd(RMWI, Val);
6543   }
6544   }
6545 
6546   llvm::FixedVectorType *VTy = GetNeonType(this, Type);
6547   llvm::Type *Ty = VTy;
6548   if (!Ty)
6549     return nullptr;
6550 
6551   // Not all intrinsics handled by the common case work for AArch64 yet, so only
6552   // defer to common code if the builtin has been added to our special map.
6553   Builtin = findARMVectorIntrinsicInMap(AArch64SIMDIntrinsicMap, BuiltinID,
6554                                         AArch64SIMDIntrinsicsProvenSorted);
6555 
6556   if (Builtin)
6557     return EmitCommonNeonBuiltinExpr(
6558         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
6559         Builtin->NameHint, Builtin->TypeModifier, E, Ops,
6560         /*never use addresses*/ Address::invalid(), Address::invalid(), Arch);
6561 
6562   if (Value *V = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
6563     return V;
6564 
6565   unsigned Int;
6566   bool ExtractLow = false;
6567   bool ExtendLaneArg = false;
6568   switch (BuiltinID) {
6569   default: return nullptr;
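       // vbsl: bitwise select, (Ops[0] & Ops[1]) | (~Ops[0] & Ops[2]),
       // computed in the integer domain so it also covers FP element types.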
6570   case NEON::BI__builtin_neon_vbsl_v:
6571   case NEON::BI__builtin_neon_vbslq_v: {
6572     llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
6573     Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl");
6574     Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl");
6575     Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl");
6576 
6577     Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl");
6578     Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl");
6579     Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl");
6580     return Builder.CreateBitCast(Ops[0], Ty);
6581   }
6582   case NEON::BI__builtin_neon_vfma_lane_v:
6583   case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
6584     // The ARM builtins (and instructions) have the addend as the first
6585     // operand, but the 'fma' intrinsics have it last. Swap it around here.
6586     Value *Addend = Ops[0];
6587     Value *Multiplicand = Ops[1];
6588     Value *LaneSource = Ops[2];
6589     Ops[0] = Multiplicand;
6590     Ops[1] = LaneSource;
6591     Ops[2] = Addend;
6592 
6593     // Now adjust things to handle the lane access.
6594     auto *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v
6595                          ? llvm::FixedVectorType::get(VTy->getElementType(),
6596                                                       VTy->getNumElements() / 2)
6597                          : VTy;
6598     llvm::Constant *cst = cast<Constant>(Ops[3]);
6599     Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(), cst);
6600     Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
6601     Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");
6602 
6603     Ops.pop_back();
6604     Int = Builder.getIsFPConstrained() ? Intrinsic::experimental_constrained_fma
6605                                        : Intrinsic::fma;
6606     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
6607   }
6608   case NEON::BI__builtin_neon_vfma_laneq_v: {
6609     auto *VTy = cast<llvm::FixedVectorType>(Ty);
6610     // v1f64 fma should be mapped to Neon scalar f64 fma
6611     if (VTy && VTy->getElementType() == DoubleTy) {
6612       Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
6613       Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
6614       llvm::FixedVectorType *VTy =
6615           GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, true));
6616       Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
6617       Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
6618       Value *Result;
6619       Result = emitCallMaybeConstrainedFPBuiltin(
6620           *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma,
6621           DoubleTy, {Ops[1], Ops[2], Ops[0]});
6622       return Builder.CreateBitCast(Result, Ty);
6623     }
6624     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6625     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6626 
6627     auto *STy = llvm::FixedVectorType::get(VTy->getElementType(),
6628                                            VTy->getNumElements() * 2);
6629     Ops[2] = Builder.CreateBitCast(Ops[2], STy);
6630     Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(),
6631                                                cast<ConstantInt>(Ops[3]));
6632     Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
6633 
6634     return emitCallMaybeConstrainedFPBuiltin(
6635         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
6636         {Ops[2], Ops[1], Ops[0]});
6637   }
6638   case NEON::BI__builtin_neon_vfmaq_laneq_v: {
6639     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6640     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6641 
6642     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
6643     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
6644     return emitCallMaybeConstrainedFPBuiltin(
6645         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
6646         {Ops[2], Ops[1], Ops[0]});
6647   }
6648   case NEON::BI__builtin_neon_vfmah_lane_f16:
6649   case NEON::BI__builtin_neon_vfmas_lane_f32:
6650   case NEON::BI__builtin_neon_vfmah_laneq_f16:
6651   case NEON::BI__builtin_neon_vfmas_laneq_f32:
6652   case NEON::BI__builtin_neon_vfmad_lane_f64:
6653   case NEON::BI__builtin_neon_vfmad_laneq_f64: {
6654     Ops.push_back(EmitScalarExpr(E->getArg(3)));
6655     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
6656     Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
6657     return emitCallMaybeConstrainedFPBuiltin(
6658         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
6659         {Ops[1], Ops[2], Ops[0]});
6660   }
6661   case NEON::BI__builtin_neon_vmull_v:
6662     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6663     Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
6664     if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull;
6665     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
6666   case NEON::BI__builtin_neon_vmax_v:
6667   case NEON::BI__builtin_neon_vmaxq_v:
6668     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6669     Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
6670     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
6671     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
6672   case NEON::BI__builtin_neon_vmaxh_f16: {
6673     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6674     Int = Intrinsic::aarch64_neon_fmax;
6675     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
6676   }
6677   case NEON::BI__builtin_neon_vmin_v:
6678   case NEON::BI__builtin_neon_vminq_v:
6679     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6680     Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
6681     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
6682     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
6683   case NEON::BI__builtin_neon_vminh_f16: {
6684     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6685     Int = Intrinsic::aarch64_neon_fmin;
6686     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
6687   }
6688   case NEON::BI__builtin_neon_vabd_v:
6689   case NEON::BI__builtin_neon_vabdq_v:
6690     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6691     Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd;
6692     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd;
6693     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
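       // vpadal: pairwise add and accumulate long, lowered as [su]addlp on
       // the narrow source followed by an ordinary vector add with the
       // accumulator.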
6694   case NEON::BI__builtin_neon_vpadal_v:
6695   case NEON::BI__builtin_neon_vpadalq_v: {
6696     unsigned ArgElts = VTy->getNumElements();
6697     llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
6698     unsigned BitWidth = EltTy->getBitWidth();
6699     auto *ArgTy = llvm::FixedVectorType::get(
6700         llvm::IntegerType::get(getLLVMContext(), BitWidth / 2), 2 * ArgElts);
6701     llvm::Type* Tys[2] = { VTy, ArgTy };
6702     Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
6703     SmallVector<llvm::Value*, 1> TmpOps;
6704     TmpOps.push_back(Ops[1]);
6705     Function *F = CGM.getIntrinsic(Int, Tys);
6706     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal");
6707     llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType());
6708     return Builder.CreateAdd(tmp, addend);
6709   }
6710   case NEON::BI__builtin_neon_vpmin_v:
6711   case NEON::BI__builtin_neon_vpminq_v:
6712     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6713     Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp;
6714     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp;
6715     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
6716   case NEON::BI__builtin_neon_vpmax_v:
6717   case NEON::BI__builtin_neon_vpmaxq_v:
6718     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6719     Int = usgn ? Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp;
6720     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp;
6721     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
6722   case NEON::BI__builtin_neon_vminnm_v:
6723   case NEON::BI__builtin_neon_vminnmq_v:
6724     Int = Intrinsic::aarch64_neon_fminnm;
6725     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
6726   case NEON::BI__builtin_neon_vminnmh_f16:
6727     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6728     Int = Intrinsic::aarch64_neon_fminnm;
6729     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
6730   case NEON::BI__builtin_neon_vmaxnm_v:
6731   case NEON::BI__builtin_neon_vmaxnmq_v:
6732     Int = Intrinsic::aarch64_neon_fmaxnm;
6733     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
6734   case NEON::BI__builtin_neon_vmaxnmh_f16:
6735     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6736     Int = Intrinsic::aarch64_neon_fmaxnm;
6737     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
6738   case NEON::BI__builtin_neon_vrecpss_f32: {
6739     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6740     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
6741                         Ops, "vrecps");
6742   }
6743   case NEON::BI__builtin_neon_vrecpsd_f64:
6744     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6745     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
6746                         Ops, "vrecps");
6747   case NEON::BI__builtin_neon_vrecpsh_f16:
6748     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6749     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
6750                         Ops, "vrecps");
6751   case NEON::BI__builtin_neon_vqshrun_n_v:
6752     Int = Intrinsic::aarch64_neon_sqshrun;
6753     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
6754   case NEON::BI__builtin_neon_vqrshrun_n_v:
6755     Int = Intrinsic::aarch64_neon_sqrshrun;
6756     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
6757   case NEON::BI__builtin_neon_vqshrn_n_v:
6758     Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn;
6759     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
6760   case NEON::BI__builtin_neon_vrshrn_n_v:
6761     Int = Intrinsic::aarch64_neon_rshrn;
6762     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
6763   case NEON::BI__builtin_neon_vqrshrn_n_v:
6764     Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
6765     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
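       // The vrnd* builtins map onto the generic LLVM rounding intrinsics
       // (or their constrained forms under strict FP): vrnda -> round (ties
       // away), vrndi -> nearbyint, vrndm -> floor, vrndn -> roundeven,
       // vrndp -> ceil, vrndx -> rint, vrnd -> trunc.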
6766   case NEON::BI__builtin_neon_vrndah_f16: {
6767     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6768     Int = Builder.getIsFPConstrained()
6769               ? Intrinsic::experimental_constrained_round
6770               : Intrinsic::round;
6771     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
6772   }
6773   case NEON::BI__builtin_neon_vrnda_v:
6774   case NEON::BI__builtin_neon_vrndaq_v: {
6775     Int = Builder.getIsFPConstrained()
6776               ? Intrinsic::experimental_constrained_round
6777               : Intrinsic::round;
6778     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
6779   }
6780   case NEON::BI__builtin_neon_vrndih_f16: {
6781     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6782     Int = Builder.getIsFPConstrained()
6783               ? Intrinsic::experimental_constrained_nearbyint
6784               : Intrinsic::nearbyint;
6785     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
6786   }
6787   case NEON::BI__builtin_neon_vrndmh_f16: {
6788     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6789     Int = Builder.getIsFPConstrained()
6790               ? Intrinsic::experimental_constrained_floor
6791               : Intrinsic::floor;
6792     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
6793   }
6794   case NEON::BI__builtin_neon_vrndm_v:
6795   case NEON::BI__builtin_neon_vrndmq_v: {
6796     Int = Builder.getIsFPConstrained()
6797               ? Intrinsic::experimental_constrained_floor
6798               : Intrinsic::floor;
6799     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
6800   }
6801   case NEON::BI__builtin_neon_vrndnh_f16: {
6802     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6803     Int = Builder.getIsFPConstrained()
6804               ? Intrinsic::experimental_constrained_roundeven
6805               : Intrinsic::roundeven;
6806     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
6807   }
6808   case NEON::BI__builtin_neon_vrndn_v:
6809   case NEON::BI__builtin_neon_vrndnq_v: {
6810     Int = Builder.getIsFPConstrained()
6811               ? Intrinsic::experimental_constrained_roundeven
6812               : Intrinsic::roundeven;
6813     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
6814   }
6815   case NEON::BI__builtin_neon_vrndns_f32: {
6816     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6817     Int = Builder.getIsFPConstrained()
6818               ? Intrinsic::experimental_constrained_roundeven
6819               : Intrinsic::roundeven;
6820     return EmitNeonCall(CGM.getIntrinsic(Int, FloatTy), Ops, "vrndn");
6821   }
6822   case NEON::BI__builtin_neon_vrndph_f16: {
6823     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6824     Int = Builder.getIsFPConstrained()
6825               ? Intrinsic::experimental_constrained_ceil
6826               : Intrinsic::ceil;
6827     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
6828   }
6829   case NEON::BI__builtin_neon_vrndp_v:
6830   case NEON::BI__builtin_neon_vrndpq_v: {
6831     Int = Builder.getIsFPConstrained()
6832               ? Intrinsic::experimental_constrained_ceil
6833               : Intrinsic::ceil;
6834     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
6835   }
6836   case NEON::BI__builtin_neon_vrndxh_f16: {
6837     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6838     Int = Builder.getIsFPConstrained()
6839               ? Intrinsic::experimental_constrained_rint
6840               : Intrinsic::rint;
6841     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
6842   }
6843   case NEON::BI__builtin_neon_vrndx_v:
6844   case NEON::BI__builtin_neon_vrndxq_v: {
6845     Int = Builder.getIsFPConstrained()
6846               ? Intrinsic::experimental_constrained_rint
6847               : Intrinsic::rint;
6848     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
6849   }
6850   case NEON::BI__builtin_neon_vrndh_f16: {
6851     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6852     Int = Builder.getIsFPConstrained()
6853               ? Intrinsic::experimental_constrained_trunc
6854               : Intrinsic::trunc;
6855     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
6856   }
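       // Armv8.5-A FRINT32/FRINT64: round to an integral value representable
       // in a 32-bit or 64-bit integer, using the current rounding mode
       // ('x') or rounding toward zero ('z').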
6857   case NEON::BI__builtin_neon_vrnd32x_f32:
6858   case NEON::BI__builtin_neon_vrnd32xq_f32:
6859   case NEON::BI__builtin_neon_vrnd32x_f64:
6860   case NEON::BI__builtin_neon_vrnd32xq_f64: {
6861     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6862     Int = Intrinsic::aarch64_neon_frint32x;
6863     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32x");
6864   }
6865   case NEON::BI__builtin_neon_vrnd32z_f32:
6866   case NEON::BI__builtin_neon_vrnd32zq_f32:
6867   case NEON::BI__builtin_neon_vrnd32z_f64:
6868   case NEON::BI__builtin_neon_vrnd32zq_f64: {
6869     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6870     Int = Intrinsic::aarch64_neon_frint32z;
6871     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32z");
6872   }
6873   case NEON::BI__builtin_neon_vrnd64x_f32:
6874   case NEON::BI__builtin_neon_vrnd64xq_f32:
6875   case NEON::BI__builtin_neon_vrnd64x_f64:
6876   case NEON::BI__builtin_neon_vrnd64xq_f64: {
6877     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6878     Int = Intrinsic::aarch64_neon_frint64x;
6879     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64x");
6880   }
6881   case NEON::BI__builtin_neon_vrnd64z_f32:
6882   case NEON::BI__builtin_neon_vrnd64zq_f32:
6883   case NEON::BI__builtin_neon_vrnd64z_f64:
6884   case NEON::BI__builtin_neon_vrnd64zq_f64: {
6885     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6886     Int = Intrinsic::aarch64_neon_frint64z;
6887     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64z");
6888   }
6889   case NEON::BI__builtin_neon_vrnd_v:
6890   case NEON::BI__builtin_neon_vrndq_v: {
6891     Int = Builder.getIsFPConstrained()
6892               ? Intrinsic::experimental_constrained_trunc
6893               : Intrinsic::trunc;
6894     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
6895   }
6896   case NEON::BI__builtin_neon_vcvt_f64_v:
6897   case NEON::BI__builtin_neon_vcvtq_f64_v:
6898     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6899     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
6900     return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
6901                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
6902   case NEON::BI__builtin_neon_vcvt_f64_f32: {
6903     assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
6904            "unexpected vcvt_f64_f32 builtin");
6905     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
6906     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
6907 
6908     return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
6909   }
6910   case NEON::BI__builtin_neon_vcvt_f32_f64: {
6911     assert(Type.getEltType() == NeonTypeFlags::Float32 &&
6912            "unexpected vcvt_f32_f64 builtin");
6913     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
6914     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
6915 
6916     return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
6917   }
6918   case NEON::BI__builtin_neon_vcvt_s32_v:
6919   case NEON::BI__builtin_neon_vcvt_u32_v:
6920   case NEON::BI__builtin_neon_vcvt_s64_v:
6921   case NEON::BI__builtin_neon_vcvt_u64_v:
6922   case NEON::BI__builtin_neon_vcvt_s16_f16:
6923   case NEON::BI__builtin_neon_vcvt_u16_f16:
6924   case NEON::BI__builtin_neon_vcvtq_s32_v:
6925   case NEON::BI__builtin_neon_vcvtq_u32_v:
6926   case NEON::BI__builtin_neon_vcvtq_s64_v:
6927   case NEON::BI__builtin_neon_vcvtq_u64_v:
6928   case NEON::BI__builtin_neon_vcvtq_s16_f16:
6929   case NEON::BI__builtin_neon_vcvtq_u16_f16: {
6930     Int =
6931         usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs;
6932     llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)};
6933     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz");
6934   }
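       // Vector FP -> integer conversions with an explicit rounding mode:
       // vcvta ties-away, vcvtm toward -inf, vcvtn ties-to-even, vcvtp
       // toward +inf.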
6935   case NEON::BI__builtin_neon_vcvta_s16_f16:
6936   case NEON::BI__builtin_neon_vcvta_u16_f16:
6937   case NEON::BI__builtin_neon_vcvta_s32_v:
6938   case NEON::BI__builtin_neon_vcvtaq_s16_f16:
6939   case NEON::BI__builtin_neon_vcvtaq_s32_v:
6940   case NEON::BI__builtin_neon_vcvta_u32_v:
6941   case NEON::BI__builtin_neon_vcvtaq_u16_f16:
6942   case NEON::BI__builtin_neon_vcvtaq_u32_v:
6943   case NEON::BI__builtin_neon_vcvta_s64_v:
6944   case NEON::BI__builtin_neon_vcvtaq_s64_v:
6945   case NEON::BI__builtin_neon_vcvta_u64_v:
6946   case NEON::BI__builtin_neon_vcvtaq_u64_v: {
6947     Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas;
6948     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
6949     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
6950   }
6951   case NEON::BI__builtin_neon_vcvtm_s16_f16:
6952   case NEON::BI__builtin_neon_vcvtm_s32_v:
6953   case NEON::BI__builtin_neon_vcvtmq_s16_f16:
6954   case NEON::BI__builtin_neon_vcvtmq_s32_v:
6955   case NEON::BI__builtin_neon_vcvtm_u16_f16:
6956   case NEON::BI__builtin_neon_vcvtm_u32_v:
6957   case NEON::BI__builtin_neon_vcvtmq_u16_f16:
6958   case NEON::BI__builtin_neon_vcvtmq_u32_v:
6959   case NEON::BI__builtin_neon_vcvtm_s64_v:
6960   case NEON::BI__builtin_neon_vcvtmq_s64_v:
6961   case NEON::BI__builtin_neon_vcvtm_u64_v:
6962   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
6963     Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms;
6964     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
6965     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
6966   }
6967   case NEON::BI__builtin_neon_vcvtn_s16_f16:
6968   case NEON::BI__builtin_neon_vcvtn_s32_v:
6969   case NEON::BI__builtin_neon_vcvtnq_s16_f16:
6970   case NEON::BI__builtin_neon_vcvtnq_s32_v:
6971   case NEON::BI__builtin_neon_vcvtn_u16_f16:
6972   case NEON::BI__builtin_neon_vcvtn_u32_v:
6973   case NEON::BI__builtin_neon_vcvtnq_u16_f16:
6974   case NEON::BI__builtin_neon_vcvtnq_u32_v:
6975   case NEON::BI__builtin_neon_vcvtn_s64_v:
6976   case NEON::BI__builtin_neon_vcvtnq_s64_v:
6977   case NEON::BI__builtin_neon_vcvtn_u64_v:
6978   case NEON::BI__builtin_neon_vcvtnq_u64_v: {
6979     Int = usgn ? Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns;
6980     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
6981     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
6982   }
6983   case NEON::BI__builtin_neon_vcvtp_s16_f16:
6984   case NEON::BI__builtin_neon_vcvtp_s32_v:
6985   case NEON::BI__builtin_neon_vcvtpq_s16_f16:
6986   case NEON::BI__builtin_neon_vcvtpq_s32_v:
6987   case NEON::BI__builtin_neon_vcvtp_u16_f16:
6988   case NEON::BI__builtin_neon_vcvtp_u32_v:
6989   case NEON::BI__builtin_neon_vcvtpq_u16_f16:
6990   case NEON::BI__builtin_neon_vcvtpq_u32_v:
6991   case NEON::BI__builtin_neon_vcvtp_s64_v:
6992   case NEON::BI__builtin_neon_vcvtpq_s64_v:
6993   case NEON::BI__builtin_neon_vcvtp_u64_v:
6994   case NEON::BI__builtin_neon_vcvtpq_u64_v: {
6995     Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps;
6996     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
6997     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
6998   }
6999   case NEON::BI__builtin_neon_vmulx_v:
7000   case NEON::BI__builtin_neon_vmulxq_v: {
7001     Int = Intrinsic::aarch64_neon_fmulx;
7002     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
7003   }
7004   case NEON::BI__builtin_neon_vmulxh_lane_f16:
7005   case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
7006     // vmulx_lane should be mapped to Neon scalar mulx after
7007     // extracting the scalar element
7008     Ops.push_back(EmitScalarExpr(E->getArg(2)));
7009     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
7010     Ops.pop_back();
7011     Int = Intrinsic::aarch64_neon_fmulx;
7012     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
7013   }
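  // Roughly, vmulxh_lane_f16 above lowers to IR of the following shape
  // (a sketch; value names are invented for illustration):
  //   %elt = extractelement <4 x half> %b, i32 %lane
  //   %res = call half @llvm.aarch64.neon.fmulx.f16(half %a, half %elt)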
7014   case NEON::BI__builtin_neon_vmul_lane_v:
7015   case NEON::BI__builtin_neon_vmul_laneq_v: {
7016     // v1f64 vmul_lane should be mapped to Neon scalar mul lane
7017     bool Quad = BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v;
7020     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
7021     llvm::FixedVectorType *VTy =
7022         GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
7023     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
7024     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
7025     Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
7026     return Builder.CreateBitCast(Result, Ty);
7027   }
7028   case NEON::BI__builtin_neon_vnegd_s64:
7029     return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
7030   case NEON::BI__builtin_neon_vnegh_f16:
7031     return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
7032   case NEON::BI__builtin_neon_vpmaxnm_v:
7033   case NEON::BI__builtin_neon_vpmaxnmq_v: {
7034     Int = Intrinsic::aarch64_neon_fmaxnmp;
7035     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
7036   }
7037   case NEON::BI__builtin_neon_vpminnm_v:
7038   case NEON::BI__builtin_neon_vpminnmq_v: {
7039     Int = Intrinsic::aarch64_neon_fminnmp;
7040     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
7041   }
7042   case NEON::BI__builtin_neon_vsqrth_f16: {
7043     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7044     Int = Builder.getIsFPConstrained()
7045               ? Intrinsic::experimental_constrained_sqrt
7046               : Intrinsic::sqrt;
7047     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
7048   }
7049   case NEON::BI__builtin_neon_vsqrt_v:
7050   case NEON::BI__builtin_neon_vsqrtq_v: {
7051     Int = Builder.getIsFPConstrained()
7052               ? Intrinsic::experimental_constrained_sqrt
7053               : Intrinsic::sqrt;
7054     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7055     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
7056   }
7057   case NEON::BI__builtin_neon_vrbit_v:
7058   case NEON::BI__builtin_neon_vrbitq_v: {
7059     Int = Intrinsic::bitreverse;
7060     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
7061   }
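  // The across-vector reductions below (vaddv, vmaxv, vminv, ...) share one
  // pattern: the AArch64 intrinsic returns i32 regardless of element width,
  // so the result is truncated back to the element type. Sketch for
  // vaddv_u8 (value names invented):
  //   %sum = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a)
  //   %res = trunc i32 %sum to i8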
7062   case NEON::BI__builtin_neon_vaddv_u8:
7063     // FIXME: These are handled by the AArch64 scalar code.
7064     usgn = true;
7065     [[fallthrough]];
7066   case NEON::BI__builtin_neon_vaddv_s8: {
7067     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
7068     Ty = Int32Ty;
7069     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7070     llvm::Type *Tys[2] = { Ty, VTy };
7071     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7072     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
7073     return Builder.CreateTrunc(Ops[0], Int8Ty);
7074   }
7075   case NEON::BI__builtin_neon_vaddv_u16:
7076     usgn = true;
7077     [[fallthrough]];
7078   case NEON::BI__builtin_neon_vaddv_s16: {
7079     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
7080     Ty = Int32Ty;
7081     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7082     llvm::Type *Tys[2] = { Ty, VTy };
7083     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7084     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
7085     return Builder.CreateTrunc(Ops[0], Int16Ty);
7086   }
7087   case NEON::BI__builtin_neon_vaddvq_u8:
7088     usgn = true;
7089     [[fallthrough]];
7090   case NEON::BI__builtin_neon_vaddvq_s8: {
7091     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
7092     Ty = Int32Ty;
7093     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7094     llvm::Type *Tys[2] = { Ty, VTy };
7095     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7096     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
7097     return Builder.CreateTrunc(Ops[0], Int8Ty);
7098   }
7099   case NEON::BI__builtin_neon_vaddvq_u16:
7100     usgn = true;
7101     [[fallthrough]];
7102   case NEON::BI__builtin_neon_vaddvq_s16: {
7103     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
7104     Ty = Int32Ty;
7105     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7106     llvm::Type *Tys[2] = { Ty, VTy };
7107     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7108     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
7109     return Builder.CreateTrunc(Ops[0], Int16Ty);
7110   }
7111   case NEON::BI__builtin_neon_vmaxv_u8: {
7112     Int = Intrinsic::aarch64_neon_umaxv;
7113     Ty = Int32Ty;
7114     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7115     llvm::Type *Tys[2] = { Ty, VTy };
7116     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7117     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7118     return Builder.CreateTrunc(Ops[0], Int8Ty);
7119   }
7120   case NEON::BI__builtin_neon_vmaxv_u16: {
7121     Int = Intrinsic::aarch64_neon_umaxv;
7122     Ty = Int32Ty;
7123     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7124     llvm::Type *Tys[2] = { Ty, VTy };
7125     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7126     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7127     return Builder.CreateTrunc(Ops[0], Int16Ty);
7128   }
7129   case NEON::BI__builtin_neon_vmaxvq_u8: {
7130     Int = Intrinsic::aarch64_neon_umaxv;
7131     Ty = Int32Ty;
7132     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7133     llvm::Type *Tys[2] = { Ty, VTy };
7134     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7135     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7136     return Builder.CreateTrunc(Ops[0], Int8Ty);
7137   }
7138   case NEON::BI__builtin_neon_vmaxvq_u16: {
7139     Int = Intrinsic::aarch64_neon_umaxv;
7140     Ty = Int32Ty;
7141     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7142     llvm::Type *Tys[2] = { Ty, VTy };
7143     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7144     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7145     return Builder.CreateTrunc(Ops[0], Int16Ty);
7146   }
7147   case NEON::BI__builtin_neon_vmaxv_s8: {
7148     Int = Intrinsic::aarch64_neon_smaxv;
7149     Ty = Int32Ty;
7150     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7151     llvm::Type *Tys[2] = { Ty, VTy };
7152     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7153     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7154     return Builder.CreateTrunc(Ops[0], Int8Ty);
7155   }
7156   case NEON::BI__builtin_neon_vmaxv_s16: {
7157     Int = Intrinsic::aarch64_neon_smaxv;
7158     Ty = Int32Ty;
7159     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7160     llvm::Type *Tys[2] = { Ty, VTy };
7161     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7162     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7163     return Builder.CreateTrunc(Ops[0], Int16Ty);
7164   }
7165   case NEON::BI__builtin_neon_vmaxvq_s8: {
7166     Int = Intrinsic::aarch64_neon_smaxv;
7167     Ty = Int32Ty;
7168     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7169     llvm::Type *Tys[2] = { Ty, VTy };
7170     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7171     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7172     return Builder.CreateTrunc(Ops[0], Int8Ty);
7173   }
7174   case NEON::BI__builtin_neon_vmaxvq_s16: {
7175     Int = Intrinsic::aarch64_neon_smaxv;
7176     Ty = Int32Ty;
7177     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7178     llvm::Type *Tys[2] = { Ty, VTy };
7179     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7180     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7181     return Builder.CreateTrunc(Ops[0], Int16Ty);
7182   }
7183   case NEON::BI__builtin_neon_vmaxv_f16: {
7184     Int = Intrinsic::aarch64_neon_fmaxv;
7185     Ty = HalfTy;
7186     VTy = llvm::FixedVectorType::get(HalfTy, 4);
7187     llvm::Type *Tys[2] = { Ty, VTy };
7188     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7189     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7190     return Builder.CreateTrunc(Ops[0], HalfTy);
7191   }
7192   case NEON::BI__builtin_neon_vmaxvq_f16: {
7193     Int = Intrinsic::aarch64_neon_fmaxv;
7194     Ty = HalfTy;
7195     VTy = llvm::FixedVectorType::get(HalfTy, 8);
7196     llvm::Type *Tys[2] = { Ty, VTy };
7197     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7198     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7199     return Builder.CreateTrunc(Ops[0], HalfTy);
7200   }
7201   case NEON::BI__builtin_neon_vminv_u8: {
7202     Int = Intrinsic::aarch64_neon_uminv;
7203     Ty = Int32Ty;
7204     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7205     llvm::Type *Tys[2] = { Ty, VTy };
7206     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7207     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7208     return Builder.CreateTrunc(Ops[0], Int8Ty);
7209   }
7210   case NEON::BI__builtin_neon_vminv_u16: {
7211     Int = Intrinsic::aarch64_neon_uminv;
7212     Ty = Int32Ty;
7213     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7214     llvm::Type *Tys[2] = { Ty, VTy };
7215     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7216     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7217     return Builder.CreateTrunc(Ops[0], Int16Ty);
7218   }
7219   case NEON::BI__builtin_neon_vminvq_u8: {
7220     Int = Intrinsic::aarch64_neon_uminv;
7221     Ty = Int32Ty;
7222     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7223     llvm::Type *Tys[2] = { Ty, VTy };
7224     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7225     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7226     return Builder.CreateTrunc(Ops[0], Int8Ty);
7227   }
7228   case NEON::BI__builtin_neon_vminvq_u16: {
7229     Int = Intrinsic::aarch64_neon_uminv;
7230     Ty = Int32Ty;
7231     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7232     llvm::Type *Tys[2] = { Ty, VTy };
7233     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7234     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7235     return Builder.CreateTrunc(Ops[0], Int16Ty);
7236   }
7237   case NEON::BI__builtin_neon_vminv_s8: {
7238     Int = Intrinsic::aarch64_neon_sminv;
7239     Ty = Int32Ty;
7240     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7241     llvm::Type *Tys[2] = { Ty, VTy };
7242     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7243     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7244     return Builder.CreateTrunc(Ops[0], Int8Ty);
7245   }
7246   case NEON::BI__builtin_neon_vminv_s16: {
7247     Int = Intrinsic::aarch64_neon_sminv;
7248     Ty = Int32Ty;
7249     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7250     llvm::Type *Tys[2] = { Ty, VTy };
7251     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7252     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7253     return Builder.CreateTrunc(Ops[0], Int16Ty);
7254   }
7255   case NEON::BI__builtin_neon_vminvq_s8: {
7256     Int = Intrinsic::aarch64_neon_sminv;
7257     Ty = Int32Ty;
7258     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7259     llvm::Type *Tys[2] = { Ty, VTy };
7260     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7261     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7262     return Builder.CreateTrunc(Ops[0], Int8Ty);
7263   }
7264   case NEON::BI__builtin_neon_vminvq_s16: {
7265     Int = Intrinsic::aarch64_neon_sminv;
7266     Ty = Int32Ty;
7267     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7268     llvm::Type *Tys[2] = { Ty, VTy };
7269     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7270     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7271     return Builder.CreateTrunc(Ops[0], Int16Ty);
7272   }
7273   case NEON::BI__builtin_neon_vminv_f16: {
7274     Int = Intrinsic::aarch64_neon_fminv;
7275     Ty = HalfTy;
7276     VTy = llvm::FixedVectorType::get(HalfTy, 4);
7277     llvm::Type *Tys[2] = { Ty, VTy };
7278     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7279     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7280     return Builder.CreateTrunc(Ops[0], HalfTy);
7281   }
7282   case NEON::BI__builtin_neon_vminvq_f16: {
7283     Int = Intrinsic::aarch64_neon_fminv;
7284     Ty = HalfTy;
7285     VTy = llvm::FixedVectorType::get(HalfTy, 8);
7286     llvm::Type *Tys[2] = { Ty, VTy };
7287     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7288     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7289     return Builder.CreateTrunc(Ops[0], HalfTy);
7290   }
7291   case NEON::BI__builtin_neon_vmaxnmv_f16: {
7292     Int = Intrinsic::aarch64_neon_fmaxnmv;
7293     Ty = HalfTy;
7294     VTy = llvm::FixedVectorType::get(HalfTy, 4);
7295     llvm::Type *Tys[2] = { Ty, VTy };
7296     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7297     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
7298     return Builder.CreateTrunc(Ops[0], HalfTy);
7299   }
7300   case NEON::BI__builtin_neon_vmaxnmvq_f16: {
7301     Int = Intrinsic::aarch64_neon_fmaxnmv;
7302     Ty = HalfTy;
7303     VTy = llvm::FixedVectorType::get(HalfTy, 8);
7304     llvm::Type *Tys[2] = { Ty, VTy };
7305     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7306     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
7307     return Builder.CreateTrunc(Ops[0], HalfTy);
7308   }
7309   case NEON::BI__builtin_neon_vminnmv_f16: {
7310     Int = Intrinsic::aarch64_neon_fminnmv;
7311     Ty = HalfTy;
7312     VTy = llvm::FixedVectorType::get(HalfTy, 4);
7313     llvm::Type *Tys[2] = { Ty, VTy };
7314     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7315     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
7316     return Builder.CreateTrunc(Ops[0], HalfTy);
7317   }
7318   case NEON::BI__builtin_neon_vminnmvq_f16: {
7319     Int = Intrinsic::aarch64_neon_fminnmv;
7320     Ty = HalfTy;
7321     VTy = llvm::FixedVectorType::get(HalfTy, 8);
7322     llvm::Type *Tys[2] = { Ty, VTy };
7323     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7324     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
7325     return Builder.CreateTrunc(Ops[0], HalfTy);
7326   }
7327   case NEON::BI__builtin_neon_vmul_n_f64: {
7328     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
7329     Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
7330     return Builder.CreateFMul(Ops[0], RHS);
7331   }
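  // vaddlv is the widening ("long") add across vector: summing u8/s8 lanes
  // yields a 16-bit result and u16/s16 lanes a 32-bit result, which is why
  // the 8-bit cases below truncate the i32 intrinsic result to i16 while
  // the 16-bit cases return it directly.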
7332   case NEON::BI__builtin_neon_vaddlv_u8: {
7333     Int = Intrinsic::aarch64_neon_uaddlv;
7334     Ty = Int32Ty;
7335     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7336     llvm::Type *Tys[2] = { Ty, VTy };
7337     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7338     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
7339     return Builder.CreateTrunc(Ops[0], Int16Ty);
7340   }
7341   case NEON::BI__builtin_neon_vaddlv_u16: {
7342     Int = Intrinsic::aarch64_neon_uaddlv;
7343     Ty = Int32Ty;
7344     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7345     llvm::Type *Tys[2] = { Ty, VTy };
7346     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7347     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
7348   }
7349   case NEON::BI__builtin_neon_vaddlvq_u8: {
7350     Int = Intrinsic::aarch64_neon_uaddlv;
7351     Ty = Int32Ty;
7352     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7353     llvm::Type *Tys[2] = { Ty, VTy };
7354     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7355     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
7356     return Builder.CreateTrunc(Ops[0], Int16Ty);
7357   }
7358   case NEON::BI__builtin_neon_vaddlvq_u16: {
7359     Int = Intrinsic::aarch64_neon_uaddlv;
7360     Ty = Int32Ty;
7361     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7362     llvm::Type *Tys[2] = { Ty, VTy };
7363     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7364     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
7365   }
7366   case NEON::BI__builtin_neon_vaddlv_s8: {
7367     Int = Intrinsic::aarch64_neon_saddlv;
7368     Ty = Int32Ty;
7369     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7370     llvm::Type *Tys[2] = { Ty, VTy };
7371     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7372     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
7373     return Builder.CreateTrunc(Ops[0], Int16Ty);
7374   }
7375   case NEON::BI__builtin_neon_vaddlv_s16: {
7376     Int = Intrinsic::aarch64_neon_saddlv;
7377     Ty = Int32Ty;
7378     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7379     llvm::Type *Tys[2] = { Ty, VTy };
7380     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7381     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
7382   }
7383   case NEON::BI__builtin_neon_vaddlvq_s8: {
7384     Int = Intrinsic::aarch64_neon_saddlv;
7385     Ty = Int32Ty;
7386     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7387     llvm::Type *Tys[2] = { Ty, VTy };
7388     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7389     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
7390     return Builder.CreateTrunc(Ops[0], Int16Ty);
7391   }
7392   case NEON::BI__builtin_neon_vaddlvq_s16: {
7393     Int = Intrinsic::aarch64_neon_saddlv;
7394     Ty = Int32Ty;
7395     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7396     llvm::Type *Tys[2] = { Ty, VTy };
7397     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7398     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
7399   }
7400   case NEON::BI__builtin_neon_vsri_n_v:
7401   case NEON::BI__builtin_neon_vsriq_n_v: {
7402     Int = Intrinsic::aarch64_neon_vsri;
7403     llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
7404     return EmitNeonCall(Intrin, Ops, "vsri_n");
7405   }
7406   case NEON::BI__builtin_neon_vsli_n_v:
7407   case NEON::BI__builtin_neon_vsliq_n_v: {
7408     Int = Intrinsic::aarch64_neon_vsli;
7409     llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
7410     return EmitNeonCall(Intrin, Ops, "vsli_n");
7411   }
7412   case NEON::BI__builtin_neon_vsra_n_v:
7413   case NEON::BI__builtin_neon_vsraq_n_v:
7414     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7415     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
7416     return Builder.CreateAdd(Ops[0], Ops[1]);
7417   case NEON::BI__builtin_neon_vrsra_n_v:
7418   case NEON::BI__builtin_neon_vrsraq_n_v: {
7419     Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl;
7420     SmallVector<llvm::Value *, 2> TmpOps;
7421     TmpOps.push_back(Ops[1]);
7422     TmpOps.push_back(Ops[2]);
7423     Function *F = CGM.getIntrinsic(Int, Ty);
7424     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
7425     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
7426     return Builder.CreateAdd(Ops[0], tmp);
7427   }
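  // In the vrsra_n case above, the rounding right shift is expressed via the
  // rounding left-shift intrinsic with a negated immediate (EmitNeonCall's
  // rightshift flag negates the splatted amount), then added into the
  // accumulator; roughly, vrsra_n(a, b, n) == a + srshl/urshl(b, -n).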
7428   case NEON::BI__builtin_neon_vld1_v:
7429   case NEON::BI__builtin_neon_vld1q_v: {
7430     return Builder.CreateAlignedLoad(VTy, Ops[0], PtrOp0.getAlignment());
7431   }
7432   case NEON::BI__builtin_neon_vst1_v:
7433   case NEON::BI__builtin_neon_vst1q_v:
7434     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
7435     return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
7436   case NEON::BI__builtin_neon_vld1_lane_v:
7437   case NEON::BI__builtin_neon_vld1q_lane_v: {
7438     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7439     Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
7440                                        PtrOp0.getAlignment());
7441     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
7442   }
7443   case NEON::BI__builtin_neon_vldap1_lane_s64:
7444   case NEON::BI__builtin_neon_vldap1q_lane_s64: {
7445     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7446     llvm::LoadInst *LI = Builder.CreateAlignedLoad(
7447         VTy->getElementType(), Ops[0], PtrOp0.getAlignment());
7448     LI->setAtomic(llvm::AtomicOrdering::Acquire);
7449     Ops[0] = LI;
7450     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vldap1_lane");
7451   }
7452   case NEON::BI__builtin_neon_vld1_dup_v:
7453   case NEON::BI__builtin_neon_vld1q_dup_v: {
7454     Value *V = PoisonValue::get(Ty);
7455     Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
7456                                        PtrOp0.getAlignment());
7457     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
7458     Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
7459     return EmitNeonSplat(Ops[0], CI);
7460   }
7461   case NEON::BI__builtin_neon_vst1_lane_v:
7462   case NEON::BI__builtin_neon_vst1q_lane_v:
7463     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7464     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
7465     return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
7466   case NEON::BI__builtin_neon_vstl1_lane_s64:
7467   case NEON::BI__builtin_neon_vstl1q_lane_s64: {
7468     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7469     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
7470     llvm::StoreInst *SI =
7471         Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
7472     SI->setAtomic(llvm::AtomicOrdering::Release);
7473     return SI;
7474   }
7475   case NEON::BI__builtin_neon_vld2_v:
7476   case NEON::BI__builtin_neon_vld2q_v: {
7477     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
7478     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys);
7479     Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
7480     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7481   }
7482   case NEON::BI__builtin_neon_vld3_v:
7483   case NEON::BI__builtin_neon_vld3q_v: {
7484     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
7485     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys);
7486     Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
7487     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7488   }
7489   case NEON::BI__builtin_neon_vld4_v:
7490   case NEON::BI__builtin_neon_vld4q_v: {
7491     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
7492     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys);
7493     Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
7494     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7495   }
7496   case NEON::BI__builtin_neon_vld2_dup_v:
7497   case NEON::BI__builtin_neon_vld2q_dup_v: {
7498     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
7499     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys);
7500     Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
7501     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7502   }
7503   case NEON::BI__builtin_neon_vld3_dup_v:
7504   case NEON::BI__builtin_neon_vld3q_dup_v: {
7505     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
7506     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys);
7507     Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
7508     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7509   }
7510   case NEON::BI__builtin_neon_vld4_dup_v:
7511   case NEON::BI__builtin_neon_vld4q_dup_v: {
7512     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
7513     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys);
7514     Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
7515     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7516   }
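  // In the vldN_lane cases below the builtin's operands arrive as
  // (retptr, ptr, vec..., lane); std::rotate moves the pointer to the end
  // so the call matches the intrinsic signature (vec..., lane, ptr), e.g.
  // ld2lane: (<n x ty> %a, <n x ty> %b, i64 %lane, ptr %p).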
7517   case NEON::BI__builtin_neon_vld2_lane_v:
7518   case NEON::BI__builtin_neon_vld2q_lane_v: {
7519     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
7520     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2lane, Tys);
7521     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
7522     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7523     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7524     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
7525     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld2_lane");
7526     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7527   }
7528   case NEON::BI__builtin_neon_vld3_lane_v:
7529   case NEON::BI__builtin_neon_vld3q_lane_v: {
7530     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
7531     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3lane, Tys);
7532     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
7533     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7534     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7535     Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
7536     Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
7537     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld3_lane");
7538     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7539   }
7540   case NEON::BI__builtin_neon_vld4_lane_v:
7541   case NEON::BI__builtin_neon_vld4q_lane_v: {
7542     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
7543     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4lane, Tys);
7544     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
7545     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7546     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7547     Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
7548     Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
7549     Ops[5] = Builder.CreateZExt(Ops[5], Int64Ty);
7550     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld4_lane");
7551     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7552   }
7553   case NEON::BI__builtin_neon_vst2_v:
7554   case NEON::BI__builtin_neon_vst2q_v: {
7555     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
7556     llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
7557     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2, Tys),
7558                         Ops, "");
7559   }
7560   case NEON::BI__builtin_neon_vst2_lane_v:
7561   case NEON::BI__builtin_neon_vst2q_lane_v: {
7562     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
7563     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
7564     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
7565     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2lane, Tys),
7566                         Ops, "");
7567   }
7568   case NEON::BI__builtin_neon_vst3_v:
7569   case NEON::BI__builtin_neon_vst3q_v: {
7570     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
7571     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
7572     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3, Tys),
7573                         Ops, "");
7574   }
7575   case NEON::BI__builtin_neon_vst3_lane_v:
7576   case NEON::BI__builtin_neon_vst3q_lane_v: {
7577     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
7578     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
7579     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
7580     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3lane, Tys),
7581                         Ops, "");
7582   }
7583   case NEON::BI__builtin_neon_vst4_v:
7584   case NEON::BI__builtin_neon_vst4q_v: {
7585     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
7586     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
7587     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4, Tys),
7588                         Ops, "");
7589   }
7590   case NEON::BI__builtin_neon_vst4_lane_v:
7591   case NEON::BI__builtin_neon_vst4q_lane_v: {
7592     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
7593     Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
7594     llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
7595     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4lane, Tys),
7596                         Ops, "");
7597   }
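  // The vtrn/vuzp/vzip cases below each build two shufflevector results and
  // store them through the returned pointer. For a 4-element input the
  // masks come out as (illustrative):
  //   vtrn: {0,4,2,6} and {1,5,3,7}   (transpose even/odd lane pairs)
  //   vuzp: {0,2,4,6} and {1,3,5,7}   (de-interleave even/odd lanes)
  //   vzip: {0,4,1,5} and {2,6,3,7}   (interleave low/high halves)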
7598   case NEON::BI__builtin_neon_vtrn_v:
7599   case NEON::BI__builtin_neon_vtrnq_v: {
7600     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7601     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7602     Value *SV = nullptr;
7603 
7604     for (unsigned vi = 0; vi != 2; ++vi) {
7605       SmallVector<int, 16> Indices;
7606       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
7607         Indices.push_back(i+vi);
7608         Indices.push_back(i+e+vi);
7609       }
7610       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7611       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
7612       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7613     }
7614     return SV;
7615   }
7616   case NEON::BI__builtin_neon_vuzp_v:
7617   case NEON::BI__builtin_neon_vuzpq_v: {
7618     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7619     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7620     Value *SV = nullptr;
7621 
7622     for (unsigned vi = 0; vi != 2; ++vi) {
7623       SmallVector<int, 16> Indices;
7624       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
7625         Indices.push_back(2*i+vi);
7626 
7627       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7628       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
7629       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7630     }
7631     return SV;
7632   }
7633   case NEON::BI__builtin_neon_vzip_v:
7634   case NEON::BI__builtin_neon_vzipq_v: {
7635     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7636     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7637     Value *SV = nullptr;
7638 
7639     for (unsigned vi = 0; vi != 2; ++vi) {
7640       SmallVector<int, 16> Indices;
7641       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
7642         Indices.push_back((i + vi*e) >> 1);
7643         Indices.push_back(((i + vi*e) >> 1)+e);
7644       }
7645       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7646       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
7647       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7648     }
7649     return SV;
7650   }
7651   case NEON::BI__builtin_neon_vqtbl1q_v: {
7652     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl1, Ty),
7653                         Ops, "vtbl1");
7654   }
7655   case NEON::BI__builtin_neon_vqtbl2q_v: {
7656     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl2, Ty),
7657                         Ops, "vtbl2");
7658   }
7659   case NEON::BI__builtin_neon_vqtbl3q_v: {
7660     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl3, Ty),
7661                         Ops, "vtbl3");
7662   }
7663   case NEON::BI__builtin_neon_vqtbl4q_v: {
7664     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl4, Ty),
7665                         Ops, "vtbl4");
7666   }
7667   case NEON::BI__builtin_neon_vqtbx1q_v: {
7668     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx1, Ty),
7669                         Ops, "vtbx1");
7670   }
7671   case NEON::BI__builtin_neon_vqtbx2q_v: {
7672     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx2, Ty),
7673                         Ops, "vtbx2");
7674   }
7675   case NEON::BI__builtin_neon_vqtbx3q_v: {
7676     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx3, Ty),
7677                         Ops, "vtbx3");
7678   }
7679   case NEON::BI__builtin_neon_vqtbx4q_v: {
7680     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx4, Ty),
7681                         Ops, "vtbx4");
7682   }
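  // Note the tbl/tbx distinction above: tbl produces zero for out-of-range
  // table indices, while tbx leaves the corresponding element of the
  // destination operand unchanged.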
7683   case NEON::BI__builtin_neon_vsqadd_v:
7684   case NEON::BI__builtin_neon_vsqaddq_v: {
7685     Int = Intrinsic::aarch64_neon_usqadd;
7686     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
7687   }
7688   case NEON::BI__builtin_neon_vuqadd_v:
7689   case NEON::BI__builtin_neon_vuqaddq_v: {
7690     Int = Intrinsic::aarch64_neon_suqadd;
7691     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
7692   }
7693 
7694   case NEON::BI__builtin_neon_vluti2_laneq_mf8:
7695   case NEON::BI__builtin_neon_vluti2_laneq_bf16:
7696   case NEON::BI__builtin_neon_vluti2_laneq_f16:
7697   case NEON::BI__builtin_neon_vluti2_laneq_p16:
7698   case NEON::BI__builtin_neon_vluti2_laneq_p8:
7699   case NEON::BI__builtin_neon_vluti2_laneq_s16:
7700   case NEON::BI__builtin_neon_vluti2_laneq_s8:
7701   case NEON::BI__builtin_neon_vluti2_laneq_u16:
7702   case NEON::BI__builtin_neon_vluti2_laneq_u8: {
7703     Int = Intrinsic::aarch64_neon_vluti2_laneq;
7704     llvm::Type *Tys[2];
7705     Tys[0] = Ty;
7706     Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7707                                              /*isQuad*/ false));
7708     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq");
7709   }
7710   case NEON::BI__builtin_neon_vluti2q_laneq_mf8:
7711   case NEON::BI__builtin_neon_vluti2q_laneq_bf16:
7712   case NEON::BI__builtin_neon_vluti2q_laneq_f16:
7713   case NEON::BI__builtin_neon_vluti2q_laneq_p16:
7714   case NEON::BI__builtin_neon_vluti2q_laneq_p8:
7715   case NEON::BI__builtin_neon_vluti2q_laneq_s16:
7716   case NEON::BI__builtin_neon_vluti2q_laneq_s8:
7717   case NEON::BI__builtin_neon_vluti2q_laneq_u16:
7718   case NEON::BI__builtin_neon_vluti2q_laneq_u8: {
7719     Int = Intrinsic::aarch64_neon_vluti2_laneq;
7720     llvm::Type *Tys[2];
7721     Tys[0] = Ty;
7722     Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7723                                              /*isQuad*/ true));
7724     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq");
7725   }
7726   case NEON::BI__builtin_neon_vluti2_lane_mf8:
7727   case NEON::BI__builtin_neon_vluti2_lane_bf16:
7728   case NEON::BI__builtin_neon_vluti2_lane_f16:
7729   case NEON::BI__builtin_neon_vluti2_lane_p16:
7730   case NEON::BI__builtin_neon_vluti2_lane_p8:
7731   case NEON::BI__builtin_neon_vluti2_lane_s16:
7732   case NEON::BI__builtin_neon_vluti2_lane_s8:
7733   case NEON::BI__builtin_neon_vluti2_lane_u16:
7734   case NEON::BI__builtin_neon_vluti2_lane_u8: {
7735     Int = Intrinsic::aarch64_neon_vluti2_lane;
7736     llvm::Type *Tys[2];
7737     Tys[0] = Ty;
7738     Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7739                                              /*isQuad*/ false));
7740     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane");
7741   }
7742   case NEON::BI__builtin_neon_vluti2q_lane_mf8:
7743   case NEON::BI__builtin_neon_vluti2q_lane_bf16:
7744   case NEON::BI__builtin_neon_vluti2q_lane_f16:
7745   case NEON::BI__builtin_neon_vluti2q_lane_p16:
7746   case NEON::BI__builtin_neon_vluti2q_lane_p8:
7747   case NEON::BI__builtin_neon_vluti2q_lane_s16:
7748   case NEON::BI__builtin_neon_vluti2q_lane_s8:
7749   case NEON::BI__builtin_neon_vluti2q_lane_u16:
7750   case NEON::BI__builtin_neon_vluti2q_lane_u8: {
7751     Int = Intrinsic::aarch64_neon_vluti2_lane;
7752     llvm::Type *Tys[2];
7753     Tys[0] = Ty;
7754     Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7755                                              /*isQuad*/ true));
7756     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane");
7757   }
7758   case NEON::BI__builtin_neon_vluti4q_lane_mf8:
7759   case NEON::BI__builtin_neon_vluti4q_lane_p8:
7760   case NEON::BI__builtin_neon_vluti4q_lane_s8:
7761   case NEON::BI__builtin_neon_vluti4q_lane_u8: {
7762     Int = Intrinsic::aarch64_neon_vluti4q_lane;
7763     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane");
7764   }
7765   case NEON::BI__builtin_neon_vluti4q_laneq_mf8:
7766   case NEON::BI__builtin_neon_vluti4q_laneq_p8:
7767   case NEON::BI__builtin_neon_vluti4q_laneq_s8:
7768   case NEON::BI__builtin_neon_vluti4q_laneq_u8: {
7769     Int = Intrinsic::aarch64_neon_vluti4q_laneq;
7770     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq");
7771   }
7772   case NEON::BI__builtin_neon_vluti4q_lane_bf16_x2:
7773   case NEON::BI__builtin_neon_vluti4q_lane_f16_x2:
7774   case NEON::BI__builtin_neon_vluti4q_lane_p16_x2:
7775   case NEON::BI__builtin_neon_vluti4q_lane_s16_x2:
7776   case NEON::BI__builtin_neon_vluti4q_lane_u16_x2: {
7777     Int = Intrinsic::aarch64_neon_vluti4q_lane_x2;
7778     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane_x2");
7779   }
7780   case NEON::BI__builtin_neon_vluti4q_laneq_bf16_x2:
7781   case NEON::BI__builtin_neon_vluti4q_laneq_f16_x2:
7782   case NEON::BI__builtin_neon_vluti4q_laneq_p16_x2:
7783   case NEON::BI__builtin_neon_vluti4q_laneq_s16_x2:
7784   case NEON::BI__builtin_neon_vluti4q_laneq_u16_x2: {
7785     Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
7786     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
7787   }
7788   case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
7789     ExtractLow = true;
7790     [[fallthrough]];
7791   case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
7792   case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm:
7793     return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
7794                               llvm::FixedVectorType::get(BFloatTy, 8),
7795                               Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
7796   case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
7797     ExtractLow = true;
7798     [[fallthrough]];
7799   case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
7800   case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm:
7801     return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
7802                               llvm::FixedVectorType::get(BFloatTy, 8),
7803                               Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
7804   case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
7805     ExtractLow = true;
7806     [[fallthrough]];
7807   case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
7808   case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm:
7809     return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
7810                               llvm::FixedVectorType::get(HalfTy, 8),
7811                               Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
7812   case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
7813     ExtractLow = true;
7814     [[fallthrough]];
7815   case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
7816   case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm:
7817     return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
7818                               llvm::FixedVectorType::get(HalfTy, 8),
7819                               Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
7820   case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm:
7821     return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
7822                               llvm::FixedVectorType::get(Int8Ty, 8),
7823                               Ops[0]->getType(), false, Ops, E, "vfcvtn");
7824   case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm:
7825     return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
7826                               llvm::FixedVectorType::get(Int8Ty, 8),
7827                               llvm::FixedVectorType::get(HalfTy, 4), false, Ops,
7828                               E, "vfcvtn");
7829   case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm:
7830     return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
7831                               llvm::FixedVectorType::get(Int8Ty, 16),
7832                               llvm::FixedVectorType::get(HalfTy, 8), false, Ops,
7833                               E, "vfcvtn");
7834   case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
7835     llvm::Type *Ty = llvm::FixedVectorType::get(Int8Ty, 16);
7836     Ops[0] = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
7837                                         uint64_t(0));
7838     return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2, Ty,
7839                               Ops[1]->getType(), false, Ops, E, "vfcvtn2");
7840   }
7841 
7842   case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
7843   case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
7844     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy,
7845                                Ops, E, "fdot2");
7846   case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
7847   case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
7848     ExtendLaneArg = true;
7849     [[fallthrough]];
7850   case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
7851   case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
7852     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
7853                                ExtendLaneArg, HalfTy, Ops, E, "fdot2_lane");
7854   case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
7855   case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
7856     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
7857                                FloatTy, Ops, E, "fdot4");
7858   case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
7859   case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
7860     ExtendLaneArg = true;
7861     [[fallthrough]];
7862   case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
7863   case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
7864     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
7865                                ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane");
7866 
7867   case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm:
7868     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalb,
7869                            {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
7870                            "vmlal");
7871   case NEON::BI__builtin_neon_vmlaltq_f16_mf8_fpm:
7872     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalt,
7873                            {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
7874                            "vmlal");
7875   case NEON::BI__builtin_neon_vmlallbbq_f32_mf8_fpm:
7876     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbb,
7877                            {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
7878                            "vmlall");
7879   case NEON::BI__builtin_neon_vmlallbtq_f32_mf8_fpm:
7880     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbt,
7881                            {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
7882                            "vmlall");
7883   case NEON::BI__builtin_neon_vmlalltbq_f32_mf8_fpm:
7884     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltb,
7885                            {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
7886                            "vmlall");
7887   case NEON::BI__builtin_neon_vmlallttq_f32_mf8_fpm:
7888     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltt,
7889                            {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
7890                            "vmlall");
7891   case NEON::BI__builtin_neon_vmlalbq_lane_f16_mf8_fpm:
7892     ExtendLaneArg = true;
7893     LLVM_FALLTHROUGH;
7894   case NEON::BI__builtin_neon_vmlalbq_laneq_f16_mf8_fpm:
7895     return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalb_lane,
7896                                ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane");
7897   case NEON::BI__builtin_neon_vmlaltq_lane_f16_mf8_fpm:
7898     ExtendLaneArg = true;
7899     LLVM_FALLTHROUGH;
7900   case NEON::BI__builtin_neon_vmlaltq_laneq_f16_mf8_fpm:
7901     return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalt_lane,
7902                                ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane");
7903   case NEON::BI__builtin_neon_vmlallbbq_lane_f32_mf8_fpm:
7904     ExtendLaneArg = true;
7905     [[fallthrough]];
7906   case NEON::BI__builtin_neon_vmlallbbq_laneq_f32_mf8_fpm:
7907     return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbb_lane,
7908                                ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
7909   case NEON::BI__builtin_neon_vmlallbtq_lane_f32_mf8_fpm:
7910     ExtendLaneArg = true;
7911     [[fallthrough]];
7912   case NEON::BI__builtin_neon_vmlallbtq_laneq_f32_mf8_fpm:
7913     return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbt_lane,
7914                                ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
7915   case NEON::BI__builtin_neon_vmlalltbq_lane_f32_mf8_fpm:
7916     ExtendLaneArg = true;
7917     [[fallthrough]];
7918   case NEON::BI__builtin_neon_vmlalltbq_laneq_f32_mf8_fpm:
7919     return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltb_lane,
7920                                ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
7921   case NEON::BI__builtin_neon_vmlallttq_lane_f32_mf8_fpm:
7922     ExtendLaneArg = true;
7923     [[fallthrough]];
7924   case NEON::BI__builtin_neon_vmlallttq_laneq_f32_mf8_fpm:
7925     return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltt_lane,
7926                                ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
7927   case NEON::BI__builtin_neon_vamin_f16:
7928   case NEON::BI__builtin_neon_vaminq_f16:
7929   case NEON::BI__builtin_neon_vamin_f32:
7930   case NEON::BI__builtin_neon_vaminq_f32:
7931   case NEON::BI__builtin_neon_vaminq_f64: {
7932     Int = Intrinsic::aarch64_neon_famin;
7933     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "famin");
7934   }
7935   case NEON::BI__builtin_neon_vamax_f16:
7936   case NEON::BI__builtin_neon_vamaxq_f16:
7937   case NEON::BI__builtin_neon_vamax_f32:
7938   case NEON::BI__builtin_neon_vamaxq_f32:
7939   case NEON::BI__builtin_neon_vamaxq_f64: {
7940     Int = Intrinsic::aarch64_neon_famax;
7941     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "famax");
7942   }
7943   case NEON::BI__builtin_neon_vscale_f16:
7944   case NEON::BI__builtin_neon_vscaleq_f16:
7945   case NEON::BI__builtin_neon_vscale_f32:
7946   case NEON::BI__builtin_neon_vscaleq_f32:
7947   case NEON::BI__builtin_neon_vscaleq_f64: {
7948     Int = Intrinsic::aarch64_neon_fp8_fscale;
7949     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fscale");
7950   }
7951   }
7952 }
7953 
7954 Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID,
7955                                            const CallExpr *E) {
7956   assert((BuiltinID == BPF::BI__builtin_preserve_field_info ||
7957           BuiltinID == BPF::BI__builtin_btf_type_id ||
7958           BuiltinID == BPF::BI__builtin_preserve_type_info ||
7959           BuiltinID == BPF::BI__builtin_preserve_enum_value) &&
7960          "unexpected BPF builtin");
7961 
7962   // A sequence number, injected into IR builtin functions, to
7963   // prevent CSE when the only difference between two calls
7964   // may be the debuginfo metadata.
7965   static uint32_t BuiltinSeqNum;
7966 
7967   switch (BuiltinID) {
7968   default:
7969     llvm_unreachable("Unexpected BPF builtin");
7970   case BPF::BI__builtin_preserve_field_info: {
7971     const Expr *Arg = E->getArg(0);
7972     bool IsBitField = Arg->IgnoreParens()->getObjectKind() == OK_BitField;
7973 
7974     if (!getDebugInfo()) {
7975       CGM.Error(E->getExprLoc(),
7976                 "using __builtin_preserve_field_info() without -g");
7977       return IsBitField ? EmitLValue(Arg).getRawBitFieldPointer(*this)
7978                         : EmitLValue(Arg).emitRawPointer(*this);
7979     }
7980 
7981     // Enable underlying preserve_*_access_index() generation.
7982     bool OldIsInPreservedAIRegion = IsInPreservedAIRegion;
7983     IsInPreservedAIRegion = true;
7984     Value *FieldAddr = IsBitField ? EmitLValue(Arg).getRawBitFieldPointer(*this)
7985                                   : EmitLValue(Arg).emitRawPointer(*this);
7986     IsInPreservedAIRegion = OldIsInPreservedAIRegion;
7987 
7988     ConstantInt *C = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
7989     Value *InfoKind = ConstantInt::get(Int64Ty, C->getSExtValue());
7990 
7991     // Build the IR for the preserve_field_info intrinsic.
7992     llvm::Function *FnGetFieldInfo = Intrinsic::getOrInsertDeclaration(
7993         &CGM.getModule(), Intrinsic::bpf_preserve_field_info,
7994         {FieldAddr->getType()});
7995     return Builder.CreateCall(FnGetFieldInfo, {FieldAddr, InfoKind});
7996   }
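  // For reference, an illustrative (not authoritative) C-level use of the
  // builtin handled above, as emitted by BPF CO-RE helper macros:
  //   unsigned off = __builtin_preserve_field_info(s->f, 0 /*byte offset*/);
  // where the second argument selects the info kind lowered above.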
7997   case BPF::BI__builtin_btf_type_id:
7998   case BPF::BI__builtin_preserve_type_info: {
7999     if (!getDebugInfo()) {
8000       CGM.Error(E->getExprLoc(), "using builtin function without -g");
8001       return nullptr;
8002     }
8003 
8004     const Expr *Arg0 = E->getArg(0);
8005     llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
8006         Arg0->getType(), Arg0->getExprLoc());
8007 
8008     ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
8009     Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
8010     Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);
8011 
8012     llvm::Function *FnDecl;
8013     if (BuiltinID == BPF::BI__builtin_btf_type_id)
8014       FnDecl = Intrinsic::getOrInsertDeclaration(
8015           &CGM.getModule(), Intrinsic::bpf_btf_type_id, {});
8016     else
8017       FnDecl = Intrinsic::getOrInsertDeclaration(
8018           &CGM.getModule(), Intrinsic::bpf_preserve_type_info, {});
8019     CallInst *Fn = Builder.CreateCall(FnDecl, {SeqNumVal, FlagValue});
8020     Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
8021     return Fn;
8022   }
8023   case BPF::BI__builtin_preserve_enum_value: {
8024     if (!getDebugInfo()) {
8025       CGM.Error(E->getExprLoc(), "using builtin function without -g");
8026       return nullptr;
8027     }
8028 
8029     const Expr *Arg0 = E->getArg(0);
8030     llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
8031         Arg0->getType(), Arg0->getExprLoc());
8032 
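    // The cast chain below relies on the frontend-enforced argument shape
    //   __builtin_preserve_enum_value(*(enum Foo *)FOO_VALUE, flag)
    // i.e. a dereference of a C-style cast of an enumerator reference.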
8033     // Find the enumerator.
8034     const auto *UO = cast<UnaryOperator>(Arg0->IgnoreParens());
8035     const auto *CE = cast<CStyleCastExpr>(UO->getSubExpr());
8036     const auto *DR = cast<DeclRefExpr>(CE->getSubExpr());
8037     const auto *Enumerator = cast<EnumConstantDecl>(DR->getDecl());
8038 
8039     auto InitVal = Enumerator->getInitVal();
8040     std::string InitValStr;
8041     if (InitVal.isNegative() || InitVal > uint64_t(INT64_MAX))
8042       InitValStr = std::to_string(InitVal.getSExtValue());
8043     else
8044       InitValStr = std::to_string(InitVal.getZExtValue());
8045     std::string EnumStr = Enumerator->getNameAsString() + ":" + InitValStr;
8046     Value *EnumStrVal = Builder.CreateGlobalString(EnumStr);
8047 
8048     ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
8049     Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
8050     Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);
8051 
8052     llvm::Function *IntrinsicFn = Intrinsic::getOrInsertDeclaration(
8053         &CGM.getModule(), Intrinsic::bpf_preserve_enum_value, {});
8054     CallInst *Fn =
8055         Builder.CreateCall(IntrinsicFn, {SeqNumVal, EnumStrVal, FlagValue});
8056     Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
8057     return Fn;
8058   }
8059   }
8060 }
8061 
8062 llvm::Value *CodeGenFunction::
8063 BuildVector(ArrayRef<llvm::Value*> Ops) {
8064   assert((Ops.size() & (Ops.size() - 1)) == 0 &&
8065          "Not a power-of-two sized vector!");
8066   bool AllConstants = true;
8067   for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
8068     AllConstants &= isa<Constant>(Ops[i]);
8069 
8070   // If this is a constant vector, create a ConstantVector.
8071   if (AllConstants) {
8072     SmallVector<llvm::Constant*, 16> CstOps;
8073     for (llvm::Value *Op : Ops)
8074       CstOps.push_back(cast<Constant>(Op));
8075     return llvm::ConstantVector::get(CstOps);
8076   }
8077 
8078   // Otherwise, insertelement the values to build the vector.
8079   Value *Result = llvm::PoisonValue::get(
8080       llvm::FixedVectorType::get(Ops[0]->getType(), Ops.size()));
8081 
8082   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
8083     Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt64(i));
8084 
8085   return Result;
8086 }
8087 
8088 Value *CodeGenFunction::EmitAArch64CpuInit() {
8089   llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
8090   llvm::FunctionCallee Func =
8091       CGM.CreateRuntimeFunction(FTy, "__init_cpu_features_resolver");
8092   cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
8093   cast<llvm::GlobalValue>(Func.getCallee())
8094       ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
8095   return Builder.CreateCall(Func);
8096 }
8097 
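// A sketch of how these pieces fit together (assuming the compiler-rt
// runtime): function-multiversioning resolvers call the
// __init_cpu_features_resolver() helper emitted above before reading the
// __aarch64_cpu_features bitset tested by EmitAArch64CpuSupports below.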
8098 Value *CodeGenFunction::EmitAArch64CpuSupports(const CallExpr *E) {
8099   const Expr *ArgExpr = E->getArg(0)->IgnoreParenCasts();
8100   StringRef ArgStr = cast<StringLiteral>(ArgExpr)->getString();
8101   llvm::SmallVector<StringRef, 8> Features;
8102   ArgStr.split(Features, "+");
8103   for (auto &Feature : Features) {
8104     Feature = Feature.trim();
8105     if (!llvm::AArch64::parseFMVExtension(Feature))
8106       return Builder.getFalse();
8107     if (Feature != "default")
8108       Features.push_back(Feature);
8109   }
8110   return EmitAArch64CpuSupports(Features);
8111 }
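// Illustrative source-level use of the overload above (feature names are
// joined with '+'):
//   if (__builtin_cpu_supports("sve2+bf16")) { ... }
// An unknown feature name makes the whole expression fold to false.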
8112 
8113 llvm::Value *
8114 CodeGenFunction::EmitAArch64CpuSupports(ArrayRef<StringRef> FeaturesStrs) {
8115   uint64_t FeaturesMask = llvm::AArch64::getCpuSupportsMask(FeaturesStrs);
8116   Value *Result = Builder.getTrue();
8117   if (FeaturesMask != 0) {
8118     // Get features from structure in runtime library
8119     // struct {
8120     //   unsigned long long features;
8121     // } __aarch64_cpu_features;
8122     llvm::Type *STy = llvm::StructType::get(Int64Ty);
8123     llvm::Constant *AArch64CPUFeatures =
8124         CGM.CreateRuntimeVariable(STy, "__aarch64_cpu_features");
8125     cast<llvm::GlobalValue>(AArch64CPUFeatures)->setDSOLocal(true);
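    // Rough shape of the emitted check (a sketch; value names invented):
    //   %feats = load i64, ptr @__aarch64_cpu_features, align 8
    //   %masked = and i64 %feats, <FeaturesMask>
    //   %ok = icmp eq i64 %masked, <FeaturesMask>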
8126     llvm::Value *CpuFeatures = Builder.CreateGEP(
8127         STy, AArch64CPUFeatures,
8128         {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 0)});
8129     Value *Features = Builder.CreateAlignedLoad(Int64Ty, CpuFeatures,
8130                                                 CharUnits::fromQuantity(8));
8131     Value *Mask = Builder.getInt64(FeaturesMask);
8132     Value *Bitset = Builder.CreateAnd(Features, Mask);
8133     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
8134     Result = Builder.CreateAnd(Result, Cmp);
8135   }
8136   return Result;
8137 }
8138