//===---------- ARM.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "ABIInfo.h"
#include "CGBuiltin.h"
#include "CGDebugInfo.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsBPF.h"
#include "llvm/TargetParser/AArch64TargetParser.h"

#include <numeric>

using namespace clang;
using namespace CodeGen;
using namespace llvm;

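// Map an MSVC-compatible AArch64 builtin to the portable MSVCIntrin that
// implements it, or return std::nullopt if there is no equivalent.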
static std::optional<CodeGenFunction::MSVCIntrin>
translateAarch64ToMsvcIntrin(unsigned BuiltinID) {
  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
  switch (BuiltinID) {
  default:
    return std::nullopt;
  case clang::AArch64::BI_BitScanForward:
  case clang::AArch64::BI_BitScanForward64:
    return MSVCIntrin::_BitScanForward;
  case clang::AArch64::BI_BitScanReverse:
  case clang::AArch64::BI_BitScanReverse64:
    return MSVCIntrin::_BitScanReverse;
  case clang::AArch64::BI_InterlockedAnd64:
    return MSVCIntrin::_InterlockedAnd;
  case clang::AArch64::BI_InterlockedExchange64:
    return MSVCIntrin::_InterlockedExchange;
  case clang::AArch64::BI_InterlockedExchangeAdd64:
    return MSVCIntrin::_InterlockedExchangeAdd;
  case clang::AArch64::BI_InterlockedExchangeSub64:
    return MSVCIntrin::_InterlockedExchangeSub;
  case clang::AArch64::BI_InterlockedOr64:
    return MSVCIntrin::_InterlockedOr;
  case clang::AArch64::BI_InterlockedXor64:
    return MSVCIntrin::_InterlockedXor;
  case clang::AArch64::BI_InterlockedDecrement64:
    return MSVCIntrin::_InterlockedDecrement;
  case clang::AArch64::BI_InterlockedIncrement64:
    return MSVCIntrin::_InterlockedIncrement;
  case clang::AArch64::BI_InterlockedExchangeAdd8_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd16_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd64_acq:
    return MSVCIntrin::_InterlockedExchangeAdd_acq;
  case clang::AArch64::BI_InterlockedExchangeAdd8_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd16_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd64_rel:
    return MSVCIntrin::_InterlockedExchangeAdd_rel;
  case clang::AArch64::BI_InterlockedExchangeAdd8_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd16_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd64_nf:
    return MSVCIntrin::_InterlockedExchangeAdd_nf;
  case clang::AArch64::BI_InterlockedExchange8_acq:
  case clang::AArch64::BI_InterlockedExchange16_acq:
  case clang::AArch64::BI_InterlockedExchange_acq:
  case clang::AArch64::BI_InterlockedExchange64_acq:
  case clang::AArch64::BI_InterlockedExchangePointer_acq:
    return MSVCIntrin::_InterlockedExchange_acq;
  case clang::AArch64::BI_InterlockedExchange8_rel:
  case clang::AArch64::BI_InterlockedExchange16_rel:
  case clang::AArch64::BI_InterlockedExchange_rel:
  case clang::AArch64::BI_InterlockedExchange64_rel:
  case clang::AArch64::BI_InterlockedExchangePointer_rel:
    return MSVCIntrin::_InterlockedExchange_rel;
  case clang::AArch64::BI_InterlockedExchange8_nf:
  case clang::AArch64::BI_InterlockedExchange16_nf:
  case clang::AArch64::BI_InterlockedExchange_nf:
  case clang::AArch64::BI_InterlockedExchange64_nf:
  case clang::AArch64::BI_InterlockedExchangePointer_nf:
    return MSVCIntrin::_InterlockedExchange_nf;
  case clang::AArch64::BI_InterlockedCompareExchange8_acq:
  case clang::AArch64::BI_InterlockedCompareExchange16_acq:
  case clang::AArch64::BI_InterlockedCompareExchange_acq:
  case clang::AArch64::BI_InterlockedCompareExchange64_acq:
  case clang::AArch64::BI_InterlockedCompareExchangePointer_acq:
    return MSVCIntrin::_InterlockedCompareExchange_acq;
  case clang::AArch64::BI_InterlockedCompareExchange8_rel:
  case clang::AArch64::BI_InterlockedCompareExchange16_rel:
  case clang::AArch64::BI_InterlockedCompareExchange_rel:
  case clang::AArch64::BI_InterlockedCompareExchange64_rel:
  case clang::AArch64::BI_InterlockedCompareExchangePointer_rel:
    return MSVCIntrin::_InterlockedCompareExchange_rel;
  case clang::AArch64::BI_InterlockedCompareExchange8_nf:
  case clang::AArch64::BI_InterlockedCompareExchange16_nf:
  case clang::AArch64::BI_InterlockedCompareExchange_nf:
  case clang::AArch64::BI_InterlockedCompareExchange64_nf:
    return MSVCIntrin::_InterlockedCompareExchange_nf;
  case clang::AArch64::BI_InterlockedCompareExchange128:
    return MSVCIntrin::_InterlockedCompareExchange128;
  case clang::AArch64::BI_InterlockedCompareExchange128_acq:
    return MSVCIntrin::_InterlockedCompareExchange128_acq;
  case clang::AArch64::BI_InterlockedCompareExchange128_nf:
    return MSVCIntrin::_InterlockedCompareExchange128_nf;
  case clang::AArch64::BI_InterlockedCompareExchange128_rel:
    return MSVCIntrin::_InterlockedCompareExchange128_rel;
  case clang::AArch64::BI_InterlockedOr8_acq:
  case clang::AArch64::BI_InterlockedOr16_acq:
  case clang::AArch64::BI_InterlockedOr_acq:
  case clang::AArch64::BI_InterlockedOr64_acq:
    return MSVCIntrin::_InterlockedOr_acq;
  case clang::AArch64::BI_InterlockedOr8_rel:
  case clang::AArch64::BI_InterlockedOr16_rel:
  case clang::AArch64::BI_InterlockedOr_rel:
  case clang::AArch64::BI_InterlockedOr64_rel:
    return MSVCIntrin::_InterlockedOr_rel;
  case clang::AArch64::BI_InterlockedOr8_nf:
  case clang::AArch64::BI_InterlockedOr16_nf:
  case clang::AArch64::BI_InterlockedOr_nf:
  case clang::AArch64::BI_InterlockedOr64_nf:
    return MSVCIntrin::_InterlockedOr_nf;
  case clang::AArch64::BI_InterlockedXor8_acq:
  case clang::AArch64::BI_InterlockedXor16_acq:
  case clang::AArch64::BI_InterlockedXor_acq:
  case clang::AArch64::BI_InterlockedXor64_acq:
    return MSVCIntrin::_InterlockedXor_acq;
  case clang::AArch64::BI_InterlockedXor8_rel:
  case clang::AArch64::BI_InterlockedXor16_rel:
  case clang::AArch64::BI_InterlockedXor_rel:
  case clang::AArch64::BI_InterlockedXor64_rel:
    return MSVCIntrin::_InterlockedXor_rel;
  case clang::AArch64::BI_InterlockedXor8_nf:
  case clang::AArch64::BI_InterlockedXor16_nf:
  case clang::AArch64::BI_InterlockedXor_nf:
  case clang::AArch64::BI_InterlockedXor64_nf:
    return MSVCIntrin::_InterlockedXor_nf;
  case clang::AArch64::BI_InterlockedAnd8_acq:
  case clang::AArch64::BI_InterlockedAnd16_acq:
  case clang::AArch64::BI_InterlockedAnd_acq:
  case clang::AArch64::BI_InterlockedAnd64_acq:
    return MSVCIntrin::_InterlockedAnd_acq;
  case clang::AArch64::BI_InterlockedAnd8_rel:
  case clang::AArch64::BI_InterlockedAnd16_rel:
  case clang::AArch64::BI_InterlockedAnd_rel:
  case clang::AArch64::BI_InterlockedAnd64_rel:
    return MSVCIntrin::_InterlockedAnd_rel;
  case clang::AArch64::BI_InterlockedAnd8_nf:
  case clang::AArch64::BI_InterlockedAnd16_nf:
  case clang::AArch64::BI_InterlockedAnd_nf:
  case clang::AArch64::BI_InterlockedAnd64_nf:
    return MSVCIntrin::_InterlockedAnd_nf;
  case clang::AArch64::BI_InterlockedIncrement16_acq:
  case clang::AArch64::BI_InterlockedIncrement_acq:
  case clang::AArch64::BI_InterlockedIncrement64_acq:
    return MSVCIntrin::_InterlockedIncrement_acq;
  case clang::AArch64::BI_InterlockedIncrement16_rel:
  case clang::AArch64::BI_InterlockedIncrement_rel:
  case clang::AArch64::BI_InterlockedIncrement64_rel:
    return MSVCIntrin::_InterlockedIncrement_rel;
  case clang::AArch64::BI_InterlockedIncrement16_nf:
  case clang::AArch64::BI_InterlockedIncrement_nf:
  case clang::AArch64::BI_InterlockedIncrement64_nf:
    return MSVCIntrin::_InterlockedIncrement_nf;
  case clang::AArch64::BI_InterlockedDecrement16_acq:
  case clang::AArch64::BI_InterlockedDecrement_acq:
  case clang::AArch64::BI_InterlockedDecrement64_acq:
    return MSVCIntrin::_InterlockedDecrement_acq;
  case clang::AArch64::BI_InterlockedDecrement16_rel:
  case clang::AArch64::BI_InterlockedDecrement_rel:
  case clang::AArch64::BI_InterlockedDecrement64_rel:
    return MSVCIntrin::_InterlockedDecrement_rel;
  case clang::AArch64::BI_InterlockedDecrement16_nf:
  case clang::AArch64::BI_InterlockedDecrement_nf:
  case clang::AArch64::BI_InterlockedDecrement64_nf:
    return MSVCIntrin::_InterlockedDecrement_nf;
  }
  llvm_unreachable("must return from switch");
}

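// Likewise, map an MSVC-compatible 32-bit ARM builtin to the portable
// MSVCIntrin that implements it, or return std::nullopt if there is none.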
static std::optional<CodeGenFunction::MSVCIntrin>
translateArmToMsvcIntrin(unsigned BuiltinID) {
  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
  switch (BuiltinID) {
  default:
    return std::nullopt;
  case clang::ARM::BI_BitScanForward:
  case clang::ARM::BI_BitScanForward64:
    return MSVCIntrin::_BitScanForward;
  case clang::ARM::BI_BitScanReverse:
  case clang::ARM::BI_BitScanReverse64:
    return MSVCIntrin::_BitScanReverse;
  case clang::ARM::BI_InterlockedAnd64:
    return MSVCIntrin::_InterlockedAnd;
  case clang::ARM::BI_InterlockedExchange64:
    return MSVCIntrin::_InterlockedExchange;
  case clang::ARM::BI_InterlockedExchangeAdd64:
    return MSVCIntrin::_InterlockedExchangeAdd;
  case clang::ARM::BI_InterlockedExchangeSub64:
    return MSVCIntrin::_InterlockedExchangeSub;
  case clang::ARM::BI_InterlockedOr64:
    return MSVCIntrin::_InterlockedOr;
  case clang::ARM::BI_InterlockedXor64:
    return MSVCIntrin::_InterlockedXor;
  case clang::ARM::BI_InterlockedDecrement64:
    return MSVCIntrin::_InterlockedDecrement;
  case clang::ARM::BI_InterlockedIncrement64:
    return MSVCIntrin::_InterlockedIncrement;
  case clang::ARM::BI_InterlockedExchangeAdd8_acq:
  case clang::ARM::BI_InterlockedExchangeAdd16_acq:
  case clang::ARM::BI_InterlockedExchangeAdd_acq:
  case clang::ARM::BI_InterlockedExchangeAdd64_acq:
    return MSVCIntrin::_InterlockedExchangeAdd_acq;
  case clang::ARM::BI_InterlockedExchangeAdd8_rel:
  case clang::ARM::BI_InterlockedExchangeAdd16_rel:
  case clang::ARM::BI_InterlockedExchangeAdd_rel:
  case clang::ARM::BI_InterlockedExchangeAdd64_rel:
    return MSVCIntrin::_InterlockedExchangeAdd_rel;
  case clang::ARM::BI_InterlockedExchangeAdd8_nf:
  case clang::ARM::BI_InterlockedExchangeAdd16_nf:
  case clang::ARM::BI_InterlockedExchangeAdd_nf:
  case clang::ARM::BI_InterlockedExchangeAdd64_nf:
    return MSVCIntrin::_InterlockedExchangeAdd_nf;
  case clang::ARM::BI_InterlockedExchange8_acq:
  case clang::ARM::BI_InterlockedExchange16_acq:
  case clang::ARM::BI_InterlockedExchange_acq:
  case clang::ARM::BI_InterlockedExchange64_acq:
  case clang::ARM::BI_InterlockedExchangePointer_acq:
    return MSVCIntrin::_InterlockedExchange_acq;
  case clang::ARM::BI_InterlockedExchange8_rel:
  case clang::ARM::BI_InterlockedExchange16_rel:
  case clang::ARM::BI_InterlockedExchange_rel:
  case clang::ARM::BI_InterlockedExchange64_rel:
  case clang::ARM::BI_InterlockedExchangePointer_rel:
    return MSVCIntrin::_InterlockedExchange_rel;
  case clang::ARM::BI_InterlockedExchange8_nf:
  case clang::ARM::BI_InterlockedExchange16_nf:
  case clang::ARM::BI_InterlockedExchange_nf:
  case clang::ARM::BI_InterlockedExchange64_nf:
  case clang::ARM::BI_InterlockedExchangePointer_nf:
    return MSVCIntrin::_InterlockedExchange_nf;
  case clang::ARM::BI_InterlockedCompareExchange8_acq:
  case clang::ARM::BI_InterlockedCompareExchange16_acq:
  case clang::ARM::BI_InterlockedCompareExchange_acq:
  case clang::ARM::BI_InterlockedCompareExchange64_acq:
  case clang::ARM::BI_InterlockedCompareExchangePointer_acq:
    return MSVCIntrin::_InterlockedCompareExchange_acq;
  case clang::ARM::BI_InterlockedCompareExchange8_rel:
  case clang::ARM::BI_InterlockedCompareExchange16_rel:
  case clang::ARM::BI_InterlockedCompareExchange_rel:
  case clang::ARM::BI_InterlockedCompareExchange64_rel:
  case clang::ARM::BI_InterlockedCompareExchangePointer_rel:
    return MSVCIntrin::_InterlockedCompareExchange_rel;
  case clang::ARM::BI_InterlockedCompareExchange8_nf:
  case clang::ARM::BI_InterlockedCompareExchange16_nf:
  case clang::ARM::BI_InterlockedCompareExchange_nf:
  case clang::ARM::BI_InterlockedCompareExchange64_nf:
    return MSVCIntrin::_InterlockedCompareExchange_nf;
  case clang::ARM::BI_InterlockedOr8_acq:
  case clang::ARM::BI_InterlockedOr16_acq:
  case clang::ARM::BI_InterlockedOr_acq:
  case clang::ARM::BI_InterlockedOr64_acq:
    return MSVCIntrin::_InterlockedOr_acq;
  case clang::ARM::BI_InterlockedOr8_rel:
  case clang::ARM::BI_InterlockedOr16_rel:
  case clang::ARM::BI_InterlockedOr_rel:
  case clang::ARM::BI_InterlockedOr64_rel:
    return MSVCIntrin::_InterlockedOr_rel;
  case clang::ARM::BI_InterlockedOr8_nf:
  case clang::ARM::BI_InterlockedOr16_nf:
  case clang::ARM::BI_InterlockedOr_nf:
  case clang::ARM::BI_InterlockedOr64_nf:
    return MSVCIntrin::_InterlockedOr_nf;
  case clang::ARM::BI_InterlockedXor8_acq:
  case clang::ARM::BI_InterlockedXor16_acq:
  case clang::ARM::BI_InterlockedXor_acq:
  case clang::ARM::BI_InterlockedXor64_acq:
    return MSVCIntrin::_InterlockedXor_acq;
  case clang::ARM::BI_InterlockedXor8_rel:
  case clang::ARM::BI_InterlockedXor16_rel:
  case clang::ARM::BI_InterlockedXor_rel:
  case clang::ARM::BI_InterlockedXor64_rel:
    return MSVCIntrin::_InterlockedXor_rel;
  case clang::ARM::BI_InterlockedXor8_nf:
  case clang::ARM::BI_InterlockedXor16_nf:
  case clang::ARM::BI_InterlockedXor_nf:
  case clang::ARM::BI_InterlockedXor64_nf:
    return MSVCIntrin::_InterlockedXor_nf;
  case clang::ARM::BI_InterlockedAnd8_acq:
  case clang::ARM::BI_InterlockedAnd16_acq:
  case clang::ARM::BI_InterlockedAnd_acq:
  case clang::ARM::BI_InterlockedAnd64_acq:
    return MSVCIntrin::_InterlockedAnd_acq;
  case clang::ARM::BI_InterlockedAnd8_rel:
  case clang::ARM::BI_InterlockedAnd16_rel:
  case clang::ARM::BI_InterlockedAnd_rel:
  case clang::ARM::BI_InterlockedAnd64_rel:
    return MSVCIntrin::_InterlockedAnd_rel;
  case clang::ARM::BI_InterlockedAnd8_nf:
  case clang::ARM::BI_InterlockedAnd16_nf:
  case clang::ARM::BI_InterlockedAnd_nf:
  case clang::ARM::BI_InterlockedAnd64_nf:
    return MSVCIntrin::_InterlockedAnd_nf;
  case clang::ARM::BI_InterlockedIncrement16_acq:
  case clang::ARM::BI_InterlockedIncrement_acq:
  case clang::ARM::BI_InterlockedIncrement64_acq:
    return MSVCIntrin::_InterlockedIncrement_acq;
  case clang::ARM::BI_InterlockedIncrement16_rel:
  case clang::ARM::BI_InterlockedIncrement_rel:
  case clang::ARM::BI_InterlockedIncrement64_rel:
    return MSVCIntrin::_InterlockedIncrement_rel;
  case clang::ARM::BI_InterlockedIncrement16_nf:
  case clang::ARM::BI_InterlockedIncrement_nf:
  case clang::ARM::BI_InterlockedIncrement64_nf:
    return MSVCIntrin::_InterlockedIncrement_nf;
  case clang::ARM::BI_InterlockedDecrement16_acq:
  case clang::ARM::BI_InterlockedDecrement_acq:
  case clang::ARM::BI_InterlockedDecrement64_acq:
    return MSVCIntrin::_InterlockedDecrement_acq;
  case clang::ARM::BI_InterlockedDecrement16_rel:
  case clang::ARM::BI_InterlockedDecrement_rel:
  case clang::ARM::BI_InterlockedDecrement64_rel:
    return MSVCIntrin::_InterlockedDecrement_rel;
  case clang::ARM::BI_InterlockedDecrement16_nf:
  case clang::ARM::BI_InterlockedDecrement_nf:
  case clang::ARM::BI_InterlockedDecrement64_nf:
    return MSVCIntrin::_InterlockedDecrement_nf;
  }
  llvm_unreachable("must return from switch");
}

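// Note: many NEON builtins have both a regular and a constrained
// floating-point lowering; the helper below selects between them based on
// whether the builder is in FP-constrained mode (e.g. under strict FP
// semantics).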
// Emit an intrinsic where all operands are of the same type as the result.
// Depending on mode, this may be a constrained floating-point intrinsic.
static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
                                                unsigned IntrinsicID,
                                                unsigned ConstrainedIntrinsicID,
                                                llvm::Type *Ty,
                                                ArrayRef<Value *> Args) {
  Function *F;
  if (CGF.Builder.getIsFPConstrained())
    F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Ty);
  else
    F = CGF.CGM.getIntrinsic(IntrinsicID, Ty);

  if (CGF.Builder.getIsFPConstrained())
    return CGF.Builder.CreateConstrainedFPCall(F, Args);
  else
    return CGF.Builder.CreateCall(F, Args);
}

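// Return the fixed-width LLVM vector type for a NEON type descriptor; e.g.
// Int8 yields <8 x i8>, or <16 x i8> for the quad variant. Half and bfloat
// elements degrade to i16 vectors when the target has no legal half/bfloat
// type, and V1Ty forces a single-element vector.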
static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF,
                                          NeonTypeFlags TypeFlags,
                                          bool HasLegalHalfType = true,
                                          bool V1Ty = false,
                                          bool AllowBFloatArgsAndRet = true) {
  int IsQuad = TypeFlags.isQuad();
  switch (TypeFlags.getEltType()) {
  case NeonTypeFlags::Int8:
  case NeonTypeFlags::Poly8:
  case NeonTypeFlags::MFloat8:
    return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
  case NeonTypeFlags::Int16:
  case NeonTypeFlags::Poly16:
    return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::BFloat16:
    if (AllowBFloatArgsAndRet)
      return llvm::FixedVectorType::get(CGF->BFloatTy,
                                        V1Ty ? 1 : (4 << IsQuad));
    else
      return llvm::FixedVectorType::get(CGF->Int16Ty,
                                        V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::Float16:
    if (HasLegalHalfType)
      return llvm::FixedVectorType::get(CGF->HalfTy,
                                        V1Ty ? 1 : (4 << IsQuad));
    else
      return llvm::FixedVectorType::get(CGF->Int16Ty,
                                        V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::Int32:
    return llvm::FixedVectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
  case NeonTypeFlags::Int64:
  case NeonTypeFlags::Poly64:
    return llvm::FixedVectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
  case NeonTypeFlags::Poly128:
    // FIXME: i128 and f128 are not yet fully supported in Clang and LLVM;
    // much of their API is still missing. Use v16i8 to represent poly128
    // and let it get pattern matched.
    return llvm::FixedVectorType::get(CGF->Int8Ty, 16);
  case NeonTypeFlags::Float32:
    return llvm::FixedVectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
  case NeonTypeFlags::Float64:
    return llvm::FixedVectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
  }
  llvm_unreachable("Unknown vector element type!");
}

static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
                                          NeonTypeFlags IntTypeFlags) {
  int IsQuad = IntTypeFlags.isQuad();
  switch (IntTypeFlags.getEltType()) {
  case NeonTypeFlags::Int16:
    return llvm::FixedVectorType::get(CGF->HalfTy, (4 << IsQuad));
  case NeonTypeFlags::Int32:
    return llvm::FixedVectorType::get(CGF->FloatTy, (2 << IsQuad));
  case NeonTypeFlags::Int64:
    return llvm::FixedVectorType::get(CGF->DoubleTy, (1 << IsQuad));
  default:
    llvm_unreachable("Type can't be converted to floating-point!");
  }
}

Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C,
                                      const ElementCount &Count) {
  Value *SV = llvm::ConstantVector::getSplat(Count, C);
  return Builder.CreateShuffleVector(V, V, SV, "lane");
}

Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
  ElementCount EC = cast<llvm::VectorType>(V->getType())->getElementCount();
  return EmitNeonSplat(V, C, EC);
}

Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value *> &Ops,
                                     const char *name, unsigned shift,
                                     bool rightshift) {
  unsigned j = 0;
  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
       ai != ae; ++ai, ++j) {
    if (F->isConstrainedFPIntrinsic())
      if (ai->getType()->isMetadataTy())
        continue;
    if (shift > 0 && shift == j)
      Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
    else
      Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
  }

  if (F->isConstrainedFPIntrinsic())
    return Builder.CreateConstrainedFPCall(F, Ops, name);
  else
    return Builder.CreateCall(F, Ops, name);
}

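// Emit a call to an FP8 NEON intrinsic: the last argument of the call
// expression carries the FPMR value, which is written via
// llvm.aarch64.set.fpmr before the intrinsic itself is emitted.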
Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
                                        ArrayRef<llvm::Type *> Tys,
                                        SmallVectorImpl<Value *> &Ops,
                                        const CallExpr *E, const char *name) {
  llvm::Value *FPM =
      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
  Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
  return EmitNeonCall(CGM.getIntrinsic(IID, Tys), Ops, name);
}

llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {

  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                             RetTy->getPrimitiveSizeInBits();
  llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
                       Ops[1]->getType()};
  if (ExtendLaneArg) {
    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                        uint64_t(0));
  }
  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
}

llvm::Value *CodeGenFunction::EmitFP8NeonFMLACall(
    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {

  if (ExtendLaneArg) {
    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                        uint64_t(0));
  }
  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                             RetTy->getPrimitiveSizeInBits();
  return EmitFP8NeonCall(IID, {llvm::FixedVectorType::get(RetTy, ElemCount)},
                         Ops, E, name);
}

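// Build the constant shift-amount operand for a NEON shift intrinsic as a
// splat of the (optionally negated) immediate.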
Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
                                            bool neg) {
  int SV = cast<ConstantInt>(V)->getSExtValue();
  return ConstantInt::get(Ty, neg ? -SV : SV);
}

Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
                                           llvm::Type *Ty1, bool Extract,
                                           SmallVectorImpl<llvm::Value *> &Ops,
                                           const CallExpr *E,
                                           const char *name) {
  llvm::Type *Tys[] = {Ty0, Ty1};
  if (Extract) {
    // Op[0] is mfloat8x16_t, but the intrinsic converts only the lower part
    // of the vector.
    Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
    Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], uint64_t(0));
  }
  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
}

// Right-shift a vector by a constant.
Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
                                          llvm::Type *Ty, bool usgn,
                                          const char *name) {
  llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);

  int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
  int EltSize = VTy->getScalarSizeInBits();

  Vec = Builder.CreateBitCast(Vec, Ty);

  // lshr/ashr are undefined when the shift amount is equal to the vector
  // element size.
  if (ShiftAmt == EltSize) {
    if (usgn) {
      // Right-shifting an unsigned value by its size yields 0.
      return llvm::ConstantAggregateZero::get(VTy);
    } else {
      // Right-shifting a signed value by its size is equivalent
      // to a shift of size-1.
      --ShiftAmt;
      Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
    }
  }

  Shift = EmitNeonShiftVector(Shift, Ty, false);
  if (usgn)
    return Builder.CreateLShr(Vec, Shift, name);
  else
    return Builder.CreateAShr(Vec, Shift, name);
}

enum {
  AddRetType = (1 << 0),
  Add1ArgType = (1 << 1),
  Add2ArgTypes = (1 << 2),

  VectorizeRetType = (1 << 3),
  VectorizeArgTypes = (1 << 4),

  InventFloatType = (1 << 5),
  UnsignedAlts = (1 << 6),

  Use64BitVectors = (1 << 7),
  Use128BitVectors = (1 << 8),

  Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
  VectorRet = AddRetType | VectorizeRetType,
  VectorRetGetArgs01 =
      AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
  FpCmpzModifiers =
      AddRetType | VectorizeRetType | Add1ArgType | InventFloatType
};

namespace {
struct ARMVectorIntrinsicInfo {
  const char *NameHint;
  unsigned BuiltinID;
  unsigned LLVMIntrinsic;
  unsigned AltLLVMIntrinsic;
  uint64_t TypeModifier;

  bool operator<(unsigned RHSBuiltinID) const {
    return BuiltinID < RHSBuiltinID;
  }
  bool operator<(const ARMVectorIntrinsicInfo &TE) const {
    return BuiltinID < TE.BuiltinID;
  }
};
} // end anonymous namespace

#define NEONMAP0(NameBase) \
  { #NameBase, NEON::BI__builtin_neon_##NameBase, 0, 0, 0 }

#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON::BI__builtin_neon_##NameBase, \
    Intrinsic::LLVMIntrinsic, 0, TypeModifier }

#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON::BI__builtin_neon_##NameBase, \
    Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
    TypeModifier }

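// Builtin-to-intrinsic map for the 32-bit ARM NEON builtins, with
// type-modifier flags from the enum above. For example,
//   NEONMAP1(vabs_v, arm_neon_vabs, 0)
// expands to
//   { "vabs_v", NEON::BI__builtin_neon_vabs_v, Intrinsic::arm_neon_vabs, 0, 0 }.
// Entries are expected to stay sorted by builtin ID so that lookups can use
// binary search (see ARMVectorIntrinsicInfo::operator< above).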
static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap[] = {
  NEONMAP1(__a32_vcvt_bf16_f32, arm_neon_vcvtfp2bf, 0),
  NEONMAP0(splat_lane_v),
  NEONMAP0(splat_laneq_v),
  NEONMAP0(splatq_lane_v),
  NEONMAP0(splatq_laneq_v),
  NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
  NEONMAP1(vabs_v, arm_neon_vabs, 0),
  NEONMAP1(vabsq_v, arm_neon_vabs, 0),
  NEONMAP0(vadd_v),
  NEONMAP0(vaddhn_v),
  NEONMAP0(vaddq_v),
  NEONMAP1(vaesdq_u8, arm_neon_aesd, 0),
  NEONMAP1(vaeseq_u8, arm_neon_aese, 0),
  NEONMAP1(vaesimcq_u8, arm_neon_aesimc, 0),
  NEONMAP1(vaesmcq_u8, arm_neon_aesmc, 0),
  NEONMAP1(vbfdot_f32, arm_neon_bfdot, 0),
  NEONMAP1(vbfdotq_f32, arm_neon_bfdot, 0),
  NEONMAP1(vbfmlalbq_f32, arm_neon_bfmlalb, 0),
  NEONMAP1(vbfmlaltq_f32, arm_neon_bfmlalt, 0),
  NEONMAP1(vbfmmlaq_f32, arm_neon_bfmmla, 0),
  NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
  NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
  NEONMAP1(vcadd_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcadd_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f64, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f64, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcage_v, arm_neon_vacge, 0),
  NEONMAP1(vcageq_v, arm_neon_vacge, 0),
  NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
  NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
  NEONMAP1(vcale_v, arm_neon_vacge, 0),
  NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
  NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
  NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
  NEONMAP0(vceqz_v),
  NEONMAP0(vceqzq_v),
  NEONMAP0(vcgez_v),
  NEONMAP0(vcgezq_v),
  NEONMAP0(vcgtz_v),
  NEONMAP0(vcgtzq_v),
  NEONMAP0(vclez_v),
  NEONMAP0(vclezq_v),
  NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
  NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
  NEONMAP0(vcltz_v),
  NEONMAP0(vcltzq_v),
  NEONMAP1(vclz_v, ctlz, Add1ArgType),
  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
  NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
  NEONMAP0(vcvt_f16_s16),
  NEONMAP0(vcvt_f16_u16),
  NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
  NEONMAP0(vcvt_f32_v),
  NEONMAP1(vcvt_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvt_s16_f16),
  NEONMAP0(vcvt_s32_v),
  NEONMAP0(vcvt_s64_v),
  NEONMAP0(vcvt_u16_f16),
  NEONMAP0(vcvt_u32_v),
  NEONMAP0(vcvt_u64_v),
  NEONMAP1(vcvta_s16_f16, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_u16_f16, arm_neon_vcvtau, 0),
  NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_s16_f16, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_u16_f16, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0),
  NEONMAP1(vcvtm_s16_f16, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_u16_f16, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_s16_f16, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_u16_f16, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtn_s16_f16, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_u16_f16, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_s16_f16, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_u16_f16, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtp_s16_f16, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_u16_f16, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_s16_f16, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_u16_f16, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
  NEONMAP0(vcvtq_f16_s16),
  NEONMAP0(vcvtq_f16_u16),
  NEONMAP0(vcvtq_f32_v),
  NEONMAP1(vcvtq_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvtq_s16_f16),
  NEONMAP0(vcvtq_s32_v),
  NEONMAP0(vcvtq_s64_v),
  NEONMAP0(vcvtq_u16_f16),
  NEONMAP0(vcvtq_u32_v),
  NEONMAP0(vcvtq_u64_v),
  NEONMAP1(vdot_s32, arm_neon_sdot, 0),
  NEONMAP1(vdot_u32, arm_neon_udot, 0),
  NEONMAP1(vdotq_s32, arm_neon_sdot, 0),
  NEONMAP1(vdotq_u32, arm_neon_udot, 0),
  NEONMAP0(vext_v),
  NEONMAP0(vextq_v),
  NEONMAP0(vfma_v),
  NEONMAP0(vfmaq_v),
  NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
  NEONMAP0(vld1_dup_v),
  NEONMAP1(vld1_v, arm_neon_vld1, 0),
  NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0),
  NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0),
  NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0),
  NEONMAP0(vld1q_dup_v),
  NEONMAP1(vld1q_v, arm_neon_vld1, 0),
  NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0),
  NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0),
  NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0),
  NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0),
  NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
  NEONMAP1(vld2_v, arm_neon_vld2, 0),
  NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0),
  NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
  NEONMAP1(vld2q_v, arm_neon_vld2, 0),
  NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0),
  NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
  NEONMAP1(vld3_v, arm_neon_vld3, 0),
  NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0),
  NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
  NEONMAP1(vld3q_v, arm_neon_vld3, 0),
  NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0),
  NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
  NEONMAP1(vld4_v, arm_neon_vld4, 0),
  NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0),
  NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
  NEONMAP1(vld4q_v, arm_neon_vld4, 0),
  NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP1(vmaxnm_v, arm_neon_vmaxnm, Add1ArgType),
  NEONMAP1(vmaxnmq_v, arm_neon_vmaxnm, Add1ArgType),
  NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType),
  NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType),
  NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vmmlaq_s32, arm_neon_smmla, 0),
  NEONMAP1(vmmlaq_u32, arm_neon_ummla, 0),
  NEONMAP0(vmovl_v),
  NEONMAP0(vmovn_v),
  NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
  NEONMAP0(vmull_v),
  NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
  NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
  NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
  NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
  NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
  NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
  NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
  NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
  NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
  NEONMAP2(vqadd_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqaddq_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, arm_neon_vqdmull, sadd_sat, 0),
  NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, ssub_sat, 0),
  NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
  NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
  NEONMAP1(vqrdmlah_s16, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlah_s32, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s16, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s32, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlsh_s16, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlsh_s32, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s16, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s32, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
  NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
  NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0),
  NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0),
  NEONMAP2(vqsub_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqsubq_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
  NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
  NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
  NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
  NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
  NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
  NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
  NEONMAP1(vrnd_v, trunc, Add1ArgType),
  NEONMAP1(vrnda_v, round, Add1ArgType),
  NEONMAP1(vrndaq_v, round, Add1ArgType),
  NEONMAP0(vrndi_v),
  NEONMAP0(vrndiq_v),
  NEONMAP1(vrndm_v, floor, Add1ArgType),
  NEONMAP1(vrndmq_v, floor, Add1ArgType),
  NEONMAP1(vrndn_v, roundeven, Add1ArgType),
  NEONMAP1(vrndnq_v, roundeven, Add1ArgType),
  NEONMAP1(vrndp_v, ceil, Add1ArgType),
  NEONMAP1(vrndpq_v, ceil, Add1ArgType),
  NEONMAP1(vrndq_v, trunc, Add1ArgType),
  NEONMAP1(vrndx_v, rint, Add1ArgType),
  NEONMAP1(vrndxq_v, rint, Add1ArgType),
  NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshr_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
  NEONMAP2(vrshrq_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
  NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
  NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
  NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
  NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
  NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
  NEONMAP1(vsha1su0q_u32, arm_neon_sha1su0, 0),
  NEONMAP1(vsha1su1q_u32, arm_neon_sha1su1, 0),
  NEONMAP1(vsha256h2q_u32, arm_neon_sha256h2, 0),
  NEONMAP1(vsha256hq_u32, arm_neon_sha256h, 0),
  NEONMAP1(vsha256su0q_u32, arm_neon_sha256su0, 0),
  NEONMAP1(vsha256su1q_u32, arm_neon_sha256su1, 0),
  NEONMAP0(vshl_n_v),
  NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshll_n_v),
  NEONMAP0(vshlq_n_v),
  NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshr_n_v),
  NEONMAP0(vshrn_n_v),
  NEONMAP0(vshrq_n_v),
  NEONMAP1(vst1_v, arm_neon_vst1, 0),
  NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0),
  NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0),
  NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0),
  NEONMAP1(vst1q_v, arm_neon_vst1, 0),
  NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0),
  NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0),
  NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0),
  NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
  NEONMAP1(vst2_v, arm_neon_vst2, 0),
  NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
  NEONMAP1(vst2q_v, arm_neon_vst2, 0),
  NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
  NEONMAP1(vst3_v, arm_neon_vst3, 0),
  NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
  NEONMAP1(vst3q_v, arm_neon_vst3, 0),
  NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
  NEONMAP1(vst4_v, arm_neon_vst4, 0),
  NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
  NEONMAP1(vst4q_v, arm_neon_vst4, 0),
  NEONMAP0(vsubhn_v),
  NEONMAP0(vtrn_v),
  NEONMAP0(vtrnq_v),
  NEONMAP0(vtst_v),
  NEONMAP0(vtstq_v),
  NEONMAP1(vusdot_s32, arm_neon_usdot, 0),
  NEONMAP1(vusdotq_s32, arm_neon_usdot, 0),
  NEONMAP1(vusmmlaq_s32, arm_neon_usmmla, 0),
  NEONMAP0(vuzp_v),
  NEONMAP0(vuzpq_v),
  NEONMAP0(vzip_v),
  NEONMAP0(vzipq_v)
};

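// The corresponding table for the AArch64 NEON builtins. These map onto the
// aarch64_neon_* and aarch64_crypto_* intrinsic families rather than the
// 32-bit arm_neon_* ones.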
static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
  NEONMAP0(splat_lane_v),
  NEONMAP0(splat_laneq_v),
  NEONMAP0(splatq_lane_v),
  NEONMAP0(splatq_laneq_v),
  NEONMAP1(vabs_v, aarch64_neon_abs, 0),
  NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
  NEONMAP0(vadd_v),
  NEONMAP0(vaddhn_v),
  NEONMAP0(vaddq_p128),
  NEONMAP0(vaddq_v),
  NEONMAP1(vaesdq_u8, aarch64_crypto_aesd, 0),
  NEONMAP1(vaeseq_u8, aarch64_crypto_aese, 0),
  NEONMAP1(vaesimcq_u8, aarch64_crypto_aesimc, 0),
  NEONMAP1(vaesmcq_u8, aarch64_crypto_aesmc, 0),
  NEONMAP2(vbcaxq_s16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_s32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_s64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_s8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP1(vbfdot_f32, aarch64_neon_bfdot, 0),
  NEONMAP1(vbfdotq_f32, aarch64_neon_bfdot, 0),
  NEONMAP1(vbfmlalbq_f32, aarch64_neon_bfmlalb, 0),
  NEONMAP1(vbfmlaltq_f32, aarch64_neon_bfmlalt, 0),
  NEONMAP1(vbfmmlaq_f32, aarch64_neon_bfmmla, 0),
  NEONMAP1(vcadd_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcadd_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f64, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f64, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcage_v, aarch64_neon_facge, 0),
  NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
  NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcale_v, aarch64_neon_facge, 0),
  NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
  NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
  NEONMAP0(vceqz_v),
  NEONMAP0(vceqzq_v),
  NEONMAP0(vcgez_v),
  NEONMAP0(vcgezq_v),
  NEONMAP0(vcgtz_v),
  NEONMAP0(vcgtzq_v),
  NEONMAP0(vclez_v),
  NEONMAP0(vclezq_v),
  NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
  NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
  NEONMAP0(vcltz_v),
  NEONMAP0(vcltzq_v),
  NEONMAP1(vclz_v, ctlz, Add1ArgType),
  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
  NEONMAP1(vcmla_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmla_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmla_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmla_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmla_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmla_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmla_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmla_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmlaq_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmlaq_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmlaq_f64, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmlaq_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmlaq_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmlaq_rot180_f64, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmlaq_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmlaq_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmlaq_rot270_f64, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmlaq_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmlaq_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmlaq_rot90_f64, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
  NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
  NEONMAP0(vcvt_f16_s16),
  NEONMAP0(vcvt_f16_u16),
  NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
  NEONMAP0(vcvt_f32_v),
  NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvtq_f16_s16),
  NEONMAP0(vcvtq_f16_u16),
  NEONMAP0(vcvtq_f32_v),
  NEONMAP0(vcvtq_high_bf16_f32),
  NEONMAP0(vcvtq_low_bf16_f32),
  NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
  NEONMAP1(vdot_s32, aarch64_neon_sdot, 0),
  NEONMAP1(vdot_u32, aarch64_neon_udot, 0),
  NEONMAP1(vdotq_s32, aarch64_neon_sdot, 0),
  NEONMAP1(vdotq_u32, aarch64_neon_udot, 0),
  NEONMAP2(veor3q_s16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_s32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_s64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_s8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP0(vext_v),
  NEONMAP0(vextq_v),
  NEONMAP0(vfma_v),
  NEONMAP0(vfmaq_v),
  NEONMAP1(vfmlal_high_f16, aarch64_neon_fmlal2, 0),
  NEONMAP1(vfmlal_low_f16, aarch64_neon_fmlal, 0),
  NEONMAP1(vfmlalq_high_f16, aarch64_neon_fmlal2, 0),
  NEONMAP1(vfmlalq_low_f16, aarch64_neon_fmlal, 0),
  NEONMAP1(vfmlsl_high_f16, aarch64_neon_fmlsl2, 0),
  NEONMAP1(vfmlsl_low_f16, aarch64_neon_fmlsl, 0),
  NEONMAP1(vfmlslq_high_f16, aarch64_neon_fmlsl2, 0),
  NEONMAP1(vfmlslq_low_f16, aarch64_neon_fmlsl, 0),
  NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
  NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
  NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
  NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
  NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
  NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
  NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
  NEONMAP1(vmmlaq_s32, aarch64_neon_smmla, 0),
  NEONMAP1(vmmlaq_u32, aarch64_neon_ummla, 0),
  NEONMAP0(vmovl_v),
  NEONMAP0(vmovn_v),
  NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
  NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
  NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
  NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
  NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
  NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
  NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
  NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
  NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
  NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
  NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
  NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
  NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
  NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqrdmlah_s16, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlah_s32, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s16, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s32, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlsh_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlsh_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
  NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
  NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
  NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
  NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
  NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
  NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
  NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
  NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
  NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
  NEONMAP1(vrax1q_u64, aarch64_crypto_rax1, 0),
  NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
  NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
  NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
  NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
  NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
  NEONMAP1(vrnd32x_f32, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32x_f64, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32xq_f32, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32xq_f64, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32z_f32, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd32z_f64, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd32zq_f32, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd32zq_f64, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd64x_f32, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64x_f64, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64xq_f32, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64xq_f64, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64z_f32, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP1(vrnd64z_f64, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP1(vrnd64zq_f32, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP1(vrnd64zq_f64, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP0(vrndi_v),
  NEONMAP0(vrndiq_v),
  NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
  NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
  NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
  NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
  NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
  NEONMAP1(vsha1su0q_u32, aarch64_crypto_sha1su0, 0),
  NEONMAP1(vsha1su1q_u32, aarch64_crypto_sha1su1, 0),
  NEONMAP1(vsha256h2q_u32, aarch64_crypto_sha256h2, 0),
  NEONMAP1(vsha256hq_u32, aarch64_crypto_sha256h, 0),
  NEONMAP1(vsha256su0q_u32, aarch64_crypto_sha256su0, 0),
  NEONMAP1(vsha256su1q_u32, aarch64_crypto_sha256su1, 0),
  NEONMAP1(vsha512h2q_u64, aarch64_crypto_sha512h2, 0),
  NEONMAP1(vsha512hq_u64, aarch64_crypto_sha512h, 0),
  NEONMAP1(vsha512su0q_u64, aarch64_crypto_sha512su0, 0),
  NEONMAP1(vsha512su1q_u64, aarch64_crypto_sha512su1, 0),
  NEONMAP0(vshl_n_v),
  NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshll_n_v),
  NEONMAP0(vshlq_n_v),
  NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshr_n_v),
  NEONMAP0(vshrn_n_v),
  NEONMAP0(vshrq_n_v),
  NEONMAP1(vsm3partw1q_u32, aarch64_crypto_sm3partw1, 0),
  NEONMAP1(vsm3partw2q_u32, aarch64_crypto_sm3partw2, 0),
  NEONMAP1(vsm3ss1q_u32, aarch64_crypto_sm3ss1, 0),
  NEONMAP1(vsm3tt1aq_u32, aarch64_crypto_sm3tt1a, 0),
  NEONMAP1(vsm3tt1bq_u32, aarch64_crypto_sm3tt1b, 0),
  NEONMAP1(vsm3tt2aq_u32, aarch64_crypto_sm3tt2a, 0),
  NEONMAP1(vsm3tt2bq_u32, aarch64_crypto_sm3tt2b, 0),
  NEONMAP1(vsm4ekeyq_u32, aarch64_crypto_sm4ekey, 0),
  NEONMAP1(vsm4eq_u32, aarch64_crypto_sm4e, 0),
  NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
  NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
  NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
  NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
  NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
  NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
  NEONMAP0(vsubhn_v),
  NEONMAP0(vtst_v),
  NEONMAP0(vtstq_v),
  NEONMAP1(vusdot_s32, aarch64_neon_usdot, 0),
  NEONMAP1(vusdotq_s32, aarch64_neon_usdot, 0),
  NEONMAP1(vusmmlaq_s32, aarch64_neon_usmmla, 0),
  NEONMAP1(vxarq_u64, aarch64_crypto_xar, 0),
};

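// Scalar (SISD) AArch64 builtins. The Vectorize*/Use*BitVectors modifiers
// mark entries whose scalar operands or results must be wrapped in short
// vectors to match the underlying intrinsic's signature.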
aarch64_neon_fcvtas, AddRetType | Add1ArgType), 1215 NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType), 1216 NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), 1217 NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), 1218 NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), 1219 NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), 1220 NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), 1221 NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), 1222 NEONMAP0(vcvth_bf16_f32), 1223 NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType), 1224 NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), 1225 NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType), 1226 NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), 1227 NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType), 1228 NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), 1229 NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType), 1230 NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), 1231 NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType), 1232 NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), 1233 NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType), 1234 NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), 1235 NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), 1236 NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), 1237 NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), 1238 NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), 1239 NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), 1240 NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), 1241 NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0), 1242 NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), 1243 NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), 1244 NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), 1245 NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType), 1246 NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType), 1247 NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType), 1248 NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType), 1249 NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType), 1250 NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType), 1251 NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType), 1252 NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType), 1253 NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType), 1254 NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType), 1255 NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType), 1256 NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType), 1257 NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType), 1258 NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType), 1259 NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType), 1260 NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType), 1261 NEONMAP1(vminvq_u32, 
aarch64_neon_uminv, AddRetType | Add1ArgType), 1262 NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0), 1263 NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType), 1264 NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType), 1265 NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType), 1266 NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType), 1267 NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), 1268 NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), 1269 NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType), 1270 NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType), 1271 NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType), 1272 NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType), 1273 NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType), 1274 NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType), 1275 NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors), 1276 NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType), 1277 NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors), 1278 NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType), 1279 NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors), 1280 NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors), 1281 NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType), 1282 NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType), 1283 NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors), 1284 NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors), 1285 NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType), 1286 NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType), 1287 NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors), 1288 NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType), 1289 NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors), 1290 NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0), 1291 NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType), 1292 NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType), 1293 NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors), 1294 NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors), 1295 NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors), 1296 NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors), 1297 NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType), 1298 NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors), 1299 NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors), 1300 NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors), 1301 NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType), 1302 NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors), 1303 NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType), 1304 NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah, Vectorize1ArgType | Use64BitVectors), 1305 NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType), 1306 NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh, Vectorize1ArgType | Use64BitVectors), 1307 NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType), 1308 NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors), 1309 NEONMAP1(vqrdmulhs_s32, 
aarch64_neon_sqrdmulh, Add1ArgType), 1310 NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors), 1311 NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors), 1312 NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType), 1313 NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType), 1314 NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors), 1315 NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors), 1316 NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType), 1317 NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType), 1318 NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType), 1319 NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType), 1320 NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors), 1321 NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors), 1322 NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors), 1323 NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors), 1324 NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType), 1325 NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors), 1326 NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors), 1327 NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors), 1328 NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors), 1329 NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors), 1330 NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors), 1331 NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType), 1332 NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType), 1333 NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors), 1334 NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors), 1335 NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors), 1336 NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors), 1337 NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType), 1338 NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType), 1339 NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType), 1340 NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType), 1341 NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors), 1342 NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors), 1343 NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType), 1344 NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType), 1345 NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType), 1346 NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors), 1347 NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors), 1348 NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors), 1349 NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors), 1350 NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType), 1351 NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors), 1352 NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors), 1353 NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors), 1354 NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors), 1355 NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType), 1356 NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, 
Add1ArgType), 1357 NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors), 1358 NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors), 1359 NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType), 1360 NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType), 1361 NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType), 1362 NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType), 1363 NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType), 1364 NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType), 1365 NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType), 1366 NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType), 1367 NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType), 1368 NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType), 1369 NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType), 1370 NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType), 1371 NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0), 1372 NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0), 1373 NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0), 1374 NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0), 1375 NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType), 1376 NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType), 1377 NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType), 1378 NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType), 1379 NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors), 1380 NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType), 1381 NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors), 1382 NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType), 1383 NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType), 1384 NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType), 1385 NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors), 1386 NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType), 1387 NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors), 1388 NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType), 1389 // FP16 scalar intrinsics go here.
1390 NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType), 1391 NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType), 1392 NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType), 1393 NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType), 1394 NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType), 1395 NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), 1396 NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), 1397 NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), 1398 NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), 1399 NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), 1400 NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), 1401 NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), 1402 NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), 1403 NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), 1404 NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), 1405 NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), 1406 NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), 1407 NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), 1408 NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), 1409 NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), 1410 NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), 1411 NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType), 1412 NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType), 1413 NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), 1414 NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), 1415 NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType), 1416 NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType), 1417 NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), 1418 NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), 1419 NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType), 1420 NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType), 1421 NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType), 1422 NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType), 1423 NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType), 1424 }; 1425 1426 // Some intrinsics are equivalent for codegen. 
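// For example, vabs_f16 lowers exactly like vabs_v once the overload type
// (the trailing NeonTypeFlags constant) is known, so the table below simply
// remaps such BuiltinIDs to their canonical twin before any other lookup:
//   vabs_f16   ==>  vabs_v
//   vld1q_bf16 ==>  vld1q_v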
1427 static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = { 1428 { NEON::BI__builtin_neon_splat_lane_bf16, NEON::BI__builtin_neon_splat_lane_v, }, 1429 { NEON::BI__builtin_neon_splat_laneq_bf16, NEON::BI__builtin_neon_splat_laneq_v, }, 1430 { NEON::BI__builtin_neon_splatq_lane_bf16, NEON::BI__builtin_neon_splatq_lane_v, }, 1431 { NEON::BI__builtin_neon_splatq_laneq_bf16, NEON::BI__builtin_neon_splatq_laneq_v, }, 1432 { NEON::BI__builtin_neon_vabd_f16, NEON::BI__builtin_neon_vabd_v, }, 1433 { NEON::BI__builtin_neon_vabdq_f16, NEON::BI__builtin_neon_vabdq_v, }, 1434 { NEON::BI__builtin_neon_vabs_f16, NEON::BI__builtin_neon_vabs_v, }, 1435 { NEON::BI__builtin_neon_vabsq_f16, NEON::BI__builtin_neon_vabsq_v, }, 1436 { NEON::BI__builtin_neon_vcage_f16, NEON::BI__builtin_neon_vcage_v, }, 1437 { NEON::BI__builtin_neon_vcageq_f16, NEON::BI__builtin_neon_vcageq_v, }, 1438 { NEON::BI__builtin_neon_vcagt_f16, NEON::BI__builtin_neon_vcagt_v, }, 1439 { NEON::BI__builtin_neon_vcagtq_f16, NEON::BI__builtin_neon_vcagtq_v, }, 1440 { NEON::BI__builtin_neon_vcale_f16, NEON::BI__builtin_neon_vcale_v, }, 1441 { NEON::BI__builtin_neon_vcaleq_f16, NEON::BI__builtin_neon_vcaleq_v, }, 1442 { NEON::BI__builtin_neon_vcalt_f16, NEON::BI__builtin_neon_vcalt_v, }, 1443 { NEON::BI__builtin_neon_vcaltq_f16, NEON::BI__builtin_neon_vcaltq_v, }, 1444 { NEON::BI__builtin_neon_vceqz_f16, NEON::BI__builtin_neon_vceqz_v, }, 1445 { NEON::BI__builtin_neon_vceqzq_f16, NEON::BI__builtin_neon_vceqzq_v, }, 1446 { NEON::BI__builtin_neon_vcgez_f16, NEON::BI__builtin_neon_vcgez_v, }, 1447 { NEON::BI__builtin_neon_vcgezq_f16, NEON::BI__builtin_neon_vcgezq_v, }, 1448 { NEON::BI__builtin_neon_vcgtz_f16, NEON::BI__builtin_neon_vcgtz_v, }, 1449 { NEON::BI__builtin_neon_vcgtzq_f16, NEON::BI__builtin_neon_vcgtzq_v, }, 1450 { NEON::BI__builtin_neon_vclez_f16, NEON::BI__builtin_neon_vclez_v, }, 1451 { NEON::BI__builtin_neon_vclezq_f16, NEON::BI__builtin_neon_vclezq_v, }, 1452 { NEON::BI__builtin_neon_vcltz_f16, NEON::BI__builtin_neon_vcltz_v, }, 1453 { NEON::BI__builtin_neon_vcltzq_f16, NEON::BI__builtin_neon_vcltzq_v, }, 1454 { NEON::BI__builtin_neon_vfma_f16, NEON::BI__builtin_neon_vfma_v, }, 1455 { NEON::BI__builtin_neon_vfma_lane_f16, NEON::BI__builtin_neon_vfma_lane_v, }, 1456 { NEON::BI__builtin_neon_vfma_laneq_f16, NEON::BI__builtin_neon_vfma_laneq_v, }, 1457 { NEON::BI__builtin_neon_vfmaq_f16, NEON::BI__builtin_neon_vfmaq_v, }, 1458 { NEON::BI__builtin_neon_vfmaq_lane_f16, NEON::BI__builtin_neon_vfmaq_lane_v, }, 1459 { NEON::BI__builtin_neon_vfmaq_laneq_f16, NEON::BI__builtin_neon_vfmaq_laneq_v, }, 1460 { NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v }, 1461 { NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v }, 1462 { NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v }, 1463 { NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v }, 1464 { NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v }, 1465 { NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v }, 1466 { NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v }, 1467 { NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v }, 1468 { NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v }, 1469 { NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v }, 1470 { NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v }, 1471 { NEON::BI__builtin_neon_vld1q_lane_bf16, 
NEON::BI__builtin_neon_vld1q_lane_v }, 1472 { NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v }, 1473 { NEON::BI__builtin_neon_vld2_dup_bf16, NEON::BI__builtin_neon_vld2_dup_v }, 1474 { NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v }, 1475 { NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v }, 1476 { NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v }, 1477 { NEON::BI__builtin_neon_vld2q_lane_bf16, NEON::BI__builtin_neon_vld2q_lane_v }, 1478 { NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v }, 1479 { NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v }, 1480 { NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v }, 1481 { NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v }, 1482 { NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v }, 1483 { NEON::BI__builtin_neon_vld3q_lane_bf16, NEON::BI__builtin_neon_vld3q_lane_v }, 1484 { NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v }, 1485 { NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v }, 1486 { NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v }, 1487 { NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v }, 1488 { NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v }, 1489 { NEON::BI__builtin_neon_vld4q_lane_bf16, NEON::BI__builtin_neon_vld4q_lane_v }, 1490 { NEON::BI__builtin_neon_vmax_f16, NEON::BI__builtin_neon_vmax_v, }, 1491 { NEON::BI__builtin_neon_vmaxnm_f16, NEON::BI__builtin_neon_vmaxnm_v, }, 1492 { NEON::BI__builtin_neon_vmaxnmq_f16, NEON::BI__builtin_neon_vmaxnmq_v, }, 1493 { NEON::BI__builtin_neon_vmaxq_f16, NEON::BI__builtin_neon_vmaxq_v, }, 1494 { NEON::BI__builtin_neon_vmin_f16, NEON::BI__builtin_neon_vmin_v, }, 1495 { NEON::BI__builtin_neon_vminnm_f16, NEON::BI__builtin_neon_vminnm_v, }, 1496 { NEON::BI__builtin_neon_vminnmq_f16, NEON::BI__builtin_neon_vminnmq_v, }, 1497 { NEON::BI__builtin_neon_vminq_f16, NEON::BI__builtin_neon_vminq_v, }, 1498 { NEON::BI__builtin_neon_vmulx_f16, NEON::BI__builtin_neon_vmulx_v, }, 1499 { NEON::BI__builtin_neon_vmulxq_f16, NEON::BI__builtin_neon_vmulxq_v, }, 1500 { NEON::BI__builtin_neon_vpadd_f16, NEON::BI__builtin_neon_vpadd_v, }, 1501 { NEON::BI__builtin_neon_vpaddq_f16, NEON::BI__builtin_neon_vpaddq_v, }, 1502 { NEON::BI__builtin_neon_vpmax_f16, NEON::BI__builtin_neon_vpmax_v, }, 1503 { NEON::BI__builtin_neon_vpmaxnm_f16, NEON::BI__builtin_neon_vpmaxnm_v, }, 1504 { NEON::BI__builtin_neon_vpmaxnmq_f16, NEON::BI__builtin_neon_vpmaxnmq_v, }, 1505 { NEON::BI__builtin_neon_vpmaxq_f16, NEON::BI__builtin_neon_vpmaxq_v, }, 1506 { NEON::BI__builtin_neon_vpmin_f16, NEON::BI__builtin_neon_vpmin_v, }, 1507 { NEON::BI__builtin_neon_vpminnm_f16, NEON::BI__builtin_neon_vpminnm_v, }, 1508 { NEON::BI__builtin_neon_vpminnmq_f16, NEON::BI__builtin_neon_vpminnmq_v, }, 1509 { NEON::BI__builtin_neon_vpminq_f16, NEON::BI__builtin_neon_vpminq_v, }, 1510 { NEON::BI__builtin_neon_vrecpe_f16, NEON::BI__builtin_neon_vrecpe_v, }, 1511 { NEON::BI__builtin_neon_vrecpeq_f16, NEON::BI__builtin_neon_vrecpeq_v, }, 1512 { NEON::BI__builtin_neon_vrecps_f16, NEON::BI__builtin_neon_vrecps_v, }, 1513 { NEON::BI__builtin_neon_vrecpsq_f16, NEON::BI__builtin_neon_vrecpsq_v, }, 1514 { NEON::BI__builtin_neon_vrnd_f16, NEON::BI__builtin_neon_vrnd_v, }, 1515 { NEON::BI__builtin_neon_vrnda_f16, NEON::BI__builtin_neon_vrnda_v, }, 1516 { 
NEON::BI__builtin_neon_vrndaq_f16, NEON::BI__builtin_neon_vrndaq_v, }, 1517 { NEON::BI__builtin_neon_vrndi_f16, NEON::BI__builtin_neon_vrndi_v, }, 1518 { NEON::BI__builtin_neon_vrndiq_f16, NEON::BI__builtin_neon_vrndiq_v, }, 1519 { NEON::BI__builtin_neon_vrndm_f16, NEON::BI__builtin_neon_vrndm_v, }, 1520 { NEON::BI__builtin_neon_vrndmq_f16, NEON::BI__builtin_neon_vrndmq_v, }, 1521 { NEON::BI__builtin_neon_vrndn_f16, NEON::BI__builtin_neon_vrndn_v, }, 1522 { NEON::BI__builtin_neon_vrndnq_f16, NEON::BI__builtin_neon_vrndnq_v, }, 1523 { NEON::BI__builtin_neon_vrndp_f16, NEON::BI__builtin_neon_vrndp_v, }, 1524 { NEON::BI__builtin_neon_vrndpq_f16, NEON::BI__builtin_neon_vrndpq_v, }, 1525 { NEON::BI__builtin_neon_vrndq_f16, NEON::BI__builtin_neon_vrndq_v, }, 1526 { NEON::BI__builtin_neon_vrndx_f16, NEON::BI__builtin_neon_vrndx_v, }, 1527 { NEON::BI__builtin_neon_vrndxq_f16, NEON::BI__builtin_neon_vrndxq_v, }, 1528 { NEON::BI__builtin_neon_vrsqrte_f16, NEON::BI__builtin_neon_vrsqrte_v, }, 1529 { NEON::BI__builtin_neon_vrsqrteq_f16, NEON::BI__builtin_neon_vrsqrteq_v, }, 1530 { NEON::BI__builtin_neon_vrsqrts_f16, NEON::BI__builtin_neon_vrsqrts_v, }, 1531 { NEON::BI__builtin_neon_vrsqrtsq_f16, NEON::BI__builtin_neon_vrsqrtsq_v, }, 1532 { NEON::BI__builtin_neon_vsqrt_f16, NEON::BI__builtin_neon_vsqrt_v, }, 1533 { NEON::BI__builtin_neon_vsqrtq_f16, NEON::BI__builtin_neon_vsqrtq_v, }, 1534 { NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v }, 1535 { NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v }, 1536 { NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v }, 1537 { NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v }, 1538 { NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v }, 1539 { NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v }, 1540 { NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v }, 1541 { NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v }, 1542 { NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v }, 1543 { NEON::BI__builtin_neon_vst1q_lane_bf16, NEON::BI__builtin_neon_vst1q_lane_v }, 1544 { NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v }, 1545 { NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v }, 1546 { NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v }, 1547 { NEON::BI__builtin_neon_vst2q_lane_bf16, NEON::BI__builtin_neon_vst2q_lane_v }, 1548 { NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v }, 1549 { NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v }, 1550 { NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v }, 1551 { NEON::BI__builtin_neon_vst3q_lane_bf16, NEON::BI__builtin_neon_vst3q_lane_v }, 1552 { NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v }, 1553 { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v }, 1554 { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v }, 1555 { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v }, 1556 // The mangling rules cause us to have one ID for each type for vldap1(q)_lane 1557 // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an 1558 // arbitrary one to be handled as the canonical variant.
1559 { NEON::BI__builtin_neon_vldap1_lane_u64, NEON::BI__builtin_neon_vldap1_lane_s64 }, 1560 { NEON::BI__builtin_neon_vldap1_lane_f64, NEON::BI__builtin_neon_vldap1_lane_s64 }, 1561 { NEON::BI__builtin_neon_vldap1_lane_p64, NEON::BI__builtin_neon_vldap1_lane_s64 }, 1562 { NEON::BI__builtin_neon_vldap1q_lane_u64, NEON::BI__builtin_neon_vldap1q_lane_s64 }, 1563 { NEON::BI__builtin_neon_vldap1q_lane_f64, NEON::BI__builtin_neon_vldap1q_lane_s64 }, 1564 { NEON::BI__builtin_neon_vldap1q_lane_p64, NEON::BI__builtin_neon_vldap1q_lane_s64 }, 1565 { NEON::BI__builtin_neon_vstl1_lane_u64, NEON::BI__builtin_neon_vstl1_lane_s64 }, 1566 { NEON::BI__builtin_neon_vstl1_lane_f64, NEON::BI__builtin_neon_vstl1_lane_s64 }, 1567 { NEON::BI__builtin_neon_vstl1_lane_p64, NEON::BI__builtin_neon_vstl1_lane_s64 }, 1568 { NEON::BI__builtin_neon_vstl1q_lane_u64, NEON::BI__builtin_neon_vstl1q_lane_s64 }, 1569 { NEON::BI__builtin_neon_vstl1q_lane_f64, NEON::BI__builtin_neon_vstl1q_lane_s64 }, 1570 { NEON::BI__builtin_neon_vstl1q_lane_p64, NEON::BI__builtin_neon_vstl1q_lane_s64 }, 1571 }; 1572 1573 #undef NEONMAP0 1574 #undef NEONMAP1 1575 #undef NEONMAP2 1576 1577 #define SVEMAP1(NameBase, LLVMIntrinsic, TypeModifier) \ 1578 { \ 1579 #NameBase, SVE::BI__builtin_sve_##NameBase, Intrinsic::LLVMIntrinsic, 0, \ 1580 TypeModifier \ 1581 } 1582 1583 #define SVEMAP2(NameBase, TypeModifier) \ 1584 { #NameBase, SVE::BI__builtin_sve_##NameBase, 0, 0, TypeModifier } 1585 static const ARMVectorIntrinsicInfo AArch64SVEIntrinsicMap[] = { 1586 #define GET_SVE_LLVM_INTRINSIC_MAP 1587 #include "clang/Basic/arm_sve_builtin_cg.inc" 1588 #include "clang/Basic/BuiltinsAArch64NeonSVEBridge_cg.def" 1589 #undef GET_SVE_LLVM_INTRINSIC_MAP 1590 }; 1591 1592 #undef SVEMAP1 1593 #undef SVEMAP2 1594 1595 #define SMEMAP1(NameBase, LLVMIntrinsic, TypeModifier) \ 1596 { \ 1597 #NameBase, SME::BI__builtin_sme_##NameBase, Intrinsic::LLVMIntrinsic, 0, \ 1598 TypeModifier \ 1599 } 1600 1601 #define SMEMAP2(NameBase, TypeModifier) \ 1602 { #NameBase, SME::BI__builtin_sme_##NameBase, 0, 0, TypeModifier } 1603 static const ARMVectorIntrinsicInfo AArch64SMEIntrinsicMap[] = { 1604 #define GET_SME_LLVM_INTRINSIC_MAP 1605 #include "clang/Basic/arm_sme_builtin_cg.inc" 1606 #undef GET_SME_LLVM_INTRINSIC_MAP 1607 }; 1608 1609 #undef SMEMAP1 1610 #undef SMEMAP2 1611 1612 static bool NEONSIMDIntrinsicsProvenSorted = false; 1613 1614 static bool AArch64SIMDIntrinsicsProvenSorted = false; 1615 static bool AArch64SISDIntrinsicsProvenSorted = false; 1616 static bool AArch64SVEIntrinsicsProvenSorted = false; 1617 static bool AArch64SMEIntrinsicsProvenSorted = false; 1618 1619 static const ARMVectorIntrinsicInfo * 1620 findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> IntrinsicMap, 1621 unsigned BuiltinID, bool &MapProvenSorted) { 1622 1623 #ifndef NDEBUG 1624 if (!MapProvenSorted) { 1625 assert(llvm::is_sorted(IntrinsicMap)); 1626 MapProvenSorted = true; 1627 } 1628 #endif 1629 1630 const ARMVectorIntrinsicInfo *Builtin = 1631 llvm::lower_bound(IntrinsicMap, BuiltinID); 1632 1633 if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID) 1634 return Builtin; 1635 1636 return nullptr; 1637 } 1638 1639 Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID, 1640 unsigned Modifier, 1641 llvm::Type *ArgType, 1642 const CallExpr *E) { 1643 int VectorSize = 0; 1644 if (Modifier & Use64BitVectors) 1645 VectorSize = 64; 1646 else if (Modifier & Use128BitVectors) 1647 VectorSize = 128; 1648 1649 // Return type. 
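// For instance, with VectorizeRetType | Use64BitVectors an i16 result gets
// 64 / 16 == 4 lanes, i.e. a <4 x i16> intrinsic return type; without a
// vector-size modifier the scalar is wrapped as a single-element vector.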
1650 SmallVector<llvm::Type *, 3> Tys; 1651 if (Modifier & AddRetType) { 1652 llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext())); 1653 if (Modifier & VectorizeRetType) 1654 Ty = llvm::FixedVectorType::get( 1655 Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1); 1656 1657 Tys.push_back(Ty); 1658 } 1659 1660 // Arguments. 1661 if (Modifier & VectorizeArgTypes) { 1662 int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1; 1663 ArgType = llvm::FixedVectorType::get(ArgType, Elts); 1664 } 1665 1666 if (Modifier & (Add1ArgType | Add2ArgTypes)) 1667 Tys.push_back(ArgType); 1668 1669 if (Modifier & Add2ArgTypes) 1670 Tys.push_back(ArgType); 1671 1672 if (Modifier & InventFloatType) 1673 Tys.push_back(FloatTy); 1674 1675 return CGM.getIntrinsic(IntrinsicID, Tys); 1676 } 1677 1678 static Value *EmitCommonNeonSISDBuiltinExpr( 1679 CodeGenFunction &CGF, const ARMVectorIntrinsicInfo &SISDInfo, 1680 SmallVectorImpl<Value *> &Ops, const CallExpr *E) { 1681 unsigned BuiltinID = SISDInfo.BuiltinID; 1682 unsigned int Int = SISDInfo.LLVMIntrinsic; 1683 unsigned Modifier = SISDInfo.TypeModifier; 1684 const char *s = SISDInfo.NameHint; 1685 1686 switch (BuiltinID) { 1687 case NEON::BI__builtin_neon_vcled_s64: 1688 case NEON::BI__builtin_neon_vcled_u64: 1689 case NEON::BI__builtin_neon_vcles_f32: 1690 case NEON::BI__builtin_neon_vcled_f64: 1691 case NEON::BI__builtin_neon_vcltd_s64: 1692 case NEON::BI__builtin_neon_vcltd_u64: 1693 case NEON::BI__builtin_neon_vclts_f32: 1694 case NEON::BI__builtin_neon_vcltd_f64: 1695 case NEON::BI__builtin_neon_vcales_f32: 1696 case NEON::BI__builtin_neon_vcaled_f64: 1697 case NEON::BI__builtin_neon_vcalts_f32: 1698 case NEON::BI__builtin_neon_vcaltd_f64: 1699 // Only one direction of comparisons actually exists: cmle is a cmge with 1700 // swapped operands. The table gives us the right intrinsic, but we 1701 // still need to do the swap. 1702 std::swap(Ops[0], Ops[1]); 1703 break; 1704 } 1705 1706 assert(Int && "Generic code assumes a valid intrinsic"); 1707 1708 // Determine the type(s) of this overloaded AArch64 intrinsic. 1709 const Expr *Arg = E->getArg(0); 1710 llvm::Type *ArgTy = CGF.ConvertType(Arg->getType()); 1711 Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E); 1712 1713 int j = 0; 1714 ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0); 1715 for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); 1716 ai != ae; ++ai, ++j) { 1717 llvm::Type *ArgTy = ai->getType(); 1718 if (Ops[j]->getType()->getPrimitiveSizeInBits() == 1719 ArgTy->getPrimitiveSizeInBits()) 1720 continue; 1721 1722 assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy()); 1723 // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate 1724 // it before inserting.
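// For example (sketching vqshlb_n_s8), the i32 immediate becomes roughly:
//   %t = trunc i32 %n to i8
//   %v = insertelement <8 x i8> poison, i8 %t, i64 0
// Only lane 0 of the widened result is extracted afterwards.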
1725 Ops[j] = CGF.Builder.CreateTruncOrBitCast( 1726 Ops[j], cast<llvm::VectorType>(ArgTy)->getElementType()); 1727 Ops[j] = 1728 CGF.Builder.CreateInsertElement(PoisonValue::get(ArgTy), Ops[j], C0); 1729 } 1730 1731 Value *Result = CGF.EmitNeonCall(F, Ops, s); 1732 llvm::Type *ResultType = CGF.ConvertType(E->getType()); 1733 if (ResultType->getPrimitiveSizeInBits().getFixedValue() < 1734 Result->getType()->getPrimitiveSizeInBits().getFixedValue()) 1735 return CGF.Builder.CreateExtractElement(Result, C0); 1736 1737 return CGF.Builder.CreateBitCast(Result, ResultType, s); 1738 } 1739 1740 Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( 1741 unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic, 1742 const char *NameHint, unsigned Modifier, const CallExpr *E, 1743 SmallVectorImpl<llvm::Value *> &Ops, Address PtrOp0, Address PtrOp1, 1744 llvm::Triple::ArchType Arch) { 1745 // Get the last argument, which specifies the vector type. 1746 const Expr *Arg = E->getArg(E->getNumArgs() - 1); 1747 std::optional<llvm::APSInt> NeonTypeConst = 1748 Arg->getIntegerConstantExpr(getContext()); 1749 if (!NeonTypeConst) 1750 return nullptr; 1751 1752 // Determine the type of this overloaded NEON intrinsic. 1753 NeonTypeFlags Type(NeonTypeConst->getZExtValue()); 1754 const bool Usgn = Type.isUnsigned(); 1755 const bool Quad = Type.isQuad(); 1756 const bool Floating = Type.isFloatingPoint(); 1757 const bool HasLegalHalfType = getTarget().hasLegalHalfType(); 1758 const bool AllowBFloatArgsAndRet = 1759 getTargetHooks().getABIInfo().allowBFloatArgsAndRet(); 1760 1761 llvm::FixedVectorType *VTy = 1762 GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet); 1763 llvm::Type *Ty = VTy; 1764 if (!Ty) 1765 return nullptr; 1766 1767 auto getAlignmentValue32 = [&](Address addr) -> Value* { 1768 return Builder.getInt32(addr.getAlignment().getQuantity()); 1769 }; 1770 1771 unsigned Int = LLVMIntrinsic; 1772 if ((Modifier & UnsignedAlts) && !Usgn) 1773 Int = AltLLVMIntrinsic; 1774 1775 switch (BuiltinID) { 1776 default: break; 1777 case NEON::BI__builtin_neon_splat_lane_v: 1778 case NEON::BI__builtin_neon_splat_laneq_v: 1779 case NEON::BI__builtin_neon_splatq_lane_v: 1780 case NEON::BI__builtin_neon_splatq_laneq_v: { 1781 auto NumElements = VTy->getElementCount(); 1782 if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v) 1783 NumElements = NumElements * 2; 1784 if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v) 1785 NumElements = NumElements.divideCoefficientBy(2); 1786 1787 Ops[0] = Builder.CreateBitCast(Ops[0], VTy); 1788 return EmitNeonSplat(Ops[0], cast<ConstantInt>(Ops[1]), NumElements); 1789 } 1790 case NEON::BI__builtin_neon_vpadd_v: 1791 case NEON::BI__builtin_neon_vpaddq_v: 1792 // We don't allow fp/int overloading of intrinsics. 1793 if (VTy->getElementType()->isFloatingPointTy() && 1794 Int == Intrinsic::aarch64_neon_addp) 1795 Int = Intrinsic::aarch64_neon_faddp; 1796 break; 1797 case NEON::BI__builtin_neon_vabs_v: 1798 case NEON::BI__builtin_neon_vabsq_v: 1799 if (VTy->getElementType()->isFloatingPointTy()) 1800 return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs"); 1801 return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs"); 1802 case NEON::BI__builtin_neon_vadd_v: 1803 case NEON::BI__builtin_neon_vaddq_v: { 1804 llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, Quad ? 
16 : 8); 1805 Ops[0] = Builder.CreateBitCast(Ops[0], VTy); 1806 Ops[1] = Builder.CreateBitCast(Ops[1], VTy); 1807 Ops[0] = Builder.CreateXor(Ops[0], Ops[1]); 1808 return Builder.CreateBitCast(Ops[0], Ty); 1809 } 1810 case NEON::BI__builtin_neon_vaddhn_v: { 1811 llvm::FixedVectorType *SrcTy = 1812 llvm::FixedVectorType::getExtendedElementVectorType(VTy); 1813 1814 // %sum = add <4 x i32> %lhs, %rhs 1815 Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); 1816 Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy); 1817 Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn"); 1818 1819 // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16> 1820 Constant *ShiftAmt = 1821 ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2); 1822 Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn"); 1823 1824 // %res = trunc <4 x i32> %high to <4 x i16> 1825 return Builder.CreateTrunc(Ops[0], VTy, "vaddhn"); 1826 } 1827 case NEON::BI__builtin_neon_vcale_v: 1828 case NEON::BI__builtin_neon_vcaleq_v: 1829 case NEON::BI__builtin_neon_vcalt_v: 1830 case NEON::BI__builtin_neon_vcaltq_v: 1831 std::swap(Ops[0], Ops[1]); 1832 [[fallthrough]]; 1833 case NEON::BI__builtin_neon_vcage_v: 1834 case NEON::BI__builtin_neon_vcageq_v: 1835 case NEON::BI__builtin_neon_vcagt_v: 1836 case NEON::BI__builtin_neon_vcagtq_v: { 1837 llvm::Type *Ty; 1838 switch (VTy->getScalarSizeInBits()) { 1839 default: llvm_unreachable("unexpected type"); 1840 case 32: 1841 Ty = FloatTy; 1842 break; 1843 case 64: 1844 Ty = DoubleTy; 1845 break; 1846 case 16: 1847 Ty = HalfTy; 1848 break; 1849 } 1850 auto *VecFlt = llvm::FixedVectorType::get(Ty, VTy->getNumElements()); 1851 llvm::Type *Tys[] = { VTy, VecFlt }; 1852 Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); 1853 return EmitNeonCall(F, Ops, NameHint); 1854 } 1855 case NEON::BI__builtin_neon_vceqz_v: 1856 case NEON::BI__builtin_neon_vceqzq_v: 1857 return EmitAArch64CompareBuiltinExpr( 1858 Ops[0], Ty, Floating ? ICmpInst::FCMP_OEQ : ICmpInst::ICMP_EQ, "vceqz"); 1859 case NEON::BI__builtin_neon_vcgez_v: 1860 case NEON::BI__builtin_neon_vcgezq_v: 1861 return EmitAArch64CompareBuiltinExpr( 1862 Ops[0], Ty, Floating ? ICmpInst::FCMP_OGE : ICmpInst::ICMP_SGE, 1863 "vcgez"); 1864 case NEON::BI__builtin_neon_vclez_v: 1865 case NEON::BI__builtin_neon_vclezq_v: 1866 return EmitAArch64CompareBuiltinExpr( 1867 Ops[0], Ty, Floating ? ICmpInst::FCMP_OLE : ICmpInst::ICMP_SLE, 1868 "vclez"); 1869 case NEON::BI__builtin_neon_vcgtz_v: 1870 case NEON::BI__builtin_neon_vcgtzq_v: 1871 return EmitAArch64CompareBuiltinExpr( 1872 Ops[0], Ty, Floating ? ICmpInst::FCMP_OGT : ICmpInst::ICMP_SGT, 1873 "vcgtz"); 1874 case NEON::BI__builtin_neon_vcltz_v: 1875 case NEON::BI__builtin_neon_vcltzq_v: 1876 return EmitAArch64CompareBuiltinExpr( 1877 Ops[0], Ty, Floating ? ICmpInst::FCMP_OLT : ICmpInst::ICMP_SLT, 1878 "vcltz"); 1879 case NEON::BI__builtin_neon_vclz_v: 1880 case NEON::BI__builtin_neon_vclzq_v: 1881 // We generate a target-independent intrinsic, which needs a second argument 1882 // for whether or not clz of zero is undefined; on ARM it isn't. 1883 Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef())); 1884 break; 1885 case NEON::BI__builtin_neon_vcvt_f32_v: 1886 case NEON::BI__builtin_neon_vcvtq_f32_v: 1887 Ops[0] = Builder.CreateBitCast(Ops[0], Ty); 1888 Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad), 1889 HasLegalHalfType); 1890 return Usgn ?
Builder.CreateUIToFP(Ops[0], Ty, "vcvt") 1891 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); 1892 case NEON::BI__builtin_neon_vcvt_f16_s16: 1893 case NEON::BI__builtin_neon_vcvt_f16_u16: 1894 case NEON::BI__builtin_neon_vcvtq_f16_s16: 1895 case NEON::BI__builtin_neon_vcvtq_f16_u16: 1896 Ops[0] = Builder.CreateBitCast(Ops[0], Ty); 1897 Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad), 1898 HasLegalHalfType); 1899 return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt") 1900 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); 1901 case NEON::BI__builtin_neon_vcvt_n_f16_s16: 1902 case NEON::BI__builtin_neon_vcvt_n_f16_u16: 1903 case NEON::BI__builtin_neon_vcvtq_n_f16_s16: 1904 case NEON::BI__builtin_neon_vcvtq_n_f16_u16: { 1905 llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty }; 1906 Function *F = CGM.getIntrinsic(Int, Tys); 1907 return EmitNeonCall(F, Ops, "vcvt_n"); 1908 } 1909 case NEON::BI__builtin_neon_vcvt_n_f32_v: 1910 case NEON::BI__builtin_neon_vcvt_n_f64_v: 1911 case NEON::BI__builtin_neon_vcvtq_n_f32_v: 1912 case NEON::BI__builtin_neon_vcvtq_n_f64_v: { 1913 llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty }; 1914 Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic; 1915 Function *F = CGM.getIntrinsic(Int, Tys); 1916 return EmitNeonCall(F, Ops, "vcvt_n"); 1917 } 1918 case NEON::BI__builtin_neon_vcvt_n_s16_f16: 1919 case NEON::BI__builtin_neon_vcvt_n_s32_v: 1920 case NEON::BI__builtin_neon_vcvt_n_u16_f16: 1921 case NEON::BI__builtin_neon_vcvt_n_u32_v: 1922 case NEON::BI__builtin_neon_vcvt_n_s64_v: 1923 case NEON::BI__builtin_neon_vcvt_n_u64_v: 1924 case NEON::BI__builtin_neon_vcvtq_n_s16_f16: 1925 case NEON::BI__builtin_neon_vcvtq_n_s32_v: 1926 case NEON::BI__builtin_neon_vcvtq_n_u16_f16: 1927 case NEON::BI__builtin_neon_vcvtq_n_u32_v: 1928 case NEON::BI__builtin_neon_vcvtq_n_s64_v: 1929 case NEON::BI__builtin_neon_vcvtq_n_u64_v: { 1930 llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) }; 1931 Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); 1932 return EmitNeonCall(F, Ops, "vcvt_n"); 1933 } 1934 case NEON::BI__builtin_neon_vcvt_s32_v: 1935 case NEON::BI__builtin_neon_vcvt_u32_v: 1936 case NEON::BI__builtin_neon_vcvt_s64_v: 1937 case NEON::BI__builtin_neon_vcvt_u64_v: 1938 case NEON::BI__builtin_neon_vcvt_s16_f16: 1939 case NEON::BI__builtin_neon_vcvt_u16_f16: 1940 case NEON::BI__builtin_neon_vcvtq_s32_v: 1941 case NEON::BI__builtin_neon_vcvtq_u32_v: 1942 case NEON::BI__builtin_neon_vcvtq_s64_v: 1943 case NEON::BI__builtin_neon_vcvtq_u64_v: 1944 case NEON::BI__builtin_neon_vcvtq_s16_f16: 1945 case NEON::BI__builtin_neon_vcvtq_u16_f16: { 1946 Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type)); 1947 return Usgn ? 
Builder.CreateFPToUI(Ops[0], Ty, "vcvt") 1948 : Builder.CreateFPToSI(Ops[0], Ty, "vcvt"); 1949 } 1950 case NEON::BI__builtin_neon_vcvta_s16_f16: 1951 case NEON::BI__builtin_neon_vcvta_s32_v: 1952 case NEON::BI__builtin_neon_vcvta_s64_v: 1953 case NEON::BI__builtin_neon_vcvta_u16_f16: 1954 case NEON::BI__builtin_neon_vcvta_u32_v: 1955 case NEON::BI__builtin_neon_vcvta_u64_v: 1956 case NEON::BI__builtin_neon_vcvtaq_s16_f16: 1957 case NEON::BI__builtin_neon_vcvtaq_s32_v: 1958 case NEON::BI__builtin_neon_vcvtaq_s64_v: 1959 case NEON::BI__builtin_neon_vcvtaq_u16_f16: 1960 case NEON::BI__builtin_neon_vcvtaq_u32_v: 1961 case NEON::BI__builtin_neon_vcvtaq_u64_v: 1962 case NEON::BI__builtin_neon_vcvtn_s16_f16: 1963 case NEON::BI__builtin_neon_vcvtn_s32_v: 1964 case NEON::BI__builtin_neon_vcvtn_s64_v: 1965 case NEON::BI__builtin_neon_vcvtn_u16_f16: 1966 case NEON::BI__builtin_neon_vcvtn_u32_v: 1967 case NEON::BI__builtin_neon_vcvtn_u64_v: 1968 case NEON::BI__builtin_neon_vcvtnq_s16_f16: 1969 case NEON::BI__builtin_neon_vcvtnq_s32_v: 1970 case NEON::BI__builtin_neon_vcvtnq_s64_v: 1971 case NEON::BI__builtin_neon_vcvtnq_u16_f16: 1972 case NEON::BI__builtin_neon_vcvtnq_u32_v: 1973 case NEON::BI__builtin_neon_vcvtnq_u64_v: 1974 case NEON::BI__builtin_neon_vcvtp_s16_f16: 1975 case NEON::BI__builtin_neon_vcvtp_s32_v: 1976 case NEON::BI__builtin_neon_vcvtp_s64_v: 1977 case NEON::BI__builtin_neon_vcvtp_u16_f16: 1978 case NEON::BI__builtin_neon_vcvtp_u32_v: 1979 case NEON::BI__builtin_neon_vcvtp_u64_v: 1980 case NEON::BI__builtin_neon_vcvtpq_s16_f16: 1981 case NEON::BI__builtin_neon_vcvtpq_s32_v: 1982 case NEON::BI__builtin_neon_vcvtpq_s64_v: 1983 case NEON::BI__builtin_neon_vcvtpq_u16_f16: 1984 case NEON::BI__builtin_neon_vcvtpq_u32_v: 1985 case NEON::BI__builtin_neon_vcvtpq_u64_v: 1986 case NEON::BI__builtin_neon_vcvtm_s16_f16: 1987 case NEON::BI__builtin_neon_vcvtm_s32_v: 1988 case NEON::BI__builtin_neon_vcvtm_s64_v: 1989 case NEON::BI__builtin_neon_vcvtm_u16_f16: 1990 case NEON::BI__builtin_neon_vcvtm_u32_v: 1991 case NEON::BI__builtin_neon_vcvtm_u64_v: 1992 case NEON::BI__builtin_neon_vcvtmq_s16_f16: 1993 case NEON::BI__builtin_neon_vcvtmq_s32_v: 1994 case NEON::BI__builtin_neon_vcvtmq_s64_v: 1995 case NEON::BI__builtin_neon_vcvtmq_u16_f16: 1996 case NEON::BI__builtin_neon_vcvtmq_u32_v: 1997 case NEON::BI__builtin_neon_vcvtmq_u64_v: { 1998 llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) }; 1999 return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint); 2000 } 2001 case NEON::BI__builtin_neon_vcvtx_f32_v: { 2002 llvm::Type *Tys[2] = { VTy->getTruncatedElementVectorType(VTy), Ty}; 2003 return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint); 2004 2005 } 2006 case NEON::BI__builtin_neon_vext_v: 2007 case NEON::BI__builtin_neon_vextq_v: { 2008 int CV = cast<ConstantInt>(Ops[2])->getSExtValue(); 2009 SmallVector<int, 16> Indices; 2010 for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) 2011 Indices.push_back(i+CV); 2012 2013 Ops[0] = Builder.CreateBitCast(Ops[0], Ty); 2014 Ops[1] = Builder.CreateBitCast(Ops[1], Ty); 2015 return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext"); 2016 } 2017 case NEON::BI__builtin_neon_vfma_v: 2018 case NEON::BI__builtin_neon_vfmaq_v: { 2019 Ops[0] = Builder.CreateBitCast(Ops[0], Ty); 2020 Ops[1] = Builder.CreateBitCast(Ops[1], Ty); 2021 Ops[2] = Builder.CreateBitCast(Ops[2], Ty); 2022 2023 // NEON intrinsic puts accumulator first, unlike the LLVM fma. 
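// vfma(a, b, c) computes a + b * c, while llvm.fma(x, y, z) computes
// x * y + z, hence the rotation to {Ops[1], Ops[2], Ops[0]} below.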
2024 return emitCallMaybeConstrainedFPBuiltin( 2025 *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty, 2026 {Ops[1], Ops[2], Ops[0]}); 2027 } 2028 case NEON::BI__builtin_neon_vld1_v: 2029 case NEON::BI__builtin_neon_vld1q_v: { 2030 llvm::Type *Tys[] = {Ty, Int8PtrTy}; 2031 Ops.push_back(getAlignmentValue32(PtrOp0)); 2032 return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1"); 2033 } 2034 case NEON::BI__builtin_neon_vld1_x2_v: 2035 case NEON::BI__builtin_neon_vld1q_x2_v: 2036 case NEON::BI__builtin_neon_vld1_x3_v: 2037 case NEON::BI__builtin_neon_vld1q_x3_v: 2038 case NEON::BI__builtin_neon_vld1_x4_v: 2039 case NEON::BI__builtin_neon_vld1q_x4_v: { 2040 llvm::Type *Tys[2] = {VTy, UnqualPtrTy}; 2041 Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); 2042 Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN"); 2043 return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); 2044 } 2045 case NEON::BI__builtin_neon_vld2_v: 2046 case NEON::BI__builtin_neon_vld2q_v: 2047 case NEON::BI__builtin_neon_vld3_v: 2048 case NEON::BI__builtin_neon_vld3q_v: 2049 case NEON::BI__builtin_neon_vld4_v: 2050 case NEON::BI__builtin_neon_vld4q_v: 2051 case NEON::BI__builtin_neon_vld2_dup_v: 2052 case NEON::BI__builtin_neon_vld2q_dup_v: 2053 case NEON::BI__builtin_neon_vld3_dup_v: 2054 case NEON::BI__builtin_neon_vld3q_dup_v: 2055 case NEON::BI__builtin_neon_vld4_dup_v: 2056 case NEON::BI__builtin_neon_vld4q_dup_v: { 2057 llvm::Type *Tys[] = {Ty, Int8PtrTy}; 2058 Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); 2059 Value *Align = getAlignmentValue32(PtrOp1); 2060 Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, NameHint); 2061 return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); 2062 } 2063 case NEON::BI__builtin_neon_vld1_dup_v: 2064 case NEON::BI__builtin_neon_vld1q_dup_v: { 2065 Value *V = PoisonValue::get(Ty); 2066 PtrOp0 = PtrOp0.withElementType(VTy->getElementType()); 2067 LoadInst *Ld = Builder.CreateLoad(PtrOp0); 2068 llvm::Constant *CI = ConstantInt::get(SizeTy, 0); 2069 Ops[0] = Builder.CreateInsertElement(V, Ld, CI); 2070 return EmitNeonSplat(Ops[0], CI); 2071 } 2072 case NEON::BI__builtin_neon_vld2_lane_v: 2073 case NEON::BI__builtin_neon_vld2q_lane_v: 2074 case NEON::BI__builtin_neon_vld3_lane_v: 2075 case NEON::BI__builtin_neon_vld3q_lane_v: 2076 case NEON::BI__builtin_neon_vld4_lane_v: 2077 case NEON::BI__builtin_neon_vld4q_lane_v: { 2078 llvm::Type *Tys[] = {Ty, Int8PtrTy}; 2079 Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); 2080 for (unsigned I = 2; I < Ops.size() - 1; ++I) 2081 Ops[I] = Builder.CreateBitCast(Ops[I], Ty); 2082 Ops.push_back(getAlignmentValue32(PtrOp1)); 2083 Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), NameHint); 2084 return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); 2085 } 2086 case NEON::BI__builtin_neon_vmovl_v: { 2087 llvm::FixedVectorType *DTy = 2088 llvm::FixedVectorType::getTruncatedElementVectorType(VTy); 2089 Ops[0] = Builder.CreateBitCast(Ops[0], DTy); 2090 if (Usgn) 2091 return Builder.CreateZExt(Ops[0], Ty, "vmovl"); 2092 return Builder.CreateSExt(Ops[0], Ty, "vmovl"); 2093 } 2094 case NEON::BI__builtin_neon_vmovn_v: { 2095 llvm::FixedVectorType *QTy = 2096 llvm::FixedVectorType::getExtendedElementVectorType(VTy); 2097 Ops[0] = Builder.CreateBitCast(Ops[0], QTy); 2098 return Builder.CreateTrunc(Ops[0], Ty, "vmovn"); 2099 } 2100 case NEON::BI__builtin_neon_vmull_v: 2101 // FIXME: the integer vmull operations could be emitted in terms of pure 2102 // LLVM IR (2 exts followed by a mul). 
Unfortunately LLVM has a habit of 2103 // hoisting the exts outside loops. Until global ISel comes along that can 2104 // see through such movement, this leads to bad CodeGen. So we need an 2105 // intrinsic for now. 2106 Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls; 2107 Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int; 2108 return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull"); 2109 case NEON::BI__builtin_neon_vpadal_v: 2110 case NEON::BI__builtin_neon_vpadalq_v: { 2111 // The source operand type has twice as many elements of half the size. 2112 unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits(); 2113 llvm::Type *EltTy = 2114 llvm::IntegerType::get(getLLVMContext(), EltBits / 2); 2115 auto *NarrowTy = 2116 llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2); 2117 llvm::Type *Tys[2] = { Ty, NarrowTy }; 2118 return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint); 2119 } 2120 case NEON::BI__builtin_neon_vpaddl_v: 2121 case NEON::BI__builtin_neon_vpaddlq_v: { 2122 // The source operand type has twice as many elements of half the size. 2123 unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits(); 2124 llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2); 2125 auto *NarrowTy = 2126 llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2); 2127 llvm::Type *Tys[2] = { Ty, NarrowTy }; 2128 return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl"); 2129 } 2130 case NEON::BI__builtin_neon_vqdmlal_v: 2131 case NEON::BI__builtin_neon_vqdmlsl_v: { 2132 SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end()); 2133 Ops[1] = 2134 EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), MulOps, "vqdmlal"); 2135 Ops.resize(2); 2136 return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint); 2137 } 2138 case NEON::BI__builtin_neon_vqdmulhq_lane_v: 2139 case NEON::BI__builtin_neon_vqdmulh_lane_v: 2140 case NEON::BI__builtin_neon_vqrdmulhq_lane_v: 2141 case NEON::BI__builtin_neon_vqrdmulh_lane_v: { 2142 auto *RTy = cast<llvm::FixedVectorType>(Ty); 2143 if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v || 2144 BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v) 2145 RTy = llvm::FixedVectorType::get(RTy->getElementType(), 2146 RTy->getNumElements() * 2); 2147 llvm::Type *Tys[2] = { 2148 RTy, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, 2149 /*isQuad*/ false))}; 2150 return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint); 2151 } 2152 case NEON::BI__builtin_neon_vqdmulhq_laneq_v: 2153 case NEON::BI__builtin_neon_vqdmulh_laneq_v: 2154 case NEON::BI__builtin_neon_vqrdmulhq_laneq_v: 2155 case NEON::BI__builtin_neon_vqrdmulh_laneq_v: { 2156 llvm::Type *Tys[2] = { 2157 Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, 2158 /*isQuad*/ true))}; 2159 return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint); 2160 } 2161 case NEON::BI__builtin_neon_vqshl_n_v: 2162 case NEON::BI__builtin_neon_vqshlq_n_v: 2163 return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n", 2164 1, false); 2165 case NEON::BI__builtin_neon_vqshlu_n_v: 2166 case NEON::BI__builtin_neon_vqshluq_n_v: 2167 return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n", 2168 1, false); 2169 case NEON::BI__builtin_neon_vrecpe_v: 2170 case NEON::BI__builtin_neon_vrecpeq_v: 2171 case NEON::BI__builtin_neon_vrsqrte_v: 2172 case NEON::BI__builtin_neon_vrsqrteq_v: 2173 Int = Ty->isFPOrFPVectorTy() ?
LLVMIntrinsic : AltLLVMIntrinsic; 2174 return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint); 2175 case NEON::BI__builtin_neon_vrndi_v: 2176 case NEON::BI__builtin_neon_vrndiq_v: 2177 Int = Builder.getIsFPConstrained() 2178 ? Intrinsic::experimental_constrained_nearbyint 2179 : Intrinsic::nearbyint; 2180 return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint); 2181 case NEON::BI__builtin_neon_vrshr_n_v: 2182 case NEON::BI__builtin_neon_vrshrq_n_v: 2183 return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n", 2184 1, true); 2185 case NEON::BI__builtin_neon_vsha512hq_u64: 2186 case NEON::BI__builtin_neon_vsha512h2q_u64: 2187 case NEON::BI__builtin_neon_vsha512su0q_u64: 2188 case NEON::BI__builtin_neon_vsha512su1q_u64: { 2189 Function *F = CGM.getIntrinsic(Int); 2190 return EmitNeonCall(F, Ops, ""); 2191 } 2192 case NEON::BI__builtin_neon_vshl_n_v: 2193 case NEON::BI__builtin_neon_vshlq_n_v: 2194 Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false); 2195 return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1], 2196 "vshl_n"); 2197 case NEON::BI__builtin_neon_vshll_n_v: { 2198 llvm::FixedVectorType *SrcTy = 2199 llvm::FixedVectorType::getTruncatedElementVectorType(VTy); 2200 Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); 2201 if (Usgn) 2202 Ops[0] = Builder.CreateZExt(Ops[0], VTy); 2203 else 2204 Ops[0] = Builder.CreateSExt(Ops[0], VTy); 2205 Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false); 2206 return Builder.CreateShl(Ops[0], Ops[1], "vshll_n"); 2207 } 2208 case NEON::BI__builtin_neon_vshrn_n_v: { 2209 llvm::FixedVectorType *SrcTy = 2210 llvm::FixedVectorType::getExtendedElementVectorType(VTy); 2211 Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); 2212 Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false); 2213 if (Usgn) 2214 Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]); 2215 else 2216 Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]); 2217 return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n"); 2218 } 2219 case NEON::BI__builtin_neon_vshr_n_v: 2220 case NEON::BI__builtin_neon_vshrq_n_v: 2221 return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n"); 2222 case NEON::BI__builtin_neon_vst1_v: 2223 case NEON::BI__builtin_neon_vst1q_v: 2224 case NEON::BI__builtin_neon_vst2_v: 2225 case NEON::BI__builtin_neon_vst2q_v: 2226 case NEON::BI__builtin_neon_vst3_v: 2227 case NEON::BI__builtin_neon_vst3q_v: 2228 case NEON::BI__builtin_neon_vst4_v: 2229 case NEON::BI__builtin_neon_vst4q_v: 2230 case NEON::BI__builtin_neon_vst2_lane_v: 2231 case NEON::BI__builtin_neon_vst2q_lane_v: 2232 case NEON::BI__builtin_neon_vst3_lane_v: 2233 case NEON::BI__builtin_neon_vst3q_lane_v: 2234 case NEON::BI__builtin_neon_vst4_lane_v: 2235 case NEON::BI__builtin_neon_vst4q_lane_v: { 2236 llvm::Type *Tys[] = {Int8PtrTy, Ty}; 2237 Ops.push_back(getAlignmentValue32(PtrOp0)); 2238 return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, ""); 2239 } 2240 case NEON::BI__builtin_neon_vsm3partw1q_u32: 2241 case NEON::BI__builtin_neon_vsm3partw2q_u32: 2242 case NEON::BI__builtin_neon_vsm3ss1q_u32: 2243 case NEON::BI__builtin_neon_vsm4ekeyq_u32: 2244 case NEON::BI__builtin_neon_vsm4eq_u32: { 2245 Function *F = CGM.getIntrinsic(Int); 2246 return EmitNeonCall(F, Ops, ""); 2247 } 2248 case NEON::BI__builtin_neon_vsm3tt1aq_u32: 2249 case NEON::BI__builtin_neon_vsm3tt1bq_u32: 2250 case NEON::BI__builtin_neon_vsm3tt2aq_u32: 2251 case NEON::BI__builtin_neon_vsm3tt2bq_u32: { 2252 Function *F = CGM.getIntrinsic(Int); 2253 Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty); 2254 return EmitNeonCall(F, Ops, ""); 2255 } 2256 case 
NEON::BI__builtin_neon_vst1_x2_v: 2257 case NEON::BI__builtin_neon_vst1q_x2_v: 2258 case NEON::BI__builtin_neon_vst1_x3_v: 2259 case NEON::BI__builtin_neon_vst1q_x3_v: 2260 case NEON::BI__builtin_neon_vst1_x4_v: 2261 case NEON::BI__builtin_neon_vst1q_x4_v: { 2262 // TODO: Currently in AArch32 mode the pointer operand comes first, whereas 2263 // in AArch64 it comes last. We may want to stick to one or the other. 2264 if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be || 2265 Arch == llvm::Triple::aarch64_32) { 2266 llvm::Type *Tys[2] = {VTy, UnqualPtrTy}; 2267 std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end()); 2268 return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, ""); 2269 } 2270 llvm::Type *Tys[2] = {UnqualPtrTy, VTy}; 2271 return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, ""); 2272 } 2273 case NEON::BI__builtin_neon_vsubhn_v: { 2274 llvm::FixedVectorType *SrcTy = 2275 llvm::FixedVectorType::getExtendedElementVectorType(VTy); 2276 2277 // %diff = sub <4 x i32> %lhs, %rhs 2278 Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); 2279 Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy); 2280 Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn"); 2281 2282 // %high = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16> 2283 Constant *ShiftAmt = 2284 ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2); 2285 Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn"); 2286 2287 // %res = trunc <4 x i32> %high to <4 x i16> 2288 return Builder.CreateTrunc(Ops[0], VTy, "vsubhn"); 2289 } 2290 case NEON::BI__builtin_neon_vtrn_v: 2291 case NEON::BI__builtin_neon_vtrnq_v: { 2292 Ops[1] = Builder.CreateBitCast(Ops[1], Ty); 2293 Ops[2] = Builder.CreateBitCast(Ops[2], Ty); 2294 Value *SV = nullptr; 2295 2296 for (unsigned vi = 0; vi != 2; ++vi) { 2297 SmallVector<int, 16> Indices; 2298 for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) { 2299 Indices.push_back(i+vi); 2300 Indices.push_back(i+e+vi); 2301 } 2302 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi); 2303 SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn"); 2304 SV = Builder.CreateDefaultAlignedStore(SV, Addr); 2305 } 2306 return SV; 2307 } 2308 case NEON::BI__builtin_neon_vtst_v: 2309 case NEON::BI__builtin_neon_vtstq_v: { 2310 Ops[0] = Builder.CreateBitCast(Ops[0], Ty); 2311 Ops[1] = Builder.CreateBitCast(Ops[1], Ty); 2312 Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]); 2313 Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0], 2314 ConstantAggregateZero::get(Ty)); 2315 return Builder.CreateSExt(Ops[0], Ty, "vtst"); 2316 } 2317 case NEON::BI__builtin_neon_vuzp_v: 2318 case NEON::BI__builtin_neon_vuzpq_v: { 2319 Ops[1] = Builder.CreateBitCast(Ops[1], Ty); 2320 Ops[2] = Builder.CreateBitCast(Ops[2], Ty); 2321 Value *SV = nullptr; 2322 2323 for (unsigned vi = 0; vi != 2; ++vi) { 2324 SmallVector<int, 16> Indices; 2325 for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) 2326 Indices.push_back(2*i+vi); 2327 2328 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi); 2329 SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp"); 2330 SV = Builder.CreateDefaultAlignedStore(SV, Addr); 2331 } 2332 return SV; 2333 } 2334 case NEON::BI__builtin_neon_vxarq_u64: { 2335 Function *F = CGM.getIntrinsic(Int); 2336 Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty); 2337 return EmitNeonCall(F, Ops, ""); 2338 } 2339 case NEON::BI__builtin_neon_vzip_v: 2340 case NEON::BI__builtin_neon_vzipq_v: { 2341 Ops[1] = Builder.CreateBitCast(Ops[1], Ty); 2342 Ops[2] =
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Value *SV = nullptr;

    for (unsigned vi = 0; vi != 2; ++vi) {
      SmallVector<int, 16> Indices;
      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
        Indices.push_back((i + vi * e) >> 1);
        Indices.push_back(((i + vi * e) >> 1) + e);
      }
      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
    }
    return SV;
  }
  case NEON::BI__builtin_neon_vdot_s32:
  case NEON::BI__builtin_neon_vdot_u32:
  case NEON::BI__builtin_neon_vdotq_s32:
  case NEON::BI__builtin_neon_vdotq_u32: {
    auto *InputTy =
        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
    llvm::Type *Tys[2] = { Ty, InputTy };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
  }
  case NEON::BI__builtin_neon_vfmlal_low_f16:
  case NEON::BI__builtin_neon_vfmlalq_low_f16: {
    auto *InputTy =
        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
    llvm::Type *Tys[2] = { Ty, InputTy };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
  }
  case NEON::BI__builtin_neon_vfmlsl_low_f16:
  case NEON::BI__builtin_neon_vfmlslq_low_f16: {
    auto *InputTy =
        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
    llvm::Type *Tys[2] = { Ty, InputTy };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
  }
  case NEON::BI__builtin_neon_vfmlal_high_f16:
  case NEON::BI__builtin_neon_vfmlalq_high_f16: {
    auto *InputTy =
        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
    llvm::Type *Tys[2] = { Ty, InputTy };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
  }
  case NEON::BI__builtin_neon_vfmlsl_high_f16:
  case NEON::BI__builtin_neon_vfmlslq_high_f16: {
    auto *InputTy =
        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
    llvm::Type *Tys[2] = { Ty, InputTy };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
  }
  case NEON::BI__builtin_neon_vmmlaq_s32:
  case NEON::BI__builtin_neon_vmmlaq_u32: {
    auto *InputTy =
        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
    llvm::Type *Tys[2] = { Ty, InputTy };
    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vmmla");
  }
  case NEON::BI__builtin_neon_vusmmlaq_s32: {
    auto *InputTy =
        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
    llvm::Type *Tys[2] = { Ty, InputTy };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla");
  }
  case NEON::BI__builtin_neon_vusdot_s32:
  case NEON::BI__builtin_neon_vusdotq_s32: {
    auto *InputTy =
        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
    llvm::Type *Tys[2] = { Ty, InputTy };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
  }
  case NEON::BI__builtin_neon_vbfdot_f32:
  case NEON::BI__builtin_neon_vbfdotq_f32: {
    llvm::Type *InputTy =
        llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16);
    llvm::Type *Tys[2] = { Ty, InputTy };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
  }
  case NEON::BI__builtin_neon___a32_vcvt_bf16_f32: {
    llvm::Type *Tys[1] = { Ty };
    Function *F = CGM.getIntrinsic(Int, Tys);
    return EmitNeonCall(F, Ops, "vcvtfp2bf");
  }
  }

  assert(Int && "Expected valid intrinsic number");

  // Determine the type(s) of this overloaded AArch64 intrinsic.
  Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);

  Value *Result = EmitNeonCall(F, Ops, NameHint);
  llvm::Type *ResultType = ConvertType(E->getType());
  // Cast the one-element vector result of the AArch64 intrinsic back to the
  // scalar type expected by the builtin.
  return Builder.CreateBitCast(Result, ResultType, NameHint);
}

Value *
CodeGenFunction::EmitAArch64CompareBuiltinExpr(Value *Op, llvm::Type *Ty,
                                               const CmpInst::Predicate Pred,
                                               const Twine &Name) {

  if (isa<FixedVectorType>(Ty)) {
    // Vector types are cast to i8 vectors. Recover the original type.
    Op = Builder.CreateBitCast(Op, Ty);
  }

  if (CmpInst::isFPPredicate(Pred)) {
    if (Pred == CmpInst::FCMP_OEQ)
      Op = Builder.CreateFCmp(Pred, Op, Constant::getNullValue(Op->getType()));
    else
      Op = Builder.CreateFCmpS(Pred, Op, Constant::getNullValue(Op->getType()));
  } else {
    Op = Builder.CreateICmp(Pred, Op, Constant::getNullValue(Op->getType()));
  }

  llvm::Type *ResTy = Ty;
  if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
    ResTy = FixedVectorType::get(
        IntegerType::get(getLLVMContext(), VTy->getScalarSizeInBits()),
        VTy->getNumElements());

  return Builder.CreateSExt(Op, ResTy, Name);
}

static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
                                 Value *ExtOp, Value *IndexOp,
                                 llvm::Type *ResTy, unsigned IntID,
                                 const char *Name) {
  SmallVector<Value *, 2> TblOps;
  if (ExtOp)
    TblOps.push_back(ExtOp);

  // Build a vector containing sequential numbers like (0, 1, 2, ..., 15).
  SmallVector<int, 16> Indices;
  auto *TblTy = cast<llvm::FixedVectorType>(Ops[0]->getType());
  for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
    Indices.push_back(2 * i);
    Indices.push_back(2 * i + 1);
  }

  int PairPos = 0, End = Ops.size() - 1;
  while (PairPos < End) {
    TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
                                                     Ops[PairPos + 1], Indices,
                                                     Name));
    PairPos += 2;
  }
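  // E.g. two <8 x i8> table halves are concatenated into a single <16 x i8>
  // table register; a sketch of each shuffle emitted above:
  //   shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, ..., i32 15>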
  // If there's an odd number of 64-bit lookup tables, fill the high 64 bits
  // of the final 128-bit lookup table with zeroes.
  if (PairPos == End) {
    Value *ZeroTbl = ConstantAggregateZero::get(TblTy);
    TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
                                                     ZeroTbl, Indices, Name));
  }

  Function *TblF;
  TblOps.push_back(IndexOp);
  TblF = CGF.CGM.getIntrinsic(IntID, ResTy);

  return CGF.EmitNeonCall(TblF, TblOps, Name);
}

Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
  unsigned Value;
  switch (BuiltinID) {
  default:
    return nullptr;
  case clang::ARM::BI__builtin_arm_nop:
    Value = 0;
    break;
  case clang::ARM::BI__builtin_arm_yield:
  case clang::ARM::BI__yield:
    Value = 1;
    break;
  case clang::ARM::BI__builtin_arm_wfe:
  case clang::ARM::BI__wfe:
    Value = 2;
    break;
  case clang::ARM::BI__builtin_arm_wfi:
  case clang::ARM::BI__wfi:
    Value = 3;
    break;
  case clang::ARM::BI__builtin_arm_sev:
  case clang::ARM::BI__sev:
    Value = 4;
    break;
  case clang::ARM::BI__builtin_arm_sevl:
  case clang::ARM::BI__sevl:
    Value = 5;
    break;
  }

  return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_hint),
                            llvm::ConstantInt::get(Int32Ty, Value));
}

enum SpecialRegisterAccessKind {
  NormalRead,
  VolatileRead,
  Write,
};

// Generates the IR for the read/write special register builtin.
// ValueType is the type of the value that is to be written or read;
// RegisterType is the type of the register being written to or read from.
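// For example (illustrative only), __builtin_arm_rsr("cpsr") lowers to
//   %0 = call i32 @llvm.read_volatile_register.i32(metadata !{!"cpsr"})
// and __builtin_arm_wsr("cpsr", v) to
//   call void @llvm.write_register.i32(metadata !{!"cpsr"}, i32 %v)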
static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
                                         const CallExpr *E,
                                         llvm::Type *RegisterType,
                                         llvm::Type *ValueType,
                                         SpecialRegisterAccessKind AccessKind,
                                         StringRef SysReg = "") {
  // The write and read register intrinsics only support 32-, 64- and 128-bit
  // operations.
  assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64) ||
          RegisterType->isIntegerTy(128)) &&
         "Unsupported size for register.");

  CodeGen::CGBuilderTy &Builder = CGF.Builder;
  CodeGen::CodeGenModule &CGM = CGF.CGM;
  LLVMContext &Context = CGM.getLLVMContext();

  if (SysReg.empty()) {
    const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts();
    SysReg = cast<clang::StringLiteral>(SysRegStrExpr)->getString();
  }

  llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysReg) };
  llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
  llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);

  llvm::Type *Types[] = { RegisterType };

  bool MixedTypes = RegisterType->isIntegerTy(64) && ValueType->isIntegerTy(32);
  assert(!(RegisterType->isIntegerTy(32) && ValueType->isIntegerTy(64))
         && "Can't fit 64-bit value in 32-bit register");

  if (AccessKind != Write) {
    assert(AccessKind == NormalRead || AccessKind == VolatileRead);
    llvm::Function *F = CGM.getIntrinsic(
        AccessKind == VolatileRead ? Intrinsic::read_volatile_register
                                   : Intrinsic::read_register,
        Types);
    llvm::Value *Call = Builder.CreateCall(F, Metadata);

    if (MixedTypes)
      // Read into a 64-bit register and then truncate the result to 32 bits.
      return Builder.CreateTrunc(Call, ValueType);

    if (ValueType->isPointerTy())
      // We have an i32/i64 result (Call) but want to return a VoidPtrTy (i8*).
      return Builder.CreateIntToPtr(Call, ValueType);

    return Call;
  }

  llvm::Function *F = CGM.getIntrinsic(Intrinsic::write_register, Types);
  llvm::Value *ArgValue = CGF.EmitScalarExpr(E->getArg(1));
  if (MixedTypes) {
    // Extend the 32-bit write value to 64 bits to pass to the write.
    ArgValue = Builder.CreateZExt(ArgValue, RegisterType);
    return Builder.CreateCall(F, { Metadata, ArgValue });
  }

  if (ValueType->isPointerTy()) {
    // We have a VoidPtrTy ArgValue but want to pass an i32/i64.
    ArgValue = Builder.CreatePtrToInt(ArgValue, RegisterType);
    return Builder.CreateCall(F, { Metadata, ArgValue });
  }

  return Builder.CreateCall(F, { Metadata, ArgValue });
}

/// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
/// argument that specifies the vector type.
static bool HasExtraNeonArgument(unsigned BuiltinID) {
  switch (BuiltinID) {
  default: break;
  case NEON::BI__builtin_neon_vget_lane_i8:
  case NEON::BI__builtin_neon_vget_lane_i16:
  case NEON::BI__builtin_neon_vget_lane_bf16:
  case NEON::BI__builtin_neon_vget_lane_i32:
  case NEON::BI__builtin_neon_vget_lane_i64:
  case NEON::BI__builtin_neon_vget_lane_mf8:
  case NEON::BI__builtin_neon_vget_lane_f32:
  case NEON::BI__builtin_neon_vgetq_lane_i8:
  case NEON::BI__builtin_neon_vgetq_lane_i16:
  case NEON::BI__builtin_neon_vgetq_lane_bf16:
  case NEON::BI__builtin_neon_vgetq_lane_i32:
  case NEON::BI__builtin_neon_vgetq_lane_i64:
  case NEON::BI__builtin_neon_vgetq_lane_mf8:
  case NEON::BI__builtin_neon_vgetq_lane_f32:
  case NEON::BI__builtin_neon_vduph_lane_bf16:
  case NEON::BI__builtin_neon_vduph_laneq_bf16:
  case NEON::BI__builtin_neon_vset_lane_i8:
  case NEON::BI__builtin_neon_vset_lane_mf8:
  case NEON::BI__builtin_neon_vset_lane_i16:
  case NEON::BI__builtin_neon_vset_lane_bf16:
  case NEON::BI__builtin_neon_vset_lane_i32:
  case NEON::BI__builtin_neon_vset_lane_i64:
  case NEON::BI__builtin_neon_vset_lane_f32:
  case NEON::BI__builtin_neon_vsetq_lane_i8:
  case NEON::BI__builtin_neon_vsetq_lane_mf8:
  case NEON::BI__builtin_neon_vsetq_lane_i16:
  case NEON::BI__builtin_neon_vsetq_lane_bf16:
  case NEON::BI__builtin_neon_vsetq_lane_i32:
  case NEON::BI__builtin_neon_vsetq_lane_i64:
  case NEON::BI__builtin_neon_vsetq_lane_f32:
  case NEON::BI__builtin_neon_vsha1h_u32:
  case NEON::BI__builtin_neon_vsha1cq_u32:
  case NEON::BI__builtin_neon_vsha1pq_u32:
  case NEON::BI__builtin_neon_vsha1mq_u32:
  case NEON::BI__builtin_neon_vcvth_bf16_f32:
  case clang::ARM::BI_MoveToCoprocessor:
  case clang::ARM::BI_MoveToCoprocessor2:
    return false;
  }
  return true;
}

Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
                                           const CallExpr *E,
                                           ReturnValueSlot ReturnValue,
                                           llvm::Triple::ArchType Arch) {
  if (auto Hint = GetValueForARMHint(BuiltinID))
    return Hint;
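  // __emit injects a raw instruction word via inline asm; e.g. (illustrative)
  // __emit(0xBF00), a Thumb NOP encoding, becomes the inline asm
  // ".inst.n 0xBF00" marked as having side effects.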
  if (BuiltinID == clang::ARM::BI__emit) {
    bool IsThumb = getTarget().getTriple().getArch() == llvm::Triple::thumb;
    llvm::FunctionType *FTy =
        llvm::FunctionType::get(VoidTy, /*Variadic=*/false);

    Expr::EvalResult Result;
    if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
      llvm_unreachable("Sema will ensure that the parameter is constant");

    llvm::APSInt Value = Result.Val.getInt();
    uint64_t ZExtValue = Value.zextOrTrunc(IsThumb ? 16 : 32).getZExtValue();

    llvm::InlineAsm *Emit =
        IsThumb ? InlineAsm::get(FTy, ".inst.n 0x" + utohexstr(ZExtValue), "",
                                 /*hasSideEffects=*/true)
                : InlineAsm::get(FTy, ".inst 0x" + utohexstr(ZExtValue), "",
                                 /*hasSideEffects=*/true);

    return Builder.CreateCall(Emit);
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_dbg) {
    Value *Option = EmitScalarExpr(E->getArg(0));
    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_dbg), Option);
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_prefetch) {
    Value *Address = EmitScalarExpr(E->getArg(0));
    Value *RW = EmitScalarExpr(E->getArg(1));
    Value *IsData = EmitScalarExpr(E->getArg(2));

    // Locality is not supported on the ARM target.
    Value *Locality = llvm::ConstantInt::get(Int32Ty, 3);

    Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
    return Builder.CreateCall(F, {Address, RW, Locality, IsData});
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_rbit) {
    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_clz ||
      BuiltinID == clang::ARM::BI__builtin_arm_clz64) {
    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Arg->getType());
    Value *Res = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
    if (BuiltinID == clang::ARM::BI__builtin_arm_clz64)
      Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
    return Res;
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_cls) {
    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls), Arg, "cls");
  }
  if (BuiltinID == clang::ARM::BI__builtin_arm_cls64) {
    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls64), Arg,
                              "cls");
  }

  if (BuiltinID == clang::ARM::BI__clear_cache) {
    assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
    const FunctionDecl *FD = E->getDirectCallee();
    Value *Ops[2];
    for (unsigned i = 0; i < 2; i++)
      Ops[i] = EmitScalarExpr(E->getArg(i));
    llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
    llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
    StringRef Name = FD->getName();
    return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_mcrr ||
      BuiltinID == clang::ARM::BI__builtin_arm_mcrr2) {
    Function *F;

    switch (BuiltinID) {
    default: llvm_unreachable("unexpected builtin");
    case clang::ARM::BI__builtin_arm_mcrr:
      F = CGM.getIntrinsic(Intrinsic::arm_mcrr);
      break;
    case clang::ARM::BI__builtin_arm_mcrr2:
      F = CGM.getIntrinsic(Intrinsic::arm_mcrr2);
      break;
    }

    // The MCRR{2} instruction has 5 operands, but the intrinsic has only 4:
    // Rt and Rt2 are exposed as a single unsigned 64-bit integer at the
    // builtin level, while the intrinsic represents them as two 32-bit
    // integers.
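    // Sketch of the lowering for __builtin_arm_mcrr(15, 0, v, 3) with a
    // 64-bit v:
    //   %lo = trunc i64 %v to i32
    //   %hi = trunc i64 (lshr i64 %v, 32) to i32
    //   call void @llvm.arm.mcrr(i32 15, i32 0, i32 %lo, i32 %hi, i32 3)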
    Value *Coproc = EmitScalarExpr(E->getArg(0));
    Value *Opc1 = EmitScalarExpr(E->getArg(1));
    Value *RtAndRt2 = EmitScalarExpr(E->getArg(2));
    Value *CRm = EmitScalarExpr(E->getArg(3));

    Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
    Value *Rt = Builder.CreateTruncOrBitCast(RtAndRt2, Int32Ty);
    Value *Rt2 = Builder.CreateLShr(RtAndRt2, C1);
    Rt2 = Builder.CreateTruncOrBitCast(Rt2, Int32Ty);

    return Builder.CreateCall(F, {Coproc, Opc1, Rt, Rt2, CRm});
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_mrrc ||
      BuiltinID == clang::ARM::BI__builtin_arm_mrrc2) {
    Function *F;

    switch (BuiltinID) {
    default: llvm_unreachable("unexpected builtin");
    case clang::ARM::BI__builtin_arm_mrrc:
      F = CGM.getIntrinsic(Intrinsic::arm_mrrc);
      break;
    case clang::ARM::BI__builtin_arm_mrrc2:
      F = CGM.getIntrinsic(Intrinsic::arm_mrrc2);
      break;
    }

    Value *Coproc = EmitScalarExpr(E->getArg(0));
    Value *Opc1 = EmitScalarExpr(E->getArg(1));
    Value *CRm = EmitScalarExpr(E->getArg(2));
    Value *RtAndRt2 = Builder.CreateCall(F, {Coproc, Opc1, CRm});

    // The intrinsic returns an unsigned 64-bit integer, represented as two
    // 32-bit integers.
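    // Sketch of the recombination emitted below:
    //   %hi64 = zext i32 %hi to i64
    //   %lo64 = zext i32 %lo to i64
    //   %res  = or i64 (shl nuw i64 %hi64, 32), %lo64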
    Value *Rt = Builder.CreateExtractValue(RtAndRt2, 1);
    Value *Rt1 = Builder.CreateExtractValue(RtAndRt2, 0);
    Rt = Builder.CreateZExt(Rt, Int64Ty);
    Rt1 = Builder.CreateZExt(Rt1, Int64Ty);

    Value *ShiftCast = llvm::ConstantInt::get(Int64Ty, 32);
    RtAndRt2 = Builder.CreateShl(Rt, ShiftCast, "shl", true);
    RtAndRt2 = Builder.CreateOr(RtAndRt2, Rt1);

    return Builder.CreateBitCast(RtAndRt2, ConvertType(E->getType()));
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_ldrexd ||
      ((BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
        BuiltinID == clang::ARM::BI__builtin_arm_ldaex) &&
       getContext().getTypeSize(E->getType()) == 64) ||
      BuiltinID == clang::ARM::BI__ldrexd) {
    Function *F;

    switch (BuiltinID) {
    default: llvm_unreachable("unexpected builtin");
    case clang::ARM::BI__builtin_arm_ldaex:
      F = CGM.getIntrinsic(Intrinsic::arm_ldaexd);
      break;
    case clang::ARM::BI__builtin_arm_ldrexd:
    case clang::ARM::BI__builtin_arm_ldrex:
    case clang::ARM::BI__ldrexd:
      F = CGM.getIntrinsic(Intrinsic::arm_ldrexd);
      break;
    }

    Value *LdPtr = EmitScalarExpr(E->getArg(0));
    Value *Val = Builder.CreateCall(F, LdPtr, "ldrexd");

    Value *Val0 = Builder.CreateExtractValue(Val, 1);
    Value *Val1 = Builder.CreateExtractValue(Val, 0);
    Val0 = Builder.CreateZExt(Val0, Int64Ty);
    Val1 = Builder.CreateZExt(Val1, Int64Ty);

    Value *ShiftCst = llvm::ConstantInt::get(Int64Ty, 32);
    Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
    Val = Builder.CreateOr(Val, Val1);
    return Builder.CreateBitCast(Val, ConvertType(E->getType()));
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
      BuiltinID == clang::ARM::BI__builtin_arm_ldaex) {
    Value *LoadAddr = EmitScalarExpr(E->getArg(0));

    QualType Ty = E->getType();
    llvm::Type *RealResTy = ConvertType(Ty);
    llvm::Type *IntTy =
        llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));

    Function *F = CGM.getIntrinsic(
        BuiltinID == clang::ARM::BI__builtin_arm_ldaex ? Intrinsic::arm_ldaex
                                                       : Intrinsic::arm_ldrex,
        UnqualPtrTy);
    CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldrex");
    Val->addParamAttr(
        0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
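    // E.g. a 32-bit __builtin_arm_ldrex produces (illustrative):
    //   %val = call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) %addr)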
    if (RealResTy->isPointerTy())
      return Builder.CreateIntToPtr(Val, RealResTy);
    else {
      llvm::Type *IntResTy = llvm::IntegerType::get(
          getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
      return Builder.CreateBitCast(Builder.CreateTruncOrBitCast(Val, IntResTy),
                                   RealResTy);
    }
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_strexd ||
      ((BuiltinID == clang::ARM::BI__builtin_arm_stlex ||
        BuiltinID == clang::ARM::BI__builtin_arm_strex) &&
       getContext().getTypeSize(E->getArg(0)->getType()) == 64)) {
    Function *F = CGM.getIntrinsic(
        BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlexd
                                                       : Intrinsic::arm_strexd);
    llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty);

    Address Tmp = CreateMemTemp(E->getArg(0)->getType());
    Value *Val = EmitScalarExpr(E->getArg(0));
    Builder.CreateStore(Val, Tmp);

    Address LdPtr = Tmp.withElementType(STy);
    Val = Builder.CreateLoad(LdPtr);

    Value *Arg0 = Builder.CreateExtractValue(Val, 0);
    Value *Arg1 = Builder.CreateExtractValue(Val, 1);
    Value *StPtr = EmitScalarExpr(E->getArg(1));
    return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "strexd");
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_strex ||
      BuiltinID == clang::ARM::BI__builtin_arm_stlex) {
    Value *StoreVal = EmitScalarExpr(E->getArg(0));
    Value *StoreAddr = EmitScalarExpr(E->getArg(1));

    QualType Ty = E->getArg(0)->getType();
    llvm::Type *StoreTy =
        llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));

    if (StoreVal->getType()->isPointerTy())
      StoreVal = Builder.CreatePtrToInt(StoreVal, Int32Ty);
    else {
      llvm::Type *IntTy = llvm::IntegerType::get(
          getLLVMContext(),
          CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
      StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
      StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int32Ty);
    }

    Function *F = CGM.getIntrinsic(
        BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlex
                                                       : Intrinsic::arm_strex,
        StoreAddr->getType());

    CallInst *CI = Builder.CreateCall(F, {StoreVal, StoreAddr}, "strex");
    CI->addParamAttr(
        1, Attribute::get(getLLVMContext(), Attribute::ElementType, StoreTy));
    return CI;
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_clrex) {
    Function *F = CGM.getIntrinsic(Intrinsic::arm_clrex);
    return Builder.CreateCall(F);
  }

  // CRC32
  Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
  switch (BuiltinID) {
  case clang::ARM::BI__builtin_arm_crc32b:
    CRCIntrinsicID = Intrinsic::arm_crc32b; break;
  case clang::ARM::BI__builtin_arm_crc32cb:
    CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
  case clang::ARM::BI__builtin_arm_crc32h:
    CRCIntrinsicID = Intrinsic::arm_crc32h; break;
  case clang::ARM::BI__builtin_arm_crc32ch:
    CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
  case clang::ARM::BI__builtin_arm_crc32w:
  case clang::ARM::BI__builtin_arm_crc32d:
    CRCIntrinsicID = Intrinsic::arm_crc32w; break;
  case clang::ARM::BI__builtin_arm_crc32cw:
  case clang::ARM::BI__builtin_arm_crc32cd:
    CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
  }

  if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
    Value *Arg0 = EmitScalarExpr(E->getArg(0));
    Value *Arg1 = EmitScalarExpr(E->getArg(1));

    // The crc32{c,}d intrinsics are implemented as two calls to the
    // crc32{c,}w intrinsics, hence we need different codegen for these cases.
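    // Sketch for __builtin_arm_crc32d(%acc, %x):
    //   %lo  = trunc i64 %x to i32
    //   %hi  = trunc i64 (lshr i64 %x, 32) to i32
    //   %t   = call i32 @llvm.arm.crc32w(i32 %acc, i32 %lo)
    //   %res = call i32 @llvm.arm.crc32w(i32 %t, i32 %hi)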
    if (BuiltinID == clang::ARM::BI__builtin_arm_crc32d ||
        BuiltinID == clang::ARM::BI__builtin_arm_crc32cd) {
      Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
      Value *Arg1a = Builder.CreateTruncOrBitCast(Arg1, Int32Ty);
      Value *Arg1b = Builder.CreateLShr(Arg1, C1);
      Arg1b = Builder.CreateTruncOrBitCast(Arg1b, Int32Ty);

      Function *F = CGM.getIntrinsic(CRCIntrinsicID);
      Value *Res = Builder.CreateCall(F, {Arg0, Arg1a});
      return Builder.CreateCall(F, {Res, Arg1b});
    } else {
      Arg1 = Builder.CreateZExtOrBitCast(Arg1, Int32Ty);

      Function *F = CGM.getIntrinsic(CRCIntrinsicID);
      return Builder.CreateCall(F, {Arg0, Arg1});
    }
  }

  if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
      BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
      BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
      BuiltinID == clang::ARM::BI__builtin_arm_wsr ||
      BuiltinID == clang::ARM::BI__builtin_arm_wsr64 ||
      BuiltinID == clang::ARM::BI__builtin_arm_wsrp) {

    SpecialRegisterAccessKind AccessKind = Write;
    if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
        BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
        BuiltinID == clang::ARM::BI__builtin_arm_rsrp)
      AccessKind = VolatileRead;

    bool IsPointerBuiltin = BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
                            BuiltinID == clang::ARM::BI__builtin_arm_wsrp;

    bool Is64Bit = BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
                   BuiltinID == clang::ARM::BI__builtin_arm_wsr64;

    llvm::Type *ValueType;
    llvm::Type *RegisterType;
    if (IsPointerBuiltin) {
      ValueType = VoidPtrTy;
      RegisterType = Int32Ty;
    } else if (Is64Bit) {
      ValueType = RegisterType = Int64Ty;
    } else {
      ValueType = RegisterType = Int32Ty;
    }

    return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
                                      AccessKind);
  }

  if (BuiltinID == ARM::BI__builtin_sponentry) {
    llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy);
    return Builder.CreateCall(F);
  }

  // Handle MSVC intrinsics before argument evaluation to prevent double
  // evaluation.
  if (std::optional<MSVCIntrin> MsvcIntId = translateArmToMsvcIntrin(BuiltinID))
    return EmitMSVCBuiltinExpr(*MsvcIntId, E);

  // Deal with MVE builtins.
  if (Value *Result = EmitARMMVEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
    return Result;
  // Handle CDE builtins.
  if (Value *Result = EmitARMCDEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
    return Result;

  // Some intrinsics are equivalent; if so, use the base intrinsic ID.
  auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) {
    return P.first == BuiltinID;
  });
  if (It != end(NEONEquivalentIntrinsicMap))
    BuiltinID = It->second;

  // Find out if any arguments are required to be integer constant
  // expressions.
  unsigned ICEArguments = 0;
  ASTContext::GetBuiltinTypeError Error;
  getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
  assert(Error == ASTContext::GE_None && "Should not codegen an error");

  auto getAlignmentValue32 = [&](Address addr) -> Value * {
    return Builder.getInt32(addr.getAlignment().getQuantity());
  };

  Address PtrOp0 = Address::invalid();
  Address PtrOp1 = Address::invalid();
  SmallVector<Value *, 4> Ops;
  bool HasExtraArg = HasExtraNeonArgument(BuiltinID);
  unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
  for (unsigned i = 0, e = NumArgs; i != e; i++) {
    if (i == 0) {
      switch (BuiltinID) {
      case NEON::BI__builtin_neon_vld1_v:
      case NEON::BI__builtin_neon_vld1q_v:
      case NEON::BI__builtin_neon_vld1q_lane_v:
      case NEON::BI__builtin_neon_vld1_lane_v:
      case NEON::BI__builtin_neon_vld1_dup_v:
      case NEON::BI__builtin_neon_vld1q_dup_v:
      case NEON::BI__builtin_neon_vst1_v:
      case NEON::BI__builtin_neon_vst1q_v:
      case NEON::BI__builtin_neon_vst1q_lane_v:
      case NEON::BI__builtin_neon_vst1_lane_v:
      case NEON::BI__builtin_neon_vst2_v:
      case NEON::BI__builtin_neon_vst2q_v:
      case NEON::BI__builtin_neon_vst2_lane_v:
      case NEON::BI__builtin_neon_vst2q_lane_v:
      case NEON::BI__builtin_neon_vst3_v:
      case NEON::BI__builtin_neon_vst3q_v:
      case NEON::BI__builtin_neon_vst3_lane_v:
      case NEON::BI__builtin_neon_vst3q_lane_v:
      case NEON::BI__builtin_neon_vst4_v:
      case NEON::BI__builtin_neon_vst4q_v:
      case NEON::BI__builtin_neon_vst4_lane_v:
      case NEON::BI__builtin_neon_vst4q_lane_v:
        // Get the alignment for the argument in addition to the value;
        // we'll use it later.
        PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
        Ops.push_back(PtrOp0.emitRawPointer(*this));
        continue;
      }
    }
    if (i == 1) {
      switch (BuiltinID) {
      case NEON::BI__builtin_neon_vld2_v:
      case NEON::BI__builtin_neon_vld2q_v:
      case NEON::BI__builtin_neon_vld3_v:
      case NEON::BI__builtin_neon_vld3q_v:
      case NEON::BI__builtin_neon_vld4_v:
      case NEON::BI__builtin_neon_vld4q_v:
      case NEON::BI__builtin_neon_vld2_lane_v:
      case NEON::BI__builtin_neon_vld2q_lane_v:
      case NEON::BI__builtin_neon_vld3_lane_v:
      case NEON::BI__builtin_neon_vld3q_lane_v:
      case NEON::BI__builtin_neon_vld4_lane_v:
      case NEON::BI__builtin_neon_vld4q_lane_v:
      case NEON::BI__builtin_neon_vld2_dup_v:
      case NEON::BI__builtin_neon_vld2q_dup_v:
      case NEON::BI__builtin_neon_vld3_dup_v:
      case NEON::BI__builtin_neon_vld3q_dup_v:
      case NEON::BI__builtin_neon_vld4_dup_v:
      case NEON::BI__builtin_neon_vld4q_dup_v:
        // Get the alignment for the argument in addition to the value;
        // we'll use it later.
        PtrOp1 = EmitPointerWithAlignment(E->getArg(1));
        Ops.push_back(PtrOp1.emitRawPointer(*this));
        continue;
      }
    }

    Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
  }

  switch (BuiltinID) {
  default: break;

  case NEON::BI__builtin_neon_vget_lane_i8:
  case NEON::BI__builtin_neon_vget_lane_i16:
  case NEON::BI__builtin_neon_vget_lane_i32:
  case NEON::BI__builtin_neon_vget_lane_i64:
  case NEON::BI__builtin_neon_vget_lane_bf16:
  case NEON::BI__builtin_neon_vget_lane_f32:
  case NEON::BI__builtin_neon_vgetq_lane_i8:
  case NEON::BI__builtin_neon_vgetq_lane_i16:
  case NEON::BI__builtin_neon_vgetq_lane_i32:
  case NEON::BI__builtin_neon_vgetq_lane_i64:
  case NEON::BI__builtin_neon_vgetq_lane_bf16:
  case NEON::BI__builtin_neon_vgetq_lane_f32:
  case NEON::BI__builtin_neon_vduph_lane_bf16:
  case NEON::BI__builtin_neon_vduph_laneq_bf16:
    return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");

  case NEON::BI__builtin_neon_vrndns_f32: {
    Value *Arg = EmitScalarExpr(E->getArg(0));
    llvm::Type *Tys[] = {Arg->getType()};
    Function *F = CGM.getIntrinsic(Intrinsic::roundeven, Tys);
    return Builder.CreateCall(F, {Arg}, "vrndn");
  }

  case NEON::BI__builtin_neon_vset_lane_i8:
  case NEON::BI__builtin_neon_vset_lane_i16:
  case NEON::BI__builtin_neon_vset_lane_i32:
  case NEON::BI__builtin_neon_vset_lane_i64:
  case NEON::BI__builtin_neon_vset_lane_bf16:
  case NEON::BI__builtin_neon_vset_lane_f32:
  case NEON::BI__builtin_neon_vsetq_lane_i8:
  case NEON::BI__builtin_neon_vsetq_lane_i16:
  case NEON::BI__builtin_neon_vsetq_lane_i32:
  case NEON::BI__builtin_neon_vsetq_lane_i64:
  case NEON::BI__builtin_neon_vsetq_lane_bf16:
  case NEON::BI__builtin_neon_vsetq_lane_f32:
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");

  case NEON::BI__builtin_neon_vsha1h_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
                        "vsha1h");
  case NEON::BI__builtin_neon_vsha1cq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
                        "vsha1h");
  case NEON::BI__builtin_neon_vsha1pq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
                        "vsha1h");
  case NEON::BI__builtin_neon_vsha1mq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
                        "vsha1h");

  case NEON::BI__builtin_neon_vcvth_bf16_f32: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf), Ops,
                        "vcvtbfp2bf");
  }

  // The ARM _MoveToCoprocessor builtins put the input register value as
  // the first argument, but the LLVM intrinsic expects it as the third one.
  case clang::ARM::BI_MoveToCoprocessor:
  case clang::ARM::BI_MoveToCoprocessor2: {
    Function *F = CGM.getIntrinsic(BuiltinID == clang::ARM::BI_MoveToCoprocessor
                                       ? Intrinsic::arm_mcr
                                       : Intrinsic::arm_mcr2);
    return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
                                  Ops[3], Ops[4], Ops[5]});
  }
  }

  // Get the last argument, which specifies the vector type.
  assert(HasExtraArg);
  const Expr *Arg = E->getArg(E->getNumArgs() - 1);
  std::optional<llvm::APSInt> Result =
      Arg->getIntegerConstantExpr(getContext());
  if (!Result)
    return nullptr;

  if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f ||
      BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_d) {
    // Determine the overloaded type of this builtin.
    llvm::Type *Ty;
    if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f)
      Ty = FloatTy;
    else
      Ty = DoubleTy;

    // Determine whether this is an unsigned conversion or not.
    bool usgn = Result->getZExtValue() == 1;
    unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;

    // Call the appropriate intrinsic.
    Function *F = CGM.getIntrinsic(Int, Ty);
    return Builder.CreateCall(F, Ops, "vcvtr");
  }

  // Determine the type of this overloaded NEON intrinsic.
  NeonTypeFlags Type = Result->getZExtValue();
  bool usgn = Type.isUnsigned();
  bool rightShift = false;

  llvm::FixedVectorType *VTy =
      GetNeonType(this, Type, getTarget().hasLegalHalfType(), false,
                  getTarget().hasBFloat16Type());
  llvm::Type *Ty = VTy;
  if (!Ty)
    return nullptr;

  // Many NEON builtins have identical semantics and uses in ARM and
  // AArch64. Emit these in a single function.
  auto IntrinsicMap = ArrayRef(ARMSIMDIntrinsicMap);
  const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
      IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
  if (Builtin)
    return EmitCommonNeonBuiltinExpr(
        Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
        Builtin->NameHint, Builtin->TypeModifier, E, Ops, PtrOp0, PtrOp1, Arch);

  unsigned Int;
  switch (BuiltinID) {
  default: return nullptr;
  case NEON::BI__builtin_neon_vld1q_lane_v:
    // Handle 64-bit integer elements as a special case. Use shuffles of
    // one-element vectors to avoid poor code for i64 in the backend.
    if (VTy->getElementType()->isIntegerTy(64)) {
      // Extract the other lane.
      Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
      int Lane = cast<ConstantInt>(Ops[2])->getZExtValue();
      Value *SV = llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1 - Lane));
      Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
      // Load the value as a one-element vector.
      Ty = llvm::FixedVectorType::get(VTy->getElementType(), 1);
      llvm::Type *Tys[] = {Ty, Int8PtrTy};
      Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Tys);
      Value *Align = getAlignmentValue32(PtrOp0);
      Value *Ld = Builder.CreateCall(F, {Ops[0], Align});
      // Combine them.
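      // Illustrative IR for lane 1 of a <2 x i64> vector (the alignment value
      // is an example):
      //   %keep = shufflevector <2 x i64> %v, <2 x i64> %v, <1 x i32> zeroinitializer
      //   %ld   = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %p, i32 8)
      //   %res  = shufflevector <1 x i64> %keep, <1 x i64> %ld, <2 x i32> <i32 0, i32 1>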
      int Indices[] = {1 - Lane, Lane};
      return Builder.CreateShuffleVector(Ops[1], Ld, Indices, "vld1q_lane");
    }
    [[fallthrough]];
  case NEON::BI__builtin_neon_vld1_lane_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
    Value *Ld = Builder.CreateLoad(PtrOp0);
    return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
  }
  case NEON::BI__builtin_neon_vqrshrn_n_v:
    Int =
        usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n", 1, true);
  case NEON::BI__builtin_neon_vqrshrun_n_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
                        Ops, "vqrshrun_n", 1, true);
  case NEON::BI__builtin_neon_vqshrn_n_v:
    Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n", 1, true);
  case NEON::BI__builtin_neon_vqshrun_n_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
                        Ops, "vqshrun_n", 1, true);
  case NEON::BI__builtin_neon_vrecpe_v:
  case NEON::BI__builtin_neon_vrecpeq_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
                        Ops, "vrecpe");
  case NEON::BI__builtin_neon_vrshrn_n_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
                        Ops, "vrshrn_n", 1, true);
  case NEON::BI__builtin_neon_vrsra_n_v:
  case NEON::BI__builtin_neon_vrsraq_n_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
    Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
    Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Ty), {Ops[1], Ops[2]});
    return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
  case NEON::BI__builtin_neon_vsri_n_v:
  case NEON::BI__builtin_neon_vsriq_n_v:
    rightShift = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vsli_n_v:
  case NEON::BI__builtin_neon_vsliq_n_v:
    Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
                        Ops, "vsli_n");
  case NEON::BI__builtin_neon_vsra_n_v:
  case NEON::BI__builtin_neon_vsraq_n_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
    return Builder.CreateAdd(Ops[0], Ops[1]);
  case NEON::BI__builtin_neon_vst1q_lane_v:
    // Handle 64-bit integer elements as a special case. Use a shuffle to get
    // a one-element vector and avoid poor code for i64 in the backend.
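    // Illustrative IR for lane 1 of a <2 x i64> vector (the alignment value
    // is an example):
    //   %elt = shufflevector <2 x i64> %v, <2 x i64> %v, <1 x i32> <i32 1>
    //   call void @llvm.arm.neon.vst1.p0.v1i64(ptr %p, <1 x i64> %elt, i32 8)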
    if (VTy->getElementType()->isIntegerTy(64)) {
      Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
      Value *SV = llvm::ConstantVector::get(cast<llvm::Constant>(Ops[2]));
      Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
      Ops[2] = getAlignmentValue32(PtrOp0);
      llvm::Type *Tys[] = {Int8PtrTy, Ops[1]->getType()};
      return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1,
                                                 Tys), Ops);
    }
    [[fallthrough]];
  case NEON::BI__builtin_neon_vst1_lane_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
    return Builder.CreateStore(Ops[1],
                               PtrOp0.withElementType(Ops[1]->getType()));
  }
  case NEON::BI__builtin_neon_vtbl1_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1),
                        Ops, "vtbl1");
  case NEON::BI__builtin_neon_vtbl2_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2),
                        Ops, "vtbl2");
  case NEON::BI__builtin_neon_vtbl3_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3),
                        Ops, "vtbl3");
  case NEON::BI__builtin_neon_vtbl4_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4),
                        Ops, "vtbl4");
  case NEON::BI__builtin_neon_vtbx1_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1),
                        Ops, "vtbx1");
  case NEON::BI__builtin_neon_vtbx2_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2),
                        Ops, "vtbx2");
  case NEON::BI__builtin_neon_vtbx3_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3),
                        Ops, "vtbx3");
  case NEON::BI__builtin_neon_vtbx4_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4),
                        Ops, "vtbx4");
  }
}

template <typename Integer>
static Integer GetIntegerConstantValue(const Expr *E, ASTContext &Context) {
  return E->getIntegerConstantExpr(Context)->getExtValue();
}

static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V,
                                     llvm::Type *T, bool Unsigned) {
  // Helper function called by Tablegen-constructed ARM MVE builtin codegen,
  // which finds it convenient to specify signed/unsigned as a boolean flag.
  return Unsigned ? Builder.CreateZExt(V, T) : Builder.CreateSExt(V, T);
}

static llvm::Value *MVEImmediateShr(CGBuilderTy &Builder, llvm::Value *V,
                                    uint32_t Shift, bool Unsigned) {
  // MVE helper function for integer shift right. This must handle signed vs
  // unsigned, and also deal specially with the case where the shift count is
  // equal to the lane size. In LLVM IR, an LShr with that parameter would be
  // undefined behavior, but in MVE it's legal, so we must convert it to code
  // that is not undefined in IR.
  unsigned LaneBits = cast<llvm::VectorType>(V->getType())
                          ->getElementType()
                          ->getPrimitiveSizeInBits();
  if (Shift == LaneBits) {
    // An unsigned shift of the full lane size always generates zero, so we can
    // simply emit a zero vector. A signed shift of the full lane size does the
    // same thing as shifting by one bit fewer.
    if (Unsigned)
      return llvm::Constant::getNullValue(V->getType());
    else
      --Shift;
  }
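  // E.g. an unsigned >> 32 on 32-bit lanes folds to zeroinitializer, while a
  // signed >> 32 is emitted as an ashr by 31, which yields the same
  // all-sign-bits result that the architecture defines.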
  return Unsigned ? Builder.CreateLShr(V, Shift) : Builder.CreateAShr(V, Shift);
}

static llvm::Value *ARMMVEVectorSplat(CGBuilderTy &Builder, llvm::Value *V) {
  // MVE-specific helper function for a vector splat, which infers the element
  // count of the output vector by knowing that MVE vectors are all 128 bits
  // wide.
  unsigned Elements = 128 / V->getType()->getPrimitiveSizeInBits();
  return Builder.CreateVectorSplat(Elements, V);
}

static llvm::Value *ARMMVEVectorReinterpret(CGBuilderTy &Builder,
                                            CodeGenFunction *CGF,
                                            llvm::Value *V,
                                            llvm::Type *DestType) {
  // Convert one MVE vector type into another by reinterpreting its in-register
  // format.
  //
  // On little-endian targets, this is identical to a bitcast (which
  // reinterprets the memory format). On big-endian targets, the two are not
  // necessarily the same, because the register and memory formats map to each
  // other differently depending on the lane size.
  //
  // We generate a bitcast whenever we can (if we're little-endian, or if the
  // lane sizes are the same anyway). Otherwise we fall back to an IR intrinsic
  // that performs the different kind of reinterpretation.
  if (CGF->getTarget().isBigEndian() &&
      V->getType()->getScalarSizeInBits() != DestType->getScalarSizeInBits()) {
    return Builder.CreateCall(
        CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vreinterpretq,
                              {DestType, V->getType()}),
        V);
  } else {
    return Builder.CreateBitCast(V, DestType);
  }
}

static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V, bool Odd) {
  // Make a shufflevector that extracts every other element of a vector (evens
  // or odds, as desired).
  SmallVector<int, 16> Indices;
  unsigned InputElements =
      cast<llvm::FixedVectorType>(V->getType())->getNumElements();
  for (unsigned i = 0; i < InputElements; i += 2)
    Indices.push_back(i + Odd);
  return Builder.CreateShuffleVector(V, Indices);
}

static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
                              llvm::Value *V1) {
  // Make a shufflevector that interleaves two vectors element by element.
  assert(V0->getType() == V1->getType() && "Can't zip different vector types");
  SmallVector<int, 16> Indices;
  unsigned InputElements =
      cast<llvm::FixedVectorType>(V0->getType())->getNumElements();
  for (unsigned i = 0; i < InputElements; i++) {
    Indices.push_back(i);
    Indices.push_back(i + InputElements);
  }
  return Builder.CreateShuffleVector(V0, V1, Indices);
}

template <unsigned HighBit, unsigned OtherBits>
static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
  // MVE-specific helper function to make a vector splat of a constant such as
  // UINT_MAX or INT_MIN, in which all bits below the highest one are equal.
  llvm::Type *T = cast<llvm::VectorType>(VT)->getElementType();
  unsigned LaneBits = T->getPrimitiveSizeInBits();
  uint32_t Value = HighBit << (LaneBits - 1);
  if (OtherBits)
    Value |= (1UL << (LaneBits - 1)) - 1;
  llvm::Value *Lane = llvm::ConstantInt::get(T, Value);
  return ARMMVEVectorSplat(Builder, Lane);
}

static llvm::Value *ARMMVEVectorElementReverse(CGBuilderTy &Builder,
                                               llvm::Value *V,
                                               unsigned ReverseWidth) {
  // MVE-specific helper function which reverses the elements of a
  // vector within every (ReverseWidth)-bit collection of lanes.
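  // E.g. with 8-bit lanes and ReverseWidth == 32, Mask == 3 and the shuffle
  // indices are (3, 2, 1, 0, 7, 6, 5, 4, ...), reversing each group of four
  // bytes.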
  SmallVector<int, 16> Indices;
  unsigned LaneSize = V->getType()->getScalarSizeInBits();
  unsigned Elements = 128 / LaneSize;
  unsigned Mask = ReverseWidth / LaneSize - 1;
  for (unsigned i = 0; i < Elements; i++)
    Indices.push_back(i ^ Mask);
  return Builder.CreateShuffleVector(V, Indices);
}

Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E,
                                              ReturnValueSlot ReturnValue,
                                              llvm::Triple::ArchType Arch) {
  enum class CustomCodeGen { VLD24, VST24 } CustomCodeGenType;
  Intrinsic::ID IRIntr;
  unsigned NumVectors;

  // Code autogenerated by Tablegen will handle all the simple builtins.
  switch (BuiltinID) {
#include "clang/Basic/arm_mve_builtin_cg.inc"

    // If we didn't match an MVE builtin id at all, go back to the
    // main EmitARMBuiltinExpr.
  default:
    return nullptr;
  }

  // Anything that breaks from that switch is an MVE builtin that
  // needs handwritten code to generate.

  switch (CustomCodeGenType) {

  case CustomCodeGen::VLD24: {
    llvm::SmallVector<Value *, 4> Ops;
    llvm::SmallVector<llvm::Type *, 4> Tys;

    auto MvecCType = E->getType();
    auto MvecLType = ConvertType(MvecCType);
    assert(MvecLType->isStructTy() &&
           "Return type for vld[24]q should be a struct");
    assert(MvecLType->getStructNumElements() == 1 &&
           "Return-type struct for vld[24]q should have one element");
    auto MvecLTypeInner = MvecLType->getStructElementType(0);
    assert(MvecLTypeInner->isArrayTy() &&
           "Return-type struct for vld[24]q should contain an array");
    assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
           "Array member of return-type struct vld[24]q has wrong length");
    auto VecLType = MvecLTypeInner->getArrayElementType();

    Tys.push_back(VecLType);

    auto Addr = E->getArg(0);
    Ops.push_back(EmitScalarExpr(Addr));
    Tys.push_back(ConvertType(Addr->getType()));
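    // Illustrative call for vld2q with <4 x i32> lanes (a sketch; the exact
    // overload suffixes follow the types collected above):
    //   %r = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0(ptr %p)
    // The result vectors are then repacked into the struct-of-array return
    // type expected by the ACLE builtin.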
    Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
    Value *LoadResult = Builder.CreateCall(F, Ops);
    Value *MvecOut = PoisonValue::get(MvecLType);
    for (unsigned i = 0; i < NumVectors; ++i) {
      Value *Vec = Builder.CreateExtractValue(LoadResult, i);
      MvecOut = Builder.CreateInsertValue(MvecOut, Vec, {0, i});
    }

    if (ReturnValue.isNull())
      return MvecOut;
    else
      return Builder.CreateStore(MvecOut, ReturnValue.getAddress());
  }

  case CustomCodeGen::VST24: {
    llvm::SmallVector<Value *, 4> Ops;
    llvm::SmallVector<llvm::Type *, 4> Tys;

    auto Addr = E->getArg(0);
    Ops.push_back(EmitScalarExpr(Addr));
    Tys.push_back(ConvertType(Addr->getType()));

    auto MvecCType = E->getArg(1)->getType();
    auto MvecLType = ConvertType(MvecCType);
    assert(MvecLType->isStructTy() && "Data type for vst2q should be a struct");
    assert(MvecLType->getStructNumElements() == 1 &&
           "Data-type struct for vst2q should have one element");
    auto MvecLTypeInner = MvecLType->getStructElementType(0);
    assert(MvecLTypeInner->isArrayTy() &&
           "Data-type struct for vst2q should contain an array");
    assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
           "Array member of data-type struct for vst2q has wrong length");
    auto VecLType = MvecLTypeInner->getArrayElementType();

    Tys.push_back(VecLType);

    AggValueSlot MvecSlot = CreateAggTemp(MvecCType);
    EmitAggExpr(E->getArg(1), MvecSlot);
    auto Mvec = Builder.CreateLoad(MvecSlot.getAddress());
    for (unsigned i = 0; i < NumVectors; i++)
      Ops.push_back(Builder.CreateExtractValue(Mvec, {0, i}));
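    // Sketch of the calls emitted below for vst2q with <8 x i16> lanes: the
    // intrinsic is called once per stage, with the stage index last:
    //   call void @llvm.arm.mve.vst2q.p0.v8i16(ptr %p, <8 x i16> %v0, <8 x i16> %v1, i32 0)
    //   call void @llvm.arm.mve.vst2q.p0.v8i16(ptr %p, <8 x i16> %v0, <8 x i16> %v1, i32 1)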
    Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
    Value *ToReturn = nullptr;
    for (unsigned i = 0; i < NumVectors; i++) {
      Ops.push_back(llvm::ConstantInt::get(Int32Ty, i));
      ToReturn = Builder.CreateCall(F, Ops);
      Ops.pop_back();
    }
    return ToReturn;
  }
  }
  llvm_unreachable("unknown custom codegen type.");
}

Value *CodeGenFunction::EmitARMCDEBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E,
                                              ReturnValueSlot ReturnValue,
                                              llvm::Triple::ArchType Arch) {
  switch (BuiltinID) {
  default:
    return nullptr;
#include "clang/Basic/arm_cde_builtin_cg.inc"
  }
}

static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
                                        const CallExpr *E,
                                        SmallVectorImpl<Value *> &Ops,
                                        llvm::Triple::ArchType Arch) {
  unsigned int Int = 0;
  const char *s = nullptr;

  switch (BuiltinID) {
  default:
    return nullptr;
  case NEON::BI__builtin_neon_vtbl1_v:
  case NEON::BI__builtin_neon_vqtbl1_v:
  case NEON::BI__builtin_neon_vqtbl1q_v:
  case NEON::BI__builtin_neon_vtbl2_v:
  case NEON::BI__builtin_neon_vqtbl2_v:
  case NEON::BI__builtin_neon_vqtbl2q_v:
  case NEON::BI__builtin_neon_vtbl3_v:
  case NEON::BI__builtin_neon_vqtbl3_v:
  case NEON::BI__builtin_neon_vqtbl3q_v:
  case NEON::BI__builtin_neon_vtbl4_v:
  case NEON::BI__builtin_neon_vqtbl4_v:
  case NEON::BI__builtin_neon_vqtbl4q_v:
    break;
  case NEON::BI__builtin_neon_vtbx1_v:
  case NEON::BI__builtin_neon_vqtbx1_v:
  case NEON::BI__builtin_neon_vqtbx1q_v:
  case NEON::BI__builtin_neon_vtbx2_v:
  case NEON::BI__builtin_neon_vqtbx2_v:
  case NEON::BI__builtin_neon_vqtbx2q_v:
  case NEON::BI__builtin_neon_vtbx3_v:
  case NEON::BI__builtin_neon_vqtbx3_v:
  case NEON::BI__builtin_neon_vqtbx3q_v:
  case NEON::BI__builtin_neon_vtbx4_v:
  case NEON::BI__builtin_neon_vqtbx4_v:
  case NEON::BI__builtin_neon_vqtbx4q_v:
    break;
  }

  assert(E->getNumArgs() >= 3);

  // Get the last argument, which specifies the vector type.
  const Expr *Arg = E->getArg(E->getNumArgs() - 1);
  std::optional<llvm::APSInt> Result =
      Arg->getIntegerConstantExpr(CGF.getContext());
  if (!Result)
    return nullptr;

  // Determine the type of this overloaded NEON intrinsic.
  NeonTypeFlags Type = Result->getZExtValue();
  llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type);
  if (!Ty)
    return nullptr;

  CodeGen::CGBuilderTy &Builder = CGF.Builder;

  // AArch64 scalar builtins are not overloaded; they do not have an extra
  // argument that specifies the vector type, so we need to handle each case.
  switch (BuiltinID) {
  case NEON::BI__builtin_neon_vtbl1_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 1), nullptr, Ops[1],
                              Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
  }
  case NEON::BI__builtin_neon_vtbl2_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 2), nullptr, Ops[2],
                              Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
  }
  case NEON::BI__builtin_neon_vtbl3_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 3), nullptr, Ops[3],
                              Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
  }
  case NEON::BI__builtin_neon_vtbl4_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 4), nullptr, Ops[4],
                              Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
  }
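  // vtbx1 has no single AArch64 equivalent: TBX with a 128-bit table treats
  // indices 8..15 as in range, whereas vtbx1's 64-bit table must leave the
  // destination element unchanged for any index >= 8. It is therefore
  // emulated with a TBL plus a compare/select, as in the case below.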
  case NEON::BI__builtin_neon_vtbx1_v: {
    Value *TblRes =
        packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 1), nullptr, Ops[2], Ty,
                           Intrinsic::aarch64_neon_tbl1, "vtbl1");

    llvm::Constant *EightV = ConstantInt::get(Ty, 8);
    Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
    CmpRes = Builder.CreateSExt(CmpRes, Ty);

    Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
    Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
    return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
  }
  case NEON::BI__builtin_neon_vtbx2_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 2), Ops[0], Ops[3],
                              Ty, Intrinsic::aarch64_neon_tbx1, "vtbx1");
  }
  case NEON::BI__builtin_neon_vtbx3_v: {
    Value *TblRes =
        packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 3), nullptr, Ops[4], Ty,
                           Intrinsic::aarch64_neon_tbl2, "vtbl2");

    llvm::Constant *TwentyFourV = ConstantInt::get(Ty, 24);
    Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
                                       TwentyFourV);
    CmpRes = Builder.CreateSExt(CmpRes, Ty);

    Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
    Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
    return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
  }
  case NEON::BI__builtin_neon_vtbx4_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 4), Ops[0], Ops[5],
                              Ty, Intrinsic::aarch64_neon_tbx2, "vtbx2");
  }
  case NEON::BI__builtin_neon_vqtbl1_v:
  case NEON::BI__builtin_neon_vqtbl1q_v:
    Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break;
  case NEON::BI__builtin_neon_vqtbl2_v:
  case NEON::BI__builtin_neon_vqtbl2q_v:
    Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break;
  case NEON::BI__builtin_neon_vqtbl3_v:
  case NEON::BI__builtin_neon_vqtbl3q_v:
    Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break;
  case NEON::BI__builtin_neon_vqtbl4_v:
  case NEON::BI__builtin_neon_vqtbl4q_v:
    Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break;
  case NEON::BI__builtin_neon_vqtbx1_v:
  case NEON::BI__builtin_neon_vqtbx1q_v:
    Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break;
  case NEON::BI__builtin_neon_vqtbx2_v:
  case NEON::BI__builtin_neon_vqtbx2q_v:
    Int = Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break;
  case NEON::BI__builtin_neon_vqtbx3_v:
  case NEON::BI__builtin_neon_vqtbx3q_v:
    Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break;
  case NEON::BI__builtin_neon_vqtbx4_v:
  case NEON::BI__builtin_neon_vqtbx4q_v:
    Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break;
  }

  if (!Int)
    return nullptr;

  Function *F = CGF.CGM.getIntrinsic(Int, Ty);
  return CGF.EmitNeonCall(F, Ops, s);
}

Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
  auto *VTy = llvm::FixedVectorType::get(Int16Ty, 4);
  Op = Builder.CreateBitCast(Op, Int16Ty);
  Value *V = PoisonValue::get(VTy);
  llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
  Op = Builder.CreateInsertElement(V, Op, CI);
  return Op;
}

/// SVEBuiltinMemEltTy - Returns the memory element type for this memory
/// access builtin. Only required if it can't be inferred from the base pointer
/// operand.
llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(const SVETypeFlags &TypeFlags) {
  switch (TypeFlags.getMemEltType()) {
  case SVETypeFlags::MemEltTyDefault:
    return getEltType(TypeFlags);
  case SVETypeFlags::MemEltTyInt8:
    return Builder.getInt8Ty();
  case SVETypeFlags::MemEltTyInt16:
    return Builder.getInt16Ty();
  case SVETypeFlags::MemEltTyInt32:
    return Builder.getInt32Ty();
  case SVETypeFlags::MemEltTyInt64:
    return Builder.getInt64Ty();
  }
  llvm_unreachable("Unknown MemEltType");
}

llvm::Type *CodeGenFunction::getEltType(const SVETypeFlags &TypeFlags) {
  switch (TypeFlags.getEltType()) {
  default:
    llvm_unreachable("Invalid SVETypeFlag!");

  case SVETypeFlags::EltTyMFloat8:
  case SVETypeFlags::EltTyInt8:
    return Builder.getInt8Ty();
  case SVETypeFlags::EltTyInt16:
    return Builder.getInt16Ty();
  case SVETypeFlags::EltTyInt32:
    return Builder.getInt32Ty();
  case SVETypeFlags::EltTyInt64:
    return Builder.getInt64Ty();
  case SVETypeFlags::EltTyInt128:
    return Builder.getInt128Ty();

  case SVETypeFlags::EltTyFloat16:
    return Builder.getHalfTy();
  case SVETypeFlags::EltTyFloat32:
    return Builder.getFloatTy();
  case SVETypeFlags::EltTyFloat64:
    return Builder.getDoubleTy();

  case SVETypeFlags::EltTyBFloat16:
    return Builder.getBFloatTy();

  case SVETypeFlags::EltTyBool8:
  case SVETypeFlags::EltTyBool16:
  case SVETypeFlags::EltTyBool32:
  case SVETypeFlags::EltTyBool64:
    return Builder.getInt1Ty();
  }
}

// Return the llvm predicate vector type corresponding to the specified element
// TypeFlags.
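// E.g. 64-bit elements use a <vscale x 2 x i1> predicate, so that each
// predicate lane lines up with one 64-bit lane of the data vector.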
3789 llvm::ScalableVectorType * 3790 CodeGenFunction::getSVEPredType(const SVETypeFlags &TypeFlags) { 3791 switch (TypeFlags.getEltType()) { 3792 default: llvm_unreachable("Unhandled SVETypeFlag!"); 3793 3794 case SVETypeFlags::EltTyInt8: 3795 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16); 3796 case SVETypeFlags::EltTyInt16: 3797 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8); 3798 case SVETypeFlags::EltTyInt32: 3799 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4); 3800 case SVETypeFlags::EltTyInt64: 3801 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2); 3802 3803 case SVETypeFlags::EltTyBFloat16: 3804 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8); 3805 case SVETypeFlags::EltTyFloat16: 3806 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8); 3807 case SVETypeFlags::EltTyFloat32: 3808 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4); 3809 case SVETypeFlags::EltTyFloat64: 3810 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2); 3811 3812 case SVETypeFlags::EltTyBool8: 3813 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16); 3814 case SVETypeFlags::EltTyBool16: 3815 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8); 3816 case SVETypeFlags::EltTyBool32: 3817 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4); 3818 case SVETypeFlags::EltTyBool64: 3819 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2); 3820 } 3821 } 3822 3823 // Return the llvm vector type corresponding to the specified element TypeFlags. 3824 llvm::ScalableVectorType * 3825 CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) { 3826 switch (TypeFlags.getEltType()) { 3827 default: 3828 llvm_unreachable("Invalid SVETypeFlag!"); 3829 3830 case SVETypeFlags::EltTyInt8: 3831 return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16); 3832 case SVETypeFlags::EltTyInt16: 3833 return llvm::ScalableVectorType::get(Builder.getInt16Ty(), 8); 3834 case SVETypeFlags::EltTyInt32: 3835 return llvm::ScalableVectorType::get(Builder.getInt32Ty(), 4); 3836 case SVETypeFlags::EltTyInt64: 3837 return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2); 3838 3839 case SVETypeFlags::EltTyMFloat8: 3840 return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16); 3841 case SVETypeFlags::EltTyFloat16: 3842 return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8); 3843 case SVETypeFlags::EltTyBFloat16: 3844 return llvm::ScalableVectorType::get(Builder.getBFloatTy(), 8); 3845 case SVETypeFlags::EltTyFloat32: 3846 return llvm::ScalableVectorType::get(Builder.getFloatTy(), 4); 3847 case SVETypeFlags::EltTyFloat64: 3848 return llvm::ScalableVectorType::get(Builder.getDoubleTy(), 2); 3849 3850 case SVETypeFlags::EltTyBool8: 3851 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16); 3852 case SVETypeFlags::EltTyBool16: 3853 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8); 3854 case SVETypeFlags::EltTyBool32: 3855 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4); 3856 case SVETypeFlags::EltTyBool64: 3857 return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2); 3858 } 3859 } 3860 3861 llvm::Value * 3862 CodeGenFunction::EmitSVEAllTruePred(const SVETypeFlags &TypeFlags) { 3863 Function *Ptrue = 3864 CGM.getIntrinsic(Intrinsic::aarch64_sve_ptrue, getSVEPredType(TypeFlags)); 3865 return Builder.CreateCall(Ptrue, {Builder.getInt32(/*SV_ALL*/ 31)}); 3866 } 3867 3868 constexpr unsigned SVEBitsPerBlock = 128; 3869 3870 static llvm::ScalableVectorType 
*getSVEVectorForElementType(llvm::Type *EltTy) {
3871   unsigned NumElts = SVEBitsPerBlock / EltTy->getScalarSizeInBits();
3872   return llvm::ScalableVectorType::get(EltTy, NumElts);
3873 }
3874
3875 // Reinterpret the input predicate so that it can be used to correctly isolate
3876 // the elements of the specified datatype.
3877 Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
3878                                              llvm::ScalableVectorType *VTy) {
3879
3880   if (isa<TargetExtType>(Pred->getType()) &&
3881       cast<TargetExtType>(Pred->getType())->getName() == "aarch64.svcount")
3882     return Pred;
3883
3884   auto *RTy = llvm::VectorType::get(IntegerType::get(getLLVMContext(), 1), VTy);
3885   if (Pred->getType() == RTy)
3886     return Pred;
3887
3888   unsigned IntID;
3889   llvm::Type *IntrinsicTy;
3890   switch (VTy->getMinNumElements()) {
3891   default:
3892     llvm_unreachable("unsupported element count!");
3893   case 1:
3894   case 2:
3895   case 4:
3896   case 8:
3897     IntID = Intrinsic::aarch64_sve_convert_from_svbool;
3898     IntrinsicTy = RTy;
3899     break;
3900   case 16:
3901     IntID = Intrinsic::aarch64_sve_convert_to_svbool;
3902     IntrinsicTy = Pred->getType();
3903     break;
3904   }
3905
3906   Function *F = CGM.getIntrinsic(IntID, IntrinsicTy);
3907   Value *C = Builder.CreateCall(F, Pred);
3908   assert(C->getType() == RTy && "Unexpected return type!");
3909   return C;
3910 }
3911
3912 Value *CodeGenFunction::EmitSVEPredicateTupleCast(Value *PredTuple,
3913                                                   llvm::StructType *Ty) {
3914   if (PredTuple->getType() == Ty)
3915     return PredTuple;
3916
3917   Value *Ret = llvm::PoisonValue::get(Ty);
3918   for (unsigned I = 0; I < Ty->getNumElements(); ++I) {
3919     Value *Pred = Builder.CreateExtractValue(PredTuple, I);
3920     Pred = EmitSVEPredicateCast(
3921         Pred, cast<llvm::ScalableVectorType>(Ty->getTypeAtIndex(I)));
3922     Ret = Builder.CreateInsertValue(Ret, Pred, I);
3923   }
3924
3925   return Ret;
3926 }
3927
3928 Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
3929                                           SmallVectorImpl<Value *> &Ops,
3930                                           unsigned IntID) {
3931   auto *ResultTy = getSVEType(TypeFlags);
3932   auto *OverloadedTy =
3933       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), ResultTy);
3934
3935   Function *F = nullptr;
3936   if (Ops[1]->getType()->isVectorTy())
3937     // This is the "vector base, scalar offset" case. In order to uniquely
3938     // map this built-in to an LLVM IR intrinsic, we need both the return type
3939     // and the type of the vector base.
3940     F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[1]->getType()});
3941   else
3942     // This is the "scalar base, vector offset" case. The type of the offset
3943     // is encoded in the name of the intrinsic. We only need to specify the
3944     // return type in order to uniquely map this built-in to an LLVM IR
3945     // intrinsic.
3946     F = CGM.getIntrinsic(IntID, OverloadedTy);
3947
3948   // At the ACLE level there's only one predicate type, svbool_t, which is
3949   // mapped to <n x 16 x i1>. However, this might be incompatible with the
3950   // actual type being loaded. For example, when loading doubles (i64) the
3951   // predicate should be <n x 2 x i1> instead. At the IR level the type of
3952   // the predicate and the data being loaded must match. Cast to the type
3953   // expected by the intrinsic. The intrinsic itself should be defined in
3954   // a way that enforces relations between parameter types.
3955   Ops[0] = EmitSVEPredicateCast(
3956       Ops[0], cast<llvm::ScalableVectorType>(F->getArg(0)->getType()));
3957
3958   // Pass 0 when the offset is missing. This can only be applied when using
3959   // the "vector base" addressing mode for which ACLE allows no offset. The
3960   // corresponding LLVM IR always requires an offset.
3961   if (Ops.size() == 2) {
3962     assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
3963     Ops.push_back(ConstantInt::get(Int64Ty, 0));
3964   }
3965
3966   // For "vector base, scalar index" scale the index so that it becomes a
3967   // scalar offset.
3968   if (!TypeFlags.isByteIndexed() && Ops[1]->getType()->isVectorTy()) {
3969     unsigned BytesPerElt =
3970         OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
3971     Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
3972   }
3973
3974   Value *Call = Builder.CreateCall(F, Ops);
3975
3976   // The following sext/zext is only needed when ResultTy != OverloadedTy. In
3977   // other cases it's folded into a nop.
3978   return TypeFlags.isZExtReturn() ? Builder.CreateZExt(Call, ResultTy)
3979                                   : Builder.CreateSExt(Call, ResultTy);
3980 }
3981
3982 Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags,
3983                                             SmallVectorImpl<Value *> &Ops,
3984                                             unsigned IntID) {
3985   auto *SrcDataTy = getSVEType(TypeFlags);
3986   auto *OverloadedTy =
3987       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), SrcDataTy);
3988
3989   // In ACLE the source data is passed in the last argument, whereas in LLVM IR
3990   // it's the first argument. Move it accordingly.
3991   Ops.insert(Ops.begin(), Ops.pop_back_val());
3992
3993   Function *F = nullptr;
3994   if (Ops[2]->getType()->isVectorTy())
3995     // This is the "vector base, scalar offset" case. In order to uniquely
3996     // map this built-in to an LLVM IR intrinsic, we need both the return type
3997     // and the type of the vector base.
3998     F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[2]->getType()});
3999   else
4000     // This is the "scalar base, vector offset" case. The type of the offset
4001     // is encoded in the name of the intrinsic. We only need to specify the
4002     // return type in order to uniquely map this built-in to an LLVM IR
4003     // intrinsic.
4004     F = CGM.getIntrinsic(IntID, OverloadedTy);
4005
4006   // Pass 0 when the offset is missing. This can only be applied when using
4007   // the "vector base" addressing mode for which ACLE allows no offset. The
4008   // corresponding LLVM IR always requires an offset.
4009   if (Ops.size() == 3) {
4010     assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
4011     Ops.push_back(ConstantInt::get(Int64Ty, 0));
4012   }
4013
4014   // Truncation is needed when SrcDataTy != OverloadedTy. In other cases it's
4015   // folded into a nop.
4016   Ops[0] = Builder.CreateTrunc(Ops[0], OverloadedTy);
4017
4018   // At the ACLE level there's only one predicate type, svbool_t, which is
4019   // mapped to <n x 16 x i1>. However, this might be incompatible with the
4020   // actual type being stored. For example, when storing doubles (i64) the
4021   // predicate should be <n x 2 x i1> instead. At the IR level the type of
4022   // the predicate and the data being stored must match. Cast to the type
4023   // expected by the intrinsic. The intrinsic itself should be defined in
4024   // a way that enforces relations between parameter types.
4025   Ops[1] = EmitSVEPredicateCast(
4026       Ops[1], cast<llvm::ScalableVectorType>(F->getArg(1)->getType()));
4027
4028   // For "vector base, scalar index" scale the index so that it becomes a
4029   // scalar offset.
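  // For example, with 64-bit elements an index of n becomes the byte offset
  // n << 3.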
4030 if (!TypeFlags.isByteIndexed() && Ops[2]->getType()->isVectorTy()) { 4031 unsigned BytesPerElt = 4032 OverloadedTy->getElementType()->getScalarSizeInBits() / 8; 4033 Ops[3] = Builder.CreateShl(Ops[3], Log2_32(BytesPerElt)); 4034 } 4035 4036 return Builder.CreateCall(F, Ops); 4037 } 4038 4039 Value *CodeGenFunction::EmitSVEGatherPrefetch(const SVETypeFlags &TypeFlags, 4040 SmallVectorImpl<Value *> &Ops, 4041 unsigned IntID) { 4042 // The gather prefetches are overloaded on the vector input - this can either 4043 // be the vector of base addresses or vector of offsets. 4044 auto *OverloadedTy = dyn_cast<llvm::ScalableVectorType>(Ops[1]->getType()); 4045 if (!OverloadedTy) 4046 OverloadedTy = cast<llvm::ScalableVectorType>(Ops[2]->getType()); 4047 4048 // Cast the predicate from svbool_t to the right number of elements. 4049 Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy); 4050 4051 // vector + imm addressing modes 4052 if (Ops[1]->getType()->isVectorTy()) { 4053 if (Ops.size() == 3) { 4054 // Pass 0 for 'vector+imm' when the index is omitted. 4055 Ops.push_back(ConstantInt::get(Int64Ty, 0)); 4056 4057 // The sv_prfop is the last operand in the builtin and IR intrinsic. 4058 std::swap(Ops[2], Ops[3]); 4059 } else { 4060 // Index needs to be passed as scaled offset. 4061 llvm::Type *MemEltTy = SVEBuiltinMemEltTy(TypeFlags); 4062 unsigned BytesPerElt = MemEltTy->getPrimitiveSizeInBits() / 8; 4063 if (BytesPerElt > 1) 4064 Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt)); 4065 } 4066 } 4067 4068 Function *F = CGM.getIntrinsic(IntID, OverloadedTy); 4069 return Builder.CreateCall(F, Ops); 4070 } 4071 4072 Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags, 4073 SmallVectorImpl<Value*> &Ops, 4074 unsigned IntID) { 4075 llvm::ScalableVectorType *VTy = getSVEType(TypeFlags); 4076 Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy); 4077 Value *BasePtr = Ops[1]; 4078 4079 // Does the load have an offset? 4080 if (Ops.size() > 2) 4081 BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]); 4082 4083 Function *F = CGM.getIntrinsic(IntID, {VTy}); 4084 return Builder.CreateCall(F, {Predicate, BasePtr}); 4085 } 4086 4087 Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags, 4088 SmallVectorImpl<Value*> &Ops, 4089 unsigned IntID) { 4090 llvm::ScalableVectorType *VTy = getSVEType(TypeFlags); 4091 4092 unsigned N; 4093 switch (IntID) { 4094 case Intrinsic::aarch64_sve_st2: 4095 case Intrinsic::aarch64_sve_st1_pn_x2: 4096 case Intrinsic::aarch64_sve_stnt1_pn_x2: 4097 case Intrinsic::aarch64_sve_st2q: 4098 N = 2; 4099 break; 4100 case Intrinsic::aarch64_sve_st3: 4101 case Intrinsic::aarch64_sve_st3q: 4102 N = 3; 4103 break; 4104 case Intrinsic::aarch64_sve_st4: 4105 case Intrinsic::aarch64_sve_st1_pn_x4: 4106 case Intrinsic::aarch64_sve_stnt1_pn_x4: 4107 case Intrinsic::aarch64_sve_st4q: 4108 N = 4; 4109 break; 4110 default: 4111 llvm_unreachable("unknown intrinsic!"); 4112 } 4113 4114 Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy); 4115 Value *BasePtr = Ops[1]; 4116 4117 // Does the store have an offset? 4118 if (Ops.size() > (2 + N)) 4119 BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]); 4120 4121 // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we 4122 // need to break up the tuple vector. 
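  // e.g. an svst2_f64 store passes its two <vscale x 2 x double> parts as
  // separate operands, followed by the predicate and the base pointer.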
4123   SmallVector<llvm::Value*, 5> Operands;
4124   for (unsigned I = Ops.size() - N; I < Ops.size(); ++I)
4125     Operands.push_back(Ops[I]);
4126   Operands.append({Predicate, BasePtr});
4127   Function *F = CGM.getIntrinsic(IntID, { VTy });
4128
4129   return Builder.CreateCall(F, Operands);
4130 }
4131
4132 // SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and
4133 // svpmullt_pair intrinsics, with the exception that their results are bitcast
4134 // to a wider type.
4135 Value *CodeGenFunction::EmitSVEPMull(const SVETypeFlags &TypeFlags,
4136                                      SmallVectorImpl<Value *> &Ops,
4137                                      unsigned BuiltinID) {
4138   // Splat scalar operand to vector (intrinsics with _n infix)
4139   if (TypeFlags.hasSplatOperand()) {
4140     unsigned OpNo = TypeFlags.getSplatOperand();
4141     Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
4142   }
4143
4144   // The pair-wise function has a narrower overloaded type.
4145   Function *F = CGM.getIntrinsic(BuiltinID, Ops[0]->getType());
4146   Value *Call = Builder.CreateCall(F, {Ops[0], Ops[1]});
4147
4148   // Now bitcast to the wider result type.
4149   llvm::ScalableVectorType *Ty = getSVEType(TypeFlags);
4150   return EmitSVEReinterpret(Call, Ty);
4151 }
4152
4153 Value *CodeGenFunction::EmitSVEMovl(const SVETypeFlags &TypeFlags,
4154                                     ArrayRef<Value *> Ops, unsigned BuiltinID) {
4155   llvm::Type *OverloadedTy = getSVEType(TypeFlags);
4156   Function *F = CGM.getIntrinsic(BuiltinID, OverloadedTy);
4157   return Builder.CreateCall(F, {Ops[0], Builder.getInt32(0)});
4158 }
4159
4160 Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
4161                                             SmallVectorImpl<Value *> &Ops,
4162                                             unsigned BuiltinID) {
4163   auto *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
4164   auto *VectorTy = getSVEVectorForElementType(MemEltTy);
4165   auto *MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
4166
4167   Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
4168   Value *BasePtr = Ops[1];
4169
4170   // Apply the index operand if not omitted.
4171   if (Ops.size() > 3)
4172     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
4173
4174   Value *PrfOp = Ops.back();
4175
4176   Function *F = CGM.getIntrinsic(BuiltinID, Predicate->getType());
4177   return Builder.CreateCall(F, {Predicate, BasePtr, PrfOp});
4178 }
4179
4180 Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
4181                                           llvm::Type *ReturnTy,
4182                                           SmallVectorImpl<Value *> &Ops,
4183                                           unsigned IntrinsicID,
4184                                           bool IsZExtReturn) {
4185   QualType LangPTy = E->getArg(1)->getType();
4186   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
4187       LangPTy->castAs<PointerType>()->getPointeeType());
4188
4189   // The MFloat8 type is stored as a vector, so extra work is needed to
4190   // extract the scalar element type.
4191   if (MemEltTy->isVectorTy()) {
4192     assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) &&
4193            "Only <1 x i8> expected");
4194     MemEltTy = cast<llvm::VectorType>(MemEltTy)->getElementType();
4195   }
4196
4197   // The vector type that is returned may be different from the
4198   // eventual type loaded from memory.
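  // e.g. svld1ub_u64 loads <vscale x 2 x i8> from memory but returns it
  // zero-extended to <vscale x 2 x i64>.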
4199   auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
4200   llvm::ScalableVectorType *MemoryTy = nullptr;
4201   llvm::ScalableVectorType *PredTy = nullptr;
4202   bool IsQuadLoad = false;
4203   switch (IntrinsicID) {
4204   case Intrinsic::aarch64_sve_ld1uwq:
4205   case Intrinsic::aarch64_sve_ld1udq:
4206     MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
4207     PredTy = llvm::ScalableVectorType::get(
4208         llvm::Type::getInt1Ty(getLLVMContext()), 1);
4209     IsQuadLoad = true;
4210     break;
4211   default:
4212     MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
4213     PredTy = MemoryTy;
4214     break;
4215   }
4216
4217   Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
4218   Value *BasePtr = Ops[1];
4219
4220   // Does the load have an offset?
4221   if (Ops.size() > 2)
4222     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
4223
4224   Function *F = CGM.getIntrinsic(IntrinsicID, IsQuadLoad ? VectorTy : MemoryTy);
4225   auto *Load =
4226       cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
4227   auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
4228   CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);
4229
4230   if (IsQuadLoad)
4231     return Load;
4232
4233   return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
4234                       : Builder.CreateSExt(Load, VectorTy);
4235 }
4236
4237 Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
4238                                            SmallVectorImpl<Value *> &Ops,
4239                                            unsigned IntrinsicID) {
4240   QualType LangPTy = E->getArg(1)->getType();
4241   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
4242       LangPTy->castAs<PointerType>()->getPointeeType());
4243
4244   // The MFloat8 type is stored as a vector, so extra work is needed to
4245   // extract the scalar element type.
4246   if (MemEltTy->isVectorTy()) {
4247     assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) &&
4248            "Only <1 x i8> expected");
4249     MemEltTy = cast<llvm::VectorType>(MemEltTy)->getElementType();
4250   }
4251
4252   // The vector type that is stored may be different from the
4253   // eventual type stored to memory.
4254   auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
4255   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
4256
4257   auto PredTy = MemoryTy;
4258   auto AddrMemoryTy = MemoryTy;
4259   bool IsQuadStore = false;
4260
4261   switch (IntrinsicID) {
4262   case Intrinsic::aarch64_sve_st1wq:
4263   case Intrinsic::aarch64_sve_st1dq:
4264     AddrMemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
4265     PredTy =
4266         llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
4267     IsQuadStore = true;
4268     break;
4269   default:
4270     break;
4271   }
4272   Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
4273   Value *BasePtr = Ops[1];
4274
4275   // Does the store have an offset?
4276   if (Ops.size() == 4)
4277     BasePtr = Builder.CreateGEP(AddrMemoryTy, BasePtr, Ops[2]);
4278
4279   // Last value is always the data
4280   Value *Val =
4281       IsQuadStore ? Ops.back() : Builder.CreateTrunc(Ops.back(), MemoryTy);
4282
4283   Function *F =
4284       CGM.getIntrinsic(IntrinsicID, IsQuadStore ? VectorTy : MemoryTy);
4285   auto *Store =
4286       cast<llvm::Instruction>(Builder.CreateCall(F, {Val, Predicate, BasePtr}));
4287   auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
4288   CGM.DecorateInstructionWithTBAA(Store, TBAAInfo);
4289   return Store;
4290 }
4291
4292 Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags,
4293                                       SmallVectorImpl<Value *> &Ops,
4294                                       unsigned IntID) {
4295   Ops[2] = EmitSVEPredicateCast(
4296       Ops[2], getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags)));
4297
4298   SmallVector<Value *> NewOps;
4299   NewOps.push_back(Ops[2]);
4300
4301   llvm::Value *BasePtr = Ops[3];
4302   llvm::Value *RealSlice = Ops[1];
4303   // If the intrinsic contains the vnum parameter, multiply it with the vector
4304   // size in bytes.
4305   if (Ops.size() == 5) {
4306     Function *StreamingVectorLength =
4307         CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
4308     llvm::Value *StreamingVectorLengthCall =
4309         Builder.CreateCall(StreamingVectorLength);
4310     llvm::Value *Mulvl =
4311         Builder.CreateMul(StreamingVectorLengthCall, Ops[4], "mulvl");
4312     // The type of the ptr parameter is void *, so use Int8Ty here.
4313     BasePtr = Builder.CreateGEP(Int8Ty, Ops[3], Mulvl);
4314     RealSlice = Builder.CreateZExt(RealSlice, Int64Ty);
4315     RealSlice = Builder.CreateAdd(RealSlice, Ops[4]);
4316     RealSlice = Builder.CreateTrunc(RealSlice, Int32Ty);
4317   }
4318   NewOps.push_back(BasePtr);
4319   NewOps.push_back(Ops[0]);
4320   NewOps.push_back(RealSlice);
4321   Function *F = CGM.getIntrinsic(IntID);
4322   return Builder.CreateCall(F, NewOps);
4323 }
4324
4325 Value *CodeGenFunction::EmitSMEReadWrite(const SVETypeFlags &TypeFlags,
4326                                          SmallVectorImpl<Value *> &Ops,
4327                                          unsigned IntID) {
4328   auto *VecTy = getSVEType(TypeFlags);
4329   Function *F = CGM.getIntrinsic(IntID, VecTy);
4330   if (TypeFlags.isReadZA())
4331     Ops[1] = EmitSVEPredicateCast(Ops[1], VecTy);
4332   else if (TypeFlags.isWriteZA())
4333     Ops[2] = EmitSVEPredicateCast(Ops[2], VecTy);
4334   return Builder.CreateCall(F, Ops);
4335 }
4336
4337 Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags,
4338                                     SmallVectorImpl<Value *> &Ops,
4339                                     unsigned IntID) {
4340   // The svzero_za() intrinsic zeros the entire za tile and has no parameters.
4341   if (Ops.size() == 0)
4342     Ops.push_back(llvm::ConstantInt::get(Int32Ty, 255));
4343   Function *F = CGM.getIntrinsic(IntID, {});
4344   return Builder.CreateCall(F, Ops);
4345 }
4346
4347 Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags,
4348                                       SmallVectorImpl<Value *> &Ops,
4349                                       unsigned IntID) {
4350   if (Ops.size() == 2)
4351     Ops.push_back(Builder.getInt32(0));
4352   else
4353     Ops[2] = Builder.CreateIntCast(Ops[2], Int32Ty, true);
4354   Function *F = CGM.getIntrinsic(IntID, {});
4355   return Builder.CreateCall(F, Ops);
4356 }
4357
4358 // Splat a scalar to a scalable vector. This previously used the sve dup.x
4359 // intrinsic directly; IRBuilder::CreateVectorSplat now emits the same splat.
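// e.g. EmitSVEDupX of an i32 scalar yields a splat of that value across
// <vscale x 4 x i32>.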
4360 Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
4361   return Builder.CreateVectorSplat(
4362       cast<llvm::VectorType>(Ty)->getElementCount(), Scalar);
4363 }
4364
4365 Value *CodeGenFunction::EmitSVEDupX(Value *Scalar) {
4366   if (auto *Ty = Scalar->getType(); Ty->isVectorTy()) {
4367 #ifndef NDEBUG
4368     auto *VecTy = cast<llvm::VectorType>(Ty);
4369     ElementCount EC = VecTy->getElementCount();
4370     assert(EC.isScalar() && VecTy->getElementType() == Int8Ty &&
4371            "Only <1 x i8> expected");
4372 #endif
4373     Scalar = Builder.CreateExtractElement(Scalar, uint64_t(0));
4374   }
4375   return EmitSVEDupX(Scalar, getSVEVectorForElementType(Scalar->getType()));
4376 }
4377
4378 Value *CodeGenFunction::EmitSVEReinterpret(Value *Val, llvm::Type *Ty) {
4379   // FIXME: For big endian this needs an additional REV, or needs a separate
4380   // intrinsic that is code-generated as a no-op, because the LLVM bitcast
4381   // instruction is defined as 'bitwise' equivalent from memory point of
4382   // view (when storing/reloading), whereas the svreinterpret builtin
4383   // implements bitwise equivalent cast from register point of view.
4384   // LLVM CodeGen for a bitcast must add an explicit REV for big-endian.
4385
4386   if (auto *StructTy = dyn_cast<StructType>(Ty)) {
4387     Value *Tuple = llvm::PoisonValue::get(Ty);
4388
4389     for (unsigned I = 0; I < StructTy->getNumElements(); ++I) {
4390       Value *In = Builder.CreateExtractValue(Val, I);
4391       Value *Out = Builder.CreateBitCast(In, StructTy->getTypeAtIndex(I));
4392       Tuple = Builder.CreateInsertValue(Tuple, Out, I);
4393     }
4394
4395     return Tuple;
4396   }
4397
4398   return Builder.CreateBitCast(Val, Ty);
4399 }
4400
4401 static void InsertExplicitZeroOperand(CGBuilderTy &Builder, llvm::Type *Ty,
4402                                       SmallVectorImpl<Value *> &Ops) {
4403   auto *SplatZero = Constant::getNullValue(Ty);
4404   Ops.insert(Ops.begin(), SplatZero);
4405 }
4406
4407 static void InsertExplicitUndefOperand(CGBuilderTy &Builder, llvm::Type *Ty,
4408                                        SmallVectorImpl<Value *> &Ops) {
4409   auto *SplatUndef = UndefValue::get(Ty);
4410   Ops.insert(Ops.begin(), SplatUndef);
4411 }
4412
4413 SmallVector<llvm::Type *, 2>
4414 CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
4415                                      llvm::Type *ResultType,
4416                                      ArrayRef<Value *> Ops) {
4417   if (TypeFlags.isOverloadNone())
4418     return {};
4419
4420   llvm::Type *DefaultType = getSVEType(TypeFlags);
4421
4422   if (TypeFlags.isOverloadWhileOrMultiVecCvt())
4423     return {DefaultType, Ops[1]->getType()};
4424
4425   if (TypeFlags.isOverloadWhileRW())
4426     return {getSVEPredType(TypeFlags), Ops[0]->getType()};
4427
4428   if (TypeFlags.isOverloadCvt())
4429     return {Ops[0]->getType(), Ops.back()->getType()};
4430
4431   if (TypeFlags.isReductionQV() && !ResultType->isScalableTy() &&
4432       ResultType->isVectorTy())
4433     return {ResultType, Ops[1]->getType()};
4434
4435   assert(TypeFlags.isOverloadDefault() && "Unexpected value for overloads");
4436   return {DefaultType};
4437 }
4438
4439 Value *CodeGenFunction::EmitSVETupleSetOrGet(const SVETypeFlags &TypeFlags,
4440                                              ArrayRef<Value *> Ops) {
4441   assert((TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) &&
4442          "Expects TypeFlags.isTupleSet() or TypeFlags.isTupleGet()");
4443   unsigned Idx = cast<ConstantInt>(Ops[1])->getZExtValue();
4444
4445   if (TypeFlags.isTupleSet())
4446     return Builder.CreateInsertValue(Ops[0], Ops[2], Idx);
4447   return Builder.CreateExtractValue(Ops[0], Idx);
4448 }
4449
4450 Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags,
4451                                            llvm::Type *Ty,
4452                                            ArrayRef<Value *> Ops) {
4453   assert(TypeFlags.isTupleCreate() && "Expects TypeFlags.isTupleCreate()");
4454
4455   Value *Tuple = llvm::PoisonValue::get(Ty);
4456   for (unsigned Idx = 0; Idx < Ops.size(); Idx++)
4457     Tuple = Builder.CreateInsertValue(Tuple, Ops[Idx], Idx);
4458
4459   return Tuple;
4460 }
4461
4462 void CodeGenFunction::GetAArch64SVEProcessedOperands(
4463     unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
4464     SVETypeFlags TypeFlags) {
4465   // Find out if any arguments are required to be integer constant expressions.
4466   unsigned ICEArguments = 0;
4467   ASTContext::GetBuiltinTypeError Error;
4468   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
4469   assert(Error == ASTContext::GE_None && "Should not codegen an error");
4470
4471   // Tuple set/get only requires one insert/extract vector, which is
4472   // created by EmitSVETupleSetOrGet.
4473   bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
4474
4475   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
4476     bool IsICE = ICEArguments & (1 << i);
4477     Value *Arg = EmitScalarExpr(E->getArg(i));
4478
4479     if (IsICE) {
4480       // If this is required to be a constant, constant fold it so that we know
4481       // that the generated intrinsic gets a ConstantInt.
4482       std::optional<llvm::APSInt> Result =
4483           E->getArg(i)->getIntegerConstantExpr(getContext());
4484       assert(Result && "Expected argument to be a constant");
4485
4486       // Immediates for SVE llvm intrinsics are always 32-bit. We can safely
4487       // truncate because the immediate has been range checked and no valid
4488       // immediate requires more than a handful of bits.
4489       *Result = Result->extOrTrunc(32);
4490       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
4491       continue;
4492     }
4493
4494     if (isa<StructType>(Arg->getType()) && !IsTupleGetOrSet) {
4495       for (unsigned I = 0; I < Arg->getType()->getStructNumElements(); ++I)
4496         Ops.push_back(Builder.CreateExtractValue(Arg, I));
4497
4498       continue;
4499     }
4500
4501     Ops.push_back(Arg);
4502   }
4503 }
4504
4505 Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
4506                                                   const CallExpr *E) {
4507   llvm::Type *Ty = ConvertType(E->getType());
4508   if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
4509       BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64_x4) {
4510     Value *Val = EmitScalarExpr(E->getArg(0));
4511     return EmitSVEReinterpret(Val, Ty);
4512   }
4513
4514   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID,
4515                                               AArch64SVEIntrinsicsProvenSorted);
4516
4517   llvm::SmallVector<Value *, 4> Ops;
4518   SVETypeFlags TypeFlags(Builtin->TypeModifier);
4519   GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
4520
4521   if (TypeFlags.isLoad())
4522     return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic,
4523                              TypeFlags.isZExtReturn());
4524   else if (TypeFlags.isStore())
4525     return EmitSVEMaskedStore(E, Ops, Builtin->LLVMIntrinsic);
4526   else if (TypeFlags.isGatherLoad())
4527     return EmitSVEGatherLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4528   else if (TypeFlags.isScatterStore())
4529     return EmitSVEScatterStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4530   else if (TypeFlags.isPrefetch())
4531     return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4532   else if (TypeFlags.isGatherPrefetch())
4533     return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4534   else if (TypeFlags.isStructLoad())
4535     return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
4536   else if
(TypeFlags.isStructStore()) 4537 return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic); 4538 else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) 4539 return EmitSVETupleSetOrGet(TypeFlags, Ops); 4540 else if (TypeFlags.isTupleCreate()) 4541 return EmitSVETupleCreate(TypeFlags, Ty, Ops); 4542 else if (TypeFlags.isUndef()) 4543 return UndefValue::get(Ty); 4544 else if (Builtin->LLVMIntrinsic != 0) { 4545 // Emit set FPMR for intrinsics that require it 4546 if (TypeFlags.setsFPMR()) 4547 Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), 4548 Ops.pop_back_val()); 4549 if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp) 4550 InsertExplicitZeroOperand(Builder, Ty, Ops); 4551 4552 if (TypeFlags.getMergeType() == SVETypeFlags::MergeAnyExp) 4553 InsertExplicitUndefOperand(Builder, Ty, Ops); 4554 4555 // Some ACLE builtins leave out the argument to specify the predicate 4556 // pattern, which is expected to be expanded to an SV_ALL pattern. 4557 if (TypeFlags.isAppendSVALL()) 4558 Ops.push_back(Builder.getInt32(/*SV_ALL*/ 31)); 4559 if (TypeFlags.isInsertOp1SVALL()) 4560 Ops.insert(&Ops[1], Builder.getInt32(/*SV_ALL*/ 31)); 4561 4562 // Predicates must match the main datatype. 4563 for (Value *&Op : Ops) 4564 if (auto PredTy = dyn_cast<llvm::VectorType>(Op->getType())) 4565 if (PredTy->getElementType()->isIntegerTy(1)) 4566 Op = EmitSVEPredicateCast(Op, getSVEType(TypeFlags)); 4567 4568 // Splat scalar operand to vector (intrinsics with _n infix) 4569 if (TypeFlags.hasSplatOperand()) { 4570 unsigned OpNo = TypeFlags.getSplatOperand(); 4571 Ops[OpNo] = EmitSVEDupX(Ops[OpNo]); 4572 } 4573 4574 if (TypeFlags.isReverseCompare()) 4575 std::swap(Ops[1], Ops[2]); 4576 else if (TypeFlags.isReverseUSDOT()) 4577 std::swap(Ops[1], Ops[2]); 4578 else if (TypeFlags.isReverseMergeAnyBinOp() && 4579 TypeFlags.getMergeType() == SVETypeFlags::MergeAny) 4580 std::swap(Ops[1], Ops[2]); 4581 else if (TypeFlags.isReverseMergeAnyAccOp() && 4582 TypeFlags.getMergeType() == SVETypeFlags::MergeAny) 4583 std::swap(Ops[1], Ops[3]); 4584 4585 // Predicated intrinsics with _z suffix need a select w/ zeroinitializer. 4586 if (TypeFlags.getMergeType() == SVETypeFlags::MergeZero) { 4587 llvm::Type *OpndTy = Ops[1]->getType(); 4588 auto *SplatZero = Constant::getNullValue(OpndTy); 4589 Ops[1] = Builder.CreateSelect(Ops[0], Ops[1], SplatZero); 4590 } 4591 4592 Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic, 4593 getSVEOverloadTypes(TypeFlags, Ty, Ops)); 4594 Value *Call = Builder.CreateCall(F, Ops); 4595 4596 if (Call->getType() == Ty) 4597 return Call; 4598 4599 // Predicate results must be converted to svbool_t. 
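    // e.g. an svwhilelt over 32-bit elements yields <vscale x 4 x i1>, which
    // is widened back to the <vscale x 16 x i1> svbool_t representation.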
4600 if (auto PredTy = dyn_cast<llvm::ScalableVectorType>(Ty)) 4601 return EmitSVEPredicateCast(Call, PredTy); 4602 if (auto PredTupleTy = dyn_cast<llvm::StructType>(Ty)) 4603 return EmitSVEPredicateTupleCast(Call, PredTupleTy); 4604 4605 llvm_unreachable("unsupported element count!"); 4606 } 4607 4608 switch (BuiltinID) { 4609 default: 4610 return nullptr; 4611 4612 case SVE::BI__builtin_sve_svreinterpret_b: { 4613 auto SVCountTy = 4614 llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); 4615 Function *CastFromSVCountF = 4616 CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy); 4617 return Builder.CreateCall(CastFromSVCountF, Ops[0]); 4618 } 4619 case SVE::BI__builtin_sve_svreinterpret_c: { 4620 auto SVCountTy = 4621 llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); 4622 Function *CastToSVCountF = 4623 CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy); 4624 return Builder.CreateCall(CastToSVCountF, Ops[0]); 4625 } 4626 4627 case SVE::BI__builtin_sve_svpsel_lane_b8: 4628 case SVE::BI__builtin_sve_svpsel_lane_b16: 4629 case SVE::BI__builtin_sve_svpsel_lane_b32: 4630 case SVE::BI__builtin_sve_svpsel_lane_b64: 4631 case SVE::BI__builtin_sve_svpsel_lane_c8: 4632 case SVE::BI__builtin_sve_svpsel_lane_c16: 4633 case SVE::BI__builtin_sve_svpsel_lane_c32: 4634 case SVE::BI__builtin_sve_svpsel_lane_c64: { 4635 bool IsSVCount = isa<TargetExtType>(Ops[0]->getType()); 4636 assert(((!IsSVCount || cast<TargetExtType>(Ops[0]->getType())->getName() == 4637 "aarch64.svcount")) && 4638 "Unexpected TargetExtType"); 4639 auto SVCountTy = 4640 llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); 4641 Function *CastFromSVCountF = 4642 CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy); 4643 Function *CastToSVCountF = 4644 CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy); 4645 4646 auto OverloadedTy = getSVEType(SVETypeFlags(Builtin->TypeModifier)); 4647 Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_psel, OverloadedTy); 4648 llvm::Value *Ops0 = 4649 IsSVCount ? Builder.CreateCall(CastFromSVCountF, Ops[0]) : Ops[0]; 4650 llvm::Value *Ops1 = EmitSVEPredicateCast(Ops[1], OverloadedTy); 4651 llvm::Value *PSel = Builder.CreateCall(F, {Ops0, Ops1, Ops[2]}); 4652 return IsSVCount ? 
Builder.CreateCall(CastToSVCountF, PSel) : PSel;
4653   }
4654   case SVE::BI__builtin_sve_svmov_b_z: {
4655     // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op)
4656     SVETypeFlags TypeFlags(Builtin->TypeModifier);
4657     llvm::Type* OverloadedTy = getSVEType(TypeFlags);
4658     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_and_z, OverloadedTy);
4659     return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[1]});
4660   }
4661
4662   case SVE::BI__builtin_sve_svnot_b_z: {
4663     // svnot_b_z(pg, op) <=> sveor_b_z(pg, op, pg)
4664     SVETypeFlags TypeFlags(Builtin->TypeModifier);
4665     llvm::Type* OverloadedTy = getSVEType(TypeFlags);
4666     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_eor_z, OverloadedTy);
4667     return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[0]});
4668   }
4669
4670   case SVE::BI__builtin_sve_svmovlb_u16:
4671   case SVE::BI__builtin_sve_svmovlb_u32:
4672   case SVE::BI__builtin_sve_svmovlb_u64:
4673     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllb);
4674
4675   case SVE::BI__builtin_sve_svmovlb_s16:
4676   case SVE::BI__builtin_sve_svmovlb_s32:
4677   case SVE::BI__builtin_sve_svmovlb_s64:
4678     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllb);
4679
4680   case SVE::BI__builtin_sve_svmovlt_u16:
4681   case SVE::BI__builtin_sve_svmovlt_u32:
4682   case SVE::BI__builtin_sve_svmovlt_u64:
4683     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllt);
4684
4685   case SVE::BI__builtin_sve_svmovlt_s16:
4686   case SVE::BI__builtin_sve_svmovlt_s32:
4687   case SVE::BI__builtin_sve_svmovlt_s64:
4688     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllt);
4689
4690   case SVE::BI__builtin_sve_svpmullt_u16:
4691   case SVE::BI__builtin_sve_svpmullt_u64:
4692   case SVE::BI__builtin_sve_svpmullt_n_u16:
4693   case SVE::BI__builtin_sve_svpmullt_n_u64:
4694     return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullt_pair);
4695
4696   case SVE::BI__builtin_sve_svpmullb_u16:
4697   case SVE::BI__builtin_sve_svpmullb_u64:
4698   case SVE::BI__builtin_sve_svpmullb_n_u16:
4699   case SVE::BI__builtin_sve_svpmullb_n_u64:
4700     return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullb_pair);
4701
4702   case SVE::BI__builtin_sve_svdup_n_b8:
4703   case SVE::BI__builtin_sve_svdup_n_b16:
4704   case SVE::BI__builtin_sve_svdup_n_b32:
4705   case SVE::BI__builtin_sve_svdup_n_b64: {
4706     Value *CmpNE =
4707         Builder.CreateICmpNE(Ops[0], Constant::getNullValue(Ops[0]->getType()));
4708     llvm::ScalableVectorType *OverloadedTy = getSVEType(TypeFlags);
4709     Value *Dup = EmitSVEDupX(CmpNE, OverloadedTy);
4710     return EmitSVEPredicateCast(Dup, cast<llvm::ScalableVectorType>(Ty));
4711   }
4712
4713   case SVE::BI__builtin_sve_svdupq_n_b8:
4714   case SVE::BI__builtin_sve_svdupq_n_b16:
4715   case SVE::BI__builtin_sve_svdupq_n_b32:
4716   case SVE::BI__builtin_sve_svdupq_n_b64:
4717   case SVE::BI__builtin_sve_svdupq_n_u8:
4718   case SVE::BI__builtin_sve_svdupq_n_s8:
4719   case SVE::BI__builtin_sve_svdupq_n_u64:
4720   case SVE::BI__builtin_sve_svdupq_n_f64:
4721   case SVE::BI__builtin_sve_svdupq_n_s64:
4722   case SVE::BI__builtin_sve_svdupq_n_u16:
4723   case SVE::BI__builtin_sve_svdupq_n_f16:
4724   case SVE::BI__builtin_sve_svdupq_n_bf16:
4725   case SVE::BI__builtin_sve_svdupq_n_s16:
4726   case SVE::BI__builtin_sve_svdupq_n_u32:
4727   case SVE::BI__builtin_sve_svdupq_n_f32:
4728   case SVE::BI__builtin_sve_svdupq_n_s32: {
4729     // These builtins are implemented by building a 128-bit vector from the
4730     // scalar arguments and replicating it across the scalable vector.
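    // e.g. svdupq_n_s32(a, b, c, d) builds a <4 x i32>, inserts it into the
    // low 128 bits of an <vscale x 4 x i32>, and replicates it via
    // @llvm.aarch64.sve.dupq.lane.nxv4i32 with index 0.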
4731     unsigned NumOpnds = Ops.size();
4732
4733     bool IsBoolTy =
4734         cast<llvm::VectorType>(Ty)->getElementType()->isIntegerTy(1);
4735
4736     // For svdupq_n_b* the element type is an integer of width 128/NumOpnds,
4737     // so that the compare can use the width that is natural for the expected
4738     // number of predicate lanes.
4739     llvm::Type *EltTy = Ops[0]->getType();
4740     if (IsBoolTy)
4741       EltTy = IntegerType::get(getLLVMContext(), SVEBitsPerBlock / NumOpnds);
4742
4743     SmallVector<llvm::Value *, 16> VecOps;
4744     for (unsigned I = 0; I < NumOpnds; ++I)
4745       VecOps.push_back(Builder.CreateZExt(Ops[I], EltTy));
4746     Value *Vec = BuildVector(VecOps);
4747
4748     llvm::Type *OverloadedTy = getSVEVectorForElementType(EltTy);
4749     Value *InsertSubVec = Builder.CreateInsertVector(
4750         OverloadedTy, PoisonValue::get(OverloadedTy), Vec, uint64_t(0));
4751
4752     Function *F =
4753         CGM.getIntrinsic(Intrinsic::aarch64_sve_dupq_lane, OverloadedTy);
4754     Value *DupQLane =
4755         Builder.CreateCall(F, {InsertSubVec, Builder.getInt64(0)});
4756
4757     if (!IsBoolTy)
4758       return DupQLane;
4759
4760     SVETypeFlags TypeFlags(Builtin->TypeModifier);
4761     Value *Pred = EmitSVEAllTruePred(TypeFlags);
4762
4763     // For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
4764     F = CGM.getIntrinsic(NumOpnds == 2 ? Intrinsic::aarch64_sve_cmpne
4765                                        : Intrinsic::aarch64_sve_cmpne_wide,
4766                          OverloadedTy);
4767     Value *Call = Builder.CreateCall(
4768         F, {Pred, DupQLane, EmitSVEDupX(Builder.getInt64(0))});
4769     return EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
4770   }
4771
4772   case SVE::BI__builtin_sve_svpfalse_b:
4773     return ConstantInt::getFalse(Ty);
4774
4775   case SVE::BI__builtin_sve_svpfalse_c: {
4776     auto SVBoolTy = ScalableVectorType::get(Builder.getInt1Ty(), 16);
4777     Function *CastToSVCountF =
4778         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, Ty);
4779     return Builder.CreateCall(CastToSVCountF, ConstantInt::getFalse(SVBoolTy));
4780   }
4781
4782   case SVE::BI__builtin_sve_svlen_bf16:
4783   case SVE::BI__builtin_sve_svlen_f16:
4784   case SVE::BI__builtin_sve_svlen_f32:
4785   case SVE::BI__builtin_sve_svlen_f64:
4786   case SVE::BI__builtin_sve_svlen_s8:
4787   case SVE::BI__builtin_sve_svlen_s16:
4788   case SVE::BI__builtin_sve_svlen_s32:
4789   case SVE::BI__builtin_sve_svlen_s64:
4790   case SVE::BI__builtin_sve_svlen_u8:
4791   case SVE::BI__builtin_sve_svlen_u16:
4792   case SVE::BI__builtin_sve_svlen_u32:
4793   case SVE::BI__builtin_sve_svlen_u64: {
4794     SVETypeFlags TF(Builtin->TypeModifier);
4795     return Builder.CreateElementCount(Ty, getSVEType(TF)->getElementCount());
4796   }
4797
4798   case SVE::BI__builtin_sve_svtbl2_u8:
4799   case SVE::BI__builtin_sve_svtbl2_s8:
4800   case SVE::BI__builtin_sve_svtbl2_u16:
4801   case SVE::BI__builtin_sve_svtbl2_s16:
4802   case SVE::BI__builtin_sve_svtbl2_u32:
4803   case SVE::BI__builtin_sve_svtbl2_s32:
4804   case SVE::BI__builtin_sve_svtbl2_u64:
4805   case SVE::BI__builtin_sve_svtbl2_s64:
4806   case SVE::BI__builtin_sve_svtbl2_f16:
4807   case SVE::BI__builtin_sve_svtbl2_bf16:
4808   case SVE::BI__builtin_sve_svtbl2_f32:
4809   case SVE::BI__builtin_sve_svtbl2_f64: {
4810     SVETypeFlags TF(Builtin->TypeModifier);
4811     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, getSVEType(TF));
4812     return Builder.CreateCall(F, Ops);
4813   }
4814
4815   case SVE::BI__builtin_sve_svset_neonq_s8:
4816   case SVE::BI__builtin_sve_svset_neonq_s16:
4817   case SVE::BI__builtin_sve_svset_neonq_s32:
4818   case SVE::BI__builtin_sve_svset_neonq_s64:
4819   case
SVE::BI__builtin_sve_svset_neonq_u8: 4820 case SVE::BI__builtin_sve_svset_neonq_u16: 4821 case SVE::BI__builtin_sve_svset_neonq_u32: 4822 case SVE::BI__builtin_sve_svset_neonq_u64: 4823 case SVE::BI__builtin_sve_svset_neonq_f16: 4824 case SVE::BI__builtin_sve_svset_neonq_f32: 4825 case SVE::BI__builtin_sve_svset_neonq_f64: 4826 case SVE::BI__builtin_sve_svset_neonq_bf16: { 4827 return Builder.CreateInsertVector(Ty, Ops[0], Ops[1], uint64_t(0)); 4828 } 4829 4830 case SVE::BI__builtin_sve_svget_neonq_s8: 4831 case SVE::BI__builtin_sve_svget_neonq_s16: 4832 case SVE::BI__builtin_sve_svget_neonq_s32: 4833 case SVE::BI__builtin_sve_svget_neonq_s64: 4834 case SVE::BI__builtin_sve_svget_neonq_u8: 4835 case SVE::BI__builtin_sve_svget_neonq_u16: 4836 case SVE::BI__builtin_sve_svget_neonq_u32: 4837 case SVE::BI__builtin_sve_svget_neonq_u64: 4838 case SVE::BI__builtin_sve_svget_neonq_f16: 4839 case SVE::BI__builtin_sve_svget_neonq_f32: 4840 case SVE::BI__builtin_sve_svget_neonq_f64: 4841 case SVE::BI__builtin_sve_svget_neonq_bf16: { 4842 return Builder.CreateExtractVector(Ty, Ops[0], uint64_t(0)); 4843 } 4844 4845 case SVE::BI__builtin_sve_svdup_neonq_s8: 4846 case SVE::BI__builtin_sve_svdup_neonq_s16: 4847 case SVE::BI__builtin_sve_svdup_neonq_s32: 4848 case SVE::BI__builtin_sve_svdup_neonq_s64: 4849 case SVE::BI__builtin_sve_svdup_neonq_u8: 4850 case SVE::BI__builtin_sve_svdup_neonq_u16: 4851 case SVE::BI__builtin_sve_svdup_neonq_u32: 4852 case SVE::BI__builtin_sve_svdup_neonq_u64: 4853 case SVE::BI__builtin_sve_svdup_neonq_f16: 4854 case SVE::BI__builtin_sve_svdup_neonq_f32: 4855 case SVE::BI__builtin_sve_svdup_neonq_f64: 4856 case SVE::BI__builtin_sve_svdup_neonq_bf16: { 4857 Value *Insert = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0], 4858 uint64_t(0)); 4859 return Builder.CreateIntrinsic(Intrinsic::aarch64_sve_dupq_lane, {Ty}, 4860 {Insert, Builder.getInt64(0)}); 4861 } 4862 } 4863 4864 /// Should not happen 4865 return nullptr; 4866 } 4867 4868 static void swapCommutativeSMEOperands(unsigned BuiltinID, 4869 SmallVectorImpl<Value *> &Ops) { 4870 unsigned MultiVec; 4871 switch (BuiltinID) { 4872 default: 4873 return; 4874 case SME::BI__builtin_sme_svsumla_za32_s8_vg4x1: 4875 MultiVec = 1; 4876 break; 4877 case SME::BI__builtin_sme_svsumla_za32_s8_vg4x2: 4878 case SME::BI__builtin_sme_svsudot_za32_s8_vg1x2: 4879 MultiVec = 2; 4880 break; 4881 case SME::BI__builtin_sme_svsudot_za32_s8_vg1x4: 4882 case SME::BI__builtin_sme_svsumla_za32_s8_vg4x4: 4883 MultiVec = 4; 4884 break; 4885 } 4886 4887 if (MultiVec > 0) 4888 for (unsigned I = 0; I < MultiVec; ++I) 4889 std::swap(Ops[I + 1], Ops[I + 1 + MultiVec]); 4890 } 4891 4892 Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, 4893 const CallExpr *E) { 4894 auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID, 4895 AArch64SMEIntrinsicsProvenSorted); 4896 4897 llvm::SmallVector<Value *, 4> Ops; 4898 SVETypeFlags TypeFlags(Builtin->TypeModifier); 4899 GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags); 4900 4901 if (TypeFlags.isLoad() || TypeFlags.isStore()) 4902 return EmitSMELd1St1(TypeFlags, Ops, Builtin->LLVMIntrinsic); 4903 else if (TypeFlags.isReadZA() || TypeFlags.isWriteZA()) 4904 return EmitSMEReadWrite(TypeFlags, Ops, Builtin->LLVMIntrinsic); 4905 else if (BuiltinID == SME::BI__builtin_sme_svzero_mask_za || 4906 BuiltinID == SME::BI__builtin_sme_svzero_za) 4907 return EmitSMEZero(TypeFlags, Ops, Builtin->LLVMIntrinsic); 4908 else if (BuiltinID == 
SME::BI__builtin_sme_svldr_vnum_za || 4909 BuiltinID == SME::BI__builtin_sme_svstr_vnum_za || 4910 BuiltinID == SME::BI__builtin_sme_svldr_za || 4911 BuiltinID == SME::BI__builtin_sme_svstr_za) 4912 return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic); 4913 4914 // Emit set FPMR for intrinsics that require it 4915 if (TypeFlags.setsFPMR()) 4916 Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), 4917 Ops.pop_back_val()); 4918 // Handle builtins which require their multi-vector operands to be swapped 4919 swapCommutativeSMEOperands(BuiltinID, Ops); 4920 4921 // Should not happen! 4922 if (Builtin->LLVMIntrinsic == 0) 4923 return nullptr; 4924 4925 if (BuiltinID == SME::BI__builtin_sme___arm_in_streaming_mode) { 4926 // If we already know the streaming mode, don't bother with the intrinsic 4927 // and emit a constant instead 4928 const auto *FD = cast<FunctionDecl>(CurFuncDecl); 4929 if (const auto *FPT = FD->getType()->getAs<FunctionProtoType>()) { 4930 unsigned SMEAttrs = FPT->getAArch64SMEAttributes(); 4931 if (!(SMEAttrs & FunctionType::SME_PStateSMCompatibleMask)) { 4932 bool IsStreaming = SMEAttrs & FunctionType::SME_PStateSMEnabledMask; 4933 return ConstantInt::getBool(Builder.getContext(), IsStreaming); 4934 } 4935 } 4936 } 4937 4938 // Predicates must match the main datatype. 4939 for (Value *&Op : Ops) 4940 if (auto PredTy = dyn_cast<llvm::VectorType>(Op->getType())) 4941 if (PredTy->getElementType()->isIntegerTy(1)) 4942 Op = EmitSVEPredicateCast(Op, getSVEType(TypeFlags)); 4943 4944 Function *F = 4945 TypeFlags.isOverloadNone() 4946 ? CGM.getIntrinsic(Builtin->LLVMIntrinsic) 4947 : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)}); 4948 4949 return Builder.CreateCall(F, Ops); 4950 } 4951 4952 /// Helper for the read/write/add/inc X18 builtins: read the X18 register and 4953 /// return it as an i8 pointer. 
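/// The read is emitted as, e.g.,
///   %x18 = call i64 @llvm.read_register.i64(metadata !"x18")
/// followed by an inttoptr to the i8 pointer type.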
4954 Value *readX18AsPtr(CodeGenFunction &CGF) { 4955 LLVMContext &Context = CGF.CGM.getLLVMContext(); 4956 llvm::Metadata *Ops[] = {llvm::MDString::get(Context, "x18")}; 4957 llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops); 4958 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName); 4959 llvm::Function *F = 4960 CGF.CGM.getIntrinsic(Intrinsic::read_register, {CGF.Int64Ty}); 4961 llvm::Value *X18 = CGF.Builder.CreateCall(F, Metadata); 4962 return CGF.Builder.CreateIntToPtr(X18, CGF.Int8PtrTy); 4963 } 4964 4965 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, 4966 const CallExpr *E, 4967 llvm::Triple::ArchType Arch) { 4968 if (BuiltinID >= clang::AArch64::FirstSVEBuiltin && 4969 BuiltinID <= clang::AArch64::LastSVEBuiltin) 4970 return EmitAArch64SVEBuiltinExpr(BuiltinID, E); 4971 4972 if (BuiltinID >= clang::AArch64::FirstSMEBuiltin && 4973 BuiltinID <= clang::AArch64::LastSMEBuiltin) 4974 return EmitAArch64SMEBuiltinExpr(BuiltinID, E); 4975 4976 if (BuiltinID == Builtin::BI__builtin_cpu_supports) 4977 return EmitAArch64CpuSupports(E); 4978 4979 unsigned HintID = static_cast<unsigned>(-1); 4980 switch (BuiltinID) { 4981 default: break; 4982 case clang::AArch64::BI__builtin_arm_nop: 4983 HintID = 0; 4984 break; 4985 case clang::AArch64::BI__builtin_arm_yield: 4986 case clang::AArch64::BI__yield: 4987 HintID = 1; 4988 break; 4989 case clang::AArch64::BI__builtin_arm_wfe: 4990 case clang::AArch64::BI__wfe: 4991 HintID = 2; 4992 break; 4993 case clang::AArch64::BI__builtin_arm_wfi: 4994 case clang::AArch64::BI__wfi: 4995 HintID = 3; 4996 break; 4997 case clang::AArch64::BI__builtin_arm_sev: 4998 case clang::AArch64::BI__sev: 4999 HintID = 4; 5000 break; 5001 case clang::AArch64::BI__builtin_arm_sevl: 5002 case clang::AArch64::BI__sevl: 5003 HintID = 5; 5004 break; 5005 } 5006 5007 if (HintID != static_cast<unsigned>(-1)) { 5008 Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hint); 5009 return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID)); 5010 } 5011 5012 if (BuiltinID == clang::AArch64::BI__builtin_arm_trap) { 5013 Function *F = CGM.getIntrinsic(Intrinsic::aarch64_break); 5014 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5015 return Builder.CreateCall(F, Builder.CreateZExt(Arg, CGM.Int32Ty)); 5016 } 5017 5018 if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) { 5019 // Create call to __arm_sme_state and store the results to the two pointers. 
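    // __arm_sme_state is an SME ABI support routine; mark the call
    // streaming-compatible and use its register-preserving calling convention.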
5020 CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction( 5021 llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {}, 5022 false), 5023 "__arm_sme_state")); 5024 auto Attrs = AttributeList().addFnAttribute(getLLVMContext(), 5025 "aarch64_pstate_sm_compatible"); 5026 CI->setAttributes(Attrs); 5027 CI->setCallingConv( 5028 llvm::CallingConv:: 5029 AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2); 5030 Builder.CreateStore(Builder.CreateExtractValue(CI, 0), 5031 EmitPointerWithAlignment(E->getArg(0))); 5032 return Builder.CreateStore(Builder.CreateExtractValue(CI, 1), 5033 EmitPointerWithAlignment(E->getArg(1))); 5034 } 5035 5036 if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) { 5037 assert((getContext().getTypeSize(E->getType()) == 32) && 5038 "rbit of unusual size!"); 5039 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5040 return Builder.CreateCall( 5041 CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit"); 5042 } 5043 if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit64) { 5044 assert((getContext().getTypeSize(E->getType()) == 64) && 5045 "rbit of unusual size!"); 5046 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5047 return Builder.CreateCall( 5048 CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit"); 5049 } 5050 5051 if (BuiltinID == clang::AArch64::BI__builtin_arm_clz || 5052 BuiltinID == clang::AArch64::BI__builtin_arm_clz64) { 5053 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5054 Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Arg->getType()); 5055 Value *Res = Builder.CreateCall(F, {Arg, Builder.getInt1(false)}); 5056 if (BuiltinID == clang::AArch64::BI__builtin_arm_clz64) 5057 Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); 5058 return Res; 5059 } 5060 5061 if (BuiltinID == clang::AArch64::BI__builtin_arm_cls) { 5062 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5063 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls), Arg, 5064 "cls"); 5065 } 5066 if (BuiltinID == clang::AArch64::BI__builtin_arm_cls64) { 5067 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5068 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls64), Arg, 5069 "cls"); 5070 } 5071 5072 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32zf || 5073 BuiltinID == clang::AArch64::BI__builtin_arm_rint32z) { 5074 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5075 llvm::Type *Ty = Arg->getType(); 5076 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32z, Ty), 5077 Arg, "frint32z"); 5078 } 5079 5080 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64zf || 5081 BuiltinID == clang::AArch64::BI__builtin_arm_rint64z) { 5082 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5083 llvm::Type *Ty = Arg->getType(); 5084 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64z, Ty), 5085 Arg, "frint64z"); 5086 } 5087 5088 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32xf || 5089 BuiltinID == clang::AArch64::BI__builtin_arm_rint32x) { 5090 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5091 llvm::Type *Ty = Arg->getType(); 5092 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32x, Ty), 5093 Arg, "frint32x"); 5094 } 5095 5096 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64xf || 5097 BuiltinID == clang::AArch64::BI__builtin_arm_rint64x) { 5098 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5099 llvm::Type *Ty = Arg->getType(); 5100 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64x, Ty), 5101 Arg, "frint64x"); 5102 } 5103 
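  // __builtin_arm_jcvt (the ARMv8.3 JavaScript conversion) lowers to the
  // FJCVTZS instruction via the aarch64_fjcvtzs intrinsic below.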
5104 if (BuiltinID == clang::AArch64::BI__builtin_arm_jcvt) { 5105 assert((getContext().getTypeSize(E->getType()) == 32) && 5106 "__jcvt of unusual size!"); 5107 llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); 5108 return Builder.CreateCall( 5109 CGM.getIntrinsic(Intrinsic::aarch64_fjcvtzs), Arg); 5110 } 5111 5112 if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b || 5113 BuiltinID == clang::AArch64::BI__builtin_arm_st64b || 5114 BuiltinID == clang::AArch64::BI__builtin_arm_st64bv || 5115 BuiltinID == clang::AArch64::BI__builtin_arm_st64bv0) { 5116 llvm::Value *MemAddr = EmitScalarExpr(E->getArg(0)); 5117 llvm::Value *ValPtr = EmitScalarExpr(E->getArg(1)); 5118 5119 if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b) { 5120 // Load from the address via an LLVM intrinsic, receiving a 5121 // tuple of 8 i64 words, and store each one to ValPtr. 5122 Function *F = CGM.getIntrinsic(Intrinsic::aarch64_ld64b); 5123 llvm::Value *Val = Builder.CreateCall(F, MemAddr); 5124 llvm::Value *ToRet; 5125 for (size_t i = 0; i < 8; i++) { 5126 llvm::Value *ValOffsetPtr = 5127 Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i)); 5128 Address Addr = 5129 Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(8)); 5130 ToRet = Builder.CreateStore(Builder.CreateExtractValue(Val, i), Addr); 5131 } 5132 return ToRet; 5133 } else { 5134 // Load 8 i64 words from ValPtr, and store them to the address 5135 // via an LLVM intrinsic. 5136 SmallVector<llvm::Value *, 9> Args; 5137 Args.push_back(MemAddr); 5138 for (size_t i = 0; i < 8; i++) { 5139 llvm::Value *ValOffsetPtr = 5140 Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i)); 5141 Address Addr = 5142 Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(8)); 5143 Args.push_back(Builder.CreateLoad(Addr)); 5144 } 5145 5146 auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_st64b 5147 ? Intrinsic::aarch64_st64b 5148 : BuiltinID == clang::AArch64::BI__builtin_arm_st64bv 5149 ? Intrinsic::aarch64_st64bv 5150 : Intrinsic::aarch64_st64bv0); 5151 Function *F = CGM.getIntrinsic(Intr); 5152 return Builder.CreateCall(F, Args); 5153 } 5154 } 5155 5156 if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr || 5157 BuiltinID == clang::AArch64::BI__builtin_arm_rndrrs) { 5158 5159 auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_rndr 5160 ? 
Intrinsic::aarch64_rndr 5161 : Intrinsic::aarch64_rndrrs); 5162 Function *F = CGM.getIntrinsic(Intr); 5163 llvm::Value *Val = Builder.CreateCall(F); 5164 Value *RandomValue = Builder.CreateExtractValue(Val, 0); 5165 Value *Status = Builder.CreateExtractValue(Val, 1); 5166 5167 Address MemAddress = EmitPointerWithAlignment(E->getArg(0)); 5168 Builder.CreateStore(RandomValue, MemAddress); 5169 Status = Builder.CreateZExt(Status, Int32Ty); 5170 return Status; 5171 } 5172 5173 if (BuiltinID == clang::AArch64::BI__clear_cache) { 5174 assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments"); 5175 const FunctionDecl *FD = E->getDirectCallee(); 5176 Value *Ops[2]; 5177 for (unsigned i = 0; i < 2; i++) 5178 Ops[i] = EmitScalarExpr(E->getArg(i)); 5179 llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType()); 5180 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty); 5181 StringRef Name = FD->getName(); 5182 return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops); 5183 } 5184 5185 if ((BuiltinID == clang::AArch64::BI__builtin_arm_ldrex || 5186 BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) && 5187 getContext().getTypeSize(E->getType()) == 128) { 5188 Function *F = 5189 CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex 5190 ? Intrinsic::aarch64_ldaxp 5191 : Intrinsic::aarch64_ldxp); 5192 5193 Value *LdPtr = EmitScalarExpr(E->getArg(0)); 5194 Value *Val = Builder.CreateCall(F, LdPtr, "ldxp"); 5195 5196 Value *Val0 = Builder.CreateExtractValue(Val, 1); 5197 Value *Val1 = Builder.CreateExtractValue(Val, 0); 5198 llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128); 5199 Val0 = Builder.CreateZExt(Val0, Int128Ty); 5200 Val1 = Builder.CreateZExt(Val1, Int128Ty); 5201 5202 Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64); 5203 Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */); 5204 Val = Builder.CreateOr(Val, Val1); 5205 return Builder.CreateBitCast(Val, ConvertType(E->getType())); 5206 } else if (BuiltinID == clang::AArch64::BI__builtin_arm_ldrex || 5207 BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) { 5208 Value *LoadAddr = EmitScalarExpr(E->getArg(0)); 5209 5210 QualType Ty = E->getType(); 5211 llvm::Type *RealResTy = ConvertType(Ty); 5212 llvm::Type *IntTy = 5213 llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty)); 5214 5215 Function *F = 5216 CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex 5217 ? Intrinsic::aarch64_ldaxr 5218 : Intrinsic::aarch64_ldxr, 5219 UnqualPtrTy); 5220 CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldxr"); 5221 Val->addParamAttr( 5222 0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy)); 5223 5224 if (RealResTy->isPointerTy()) 5225 return Builder.CreateIntToPtr(Val, RealResTy); 5226 5227 llvm::Type *IntResTy = llvm::IntegerType::get( 5228 getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy)); 5229 return Builder.CreateBitCast(Builder.CreateTruncOrBitCast(Val, IntResTy), 5230 RealResTy); 5231 } 5232 5233 if ((BuiltinID == clang::AArch64::BI__builtin_arm_strex || 5234 BuiltinID == clang::AArch64::BI__builtin_arm_stlex) && 5235 getContext().getTypeSize(E->getArg(0)->getType()) == 128) { 5236 Function *F = 5237 CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_stlex 5238 ? 
Intrinsic::aarch64_stlxp 5239 : Intrinsic::aarch64_stxp); 5240 llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty); 5241 5242 Address Tmp = CreateMemTemp(E->getArg(0)->getType()); 5243 EmitAnyExprToMem(E->getArg(0), Tmp, Qualifiers(), /*init*/ true); 5244 5245 Tmp = Tmp.withElementType(STy); 5246 llvm::Value *Val = Builder.CreateLoad(Tmp); 5247 5248 Value *Arg0 = Builder.CreateExtractValue(Val, 0); 5249 Value *Arg1 = Builder.CreateExtractValue(Val, 1); 5250 Value *StPtr = EmitScalarExpr(E->getArg(1)); 5251 return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "stxp"); 5252 } 5253 5254 if (BuiltinID == clang::AArch64::BI__builtin_arm_strex || 5255 BuiltinID == clang::AArch64::BI__builtin_arm_stlex) { 5256 Value *StoreVal = EmitScalarExpr(E->getArg(0)); 5257 Value *StoreAddr = EmitScalarExpr(E->getArg(1)); 5258 5259 QualType Ty = E->getArg(0)->getType(); 5260 llvm::Type *StoreTy = 5261 llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty)); 5262 5263 if (StoreVal->getType()->isPointerTy()) 5264 StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty); 5265 else { 5266 llvm::Type *IntTy = llvm::IntegerType::get( 5267 getLLVMContext(), 5268 CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType())); 5269 StoreVal = Builder.CreateBitCast(StoreVal, IntTy); 5270 StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty); 5271 } 5272 5273 Function *F = 5274 CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_stlex 5275 ? Intrinsic::aarch64_stlxr 5276 : Intrinsic::aarch64_stxr, 5277 StoreAddr->getType()); 5278 CallInst *CI = Builder.CreateCall(F, {StoreVal, StoreAddr}, "stxr"); 5279 CI->addParamAttr( 5280 1, Attribute::get(getLLVMContext(), Attribute::ElementType, StoreTy)); 5281 return CI; 5282 } 5283 5284 if (BuiltinID == clang::AArch64::BI__getReg) { 5285 Expr::EvalResult Result; 5286 if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext())) 5287 llvm_unreachable("Sema will ensure that the parameter is constant"); 5288 5289 llvm::APSInt Value = Result.Val.getInt(); 5290 LLVMContext &Context = CGM.getLLVMContext(); 5291 std::string Reg = Value == 31 ? 
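// A register number of 31 names the stack pointer here, so it maps to
// "sp"; any other constant selects the corresponding "xN" register.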
"sp" : "x" + toString(Value, 10); 5292 5293 llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Reg)}; 5294 llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops); 5295 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName); 5296 5297 llvm::Function *F = 5298 CGM.getIntrinsic(Intrinsic::read_register, {Int64Ty}); 5299 return Builder.CreateCall(F, Metadata); 5300 } 5301 5302 if (BuiltinID == clang::AArch64::BI__break) { 5303 Expr::EvalResult Result; 5304 if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext())) 5305 llvm_unreachable("Sema will ensure that the parameter is constant"); 5306 5307 llvm::Function *F = CGM.getIntrinsic(Intrinsic::aarch64_break); 5308 return Builder.CreateCall(F, {EmitScalarExpr(E->getArg(0))}); 5309 } 5310 5311 if (BuiltinID == clang::AArch64::BI__builtin_arm_clrex) { 5312 Function *F = CGM.getIntrinsic(Intrinsic::aarch64_clrex); 5313 return Builder.CreateCall(F); 5314 } 5315 5316 if (BuiltinID == clang::AArch64::BI_ReadWriteBarrier) 5317 return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, 5318 llvm::SyncScope::SingleThread); 5319 5320 // CRC32 5321 Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic; 5322 switch (BuiltinID) { 5323 case clang::AArch64::BI__builtin_arm_crc32b: 5324 CRCIntrinsicID = Intrinsic::aarch64_crc32b; break; 5325 case clang::AArch64::BI__builtin_arm_crc32cb: 5326 CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break; 5327 case clang::AArch64::BI__builtin_arm_crc32h: 5328 CRCIntrinsicID = Intrinsic::aarch64_crc32h; break; 5329 case clang::AArch64::BI__builtin_arm_crc32ch: 5330 CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break; 5331 case clang::AArch64::BI__builtin_arm_crc32w: 5332 CRCIntrinsicID = Intrinsic::aarch64_crc32w; break; 5333 case clang::AArch64::BI__builtin_arm_crc32cw: 5334 CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break; 5335 case clang::AArch64::BI__builtin_arm_crc32d: 5336 CRCIntrinsicID = Intrinsic::aarch64_crc32x; break; 5337 case clang::AArch64::BI__builtin_arm_crc32cd: 5338 CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break; 5339 } 5340 5341 if (CRCIntrinsicID != Intrinsic::not_intrinsic) { 5342 Value *Arg0 = EmitScalarExpr(E->getArg(0)); 5343 Value *Arg1 = EmitScalarExpr(E->getArg(1)); 5344 Function *F = CGM.getIntrinsic(CRCIntrinsicID); 5345 5346 llvm::Type *DataTy = F->getFunctionType()->getParamType(1); 5347 Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy); 5348 5349 return Builder.CreateCall(F, {Arg0, Arg1}); 5350 } 5351 5352 // Memory Operations (MOPS) 5353 if (BuiltinID == AArch64::BI__builtin_arm_mops_memset_tag) { 5354 Value *Dst = EmitScalarExpr(E->getArg(0)); 5355 Value *Val = EmitScalarExpr(E->getArg(1)); 5356 Value *Size = EmitScalarExpr(E->getArg(2)); 5357 Val = Builder.CreateTrunc(Val, Int8Ty); 5358 Size = Builder.CreateIntCast(Size, Int64Ty, false); 5359 return Builder.CreateCall( 5360 CGM.getIntrinsic(Intrinsic::aarch64_mops_memset_tag), {Dst, Val, Size}); 5361 } 5362 5363 // Memory Tagging Extensions (MTE) Intrinsics 5364 Intrinsic::ID MTEIntrinsicID = Intrinsic::not_intrinsic; 5365 switch (BuiltinID) { 5366 case clang::AArch64::BI__builtin_arm_irg: 5367 MTEIntrinsicID = Intrinsic::aarch64_irg; break; 5368 case clang::AArch64::BI__builtin_arm_addg: 5369 MTEIntrinsicID = Intrinsic::aarch64_addg; break; 5370 case clang::AArch64::BI__builtin_arm_gmi: 5371 MTEIntrinsicID = Intrinsic::aarch64_gmi; break; 5372 case clang::AArch64::BI__builtin_arm_ldg: 5373 MTEIntrinsicID = Intrinsic::aarch64_ldg; break; 5374 case clang::AArch64::BI__builtin_arm_stg: 5375 
MTEIntrinsicID = Intrinsic::aarch64_stg; break; 5376 case clang::AArch64::BI__builtin_arm_subp: 5377 MTEIntrinsicID = Intrinsic::aarch64_subp; break; 5378 } 5379 5380 if (MTEIntrinsicID != Intrinsic::not_intrinsic) { 5381 if (MTEIntrinsicID == Intrinsic::aarch64_irg) { 5382 Value *Pointer = EmitScalarExpr(E->getArg(0)); 5383 Value *Mask = EmitScalarExpr(E->getArg(1)); 5384 5385 Mask = Builder.CreateZExt(Mask, Int64Ty); 5386 return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID), 5387 {Pointer, Mask}); 5388 } 5389 if (MTEIntrinsicID == Intrinsic::aarch64_addg) { 5390 Value *Pointer = EmitScalarExpr(E->getArg(0)); 5391 Value *TagOffset = EmitScalarExpr(E->getArg(1)); 5392 5393 TagOffset = Builder.CreateZExt(TagOffset, Int64Ty); 5394 return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID), 5395 {Pointer, TagOffset}); 5396 } 5397 if (MTEIntrinsicID == Intrinsic::aarch64_gmi) { 5398 Value *Pointer = EmitScalarExpr(E->getArg(0)); 5399 Value *ExcludedMask = EmitScalarExpr(E->getArg(1)); 5400 5401 ExcludedMask = Builder.CreateZExt(ExcludedMask, Int64Ty); 5402 return Builder.CreateCall( 5403 CGM.getIntrinsic(MTEIntrinsicID), {Pointer, ExcludedMask}); 5404 } 5405 // Although it is possible to supply a different return 5406 // address (first arg) to this intrinsic, for now we set the 5407 // return address to the same value as the input address. 5408 if (MTEIntrinsicID == Intrinsic::aarch64_ldg) { 5409 Value *TagAddress = EmitScalarExpr(E->getArg(0)); 5410 return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID), 5411 {TagAddress, TagAddress}); 5412 } 5413 // Although it is possible to supply a different tag (to set) 5414 // to this intrinsic (as first arg), for now we supply 5415 // the tag already present in the input address arg (the common use case). 5416 if (MTEIntrinsicID == Intrinsic::aarch64_stg) { 5417 Value *TagAddress = EmitScalarExpr(E->getArg(0)); 5418 return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID), 5419 {TagAddress, TagAddress}); 5420 } 5421 if (MTEIntrinsicID == Intrinsic::aarch64_subp) { 5422 Value *PointerA = EmitScalarExpr(E->getArg(0)); 5423 Value *PointerB = EmitScalarExpr(E->getArg(1)); 5424 return Builder.CreateCall( 5425 CGM.getIntrinsic(MTEIntrinsicID), {PointerA, PointerB}); 5426 } 5427 } 5428 5429 if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr || 5430 BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 || 5431 BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 || 5432 BuiltinID == clang::AArch64::BI__builtin_arm_rsrp || 5433 BuiltinID == clang::AArch64::BI__builtin_arm_wsr || 5434 BuiltinID == clang::AArch64::BI__builtin_arm_wsr64 || 5435 BuiltinID == clang::AArch64::BI__builtin_arm_wsr128 || 5436 BuiltinID == clang::AArch64::BI__builtin_arm_wsrp) { 5437 5438 SpecialRegisterAccessKind AccessKind = Write; 5439 if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr || 5440 BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 || 5441 BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 || 5442 BuiltinID == clang::AArch64::BI__builtin_arm_rsrp) 5443 AccessKind = VolatileRead; 5444 5445 bool IsPointerBuiltin = BuiltinID == clang::AArch64::BI__builtin_arm_rsrp || 5446 BuiltinID == clang::AArch64::BI__builtin_arm_wsrp; 5447 5448 bool Is32Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr || 5449 BuiltinID == clang::AArch64::BI__builtin_arm_wsr; 5450 5451 bool Is128Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 || 5452 BuiltinID == clang::AArch64::BI__builtin_arm_wsr128; 5453 5454 llvm::Type *ValueType; 5455 llvm::Type *RegisterType = Int64Ty; 5456 if (Is32Bit) { 5457 ValueType =
Int32Ty; 5458 } else if (Is128Bit) { 5459 llvm::Type *Int128Ty = 5460 llvm::IntegerType::getInt128Ty(CGM.getLLVMContext()); 5461 ValueType = Int128Ty; 5462 RegisterType = Int128Ty; 5463 } else if (IsPointerBuiltin) { 5464 ValueType = VoidPtrTy; 5465 } else { 5466 ValueType = Int64Ty; 5467 } 5468 5469 return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType, 5470 AccessKind); 5471 } 5472 5473 if (BuiltinID == clang::AArch64::BI_ReadStatusReg || 5474 BuiltinID == clang::AArch64::BI_WriteStatusReg || 5475 BuiltinID == clang::AArch64::BI__sys) { 5476 LLVMContext &Context = CGM.getLLVMContext(); 5477 5478 unsigned SysReg = 5479 E->getArg(0)->EvaluateKnownConstInt(getContext()).getZExtValue(); 5480 5481 std::string SysRegStr; 5482 unsigned SysRegOp0 = (BuiltinID == clang::AArch64::BI_ReadStatusReg || 5483 BuiltinID == clang::AArch64::BI_WriteStatusReg) 5484 ? ((1 << 1) | ((SysReg >> 14) & 1)) 5485 : 1; 5486 llvm::raw_string_ostream(SysRegStr) 5487 << SysRegOp0 << ":" << ((SysReg >> 11) & 7) << ":" 5488 << ((SysReg >> 7) & 15) << ":" << ((SysReg >> 3) & 15) << ":" 5489 << (SysReg & 7); 5490 5491 llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysRegStr) }; 5492 llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops); 5493 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName); 5494 5495 llvm::Type *RegisterType = Int64Ty; 5496 llvm::Type *Types[] = { RegisterType }; 5497 5498 if (BuiltinID == clang::AArch64::BI_ReadStatusReg) { 5499 llvm::Function *F = CGM.getIntrinsic(Intrinsic::read_register, Types); 5500 5501 return Builder.CreateCall(F, Metadata); 5502 } 5503 5504 llvm::Function *F = CGM.getIntrinsic(Intrinsic::write_register, Types); 5505 llvm::Value *ArgValue = EmitScalarExpr(E->getArg(1)); 5506 llvm::Value *Result = Builder.CreateCall(F, {Metadata, ArgValue}); 5507 if (BuiltinID == clang::AArch64::BI__sys) { 5508 // Return 0 for convenience, even though MSVC returns some other undefined 5509 // value.
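// (The write_register call above is still emitted for its side effect;
// only the value handed back to the caller is replaced by the constant.)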
5510 Result = ConstantInt::get(Builder.getInt32Ty(), 0); 5511 } 5512 return Result; 5513 } 5514 5515 if (BuiltinID == clang::AArch64::BI_AddressOfReturnAddress) { 5516 llvm::Function *F = 5517 CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy); 5518 return Builder.CreateCall(F); 5519 } 5520 5521 if (BuiltinID == clang::AArch64::BI__builtin_sponentry) { 5522 llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy); 5523 return Builder.CreateCall(F); 5524 } 5525 5526 if (BuiltinID == clang::AArch64::BI__mulh || 5527 BuiltinID == clang::AArch64::BI__umulh) { 5528 llvm::Type *ResType = ConvertType(E->getType()); 5529 llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128); 5530 5531 bool IsSigned = BuiltinID == clang::AArch64::BI__mulh; 5532 Value *LHS = 5533 Builder.CreateIntCast(EmitScalarExpr(E->getArg(0)), Int128Ty, IsSigned); 5534 Value *RHS = 5535 Builder.CreateIntCast(EmitScalarExpr(E->getArg(1)), Int128Ty, IsSigned); 5536 5537 Value *MulResult, *HigherBits; 5538 if (IsSigned) { 5539 MulResult = Builder.CreateNSWMul(LHS, RHS); 5540 HigherBits = Builder.CreateAShr(MulResult, 64); 5541 } else { 5542 MulResult = Builder.CreateNUWMul(LHS, RHS); 5543 HigherBits = Builder.CreateLShr(MulResult, 64); 5544 } 5545 HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned); 5546 5547 return HigherBits; 5548 } 5549 5550 if (BuiltinID == AArch64::BI__writex18byte || 5551 BuiltinID == AArch64::BI__writex18word || 5552 BuiltinID == AArch64::BI__writex18dword || 5553 BuiltinID == AArch64::BI__writex18qword) { 5554 // Process the args first 5555 Value *OffsetArg = EmitScalarExpr(E->getArg(0)); 5556 Value *DataArg = EmitScalarExpr(E->getArg(1)); 5557 5558 // Read x18 as i8* 5559 llvm::Value *X18 = readX18AsPtr(*this); 5560 5561 // Store val at x18 + offset 5562 Value *Offset = Builder.CreateZExt(OffsetArg, Int64Ty); 5563 Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset); 5564 StoreInst *Store = 5565 Builder.CreateAlignedStore(DataArg, Ptr, CharUnits::One()); 5566 return Store; 5567 } 5568 5569 if (BuiltinID == AArch64::BI__readx18byte || 5570 BuiltinID == AArch64::BI__readx18word || 5571 BuiltinID == AArch64::BI__readx18dword || 5572 BuiltinID == AArch64::BI__readx18qword) { 5573 // Process the args first 5574 Value *OffsetArg = EmitScalarExpr(E->getArg(0)); 5575 5576 // Read x18 as i8* 5577 llvm::Value *X18 = readX18AsPtr(*this); 5578 5579 // Load x18 + offset 5580 Value *Offset = Builder.CreateZExt(OffsetArg, Int64Ty); 5581 Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset); 5582 llvm::Type *IntTy = ConvertType(E->getType()); 5583 LoadInst *Load = Builder.CreateAlignedLoad(IntTy, Ptr, CharUnits::One()); 5584 return Load; 5585 } 5586 5587 if (BuiltinID == AArch64::BI__addx18byte || 5588 BuiltinID == AArch64::BI__addx18word || 5589 BuiltinID == AArch64::BI__addx18dword || 5590 BuiltinID == AArch64::BI__addx18qword || 5591 BuiltinID == AArch64::BI__incx18byte || 5592 BuiltinID == AArch64::BI__incx18word || 5593 BuiltinID == AArch64::BI__incx18dword || 5594 BuiltinID == AArch64::BI__incx18qword) { 5595 llvm::Type *IntTy; 5596 bool isIncrement; 5597 switch (BuiltinID) { 5598 case AArch64::BI__incx18byte: 5599 IntTy = Int8Ty; 5600 isIncrement = true; 5601 break; 5602 case AArch64::BI__incx18word: 5603 IntTy = Int16Ty; 5604 isIncrement = true; 5605 break; 5606 case AArch64::BI__incx18dword: 5607 IntTy = Int32Ty; 5608 isIncrement = true; 5609 break; 5610 case AArch64::BI__incx18qword: 5611 IntTy = Int64Ty; 5612 isIncrement = true; 5613 break; 5614 
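// The __addx18* forms fall through to this default case: they take the
// addend, and its width, from the second argument rather than adding a
// constant 1 as the __incx18* cases above do.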
default: 5615 IntTy = ConvertType(E->getArg(1)->getType()); 5616 isIncrement = false; 5617 break; 5618 } 5619 // Process the args first 5620 Value *OffsetArg = EmitScalarExpr(E->getArg(0)); 5621 Value *ValToAdd = 5622 isIncrement ? ConstantInt::get(IntTy, 1) : EmitScalarExpr(E->getArg(1)); 5623 5624 // Read x18 as i8* 5625 llvm::Value *X18 = readX18AsPtr(*this); 5626 5627 // Load x18 + offset 5628 Value *Offset = Builder.CreateZExt(OffsetArg, Int64Ty); 5629 Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset); 5630 LoadInst *Load = Builder.CreateAlignedLoad(IntTy, Ptr, CharUnits::One()); 5631 5632 // Add values 5633 Value *AddResult = Builder.CreateAdd(Load, ValToAdd); 5634 5635 // Store val at x18 + offset 5636 StoreInst *Store = 5637 Builder.CreateAlignedStore(AddResult, Ptr, CharUnits::One()); 5638 return Store; 5639 } 5640 5641 if (BuiltinID == AArch64::BI_CopyDoubleFromInt64 || 5642 BuiltinID == AArch64::BI_CopyFloatFromInt32 || 5643 BuiltinID == AArch64::BI_CopyInt32FromFloat || 5644 BuiltinID == AArch64::BI_CopyInt64FromDouble) { 5645 Value *Arg = EmitScalarExpr(E->getArg(0)); 5646 llvm::Type *RetTy = ConvertType(E->getType()); 5647 return Builder.CreateBitCast(Arg, RetTy); 5648 } 5649 5650 if (BuiltinID == AArch64::BI_CountLeadingOnes || 5651 BuiltinID == AArch64::BI_CountLeadingOnes64 || 5652 BuiltinID == AArch64::BI_CountLeadingZeros || 5653 BuiltinID == AArch64::BI_CountLeadingZeros64) { 5654 Value *Arg = EmitScalarExpr(E->getArg(0)); 5655 llvm::Type *ArgType = Arg->getType(); 5656 5657 if (BuiltinID == AArch64::BI_CountLeadingOnes || 5658 BuiltinID == AArch64::BI_CountLeadingOnes64) 5659 Arg = Builder.CreateXor(Arg, Constant::getAllOnesValue(ArgType)); 5660 5661 Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType); 5662 Value *Result = Builder.CreateCall(F, {Arg, Builder.getInt1(false)}); 5663 5664 if (BuiltinID == AArch64::BI_CountLeadingOnes64 || 5665 BuiltinID == AArch64::BI_CountLeadingZeros64) 5666 Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); 5667 return Result; 5668 } 5669 5670 if (BuiltinID == AArch64::BI_CountLeadingSigns || 5671 BuiltinID == AArch64::BI_CountLeadingSigns64) { 5672 Value *Arg = EmitScalarExpr(E->getArg(0)); 5673 5674 Function *F = (BuiltinID == AArch64::BI_CountLeadingSigns) 5675 ? 
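// aarch64_cls counts the leading bits that match the sign bit of a
// 32-bit input; aarch64_cls64 is the 64-bit variant.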
CGM.getIntrinsic(Intrinsic::aarch64_cls) 5676 : CGM.getIntrinsic(Intrinsic::aarch64_cls64); 5677 5678 Value *Result = Builder.CreateCall(F, Arg, "cls"); 5679 if (BuiltinID == AArch64::BI_CountLeadingSigns64) 5680 Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); 5681 return Result; 5682 } 5683 5684 if (BuiltinID == AArch64::BI_CountOneBits || 5685 BuiltinID == AArch64::BI_CountOneBits64) { 5686 Value *ArgValue = EmitScalarExpr(E->getArg(0)); 5687 llvm::Type *ArgType = ArgValue->getType(); 5688 Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType); 5689 5690 Value *Result = Builder.CreateCall(F, ArgValue); 5691 if (BuiltinID == AArch64::BI_CountOneBits64) 5692 Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); 5693 return Result; 5694 } 5695 5696 if (BuiltinID == AArch64::BI__prefetch) { 5697 Value *Address = EmitScalarExpr(E->getArg(0)); 5698 Value *RW = llvm::ConstantInt::get(Int32Ty, 0); 5699 Value *Locality = ConstantInt::get(Int32Ty, 3); 5700 Value *Data = llvm::ConstantInt::get(Int32Ty, 1); 5701 Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType()); 5702 return Builder.CreateCall(F, {Address, RW, Locality, Data}); 5703 } 5704 5705 if (BuiltinID == AArch64::BI__hlt) { 5706 Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hlt); 5707 Builder.CreateCall(F, {EmitScalarExpr(E->getArg(0))}); 5708 5709 // Return 0 for convenience, even though MSVC returns some other undefined 5710 // value. 5711 return ConstantInt::get(Builder.getInt32Ty(), 0); 5712 } 5713 5714 if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32) 5715 return Builder.CreateFPTrunc( 5716 Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), 5717 Builder.getFloatTy()), 5718 Builder.getBFloatTy()); 5719 5720 // Handle MSVC intrinsics before argument evaluation to prevent double 5721 // evaluation. 5722 if (std::optional<MSVCIntrin> MsvcIntId = 5723 translateAarch64ToMsvcIntrin(BuiltinID)) 5724 return EmitMSVCBuiltinExpr(*MsvcIntId, E); 5725 5726 // Some intrinsics are equivalent; if they are, use the base intrinsic ID. 5727 auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) { 5728 return P.first == BuiltinID; 5729 }); 5730 if (It != end(NEONEquivalentIntrinsicMap)) 5731 BuiltinID = It->second; 5732 5733 // Find out if any arguments are required to be integer constant 5734 // expressions. 5735 unsigned ICEArguments = 0; 5736 ASTContext::GetBuiltinTypeError Error; 5737 getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); 5738 assert(Error == ASTContext::GE_None && "Should not codegen an error"); 5739 5740 llvm::SmallVector<Value*, 4> Ops; 5741 Address PtrOp0 = Address::invalid(); 5742 for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) { 5743 if (i == 0) { 5744 switch (BuiltinID) { 5745 case NEON::BI__builtin_neon_vld1_v: 5746 case NEON::BI__builtin_neon_vld1q_v: 5747 case NEON::BI__builtin_neon_vld1_dup_v: 5748 case NEON::BI__builtin_neon_vld1q_dup_v: 5749 case NEON::BI__builtin_neon_vld1_lane_v: 5750 case NEON::BI__builtin_neon_vld1q_lane_v: 5751 case NEON::BI__builtin_neon_vst1_v: 5752 case NEON::BI__builtin_neon_vst1q_v: 5753 case NEON::BI__builtin_neon_vst1_lane_v: 5754 case NEON::BI__builtin_neon_vst1q_lane_v: 5755 case NEON::BI__builtin_neon_vldap1_lane_s64: 5756 case NEON::BI__builtin_neon_vldap1q_lane_s64: 5757 case NEON::BI__builtin_neon_vstl1_lane_s64: 5758 case NEON::BI__builtin_neon_vstl1q_lane_s64: 5759 // Get the alignment for the argument in addition to the value; 5760 // we'll use it later.
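// (The alignment captured in PtrOp0 is consumed further down by the
// cases that emit the actual lane/dup load or store.)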
5761 PtrOp0 = EmitPointerWithAlignment(E->getArg(0)); 5762 Ops.push_back(PtrOp0.emitRawPointer(*this)); 5763 continue; 5764 } 5765 } 5766 Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E)); 5767 } 5768 5769 auto SISDMap = ArrayRef(AArch64SISDIntrinsicMap); 5770 const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap( 5771 SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted); 5772 5773 if (Builtin) { 5774 Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1))); 5775 Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E); 5776 assert(Result && "SISD intrinsic should have been handled"); 5777 return Result; 5778 } 5779 5780 const Expr *Arg = E->getArg(E->getNumArgs()-1); 5781 NeonTypeFlags Type(0); 5782 if (std::optional<llvm::APSInt> Result = 5783 Arg->getIntegerConstantExpr(getContext())) 5784 // Determine the type of this overloaded NEON intrinsic. 5785 Type = NeonTypeFlags(Result->getZExtValue()); 5786 5787 bool usgn = Type.isUnsigned(); 5788 bool quad = Type.isQuad(); 5789 5790 // Handle non-overloaded intrinsics first. 5791 switch (BuiltinID) { 5792 default: break; 5793 case NEON::BI__builtin_neon_vabsh_f16: 5794 Ops.push_back(EmitScalarExpr(E->getArg(0))); 5795 return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs"); 5796 case NEON::BI__builtin_neon_vaddq_p128: { 5797 llvm::Type *Ty = GetNeonType(this, NeonTypeFlags::Poly128); 5798 Ops.push_back(EmitScalarExpr(E->getArg(1))); 5799 Ops[0] = Builder.CreateBitCast(Ops[0], Ty); 5800 Ops[1] = Builder.CreateBitCast(Ops[1], Ty); 5801 Ops[0] = Builder.CreateXor(Ops[0], Ops[1]); 5802 llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128); 5803 return Builder.CreateBitCast(Ops[0], Int128Ty); 5804 } 5805 case NEON::BI__builtin_neon_vldrq_p128: { 5806 llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128); 5807 Value *Ptr = EmitScalarExpr(E->getArg(0)); 5808 return Builder.CreateAlignedLoad(Int128Ty, Ptr, 5809 CharUnits::fromQuantity(16)); 5810 } 5811 case NEON::BI__builtin_neon_vstrq_p128: { 5812 Value *Ptr = Ops[0]; 5813 return Builder.CreateDefaultAlignedStore(EmitScalarExpr(E->getArg(1)), Ptr); 5814 } 5815 case NEON::BI__builtin_neon_vcvts_f32_u32: 5816 case NEON::BI__builtin_neon_vcvtd_f64_u64: 5817 usgn = true; 5818 [[fallthrough]]; 5819 case NEON::BI__builtin_neon_vcvts_f32_s32: 5820 case NEON::BI__builtin_neon_vcvtd_f64_s64: { 5821 Ops.push_back(EmitScalarExpr(E->getArg(0))); 5822 bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64; 5823 llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty; 5824 llvm::Type *FTy = Is64 ? 
DoubleTy : FloatTy; 5825 Ops[0] = Builder.CreateBitCast(Ops[0], InTy); 5826 if (usgn) 5827 return Builder.CreateUIToFP(Ops[0], FTy); 5828 return Builder.CreateSIToFP(Ops[0], FTy); 5829 } 5830 case NEON::BI__builtin_neon_vcvth_f16_u16: 5831 case NEON::BI__builtin_neon_vcvth_f16_u32: 5832 case NEON::BI__builtin_neon_vcvth_f16_u64: 5833 usgn = true; 5834 [[fallthrough]]; 5835 case NEON::BI__builtin_neon_vcvth_f16_s16: 5836 case NEON::BI__builtin_neon_vcvth_f16_s32: 5837 case NEON::BI__builtin_neon_vcvth_f16_s64: { 5838 Ops.push_back(EmitScalarExpr(E->getArg(0))); 5839 llvm::Type *FTy = HalfTy; 5840 llvm::Type *InTy; 5841 if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64) 5842 InTy = Int64Ty; 5843 else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32) 5844 InTy = Int32Ty; 5845 else 5846 InTy = Int16Ty; 5847 Ops[0] = Builder.CreateBitCast(Ops[0], InTy); 5848 if (usgn) 5849 return Builder.CreateUIToFP(Ops[0], FTy); 5850 return Builder.CreateSIToFP(Ops[0], FTy); 5851 } 5852 case NEON::BI__builtin_neon_vcvtah_u16_f16: 5853 case NEON::BI__builtin_neon_vcvtmh_u16_f16: 5854 case NEON::BI__builtin_neon_vcvtnh_u16_f16: 5855 case NEON::BI__builtin_neon_vcvtph_u16_f16: 5856 case NEON::BI__builtin_neon_vcvth_u16_f16: 5857 case NEON::BI__builtin_neon_vcvtah_s16_f16: 5858 case NEON::BI__builtin_neon_vcvtmh_s16_f16: 5859 case NEON::BI__builtin_neon_vcvtnh_s16_f16: 5860 case NEON::BI__builtin_neon_vcvtph_s16_f16: 5861 case NEON::BI__builtin_neon_vcvth_s16_f16: { 5862 unsigned Int; 5863 llvm::Type* InTy = Int32Ty; 5864 llvm::Type* FTy = HalfTy; 5865 llvm::Type *Tys[2] = {InTy, FTy}; 5866 Ops.push_back(EmitScalarExpr(E->getArg(0))); 5867 switch (BuiltinID) { 5868 default: llvm_unreachable("missing builtin ID in switch!"); 5869 case NEON::BI__builtin_neon_vcvtah_u16_f16: 5870 Int = Intrinsic::aarch64_neon_fcvtau; break; 5871 case NEON::BI__builtin_neon_vcvtmh_u16_f16: 5872 Int = Intrinsic::aarch64_neon_fcvtmu; break; 5873 case NEON::BI__builtin_neon_vcvtnh_u16_f16: 5874 Int = Intrinsic::aarch64_neon_fcvtnu; break; 5875 case NEON::BI__builtin_neon_vcvtph_u16_f16: 5876 Int = Intrinsic::aarch64_neon_fcvtpu; break; 5877 case NEON::BI__builtin_neon_vcvth_u16_f16: 5878 Int = Intrinsic::aarch64_neon_fcvtzu; break; 5879 case NEON::BI__builtin_neon_vcvtah_s16_f16: 5880 Int = Intrinsic::aarch64_neon_fcvtas; break; 5881 case NEON::BI__builtin_neon_vcvtmh_s16_f16: 5882 Int = Intrinsic::aarch64_neon_fcvtms; break; 5883 case NEON::BI__builtin_neon_vcvtnh_s16_f16: 5884 Int = Intrinsic::aarch64_neon_fcvtns; break; 5885 case NEON::BI__builtin_neon_vcvtph_s16_f16: 5886 Int = Intrinsic::aarch64_neon_fcvtps; break; 5887 case NEON::BI__builtin_neon_vcvth_s16_f16: 5888 Int = Intrinsic::aarch64_neon_fcvtzs; break; 5889 } 5890 Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt"); 5891 return Builder.CreateTrunc(Ops[0], Int16Ty); 5892 } 5893 case NEON::BI__builtin_neon_vcaleh_f16: 5894 case NEON::BI__builtin_neon_vcalth_f16: 5895 case NEON::BI__builtin_neon_vcageh_f16: 5896 case NEON::BI__builtin_neon_vcagth_f16: { 5897 unsigned Int; 5898 llvm::Type* InTy = Int32Ty; 5899 llvm::Type* FTy = HalfTy; 5900 llvm::Type *Tys[2] = {InTy, FTy}; 5901 Ops.push_back(EmitScalarExpr(E->getArg(1))); 5902 switch (BuiltinID) { 5903 default: llvm_unreachable("missing builtin ID in switch!"); 5904 case NEON::BI__builtin_neon_vcageh_f16: 5905 Int = Intrinsic::aarch64_neon_facge; break; 5906 case NEON::BI__builtin_neon_vcagth_f16: 5907 Int = Intrinsic::aarch64_neon_facgt; break; 5908 case NEON::BI__builtin_neon_vcaleh_f16: 5909 Int = 
Intrinsic::aarch64_neon_facge; std::swap(Ops[0], Ops[1]); break; 5910 case NEON::BI__builtin_neon_vcalth_f16: 5911 Int = Intrinsic::aarch64_neon_facgt; std::swap(Ops[0], Ops[1]); break; 5912 } 5913 Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "facg"); 5914 return Builder.CreateTrunc(Ops[0], Int16Ty); 5915 } 5916 case NEON::BI__builtin_neon_vcvth_n_s16_f16: 5917 case NEON::BI__builtin_neon_vcvth_n_u16_f16: { 5918 unsigned Int; 5919 llvm::Type* InTy = Int32Ty; 5920 llvm::Type* FTy = HalfTy; 5921 llvm::Type *Tys[2] = {InTy, FTy}; 5922 Ops.push_back(EmitScalarExpr(E->getArg(1))); 5923 switch (BuiltinID) { 5924 default: llvm_unreachable("missing builtin ID in switch!"); 5925 case NEON::BI__builtin_neon_vcvth_n_s16_f16: 5926 Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break; 5927 case NEON::BI__builtin_neon_vcvth_n_u16_f16: 5928 Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break; 5929 } 5930 Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n"); 5931 return Builder.CreateTrunc(Ops[0], Int16Ty); 5932 } 5933 case NEON::BI__builtin_neon_vcvth_n_f16_s16: 5934 case NEON::BI__builtin_neon_vcvth_n_f16_u16: { 5935 unsigned Int; 5936 llvm::Type* FTy = HalfTy; 5937 llvm::Type* InTy = Int32Ty; 5938 llvm::Type *Tys[2] = {FTy, InTy}; 5939 Ops.push_back(EmitScalarExpr(E->getArg(1))); 5940 switch (BuiltinID) { 5941 default: llvm_unreachable("missing builtin ID in switch!"); 5942 case NEON::BI__builtin_neon_vcvth_n_f16_s16: 5943 Int = Intrinsic::aarch64_neon_vcvtfxs2fp; 5944 Ops[0] = Builder.CreateSExt(Ops[0], InTy, "sext"); 5945 break; 5946 case NEON::BI__builtin_neon_vcvth_n_f16_u16: 5947 Int = Intrinsic::aarch64_neon_vcvtfxu2fp; 5948 Ops[0] = Builder.CreateZExt(Ops[0], InTy); 5949 break; 5950 } 5951 return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n"); 5952 } 5953 case NEON::BI__builtin_neon_vpaddd_s64: { 5954 auto *Ty = llvm::FixedVectorType::get(Int64Ty, 2); 5955 Value *Vec = EmitScalarExpr(E->getArg(0)); 5956 // The vector is v2i64, so make sure it's bitcast to that. 5957 Vec = Builder.CreateBitCast(Vec, Ty, "v2i64"); 5958 llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0); 5959 llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1); 5960 Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0"); 5961 Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1"); 5962 // Pairwise addition of a v2i64 into a scalar i64. 5963 return Builder.CreateAdd(Op0, Op1, "vpaddd"); 5964 } 5965 case NEON::BI__builtin_neon_vpaddd_f64: { 5966 auto *Ty = llvm::FixedVectorType::get(DoubleTy, 2); 5967 Value *Vec = EmitScalarExpr(E->getArg(0)); 5968 // The vector is v2f64, so make sure it's bitcast to that. 5969 Vec = Builder.CreateBitCast(Vec, Ty, "v2f64"); 5970 llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0); 5971 llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1); 5972 Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0"); 5973 Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1"); 5974 // Pairwise addition of a v2f64 into a scalar f64. 5975 return Builder.CreateFAdd(Op0, Op1, "vpaddd"); 5976 } 5977 case NEON::BI__builtin_neon_vpadds_f32: { 5978 auto *Ty = llvm::FixedVectorType::get(FloatTy, 2); 5979 Value *Vec = EmitScalarExpr(E->getArg(0)); 5980 // The vector is v2f32, so make sure it's bitcast to that.
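// A sketch of the intended semantics, using assumed ACLE-style names:
//   float32x2_t v = {a, b};   // two f32 lanes
//   vpadds_f32(v) == a + b    // the lanes are extracted and added below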
5981 Vec = Builder.CreateBitCast(Vec, Ty, "v2f32"); 5982 llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0); 5983 llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1); 5984 Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0"); 5985 Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1"); 5986 // Pairwise addition of a v2f32 into a scalar f32. 5987 return Builder.CreateFAdd(Op0, Op1, "vpaddd"); 5988 } 5989 case NEON::BI__builtin_neon_vceqzd_s64: 5990 Ops.push_back(EmitScalarExpr(E->getArg(0))); 5991 return EmitAArch64CompareBuiltinExpr( 5992 Ops[0], ConvertType(E->getCallReturnType(getContext())), 5993 ICmpInst::ICMP_EQ, "vceqz"); 5994 case NEON::BI__builtin_neon_vceqzd_f64: 5995 case NEON::BI__builtin_neon_vceqzs_f32: 5996 case NEON::BI__builtin_neon_vceqzh_f16: 5997 Ops.push_back(EmitScalarExpr(E->getArg(0))); 5998 return EmitAArch64CompareBuiltinExpr( 5999 Ops[0], ConvertType(E->getCallReturnType(getContext())), 6000 ICmpInst::FCMP_OEQ, "vceqz"); 6001 case NEON::BI__builtin_neon_vcgezd_s64: 6002 Ops.push_back(EmitScalarExpr(E->getArg(0))); 6003 return EmitAArch64CompareBuiltinExpr( 6004 Ops[0], ConvertType(E->getCallReturnType(getContext())), 6005 ICmpInst::ICMP_SGE, "vcgez"); 6006 case NEON::BI__builtin_neon_vcgezd_f64: 6007 case NEON::BI__builtin_neon_vcgezs_f32: 6008 case NEON::BI__builtin_neon_vcgezh_f16: 6009 Ops.push_back(EmitScalarExpr(E->getArg(0))); 6010 return EmitAArch64CompareBuiltinExpr( 6011 Ops[0], ConvertType(E->getCallReturnType(getContext())), 6012 ICmpInst::FCMP_OGE, "vcgez"); 6013 case NEON::BI__builtin_neon_vclezd_s64: 6014 Ops.push_back(EmitScalarExpr(E->getArg(0))); 6015 return EmitAArch64CompareBuiltinExpr( 6016 Ops[0], ConvertType(E->getCallReturnType(getContext())), 6017 ICmpInst::ICMP_SLE, "vclez"); 6018 case NEON::BI__builtin_neon_vclezd_f64: 6019 case NEON::BI__builtin_neon_vclezs_f32: 6020 case NEON::BI__builtin_neon_vclezh_f16: 6021 Ops.push_back(EmitScalarExpr(E->getArg(0))); 6022 return EmitAArch64CompareBuiltinExpr( 6023 Ops[0], ConvertType(E->getCallReturnType(getContext())), 6024 ICmpInst::FCMP_OLE, "vclez"); 6025 case NEON::BI__builtin_neon_vcgtzd_s64: 6026 Ops.push_back(EmitScalarExpr(E->getArg(0))); 6027 return EmitAArch64CompareBuiltinExpr( 6028 Ops[0], ConvertType(E->getCallReturnType(getContext())), 6029 ICmpInst::ICMP_SGT, "vcgtz"); 6030 case NEON::BI__builtin_neon_vcgtzd_f64: 6031 case NEON::BI__builtin_neon_vcgtzs_f32: 6032 case NEON::BI__builtin_neon_vcgtzh_f16: 6033 Ops.push_back(EmitScalarExpr(E->getArg(0))); 6034 return EmitAArch64CompareBuiltinExpr( 6035 Ops[0], ConvertType(E->getCallReturnType(getContext())), 6036 ICmpInst::FCMP_OGT, "vcgtz"); 6037 case NEON::BI__builtin_neon_vcltzd_s64: 6038 Ops.push_back(EmitScalarExpr(E->getArg(0))); 6039 return EmitAArch64CompareBuiltinExpr( 6040 Ops[0], ConvertType(E->getCallReturnType(getContext())), 6041 ICmpInst::ICMP_SLT, "vcltz"); 6042 6043 case NEON::BI__builtin_neon_vcltzd_f64: 6044 case NEON::BI__builtin_neon_vcltzs_f32: 6045 case NEON::BI__builtin_neon_vcltzh_f16: 6046 Ops.push_back(EmitScalarExpr(E->getArg(0))); 6047 return EmitAArch64CompareBuiltinExpr( 6048 Ops[0], ConvertType(E->getCallReturnType(getContext())), 6049 ICmpInst::FCMP_OLT, "vcltz"); 6050 6051 case NEON::BI__builtin_neon_vceqzd_u64: { 6052 Ops.push_back(EmitScalarExpr(E->getArg(0))); 6053 Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty); 6054 Ops[0] = 6055 Builder.CreateICmpEQ(Ops[0], llvm::Constant::getNullValue(Int64Ty)); 6056 return Builder.CreateSExt(Ops[0], Int64Ty, "vceqzd"); 6057 } 6058 case 
NEON::BI__builtin_neon_vceqd_f64: 6059 case NEON::BI__builtin_neon_vcled_f64: 6060 case NEON::BI__builtin_neon_vcltd_f64: 6061 case NEON::BI__builtin_neon_vcged_f64: 6062 case NEON::BI__builtin_neon_vcgtd_f64: { 6063 llvm::CmpInst::Predicate P; 6064 switch (BuiltinID) { 6065 default: llvm_unreachable("missing builtin ID in switch!"); 6066 case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break; 6067 case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break; 6068 case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break; 6069 case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break; 6070 case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break; 6071 } 6072 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6073 Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); 6074 Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy); 6075 if (P == llvm::FCmpInst::FCMP_OEQ) 6076 Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]); 6077 else 6078 Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]); 6079 return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd"); 6080 } 6081 case NEON::BI__builtin_neon_vceqs_f32: 6082 case NEON::BI__builtin_neon_vcles_f32: 6083 case NEON::BI__builtin_neon_vclts_f32: 6084 case NEON::BI__builtin_neon_vcges_f32: 6085 case NEON::BI__builtin_neon_vcgts_f32: { 6086 llvm::CmpInst::Predicate P; 6087 switch (BuiltinID) { 6088 default: llvm_unreachable("missing builtin ID in switch!"); 6089 case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break; 6090 case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break; 6091 case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break; 6092 case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break; 6093 case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break; 6094 } 6095 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6096 Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy); 6097 Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy); 6098 if (P == llvm::FCmpInst::FCMP_OEQ) 6099 Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]); 6100 else 6101 Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]); 6102 return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd"); 6103 } 6104 case NEON::BI__builtin_neon_vceqh_f16: 6105 case NEON::BI__builtin_neon_vcleh_f16: 6106 case NEON::BI__builtin_neon_vclth_f16: 6107 case NEON::BI__builtin_neon_vcgeh_f16: 6108 case NEON::BI__builtin_neon_vcgth_f16: { 6109 llvm::CmpInst::Predicate P; 6110 switch (BuiltinID) { 6111 default: llvm_unreachable("missing builtin ID in switch!"); 6112 case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break; 6113 case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break; 6114 case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break; 6115 case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break; 6116 case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break; 6117 } 6118 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6119 Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); 6120 Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy); 6121 if (P == llvm::FCmpInst::FCMP_OEQ) 6122 Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]); 6123 else 6124 Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]); 6125 return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd"); 6126 } 6127 case NEON::BI__builtin_neon_vceqd_s64: 6128 case NEON::BI__builtin_neon_vceqd_u64: 6129 case NEON::BI__builtin_neon_vcgtd_s64: 6130 case 
NEON::BI__builtin_neon_vcgtd_u64: 6131 case NEON::BI__builtin_neon_vcltd_s64: 6132 case NEON::BI__builtin_neon_vcltd_u64: 6133 case NEON::BI__builtin_neon_vcged_u64: 6134 case NEON::BI__builtin_neon_vcged_s64: 6135 case NEON::BI__builtin_neon_vcled_u64: 6136 case NEON::BI__builtin_neon_vcled_s64: { 6137 llvm::CmpInst::Predicate P; 6138 switch (BuiltinID) { 6139 default: llvm_unreachable("missing builtin ID in switch!"); 6140 case NEON::BI__builtin_neon_vceqd_s64: 6141 case NEON::BI__builtin_neon_vceqd_u64:P = llvm::ICmpInst::ICMP_EQ;break; 6142 case NEON::BI__builtin_neon_vcgtd_s64:P = llvm::ICmpInst::ICMP_SGT;break; 6143 case NEON::BI__builtin_neon_vcgtd_u64:P = llvm::ICmpInst::ICMP_UGT;break; 6144 case NEON::BI__builtin_neon_vcltd_s64:P = llvm::ICmpInst::ICMP_SLT;break; 6145 case NEON::BI__builtin_neon_vcltd_u64:P = llvm::ICmpInst::ICMP_ULT;break; 6146 case NEON::BI__builtin_neon_vcged_u64:P = llvm::ICmpInst::ICMP_UGE;break; 6147 case NEON::BI__builtin_neon_vcged_s64:P = llvm::ICmpInst::ICMP_SGE;break; 6148 case NEON::BI__builtin_neon_vcled_u64:P = llvm::ICmpInst::ICMP_ULE;break; 6149 case NEON::BI__builtin_neon_vcled_s64:P = llvm::ICmpInst::ICMP_SLE;break; 6150 } 6151 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6152 Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty); 6153 Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty); 6154 Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]); 6155 return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd"); 6156 } 6157 case NEON::BI__builtin_neon_vtstd_s64: 6158 case NEON::BI__builtin_neon_vtstd_u64: { 6159 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6160 Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty); 6161 Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty); 6162 Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]); 6163 Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0], 6164 llvm::Constant::getNullValue(Int64Ty)); 6165 return Builder.CreateSExt(Ops[0], Int64Ty, "vtstd"); 6166 } 6167 case NEON::BI__builtin_neon_vset_lane_i8: 6168 case NEON::BI__builtin_neon_vset_lane_i16: 6169 case NEON::BI__builtin_neon_vset_lane_i32: 6170 case NEON::BI__builtin_neon_vset_lane_i64: 6171 case NEON::BI__builtin_neon_vset_lane_bf16: 6172 case NEON::BI__builtin_neon_vset_lane_f32: 6173 case NEON::BI__builtin_neon_vsetq_lane_i8: 6174 case NEON::BI__builtin_neon_vsetq_lane_i16: 6175 case NEON::BI__builtin_neon_vsetq_lane_i32: 6176 case NEON::BI__builtin_neon_vsetq_lane_i64: 6177 case NEON::BI__builtin_neon_vsetq_lane_bf16: 6178 case NEON::BI__builtin_neon_vsetq_lane_f32: 6179 Ops.push_back(EmitScalarExpr(E->getArg(2))); 6180 return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); 6181 case NEON::BI__builtin_neon_vset_lane_f64: 6182 // The vector type needs a cast for the v1f64 variant. 6183 Ops[1] = 6184 Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 1)); 6185 Ops.push_back(EmitScalarExpr(E->getArg(2))); 6186 return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); 6187 case NEON::BI__builtin_neon_vset_lane_mf8: 6188 case NEON::BI__builtin_neon_vsetq_lane_mf8: 6189 Ops.push_back(EmitScalarExpr(E->getArg(2))); 6190 // The input vector type needs a cast to scalar type. 6191 Ops[0] = 6192 Builder.CreateBitCast(Ops[0], llvm::Type::getInt8Ty(getLLVMContext())); 6193 return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); 6194 case NEON::BI__builtin_neon_vsetq_lane_f64: 6195 // The vector type needs a cast for the v2f64 variant. 
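// (vsetq_lane_f64 operates on v2f64 here, versus v1f64 for the non-q
// vset_lane_f64 form handled above.)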
6196 Ops[1] = 6197 Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 2)); 6198 Ops.push_back(EmitScalarExpr(E->getArg(2))); 6199 return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); 6200 6201 case NEON::BI__builtin_neon_vget_lane_i8: 6202 case NEON::BI__builtin_neon_vdupb_lane_i8: 6203 Ops[0] = 6204 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 8)); 6205 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6206 "vget_lane"); 6207 case NEON::BI__builtin_neon_vgetq_lane_i8: 6208 case NEON::BI__builtin_neon_vdupb_laneq_i8: 6209 Ops[0] = 6210 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 16)); 6211 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6212 "vgetq_lane"); 6213 case NEON::BI__builtin_neon_vget_lane_mf8: 6214 case NEON::BI__builtin_neon_vdupb_lane_mf8: 6215 case NEON::BI__builtin_neon_vgetq_lane_mf8: 6216 case NEON::BI__builtin_neon_vdupb_laneq_mf8: 6217 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6218 "vget_lane"); 6219 case NEON::BI__builtin_neon_vget_lane_i16: 6220 case NEON::BI__builtin_neon_vduph_lane_i16: 6221 Ops[0] = 6222 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 4)); 6223 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6224 "vget_lane"); 6225 case NEON::BI__builtin_neon_vgetq_lane_i16: 6226 case NEON::BI__builtin_neon_vduph_laneq_i16: 6227 Ops[0] = 6228 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 8)); 6229 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6230 "vgetq_lane"); 6231 case NEON::BI__builtin_neon_vget_lane_i32: 6232 case NEON::BI__builtin_neon_vdups_lane_i32: 6233 Ops[0] = 6234 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 2)); 6235 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6236 "vget_lane"); 6237 case NEON::BI__builtin_neon_vdups_lane_f32: 6238 Ops[0] = 6239 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2)); 6240 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6241 "vdups_lane"); 6242 case NEON::BI__builtin_neon_vgetq_lane_i32: 6243 case NEON::BI__builtin_neon_vdups_laneq_i32: 6244 Ops[0] = 6245 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4)); 6246 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6247 "vgetq_lane"); 6248 case NEON::BI__builtin_neon_vget_lane_i64: 6249 case NEON::BI__builtin_neon_vdupd_lane_i64: 6250 Ops[0] = 6251 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 1)); 6252 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6253 "vget_lane"); 6254 case NEON::BI__builtin_neon_vdupd_lane_f64: 6255 Ops[0] = 6256 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1)); 6257 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6258 "vdupd_lane"); 6259 case NEON::BI__builtin_neon_vgetq_lane_i64: 6260 case NEON::BI__builtin_neon_vdupd_laneq_i64: 6261 Ops[0] = 6262 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2)); 6263 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6264 "vgetq_lane"); 6265 case NEON::BI__builtin_neon_vget_lane_f32: 6266 Ops[0] = 6267 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2)); 6268 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6269 "vget_lane"); 6270 case 
NEON::BI__builtin_neon_vget_lane_f64: 6271 Ops[0] = 6272 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1)); 6273 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6274 "vget_lane"); 6275 case NEON::BI__builtin_neon_vgetq_lane_f32: 6276 case NEON::BI__builtin_neon_vdups_laneq_f32: 6277 Ops[0] = 6278 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 4)); 6279 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6280 "vgetq_lane"); 6281 case NEON::BI__builtin_neon_vgetq_lane_f64: 6282 case NEON::BI__builtin_neon_vdupd_laneq_f64: 6283 Ops[0] = 6284 Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 2)); 6285 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6286 "vgetq_lane"); 6287 case NEON::BI__builtin_neon_vaddh_f16: 6288 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6289 return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh"); 6290 case NEON::BI__builtin_neon_vsubh_f16: 6291 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6292 return Builder.CreateFSub(Ops[0], Ops[1], "vsubh"); 6293 case NEON::BI__builtin_neon_vmulh_f16: 6294 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6295 return Builder.CreateFMul(Ops[0], Ops[1], "vmulh"); 6296 case NEON::BI__builtin_neon_vdivh_f16: 6297 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6298 return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh"); 6299 case NEON::BI__builtin_neon_vfmah_f16: 6300 // NEON intrinsic puts accumulator first, unlike the LLVM fma. 6301 return emitCallMaybeConstrainedFPBuiltin( 6302 *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy, 6303 {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]}); 6304 case NEON::BI__builtin_neon_vfmsh_f16: { 6305 Value* Neg = Builder.CreateFNeg(EmitScalarExpr(E->getArg(1)), "vsubh"); 6306 6307 // NEON intrinsic puts accumulator first, unlike the LLVM fma. 6308 return emitCallMaybeConstrainedFPBuiltin( 6309 *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy, 6310 {Neg, EmitScalarExpr(E->getArg(2)), Ops[0]}); 6311 } 6312 case NEON::BI__builtin_neon_vaddd_s64: 6313 case NEON::BI__builtin_neon_vaddd_u64: 6314 return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd"); 6315 case NEON::BI__builtin_neon_vsubd_s64: 6316 case NEON::BI__builtin_neon_vsubd_u64: 6317 return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd"); 6318 case NEON::BI__builtin_neon_vqdmlalh_s16: 6319 case NEON::BI__builtin_neon_vqdmlslh_s16: { 6320 SmallVector<Value *, 2> ProductOps; 6321 ProductOps.push_back(vectorWrapScalar16(Ops[1])); 6322 ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2)))); 6323 auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4); 6324 Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy), 6325 ProductOps, "vqdmlXl"); 6326 Constant *CI = ConstantInt::get(SizeTy, 0); 6327 Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0"); 6328 6329 unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16 6330 ? 
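// vqdmlal accumulates the doubled product with a saturating add,
// vqdmlsl with a saturating subtract; only that final step differs.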
Intrinsic::aarch64_neon_sqadd 6331 : Intrinsic::aarch64_neon_sqsub; 6332 return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl"); 6333 } 6334 case NEON::BI__builtin_neon_vqshlud_n_s64: { 6335 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6336 Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty); 6337 return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqshlu, Int64Ty), 6338 Ops, "vqshlu_n"); 6339 } 6340 case NEON::BI__builtin_neon_vqshld_n_u64: 6341 case NEON::BI__builtin_neon_vqshld_n_s64: { 6342 unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64 6343 ? Intrinsic::aarch64_neon_uqshl 6344 : Intrinsic::aarch64_neon_sqshl; 6345 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6346 Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty); 6347 return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vqshl_n"); 6348 } 6349 case NEON::BI__builtin_neon_vrshrd_n_u64: 6350 case NEON::BI__builtin_neon_vrshrd_n_s64: { 6351 unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64 6352 ? Intrinsic::aarch64_neon_urshl 6353 : Intrinsic::aarch64_neon_srshl; 6354 Ops.push_back(EmitScalarExpr(E->getArg(1))); 6355 int SV = cast<ConstantInt>(Ops[1])->getSExtValue(); 6356 Ops[1] = ConstantInt::get(Int64Ty, -SV); 6357 return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vrshr_n"); 6358 } 6359 case NEON::BI__builtin_neon_vrsrad_n_u64: 6360 case NEON::BI__builtin_neon_vrsrad_n_s64: { 6361 unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64 6362 ? Intrinsic::aarch64_neon_urshl 6363 : Intrinsic::aarch64_neon_srshl; 6364 Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty); 6365 Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2)))); 6366 Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Int64Ty), 6367 {Ops[1], Builder.CreateSExt(Ops[2], Int64Ty)}); 6368 return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty)); 6369 } 6370 case NEON::BI__builtin_neon_vshld_n_s64: 6371 case NEON::BI__builtin_neon_vshld_n_u64: { 6372 llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1))); 6373 return Builder.CreateShl( 6374 Ops[0], ConstantInt::get(Int64Ty, Amt->getZExtValue()), "shld_n"); 6375 } 6376 case NEON::BI__builtin_neon_vshrd_n_s64: { 6377 llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1))); 6378 return Builder.CreateAShr( 6379 Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63), 6380 Amt->getZExtValue())), 6381 "shrd_n"); 6382 } 6383 case NEON::BI__builtin_neon_vshrd_n_u64: { 6384 llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1))); 6385 uint64_t ShiftAmt = Amt->getZExtValue(); 6386 // Right-shifting an unsigned value by its size yields 0. 6387 if (ShiftAmt == 64) 6388 return ConstantInt::get(Int64Ty, 0); 6389 return Builder.CreateLShr(Ops[0], ConstantInt::get(Int64Ty, ShiftAmt), 6390 "shrd_n"); 6391 } 6392 case NEON::BI__builtin_neon_vsrad_n_s64: { 6393 llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2))); 6394 Ops[1] = Builder.CreateAShr( 6395 Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63), 6396 Amt->getZExtValue())), 6397 "shrd_n"); 6398 return Builder.CreateAdd(Ops[0], Ops[1]); 6399 } 6400 case NEON::BI__builtin_neon_vsrad_n_u64: { 6401 llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2))); 6402 uint64_t ShiftAmt = Amt->getZExtValue(); 6403 // Right-shifting an unsigned value by its size yields 0. 6404 // As Op + 0 = Op, return Ops[0] directly. 
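// (A plain lshr by the full bit width would be poison in LLVM IR, so
// the ShiftAmt == 64 case has to be folded away explicitly.)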
6405 if (ShiftAmt == 64) 6406 return Ops[0]; 6407 Ops[1] = Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, ShiftAmt), 6408 "shrd_n"); 6409 return Builder.CreateAdd(Ops[0], Ops[1]); 6410 } 6411 case NEON::BI__builtin_neon_vqdmlalh_lane_s16: 6412 case NEON::BI__builtin_neon_vqdmlalh_laneq_s16: 6413 case NEON::BI__builtin_neon_vqdmlslh_lane_s16: 6414 case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: { 6415 Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)), 6416 "lane"); 6417 SmallVector<Value *, 2> ProductOps; 6418 ProductOps.push_back(vectorWrapScalar16(Ops[1])); 6419 ProductOps.push_back(vectorWrapScalar16(Ops[2])); 6420 auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4); 6421 Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy), 6422 ProductOps, "vqdmlXl"); 6423 Constant *CI = ConstantInt::get(SizeTy, 0); 6424 Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0"); 6425 Ops.pop_back(); 6426 6427 unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 || 6428 BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16) 6429 ? Intrinsic::aarch64_neon_sqadd 6430 : Intrinsic::aarch64_neon_sqsub; 6431 return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl"); 6432 } 6433 case NEON::BI__builtin_neon_vqdmlals_s32: 6434 case NEON::BI__builtin_neon_vqdmlsls_s32: { 6435 SmallVector<Value *, 2> ProductOps; 6436 ProductOps.push_back(Ops[1]); 6437 ProductOps.push_back(EmitScalarExpr(E->getArg(2))); 6438 Ops[1] = 6439 EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar), 6440 ProductOps, "vqdmlXl"); 6441 6442 unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32 6443 ? Intrinsic::aarch64_neon_sqadd 6444 : Intrinsic::aarch64_neon_sqsub; 6445 return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl"); 6446 } 6447 case NEON::BI__builtin_neon_vqdmlals_lane_s32: 6448 case NEON::BI__builtin_neon_vqdmlals_laneq_s32: 6449 case NEON::BI__builtin_neon_vqdmlsls_lane_s32: 6450 case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: { 6451 Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)), 6452 "lane"); 6453 SmallVector<Value *, 2> ProductOps; 6454 ProductOps.push_back(Ops[1]); 6455 ProductOps.push_back(Ops[2]); 6456 Ops[1] = 6457 EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar), 6458 ProductOps, "vqdmlXl"); 6459 Ops.pop_back(); 6460 6461 unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 || 6462 BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32) 6463 ? 
Intrinsic::aarch64_neon_sqadd 6464 : Intrinsic::aarch64_neon_sqsub; 6465 return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl"); 6466 } 6467 case NEON::BI__builtin_neon_vget_lane_bf16: 6468 case NEON::BI__builtin_neon_vduph_lane_bf16: 6469 case NEON::BI__builtin_neon_vduph_lane_f16: { 6470 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6471 "vget_lane"); 6472 } 6473 case NEON::BI__builtin_neon_vgetq_lane_bf16: 6474 case NEON::BI__builtin_neon_vduph_laneq_bf16: 6475 case NEON::BI__builtin_neon_vduph_laneq_f16: { 6476 return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), 6477 "vgetq_lane"); 6478 } 6479 case NEON::BI__builtin_neon_vcvt_bf16_f32: { 6480 llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); 6481 llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4); 6482 return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16); 6483 } 6484 case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: { 6485 SmallVector<int, 16> ConcatMask(8); 6486 std::iota(ConcatMask.begin(), ConcatMask.end(), 0); 6487 llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); 6488 llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4); 6489 llvm::Value *Trunc = 6490 Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16); 6491 return Builder.CreateShuffleVector( 6492 Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask); 6493 } 6494 case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: { 6495 SmallVector<int, 16> ConcatMask(8); 6496 std::iota(ConcatMask.begin(), ConcatMask.end(), 0); 6497 SmallVector<int, 16> LoMask(4); 6498 std::iota(LoMask.begin(), LoMask.end(), 0); 6499 llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); 6500 llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4); 6501 llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8); 6502 llvm::Value *Inactive = Builder.CreateShuffleVector( 6503 Builder.CreateBitCast(Ops[0], V8BF16), LoMask); 6504 llvm::Value *Trunc = 6505 Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16); 6506 return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask); 6507 } 6508 6509 case clang::AArch64::BI_InterlockedAdd: 6510 case clang::AArch64::BI_InterlockedAdd_acq: 6511 case clang::AArch64::BI_InterlockedAdd_rel: 6512 case clang::AArch64::BI_InterlockedAdd_nf: 6513 case clang::AArch64::BI_InterlockedAdd64: 6514 case clang::AArch64::BI_InterlockedAdd64_acq: 6515 case clang::AArch64::BI_InterlockedAdd64_rel: 6516 case clang::AArch64::BI_InterlockedAdd64_nf: { 6517 Address DestAddr = CheckAtomicAlignment(*this, E); 6518 Value *Val = EmitScalarExpr(E->getArg(1)); 6519 llvm::AtomicOrdering Ordering; 6520 switch (BuiltinID) { 6521 case clang::AArch64::BI_InterlockedAdd: 6522 case clang::AArch64::BI_InterlockedAdd64: 6523 Ordering = llvm::AtomicOrdering::SequentiallyConsistent; 6524 break; 6525 case clang::AArch64::BI_InterlockedAdd_acq: 6526 case clang::AArch64::BI_InterlockedAdd64_acq: 6527 Ordering = llvm::AtomicOrdering::Acquire; 6528 break; 6529 case clang::AArch64::BI_InterlockedAdd_rel: 6530 case clang::AArch64::BI_InterlockedAdd64_rel: 6531 Ordering = llvm::AtomicOrdering::Release; 6532 break; 6533 case clang::AArch64::BI_InterlockedAdd_nf: 6534 case clang::AArch64::BI_InterlockedAdd64_nf: 6535 Ordering = llvm::AtomicOrdering::Monotonic; 6536 break; 6537 default: 6538 llvm_unreachable("missing builtin ID in switch!"); 6539 } 6540 AtomicRMWInst *RMWI = 6541 
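// atomicrmw add returns the value the memory held *before* the update,
// whereas _InterlockedAdd is specified to return the new value, hence
// the explicit re-add of Val after the RMW.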
Builder.CreateAtomicRMW(AtomicRMWInst::Add, DestAddr, Val, Ordering); 6542 return Builder.CreateAdd(RMWI, Val); 6543 } 6544 } 6545 6546 llvm::FixedVectorType *VTy = GetNeonType(this, Type); 6547 llvm::Type *Ty = VTy; 6548 if (!Ty) 6549 return nullptr; 6550 6551 // Not all intrinsics handled by the common case work for AArch64 yet, so only 6552 // defer to common code if it's been added to our special map. 6553 Builtin = findARMVectorIntrinsicInMap(AArch64SIMDIntrinsicMap, BuiltinID, 6554 AArch64SIMDIntrinsicsProvenSorted); 6555 6556 if (Builtin) 6557 return EmitCommonNeonBuiltinExpr( 6558 Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic, 6559 Builtin->NameHint, Builtin->TypeModifier, E, Ops, 6560 /*never use addresses*/ Address::invalid(), Address::invalid(), Arch); 6561 6562 if (Value *V = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch)) 6563 return V; 6564 6565 unsigned Int; 6566 bool ExtractLow = false; 6567 bool ExtendLaneArg = false; 6568 switch (BuiltinID) { 6569 default: return nullptr; 6570 case NEON::BI__builtin_neon_vbsl_v: 6571 case NEON::BI__builtin_neon_vbslq_v: { 6572 llvm::Type *BitTy = llvm::VectorType::getInteger(VTy); 6573 Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl"); 6574 Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl"); 6575 Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl"); 6576 6577 Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl"); 6578 Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl"); 6579 Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl"); 6580 return Builder.CreateBitCast(Ops[0], Ty); 6581 } 6582 case NEON::BI__builtin_neon_vfma_lane_v: 6583 case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types 6584 // The ARM builtins (and instructions) have the addend as the first 6585 // operand, but the 'fma' intrinsics have it last. Swap it around here. 6586 Value *Addend = Ops[0]; 6587 Value *Multiplicand = Ops[1]; 6588 Value *LaneSource = Ops[2]; 6589 Ops[0] = Multiplicand; 6590 Ops[1] = LaneSource; 6591 Ops[2] = Addend; 6592 6593 // Now adjust things to handle the lane access. 6594 auto *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v 6595 ? llvm::FixedVectorType::get(VTy->getElementType(), 6596 VTy->getNumElements() / 2) 6597 : VTy; 6598 llvm::Constant *cst = cast<Constant>(Ops[3]); 6599 Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(), cst); 6600 Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy); 6601 Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane"); 6602 6603 Ops.pop_back(); 6604 Int = Builder.getIsFPConstrained() ? 
  case NEON::BI__builtin_neon_vfma_lane_v:
  case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
    // The ARM builtins (and instructions) have the addend as the first
    // operand, but the 'fma' intrinsics have it last. Swap it around here.
    Value *Addend = Ops[0];
    Value *Multiplicand = Ops[1];
    Value *LaneSource = Ops[2];
    Ops[0] = Multiplicand;
    Ops[1] = LaneSource;
    Ops[2] = Addend;

    // Now adjust things to handle the lane access.
    auto *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v
                         ? llvm::FixedVectorType::get(VTy->getElementType(),
                                                      VTy->getNumElements() / 2)
                         : VTy;
    llvm::Constant *cst = cast<Constant>(Ops[3]);
    Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(), cst);
    Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
    Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");

    Ops.pop_back();
    Int = Builder.getIsFPConstrained() ? Intrinsic::experimental_constrained_fma
                                       : Intrinsic::fma;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
  }
  case NEON::BI__builtin_neon_vfma_laneq_v: {
    auto *VTy = cast<llvm::FixedVectorType>(Ty);
    // v1f64 fma should be mapped to Neon scalar f64 fma
    if (VTy && VTy->getElementType() == DoubleTy) {
      Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
      Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
      llvm::FixedVectorType *VTy =
          GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, true));
      Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
      Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
      Value *Result;
      Result = emitCallMaybeConstrainedFPBuiltin(
          *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma,
          DoubleTy, {Ops[1], Ops[2], Ops[0]});
      return Builder.CreateBitCast(Result, Ty);
    }
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);

    auto *STy = llvm::FixedVectorType::get(VTy->getElementType(),
                                           VTy->getNumElements() * 2);
    Ops[2] = Builder.CreateBitCast(Ops[2], STy);
    Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(),
                                               cast<ConstantInt>(Ops[3]));
    Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");

    return emitCallMaybeConstrainedFPBuiltin(
        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
        {Ops[2], Ops[1], Ops[0]});
  }
  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);

    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
    return emitCallMaybeConstrainedFPBuiltin(
        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
        {Ops[2], Ops[1], Ops[0]});
  }
  case NEON::BI__builtin_neon_vfmah_lane_f16:
  case NEON::BI__builtin_neon_vfmas_lane_f32:
  case NEON::BI__builtin_neon_vfmah_laneq_f16:
  case NEON::BI__builtin_neon_vfmas_laneq_f32:
  case NEON::BI__builtin_neon_vfmad_lane_f64:
  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
    Ops.push_back(EmitScalarExpr(E->getArg(3)));
    llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
    Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
    return emitCallMaybeConstrainedFPBuiltin(
        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
        {Ops[1], Ops[2], Ops[0]});
  }
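  // In each fused-multiply case above, the builtin's (addend, a, b, lane)
  // operand order is permuted into llvm.fma's (a, b, addend). For instance,
  // vfma_lane_f32(acc, x, v, i) is emitted roughly as:
  //   %s = shufflevector <2 x float> %v, <2 x float> %v, splat(i)
  //   %r = call <2 x float> @llvm.fma.v2f32(%x, %s, %acc)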
  case NEON::BI__builtin_neon_vmull_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
    if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
  case NEON::BI__builtin_neon_vmax_v:
  case NEON::BI__builtin_neon_vmaxq_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
  case NEON::BI__builtin_neon_vmaxh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Int = Intrinsic::aarch64_neon_fmax;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
  }
  case NEON::BI__builtin_neon_vmin_v:
  case NEON::BI__builtin_neon_vminq_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
  case NEON::BI__builtin_neon_vminh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Int = Intrinsic::aarch64_neon_fmin;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
  }
  case NEON::BI__builtin_neon_vabd_v:
  case NEON::BI__builtin_neon_vabdq_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd;
    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
  case NEON::BI__builtin_neon_vpadal_v:
  case NEON::BI__builtin_neon_vpadalq_v: {
    unsigned ArgElts = VTy->getNumElements();
    llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
    unsigned BitWidth = EltTy->getBitWidth();
    auto *ArgTy = llvm::FixedVectorType::get(
        llvm::IntegerType::get(getLLVMContext(), BitWidth / 2), 2 * ArgElts);
    llvm::Type *Tys[2] = { VTy, ArgTy };
    Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
    SmallVector<llvm::Value*, 1> TmpOps;
    TmpOps.push_back(Ops[1]);
    Function *F = CGM.getIntrinsic(Int, Tys);
    llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal");
    llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType());
    return Builder.CreateAdd(tmp, addend);
  }
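  // vpadal (pairwise add-and-accumulate long) has no single LLVM intrinsic:
  // it is split into a widening pairwise add followed by a plain vector add.
  // E.g. vpadal_s8(acc, x) is emitted roughly as:
  //   %t = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %x)
  //   %r = add <4 x i16> %t, %acc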
  case NEON::BI__builtin_neon_vpmin_v:
  case NEON::BI__builtin_neon_vpminq_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp;
    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
  case NEON::BI__builtin_neon_vpmax_v:
  case NEON::BI__builtin_neon_vpmaxq_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    Int = usgn ? Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp;
    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
  case NEON::BI__builtin_neon_vminnm_v:
  case NEON::BI__builtin_neon_vminnmq_v:
    Int = Intrinsic::aarch64_neon_fminnm;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
  case NEON::BI__builtin_neon_vminnmh_f16:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Int = Intrinsic::aarch64_neon_fminnm;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
  case NEON::BI__builtin_neon_vmaxnm_v:
  case NEON::BI__builtin_neon_vmaxnmq_v:
    Int = Intrinsic::aarch64_neon_fmaxnm;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
  case NEON::BI__builtin_neon_vmaxnmh_f16:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Int = Intrinsic::aarch64_neon_fmaxnm;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
  case NEON::BI__builtin_neon_vrecpss_f32: {
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
                        Ops, "vrecps");
  }
  case NEON::BI__builtin_neon_vrecpsd_f64:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
                        Ops, "vrecps");
  case NEON::BI__builtin_neon_vrecpsh_f16:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
                        Ops, "vrecps");
  case NEON::BI__builtin_neon_vqshrun_n_v:
    Int = Intrinsic::aarch64_neon_sqshrun;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
  case NEON::BI__builtin_neon_vqrshrun_n_v:
    Int = Intrinsic::aarch64_neon_sqrshrun;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
  case NEON::BI__builtin_neon_vqshrn_n_v:
    Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
  case NEON::BI__builtin_neon_vrshrn_n_v:
    Int = Intrinsic::aarch64_neon_rshrn;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
  case NEON::BI__builtin_neon_vqrshrn_n_v:
    Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
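  // The vrndX family maps NEON's FRINT* rounding modes onto generic LLVM
  // intrinsics (with constrained-FP twins when FENV access is active):
  //   vrnda -> llvm.round      (ties away from zero)
  //   vrndi -> llvm.nearbyint  (current mode, no inexact exception)
  //   vrndm -> llvm.floor      (toward -inf)
  //   vrndn -> llvm.roundeven  (ties to even)
  //   vrndp -> llvm.ceil       (toward +inf)
  //   vrndx -> llvm.rint       (current mode, may raise inexact)
  //   vrnd  -> llvm.trunc      (toward zero)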
  case NEON::BI__builtin_neon_vrndah_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_round
              : Intrinsic::round;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
  }
  case NEON::BI__builtin_neon_vrnda_v:
  case NEON::BI__builtin_neon_vrndaq_v: {
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_round
              : Intrinsic::round;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
  }
  case NEON::BI__builtin_neon_vrndih_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_nearbyint
              : Intrinsic::nearbyint;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
  }
  case NEON::BI__builtin_neon_vrndmh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_floor
              : Intrinsic::floor;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
  }
  case NEON::BI__builtin_neon_vrndm_v:
  case NEON::BI__builtin_neon_vrndmq_v: {
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_floor
              : Intrinsic::floor;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
  }
  case NEON::BI__builtin_neon_vrndnh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_roundeven
              : Intrinsic::roundeven;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
  }
  case NEON::BI__builtin_neon_vrndn_v:
  case NEON::BI__builtin_neon_vrndnq_v: {
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_roundeven
              : Intrinsic::roundeven;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
  }
  case NEON::BI__builtin_neon_vrndns_f32: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_roundeven
              : Intrinsic::roundeven;
    return EmitNeonCall(CGM.getIntrinsic(Int, FloatTy), Ops, "vrndn");
  }
  case NEON::BI__builtin_neon_vrndph_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_ceil
              : Intrinsic::ceil;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
  }
  case NEON::BI__builtin_neon_vrndp_v:
  case NEON::BI__builtin_neon_vrndpq_v: {
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_ceil
              : Intrinsic::ceil;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
  }
  case NEON::BI__builtin_neon_vrndxh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_rint
              : Intrinsic::rint;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
  }
  case NEON::BI__builtin_neon_vrndx_v:
  case NEON::BI__builtin_neon_vrndxq_v: {
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_rint
              : Intrinsic::rint;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
  }
  case NEON::BI__builtin_neon_vrndh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_trunc
              : Intrinsic::trunc;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
  }
  case NEON::BI__builtin_neon_vrnd32x_f32:
  case NEON::BI__builtin_neon_vrnd32xq_f32:
  case NEON::BI__builtin_neon_vrnd32x_f64:
  case NEON::BI__builtin_neon_vrnd32xq_f64: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::aarch64_neon_frint32x;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32x");
  }
  case NEON::BI__builtin_neon_vrnd32z_f32:
  case NEON::BI__builtin_neon_vrnd32zq_f32:
  case NEON::BI__builtin_neon_vrnd32z_f64:
  case NEON::BI__builtin_neon_vrnd32zq_f64: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::aarch64_neon_frint32z;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32z");
  }
  case NEON::BI__builtin_neon_vrnd64x_f32:
  case NEON::BI__builtin_neon_vrnd64xq_f32:
  case NEON::BI__builtin_neon_vrnd64x_f64:
  case NEON::BI__builtin_neon_vrnd64xq_f64: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::aarch64_neon_frint64x;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64x");
  }
  case NEON::BI__builtin_neon_vrnd64z_f32:
  case NEON::BI__builtin_neon_vrnd64zq_f32:
  case NEON::BI__builtin_neon_vrnd64z_f64:
  case NEON::BI__builtin_neon_vrnd64zq_f64: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::aarch64_neon_frint64z;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64z");
  }
  case NEON::BI__builtin_neon_vrnd_v:
  case NEON::BI__builtin_neon_vrndq_v: {
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_trunc
              : Intrinsic::trunc;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
  }
  case NEON::BI__builtin_neon_vcvt_f64_v:
  case NEON::BI__builtin_neon_vcvtq_f64_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
    return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
                : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
  case NEON::BI__builtin_neon_vcvt_f64_f32: {
    assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
           "unexpected vcvt_f64_f32 builtin");
    NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
    Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));

    return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
  }
  case NEON::BI__builtin_neon_vcvt_f32_f64: {
    assert(Type.getEltType() == NeonTypeFlags::Float32 &&
           "unexpected vcvt_f32_f64 builtin");
    NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
    Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));

    return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
  }
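  // The FP-to-integer conversions below encode the rounding mode in the
  // builtin name and map 1:1 onto the corresponding AArch64 intrinsics:
  //   vcvtz -> fcvtz{s,u} (toward zero)    vcvta -> fcvta{s,u} (ties away)
  //   vcvtm -> fcvtm{s,u} (toward -inf)    vcvtn -> fcvtn{s,u} (ties to even)
  //   vcvtp -> fcvtp{s,u} (toward +inf)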
  case NEON::BI__builtin_neon_vcvt_s32_v:
  case NEON::BI__builtin_neon_vcvt_u32_v:
  case NEON::BI__builtin_neon_vcvt_s64_v:
  case NEON::BI__builtin_neon_vcvt_u64_v:
  case NEON::BI__builtin_neon_vcvt_s16_f16:
  case NEON::BI__builtin_neon_vcvt_u16_f16:
  case NEON::BI__builtin_neon_vcvtq_s32_v:
  case NEON::BI__builtin_neon_vcvtq_u32_v:
  case NEON::BI__builtin_neon_vcvtq_s64_v:
  case NEON::BI__builtin_neon_vcvtq_u64_v:
  case NEON::BI__builtin_neon_vcvtq_s16_f16:
  case NEON::BI__builtin_neon_vcvtq_u16_f16: {
    Int =
        usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs;
    llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)};
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz");
  }
  case NEON::BI__builtin_neon_vcvta_s16_f16:
  case NEON::BI__builtin_neon_vcvta_u16_f16:
  case NEON::BI__builtin_neon_vcvta_s32_v:
  case NEON::BI__builtin_neon_vcvtaq_s16_f16:
  case NEON::BI__builtin_neon_vcvtaq_s32_v:
  case NEON::BI__builtin_neon_vcvta_u32_v:
  case NEON::BI__builtin_neon_vcvtaq_u16_f16:
  case NEON::BI__builtin_neon_vcvtaq_u32_v:
  case NEON::BI__builtin_neon_vcvta_s64_v:
  case NEON::BI__builtin_neon_vcvtaq_s64_v:
  case NEON::BI__builtin_neon_vcvta_u64_v:
  case NEON::BI__builtin_neon_vcvtaq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
  }
  case NEON::BI__builtin_neon_vcvtm_s16_f16:
  case NEON::BI__builtin_neon_vcvtm_s32_v:
  case NEON::BI__builtin_neon_vcvtmq_s16_f16:
  case NEON::BI__builtin_neon_vcvtmq_s32_v:
  case NEON::BI__builtin_neon_vcvtm_u16_f16:
  case NEON::BI__builtin_neon_vcvtm_u32_v:
  case NEON::BI__builtin_neon_vcvtmq_u16_f16:
  case NEON::BI__builtin_neon_vcvtmq_u32_v:
  case NEON::BI__builtin_neon_vcvtm_s64_v:
  case NEON::BI__builtin_neon_vcvtmq_s64_v:
  case NEON::BI__builtin_neon_vcvtm_u64_v:
  case NEON::BI__builtin_neon_vcvtmq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
  }
  case NEON::BI__builtin_neon_vcvtn_s16_f16:
  case NEON::BI__builtin_neon_vcvtn_s32_v:
  case NEON::BI__builtin_neon_vcvtnq_s16_f16:
  case NEON::BI__builtin_neon_vcvtnq_s32_v:
  case NEON::BI__builtin_neon_vcvtn_u16_f16:
  case NEON::BI__builtin_neon_vcvtn_u32_v:
  case NEON::BI__builtin_neon_vcvtnq_u16_f16:
  case NEON::BI__builtin_neon_vcvtnq_u32_v:
  case NEON::BI__builtin_neon_vcvtn_s64_v:
  case NEON::BI__builtin_neon_vcvtnq_s64_v:
  case NEON::BI__builtin_neon_vcvtn_u64_v:
  case NEON::BI__builtin_neon_vcvtnq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
  }
  case NEON::BI__builtin_neon_vcvtp_s16_f16:
  case NEON::BI__builtin_neon_vcvtp_s32_v:
  case NEON::BI__builtin_neon_vcvtpq_s16_f16:
  case NEON::BI__builtin_neon_vcvtpq_s32_v:
  case NEON::BI__builtin_neon_vcvtp_u16_f16:
  case NEON::BI__builtin_neon_vcvtp_u32_v:
  case NEON::BI__builtin_neon_vcvtpq_u16_f16:
  case NEON::BI__builtin_neon_vcvtpq_u32_v:
  case NEON::BI__builtin_neon_vcvtp_s64_v:
  case NEON::BI__builtin_neon_vcvtpq_s64_v:
  case NEON::BI__builtin_neon_vcvtp_u64_v:
  case NEON::BI__builtin_neon_vcvtpq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
  }
  case NEON::BI__builtin_neon_vmulx_v:
  case NEON::BI__builtin_neon_vmulxq_v: {
    Int = Intrinsic::aarch64_neon_fmulx;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
  }
  case NEON::BI__builtin_neon_vmulxh_lane_f16:
  case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
    // vmulx_lane should be mapped to Neon scalar mulx after
    // extracting the scalar element
    Ops.push_back(EmitScalarExpr(E->getArg(2)));
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
    Ops.pop_back();
    Int = Intrinsic::aarch64_neon_fmulx;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
  }
  case NEON::BI__builtin_neon_vmul_lane_v:
  case NEON::BI__builtin_neon_vmul_laneq_v: {
    // v1f64 vmul_lane should be mapped to Neon scalar mul lane
    bool Quad = false;
    if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
      Quad = true;
    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
    llvm::FixedVectorType *VTy =
        GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
    Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
    return Builder.CreateBitCast(Result, Ty);
  }
  case NEON::BI__builtin_neon_vnegd_s64:
    return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
  case NEON::BI__builtin_neon_vnegh_f16:
    return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
  case NEON::BI__builtin_neon_vpmaxnm_v:
  case NEON::BI__builtin_neon_vpmaxnmq_v: {
    Int = Intrinsic::aarch64_neon_fmaxnmp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
  }
  case NEON::BI__builtin_neon_vpminnm_v:
  case NEON::BI__builtin_neon_vpminnmq_v: {
    Int = Intrinsic::aarch64_neon_fminnmp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
  }
  case NEON::BI__builtin_neon_vsqrth_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_sqrt
              : Intrinsic::sqrt;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
  }
  case NEON::BI__builtin_neon_vsqrt_v:
  case NEON::BI__builtin_neon_vsqrtq_v: {
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_sqrt
              : Intrinsic::sqrt;
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
  }
  case NEON::BI__builtin_neon_vrbit_v:
  case NEON::BI__builtin_neon_vrbitq_v: {
    Int = Intrinsic::bitreverse;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
  }
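  // The across-vector reductions below all share one shape: the reduction
  // intrinsic is overloaded on <result type, source vector type>, produces
  // an i32 even for i8/i16 elements, and the result is truncated back to
  // the element width. E.g. vaddv_u8 is emitted roughly as:
  //   %r32 = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a)
  //   %r   = trunc i32 %r32 to i8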
  case NEON::BI__builtin_neon_vaddv_u8:
    // FIXME: These are handled by the AArch64 scalar code.
    usgn = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vaddv_s8: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vaddv_u16:
    usgn = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vaddv_s16: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddvq_u8:
    usgn = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vaddvq_s8: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vaddvq_u16:
    usgn = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vaddvq_s16: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_u8: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_u16: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_u8: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_u16: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_s8: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_s16: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_s8: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_s16: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_f16: {
    Int = Intrinsic::aarch64_neon_fmaxv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vmaxvq_f16: {
    Int = Intrinsic::aarch64_neon_fmaxv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminv_u8: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminv_u16: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vminvq_u8: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminvq_u16: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vminv_s8: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminv_s16: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vminvq_s8: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminvq_s16: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vminv_f16: {
    Int = Intrinsic::aarch64_neon_fminv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminvq_f16: {
    Int = Intrinsic::aarch64_neon_fminv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vmaxnmv_f16: {
    Int = Intrinsic::aarch64_neon_fmaxnmv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vmaxnmvq_f16: {
    Int = Intrinsic::aarch64_neon_fmaxnmv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminnmv_f16: {
    Int = Intrinsic::aarch64_neon_fminnmv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminnmvq_f16: {
    Int = Intrinsic::aarch64_neon_fminnmv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vmul_n_f64: {
    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
    Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
    return Builder.CreateFMul(Ops[0], RHS);
  }
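  // vaddlv ("add long across vector") widens before accumulating: i8 lanes
  // accumulate into an i16 result and i16 lanes into i32. The intrinsic
  // still returns i32 in IR either way, which is why only the 8-bit
  // variants below need the final trunc.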
  case NEON::BI__builtin_neon_vaddlv_u8: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlv_u16: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlvq_u8: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlvq_u16: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlv_s8: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlv_s16: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlvq_s8: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlvq_s16: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vsri_n_v:
  case NEON::BI__builtin_neon_vsriq_n_v: {
    Int = Intrinsic::aarch64_neon_vsri;
    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
    return EmitNeonCall(Intrin, Ops, "vsri_n");
  }
  case NEON::BI__builtin_neon_vsli_n_v:
  case NEON::BI__builtin_neon_vsliq_n_v: {
    Int = Intrinsic::aarch64_neon_vsli;
    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
    return EmitNeonCall(Intrin, Ops, "vsli_n");
  }
  case NEON::BI__builtin_neon_vsra_n_v:
  case NEON::BI__builtin_neon_vsraq_n_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
    return Builder.CreateAdd(Ops[0], Ops[1]);
  case NEON::BI__builtin_neon_vrsra_n_v:
  case NEON::BI__builtin_neon_vrsraq_n_v: {
    Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl;
    SmallVector<llvm::Value*,2> TmpOps;
    TmpOps.push_back(Ops[1]);
    TmpOps.push_back(Ops[2]);
    Function* F = CGM.getIntrinsic(Int, Ty);
    llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
    return Builder.CreateAdd(Ops[0], tmp);
  }
  case NEON::BI__builtin_neon_vld1_v:
  case NEON::BI__builtin_neon_vld1q_v: {
    return Builder.CreateAlignedLoad(VTy, Ops[0], PtrOp0.getAlignment());
  }
  case NEON::BI__builtin_neon_vst1_v:
  case NEON::BI__builtin_neon_vst1q_v:
    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
    return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
  case NEON::BI__builtin_neon_vld1_lane_v:
  case NEON::BI__builtin_neon_vld1q_lane_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
                                       PtrOp0.getAlignment());
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
  }
  case NEON::BI__builtin_neon_vldap1_lane_s64:
  case NEON::BI__builtin_neon_vldap1q_lane_s64: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    llvm::LoadInst *LI = Builder.CreateAlignedLoad(
        VTy->getElementType(), Ops[0], PtrOp0.getAlignment());
    LI->setAtomic(llvm::AtomicOrdering::Acquire);
    Ops[0] = LI;
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vldap1_lane");
  }
  case NEON::BI__builtin_neon_vld1_dup_v:
  case NEON::BI__builtin_neon_vld1q_dup_v: {
    Value *V = PoisonValue::get(Ty);
    Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
                                       PtrOp0.getAlignment());
    llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
    Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
    return EmitNeonSplat(Ops[0], CI);
  }
  case NEON::BI__builtin_neon_vst1_lane_v:
  case NEON::BI__builtin_neon_vst1q_lane_v:
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
    return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
  case NEON::BI__builtin_neon_vstl1_lane_s64:
  case NEON::BI__builtin_neon_vstl1q_lane_s64: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
    llvm::StoreInst *SI =
        Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
    SI->setAtomic(llvm::AtomicOrdering::Release);
    return SI;
  }
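  // The structured loads below return an IR aggregate ({<n x T>, ...});
  // Ops[0] is the address of the builtin's sret-style result temporary, so
  // the aggregate is simply stored to it. E.g. vld2_s8 is emitted roughly as:
  //   %s = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %p)
  //   store { <8 x i8>, <8 x i8> } %s, ptr %ret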
  case NEON::BI__builtin_neon_vld2_v:
  case NEON::BI__builtin_neon_vld2q_v: {
    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_v:
  case NEON::BI__builtin_neon_vld3q_v: {
    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_v:
  case NEON::BI__builtin_neon_vld4q_v: {
    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld2_dup_v:
  case NEON::BI__builtin_neon_vld2q_dup_v: {
    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_dup_v:
  case NEON::BI__builtin_neon_vld3q_dup_v: {
    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_dup_v:
  case NEON::BI__builtin_neon_vld4q_dup_v: {
    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld2_lane_v:
  case NEON::BI__builtin_neon_vld2q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2lane, Tys);
    std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld2_lane");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_lane_v:
  case NEON::BI__builtin_neon_vld3q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3lane, Tys);
    std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
    Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld3_lane");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_lane_v:
  case NEON::BI__builtin_neon_vld4q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4lane, Tys);
    std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
    Ops[5] = Builder.CreateZExt(Ops[5], Int64Ty);
    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld4_lane");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
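  // For the structured stores, the builtin's operands arrive as
  // (ptr, v1, v2, ...[, lane]) but the aarch64.neon.stN[lane] intrinsics
  // take the pointer last, hence the std::rotate. E.g. vst2_s8 becomes:
  //   call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %v1, <8 x i8> %v2,
  //                                            ptr %p)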
  case NEON::BI__builtin_neon_vst2_v:
  case NEON::BI__builtin_neon_vst2q_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst2_lane_v:
  case NEON::BI__builtin_neon_vst2q_lane_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
    llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2lane, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst3_v:
  case NEON::BI__builtin_neon_vst3q_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst3_lane_v:
  case NEON::BI__builtin_neon_vst3q_lane_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
    llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3lane, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst4_v:
  case NEON::BI__builtin_neon_vst4q_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst4_lane_v:
  case NEON::BI__builtin_neon_vst4q_lane_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
    llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4lane, Tys),
                        Ops, "");
  }
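  // vtrn/vuzp/vzip store their two result vectors through the sret pointer
  // in Ops[0]; the two shufflevector masks are built per half. For
  // 4-element vectors the masks come out as:
  //   vtrn -> {0,4,2,6} and {1,5,3,7}   (transpose)
  //   vuzp -> {0,2,4,6} and {1,3,5,7}   (unzip even/odd lanes)
  //   vzip -> {0,4,1,5} and {2,6,3,7}   (interleave low/high halves)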
  case NEON::BI__builtin_neon_vtrn_v:
  case NEON::BI__builtin_neon_vtrnq_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Value *SV = nullptr;

    for (unsigned vi = 0; vi != 2; ++vi) {
      SmallVector<int, 16> Indices;
      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
        Indices.push_back(i+vi);
        Indices.push_back(i+e+vi);
      }
      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
    }
    return SV;
  }
  case NEON::BI__builtin_neon_vuzp_v:
  case NEON::BI__builtin_neon_vuzpq_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Value *SV = nullptr;

    for (unsigned vi = 0; vi != 2; ++vi) {
      SmallVector<int, 16> Indices;
      for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
        Indices.push_back(2*i+vi);

      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
    }
    return SV;
  }
  case NEON::BI__builtin_neon_vzip_v:
  case NEON::BI__builtin_neon_vzipq_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Value *SV = nullptr;

    for (unsigned vi = 0; vi != 2; ++vi) {
      SmallVector<int, 16> Indices;
      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
        Indices.push_back((i + vi*e) >> 1);
        Indices.push_back(((i + vi*e) >> 1)+e);
      }
      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
    }
    return SV;
  }
  case NEON::BI__builtin_neon_vqtbl1q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl1, Ty),
                        Ops, "vtbl1");
  }
  case NEON::BI__builtin_neon_vqtbl2q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl2, Ty),
                        Ops, "vtbl2");
  }
  case NEON::BI__builtin_neon_vqtbl3q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl3, Ty),
                        Ops, "vtbl3");
  }
  case NEON::BI__builtin_neon_vqtbl4q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl4, Ty),
                        Ops, "vtbl4");
  }
  case NEON::BI__builtin_neon_vqtbx1q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx1, Ty),
                        Ops, "vtbx1");
  }
  case NEON::BI__builtin_neon_vqtbx2q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx2, Ty),
                        Ops, "vtbx2");
  }
  case NEON::BI__builtin_neon_vqtbx3q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx3, Ty),
                        Ops, "vtbx3");
  }
  case NEON::BI__builtin_neon_vqtbx4q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx4, Ty),
                        Ops, "vtbx4");
  }
  case NEON::BI__builtin_neon_vsqadd_v:
  case NEON::BI__builtin_neon_vsqaddq_v: {
    Int = Intrinsic::aarch64_neon_usqadd;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
  }
  case NEON::BI__builtin_neon_vuqadd_v:
  case NEON::BI__builtin_neon_vuqaddq_v: {
    Int = Intrinsic::aarch64_neon_suqadd;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
  }

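  // The LUTI builtins below are lookup-table reads with a lane-selected
  // index vector. The intrinsics are overloaded on <result, source> types;
  // the second overload type is what distinguishes the 64-bit (_lane)
  // source forms from the 128-bit (_laneq) ones.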
  case NEON::BI__builtin_neon_vluti2_laneq_mf8:
  case NEON::BI__builtin_neon_vluti2_laneq_bf16:
  case NEON::BI__builtin_neon_vluti2_laneq_f16:
  case NEON::BI__builtin_neon_vluti2_laneq_p16:
  case NEON::BI__builtin_neon_vluti2_laneq_p8:
  case NEON::BI__builtin_neon_vluti2_laneq_s16:
  case NEON::BI__builtin_neon_vluti2_laneq_s8:
  case NEON::BI__builtin_neon_vluti2_laneq_u16:
  case NEON::BI__builtin_neon_vluti2_laneq_u8: {
    Int = Intrinsic::aarch64_neon_vluti2_laneq;
    llvm::Type *Tys[2];
    Tys[0] = Ty;
    Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                             /*isQuad*/ false));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq");
  }
  case NEON::BI__builtin_neon_vluti2q_laneq_mf8:
  case NEON::BI__builtin_neon_vluti2q_laneq_bf16:
  case NEON::BI__builtin_neon_vluti2q_laneq_f16:
  case NEON::BI__builtin_neon_vluti2q_laneq_p16:
  case NEON::BI__builtin_neon_vluti2q_laneq_p8:
  case NEON::BI__builtin_neon_vluti2q_laneq_s16:
  case NEON::BI__builtin_neon_vluti2q_laneq_s8:
  case NEON::BI__builtin_neon_vluti2q_laneq_u16:
  case NEON::BI__builtin_neon_vluti2q_laneq_u8: {
    Int = Intrinsic::aarch64_neon_vluti2_laneq;
    llvm::Type *Tys[2];
    Tys[0] = Ty;
    Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                             /*isQuad*/ true));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq");
  }
  case NEON::BI__builtin_neon_vluti2_lane_mf8:
  case NEON::BI__builtin_neon_vluti2_lane_bf16:
  case NEON::BI__builtin_neon_vluti2_lane_f16:
  case NEON::BI__builtin_neon_vluti2_lane_p16:
  case NEON::BI__builtin_neon_vluti2_lane_p8:
  case NEON::BI__builtin_neon_vluti2_lane_s16:
  case NEON::BI__builtin_neon_vluti2_lane_s8:
  case NEON::BI__builtin_neon_vluti2_lane_u16:
  case NEON::BI__builtin_neon_vluti2_lane_u8: {
    Int = Intrinsic::aarch64_neon_vluti2_lane;
    llvm::Type *Tys[2];
    Tys[0] = Ty;
    Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                             /*isQuad*/ false));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane");
  }
  case NEON::BI__builtin_neon_vluti2q_lane_mf8:
  case NEON::BI__builtin_neon_vluti2q_lane_bf16:
  case NEON::BI__builtin_neon_vluti2q_lane_f16:
  case NEON::BI__builtin_neon_vluti2q_lane_p16:
  case NEON::BI__builtin_neon_vluti2q_lane_p8:
  case NEON::BI__builtin_neon_vluti2q_lane_s16:
  case NEON::BI__builtin_neon_vluti2q_lane_s8:
  case NEON::BI__builtin_neon_vluti2q_lane_u16:
  case NEON::BI__builtin_neon_vluti2q_lane_u8: {
    Int = Intrinsic::aarch64_neon_vluti2_lane;
    llvm::Type *Tys[2];
    Tys[0] = Ty;
    Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                             /*isQuad*/ true));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane");
  }
  case NEON::BI__builtin_neon_vluti4q_lane_mf8:
  case NEON::BI__builtin_neon_vluti4q_lane_p8:
  case NEON::BI__builtin_neon_vluti4q_lane_s8:
  case NEON::BI__builtin_neon_vluti4q_lane_u8: {
    Int = Intrinsic::aarch64_neon_vluti4q_lane;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane");
  }
  case NEON::BI__builtin_neon_vluti4q_laneq_mf8:
  case NEON::BI__builtin_neon_vluti4q_laneq_p8:
  case NEON::BI__builtin_neon_vluti4q_laneq_s8:
  case NEON::BI__builtin_neon_vluti4q_laneq_u8: {
    Int = Intrinsic::aarch64_neon_vluti4q_laneq;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq");
  }
  case NEON::BI__builtin_neon_vluti4q_lane_bf16_x2:
  case NEON::BI__builtin_neon_vluti4q_lane_f16_x2:
  case NEON::BI__builtin_neon_vluti4q_lane_p16_x2:
  case NEON::BI__builtin_neon_vluti4q_lane_s16_x2:
  case NEON::BI__builtin_neon_vluti4q_lane_u16_x2: {
    Int = Intrinsic::aarch64_neon_vluti4q_lane_x2;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane_x2");
  }
  case NEON::BI__builtin_neon_vluti4q_laneq_bf16_x2:
  case NEON::BI__builtin_neon_vluti4q_laneq_f16_x2:
  case NEON::BI__builtin_neon_vluti4q_laneq_p16_x2:
  case NEON::BI__builtin_neon_vluti4q_laneq_s16_x2:
  case NEON::BI__builtin_neon_vluti4q_laneq_u16_x2: {
    Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
  }
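  // The mf8 builtins below model the FP8 conversions; their trailing fpm
  // argument (carried through to the EmitFP8* helpers) selects the FP8
  // format. The _low/_high variants convert one half of a 128-bit source,
  // with ExtractLow marking the low-half forms.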
  case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
    ExtractLow = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
  case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
                              llvm::FixedVectorType::get(BFloatTy, 8),
                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
  case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
    ExtractLow = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
  case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
                              llvm::FixedVectorType::get(BFloatTy, 8),
                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
  case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
    ExtractLow = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
  case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
                              llvm::FixedVectorType::get(HalfTy, 8),
                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
  case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
    ExtractLow = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
  case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
                              llvm::FixedVectorType::get(HalfTy, 8),
                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
  case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
                              llvm::FixedVectorType::get(Int8Ty, 8),
                              Ops[0]->getType(), false, Ops, E, "vfcvtn");
  case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
                              llvm::FixedVectorType::get(Int8Ty, 8),
                              llvm::FixedVectorType::get(HalfTy, 4), false, Ops,
                              E, "vfcvtn");
  case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
                              llvm::FixedVectorType::get(Int8Ty, 16),
                              llvm::FixedVectorType::get(HalfTy, 8), false, Ops,
                              E, "vfcvtn");
  case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
    llvm::Type *Ty = llvm::FixedVectorType::get(Int8Ty, 16);
    Ops[0] = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
                                        uint64_t(0));
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2, Ty,
                              Ops[1]->getType(), false, Ops, E, "vfcvtn2");
  }

  case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy,
                               Ops, E, "fdot2");
  case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
    ExtendLaneArg = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
                               ExtendLaneArg, HalfTy, Ops, E, "fdot2_lane");
  case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
                               FloatTy, Ops, E, "fdot4");
  case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
    ExtendLaneArg = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
                               ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane");

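  // The FP8 multiply-accumulate builtins below follow the FMLAL naming:
  // fmlalb/fmlalt widen the bottom/top FP8 elements and accumulate into f16
  // lanes, while the fmlallbb/lbt/ltb/ltt variants combine bottom/top
  // selections and accumulate into f32 lanes. As with the dot products,
  // ExtendLaneArg marks the 64-bit (_lane) index-vector forms.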
  case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalb,
                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
                           "vmlal");
  case NEON::BI__builtin_neon_vmlaltq_f16_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalt,
                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
                           "vmlal");
  case NEON::BI__builtin_neon_vmlallbbq_f32_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbb,
                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
                           "vmlall");
  case NEON::BI__builtin_neon_vmlallbtq_f32_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbt,
                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
                           "vmlall");
  case NEON::BI__builtin_neon_vmlalltbq_f32_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltb,
                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
                           "vmlall");
  case NEON::BI__builtin_neon_vmlallttq_f32_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltt,
                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
                           "vmlall");
  case NEON::BI__builtin_neon_vmlalbq_lane_f16_mf8_fpm:
    ExtendLaneArg = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vmlalbq_laneq_f16_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalb_lane,
                               ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane");
  case NEON::BI__builtin_neon_vmlaltq_lane_f16_mf8_fpm:
    ExtendLaneArg = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vmlaltq_laneq_f16_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalt_lane,
                               ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane");
  case NEON::BI__builtin_neon_vmlallbbq_lane_f32_mf8_fpm:
    ExtendLaneArg = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vmlallbbq_laneq_f32_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbb_lane,
                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
  case NEON::BI__builtin_neon_vmlallbtq_lane_f32_mf8_fpm:
    ExtendLaneArg = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vmlallbtq_laneq_f32_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbt_lane,
                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
  case NEON::BI__builtin_neon_vmlalltbq_lane_f32_mf8_fpm:
    ExtendLaneArg = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vmlalltbq_laneq_f32_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltb_lane,
                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
  case NEON::BI__builtin_neon_vmlallttq_lane_f32_mf8_fpm:
    ExtendLaneArg = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vmlallttq_laneq_f32_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltt_lane,
                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
  case NEON::BI__builtin_neon_vamin_f16:
  case NEON::BI__builtin_neon_vaminq_f16:
  case NEON::BI__builtin_neon_vamin_f32:
  case NEON::BI__builtin_neon_vaminq_f32:
  case NEON::BI__builtin_neon_vaminq_f64: {
    Int = Intrinsic::aarch64_neon_famin;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "famin");
  }
  case NEON::BI__builtin_neon_vamax_f16:
  case NEON::BI__builtin_neon_vamaxq_f16:
  case NEON::BI__builtin_neon_vamax_f32:
  case NEON::BI__builtin_neon_vamaxq_f32:
  case NEON::BI__builtin_neon_vamaxq_f64: {
    Int = Intrinsic::aarch64_neon_famax;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "famax");
  }
  case NEON::BI__builtin_neon_vscale_f16:
  case NEON::BI__builtin_neon_vscaleq_f16:
  case NEON::BI__builtin_neon_vscale_f32:
  case NEON::BI__builtin_neon_vscaleq_f32:
  case NEON::BI__builtin_neon_vscaleq_f64: {
    Int = Intrinsic::aarch64_neon_fp8_fscale;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fscale");
  }
  }
}
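
// Lowers the BPF CO-RE builtins. Illustrative usage (an assumption mirroring
// the libbpf CO-RE helper macros, not something defined in this file):
//   unsigned Off = __builtin_preserve_field_info(s->f, 0 /* byte offset */);
// Each builtin becomes an llvm.bpf.* intrinsic call; some carry debuginfo
// metadata (MD_preserve_access_index) consumed by later BPF passes.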
Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID,
                                           const CallExpr *E) {
  assert((BuiltinID == BPF::BI__builtin_preserve_field_info ||
          BuiltinID == BPF::BI__builtin_btf_type_id ||
          BuiltinID == BPF::BI__builtin_preserve_type_info ||
          BuiltinID == BPF::BI__builtin_preserve_enum_value) &&
         "unexpected BPF builtin");

  // A sequence number, injected into the IR builtin calls, to prevent CSE
  // when the only difference between two calls may be the attached debuginfo
  // metadata.
  static uint32_t BuiltinSeqNum;

  switch (BuiltinID) {
  default:
    llvm_unreachable("Unexpected BPF builtin");
  case BPF::BI__builtin_preserve_field_info: {
    const Expr *Arg = E->getArg(0);
    bool IsBitField = Arg->IgnoreParens()->getObjectKind() == OK_BitField;

    if (!getDebugInfo()) {
      CGM.Error(E->getExprLoc(),
                "using __builtin_preserve_field_info() without -g");
      return IsBitField ? EmitLValue(Arg).getRawBitFieldPointer(*this)
                        : EmitLValue(Arg).emitRawPointer(*this);
    }

    // Enable underlying preserve_*_access_index() generation.
    bool OldIsInPreservedAIRegion = IsInPreservedAIRegion;
    IsInPreservedAIRegion = true;
    Value *FieldAddr = IsBitField ? EmitLValue(Arg).getRawBitFieldPointer(*this)
                                  : EmitLValue(Arg).emitRawPointer(*this);
    IsInPreservedAIRegion = OldIsInPreservedAIRegion;

    ConstantInt *C = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
    Value *InfoKind = ConstantInt::get(Int64Ty, C->getSExtValue());

    // Build the IR for the preserve_field_info intrinsic.
    llvm::Function *FnGetFieldInfo = Intrinsic::getOrInsertDeclaration(
        &CGM.getModule(), Intrinsic::bpf_preserve_field_info,
        {FieldAddr->getType()});
    return Builder.CreateCall(FnGetFieldInfo, {FieldAddr, InfoKind});
  }
  case BPF::BI__builtin_btf_type_id:
  case BPF::BI__builtin_preserve_type_info: {
    if (!getDebugInfo()) {
      CGM.Error(E->getExprLoc(), "using builtin function without -g");
      return nullptr;
    }

    const Expr *Arg0 = E->getArg(0);
    llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
        Arg0->getType(), Arg0->getExprLoc());

    ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
    Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
    Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);

    llvm::Function *FnDecl;
    if (BuiltinID == BPF::BI__builtin_btf_type_id)
      FnDecl = Intrinsic::getOrInsertDeclaration(
          &CGM.getModule(), Intrinsic::bpf_btf_type_id, {});
    else
      FnDecl = Intrinsic::getOrInsertDeclaration(
          &CGM.getModule(), Intrinsic::bpf_preserve_type_info, {});
    CallInst *Fn = Builder.CreateCall(FnDecl, {SeqNumVal, FlagValue});
    Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
    return Fn;
  }
  case BPF::BI__builtin_preserve_enum_value: {
    if (!getDebugInfo()) {
      CGM.Error(E->getExprLoc(), "using builtin function without -g");
      return nullptr;
    }

    const Expr *Arg0 = E->getArg(0);
    llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
        Arg0->getType(), Arg0->getExprLoc());
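
    // Hedged illustration of the expected argument shape (mirroring libbpf's
    // enum-value helper macros, not anything defined in this file):
    //   __builtin_preserve_enum_value(*(enum Fruit *)ORANGE, Flag);
    // i.e. a dereference of a C-style cast of an enumerator reference, which
    // is unwrapped step by step below.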
    // Find the enumerator.
    const auto *UO = cast<UnaryOperator>(Arg0->IgnoreParens());
    const auto *CE = cast<CStyleCastExpr>(UO->getSubExpr());
    const auto *DR = cast<DeclRefExpr>(CE->getSubExpr());
    const auto *Enumerator = cast<EnumConstantDecl>(DR->getDecl());

    auto InitVal = Enumerator->getInitVal();
    std::string InitValStr;
    if (InitVal.isNegative() || InitVal > uint64_t(INT64_MAX))
      InitValStr = std::to_string(InitVal.getSExtValue());
    else
      InitValStr = std::to_string(InitVal.getZExtValue());
    std::string EnumStr = Enumerator->getNameAsString() + ":" + InitValStr;
    Value *EnumStrVal = Builder.CreateGlobalString(EnumStr);

    ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
    Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
    Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);

    llvm::Function *IntrinsicFn = Intrinsic::getOrInsertDeclaration(
        &CGM.getModule(), Intrinsic::bpf_preserve_enum_value, {});
    CallInst *Fn =
        Builder.CreateCall(IntrinsicFn, {SeqNumVal, EnumStrVal, FlagValue});
    Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
    return Fn;
  }
  }
}

llvm::Value *CodeGenFunction::BuildVector(ArrayRef<llvm::Value *> Ops) {
  assert((Ops.size() & (Ops.size() - 1)) == 0 &&
         "Not a power-of-two sized vector!");
  bool AllConstants = true;
  for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
    AllConstants &= isa<Constant>(Ops[i]);

  // If this is a constant vector, create a ConstantVector.
  if (AllConstants) {
    SmallVector<llvm::Constant *, 16> CstOps;
    for (llvm::Value *Op : Ops)
      CstOps.push_back(cast<Constant>(Op));
    return llvm::ConstantVector::get(CstOps);
  }

  // Otherwise, insertelement the values to build the vector.
  Value *Result = llvm::PoisonValue::get(
      llvm::FixedVectorType::get(Ops[0]->getType(), Ops.size()));

  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
    Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt64(i));

  return Result;
}

Value *CodeGenFunction::EmitAArch64CpuInit() {
  llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
  llvm::FunctionCallee Func =
      CGM.CreateRuntimeFunction(FTy, "__init_cpu_features_resolver");
  cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
  cast<llvm::GlobalValue>(Func.getCallee())
      ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
  return Builder.CreateCall(Func);
}

Value *CodeGenFunction::EmitAArch64CpuSupports(const CallExpr *E) {
  const Expr *ArgExpr = E->getArg(0)->IgnoreParenCasts();
  StringRef ArgStr = cast<StringLiteral>(ArgExpr)->getString();
  llvm::SmallVector<StringRef, 8> Parts;
  ArgStr.split(Parts, "+");
  // Collect the parsed names into a separate vector rather than appending to
  // the vector being iterated, which could reallocate mid-loop.
  llvm::SmallVector<StringRef, 8> Features;
  for (StringRef Feature : Parts) {
    Feature = Feature.trim();
    if (!llvm::AArch64::parseFMVExtension(Feature))
      return Builder.getFalse();
    if (Feature != "default")
      Features.push_back(Feature);
  }
  return EmitAArch64CpuSupports(Features);
}
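
// Hedged example of the source form handled above:
//   if (__builtin_cpu_supports("sve2+bf16")) { ... }
// The argument is split on '+'; any name that fails FMV parsing folds the
// whole call to a constant false.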
llvm::Value *
CodeGenFunction::EmitAArch64CpuSupports(ArrayRef<StringRef> FeaturesStrs) {
  uint64_t FeaturesMask = llvm::AArch64::getCpuSupportsMask(FeaturesStrs);
  Value *Result = Builder.getTrue();
  if (FeaturesMask != 0) {
    // Get features from the structure in the runtime library:
    //   struct {
    //     unsigned long long features;
    //   } __aarch64_cpu_features;
    llvm::Type *STy = llvm::StructType::get(Int64Ty);
    llvm::Constant *AArch64CPUFeatures =
        CGM.CreateRuntimeVariable(STy, "__aarch64_cpu_features");
    cast<llvm::GlobalValue>(AArch64CPUFeatures)->setDSOLocal(true);
    llvm::Value *CpuFeatures = Builder.CreateGEP(
        STy, AArch64CPUFeatures,
        {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 0)});
    Value *Features = Builder.CreateAlignedLoad(Int64Ty, CpuFeatures,
                                                CharUnits::fromQuantity(8));
    Value *Mask = Builder.getInt64(FeaturesMask);
    Value *Bitset = Builder.CreateAnd(Features, Mask);
    Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
    Result = Builder.CreateAnd(Result, Cmp);
  }
  return Result;
}
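
// The emitted IR is equivalent to this C sketch (assuming the runtime
// library's definition of the global, as in the comment above):
//   extern struct { unsigned long long features; } __aarch64_cpu_features;
//   bool Ok = (__aarch64_cpu_features.features & FeaturesMask) == FeaturesMask;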