//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64LegalizerInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
#include <initializer_list>

#define DEBUG_TYPE "aarch64-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
    : ST(&ST) {
  using namespace TargetOpcode;
  const LLT p0 = LLT::pointer(0, 64);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s128 = LLT::scalar(128);
  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s8 = LLT::fixed_vector(8, 8);
  const LLT v4s8 = LLT::fixed_vector(4, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s16 = LLT::fixed_vector(4, 16);
  const LLT v2s16 = LLT::fixed_vector(2, 16);
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

  std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                        v16s8, v8s16, v4s32,
                                                        v2s64, v2p0,
                                                        /* End 128bit types */
                                                        /* Begin 64bit types */
                                                        v8s8, v4s16, v2s32};
  std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
  SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
  SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);

  const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();

  // FIXME: support subtargets which have neon/fp-armv8 disabled.
  if (!ST.hasNEON() || !ST.hasFPARMv8()) {
    getLegacyLegalizerInfo().computeTables();
    return;
  }

  // Some instructions only support s16 if the subtarget has full 16-bit FP
  // support.
  const bool HasFP16 = ST.hasFullFP16();
  const LLT &MinFPScalar = HasFP16 ? s16 : s32;

  const bool HasCSSC = ST.hasCSSC();
  const bool HasRCPC3 = ST.hasRCPC3();

  getActionDefinitionsBuilder(
      {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
      .legalFor({p0, s8, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64)
      .fewerElementsIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].isVector() &&
                   (Query.Types[0].getElementType() != s64 ||
                    Query.Types[0].getNumElements() != 2);
          },
          [=](const LegalityQuery &Query) {
            LLT EltTy = Query.Types[0].getElementType();
            if (EltTy == s64)
              return std::make_pair(0, LLT::fixed_vector(2, 64));
            return std::make_pair(0, EltTy);
          });

  getActionDefinitionsBuilder(G_PHI)
      .legalFor({p0, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .clampScalar(0, s16, s64)
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2);

  getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({s32, s64, v4s32, v2s32, v2s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
      .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 2;
          },
          0, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 4;
          },
          0, s16)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 16;
          },
          0, s8)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
      .customIf([=](const LegalityQuery &Query) {
        const auto &SrcTy = Query.Types[0];
        const auto &AmtTy = Query.Types[1];
        return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
               AmtTy.getSizeInBits() == 32;
      })
      .legalFor({
          {s32, s32},
          {s32, s64},
          {s64, s64},
          {v8s8, v8s8},
          {v16s8, v16s8},
          {v4s16, v4s16},
          {v8s16, v8s16},
          {v2s32, v2s32},
          {v4s32, v4s32},
          {v2s64, v2s64},
      })
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_PTR_ADD)
      .legalFor({{p0, s64}, {v2p0, v2s64}})
      .clampScalar(1, s64, s64);

  getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});

  getActionDefinitionsBuilder({G_SDIV, G_UDIV})
      .legalFor({s32, s64})
      .libcallFor({s128})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      .scalarize(0);

  getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
      .widenScalarOrEltToNextPow2(0)
      .clampScalarOrElt(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);
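
  // There is no scalar multiply-with-overflow instruction, so widen these to
  // a 32- or 64-bit scalar and let the generic lowering expand the overflow
  // check.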
  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64)
      .lower();

  getActionDefinitionsBuilder({G_SMULH, G_UMULH})
      .legalFor({s64, v8s16, v16s8, v4s32})
      .lower();

  auto &MinMaxActions = getActionDefinitionsBuilder(
      {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
  if (HasCSSC)
    MinMaxActions
        .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
        // Make the clamping conditional on the CSSC extension: without legal
        // types we lower to CMP, which can fold one of the two sxtb's we'd
        // otherwise need if we detect a type smaller than 32-bit.
        .minScalar(0, s32);
  else
    MinMaxActions
        .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
  MinMaxActions
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      // FIXME: This shouldn't be needed as v2s64 types are going to
      // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
      .clampNumElements(0, v2s64, v2s64)
      .lower();

  getActionDefinitionsBuilder(
      {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
      .legalFor({{s32, s32}, {s64, s32}})
      .clampScalar(0, s32, s64)
      .clampScalar(1, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG,
                               G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM,
                               G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR,
                               G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC,
                               G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
      .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[0];
        return (Ty == v8s16 || Ty == v4s16) && HasFP16;
      })
      .libcallFor({s128})
      .minScalarOrElt(0, MinFPScalar)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder(G_FREM)
      .libcallFor({s32, s64})
      .minScalar(0, s32)
      .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_LRINT)
      // If we don't have full FP16 support, then scalarize the elements of
      // vectors containing fp16 types.
      .fewerElementsIf(
          [=, &ST](const LegalityQuery &Query) {
            const auto &Ty = Query.Types[0];
            return Ty.isVector() && Ty.getElementType() == s16 &&
                   !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
      // If we don't have full FP16 support, then widen s16 to s32 if we
      // encounter it.
      .widenScalarIf(
          [=, &ST](const LegalityQuery &Query) {
            return Query.Types[0] == s16 && !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
      .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});

  getActionDefinitionsBuilder(
      {G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10,
       G_FEXP, G_FEXP2, G_FEXP10})
      // We need a call for these, so we always need to scalarize.
      .scalarize(0)
      // Regardless of FP16 support, widen 16-bit elements to 32-bits.
      .minScalar(0, s32)
      .libcallFor({s32, s64});
  getActionDefinitionsBuilder(G_FPOWI)
      .scalarize(0)
      .minScalar(0, s32)
      .libcallFor({{s32, s32}, {s64, s32}});
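
  // Scalar G_INSERT/G_EXTRACT are only legal when moving a strictly smaller
  // scalar into or out of a 32-bit, 64-bit or pointer-sized container;
  // anything else is widened or clamped below until it takes that form.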
  getActionDefinitionsBuilder(G_INSERT)
      .legalIf(all(typeInSet(0, {s32, s64, p0}),
                   typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .minScalar(1, s8)
      .maxScalarIf(typeInSet(0, {s32}), 1, s16)
      .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);

  getActionDefinitionsBuilder(G_EXTRACT)
      .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
                   typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalar(0, s16)
      .maxScalarIf(typeInSet(1, {s32}), 0, s16)
      .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
      .maxScalarIf(typeInSet(1, {s128}), 0, s64);

  for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
    auto &Actions = getActionDefinitionsBuilder(Op);

    if (Op == G_SEXTLOAD)
      Actions.lowerIf(
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));

    // Atomics have zero extending behavior.
    Actions
        .legalForTypesWithMemDesc({{s32, p0, s8, 8},
                                   {s32, p0, s16, 8},
                                   {s32, p0, s32, 8},
                                   {s64, p0, s8, 2},
                                   {s64, p0, s16, 2},
                                   {s64, p0, s32, 4},
                                   {s64, p0, s64, 8},
                                   {p0, p0, s64, 8},
                                   {v2s32, p0, s64, 8}})
        .widenScalarToNextPow2(0)
        .clampScalar(0, s32, s64)
        // TODO: We could support sum-of-pow2's but the lowering code doesn't
        // know how to do that yet.
        .unsupportedIfMemSizeNotPow2()
        // Lower anything left over into G_*EXT and G_LOAD
        .lower();
  }

  auto IsPtrVecPred = [=](const LegalityQuery &Query) {
    const LLT &ValTy = Query.Types[0];
    if (!ValTy.isVector())
      return false;
    const LLT EltTy = ValTy.getElementType();
    return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
  };

  getActionDefinitionsBuilder(G_LOAD)
      .customIf([=](const LegalityQuery &Query) {
        return HasRCPC3 && Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
      })
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s64, 8},
                                 {p0, p0, s64, 8},
                                 {s128, p0, s128, 8},
                                 {v8s8, p0, s64, 8},
                                 {v16s8, p0, s128, 8},
                                 {v4s16, p0, s64, 8},
                                 {v8s16, p0, s128, 8},
                                 {v2s32, p0, s64, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      // These extends are also legal
      .legalForTypesWithMemDesc(
          {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
      .widenScalarToNextPow2(0, /* MinSize = */ 8)
      .lowerIfMemSizeNotByteSizePow2()
      .clampScalar(0, s8, s64)
      .narrowScalarIf(
          [=](const LegalityQuery &Query) {
            // Clamp extending load results to 32-bits.
            return Query.Types[0].isScalar() &&
                   Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
                   Query.Types[0].getSizeInBits() > 32;
          },
          changeTo(0, s32))
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeIs(0, v2s16), 0);

  getActionDefinitionsBuilder(G_STORE)
      .customIf([=](const LegalityQuery &Query) {
        return HasRCPC3 && Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
      })
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc(
          {{s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
           {s32, p0, s8, 8},                       // truncstorei8 from s32
           {s64, p0, s8, 8},                       // truncstorei8 from s64
           {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
           {s64, p0, s16, 8},                      // truncstorei16 from s64
           {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
           {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
           {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
           {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
      .clampScalar(0, s8, s64)
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isScalar() &&
               Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .lowerIfMemSizeNotPow2()
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeIs(0, v2s16), 0);

  getActionDefinitionsBuilder(G_INDEXED_STORE)
      // Idx 0 == Ptr, Idx 1 == Val
      // TODO: we can implement legalizations but as of now these are
      // generated in a very specific way.
      .legalForTypesWithMemDesc({
          {p0, s8, s8, 8},
          {p0, s16, s16, 8},
          {p0, s32, s8, 8},
          {p0, s32, s16, 8},
          {p0, s32, s32, 8},
          {p0, s64, s64, 8},
          {p0, p0, p0, 8},
          {p0, v8s8, v8s8, 8},
          {p0, v16s8, v16s8, 8},
          {p0, v4s16, v4s16, 8},
          {p0, v8s16, v8s16, 8},
          {p0, v2s32, v2s32, 8},
          {p0, v4s32, v4s32, 8},
          {p0, v2s64, v2s64, 8},
          {p0, v2p0, v2p0, 8},
          {p0, s128, s128, 8},
      })
      .unsupported();

  auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
    LLT LdTy = Query.Types[0];
    LLT PtrTy = Query.Types[1];
    if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
        !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
      return false;
    if (PtrTy != p0)
      return false;
    return true;
  };
  getActionDefinitionsBuilder(G_INDEXED_LOAD)
      .unsupportedIf(
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalIf(IndexedLoadBasicPred)
      .unsupported();
  getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
      .unsupportedIf(
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalIf(all(typeInSet(0, {s16, s32, s64}),
                   LegalityPredicate([=](const LegalityQuery &Q) {
                     LLT LdTy = Q.Types[0];
                     LLT PtrTy = Q.Types[1];
                     LLT MemTy = Q.MMODescrs[0].MemoryTy;
                     if (PtrTy != p0)
                       return false;
                     if (LdTy == s16)
                       return MemTy == s8;
                     if (LdTy == s32)
                       return MemTy == s8 || MemTy == s16;
                     if (LdTy == s64)
                       return MemTy == s8 || MemTy == s16 || MemTy == s32;
                     return false;
                   })))
      .unsupported();

  // Constants
  getActionDefinitionsBuilder(G_CONSTANT)
      .legalFor({p0, s8, s16, s32, s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64);
  getActionDefinitionsBuilder(G_FCONSTANT)
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[0];
        if (HasFP16 && Ty == s16)
          return true;
        return Ty == s32 || Ty == s64 || Ty == s128;
      })
      .clampScalar(0, MinFPScalar, s128);

  // FIXME: fix moreElementsToNextPow2
  getActionDefinitionsBuilder(G_ICMP)
      .legalFor({{s32, s32},
                 {s32, s64},
                 {s32, p0},
                 {v4s32, v4s32},
                 {v2s32, v2s32},
                 {v2s64, v2s64},
                 {v2s64, v2p0},
                 {v4s16, v4s16},
                 {v8s16, v8s16},
                 {v8s8, v8s8},
                 {v16s8, v16s8}})
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s32)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
                   Ty.getElementType() != SrcTy.getElementType();
          },
          0, 1)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
          1, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
          s64)
      .moreElementsToNextPow2(0)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_FCMP)
      // If we don't have full FP16 support, then scalarize the elements of
      // vectors containing fp16 types.
      .fewerElementsIf(
          [=](const LegalityQuery &Query) {
            const auto &Ty = Query.Types[0];
            return Ty.isVector() && Ty.getElementType() == s16 && !HasFP16;
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
      // If we don't have full FP16 support, then widen s16 to s32 if we
      // encounter it.
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0] == s16 && !HasFP16;
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
      .legalFor({{s16, s16},
                 {s32, s32},
                 {s32, s64},
                 {v4s32, v4s32},
                 {v2s32, v2s32},
                 {v2s64, v2s64},
                 {v4s16, v4s16},
                 {v8s16, v8s16}})
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s32)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
                   Ty.getElementType() != SrcTy.getElementType();
          },
          0, 1)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(1, s64, 2);

  // Extensions
  auto ExtLegalFunc = [=](const LegalityQuery &Query) {
    unsigned DstSize = Query.Types[0].getSizeInBits();

    // Handle legal vectors using legalFor
    if (Query.Types[0].isVector())
      return false;

    if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
      return false; // Extending to a scalar s128 needs narrowing.

    const LLT &SrcTy = Query.Types[1];

    // Make sure we fit in a register otherwise. Don't bother checking that
    // the source type is below 128 bits. We shouldn't be allowing anything
    // through which is wider than the destination in the first place.
    unsigned SrcSize = SrcTy.getSizeInBits();
    if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
      return false;

    return true;
  };
  getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
      .legalIf(ExtLegalFunc)
      .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
      .clampScalar(0, s64, s64) // Just for s128, others are handled above.
      .moreElementsToNextPow2(1)
      .clampMaxNumElements(1, s8, 8)
      .clampMaxNumElements(1, s16, 4)
      .clampMaxNumElements(1, s32, 2)
      // Tries to convert a large EXTEND into two smaller EXTENDs
      .lowerIf([=](const LegalityQuery &Query) {
        return (Query.Types[0].getScalarSizeInBits() >
                Query.Types[1].getScalarSizeInBits() * 2) &&
               Query.Types[0].isVector() &&
               (Query.Types[1].getScalarSizeInBits() == 8 ||
                Query.Types[1].getScalarSizeInBits() == 16);
      });
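
  // Vector truncates are only directly legal down to 64-bit results; wider
  // sources are clamped or split in stages below, and whatever remains
  // (notably all scalar truncates) is trivially legal.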
  getActionDefinitionsBuilder(G_TRUNC)
      .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
      .moreElementsToNextPow2(0)
      .clampMaxNumElements(0, s8, 8)
      .clampMaxNumElements(0, s16, 4)
      .clampMaxNumElements(0, s32, 2)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
          0, s8)
      .lowerIf([=](const LegalityQuery &Query) {
        LLT DstTy = Query.Types[0];
        LLT SrcTy = Query.Types[1];
        return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
               DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
      })

      .alwaysLegal();

  getActionDefinitionsBuilder(G_SEXT_INREG)
      .legalFor({s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .maxScalar(0, s64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .lower();

  // FP conversions
  getActionDefinitionsBuilder(G_FPTRUNC)
      .legalFor(
          {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
      .clampNumElements(0, v4s16, v4s16)
      .clampNumElements(0, v2s32, v2s32)
      .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
      .legalFor(
          {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .scalarize(0);

  // Conversions
  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
      .legalIf([=](const LegalityQuery &Query) {
        return HasFP16 &&
               (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
                Query.Types[1] == v8s16) &&
               (Query.Types[0] == s32 || Query.Types[0] == s64 ||
                Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
      })
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .clampScalarOrElt(1, MinFPScalar, s64)
      .moreElementsToNextPow2(0)
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() >
                   Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(1, 0))
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() <
                   Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(0, 1))
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
      .legalIf([=](const LegalityQuery &Query) {
        return HasFP16 &&
               (Query.Types[0] == s16 || Query.Types[0] == v4s16 ||
                Query.Types[0] == v8s16) &&
               (Query.Types[1] == s32 || Query.Types[1] == s64 ||
                Query.Types[1] == v4s16 || Query.Types[1] == v8s16);
      })
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s64)
      .widenScalarToNextPow2(0)
      .clampScalarOrElt(0, MinFPScalar, s64)
      .moreElementsToNextPow2(0)
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() <
                   Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(0, 1))
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() >
                   Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(1, 0))
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2);

  // Control-flow
  getActionDefinitionsBuilder(G_BRCOND)
      .legalFor({s32})
      .clampScalar(0, s32, s32);
  getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});

  getActionDefinitionsBuilder(G_SELECT)
      .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampScalar(1, s32, s32)
      .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
      .lowerIf(isVector(0));

  // Pointer-handling
  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});

  if (TM.getCodeModel() == CodeModel::Small)
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
  else
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalFor({{s64, p0}, {v2s64, v2p0}})
      .widenScalarToNextPow2(0, 64)
      .clampScalar(0, s64, s64);

  getActionDefinitionsBuilder(G_INTTOPTR)
      .unsupportedIf([&](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
      })
      .legalFor({{p0, s64}, {v2p0, v2s64}});

  // Casts for 32 and 64-bit width type are just copies.
  // Same for 128-bit width type, except they are on the FPR bank.
  getActionDefinitionsBuilder(G_BITCAST)
      // FIXME: This is wrong since G_BITCAST is not allowed to change the
      // number of bits but it's what the previous code described and fixing
      // it breaks tests.
      .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
                                 v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
                                 v2p0});

  getActionDefinitionsBuilder(G_VASTART).legalFor({p0});

  // va_list must be a pointer, but most sized types are pretty easy to handle
  // as the destination.
  getActionDefinitionsBuilder(G_VAARG)
      .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
      .clampScalar(0, s8, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
      .lowerIf(
          all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));

  LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) {
    return ST.outlineAtomics() && !ST.hasLSE();
  };

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
                   predNot(UseOutlineAtomics)))
      .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics)))
      .customIf([UseOutlineAtomics](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() == 128 &&
               !UseOutlineAtomics(Query);
      })
      .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0),
                     UseOutlineAtomics))
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
                               G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
                               G_ATOMICRMW_XOR})
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
                   predNot(UseOutlineAtomics)))
      .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
                     UseOutlineAtomics))
      .clampScalar(0, s32, s64);

  // Do not outline these atomics operations, as per comment in
  // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
  getActionDefinitionsBuilder(
      {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    getActionDefinitionsBuilder(Op)
        .widenScalarToNextPow2(LitTyIdx, 8)
        .widenScalarToNextPow2(BigTyIdx, 32)
        .clampScalar(LitTyIdx, s8, s64)
        .clampScalar(BigTyIdx, s32, s128)
        .legalIf([=](const LegalityQuery &Q) {
          switch (Q.Types[BigTyIdx].getSizeInBits()) {
          case 32:
          case 64:
          case 128:
            break;
          default:
            return false;
          }
          switch (Q.Types[LitTyIdx].getSizeInBits()) {
          case 8:
          case 16:
          case 32:
          case 64:
            return true;
          default:
            return false;
          }
        });
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
      .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      })
      .minScalar(2, s64)
      .customIf([=](const LegalityQuery &Query) {
        const LLT &VecTy = Query.Types[1];
        return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
               VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
               VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0;
      })
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            // We want to promote <M x s1> to <M x s64> if that wouldn't
            // cause the total vec size to be > 128b.
            return Query.Types[1].getNumElements() <= 2;
          },
          0, s64)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 4;
          },
          0, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 8;
          },
          0, s16)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 16;
          },
          0, s8)
      .minScalarOrElt(0, s8) // Worst case, we need at least s8.
      .moreElementsToNextPow2(1)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, p0, 2);

  getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
      .legalIf(typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64}))
      .widenVectorEltsToVectorMinSize(0, 64);

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
      .legalFor({{v8s8, s8},
                 {v16s8, s8},
                 {v4s16, s16},
                 {v8s16, s16},
                 {v2s32, s32},
                 {v4s32, s32},
                 {v2p0, p0},
                 {v2s64, s64}})
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarOrElt(0, s8)
      .widenVectorEltsToVectorMinSize(0, 64)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();

  getActionDefinitionsBuilder(G_CTLZ)
      .legalForCartesianProduct(
          {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .scalarize(1)
      .widenScalarToNextPow2(1, /*Min=*/32)
      .clampScalar(1, s32, s64)
      .scalarSameSizeAs(0, 1);
  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();

  // TODO: Custom lowering for v2s32, v4s32, v2s64.
  getActionDefinitionsBuilder(G_BITREVERSE)
      .legalFor({s32, s64, v8s8, v16s8})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();

  getActionDefinitionsBuilder(G_CTTZ)
      .lowerIf(isVector(0))
      .widenScalarToNextPow2(1, /*Min=*/32)
      .clampScalar(1, s32, s64)
      .scalarSameSizeAs(0, 1)
      .legalIf([=](const LegalityQuery &Query) {
        return (HasCSSC && typeInSet(0, {s32, s64})(Query));
      })
      .customIf([=](const LegalityQuery &Query) {
        return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
      });

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &DstTy = Query.Types[0];
        const LLT &SrcTy = Query.Types[1];
        // For now just support the TBL2 variant which needs the source vectors
        // to be the same size as the dest.
        if (DstTy != SrcTy)
          return false;
        return llvm::is_contained(
            {v2s64, v2p0, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
      })
      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
      // just want those lowered into G_BUILD_VECTOR
      .lowerIf([=](const LegalityQuery &Query) {
        return !Query.Types[1].isVector();
      })
      .moreElementsIf(
          [](const LegalityQuery &Query) {
            return Query.Types[0].isVector() && Query.Types[1].isVector() &&
                   Query.Types[0].getNumElements() >
                       Query.Types[1].getNumElements();
          },
          changeTo(1, 0))
      .moreElementsToNextPow2(0)
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsIf(
          [](const LegalityQuery &Query) {
            return Query.Types[0].isVector() && Query.Types[1].isVector() &&
                   Query.Types[0].getNumElements() <
                       Query.Types[1].getNumElements();
          },
          changeTo(0, 1));

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});

  getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});

  getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});

  getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();

  getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();

  if (ST.hasMOPS()) {
    // G_BZERO is not supported. Currently it is only emitted by
    // PreLegalizerCombiner for G_MEMSET with zero constant.
    getActionDefinitionsBuilder(G_BZERO).unsupported();

    getActionDefinitionsBuilder(G_MEMSET)
        .legalForCartesianProduct({p0}, {s64}, {s64})
        .customForCartesianProduct({p0}, {s8}, {s64})
        .immIdx(0); // Inform verifier imm idx 0 is handled.

    getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
        .legalForCartesianProduct({p0}, {p0}, {s64})
        .immIdx(0); // Inform verifier imm idx 0 is handled.

    // G_MEMCPY_INLINE does not have a tailcall immediate
    getActionDefinitionsBuilder(G_MEMCPY_INLINE)
        .legalForCartesianProduct({p0}, {p0}, {s64});

  } else {
    getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
        .libcall();
  }

  // FIXME: Legal vector types are only legal with NEON.
  auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
  if (HasCSSC)
    ABSActions
        .legalFor({s32, s64});
  ABSActions
      .legalFor(PackedVectorAllTypeList)
      .lowerIf(isScalar(0));

  // For fadd reductions we have pairwise operations available. We treat the
  // usual legal types as legal and handle the lowering to pairwise instructions
  // later.
  getActionDefinitionsBuilder(G_VECREDUCE_FADD)
      .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[1];
        return (Ty == v4s16 || Ty == v8s16) && HasFP16;
      })
      .minScalarOrElt(0, MinFPScalar)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .lower();

  // For fmul reductions we need to split up into individual operations. We
  // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of
  // smaller types, followed by scalarizing what remains.
  getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
      .minScalarOrElt(0, MinFPScalar)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, s32, 2)
      .clampMaxNumElements(1, s16, 4)
      .scalarize(1)
      .lower();

  getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
      .scalarize(2)
      .lower();

  getActionDefinitionsBuilder(G_VECREDUCE_ADD)
      .legalFor({{s8, v16s8},
                 {s8, v8s8},
                 {s16, v8s16},
                 {s16, v4s16},
                 {s32, v4s32},
                 {s32, v2s32},
                 {s64, v2s64}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, s8, 16)
      .lower();

  getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
                               G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
      .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[1];
        return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16;
      })
      .minScalarOrElt(0, MinFPScalar)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .lower();

  getActionDefinitionsBuilder(G_VECREDUCE_MUL)
      .clampMaxNumElements(1, s32, 2)
      .clampMaxNumElements(1, s16, 4)
      .clampMaxNumElements(1, s8, 8)
      .scalarize(1)
      .lower();

  getActionDefinitionsBuilder(
      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
      .legalFor({{s8, v8s8},
                 {s8, v16s8},
                 {s16, v4s16},
                 {s16, v8s16},
                 {s32, v2s32},
                 {s32, v4s32}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, s8, 16)
      .scalarize(1)
      .lower();

  getActionDefinitionsBuilder(
      {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
      // Try to break down into smaller vectors as long as they're at least 64
      // bits. This lets us use vector operations for some parts of the
      // reduction.
      .fewerElementsIf(
          [=](const LegalityQuery &Q) {
            LLT SrcTy = Q.Types[1];
            if (SrcTy.isScalar())
              return false;
            if (!isPowerOf2_32(SrcTy.getNumElements()))
              return false;
            // We can usually perform 64b vector operations.
            return SrcTy.getSizeInBits() > 64;
          },
          [=](const LegalityQuery &Q) {
            LLT SrcTy = Q.Types[1];
            return std::make_pair(1, SrcTy.divide(2));
          })
      .scalarize(1)
      .lower();

  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });

  getActionDefinitionsBuilder({G_FSHL, G_FSHR})
      .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
      .lower();

  getActionDefinitionsBuilder(G_ROTR)
      .legalFor({{s32, s64}, {s64, s64}})
      .customIf([=](const LegalityQuery &Q) {
        return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
      })
      .lower();
  getActionDefinitionsBuilder(G_ROTL).lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .customFor({{s32, s32}, {s64, s64}});

  auto always = [=](const LegalityQuery &Q) { return true; };
  auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
  if (HasCSSC)
    CTPOPActions
        .legalFor({{s32, s32},
                   {s64, s64},
                   {v8s8, v8s8},
                   {v16s8, v16s8}})
        .customFor({{s128, s128},
                    {v2s64, v2s64},
                    {v2s32, v2s32},
                    {v4s32, v4s32},
                    {v4s16, v4s16},
                    {v8s16, v8s16}});
  else
    CTPOPActions
        .legalFor({{v8s8, v8s8},
                   {v16s8, v16s8}})
        .customFor({{s32, s32},
                    {s64, s64},
                    {s128, s128},
                    {v2s64, v2s64},
                    {v2s32, v2s32},
                    {v4s32, v4s32},
                    {v4s16, v4s16},
                    {v8s16, v8s16}});
  CTPOPActions
      .clampScalar(0, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalarEltSameAsIf(always, 1, 0)
      .maxScalarEltSameAsIf(always, 1, 0);

  // TODO: Vector types.
  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));

  // TODO: Libcall support for s128.
  // TODO: s16 should be legal with full FP16 support.
  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .legalFor({{s64, s32}, {s64, s64}});

  // TODO: Custom legalization for vector types.
  // TODO: Custom legalization for mismatched types.
  // TODO: s16 support.
  getActionDefinitionsBuilder(G_FCOPYSIGN).customFor({{s32, s32}, {s64, s64}});

  getActionDefinitionsBuilder(G_FMAD).lower();

  // Access to floating-point environment.
  getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
                               G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
      .libcall();

  getActionDefinitionsBuilder(G_IS_FPCLASS).lower();

  getActionDefinitionsBuilder(G_PREFETCH).custom();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AArch64LegalizerInfo::legalizeCustom(
    LegalizerHelper &Helper, MachineInstr &MI,
    LostDebugLocObserver &LocObserver) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;
  switch (MI.getOpcode()) {
  default:
    // No idea what to do.
    return false;
  case TargetOpcode::G_VAARG:
    return legalizeVaArg(MI, MRI, MIRBuilder);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_STORE:
    return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR:
    return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_SBFX:
  case TargetOpcode::G_UBFX:
    return legalizeBitfieldExtract(MI, MRI, Helper);
  case TargetOpcode::G_FSHL:
  case TargetOpcode::G_FSHR:
    return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
  case TargetOpcode::G_ROTR:
    return legalizeRotate(MI, MRI, Helper);
  case TargetOpcode::G_CTPOP:
    return legalizeCTPOP(MI, MRI, Helper);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpxchg128(MI, MRI, Helper);
  case TargetOpcode::G_CTTZ:
    return legalizeCTTZ(MI, Helper);
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET:
    return legalizeMemOps(MI, Helper);
  case TargetOpcode::G_FCOPYSIGN:
    return legalizeFCopySign(MI, Helper);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, Helper);
  case TargetOpcode::G_DYN_STACKALLOC:
    return legalizeDynStackAlloc(MI, Helper);
  case TargetOpcode::G_PREFETCH:
    return legalizePrefetch(MI, Helper);
  }

  llvm_unreachable("expected switch to return");
}

bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
                                               MachineRegisterInfo &MRI,
                                               MachineIRBuilder &MIRBuilder,
                                               GISelChangeObserver &Observer,
                                               LegalizerHelper &Helper) const {
  assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
         MI.getOpcode() == TargetOpcode::G_FSHR);

  // Keep as G_FSHR if the shift amount is a G_CONSTANT, else use the generic
  // lowering.
  Register ShiftNo = MI.getOperand(3).getReg();
  LLT ShiftTy = MRI.getType(ShiftNo);
  auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI);

  // Adjust the shift amount according to the opcode (FSHL/FSHR):
  // convert FSHL to FSHR.
  LLT OperationTy = MRI.getType(MI.getOperand(0).getReg());
  APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);

  // Lower non-constant shifts and leave zero shifts to the optimizer.
  if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0)
    return (Helper.lowerFunnelShiftAsShifts(MI) ==
            LegalizerHelper::LegalizeResult::Legalized);

  APInt Amount = VRegAndVal->Value.urem(BitWidth);

  Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;

  // If the instruction is G_FSHR and has a 64-bit G_CONSTANT shift amount in
  // the range 0 <-> BitWidth, it is legal as-is.
  if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
      VRegAndVal->Value.ult(BitWidth))
    return true;

  // Cast the shift amount to a 64-bit type.
  auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64));

  if (MI.getOpcode() == TargetOpcode::G_FSHR) {
    Observer.changingInstr(MI);
    MI.getOperand(3).setReg(Cast64.getReg(0));
    Observer.changedInstr(MI);
  }
  // If the opcode is FSHL, remove the FSHL instruction and create an FSHR
  // instruction.
  else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
    MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()},
                          {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(),
                           Cast64.getReg(0)});
    MI.eraseFromParent();
  }
  return true;
}

bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          LegalizerHelper &Helper) const {
  // To allow for imported patterns to match, we ensure that the rotate amount
  // is 64b with an extension.
  Register AmtReg = MI.getOperand(2).getReg();
  LLT AmtTy = MRI.getType(AmtReg);
  (void)AmtTy;
  assert(AmtTy.isScalar() && "Expected a scalar rotate");
  assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
  auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
  Helper.Observer.changingInstr(MI);
  MI.getOperand(2).setReg(NewAmt.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}

bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
  // G_ADD_LOW instructions.
  // By splitting this here, we can optimize accesses in the small code model by
  // folding in the G_ADD_LOW into the load/store offset.
  auto &GlobalOp = MI.getOperand(1);
  const auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return true; // Don't want to modify TLS vars.

  auto &TM = ST->getTargetLowering()->getTargetMachine();
  unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);

  if (OpFlags & AArch64II::MO_GOT)
    return true;

  auto Offset = GlobalOp.getOffset();
  Register DstReg = MI.getOperand(0).getReg();
  auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
                  .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
  // Set the regclass on the dest reg too.
  MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);

  // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
  // by creating a MOVK that sets bits 48-63 of the register to (global address
  // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  // prevent an incorrect tag being generated during relocation when the
  // global appears before the code section. Without the offset, a global at
  // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
  // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
  // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
  // instead of `0xf`.
  // This assumes that we're in the small code model so we can assume a binary
  // size of <= 4GB, which makes the untagged PC relative offset positive. The
  // binary must also be loaded into address range [0, 2^48). Both of these
  // properties need to be ensured at runtime when using tagged addresses.
  if (OpFlags & AArch64II::MO_TAGGED) {
    assert(!Offset &&
           "Should not have folded in an offset for a tagged global!");
    ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
               .addGlobalAddress(GV, 0x100000000,
                                 AArch64II::MO_PREL | AArch64II::MO_G3)
               .addImm(48);
    MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  }

  MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
      .addGlobalAddress(GV, Offset,
                        OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::vacopy: {
    unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
    unsigned VaListSize =
        (ST->isTargetDarwin() || ST->isTargetWindows())
            ? PtrSize
            : ST->isTargetILP32() ? 20 : 32;

    MachineFunction &MF = *MI.getMF();
    auto Val = MF.getRegInfo().createGenericVirtualRegister(
        LLT::scalar(VaListSize * 8));
    MachineIRBuilder MIB(MI);
    MIB.buildLoad(Val, MI.getOperand(2),
                  *MF.getMachineMemOperand(MachinePointerInfo(),
                                           MachineMemOperand::MOLoad,
                                           VaListSize, Align(PtrSize)));
    MIB.buildStore(Val, MI.getOperand(1),
                   *MF.getMachineMemOperand(MachinePointerInfo(),
                                            MachineMemOperand::MOStore,
                                            VaListSize, Align(PtrSize)));
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::get_dynamic_area_offset: {
    MachineIRBuilder &MIB = Helper.MIRBuilder;
    MIB.buildConstant(MI.getOperand(0).getReg(), 0);
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
    // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
    // the instruction).
    MachineIRBuilder MIB(MI);
    auto &Value = MI.getOperand(3);
    Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ExtValueReg);
    return true;
  }
  case Intrinsic::aarch64_prefetch: {
    MachineIRBuilder MIB(MI);
    auto &AddrVal = MI.getOperand(1);

    int64_t IsWrite = MI.getOperand(2).getImm();
    int64_t Target = MI.getOperand(3).getImm();
    int64_t IsStream = MI.getOperand(4).getImm();
    int64_t IsData = MI.getOperand(5).getImm();

    unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
                     (!IsData << 3) |    // IsDataCache bit
                     (Target << 1) |     // Cache level bits
                     (unsigned)IsStream; // Stream bit

    MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::aarch64_neon_uaddv:
  case Intrinsic::aarch64_neon_saddv:
  case Intrinsic::aarch64_neon_umaxv:
  case Intrinsic::aarch64_neon_smaxv:
  case Intrinsic::aarch64_neon_uminv:
  case Intrinsic::aarch64_neon_sminv: {
    MachineIRBuilder MIB(MI);
    MachineRegisterInfo &MRI = *MIB.getMRI();
    bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
                    IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
                    IntrinsicID == Intrinsic::aarch64_neon_sminv;

    auto OldDst = MI.getOperand(0).getReg();
    auto OldDstTy = MRI.getType(OldDst);
    LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
    if (OldDstTy == NewDstTy)
      return true;

    auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);

    Helper.Observer.changingInstr(MI);
    MI.getOperand(0).setReg(NewDst);
    Helper.Observer.changedInstr(MI);

    MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
    MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
                        OldDst, NewDst);

    return true;
  }
  case Intrinsic::aarch64_neon_uaddlp:
  case Intrinsic::aarch64_neon_saddlp: {
    MachineIRBuilder MIB(MI);

    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
                       ? AArch64::G_UADDLP
                       : AArch64::G_SADDLP;
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
    MI.eraseFromParent();

    return true;
  }
  case Intrinsic::aarch64_neon_uaddlv:
  case Intrinsic::aarch64_neon_saddlv: {
    MachineIRBuilder MIB(MI);
    MachineRegisterInfo &MRI = *MIB.getMRI();

    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
                       ? AArch64::G_UADDLV
                       : AArch64::G_SADDLV;
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(2).getReg();
    LLT DstTy = MRI.getType(DstReg);

    LLT MidTy, ExtTy;
    if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
      MidTy = LLT::fixed_vector(4, 32);
      ExtTy = LLT::scalar(32);
    } else {
      MidTy = LLT::fixed_vector(2, 64);
      ExtTy = LLT::scalar(64);
    }

    Register MidReg =
        MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
    Register ZeroReg =
        MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
    Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
                                     {MidReg, ZeroReg})
                          .getReg(0);

    if (DstTy.getScalarSizeInBits() < 32)
      MIB.buildTrunc(DstReg, ExtReg);
    else
      MIB.buildCopy(DstReg, ExtReg);

    MI.eraseFromParent();

    return true;
  }
  case Intrinsic::aarch64_neon_smax:
  case Intrinsic::aarch64_neon_smin:
  case Intrinsic::aarch64_neon_umax:
  case Intrinsic::aarch64_neon_umin:
  case Intrinsic::aarch64_neon_fmax:
  case Intrinsic::aarch64_neon_fmin:
  case Intrinsic::aarch64_neon_fmaxnm:
  case Intrinsic::aarch64_neon_fminnm: {
    MachineIRBuilder MIB(MI);
    if (IntrinsicID == Intrinsic::aarch64_neon_smax)
      MIB.buildSMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
    else if (IntrinsicID == Intrinsic::aarch64_neon_smin)
      MIB.buildSMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
    else if (IntrinsicID == Intrinsic::aarch64_neon_umax)
      MIB.buildUMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
    else if (IntrinsicID == Intrinsic::aarch64_neon_umin)
      MIB.buildUMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
    else if (IntrinsicID == Intrinsic::aarch64_neon_fmax)
      MIB.buildInstr(TargetOpcode::G_FMAXIMUM, {MI.getOperand(0)},
                     {MI.getOperand(2), MI.getOperand(3)});
    else if (IntrinsicID == Intrinsic::aarch64_neon_fmin)
      MIB.buildInstr(TargetOpcode::G_FMINIMUM, {MI.getOperand(0)},
                     {MI.getOperand(2), MI.getOperand(3)});
    else if (IntrinsicID == Intrinsic::aarch64_neon_fmaxnm)
      MIB.buildInstr(TargetOpcode::G_FMAXNUM, {MI.getOperand(0)},
                     {MI.getOperand(2), MI.getOperand(3)});
    else if (IntrinsicID == Intrinsic::aarch64_neon_fminnm)
      MIB.buildInstr(TargetOpcode::G_FMINNUM, {MI.getOperand(0)},
                     {MI.getOperand(2), MI.getOperand(3)});
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::experimental_vector_reverse:
    // TODO: Add support for vector_reverse
    return false;
  }

  return true;
}

bool AArch64LegalizerInfo::legalizeShlAshrLshr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
         MI.getOpcode() == TargetOpcode::G_LSHR ||
         MI.getOpcode() == TargetOpcode::G_SHL);
  // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
  // imported patterns can select it later. Either way, it will be legal.
  Register AmtReg = MI.getOperand(2).getReg();
  auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
  if (!VRegAndVal)
    return true;
  // Check the shift amount is in range for an immediate form.
  int64_t Amount = VRegAndVal->Value.getSExtValue();
  if (Amount > 31)
    return true; // This will have to remain a register variant.
  auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(ExtCst.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
                                MachineRegisterInfo &MRI) {
  Base = Root;
  Offset = 0;

  Register NewBase;
  int64_t NewOffset;
  if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
      isShiftedInt<7, 3>(NewOffset)) {
    Base = NewBase;
    Offset = NewOffset;
  }
}

// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_STORE ||
         MI.getOpcode() == TargetOpcode::G_LOAD);
  // Here we just try to handle vector loads/stores where our value type might
  // have pointer elements, which the SelectionDAG importer can't handle. To
  // allow the existing patterns for s64 to fire for p0, we just try to bitcast
  // the value to use s64 types.

  // Custom legalization requires that the instruction, if not deleted, be
  // fully legalized. In order to allow further legalization of the inst, we
  // create a new instruction and erase the existing one.

  Register ValReg = MI.getOperand(0).getReg();
  const LLT ValTy = MRI.getType(ValReg);

  if (ValTy == LLT::scalar(128)) {

    AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
    bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
    bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
    bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
    bool IsRcpC3 =
        ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);

    LLT s64 = LLT::scalar(64);

    unsigned Opcode;
    if (IsRcpC3) {
      Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
    } else {
      // For LSE2, loads/stores should have been converted to monotonic and had
      // a fence inserted after them.
      assert(Ordering == AtomicOrdering::Monotonic ||
             Ordering == AtomicOrdering::Unordered);
      assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");

      Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
    }

    MachineInstrBuilder NewI;
    if (IsLoad) {
      NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
      MIRBuilder.buildMergeLikeInstr(
          ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
    } else {
      auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
      NewI = MIRBuilder.buildInstr(
          Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
    }

    if (IsRcpC3) {
      NewI.addUse(MI.getOperand(1).getReg());
    } else {
      Register Base;
      int Offset;
      matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
      NewI.addUse(Base);
      NewI.addImm(Offset / 8);
    }

    NewI.cloneMemRefs(MI);
    constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
                                     *MRI.getTargetRegisterInfo(),
                                     *ST->getRegBankInfo());
    MI.eraseFromParent();
    return true;
  }

  if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
      ValTy.getElementType().getAddressSpace() != 0) {
    LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
    return false;
  }

  unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  auto &MMO = **MI.memoperands_begin();
  MMO.setType(NewTy);

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
    MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  } else {
    auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
    MIRBuilder.buildBitcast(ValReg, NewLoad);
  }
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  Align Alignment(MI.getOperand(2).getImm());
  Register Dst = MI.getOperand(0).getReg();
  Register ListPtr = MI.getOperand(1).getReg();

  LLT PtrTy = MRI.getType(ListPtr);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
  const Align PtrAlign = Align(PtrSize);
  auto List = MIRBuilder.buildLoad(
      PtrTy, ListPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               PtrTy, PtrAlign));

  MachineInstrBuilder DstPtr;
  if (Alignment > PtrAlign) {
    // Realign the list to the actual required alignment.
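    // This is the usual round-up pattern,
    //   DstPtr = (List + Alignment - 1) & ~(Alignment - 1)
    // with the masking expressed as a G_PTRMASK of the low Log2(Alignment)
    // bits.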
    auto AlignMinus1 =
        MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
    auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
    DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
  } else
    DstPtr = List;

  LLT ValTy = MRI.getType(Dst);
  uint64_t ValSize = ValTy.getSizeInBits() / 8;
  MIRBuilder.buildLoad(
      Dst, DstPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               ValTy, std::max(Alignment, PtrAlign)));

  auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));

  auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));

  MIRBuilder.buildStore(NewList, ListPtr,
                        *MF.getMachineMemOperand(MachinePointerInfo(),
                                                 MachineMemOperand::MOStore,
                                                 PtrTy, PtrAlign));

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeBitfieldExtract(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  // Only legal if we can select immediate forms.
  // TODO: Lower this otherwise.
  return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
         getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
}

bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // When there is no integer popcount instruction (FEAT_CSSC isn't available),
  // it can be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
  // registers are cheap.
  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
  //
  // For 128 bit vector popcounts, we lower to the following sequence:
  //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.4s v0, v0  //        v4s32, v2s64
  //  uaddlp.2d v0, v0  //               v2s64
  //
  // For 64 bit vector popcounts, we lower to the following sequence:
  //  cnt.8b    v0, v0  // v4s16, v2s32
  //  uaddlp.4h v0, v0  // v4s16, v2s32
  //  uaddlp.2s v0, v0  //        v2s32

  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Val);
  unsigned Size = Ty.getSizeInBits();

  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");

  if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
    LLT s64 = LLT::scalar(64);

    auto Split = MIRBuilder.buildUnmerge(s64, Val);
    auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
    auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
    auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);

    MIRBuilder.buildZExt(Dst, Add);
    MI.eraseFromParent();
    return true;
  }

  if (!ST->hasNEON() ||
      MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
    // Use generic lowering when custom lowering is not possible.
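    // (LegalizerHelper::lowerBitCount falls back to the generic mask-and-add
    // popcount expansion; we only accept it for 32- and 64-bit scalars and
    // report failure for anything else.)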
    return Ty.isScalar() && (Size == 32 || Size == 64) &&
           Helper.lowerBitCount(MI) ==
               LegalizerHelper::LegalizeResult::Legalized;
  }

  // Pre-conditioning: widen Val up to the nearest vector type.
  // s32,s64,v4s16,v2s32 -> v8i8
  // v8s16,v4s32,v2s64 -> v16i8
  LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  if (Ty.isScalar()) {
    assert((Size == 32 || Size == 64 || Size == 128) &&
           "Expected only 32, 64, or 128 bit scalars!");
    if (Size == 32) {
      Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
    }
  }
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.
  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
    HSum = UADD.getReg(0);
  }

  // Post-conditioning.
  if (Ty.isScalar() && (Size == 64 || Size == 128))
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up
    // with the rest of the MIR so we must reassemble the extracted registers
    // into a 128-bit known-regclass one with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because
    // LDXP/STXP can take arbitrary registers, so it just has the normal GPR64
    // operands that the rest of AArch64 is expecting.
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
  }

  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
                                        LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
  auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
  MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
                                          LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;

  // The tagged version, MOPSMemorySetTagged, is legalized in
  // legalizeIntrinsic.
  if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
    // Anyext the value being set to 64 bit (only the bottom 8 bits are read
    // by the instruction).
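    // (This custom action is only used for the MOPS memset instructions,
    // which take the store value in a 64-bit GPR.)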
    auto &Value = MI.getOperand(1);
    Register ExtValueReg =
        MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ExtValueReg);
    return true;
  }

  return false;
}

bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI,
                                             LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy.isScalar() && "Only expected scalars right now!");
  const unsigned DstSize = DstTy.getSizeInBits();
  assert((DstSize == 32 || DstSize == 64) && "Unexpected dst type!");
  assert(MRI.getType(MI.getOperand(2).getReg()) == DstTy &&
         "Expected homogeneous types!");

  // We want to materialize a mask with the high bit set.
  uint64_t EltMask;
  LLT VecTy;

  // TODO: s16 support.
  switch (DstSize) {
  default:
    llvm_unreachable("Unexpected type for G_FCOPYSIGN!");
  case 64: {
    // AdvSIMD immediate moves cannot materialize our mask in a single
    // instruction for 64-bit elements. Instead, materialize zero and then
    // negate it.
    EltMask = 0;
    VecTy = LLT::fixed_vector(2, DstTy);
    break;
  }
  case 32:
    EltMask = 0x80000000ULL;
    VecTy = LLT::fixed_vector(4, DstTy);
    break;
  }

  // Widen In1 and In2 to 128 bits. We want these to eventually become
  // INSERT_SUBREGs.
  auto Undef = MIRBuilder.buildUndef(VecTy);
  auto Zero = MIRBuilder.buildConstant(DstTy, 0);
  auto Ins1 = MIRBuilder.buildInsertVectorElement(
      VecTy, Undef, MI.getOperand(1).getReg(), Zero);
  auto Ins2 = MIRBuilder.buildInsertVectorElement(
      VecTy, Undef, MI.getOperand(2).getReg(), Zero);

  // Construct the mask.
  auto Mask = MIRBuilder.buildConstant(VecTy, EltMask);
  if (DstSize == 64)
    Mask = MIRBuilder.buildFNeg(VecTy, Mask);

  auto Sel = MIRBuilder.buildInstr(AArch64::G_BSP, {VecTy}, {Mask, Ins2, Ins1});

  // Build an unmerge whose 0th elt is the original G_FCOPYSIGN destination. We
  // want this to eventually become an EXTRACT_SUBREG.
  SmallVector<Register, 2> DstRegs(1, Dst);
  for (unsigned I = 1, E = VecTy.getNumElements(); I < E; ++I)
    DstRegs.push_back(MRI.createGenericVirtualRegister(DstTy));
  MIRBuilder.buildUnmerge(DstRegs, Sel);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeExtractVectorElt(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
  auto VRegAndVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (VRegAndVal)
    return true;
  return Helper.lowerExtractInsertVectorElt(MI) !=
         LegalizerHelper::LegalizeResult::UnableToLegalize;
}

bool AArch64LegalizerInfo::legalizeDynStackAlloc(
    MachineInstr &MI, LegalizerHelper &Helper) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  // If stack probing is not enabled for this function, use the default
  // lowering.
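  // ("probe-stack"="inline-asm" is how inline stack probing is requested; any
  // other value, or no attribute at all, keeps the ordinary SP-subtraction
  // lowering.)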
  if (!MF.getFunction().hasFnAttribute("probe-stack") ||
      MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
          "inline-asm") {
    Helper.lowerDynStackAlloc(MI);
    return true;
  }

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
         "Unexpected type for dynamic alloca");
  assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
         "Unexpected type for dynamic alloca");

  LLT PtrTy = MRI.getType(Dst);
  Register SPReg =
      Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
  Register SPTmp =
      Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
  auto NewMI =
      MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
  MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
  MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
                                            LegalizerHelper &Helper) const {
  MachineIRBuilder &MIB = Helper.MIRBuilder;
  auto &AddrVal = MI.getOperand(0);

  int64_t IsWrite = MI.getOperand(1).getImm();
  int64_t Locality = MI.getOperand(2).getImm();
  int64_t IsData = MI.getOperand(3).getImm();

  bool IsStream = Locality == 0;
  if (Locality != 0) {
    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The IR locality hint is highest for the closest cache, while the PRFM
    // target encoding starts at 0 for L1, so invert the value.
    Locality = 3 - Locality;
  }

  // PrfOp packs the PRFM "prfop" immediate:
  //   bit 4    : 1 = prefetch for store (PST), 0 = for load (PLD/PLI)
  //   bit 3    : 1 = instruction prefetch (PLI), 0 = data (PLD/PST)
  //   bits 2-1 : target cache level, 0 = L1, 1 = L2, 2 = L3
  //   bit 0    : 1 = streaming/non-temporal (STRM), 0 = temporal (KEEP)
  unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;

  MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
  MI.eraseFromParent();
  return true;
}