//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64LegalizerInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
#include <initializer_list>

#define DEBUG_TYPE "aarch64-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
    : ST(&ST) {
  using namespace TargetOpcode;
  const LLT p0 = LLT::pointer(0, 64);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s128 = LLT::scalar(128);
  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s8 = LLT::fixed_vector(8, 8);
  const LLT v4s8 = LLT::fixed_vector(4, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s16 = LLT::fixed_vector(4, 16);
  const LLT v2s16 = LLT::fixed_vector(2, 16);
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

  std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                        v16s8, v8s16, v4s32,
                                                        v2s64, v2p0,
                                                        /* End 128bit types */
                                                        /* Begin 64bit types */
                                                        v8s8, v4s16, v2s32};
  std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
  SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
  SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);

  const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();

  // FIXME: support subtargets which have neon/fp-armv8 disabled.
  if (!ST.hasNEON() || !ST.hasFPARMv8()) {
    getLegacyLegalizerInfo().computeTables();
    return;
  }

  // Some instructions only support s16 if the subtarget has full 16-bit FP
  // support.
  const bool HasFP16 = ST.hasFullFP16();
  const LLT &MinFPScalar = HasFP16 ?
s16 : s32; 85 86 const bool HasCSSC = ST.hasCSSC(); 87 const bool HasRCPC3 = ST.hasRCPC3(); 88 89 getActionDefinitionsBuilder( 90 {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER}) 91 .legalFor({p0, s8, s16, s32, s64}) 92 .legalFor(PackedVectorAllTypeList) 93 .widenScalarToNextPow2(0) 94 .clampScalar(0, s8, s64) 95 .fewerElementsIf( 96 [=](const LegalityQuery &Query) { 97 return Query.Types[0].isVector() && 98 (Query.Types[0].getElementType() != s64 || 99 Query.Types[0].getNumElements() != 2); 100 }, 101 [=](const LegalityQuery &Query) { 102 LLT EltTy = Query.Types[0].getElementType(); 103 if (EltTy == s64) 104 return std::make_pair(0, LLT::fixed_vector(2, 64)); 105 return std::make_pair(0, EltTy); 106 }); 107 108 getActionDefinitionsBuilder(G_PHI) 109 .legalFor({p0, s16, s32, s64}) 110 .legalFor(PackedVectorAllTypeList) 111 .widenScalarToNextPow2(0) 112 .clampScalar(0, s16, s64) 113 // Maximum: sN * k = 128 114 .clampMaxNumElements(0, s8, 16) 115 .clampMaxNumElements(0, s16, 8) 116 .clampMaxNumElements(0, s32, 4) 117 .clampMaxNumElements(0, s64, 2) 118 .clampMaxNumElements(0, p0, 2); 119 120 getActionDefinitionsBuilder(G_BSWAP) 121 .legalFor({s32, s64, v4s32, v2s32, v2s64}) 122 .widenScalarToNextPow2(0) 123 .clampScalar(0, s32, s64); 124 125 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) 126 .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8}) 127 .widenScalarToNextPow2(0) 128 .clampScalar(0, s32, s64) 129 .clampMaxNumElements(0, s8, 16) 130 .clampMaxNumElements(0, s16, 8) 131 .clampNumElements(0, v2s32, v4s32) 132 .clampNumElements(0, v2s64, v2s64) 133 .minScalarOrEltIf( 134 [=](const LegalityQuery &Query) { 135 return Query.Types[0].getNumElements() <= 2; 136 }, 137 0, s32) 138 .minScalarOrEltIf( 139 [=](const LegalityQuery &Query) { 140 return Query.Types[0].getNumElements() <= 4; 141 }, 142 0, s16) 143 .minScalarOrEltIf( 144 [=](const LegalityQuery &Query) { 145 return Query.Types[0].getNumElements() <= 16; 146 }, 147 0, s8) 148 .moreElementsToNextPow2(0); 149 150 getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) 151 .customIf([=](const LegalityQuery &Query) { 152 const auto &SrcTy = Query.Types[0]; 153 const auto &AmtTy = Query.Types[1]; 154 return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && 155 AmtTy.getSizeInBits() == 32; 156 }) 157 .legalFor({ 158 {s32, s32}, 159 {s32, s64}, 160 {s64, s64}, 161 {v8s8, v8s8}, 162 {v16s8, v16s8}, 163 {v4s16, v4s16}, 164 {v8s16, v8s16}, 165 {v2s32, v2s32}, 166 {v4s32, v4s32}, 167 {v2s64, v2s64}, 168 }) 169 .widenScalarToNextPow2(0) 170 .clampScalar(1, s32, s64) 171 .clampScalar(0, s32, s64) 172 .clampNumElements(0, v8s8, v16s8) 173 .clampNumElements(0, v4s16, v8s16) 174 .clampNumElements(0, v2s32, v4s32) 175 .clampNumElements(0, v2s64, v2s64) 176 .moreElementsToNextPow2(0) 177 .minScalarSameAs(1, 0); 178 179 getActionDefinitionsBuilder(G_PTR_ADD) 180 .legalFor({{p0, s64}, {v2p0, v2s64}}) 181 .clampScalar(1, s64, s64); 182 183 getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}}); 184 185 getActionDefinitionsBuilder({G_SDIV, G_UDIV}) 186 .legalFor({s32, s64}) 187 .libcallFor({s128}) 188 .clampScalar(0, s32, s64) 189 .widenScalarToNextPow2(0) 190 .scalarize(0); 191 192 getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 193 .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32}) 194 .widenScalarOrEltToNextPow2(0) 195 .clampScalarOrElt(0, s32, s64) 196 .clampNumElements(0, v2s32, v4s32) 197 .clampNumElements(0, v2s64, v2s64) 198 .moreElementsToNextPow2(0); 199 200 201 
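  // AArch64 has no multiply-with-overflow instruction, so G_SMULO/G_UMULO are
  // always expanded by the generic lowering; sub-32-bit scalars are widened to
  // s32 first. For illustration (hand-written MIR, not taken from a test):
  //   %lo:_(s8), %ov:_(s1) = G_SMULO %a:_(s8), %b:_(s8)
  // is widened to operate on s32 and then lowered to a multiply plus an
  // overflow check.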
  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64)
      .lower();

  getActionDefinitionsBuilder({G_SMULH, G_UMULH})
      .legalFor({s64, v8s16, v16s8, v4s32})
      .lower();

  auto &MinMaxActions = getActionDefinitionsBuilder(
      {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
  if (HasCSSC)
    MinMaxActions
        .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
        // Clamping is made conditional on the CSSC extension because, without
        // legal types, we lower to CMP, which can fold one of the two sxtb's
        // we'd otherwise need if we detect a type smaller than 32-bit.
        .minScalar(0, s32);
  else
    MinMaxActions
        .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
  MinMaxActions
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      // FIXME: This shouldn't be needed as v2s64 types are going to
      // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
      .clampNumElements(0, v2s64, v2s64)
      .lower();

  getActionDefinitionsBuilder(
      {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
      .legalFor({{s32, s32}, {s64, s32}})
      .clampScalar(0, s32, s64)
      .clampScalar(1, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG,
                               G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM,
                               G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR,
                               G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC,
                               G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
      .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[0];
        return (Ty == v8s16 || Ty == v4s16) && HasFP16;
      })
      .libcallFor({s128})
      .minScalarOrElt(0, MinFPScalar)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder(G_FREM)
      .libcallFor({s32, s64})
      .minScalar(0, s32)
      .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_LRINT)
      // If we don't have full FP16 support, then scalarize the elements of
      // vectors containing fp16 types.
      .fewerElementsIf(
          [=, &ST](const LegalityQuery &Query) {
            const auto &Ty = Query.Types[0];
            return Ty.isVector() && Ty.getElementType() == s16 &&
                   !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
      // If we don't have full FP16 support, then widen s16 to s32 if we
      // encounter it.
      .widenScalarIf(
          [=, &ST](const LegalityQuery &Query) {
            return Query.Types[0] == s16 && !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
      .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});

  getActionDefinitionsBuilder(
      {G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10,
       G_FEXP, G_FEXP2, G_FEXP10})
      // We need a call for these, so we always need to scalarize.
      .scalarize(0)
      // Regardless of FP16 support, widen 16-bit elements to 32-bits.
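      // E.g. (a sketch of the usual flow, not a guaranteed lowering): an s16
      // G_FSIN is extended to s32, becomes a call to sinf, and the result is
      // converted back down to s16.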
285 .minScalar(0, s32) 286 .libcallFor({s32, s64}); 287 getActionDefinitionsBuilder(G_FPOWI) 288 .scalarize(0) 289 .minScalar(0, s32) 290 .libcallFor({{s32, s32}, {s64, s32}}); 291 292 getActionDefinitionsBuilder(G_INSERT) 293 .legalIf(all(typeInSet(0, {s32, s64, p0}), 294 typeInSet(1, {s8, s16, s32}), smallerThan(1, 0))) 295 .widenScalarToNextPow2(0) 296 .clampScalar(0, s32, s64) 297 .widenScalarToNextPow2(1) 298 .minScalar(1, s8) 299 .maxScalarIf(typeInSet(0, {s32}), 1, s16) 300 .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32); 301 302 getActionDefinitionsBuilder(G_EXTRACT) 303 .legalIf(all(typeInSet(0, {s16, s32, s64, p0}), 304 typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1))) 305 .widenScalarToNextPow2(1) 306 .clampScalar(1, s32, s128) 307 .widenScalarToNextPow2(0) 308 .minScalar(0, s16) 309 .maxScalarIf(typeInSet(1, {s32}), 0, s16) 310 .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32) 311 .maxScalarIf(typeInSet(1, {s128}), 0, s64); 312 313 314 for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) { 315 auto &Actions = getActionDefinitionsBuilder(Op); 316 317 if (Op == G_SEXTLOAD) 318 Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)); 319 320 // Atomics have zero extending behavior. 321 Actions 322 .legalForTypesWithMemDesc({{s32, p0, s8, 8}, 323 {s32, p0, s16, 8}, 324 {s32, p0, s32, 8}, 325 {s64, p0, s8, 2}, 326 {s64, p0, s16, 2}, 327 {s64, p0, s32, 4}, 328 {s64, p0, s64, 8}, 329 {p0, p0, s64, 8}, 330 {v2s32, p0, s64, 8}}) 331 .widenScalarToNextPow2(0) 332 .clampScalar(0, s32, s64) 333 // TODO: We could support sum-of-pow2's but the lowering code doesn't know 334 // how to do that yet. 335 .unsupportedIfMemSizeNotPow2() 336 // Lower anything left over into G_*EXT and G_LOAD 337 .lower(); 338 } 339 340 auto IsPtrVecPred = [=](const LegalityQuery &Query) { 341 const LLT &ValTy = Query.Types[0]; 342 if (!ValTy.isVector()) 343 return false; 344 const LLT EltTy = ValTy.getElementType(); 345 return EltTy.isPointer() && EltTy.getAddressSpace() == 0; 346 }; 347 348 getActionDefinitionsBuilder(G_LOAD) 349 .customIf([=](const LegalityQuery &Query) { 350 return HasRCPC3 && Query.Types[0] == s128 && 351 Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire; 352 }) 353 .customIf([=](const LegalityQuery &Query) { 354 return Query.Types[0] == s128 && 355 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; 356 }) 357 .legalForTypesWithMemDesc({{s8, p0, s8, 8}, 358 {s16, p0, s16, 8}, 359 {s32, p0, s32, 8}, 360 {s64, p0, s64, 8}, 361 {p0, p0, s64, 8}, 362 {s128, p0, s128, 8}, 363 {v8s8, p0, s64, 8}, 364 {v16s8, p0, s128, 8}, 365 {v4s16, p0, s64, 8}, 366 {v8s16, p0, s128, 8}, 367 {v2s32, p0, s64, 8}, 368 {v4s32, p0, s128, 8}, 369 {v2s64, p0, s128, 8}}) 370 // These extends are also legal 371 .legalForTypesWithMemDesc( 372 {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}}) 373 .widenScalarToNextPow2(0, /* MinSize = */ 8) 374 .lowerIfMemSizeNotByteSizePow2() 375 .clampScalar(0, s8, s64) 376 .narrowScalarIf( 377 [=](const LegalityQuery &Query) { 378 // Clamp extending load results to 32-bits. 
379 return Query.Types[0].isScalar() && 380 Query.Types[0] != Query.MMODescrs[0].MemoryTy && 381 Query.Types[0].getSizeInBits() > 32; 382 }, 383 changeTo(0, s32)) 384 .clampMaxNumElements(0, s8, 16) 385 .clampMaxNumElements(0, s16, 8) 386 .clampMaxNumElements(0, s32, 4) 387 .clampMaxNumElements(0, s64, 2) 388 .clampMaxNumElements(0, p0, 2) 389 .customIf(IsPtrVecPred) 390 .scalarizeIf(typeIs(0, v2s16), 0); 391 392 getActionDefinitionsBuilder(G_STORE) 393 .customIf([=](const LegalityQuery &Query) { 394 return HasRCPC3 && Query.Types[0] == s128 && 395 Query.MMODescrs[0].Ordering == AtomicOrdering::Release; 396 }) 397 .customIf([=](const LegalityQuery &Query) { 398 return Query.Types[0] == s128 && 399 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; 400 }) 401 .legalForTypesWithMemDesc( 402 {{s8, p0, s8, 8}, {s16, p0, s8, 8}, // truncstorei8 from s16 403 {s32, p0, s8, 8}, // truncstorei8 from s32 404 {s64, p0, s8, 8}, // truncstorei8 from s64 405 {s16, p0, s16, 8}, {s32, p0, s16, 8}, // truncstorei16 from s32 406 {s64, p0, s16, 8}, // truncstorei16 from s64 407 {s32, p0, s8, 8}, {s32, p0, s16, 8}, {s32, p0, s32, 8}, 408 {s64, p0, s64, 8}, {s64, p0, s32, 8}, // truncstorei32 from s64 409 {p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8}, 410 {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8}, 411 {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}}) 412 .clampScalar(0, s8, s64) 413 .lowerIf([=](const LegalityQuery &Query) { 414 return Query.Types[0].isScalar() && 415 Query.Types[0] != Query.MMODescrs[0].MemoryTy; 416 }) 417 // Maximum: sN * k = 128 418 .clampMaxNumElements(0, s8, 16) 419 .clampMaxNumElements(0, s16, 8) 420 .clampMaxNumElements(0, s32, 4) 421 .clampMaxNumElements(0, s64, 2) 422 .clampMaxNumElements(0, p0, 2) 423 .lowerIfMemSizeNotPow2() 424 .customIf(IsPtrVecPred) 425 .scalarizeIf(typeIs(0, v2s16), 0); 426 427 getActionDefinitionsBuilder(G_INDEXED_STORE) 428 // Idx 0 == Ptr, Idx 1 == Val 429 // TODO: we can implement legalizations but as of now these are 430 // generated in a very specific way. 
431 .legalForTypesWithMemDesc({ 432 {p0, s8, s8, 8}, 433 {p0, s16, s16, 8}, 434 {p0, s32, s8, 8}, 435 {p0, s32, s16, 8}, 436 {p0, s32, s32, 8}, 437 {p0, s64, s64, 8}, 438 {p0, p0, p0, 8}, 439 {p0, v8s8, v8s8, 8}, 440 {p0, v16s8, v16s8, 8}, 441 {p0, v4s16, v4s16, 8}, 442 {p0, v8s16, v8s16, 8}, 443 {p0, v2s32, v2s32, 8}, 444 {p0, v4s32, v4s32, 8}, 445 {p0, v2s64, v2s64, 8}, 446 {p0, v2p0, v2p0, 8}, 447 {p0, s128, s128, 8}, 448 }) 449 .unsupported(); 450 451 auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) { 452 LLT LdTy = Query.Types[0]; 453 LLT PtrTy = Query.Types[1]; 454 if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) && 455 !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128) 456 return false; 457 if (PtrTy != p0) 458 return false; 459 return true; 460 }; 461 getActionDefinitionsBuilder(G_INDEXED_LOAD) 462 .unsupportedIf( 463 atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)) 464 .legalIf(IndexedLoadBasicPred) 465 .unsupported(); 466 getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD}) 467 .unsupportedIf( 468 atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)) 469 .legalIf(all(typeInSet(0, {s16, s32, s64}), 470 LegalityPredicate([=](const LegalityQuery &Q) { 471 LLT LdTy = Q.Types[0]; 472 LLT PtrTy = Q.Types[1]; 473 LLT MemTy = Q.MMODescrs[0].MemoryTy; 474 if (PtrTy != p0) 475 return false; 476 if (LdTy == s16) 477 return MemTy == s8; 478 if (LdTy == s32) 479 return MemTy == s8 || MemTy == s16; 480 if (LdTy == s64) 481 return MemTy == s8 || MemTy == s16 || MemTy == s32; 482 return false; 483 }))) 484 .unsupported(); 485 486 // Constants 487 getActionDefinitionsBuilder(G_CONSTANT) 488 .legalFor({p0, s8, s16, s32, s64}) 489 .widenScalarToNextPow2(0) 490 .clampScalar(0, s8, s64); 491 getActionDefinitionsBuilder(G_FCONSTANT) 492 .legalIf([=](const LegalityQuery &Query) { 493 const auto &Ty = Query.Types[0]; 494 if (HasFP16 && Ty == s16) 495 return true; 496 return Ty == s32 || Ty == s64 || Ty == s128; 497 }) 498 .clampScalar(0, MinFPScalar, s128); 499 500 // FIXME: fix moreElementsToNextPow2 501 getActionDefinitionsBuilder(G_ICMP) 502 .legalFor({{s32, s32}, 503 {s32, s64}, 504 {s32, p0}, 505 {v4s32, v4s32}, 506 {v2s32, v2s32}, 507 {v2s64, v2s64}, 508 {v2s64, v2p0}, 509 {v4s16, v4s16}, 510 {v8s16, v8s16}, 511 {v8s8, v8s8}, 512 {v16s8, v16s8}}) 513 .widenScalarOrEltToNextPow2(1) 514 .clampScalar(1, s32, s64) 515 .clampScalar(0, s32, s32) 516 .minScalarEltSameAsIf( 517 [=](const LegalityQuery &Query) { 518 const LLT &Ty = Query.Types[0]; 519 const LLT &SrcTy = Query.Types[1]; 520 return Ty.isVector() && !SrcTy.getElementType().isPointer() && 521 Ty.getElementType() != SrcTy.getElementType(); 522 }, 523 0, 1) 524 .minScalarOrEltIf( 525 [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; }, 526 1, s32) 527 .minScalarOrEltIf( 528 [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0, 529 s64) 530 .moreElementsToNextPow2(0) 531 .clampNumElements(0, v8s8, v16s8) 532 .clampNumElements(0, v4s16, v8s16) 533 .clampNumElements(0, v2s32, v4s32) 534 .clampNumElements(0, v2s64, v2s64); 535 536 getActionDefinitionsBuilder(G_FCMP) 537 // If we don't have full FP16 support, then scalarize the elements of 538 // vectors containing fp16 types. 
539 .fewerElementsIf( 540 [=](const LegalityQuery &Query) { 541 const auto &Ty = Query.Types[0]; 542 return Ty.isVector() && Ty.getElementType() == s16 && !HasFP16; 543 }, 544 [=](const LegalityQuery &Query) { return std::make_pair(0, s16); }) 545 // If we don't have full FP16 support, then widen s16 to s32 if we 546 // encounter it. 547 .widenScalarIf( 548 [=](const LegalityQuery &Query) { 549 return Query.Types[0] == s16 && !HasFP16; 550 }, 551 [=](const LegalityQuery &Query) { return std::make_pair(0, s32); }) 552 .legalFor({{s16, s16}, 553 {s32, s32}, 554 {s32, s64}, 555 {v4s32, v4s32}, 556 {v2s32, v2s32}, 557 {v2s64, v2s64}, 558 {v4s16, v4s16}, 559 {v8s16, v8s16}}) 560 .widenScalarOrEltToNextPow2(1) 561 .clampScalar(1, s32, s64) 562 .clampScalar(0, s32, s32) 563 .minScalarEltSameAsIf( 564 [=](const LegalityQuery &Query) { 565 const LLT &Ty = Query.Types[0]; 566 const LLT &SrcTy = Query.Types[1]; 567 return Ty.isVector() && !SrcTy.getElementType().isPointer() && 568 Ty.getElementType() != SrcTy.getElementType(); 569 }, 570 0, 1) 571 .clampNumElements(0, v2s32, v4s32) 572 .clampMaxNumElements(1, s64, 2); 573 574 // Extensions 575 auto ExtLegalFunc = [=](const LegalityQuery &Query) { 576 unsigned DstSize = Query.Types[0].getSizeInBits(); 577 578 // Handle legal vectors using legalFor 579 if (Query.Types[0].isVector()) 580 return false; 581 582 if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize)) 583 return false; // Extending to a scalar s128 needs narrowing. 584 585 const LLT &SrcTy = Query.Types[1]; 586 587 // Make sure we fit in a register otherwise. Don't bother checking that 588 // the source type is below 128 bits. We shouldn't be allowing anything 589 // through which is wider than the destination in the first place. 590 unsigned SrcSize = SrcTy.getSizeInBits(); 591 if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) 592 return false; 593 594 return true; 595 }; 596 getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) 597 .legalIf(ExtLegalFunc) 598 .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}}) 599 .clampScalar(0, s64, s64) // Just for s128, others are handled above. 
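      // The clamps below keep the source operand to at most 8 x s8, 4 x s16 or
      // 2 x s32 (64 bits), matching the whole-vector extends listed above;
      // wider source vectors are split first.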
600 .moreElementsToNextPow2(1) 601 .clampMaxNumElements(1, s8, 8) 602 .clampMaxNumElements(1, s16, 4) 603 .clampMaxNumElements(1, s32, 2) 604 // Tries to convert a large EXTEND into two smaller EXTENDs 605 .lowerIf([=](const LegalityQuery &Query) { 606 return (Query.Types[0].getScalarSizeInBits() > 607 Query.Types[1].getScalarSizeInBits() * 2) && 608 Query.Types[0].isVector() && 609 (Query.Types[1].getScalarSizeInBits() == 8 || 610 Query.Types[1].getScalarSizeInBits() == 16); 611 }); 612 613 getActionDefinitionsBuilder(G_TRUNC) 614 .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}}) 615 .moreElementsToNextPow2(0) 616 .clampMaxNumElements(0, s8, 8) 617 .clampMaxNumElements(0, s16, 4) 618 .clampMaxNumElements(0, s32, 2) 619 .minScalarOrEltIf( 620 [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); }, 621 0, s8) 622 .lowerIf([=](const LegalityQuery &Query) { 623 LLT DstTy = Query.Types[0]; 624 LLT SrcTy = Query.Types[1]; 625 return DstTy.isVector() && SrcTy.getSizeInBits() > 128 && 626 DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits(); 627 }) 628 629 .alwaysLegal(); 630 631 getActionDefinitionsBuilder(G_SEXT_INREG) 632 .legalFor({s32, s64}) 633 .legalFor(PackedVectorAllTypeList) 634 .maxScalar(0, s64) 635 .clampNumElements(0, v8s8, v16s8) 636 .clampNumElements(0, v4s16, v8s16) 637 .clampNumElements(0, v2s32, v4s32) 638 .clampMaxNumElements(0, s64, 2) 639 .lower(); 640 641 // FP conversions 642 getActionDefinitionsBuilder(G_FPTRUNC) 643 .legalFor( 644 {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}) 645 .clampNumElements(0, v4s16, v4s16) 646 .clampNumElements(0, v2s32, v2s32) 647 .scalarize(0); 648 649 getActionDefinitionsBuilder(G_FPEXT) 650 .legalFor( 651 {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}) 652 .clampNumElements(0, v4s32, v4s32) 653 .clampNumElements(0, v2s64, v2s64) 654 .scalarize(0); 655 656 // Conversions 657 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 658 .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) 659 .legalIf([=](const LegalityQuery &Query) { 660 return HasFP16 && 661 (Query.Types[1] == s16 || Query.Types[1] == v4s16 || 662 Query.Types[1] == v8s16) && 663 (Query.Types[0] == s32 || Query.Types[0] == s64 || 664 Query.Types[0] == v4s16 || Query.Types[0] == v8s16); 665 }) 666 .widenScalarToNextPow2(0) 667 .clampScalar(0, s32, s64) 668 .widenScalarToNextPow2(1) 669 .clampScalarOrElt(1, MinFPScalar, s64) 670 .moreElementsToNextPow2(0) 671 .widenScalarIf( 672 [=](const LegalityQuery &Query) { 673 return Query.Types[0].getScalarSizeInBits() > 674 Query.Types[1].getScalarSizeInBits(); 675 }, 676 LegalizeMutations::changeElementSizeTo(1, 0)) 677 .widenScalarIf( 678 [=](const LegalityQuery &Query) { 679 return Query.Types[0].getScalarSizeInBits() < 680 Query.Types[1].getScalarSizeInBits(); 681 }, 682 LegalizeMutations::changeElementSizeTo(0, 1)) 683 .clampNumElements(0, v4s16, v8s16) 684 .clampNumElements(0, v2s32, v4s32) 685 .clampMaxNumElements(0, s64, 2); 686 687 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 688 .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) 689 .legalIf([=](const LegalityQuery &Query) { 690 return HasFP16 && 691 (Query.Types[0] == s16 || Query.Types[0] == v4s16 || 692 Query.Types[0] == v8s16) && 693 (Query.Types[1] == s32 || Query.Types[1] == s64 || 694 Query.Types[1] == v4s16 || Query.Types[1] == v8s16); 695 }) 696 .widenScalarToNextPow2(1) 697 .clampScalar(1, s32, s64) 698 .widenScalarToNextPow2(0) 699 .clampScalarOrElt(0, MinFPScalar, s64) 700 
.moreElementsToNextPow2(0) 701 .widenScalarIf( 702 [=](const LegalityQuery &Query) { 703 return Query.Types[0].getScalarSizeInBits() < 704 Query.Types[1].getScalarSizeInBits(); 705 }, 706 LegalizeMutations::changeElementSizeTo(0, 1)) 707 .widenScalarIf( 708 [=](const LegalityQuery &Query) { 709 return Query.Types[0].getScalarSizeInBits() > 710 Query.Types[1].getScalarSizeInBits(); 711 }, 712 LegalizeMutations::changeElementSizeTo(1, 0)) 713 .clampNumElements(0, v4s16, v8s16) 714 .clampNumElements(0, v2s32, v4s32) 715 .clampMaxNumElements(0, s64, 2); 716 717 // Control-flow 718 getActionDefinitionsBuilder(G_BRCOND) 719 .legalFor({s32}) 720 .clampScalar(0, s32, s32); 721 getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0}); 722 723 getActionDefinitionsBuilder(G_SELECT) 724 .legalFor({{s32, s32}, {s64, s32}, {p0, s32}}) 725 .widenScalarToNextPow2(0) 726 .clampScalar(0, s32, s64) 727 .clampScalar(1, s32, s32) 728 .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0) 729 .lowerIf(isVector(0)); 730 731 // Pointer-handling 732 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); 733 734 if (TM.getCodeModel() == CodeModel::Small) 735 getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom(); 736 else 737 getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); 738 739 getActionDefinitionsBuilder(G_PTRTOINT) 740 .legalFor({{s64, p0}, {v2s64, v2p0}}) 741 .widenScalarToNextPow2(0, 64) 742 .clampScalar(0, s64, s64); 743 744 getActionDefinitionsBuilder(G_INTTOPTR) 745 .unsupportedIf([&](const LegalityQuery &Query) { 746 return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits(); 747 }) 748 .legalFor({{p0, s64}, {v2p0, v2s64}}); 749 750 // Casts for 32 and 64-bit width type are just copies. 751 // Same for 128-bit width type, except they are on the FPR bank. 752 getActionDefinitionsBuilder(G_BITCAST) 753 // FIXME: This is wrong since G_BITCAST is not allowed to change the 754 // number of bits but it's what the previous code described and fixing 755 // it breaks tests. 756 .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, 757 v8s16, v4s16, v2s16, v4s32, v2s32, v2s64, 758 v2p0}); 759 760 getActionDefinitionsBuilder(G_VASTART).legalFor({p0}); 761 762 // va_list must be a pointer, but most sized types are pretty easy to handle 763 // as the destination. 
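  // G_VAARG is custom-lowered in legalizeVaArg() below: it loads the current
  // va_list pointer, bumps it up to the slot's alignment when that exceeds the
  // pointer alignment, performs the actual load, and stores the advanced
  // pointer back.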
764 getActionDefinitionsBuilder(G_VAARG) 765 .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0}) 766 .clampScalar(0, s8, s64) 767 .widenScalarToNextPow2(0, /*Min*/ 8); 768 769 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 770 .lowerIf( 771 all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0))); 772 773 LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) { 774 return ST.outlineAtomics() && !ST.hasLSE(); 775 }; 776 777 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 778 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), 779 predNot(UseOutlineAtomics))) 780 .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics))) 781 .customIf([UseOutlineAtomics](const LegalityQuery &Query) { 782 return Query.Types[0].getSizeInBits() == 128 && 783 !UseOutlineAtomics(Query); 784 }) 785 .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0), 786 UseOutlineAtomics)) 787 .clampScalar(0, s32, s64); 788 789 getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, 790 G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR, 791 G_ATOMICRMW_XOR}) 792 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), 793 predNot(UseOutlineAtomics))) 794 .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0), 795 UseOutlineAtomics)) 796 .clampScalar(0, s32, s64); 797 798 // Do not outline these atomics operations, as per comment in 799 // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR(). 800 getActionDefinitionsBuilder( 801 {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) 802 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))) 803 .clampScalar(0, s32, s64); 804 805 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0}); 806 807 // Merge/Unmerge 808 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 809 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 810 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 811 getActionDefinitionsBuilder(Op) 812 .widenScalarToNextPow2(LitTyIdx, 8) 813 .widenScalarToNextPow2(BigTyIdx, 32) 814 .clampScalar(LitTyIdx, s8, s64) 815 .clampScalar(BigTyIdx, s32, s128) 816 .legalIf([=](const LegalityQuery &Q) { 817 switch (Q.Types[BigTyIdx].getSizeInBits()) { 818 case 32: 819 case 64: 820 case 128: 821 break; 822 default: 823 return false; 824 } 825 switch (Q.Types[LitTyIdx].getSizeInBits()) { 826 case 8: 827 case 16: 828 case 32: 829 case 64: 830 return true; 831 default: 832 return false; 833 } 834 }); 835 } 836 837 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 838 .unsupportedIf([=](const LegalityQuery &Query) { 839 const LLT &EltTy = Query.Types[1].getElementType(); 840 return Query.Types[0] != EltTy; 841 }) 842 .minScalar(2, s64) 843 .customIf([=](const LegalityQuery &Query) { 844 const LLT &VecTy = Query.Types[1]; 845 return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 || 846 VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 || 847 VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0; 848 }) 849 .minScalarOrEltIf( 850 [=](const LegalityQuery &Query) { 851 // We want to promote to <M x s1> to <M x s64> if that wouldn't 852 // cause the total vec size to be > 128b. 
853 return Query.Types[1].getNumElements() <= 2; 854 }, 855 0, s64) 856 .minScalarOrEltIf( 857 [=](const LegalityQuery &Query) { 858 return Query.Types[1].getNumElements() <= 4; 859 }, 860 0, s32) 861 .minScalarOrEltIf( 862 [=](const LegalityQuery &Query) { 863 return Query.Types[1].getNumElements() <= 8; 864 }, 865 0, s16) 866 .minScalarOrEltIf( 867 [=](const LegalityQuery &Query) { 868 return Query.Types[1].getNumElements() <= 16; 869 }, 870 0, s8) 871 .minScalarOrElt(0, s8) // Worst case, we need at least s8. 872 .moreElementsToNextPow2(1) 873 .clampMaxNumElements(1, s64, 2) 874 .clampMaxNumElements(1, s32, 4) 875 .clampMaxNumElements(1, s16, 8) 876 .clampMaxNumElements(1, p0, 2); 877 878 getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) 879 .legalIf(typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64})) 880 .moreElementsToNextPow2(0) 881 .widenVectorEltsToVectorMinSize(0, 64); 882 883 getActionDefinitionsBuilder(G_BUILD_VECTOR) 884 .legalFor({{v8s8, s8}, 885 {v16s8, s8}, 886 {v4s16, s16}, 887 {v8s16, s16}, 888 {v2s32, s32}, 889 {v4s32, s32}, 890 {v2p0, p0}, 891 {v2s64, s64}}) 892 .clampNumElements(0, v4s32, v4s32) 893 .clampNumElements(0, v2s64, v2s64) 894 .minScalarOrElt(0, s8) 895 .widenVectorEltsToVectorMinSize(0, 64) 896 .minScalarSameAs(1, 0); 897 898 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower(); 899 900 getActionDefinitionsBuilder(G_CTLZ) 901 .legalForCartesianProduct( 902 {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) 903 .scalarize(1) 904 .widenScalarToNextPow2(1, /*Min=*/32) 905 .clampScalar(1, s32, s64) 906 .scalarSameSizeAs(0, 1); 907 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower(); 908 909 // TODO: Custom lowering for v2s32, v4s32, v2s64. 910 getActionDefinitionsBuilder(G_BITREVERSE) 911 .legalFor({s32, s64, v8s8, v16s8}) 912 .widenScalarToNextPow2(0, /*Min = */ 32) 913 .clampScalar(0, s32, s64); 914 915 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower(); 916 917 getActionDefinitionsBuilder(G_CTTZ) 918 .lowerIf(isVector(0)) 919 .widenScalarToNextPow2(1, /*Min=*/32) 920 .clampScalar(1, s32, s64) 921 .scalarSameSizeAs(0, 1) 922 .legalIf([=](const LegalityQuery &Query) { 923 return (HasCSSC && typeInSet(0, {s32, s64})(Query)); 924 }) 925 .customIf([=](const LegalityQuery &Query) { 926 return (!HasCSSC && typeInSet(0, {s32, s64})(Query)); 927 }); 928 929 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 930 .legalIf([=](const LegalityQuery &Query) { 931 const LLT &DstTy = Query.Types[0]; 932 const LLT &SrcTy = Query.Types[1]; 933 // For now just support the TBL2 variant which needs the source vectors 934 // to be the same size as the dest. 
935 if (DstTy != SrcTy) 936 return false; 937 return llvm::is_contained( 938 {v2s64, v2p0, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy); 939 }) 940 // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we 941 // just want those lowered into G_BUILD_VECTOR 942 .lowerIf([=](const LegalityQuery &Query) { 943 return !Query.Types[1].isVector(); 944 }) 945 .moreElementsIf( 946 [](const LegalityQuery &Query) { 947 return Query.Types[0].isVector() && Query.Types[1].isVector() && 948 Query.Types[0].getNumElements() > 949 Query.Types[1].getNumElements(); 950 }, 951 changeTo(1, 0)) 952 .moreElementsToNextPow2(0) 953 .clampNumElements(0, v4s32, v4s32) 954 .clampNumElements(0, v2s64, v2s64) 955 .moreElementsIf( 956 [](const LegalityQuery &Query) { 957 return Query.Types[0].isVector() && Query.Types[1].isVector() && 958 Query.Types[0].getNumElements() < 959 Query.Types[1].getNumElements(); 960 }, 961 changeTo(0, 1)); 962 963 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 964 .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}}); 965 966 getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0}); 967 968 getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}}); 969 970 getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom(); 971 972 getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower(); 973 974 if (ST.hasMOPS()) { 975 // G_BZERO is not supported. Currently it is only emitted by 976 // PreLegalizerCombiner for G_MEMSET with zero constant. 977 getActionDefinitionsBuilder(G_BZERO).unsupported(); 978 979 getActionDefinitionsBuilder(G_MEMSET) 980 .legalForCartesianProduct({p0}, {s64}, {s64}) 981 .customForCartesianProduct({p0}, {s8}, {s64}) 982 .immIdx(0); // Inform verifier imm idx 0 is handled. 983 984 getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE}) 985 .legalForCartesianProduct({p0}, {p0}, {s64}) 986 .immIdx(0); // Inform verifier imm idx 0 is handled. 987 988 // G_MEMCPY_INLINE does not have a tailcall immediate 989 getActionDefinitionsBuilder(G_MEMCPY_INLINE) 990 .legalForCartesianProduct({p0}, {p0}, {s64}); 991 992 } else { 993 getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET}) 994 .libcall(); 995 } 996 997 // FIXME: Legal vector types are only legal with NEON. 998 auto &ABSActions = getActionDefinitionsBuilder(G_ABS); 999 if (HasCSSC) 1000 ABSActions 1001 .legalFor({s32, s64}); 1002 ABSActions 1003 .legalFor(PackedVectorAllTypeList) 1004 .lowerIf(isScalar(0)); 1005 1006 // For fadd reductions we have pairwise operations available. We treat the 1007 // usual legal types as legal and handle the lowering to pairwise instructions 1008 // later. 1009 getActionDefinitionsBuilder(G_VECREDUCE_FADD) 1010 .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}}) 1011 .legalIf([=](const LegalityQuery &Query) { 1012 const auto &Ty = Query.Types[1]; 1013 return (Ty == v4s16 || Ty == v8s16) && HasFP16; 1014 }) 1015 .minScalarOrElt(0, MinFPScalar) 1016 .clampMaxNumElements(1, s64, 2) 1017 .clampMaxNumElements(1, s32, 4) 1018 .clampMaxNumElements(1, s16, 8) 1019 .lower(); 1020 1021 // For fmul reductions we need to split up into individual operations. We 1022 // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of 1023 // smaller types, followed by scalarizing what remains. 
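  // Roughly (a sketch, the exact sequence depends on the legalizer helper): a
  // v4s32 fmul reduction is first split into two v2s32 halves, each half is
  // scalarized, and the partial results are combined with scalar G_FMULs.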
1024 getActionDefinitionsBuilder(G_VECREDUCE_FMUL) 1025 .minScalarOrElt(0, MinFPScalar) 1026 .clampMaxNumElements(1, s64, 2) 1027 .clampMaxNumElements(1, s32, 4) 1028 .clampMaxNumElements(1, s16, 8) 1029 .clampMaxNumElements(1, s32, 2) 1030 .clampMaxNumElements(1, s16, 4) 1031 .scalarize(1) 1032 .lower(); 1033 1034 getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL}) 1035 .scalarize(2) 1036 .lower(); 1037 1038 getActionDefinitionsBuilder(G_VECREDUCE_ADD) 1039 .legalFor({{s8, v16s8}, 1040 {s8, v8s8}, 1041 {s16, v8s16}, 1042 {s16, v4s16}, 1043 {s32, v4s32}, 1044 {s32, v2s32}, 1045 {s64, v2s64}}) 1046 .clampMaxNumElements(1, s64, 2) 1047 .clampMaxNumElements(1, s32, 4) 1048 .clampMaxNumElements(1, s16, 8) 1049 .clampMaxNumElements(1, s8, 16) 1050 .lower(); 1051 1052 getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX, 1053 G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM}) 1054 .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}}) 1055 .legalIf([=](const LegalityQuery &Query) { 1056 const auto &Ty = Query.Types[1]; 1057 return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16; 1058 }) 1059 .minScalarOrElt(0, MinFPScalar) 1060 .clampMaxNumElements(1, s64, 2) 1061 .clampMaxNumElements(1, s32, 4) 1062 .clampMaxNumElements(1, s16, 8) 1063 .lower(); 1064 1065 getActionDefinitionsBuilder(G_VECREDUCE_MUL) 1066 .clampMaxNumElements(1, s32, 2) 1067 .clampMaxNumElements(1, s16, 4) 1068 .clampMaxNumElements(1, s8, 8) 1069 .scalarize(1) 1070 .lower(); 1071 1072 getActionDefinitionsBuilder( 1073 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX}) 1074 .legalFor({{s8, v8s8}, 1075 {s8, v16s8}, 1076 {s16, v4s16}, 1077 {s16, v8s16}, 1078 {s32, v2s32}, 1079 {s32, v4s32}}) 1080 .clampMaxNumElements(1, s64, 2) 1081 .clampMaxNumElements(1, s32, 4) 1082 .clampMaxNumElements(1, s16, 8) 1083 .clampMaxNumElements(1, s8, 16) 1084 .scalarize(1) 1085 .lower(); 1086 1087 getActionDefinitionsBuilder( 1088 {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR}) 1089 // Try to break down into smaller vectors as long as they're at least 64 1090 // bits. This lets us use vector operations for some parts of the 1091 // reduction. 1092 .fewerElementsIf( 1093 [=](const LegalityQuery &Q) { 1094 LLT SrcTy = Q.Types[1]; 1095 if (SrcTy.isScalar()) 1096 return false; 1097 if (!isPowerOf2_32(SrcTy.getNumElements())) 1098 return false; 1099 // We can usually perform 64b vector operations. 
1100 return SrcTy.getSizeInBits() > 64; 1101 }, 1102 [=](const LegalityQuery &Q) { 1103 LLT SrcTy = Q.Types[1]; 1104 return std::make_pair(1, SrcTy.divide(2)); 1105 }) 1106 .scalarize(1) 1107 .lower(); 1108 1109 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 1110 .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); }); 1111 1112 getActionDefinitionsBuilder({G_FSHL, G_FSHR}) 1113 .customFor({{s32, s32}, {s32, s64}, {s64, s64}}) 1114 .lower(); 1115 1116 getActionDefinitionsBuilder(G_ROTR) 1117 .legalFor({{s32, s64}, {s64, s64}}) 1118 .customIf([=](const LegalityQuery &Q) { 1119 return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64; 1120 }) 1121 .lower(); 1122 getActionDefinitionsBuilder(G_ROTL).lower(); 1123 1124 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1125 .customFor({{s32, s32}, {s64, s64}}); 1126 1127 auto always = [=](const LegalityQuery &Q) { return true; }; 1128 auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP); 1129 if (HasCSSC) 1130 CTPOPActions 1131 .legalFor({{s32, s32}, 1132 {s64, s64}, 1133 {v8s8, v8s8}, 1134 {v16s8, v16s8}}) 1135 .customFor({{s128, s128}, 1136 {v2s64, v2s64}, 1137 {v2s32, v2s32}, 1138 {v4s32, v4s32}, 1139 {v4s16, v4s16}, 1140 {v8s16, v8s16}}); 1141 else 1142 CTPOPActions 1143 .legalFor({{v8s8, v8s8}, 1144 {v16s8, v16s8}}) 1145 .customFor({{s32, s32}, 1146 {s64, s64}, 1147 {s128, s128}, 1148 {v2s64, v2s64}, 1149 {v2s32, v2s32}, 1150 {v4s32, v4s32}, 1151 {v4s16, v4s16}, 1152 {v8s16, v8s16}}); 1153 CTPOPActions 1154 .clampScalar(0, s32, s128) 1155 .widenScalarToNextPow2(0) 1156 .minScalarEltSameAsIf(always, 1, 0) 1157 .maxScalarEltSameAsIf(always, 1, 0); 1158 1159 // TODO: Vector types. 1160 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0)); 1161 1162 // TODO: Libcall support for s128. 1163 // TODO: s16 should be legal with full FP16 support. 1164 getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) 1165 .legalFor({{s64, s32}, {s64, s64}}); 1166 1167 // TODO: Custom legalization for vector types. 1168 // TODO: Custom legalization for mismatched types. 1169 // TODO: s16 support. 1170 getActionDefinitionsBuilder(G_FCOPYSIGN).customFor({{s32, s32}, {s64, s64}}); 1171 1172 getActionDefinitionsBuilder(G_FMAD).lower(); 1173 1174 // Access to floating-point environment. 1175 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV, 1176 G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE}) 1177 .libcall(); 1178 1179 getActionDefinitionsBuilder(G_IS_FPCLASS).lower(); 1180 1181 getActionDefinitionsBuilder(G_PREFETCH).custom(); 1182 1183 getLegacyLegalizerInfo().computeTables(); 1184 verify(*ST.getInstrInfo()); 1185 } 1186 1187 bool AArch64LegalizerInfo::legalizeCustom( 1188 LegalizerHelper &Helper, MachineInstr &MI, 1189 LostDebugLocObserver &LocObserver) const { 1190 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1191 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 1192 GISelChangeObserver &Observer = Helper.Observer; 1193 switch (MI.getOpcode()) { 1194 default: 1195 // No idea what to do. 
1196 return false; 1197 case TargetOpcode::G_VAARG: 1198 return legalizeVaArg(MI, MRI, MIRBuilder); 1199 case TargetOpcode::G_LOAD: 1200 case TargetOpcode::G_STORE: 1201 return legalizeLoadStore(MI, MRI, MIRBuilder, Observer); 1202 case TargetOpcode::G_SHL: 1203 case TargetOpcode::G_ASHR: 1204 case TargetOpcode::G_LSHR: 1205 return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer); 1206 case TargetOpcode::G_GLOBAL_VALUE: 1207 return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer); 1208 case TargetOpcode::G_SBFX: 1209 case TargetOpcode::G_UBFX: 1210 return legalizeBitfieldExtract(MI, MRI, Helper); 1211 case TargetOpcode::G_FSHL: 1212 case TargetOpcode::G_FSHR: 1213 return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper); 1214 case TargetOpcode::G_ROTR: 1215 return legalizeRotate(MI, MRI, Helper); 1216 case TargetOpcode::G_CTPOP: 1217 return legalizeCTPOP(MI, MRI, Helper); 1218 case TargetOpcode::G_ATOMIC_CMPXCHG: 1219 return legalizeAtomicCmpxchg128(MI, MRI, Helper); 1220 case TargetOpcode::G_CTTZ: 1221 return legalizeCTTZ(MI, Helper); 1222 case TargetOpcode::G_BZERO: 1223 case TargetOpcode::G_MEMCPY: 1224 case TargetOpcode::G_MEMMOVE: 1225 case TargetOpcode::G_MEMSET: 1226 return legalizeMemOps(MI, Helper); 1227 case TargetOpcode::G_FCOPYSIGN: 1228 return legalizeFCopySign(MI, Helper); 1229 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1230 return legalizeExtractVectorElt(MI, MRI, Helper); 1231 case TargetOpcode::G_DYN_STACKALLOC: 1232 return legalizeDynStackAlloc(MI, Helper); 1233 case TargetOpcode::G_PREFETCH: 1234 return legalizePrefetch(MI, Helper); 1235 } 1236 1237 llvm_unreachable("expected switch to return"); 1238 } 1239 1240 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI, 1241 MachineRegisterInfo &MRI, 1242 MachineIRBuilder &MIRBuilder, 1243 GISelChangeObserver &Observer, 1244 LegalizerHelper &Helper) const { 1245 assert(MI.getOpcode() == TargetOpcode::G_FSHL || 1246 MI.getOpcode() == TargetOpcode::G_FSHR); 1247 1248 // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic 1249 // lowering 1250 Register ShiftNo = MI.getOperand(3).getReg(); 1251 LLT ShiftTy = MRI.getType(ShiftNo); 1252 auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI); 1253 1254 // Adjust shift amount according to Opcode (FSHL/FSHR) 1255 // Convert FSHL to FSHR 1256 LLT OperationTy = MRI.getType(MI.getOperand(0).getReg()); 1257 APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false); 1258 1259 // Lower non-constant shifts and leave zero shifts to the optimizer. 1260 if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0) 1261 return (Helper.lowerFunnelShiftAsShifts(MI) == 1262 LegalizerHelper::LegalizeResult::Legalized); 1263 1264 APInt Amount = VRegAndVal->Value.urem(BitWidth); 1265 1266 Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? 
BitWidth - Amount : Amount; 1267 1268 // If the instruction is G_FSHR, has a 64-bit G_CONSTANT for shift amount 1269 // in the range of 0 <-> BitWidth, it is legal 1270 if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR && 1271 VRegAndVal->Value.ult(BitWidth)) 1272 return true; 1273 1274 // Cast the ShiftNumber to a 64-bit type 1275 auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64)); 1276 1277 if (MI.getOpcode() == TargetOpcode::G_FSHR) { 1278 Observer.changingInstr(MI); 1279 MI.getOperand(3).setReg(Cast64.getReg(0)); 1280 Observer.changedInstr(MI); 1281 } 1282 // If Opcode is FSHL, remove the FSHL instruction and create a FSHR 1283 // instruction 1284 else if (MI.getOpcode() == TargetOpcode::G_FSHL) { 1285 MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()}, 1286 {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(), 1287 Cast64.getReg(0)}); 1288 MI.eraseFromParent(); 1289 } 1290 return true; 1291 } 1292 1293 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI, 1294 MachineRegisterInfo &MRI, 1295 LegalizerHelper &Helper) const { 1296 // To allow for imported patterns to match, we ensure that the rotate amount 1297 // is 64b with an extension. 1298 Register AmtReg = MI.getOperand(2).getReg(); 1299 LLT AmtTy = MRI.getType(AmtReg); 1300 (void)AmtTy; 1301 assert(AmtTy.isScalar() && "Expected a scalar rotate"); 1302 assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal"); 1303 auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg); 1304 Helper.Observer.changingInstr(MI); 1305 MI.getOperand(2).setReg(NewAmt.getReg(0)); 1306 Helper.Observer.changedInstr(MI); 1307 return true; 1308 } 1309 1310 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( 1311 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, 1312 GISelChangeObserver &Observer) const { 1313 assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); 1314 // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP + 1315 // G_ADD_LOW instructions. 1316 // By splitting this here, we can optimize accesses in the small code model by 1317 // folding in the G_ADD_LOW into the load/store offset. 1318 auto &GlobalOp = MI.getOperand(1); 1319 const auto* GV = GlobalOp.getGlobal(); 1320 if (GV->isThreadLocal()) 1321 return true; // Don't want to modify TLS vars. 1322 1323 auto &TM = ST->getTargetLowering()->getTargetMachine(); 1324 unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM); 1325 1326 if (OpFlags & AArch64II::MO_GOT) 1327 return true; 1328 1329 auto Offset = GlobalOp.getOffset(); 1330 Register DstReg = MI.getOperand(0).getReg(); 1331 auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {}) 1332 .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE); 1333 // Set the regclass on the dest reg too. 1334 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); 1335 1336 // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so 1337 // by creating a MOVK that sets bits 48-63 of the register to (global address 1338 // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to 1339 // prevent an incorrect tag being generated during relocation when the 1340 // global appears before the code section. Without the offset, a global at 1341 // `0x0f00'0000'0000'1000` (i.e. 
at `0x1000` with tag `0xf`) that's referenced 1342 // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 = 1343 // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe` 1344 // instead of `0xf`. 1345 // This assumes that we're in the small code model so we can assume a binary 1346 // size of <= 4GB, which makes the untagged PC relative offset positive. The 1347 // binary must also be loaded into address range [0, 2^48). Both of these 1348 // properties need to be ensured at runtime when using tagged addresses. 1349 if (OpFlags & AArch64II::MO_TAGGED) { 1350 assert(!Offset && 1351 "Should not have folded in an offset for a tagged global!"); 1352 ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP}) 1353 .addGlobalAddress(GV, 0x100000000, 1354 AArch64II::MO_PREL | AArch64II::MO_G3) 1355 .addImm(48); 1356 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); 1357 } 1358 1359 MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP}) 1360 .addGlobalAddress(GV, Offset, 1361 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 1362 MI.eraseFromParent(); 1363 return true; 1364 } 1365 1366 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 1367 MachineInstr &MI) const { 1368 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); 1369 switch (IntrinsicID) { 1370 case Intrinsic::vacopy: { 1371 unsigned PtrSize = ST->isTargetILP32() ? 4 : 8; 1372 unsigned VaListSize = 1373 (ST->isTargetDarwin() || ST->isTargetWindows()) 1374 ? PtrSize 1375 : ST->isTargetILP32() ? 20 : 32; 1376 1377 MachineFunction &MF = *MI.getMF(); 1378 auto Val = MF.getRegInfo().createGenericVirtualRegister( 1379 LLT::scalar(VaListSize * 8)); 1380 MachineIRBuilder MIB(MI); 1381 MIB.buildLoad(Val, MI.getOperand(2), 1382 *MF.getMachineMemOperand(MachinePointerInfo(), 1383 MachineMemOperand::MOLoad, 1384 VaListSize, Align(PtrSize))); 1385 MIB.buildStore(Val, MI.getOperand(1), 1386 *MF.getMachineMemOperand(MachinePointerInfo(), 1387 MachineMemOperand::MOStore, 1388 VaListSize, Align(PtrSize))); 1389 MI.eraseFromParent(); 1390 return true; 1391 } 1392 case Intrinsic::get_dynamic_area_offset: { 1393 MachineIRBuilder &MIB = Helper.MIRBuilder; 1394 MIB.buildConstant(MI.getOperand(0).getReg(), 0); 1395 MI.eraseFromParent(); 1396 return true; 1397 } 1398 case Intrinsic::aarch64_mops_memset_tag: { 1399 assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 1400 // Anyext the value being set to 64 bit (only the bottom 8 bits are read by 1401 // the instruction). 
1402 MachineIRBuilder MIB(MI); 1403 auto &Value = MI.getOperand(3); 1404 Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0); 1405 Value.setReg(ExtValueReg); 1406 return true; 1407 } 1408 case Intrinsic::aarch64_prefetch: { 1409 MachineIRBuilder MIB(MI); 1410 auto &AddrVal = MI.getOperand(1); 1411 1412 int64_t IsWrite = MI.getOperand(2).getImm(); 1413 int64_t Target = MI.getOperand(3).getImm(); 1414 int64_t IsStream = MI.getOperand(4).getImm(); 1415 int64_t IsData = MI.getOperand(5).getImm(); 1416 1417 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 1418 (!IsData << 3) | // IsDataCache bit 1419 (Target << 1) | // Cache level bits 1420 (unsigned)IsStream; // Stream bit 1421 1422 MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal); 1423 MI.eraseFromParent(); 1424 return true; 1425 } 1426 case Intrinsic::aarch64_neon_uaddv: 1427 case Intrinsic::aarch64_neon_saddv: 1428 case Intrinsic::aarch64_neon_umaxv: 1429 case Intrinsic::aarch64_neon_smaxv: 1430 case Intrinsic::aarch64_neon_uminv: 1431 case Intrinsic::aarch64_neon_sminv: { 1432 MachineIRBuilder MIB(MI); 1433 MachineRegisterInfo &MRI = *MIB.getMRI(); 1434 bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv || 1435 IntrinsicID == Intrinsic::aarch64_neon_smaxv || 1436 IntrinsicID == Intrinsic::aarch64_neon_sminv; 1437 1438 auto OldDst = MI.getOperand(0).getReg(); 1439 auto OldDstTy = MRI.getType(OldDst); 1440 LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType(); 1441 if (OldDstTy == NewDstTy) 1442 return true; 1443 1444 auto NewDst = MRI.createGenericVirtualRegister(NewDstTy); 1445 1446 Helper.Observer.changingInstr(MI); 1447 MI.getOperand(0).setReg(NewDst); 1448 Helper.Observer.changedInstr(MI); 1449 1450 MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt()); 1451 MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT, 1452 OldDst, NewDst); 1453 1454 return true; 1455 } 1456 case Intrinsic::aarch64_neon_uaddlp: 1457 case Intrinsic::aarch64_neon_saddlp: { 1458 MachineIRBuilder MIB(MI); 1459 1460 unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp 1461 ? AArch64::G_UADDLP 1462 : AArch64::G_SADDLP; 1463 MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)}); 1464 MI.eraseFromParent(); 1465 1466 return true; 1467 } 1468 case Intrinsic::aarch64_neon_uaddlv: 1469 case Intrinsic::aarch64_neon_saddlv: { 1470 MachineIRBuilder MIB(MI); 1471 MachineRegisterInfo &MRI = *MIB.getMRI(); 1472 1473 unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv 1474 ? 
AArch64::G_UADDLV 1475 : AArch64::G_SADDLV; 1476 Register DstReg = MI.getOperand(0).getReg(); 1477 Register SrcReg = MI.getOperand(2).getReg(); 1478 LLT DstTy = MRI.getType(DstReg); 1479 1480 LLT MidTy, ExtTy; 1481 if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) { 1482 MidTy = LLT::fixed_vector(4, 32); 1483 ExtTy = LLT::scalar(32); 1484 } else { 1485 MidTy = LLT::fixed_vector(2, 64); 1486 ExtTy = LLT::scalar(64); 1487 } 1488 1489 Register MidReg = 1490 MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg(); 1491 Register ZeroReg = 1492 MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg(); 1493 Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy}, 1494 {MidReg, ZeroReg}) 1495 .getReg(0); 1496 1497 if (DstTy.getScalarSizeInBits() < 32) 1498 MIB.buildTrunc(DstReg, ExtReg); 1499 else 1500 MIB.buildCopy(DstReg, ExtReg); 1501 1502 MI.eraseFromParent(); 1503 1504 return true; 1505 } 1506 case Intrinsic::aarch64_neon_smax: 1507 case Intrinsic::aarch64_neon_smin: 1508 case Intrinsic::aarch64_neon_umax: 1509 case Intrinsic::aarch64_neon_umin: 1510 case Intrinsic::aarch64_neon_fmax: 1511 case Intrinsic::aarch64_neon_fmin: 1512 case Intrinsic::aarch64_neon_fmaxnm: 1513 case Intrinsic::aarch64_neon_fminnm: { 1514 MachineIRBuilder MIB(MI); 1515 if (IntrinsicID == Intrinsic::aarch64_neon_smax) 1516 MIB.buildSMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3)); 1517 else if (IntrinsicID == Intrinsic::aarch64_neon_smin) 1518 MIB.buildSMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3)); 1519 else if (IntrinsicID == Intrinsic::aarch64_neon_umax) 1520 MIB.buildUMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3)); 1521 else if (IntrinsicID == Intrinsic::aarch64_neon_umin) 1522 MIB.buildUMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3)); 1523 else if (IntrinsicID == Intrinsic::aarch64_neon_fmax) 1524 MIB.buildInstr(TargetOpcode::G_FMAXIMUM, {MI.getOperand(0)}, 1525 {MI.getOperand(2), MI.getOperand(3)}); 1526 else if (IntrinsicID == Intrinsic::aarch64_neon_fmin) 1527 MIB.buildInstr(TargetOpcode::G_FMINIMUM, {MI.getOperand(0)}, 1528 {MI.getOperand(2), MI.getOperand(3)}); 1529 else if (IntrinsicID == Intrinsic::aarch64_neon_fmaxnm) 1530 MIB.buildInstr(TargetOpcode::G_FMAXNUM, {MI.getOperand(0)}, 1531 {MI.getOperand(2), MI.getOperand(3)}); 1532 else if (IntrinsicID == Intrinsic::aarch64_neon_fminnm) 1533 MIB.buildInstr(TargetOpcode::G_FMINNUM, {MI.getOperand(0)}, 1534 {MI.getOperand(2), MI.getOperand(3)}); 1535 MI.eraseFromParent(); 1536 return true; 1537 } 1538 case Intrinsic::experimental_vector_reverse: 1539 // TODO: Add support for vector_reverse 1540 return false; 1541 } 1542 1543 return true; 1544 } 1545 1546 bool AArch64LegalizerInfo::legalizeShlAshrLshr( 1547 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, 1548 GISelChangeObserver &Observer) const { 1549 assert(MI.getOpcode() == TargetOpcode::G_ASHR || 1550 MI.getOpcode() == TargetOpcode::G_LSHR || 1551 MI.getOpcode() == TargetOpcode::G_SHL); 1552 // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the 1553 // imported patterns can select it later. Either way, it will be legal. 1554 Register AmtReg = MI.getOperand(2).getReg(); 1555 auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI); 1556 if (!VRegAndVal) 1557 return true; 1558 // Check the shift amount is in range for an immediate form. 
1559 int64_t Amount = VRegAndVal->Value.getSExtValue(); 1560 if (Amount > 31) 1561 return true; // This will have to remain a register variant. 1562 auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount); 1563 Observer.changingInstr(MI); 1564 MI.getOperand(2).setReg(ExtCst.getReg(0)); 1565 Observer.changedInstr(MI); 1566 return true; 1567 } 1568 1569 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset, 1570 MachineRegisterInfo &MRI) { 1571 Base = Root; 1572 Offset = 0; 1573 1574 Register NewBase; 1575 int64_t NewOffset; 1576 if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) && 1577 isShiftedInt<7, 3>(NewOffset)) { 1578 Base = NewBase; 1579 Offset = NewOffset; 1580 } 1581 } 1582 1583 // FIXME: This should be removed and replaced with the generic bitcast legalize 1584 // action. 1585 bool AArch64LegalizerInfo::legalizeLoadStore( 1586 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, 1587 GISelChangeObserver &Observer) const { 1588 assert(MI.getOpcode() == TargetOpcode::G_STORE || 1589 MI.getOpcode() == TargetOpcode::G_LOAD); 1590 // Here we just try to handle vector loads/stores where our value type might 1591 // have pointer elements, which the SelectionDAG importer can't handle. To 1592 // allow the existing patterns for s64 to fire for p0, we just try to bitcast 1593 // the value to use s64 types. 1594 1595 // Custom legalization requires the instruction, if not deleted, must be fully 1596 // legalized. In order to allow further legalization of the inst, we create 1597 // a new instruction and erase the existing one. 1598 1599 Register ValReg = MI.getOperand(0).getReg(); 1600 const LLT ValTy = MRI.getType(ValReg); 1601 1602 if (ValTy == LLT::scalar(128)) { 1603 1604 AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering(); 1605 bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD; 1606 bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire; 1607 bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release; 1608 bool IsRcpC3 = 1609 ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease); 1610 1611 LLT s64 = LLT::scalar(64); 1612 1613 unsigned Opcode; 1614 if (IsRcpC3) { 1615 Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX; 1616 } else { 1617 // For LSE2, loads/stores should have been converted to monotonic and had 1618 // a fence inserted after them. 1619 assert(Ordering == AtomicOrdering::Monotonic || 1620 Ordering == AtomicOrdering::Unordered); 1621 assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2"); 1622 1623 Opcode = IsLoad ? 
    MachineInstrBuilder NewI;
    if (IsLoad) {
      NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
      MIRBuilder.buildMergeLikeInstr(
          ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
    } else {
      auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
      NewI = MIRBuilder.buildInstr(
          Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
    }

    if (IsRcpC3) {
      NewI.addUse(MI.getOperand(1).getReg());
    } else {
      Register Base;
      int Offset;
      matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
      NewI.addUse(Base);
      NewI.addImm(Offset / 8);
    }

    NewI.cloneMemRefs(MI);
    constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
                                     *MRI.getTargetRegisterInfo(),
                                     *ST->getRegBankInfo());
    MI.eraseFromParent();
    return true;
  }

  if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
      ValTy.getElementType().getAddressSpace() != 0) {
    LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
    return false;
  }

  unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  auto &MMO = **MI.memoperands_begin();
  MMO.setType(NewTy);

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
    MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  } else {
    auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
    MIRBuilder.buildBitcast(ValReg, NewLoad);
  }
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  Align Alignment(MI.getOperand(2).getImm());
  Register Dst = MI.getOperand(0).getReg();
  Register ListPtr = MI.getOperand(1).getReg();

  LLT PtrTy = MRI.getType(ListPtr);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
  const Align PtrAlign = Align(PtrSize);
  auto List = MIRBuilder.buildLoad(
      PtrTy, ListPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               PtrTy, PtrAlign));

  MachineInstrBuilder DstPtr;
  if (Alignment > PtrAlign) {
    // Realign the list to the actual required alignment.
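    // A rough sketch of the round-up this produces (assuming the usual
    // align-up idiom): DstPtr = (List + Alignment - 1) & ~(Alignment - 1),
    // emitted as a G_PTR_ADD of (Alignment - 1) followed by masking off the
    // low Log2(Alignment) bits of the pointer.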
    auto AlignMinus1 =
        MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
    auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
    DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
  } else
    DstPtr = List;

  LLT ValTy = MRI.getType(Dst);
  uint64_t ValSize = ValTy.getSizeInBits() / 8;
  MIRBuilder.buildLoad(
      Dst, DstPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               ValTy, std::max(Alignment, PtrAlign)));

  auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));

  auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));

  MIRBuilder.buildStore(NewList, ListPtr,
                        *MF.getMachineMemOperand(MachinePointerInfo(),
                                                 MachineMemOperand::MOStore,
                                                 PtrTy, PtrAlign));

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeBitfieldExtract(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  // Only legal if we can select immediate forms.
  // TODO: Lower this otherwise.
  return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
         getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
}

bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // When there is no integer popcount instruction (FEAT_CSSC isn't available),
  // it can be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
  // registers are cheap.
  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
  //
  // For 128 bit vector popcounts, we lower to the following sequence:
  //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.4s v0, v0  //        v4s32, v2s64
  //  uaddlp.2d v0, v0  //               v2s64
  //
  // For 64 bit vector popcounts, we lower to the following sequence:
  //  cnt.8b    v0, v0  // v4s16, v2s32
  //  uaddlp.4h v0, v0  // v4s16, v2s32
  //  uaddlp.2s v0, v0  //        v2s32

  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Val);
  unsigned Size = Ty.getSizeInBits();

  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");

  if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
    LLT s64 = LLT::scalar(64);

    auto Split = MIRBuilder.buildUnmerge(s64, Val);
    auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
    auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
    auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);

    MIRBuilder.buildZExt(Dst, Add);
    MI.eraseFromParent();
    return true;
  }

  if (!ST->hasNEON() ||
      MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
    // Use generic lowering when custom lowering is not possible.
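    // (A sketch of what falls out here: LegalizerHelper::lowerBitCount
    // typically expands a scalar G_CTPOP with the standard bit-twiddling
    // popcount sequence; only 32- and 64-bit scalars are accepted on this
    // path.)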
    return Ty.isScalar() && (Size == 32 || Size == 64) &&
           Helper.lowerBitCount(MI) ==
               LegalizerHelper::LegalizeResult::Legalized;
  }

  // Pre-conditioning: widen Val up to the nearest vector type.
  // s32,s64,v4s16,v2s32 -> v8i8
  // v8s16,v4s32,v2s64   -> v16i8
  LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  if (Ty.isScalar()) {
    assert((Size == 32 || Size == 64 || Size == 128) &&
           "Expected only 32, 64, or 128 bit scalars!");
    if (Size == 32) {
      Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
    }
  }
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.
  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
    HSum = UADD.getReg(0);
  }

  // Post-conditioning.
  if (Ty.isScalar() && (Size == 64 || Size == 128))
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up
    // with the rest of the MIR, so we must reassemble the extracted registers
    // into a 128-bit known-regclass one with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
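    //
    // (Descriptive note: the CASP flavour chosen below follows the merged
    // atomic ordering of the memory operand: acquire -> CASPAX, release ->
    // CASPLX, acq_rel/seq_cst -> CASPALX, anything else -> CASPX.)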
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because
    // LDXP/STXP can take arbitrary registers, so it just has the normal GPR64
    // operands that the rest of AArch64 expects.
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
  }

  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
                                        LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
  auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
  MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
                                          LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;

  // The tagged version, MOPSMemorySetTagged, is legalized in
  // legalizeIntrinsic.
  if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
    // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
    // the instruction).
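    // An illustrative rewrite (register names and the s8 value type are just
    // an example; trailing operands are omitted):
    //   G_MEMSET %dst(p0), %val(s8), %len(s64), ...
    // becomes
    //   %ext:_(s64) = G_ANYEXT %val(s8)
    //   G_MEMSET %dst(p0), %ext(s64), %len(s64), ...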
    auto &Value = MI.getOperand(1);
    Register ExtValueReg =
        MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ExtValueReg);
    return true;
  }

  return false;
}

bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI,
                                             LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy.isScalar() && "Only expected scalars right now!");
  const unsigned DstSize = DstTy.getSizeInBits();
  assert((DstSize == 32 || DstSize == 64) && "Unexpected dst type!");
  assert(MRI.getType(MI.getOperand(2).getReg()) == DstTy &&
         "Expected homogeneous types!");

  // We want to materialize a mask with the high bit set.
  uint64_t EltMask;
  LLT VecTy;

  // TODO: s16 support.
  switch (DstSize) {
  default:
    llvm_unreachable("Unexpected type for G_FCOPYSIGN!");
  case 64: {
    // AdvSIMD immediate moves cannot materialize our mask in a single
    // instruction for 64-bit elements. Instead, materialize zero and then
    // negate it.
    EltMask = 0;
    VecTy = LLT::fixed_vector(2, DstTy);
    break;
  }
  case 32:
    EltMask = 0x80000000ULL;
    VecTy = LLT::fixed_vector(4, DstTy);
    break;
  }

  // Widen In1 and In2 to 128 bits. We want these to eventually become
  // INSERT_SUBREGs.
  auto Undef = MIRBuilder.buildUndef(VecTy);
  auto Zero = MIRBuilder.buildConstant(DstTy, 0);
  auto Ins1 = MIRBuilder.buildInsertVectorElement(
      VecTy, Undef, MI.getOperand(1).getReg(), Zero);
  auto Ins2 = MIRBuilder.buildInsertVectorElement(
      VecTy, Undef, MI.getOperand(2).getReg(), Zero);

  // Construct the mask.
  auto Mask = MIRBuilder.buildConstant(VecTy, EltMask);
  if (DstSize == 64)
    Mask = MIRBuilder.buildFNeg(VecTy, Mask);

  auto Sel = MIRBuilder.buildInstr(AArch64::G_BSP, {VecTy}, {Mask, Ins2, Ins1});

  // Build an unmerge whose 0th elt is the original G_FCOPYSIGN destination. We
  // want this to eventually become an EXTRACT_SUBREG.
  SmallVector<Register, 2> DstRegs(1, Dst);
  for (unsigned I = 1, E = VecTy.getNumElements(); I < E; ++I)
    DstRegs.push_back(MRI.createGenericVirtualRegister(DstTy));
  MIRBuilder.buildUnmerge(DstRegs, Sel);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeExtractVectorElt(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
  auto VRegAndVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (VRegAndVal)
    return true;
  return Helper.lowerExtractInsertVectorElt(MI) !=
         LegalizerHelper::LegalizeResult::UnableToLegalize;
}

bool AArch64LegalizerInfo::legalizeDynStackAlloc(
    MachineInstr &MI, LegalizerHelper &Helper) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  // If stack probing is not enabled for this function, use the default
  // lowering.
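  // (Descriptive note: with -fstack-clash-protection, clang typically attaches
  // the "probe-stack"="inline-asm" function attribute, which is what selects
  // the probed expansion below; everything else falls back to
  // LegalizerHelper::lowerDynStackAlloc().)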
  if (!MF.getFunction().hasFnAttribute("probe-stack") ||
      MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
          "inline-asm") {
    Helper.lowerDynStackAlloc(MI);
    return true;
  }

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
         "Unexpected type for dynamic alloca");
  assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
         "Unexpected type for dynamic alloca");

  LLT PtrTy = MRI.getType(Dst);
  Register SPReg =
      Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
  Register SPTmp =
      Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
  auto NewMI =
      MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
  MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
  MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
                                            LegalizerHelper &Helper) const {
  MachineIRBuilder &MIB = Helper.MIRBuilder;
  auto &AddrVal = MI.getOperand(0);

  int64_t IsWrite = MI.getOperand(1).getImm();
  int64_t Locality = MI.getOperand(2).getImm();
  int64_t IsData = MI.getOperand(3).getImm();

  bool IsStream = Locality == 0;
  if (Locality != 0) {
    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The IR locality degree runs opposite to the target's cache-level
    // encoding (which starts at 0 for level 1), so flip the number around.
    Locality = 3 - Locality;
  }

  unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;

  MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
  MI.eraseFromParent();
  return true;
}