//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64LegalizerInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
#include <initializer_list>

#define DEBUG_TYPE "aarch64-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
    : ST(&ST) {
  using namespace TargetOpcode;
  const LLT p0 = LLT::pointer(0, 64);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s128 = LLT::scalar(128);
  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s8 = LLT::fixed_vector(8, 8);
  const LLT v4s8 = LLT::fixed_vector(4, 8);
  const LLT v2s8 = LLT::fixed_vector(2, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s16 = LLT::fixed_vector(4, 16);
  const LLT v2s16 = LLT::fixed_vector(2, 16);
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

  const LLT nxv16s8 = LLT::scalable_vector(16, s8);
  const LLT nxv8s16 = LLT::scalable_vector(8, s16);
  const LLT nxv4s32 = LLT::scalable_vector(4, s32);
  const LLT nxv2s64 = LLT::scalable_vector(2, s64);

  std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                        v16s8, v8s16, v4s32,
                                                        v2s64, v2p0,
                                                        /* End 128bit types */
                                                        /* Begin 64bit types */
                                                        v8s8, v4s16, v2s32};
  std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
  SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
  SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);

  const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();

  // FIXME: support subtargets which have neon/fp-armv8 disabled.
  if (!ST.hasNEON() || !ST.hasFPARMv8()) {
    getLegacyLegalizerInfo().computeTables();
    return;
  }

  // Some instructions only support s16 if the subtarget has full 16-bit FP
  // support.
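  // (For example, without full FP16 an s16 G_FADD is widened to s32 through
  // the MinFPScalar clamp below; with +fullfp16 it stays legal as s16.)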
  const bool HasFP16 = ST.hasFullFP16();
  const LLT &MinFPScalar = HasFP16 ? s16 : s32;

  const bool HasCSSC = ST.hasCSSC();
  const bool HasRCPC3 = ST.hasRCPC3();

  getActionDefinitionsBuilder(
      {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
      .legalFor({p0, s8, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64)
      .moreElementsToNextPow2(0)
      .widenVectorEltsToVectorMinSize(0, 64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_PHI)
      .legalFor({p0, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .clampScalar(0, s16, s64)
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2);

  getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({s32, s64, v4s16, v8s16, v2s32, v4s32, v2s64})
      .widenScalarOrEltToNextPow2(0, 16)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
      .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 2;
          },
          0, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 4;
          },
          0, s16)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 16;
          },
          0, s8)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
      .customIf([=](const LegalityQuery &Query) {
        const auto &SrcTy = Query.Types[0];
        const auto &AmtTy = Query.Types[1];
        return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
               AmtTy.getSizeInBits() == 32;
      })
      .legalFor({
          {s32, s32},
          {s32, s64},
          {s64, s64},
          {v8s8, v8s8},
          {v16s8, v16s8},
          {v4s16, v4s16},
          {v8s16, v8s16},
          {v2s32, v2s32},
          {v4s32, v4s32},
          {v2s64, v2s64},
      })
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_PTR_ADD)
      .legalFor({{p0, s64}, {v2p0, v2s64}})
      .clampScalarOrElt(1, s64, s64)
      .clampNumElements(0, v2p0, v2p0);

  getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});

  getActionDefinitionsBuilder({G_SDIV, G_UDIV})
      .legalFor({s32, s64})
      .libcallFor({s128})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      .scalarize(0);

  getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
      .widenScalarOrEltToNextPow2(0)
      .clampScalarOrElt(0, s32, s64)
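      // (AArch64 has no integer remainder instruction; the generic lowering
      // expands these into, roughly, a divide, a multiply and a subtract.)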
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64)
      .lower();

  getActionDefinitionsBuilder({G_SMULH, G_UMULH})
      .legalFor({s64, v8s16, v16s8, v4s32})
      .lower();

  auto &MinMaxActions =
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX});
  if (HasCSSC)
    MinMaxActions
        .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
        // Clamping is made conditional on the CSSC extension: without legal
        // types we lower to CMP, which can fold one of the two sxtb's we'd
        // otherwise need if we detect a type smaller than 32-bit.
        .minScalar(0, s32);
  else
    MinMaxActions.legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
  MinMaxActions
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      // FIXME: This shouldn't be needed as v2s64 types are going to
      // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
      .clampNumElements(0, v2s64, v2s64)
      .lower();

  getActionDefinitionsBuilder(
      {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
      .legalFor({{s32, s32}, {s64, s32}})
      .clampScalar(0, s32, s64)
      .clampScalar(1, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG,
                               G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM,
                               G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR,
                               G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC,
                               G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
      .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[0];
        return (Ty == v8s16 || Ty == v4s16) && HasFP16;
      })
      .libcallFor({s128})
      .minScalarOrElt(0, MinFPScalar)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder(G_FREM)
      .libcallFor({s32, s64})
      .minScalar(0, s32)
      .scalarize(0);

  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
      .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}})
      .libcallFor({{s64, s128}})
      .minScalarOrElt(1, MinFPScalar);

  getActionDefinitionsBuilder(
      {G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10, G_FTAN, G_FEXP,
       G_FEXP2, G_FEXP10, G_FACOS, G_FASIN, G_FATAN, G_FCOSH, G_FSINH, G_FTANH})
      // We need a call for these, so we always need to scalarize.
      .scalarize(0)
      // Regardless of FP16 support, widen 16-bit elements to 32-bits.
      .minScalar(0, s32)
      .libcallFor({s32, s64});
  getActionDefinitionsBuilder(G_FPOWI)
      .scalarize(0)
      .minScalar(0, s32)
      .libcallFor({{s32, s32}, {s64, s32}});

  getActionDefinitionsBuilder(G_INSERT)
      .legalIf(all(typeInSet(0, {s32, s64, p0}),
                   typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .minScalar(1, s8)
      .maxScalarIf(typeInSet(0, {s32}), 1, s16)
      .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);

  getActionDefinitionsBuilder(G_EXTRACT)
      .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
                   typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalar(0, s16)
      .maxScalarIf(typeInSet(1, {s32}), 0, s16)
      .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
      .maxScalarIf(typeInSet(1, {s128}), 0, s64);

  for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
    auto &Actions = getActionDefinitionsBuilder(Op);

    if (Op == G_SEXTLOAD)
      Actions.lowerIf(
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));

    // Atomics have zero extending behavior.
    Actions
        .legalForTypesWithMemDesc({{s32, p0, s8, 8},
                                   {s32, p0, s16, 8},
                                   {s32, p0, s32, 8},
                                   {s64, p0, s8, 2},
                                   {s64, p0, s16, 2},
                                   {s64, p0, s32, 4},
                                   {s64, p0, s64, 8},
                                   {p0, p0, s64, 8},
                                   {v2s32, p0, s64, 8}})
        .widenScalarToNextPow2(0)
        .clampScalar(0, s32, s64)
        // TODO: We could support sum-of-pow2's but the lowering code doesn't
        // know how to do that yet.
        .unsupportedIfMemSizeNotPow2()
        // Lower anything left over into G_*EXT and G_LOAD
        .lower();
  }

  auto IsPtrVecPred = [=](const LegalityQuery &Query) {
    const LLT &ValTy = Query.Types[0];
    return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
  };

  auto &LoadActions = getActionDefinitionsBuilder(G_LOAD);
  auto &StoreActions = getActionDefinitionsBuilder(G_STORE);

  if (ST.hasSVE()) {
    LoadActions.legalForTypesWithMemDesc({
        // 128 bit base sizes
        {nxv16s8, p0, nxv16s8, 8},
        {nxv8s16, p0, nxv8s16, 8},
        {nxv4s32, p0, nxv4s32, 8},
        {nxv2s64, p0, nxv2s64, 8},
    });

    // TODO: Add nxv2p0. Consider bitcastIf.
    // See #92130
    // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
    StoreActions.legalForTypesWithMemDesc({
        // 128 bit base sizes
        {nxv16s8, p0, nxv16s8, 8},
        {nxv8s16, p0, nxv8s16, 8},
        {nxv4s32, p0, nxv4s32, 8},
        {nxv2s64, p0, nxv2s64, 8},
    });
  }

  LoadActions
      .customIf([=](const LegalityQuery &Query) {
        return HasRCPC3 && Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
      })
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s64, 8},
                                 {p0, p0, s64, 8},
                                 {s128, p0, s128, 8},
                                 {v8s8, p0, s64, 8},
                                 {v16s8, p0, s128, 8},
                                 {v4s16, p0, s64, 8},
                                 {v8s16, p0, s128, 8},
                                 {v2s32, p0, s64, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      // These extends are also legal
      .legalForTypesWithMemDesc(
          {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
      .widenScalarToNextPow2(0, /* MinSize = */ 8)
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .lowerIfMemSizeNotByteSizePow2()
      .clampScalar(0, s8, s64)
      .narrowScalarIf(
          [=](const LegalityQuery &Query) {
            // Clamp extending load results to 32-bits.
            return Query.Types[0].isScalar() &&
                   Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
                   Query.Types[0].getSizeInBits() > 32;
          },
          changeTo(0, s32))
      // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
      .bitcastIf(typeInSet(0, {v4s8}),
                 [=](const LegalityQuery &Query) {
                   const LLT VecTy = Query.Types[0];
                   return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
                 })
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);

  StoreActions
      .customIf([=](const LegalityQuery &Query) {
        return HasRCPC3 && Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
      })
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc(
          {{s8, p0, s8, 8}, {s16, p0, s8, 8},     // truncstorei8 from s16
           {s32, p0, s8, 8},                      // truncstorei8 from s32
           {s64, p0, s8, 8},                      // truncstorei8 from s64
           {s16, p0, s16, 8}, {s32, p0, s16, 8},  // truncstorei16 from s32
           {s64, p0, s16, 8},                     // truncstorei16 from s64
           {s32, p0, s8, 8}, {s32, p0, s16, 8}, {s32, p0, s32, 8},
           {s64, p0, s64, 8}, {s64, p0, s32, 8},  // truncstorei32 from s64
           {p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8},
           {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8},
           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
      .clampScalar(0, s8, s64)
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isScalar() &&
               Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .lowerIfMemSizeNotPow2()
      // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
      .bitcastIf(typeInSet(0, {v4s8}),
                 [=](const LegalityQuery &Query) {
                   const LLT VecTy = Query.Types[0];
                   return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
                 })
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);

  getActionDefinitionsBuilder(G_INDEXED_STORE)
      // Idx 0 == Ptr, Idx 1 == Val
      // TODO: we can implement legalizations but as of now these are
      // generated in a very specific way.
      .legalForTypesWithMemDesc({
          {p0, s8, s8, 8},
          {p0, s16, s16, 8},
          {p0, s32, s8, 8},
          {p0, s32, s16, 8},
          {p0, s32, s32, 8},
          {p0, s64, s64, 8},
          {p0, p0, p0, 8},
          {p0, v8s8, v8s8, 8},
          {p0, v16s8, v16s8, 8},
          {p0, v4s16, v4s16, 8},
          {p0, v8s16, v8s16, 8},
          {p0, v2s32, v2s32, 8},
          {p0, v4s32, v4s32, 8},
          {p0, v2s64, v2s64, 8},
          {p0, v2p0, v2p0, 8},
          {p0, s128, s128, 8},
      })
      .unsupported();

  auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
    LLT LdTy = Query.Types[0];
    LLT PtrTy = Query.Types[1];
    if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
        !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
      return false;
    if (PtrTy != p0)
      return false;
    return true;
  };
  getActionDefinitionsBuilder(G_INDEXED_LOAD)
      .unsupportedIf(
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalIf(IndexedLoadBasicPred)
      .unsupported();
  getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
      .unsupportedIf(
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalIf(all(typeInSet(0, {s16, s32, s64}),
                   LegalityPredicate([=](const LegalityQuery &Q) {
                     LLT LdTy = Q.Types[0];
                     LLT PtrTy = Q.Types[1];
                     LLT MemTy = Q.MMODescrs[0].MemoryTy;
                     if (PtrTy != p0)
                       return false;
                     if (LdTy == s16)
                       return MemTy == s8;
                     if (LdTy == s32)
                       return MemTy == s8 || MemTy == s16;
                     if (LdTy == s64)
                       return MemTy == s8 || MemTy == s16 || MemTy == s32;
                     return false;
                   })))
      .unsupported();

  // Constants
  getActionDefinitionsBuilder(G_CONSTANT)
      .legalFor({p0, s8, s16, s32, s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64);
  getActionDefinitionsBuilder(G_FCONSTANT)
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[0];
        if (HasFP16 && Ty == s16)
          return true;
        return Ty == s32 || Ty == s64 || Ty == s128;
      })
      .clampScalar(0, MinFPScalar, s128);

  // FIXME: fix moreElementsToNextPow2
  getActionDefinitionsBuilder(G_ICMP)
      .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s32)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.isPointerVector() &&
                   Ty.getElementType() != SrcTy.getElementType();
          },
          0, 1)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
          1, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
          s64)
      .moreElementsToNextPow2(1)
      .clampNumElements(1, v8s8, v16s8)
      .clampNumElements(1, v4s16, v8s16)
      .clampNumElements(1, v2s32, v4s32)
      .clampNumElements(1, v2s64, v2s64)
      .customIf(isVector(0));

  getActionDefinitionsBuilder(G_FCMP)
      .legalFor({{s32, MinFPScalar},
                 {s32, s32},
                 {s32, s64},
                 {v4s32, v4s32},
                 {v2s32, v2s32},
                 {v2s64, v2s64}})
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[1];
        return (Ty == v8s16 || Ty == v4s16) && Ty == Query.Types[0] && HasFP16;
      })
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(0, s32, s32)
      .clampScalarOrElt(1, MinFPScalar, s64)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.isPointerVector() &&
                   Ty.getElementType() != SrcTy.getElementType();
          },
          0, 1)
      .clampNumElements(1, v4s16, v8s16)
      .clampNumElements(1, v2s32, v4s32)
      .clampMaxNumElements(1, s64, 2)
      .moreElementsToNextPow2(1);

  // Extensions
  auto ExtLegalFunc = [=](const LegalityQuery &Query) {
    unsigned DstSize = Query.Types[0].getSizeInBits();

    // Handle legal vectors using legalFor
    if (Query.Types[0].isVector())
      return false;

    if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
      return false; // Extending to a scalar s128 needs narrowing.

    const LLT &SrcTy = Query.Types[1];

    // Make sure we fit in a register otherwise. Don't bother checking that
    // the source type is below 128 bits. We shouldn't be allowing anything
    // through which is wider than the destination in the first place.
    unsigned SrcSize = SrcTy.getSizeInBits();
    if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
      return false;

    return true;
  };
  getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
      .legalIf(ExtLegalFunc)
      .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
      .clampScalar(0, s64, s64) // Just for s128, others are handled above.
      .moreElementsToNextPow2(0)
      .clampMaxNumElements(1, s8, 8)
      .clampMaxNumElements(1, s16, 4)
      .clampMaxNumElements(1, s32, 2)
      // Tries to convert a large EXTEND into two smaller EXTENDs
      .lowerIf([=](const LegalityQuery &Query) {
        return (Query.Types[0].getScalarSizeInBits() >
                Query.Types[1].getScalarSizeInBits() * 2) &&
               Query.Types[0].isVector() &&
               (Query.Types[1].getScalarSizeInBits() == 8 ||
                Query.Types[1].getScalarSizeInBits() == 16);
      })
      .clampMinNumElements(1, s8, 8)
      .clampMinNumElements(1, s16, 4);

  getActionDefinitionsBuilder(G_TRUNC)
      .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
      .moreElementsToNextPow2(0)
      .clampMaxNumElements(0, s8, 8)
      .clampMaxNumElements(0, s16, 4)
      .clampMaxNumElements(0, s32, 2)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
          0, s8)
      .lowerIf([=](const LegalityQuery &Query) {
        LLT DstTy = Query.Types[0];
        LLT SrcTy = Query.Types[1];
        return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
               DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
      })
      .clampMinNumElements(0, s8, 8)
      .clampMinNumElements(0, s16, 4)
      .alwaysLegal();

  getActionDefinitionsBuilder(G_SEXT_INREG)
      .legalFor({s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .maxScalar(0, s64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .lower();

  // FP conversions
  getActionDefinitionsBuilder(G_FPTRUNC)
      .legalFor(
          {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
      .libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
      .clampNumElements(0, v4s16, v4s16)
      .clampNumElements(0, v2s32, v2s32)
      .scalarize(0);

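  // (The s128 cases above and below are fp128 values; they end up as
  // compiler-rt conversion libcalls such as __trunctfsf2 / __extendsftf2.)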
  getActionDefinitionsBuilder(G_FPEXT)
      .legalFor(
          {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
      .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}})
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .scalarize(0);

  // Conversions
  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
      .legalFor({{s32, s32},
                 {s64, s32},
                 {s32, s64},
                 {s64, s64},
                 {v2s64, v2s64},
                 {v4s32, v4s32},
                 {v2s32, v2s32}})
      .legalIf([=](const LegalityQuery &Query) {
        return HasFP16 &&
               (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
                Query.Types[1] == v8s16) &&
               (Query.Types[0] == s32 || Query.Types[0] == s64 ||
                Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
      })
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      // The range of a fp16 value fits into an i17, so we can lower the width
      // to i64.
      .narrowScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
          },
          changeTo(0, s64))
      .moreElementsToNextPow2(0)
      .widenScalarOrEltToNextPow2OrMinSize(0)
      .minScalar(0, s32)
      .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() >
                       Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(1, 0))
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() <
                       Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(0, 1))
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .libcallFor(
          {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
      .legalFor({{s32, s32},
                 {s64, s32},
                 {s32, s64},
                 {s64, s64},
                 {v2s64, v2s64},
                 {v4s32, v4s32},
                 {v2s32, v2s32}})
      .legalIf([=](const LegalityQuery &Query) {
        return HasFP16 &&
               (Query.Types[0] == s16 || Query.Types[0] == v4s16 ||
                Query.Types[0] == v8s16) &&
               (Query.Types[1] == s32 || Query.Types[1] == s64 ||
                Query.Types[1] == v4s16 || Query.Types[1] == v8s16);
      })
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .moreElementsToNextPow2(1)
      .widenScalarOrEltToNextPow2OrMinSize(1)
      .minScalar(1, s32)
      .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32)
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() <
                       Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(0, 1))
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() >
                       Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(1, 0))
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .libcallFor({{s16, s128},
                   {s32, s128},
                   {s64, s128},
                   {s128, s128},
                   {s128, s32},
                   {s128, s64}});

  // Control-flow
  getActionDefinitionsBuilder(G_BRCOND)
      .legalFor({s32})
      .clampScalar(0, s32, s32);
  getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});

  getActionDefinitionsBuilder(G_SELECT)
      .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampScalar(1, s32, s32)
      .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
      .lowerIf(isVector(0));

  // Pointer-handling
  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});

  if (TM.getCodeModel() == CodeModel::Small)
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
  else
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});

  getActionDefinitionsBuilder(G_PTRAUTH_GLOBAL_VALUE)
      .legalIf(all(typeIs(0, p0), typeIs(1, p0)));

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalFor({{s64, p0}, {v2s64, v2p0}})
      .widenScalarToNextPow2(0, 64)
      .clampScalar(0, s64, s64);

  getActionDefinitionsBuilder(G_INTTOPTR)
      .unsupportedIf([&](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
      })
      .legalFor({{p0, s64}, {v2p0, v2s64}});

  // Casts for 32 and 64-bit width type are just copies.
  // Same for 128-bit width type, except they are on the FPR bank.
  getActionDefinitionsBuilder(G_BITCAST)
      // Keeping 32-bit instructions legal to prevent regression in some tests
      .legalForCartesianProduct({s32, v2s16, v4s8})
      .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
      .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isVector() != Query.Types[1].isVector();
      })
      .moreElementsToNextPow2(0)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .lower();

  getActionDefinitionsBuilder(G_VASTART).legalFor({p0});

  // va_list must be a pointer, but most sized types are pretty easy to handle
  // as the destination.
  getActionDefinitionsBuilder(G_VAARG)
      .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
      .clampScalar(0, s8, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
      .lowerIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));

  LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) {
    return ST.outlineAtomics() && !ST.hasLSE();
  };

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
                   predNot(UseOutlineAtomics)))
      .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics)))
      .customIf([UseOutlineAtomics](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() == 128 &&
               !UseOutlineAtomics(Query);
      })
      .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0),
                     UseOutlineAtomics))
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
                               G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
                               G_ATOMICRMW_XOR})
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
                   predNot(UseOutlineAtomics)))
      .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
                     UseOutlineAtomics))
      .clampScalar(0, s32, s64);

  // Do not outline these atomics operations, as per comment in
  // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
  getActionDefinitionsBuilder(
      {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    getActionDefinitionsBuilder(Op)
        .widenScalarToNextPow2(LitTyIdx, 8)
        .widenScalarToNextPow2(BigTyIdx, 32)
        .clampScalar(LitTyIdx, s8, s64)
        .clampScalar(BigTyIdx, s32, s128)
        .legalIf([=](const LegalityQuery &Q) {
          switch (Q.Types[BigTyIdx].getSizeInBits()) {
          case 32:
          case 64:
          case 128:
            break;
          default:
            return false;
          }
          switch (Q.Types[LitTyIdx].getSizeInBits()) {
          case 8:
          case 16:
          case 32:
          case 64:
            return true;
          default:
            return false;
          }
        });
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
      .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      })
      .minScalar(2, s64)
      .customIf([=](const LegalityQuery &Query) {
        const LLT &VecTy = Query.Types[1];
        return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
               VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
               VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0;
      })
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            // We want to promote <M x s1> to <M x s64> if that wouldn't
            // cause the total vec size to be > 128b.
            return Query.Types[1].getNumElements() <= 2;
          },
          0, s64)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 4;
          },
          0, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 8;
          },
          0, s16)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 16;
          },
          0, s8)
      .minScalarOrElt(0, s8) // Worst case, we need at least s8.
      .moreElementsToNextPow2(1)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, s8, 16)
      .clampMaxNumElements(1, p0, 2);

  getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
      .legalIf(
          typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64, v2p0}))
      .moreElementsToNextPow2(0)
      .widenVectorEltsToVectorMinSize(0, 64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2);

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
      .legalFor({{v8s8, s8},
                 {v16s8, s8},
                 {v4s16, s16},
                 {v8s16, s16},
                 {v2s32, s32},
                 {v4s32, s32},
                 {v2p0, p0},
                 {v2s64, s64}})
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarOrElt(0, s8)
      .widenVectorEltsToVectorMinSize(0, 64)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();

  getActionDefinitionsBuilder(G_CTLZ)
      .legalForCartesianProduct(
          {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .scalarize(1)
      .widenScalarToNextPow2(1, /*Min=*/32)
      .clampScalar(1, s32, s64)
      .scalarSameSizeAs(0, 1);
  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();

  // TODO: Custom lowering for v2s32, v4s32, v2s64.
  getActionDefinitionsBuilder(G_BITREVERSE)
      .legalFor({s32, s64, v8s8, v16s8})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();

  getActionDefinitionsBuilder(G_CTTZ)
      .lowerIf(isVector(0))
      .widenScalarToNextPow2(1, /*Min=*/32)
      .clampScalar(1, s32, s64)
      .scalarSameSizeAs(0, 1)
      .legalIf([=](const LegalityQuery &Query) {
        return (HasCSSC && typeInSet(0, {s32, s64})(Query));
      })
      .customIf([=](const LegalityQuery &Query) {
        return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
      });

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &DstTy = Query.Types[0];
        const LLT &SrcTy = Query.Types[1];
        // For now just support the TBL2 variant which needs the source vectors
        // to be the same size as the dest.
        if (DstTy != SrcTy)
          return false;
        return llvm::is_contained(
            {v2s64, v2p0, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
      })
      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
      // just want those lowered into G_BUILD_VECTOR
      .lowerIf([=](const LegalityQuery &Query) {
        return !Query.Types[1].isVector();
      })
      .moreElementsIf(
          [](const LegalityQuery &Query) {
            return Query.Types[0].isVector() && Query.Types[1].isVector() &&
                   Query.Types[0].getNumElements() >
                       Query.Types[1].getNumElements();
          },
          changeTo(1, 0))
      .moreElementsToNextPow2(0)
      .moreElementsIf(
          [](const LegalityQuery &Query) {
            return Query.Types[0].isVector() && Query.Types[1].isVector() &&
                   Query.Types[0].getNumElements() <
                       Query.Types[1].getNumElements();
          },
          changeTo(0, 1))
      .widenScalarOrEltToNextPow2OrMinSize(0, 8)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}})
      .bitcastIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getSizeInBits() <= 128 &&
                   Query.Types[1].getSizeInBits() <= 64;
          },
          [=](const LegalityQuery &Query) {
            const LLT DstTy = Query.Types[0];
            const LLT SrcTy = Query.Types[1];
            return std::pair(
                0, DstTy.changeElementSize(SrcTy.getSizeInBits())
                       .changeElementCount(
                           DstTy.getElementCount().divideCoefficientBy(
                               SrcTy.getNumElements())));
          });

  getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});

  getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});

  getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();

  getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();

  if (ST.hasMOPS()) {
    // G_BZERO is not supported. Currently it is only emitted by
    // PreLegalizerCombiner for G_MEMSET with zero constant.
    getActionDefinitionsBuilder(G_BZERO).unsupported();

    getActionDefinitionsBuilder(G_MEMSET)
        .legalForCartesianProduct({p0}, {s64}, {s64})
        .customForCartesianProduct({p0}, {s8}, {s64})
        .immIdx(0); // Inform verifier imm idx 0 is handled.

    getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
        .legalForCartesianProduct({p0}, {p0}, {s64})
        .immIdx(0); // Inform verifier imm idx 0 is handled.

    // G_MEMCPY_INLINE does not have a tailcall immediate
    getActionDefinitionsBuilder(G_MEMCPY_INLINE)
        .legalForCartesianProduct({p0}, {p0}, {s64});

  } else {
    getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
        .libcall();
  }

  // FIXME: Legal vector types are only legal with NEON.
  auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
  if (HasCSSC)
    ABSActions.legalFor({s32, s64});
  ABSActions.legalFor(PackedVectorAllTypeList)
      .customIf([=](const LegalityQuery &Q) {
        // TODO: Fix suboptimal codegen for 128+ bit types.
        LLT SrcTy = Q.Types[0];
        return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128;
      })
      .widenScalarIf(
          [=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
          [=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
      .widenScalarIf(
          [=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; },
          [=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); })
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .lower();

  // For fadd reductions we have pairwise operations available. We treat the
  // usual legal types as legal and handle the lowering to pairwise instructions
  // later.
  getActionDefinitionsBuilder(G_VECREDUCE_FADD)
      .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[1];
        return (Ty == v4s16 || Ty == v8s16) && HasFP16;
      })
      .minScalarOrElt(0, MinFPScalar)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .lower();

  // For fmul reductions we need to split up into individual operations. We
  // clamp to 128-bit vectors then to 64-bit vectors to produce a cascade of
  // smaller types, followed by scalarizing what remains.
  getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
      .minScalarOrElt(0, MinFPScalar)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, s32, 2)
      .clampMaxNumElements(1, s16, 4)
      .scalarize(1)
      .lower();

  getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
      .scalarize(2)
      .lower();

  getActionDefinitionsBuilder(G_VECREDUCE_ADD)
      .legalFor({{s8, v16s8},
                 {s8, v8s8},
                 {s16, v8s16},
                 {s16, v4s16},
                 {s32, v4s32},
                 {s32, v2s32},
                 {s64, v2s64}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, s8, 16)
      .lower();

  getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
                               G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
      .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[1];
        return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16;
      })
      .minScalarOrElt(0, MinFPScalar)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .lower();

  getActionDefinitionsBuilder(G_VECREDUCE_MUL)
      .clampMaxNumElements(1, s32, 2)
      .clampMaxNumElements(1, s16, 4)
      .clampMaxNumElements(1, s8, 8)
      .scalarize(1)
      .lower();

  getActionDefinitionsBuilder(
      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
      .legalFor({{s8, v8s8},
                 {s8, v16s8},
                 {s16, v4s16},
                 {s16, v8s16},
                 {s32, v2s32},
                 {s32, v4s32}})
      .moreElementsIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].isVector() &&
                   Query.Types[1].getElementType() != s8 &&
                   Query.Types[1].getNumElements() & 1;
          },
          LegalizeMutations::moreElementsToNextPow2(1))
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
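      // (As with the reductions above, clamp anything wider than 128 bits down
      // to a single 128-bit vector before scalarizing what remains.)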
.clampMaxNumElements(1, s16, 8) 1180 .clampMaxNumElements(1, s8, 16) 1181 .scalarize(1) 1182 .lower(); 1183 1184 getActionDefinitionsBuilder( 1185 {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR}) 1186 // Try to break down into smaller vectors as long as they're at least 64 1187 // bits. This lets us use vector operations for some parts of the 1188 // reduction. 1189 .fewerElementsIf( 1190 [=](const LegalityQuery &Q) { 1191 LLT SrcTy = Q.Types[1]; 1192 if (SrcTy.isScalar()) 1193 return false; 1194 if (!isPowerOf2_32(SrcTy.getNumElements())) 1195 return false; 1196 // We can usually perform 64b vector operations. 1197 return SrcTy.getSizeInBits() > 64; 1198 }, 1199 [=](const LegalityQuery &Q) { 1200 LLT SrcTy = Q.Types[1]; 1201 return std::make_pair(1, SrcTy.divide(2)); 1202 }) 1203 .scalarize(1) 1204 .lower(); 1205 1206 // TODO: Update this to correct handling when adding AArch64/SVE support. 1207 getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower(); 1208 1209 getActionDefinitionsBuilder({G_FSHL, G_FSHR}) 1210 .customFor({{s32, s32}, {s32, s64}, {s64, s64}}) 1211 .lower(); 1212 1213 getActionDefinitionsBuilder(G_ROTR) 1214 .legalFor({{s32, s64}, {s64, s64}}) 1215 .customIf([=](const LegalityQuery &Q) { 1216 return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64; 1217 }) 1218 .lower(); 1219 getActionDefinitionsBuilder(G_ROTL).lower(); 1220 1221 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1222 .customFor({{s32, s32}, {s64, s64}}); 1223 1224 auto always = [=](const LegalityQuery &Q) { return true; }; 1225 auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP); 1226 if (HasCSSC) 1227 CTPOPActions 1228 .legalFor({{s32, s32}, 1229 {s64, s64}, 1230 {v8s8, v8s8}, 1231 {v16s8, v16s8}}) 1232 .customFor({{s128, s128}, 1233 {v2s64, v2s64}, 1234 {v2s32, v2s32}, 1235 {v4s32, v4s32}, 1236 {v4s16, v4s16}, 1237 {v8s16, v8s16}}); 1238 else 1239 CTPOPActions 1240 .legalFor({{v8s8, v8s8}, 1241 {v16s8, v16s8}}) 1242 .customFor({{s32, s32}, 1243 {s64, s64}, 1244 {s128, s128}, 1245 {v2s64, v2s64}, 1246 {v2s32, v2s32}, 1247 {v4s32, v4s32}, 1248 {v4s16, v4s16}, 1249 {v8s16, v8s16}}); 1250 CTPOPActions 1251 .clampScalar(0, s32, s128) 1252 .widenScalarToNextPow2(0) 1253 .minScalarEltSameAsIf(always, 1, 0) 1254 .maxScalarEltSameAsIf(always, 1, 0); 1255 1256 getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}) 1257 .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8}) 1258 .clampNumElements(0, v8s8, v16s8) 1259 .clampNumElements(0, v4s16, v8s16) 1260 .clampNumElements(0, v2s32, v4s32) 1261 .clampMaxNumElements(0, s64, 2) 1262 .moreElementsToNextPow2(0) 1263 .lower(); 1264 1265 // TODO: Libcall support for s128. 1266 // TODO: s16 should be legal with full FP16 support. 1267 getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) 1268 .legalFor({{s64, s32}, {s64, s64}}); 1269 1270 // TODO: Custom legalization for mismatched types. 1271 getActionDefinitionsBuilder(G_FCOPYSIGN) 1272 .moreElementsIf( 1273 [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); }, 1274 [=](const LegalityQuery &Query) { 1275 const LLT Ty = Query.Types[0]; 1276 return std::pair(0, LLT::fixed_vector(Ty == s16 ? 4 : 2, Ty)); 1277 }) 1278 .lower(); 1279 1280 getActionDefinitionsBuilder(G_FMAD).lower(); 1281 1282 // Access to floating-point environment. 
1283 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV, 1284 G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE}) 1285 .libcall(); 1286 1287 getActionDefinitionsBuilder(G_IS_FPCLASS).lower(); 1288 1289 getActionDefinitionsBuilder(G_PREFETCH).custom(); 1290 1291 getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower(); 1292 1293 getLegacyLegalizerInfo().computeTables(); 1294 verify(*ST.getInstrInfo()); 1295 } 1296 1297 bool AArch64LegalizerInfo::legalizeCustom( 1298 LegalizerHelper &Helper, MachineInstr &MI, 1299 LostDebugLocObserver &LocObserver) const { 1300 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1301 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 1302 GISelChangeObserver &Observer = Helper.Observer; 1303 switch (MI.getOpcode()) { 1304 default: 1305 // No idea what to do. 1306 return false; 1307 case TargetOpcode::G_VAARG: 1308 return legalizeVaArg(MI, MRI, MIRBuilder); 1309 case TargetOpcode::G_LOAD: 1310 case TargetOpcode::G_STORE: 1311 return legalizeLoadStore(MI, MRI, MIRBuilder, Observer); 1312 case TargetOpcode::G_SHL: 1313 case TargetOpcode::G_ASHR: 1314 case TargetOpcode::G_LSHR: 1315 return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer); 1316 case TargetOpcode::G_GLOBAL_VALUE: 1317 return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer); 1318 case TargetOpcode::G_SBFX: 1319 case TargetOpcode::G_UBFX: 1320 return legalizeBitfieldExtract(MI, MRI, Helper); 1321 case TargetOpcode::G_FSHL: 1322 case TargetOpcode::G_FSHR: 1323 return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper); 1324 case TargetOpcode::G_ROTR: 1325 return legalizeRotate(MI, MRI, Helper); 1326 case TargetOpcode::G_CTPOP: 1327 return legalizeCTPOP(MI, MRI, Helper); 1328 case TargetOpcode::G_ATOMIC_CMPXCHG: 1329 return legalizeAtomicCmpxchg128(MI, MRI, Helper); 1330 case TargetOpcode::G_CTTZ: 1331 return legalizeCTTZ(MI, Helper); 1332 case TargetOpcode::G_BZERO: 1333 case TargetOpcode::G_MEMCPY: 1334 case TargetOpcode::G_MEMMOVE: 1335 case TargetOpcode::G_MEMSET: 1336 return legalizeMemOps(MI, Helper); 1337 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1338 return legalizeExtractVectorElt(MI, MRI, Helper); 1339 case TargetOpcode::G_DYN_STACKALLOC: 1340 return legalizeDynStackAlloc(MI, Helper); 1341 case TargetOpcode::G_PREFETCH: 1342 return legalizePrefetch(MI, Helper); 1343 case TargetOpcode::G_ABS: 1344 return Helper.lowerAbsToCNeg(MI); 1345 case TargetOpcode::G_ICMP: 1346 return legalizeICMP(MI, MRI, MIRBuilder); 1347 } 1348 1349 llvm_unreachable("expected switch to return"); 1350 } 1351 1352 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI, 1353 MachineRegisterInfo &MRI, 1354 MachineIRBuilder &MIRBuilder, 1355 GISelChangeObserver &Observer, 1356 LegalizerHelper &Helper) const { 1357 assert(MI.getOpcode() == TargetOpcode::G_FSHL || 1358 MI.getOpcode() == TargetOpcode::G_FSHR); 1359 1360 // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic 1361 // lowering 1362 Register ShiftNo = MI.getOperand(3).getReg(); 1363 LLT ShiftTy = MRI.getType(ShiftNo); 1364 auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI); 1365 1366 // Adjust shift amount according to Opcode (FSHL/FSHR) 1367 // Convert FSHL to FSHR 1368 LLT OperationTy = MRI.getType(MI.getOperand(0).getReg()); 1369 APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false); 1370 1371 // Lower non-constant shifts and leave zero shifts to the optimizer. 
1372 if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0) 1373 return (Helper.lowerFunnelShiftAsShifts(MI) == 1374 LegalizerHelper::LegalizeResult::Legalized); 1375 1376 APInt Amount = VRegAndVal->Value.urem(BitWidth); 1377 1378 Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount; 1379 1380 // If the instruction is G_FSHR, has a 64-bit G_CONSTANT for shift amount 1381 // in the range of 0 <-> BitWidth, it is legal 1382 if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR && 1383 VRegAndVal->Value.ult(BitWidth)) 1384 return true; 1385 1386 // Cast the ShiftNumber to a 64-bit type 1387 auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64)); 1388 1389 if (MI.getOpcode() == TargetOpcode::G_FSHR) { 1390 Observer.changingInstr(MI); 1391 MI.getOperand(3).setReg(Cast64.getReg(0)); 1392 Observer.changedInstr(MI); 1393 } 1394 // If Opcode is FSHL, remove the FSHL instruction and create a FSHR 1395 // instruction 1396 else if (MI.getOpcode() == TargetOpcode::G_FSHL) { 1397 MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()}, 1398 {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(), 1399 Cast64.getReg(0)}); 1400 MI.eraseFromParent(); 1401 } 1402 return true; 1403 } 1404 1405 bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI, 1406 MachineRegisterInfo &MRI, 1407 MachineIRBuilder &MIRBuilder) const { 1408 Register DstReg = MI.getOperand(0).getReg(); 1409 Register SrcReg1 = MI.getOperand(2).getReg(); 1410 Register SrcReg2 = MI.getOperand(3).getReg(); 1411 LLT DstTy = MRI.getType(DstReg); 1412 LLT SrcTy = MRI.getType(SrcReg1); 1413 1414 // Check the vector types are legal 1415 if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() || 1416 DstTy.getNumElements() != SrcTy.getNumElements() || 1417 (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128)) 1418 return false; 1419 1420 // Lowers G_ICMP NE => G_ICMP EQ to allow better pattern matching for 1421 // following passes 1422 CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate(); 1423 if (Pred != CmpInst::ICMP_NE) 1424 return true; 1425 Register CmpReg = 1426 MIRBuilder 1427 .buildICmp(CmpInst::ICMP_EQ, MRI.getType(DstReg), SrcReg1, SrcReg2) 1428 .getReg(0); 1429 MIRBuilder.buildNot(DstReg, CmpReg); 1430 1431 MI.eraseFromParent(); 1432 return true; 1433 } 1434 1435 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI, 1436 MachineRegisterInfo &MRI, 1437 LegalizerHelper &Helper) const { 1438 // To allow for imported patterns to match, we ensure that the rotate amount 1439 // is 64b with an extension. 1440 Register AmtReg = MI.getOperand(2).getReg(); 1441 LLT AmtTy = MRI.getType(AmtReg); 1442 (void)AmtTy; 1443 assert(AmtTy.isScalar() && "Expected a scalar rotate"); 1444 assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal"); 1445 auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg); 1446 Helper.Observer.changingInstr(MI); 1447 MI.getOperand(2).setReg(NewAmt.getReg(0)); 1448 Helper.Observer.changedInstr(MI); 1449 return true; 1450 } 1451 1452 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( 1453 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, 1454 GISelChangeObserver &Observer) const { 1455 assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); 1456 // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP + 1457 // G_ADD_LOW instructions. 
  // By splitting this here, we can optimize accesses in the small code model by
  // folding in the G_ADD_LOW into the load/store offset.
  auto &GlobalOp = MI.getOperand(1);
  // Don't modify an intrinsic call.
  if (GlobalOp.isSymbol())
    return true;
  const auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return true; // Don't want to modify TLS vars.

  auto &TM = ST->getTargetLowering()->getTargetMachine();
  unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);

  if (OpFlags & AArch64II::MO_GOT)
    return true;

  auto Offset = GlobalOp.getOffset();
  Register DstReg = MI.getOperand(0).getReg();
  auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
                  .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
  // Set the regclass on the dest reg too.
  MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);

  // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
  // by creating a MOVK that sets bits 48-63 of the register to (global address
  // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  // prevent an incorrect tag being generated during relocation when the
  // global appears before the code section. Without the offset, a global at
  // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
  // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
  // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
  // instead of `0xf`.
  // This assumes that we're in the small code model so we can assume a binary
  // size of <= 4GB, which makes the untagged PC relative offset positive. The
  // binary must also be loaded into address range [0, 2^48). Both of these
  // properties need to be ensured at runtime when using tagged addresses.
  if (OpFlags & AArch64II::MO_TAGGED) {
    assert(!Offset &&
           "Should not have folded in an offset for a tagged global!");
    ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
               .addGlobalAddress(GV, 0x100000000,
                                 AArch64II::MO_PREL | AArch64II::MO_G3)
               .addImm(48);
    MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  }

  MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
      .addGlobalAddress(GV, Offset,
                        OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::vacopy: {
    unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
    unsigned VaListSize =
        (ST->isTargetDarwin() || ST->isTargetWindows())
            ? PtrSize
            : ST->isTargetILP32() ? 20 : 32;

    MachineFunction &MF = *MI.getMF();
    auto Val = MF.getRegInfo().createGenericVirtualRegister(
        LLT::scalar(VaListSize * 8));
    MachineIRBuilder MIB(MI);
    MIB.buildLoad(Val, MI.getOperand(2),
                  *MF.getMachineMemOperand(MachinePointerInfo(),
                                           MachineMemOperand::MOLoad,
                                           VaListSize, Align(PtrSize)));
    MIB.buildStore(Val, MI.getOperand(1),
                   *MF.getMachineMemOperand(MachinePointerInfo(),
                                            MachineMemOperand::MOStore,
                                            VaListSize, Align(PtrSize)));
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::get_dynamic_area_offset: {
    MachineIRBuilder &MIB = Helper.MIRBuilder;
    MIB.buildConstant(MI.getOperand(0).getReg(), 0);
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
    // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
    // the instruction).
    MachineIRBuilder MIB(MI);
    auto &Value = MI.getOperand(3);
    Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ExtValueReg);
    return true;
  }
  case Intrinsic::aarch64_prefetch: {
    MachineIRBuilder MIB(MI);
    auto &AddrVal = MI.getOperand(1);

    int64_t IsWrite = MI.getOperand(2).getImm();
    int64_t Target = MI.getOperand(3).getImm();
    int64_t IsStream = MI.getOperand(4).getImm();
    int64_t IsData = MI.getOperand(5).getImm();

    unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
                     (!IsData << 3) |    // IsDataCache bit
                     (Target << 1) |     // Cache level bits
                     (unsigned)IsStream; // Stream bit

    MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::aarch64_neon_uaddv:
  case Intrinsic::aarch64_neon_saddv:
  case Intrinsic::aarch64_neon_umaxv:
  case Intrinsic::aarch64_neon_smaxv:
  case Intrinsic::aarch64_neon_uminv:
  case Intrinsic::aarch64_neon_sminv: {
    MachineIRBuilder MIB(MI);
    MachineRegisterInfo &MRI = *MIB.getMRI();
    bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
                    IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
                    IntrinsicID == Intrinsic::aarch64_neon_sminv;

    auto OldDst = MI.getOperand(0).getReg();
    auto OldDstTy = MRI.getType(OldDst);
    LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
    if (OldDstTy == NewDstTy)
      return true;

    auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);

    Helper.Observer.changingInstr(MI);
    MI.getOperand(0).setReg(NewDst);
    Helper.Observer.changedInstr(MI);

    MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
    MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
                        OldDst, NewDst);

    return true;
  }
  case Intrinsic::aarch64_neon_uaddlp:
  case Intrinsic::aarch64_neon_saddlp: {
    MachineIRBuilder MIB(MI);

    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
                       ? AArch64::G_UADDLP
                       : AArch64::G_SADDLP;
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
    MI.eraseFromParent();

    return true;
  }
  case Intrinsic::aarch64_neon_uaddlv:
  case Intrinsic::aarch64_neon_saddlv: {
    MachineIRBuilder MIB(MI);
    MachineRegisterInfo &MRI = *MIB.getMRI();

    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
                       ? AArch64::G_UADDLV
                       : AArch64::G_SADDLV;
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(2).getReg();
    LLT DstTy = MRI.getType(DstReg);

    LLT MidTy, ExtTy;
    if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
      MidTy = LLT::fixed_vector(4, 32);
      ExtTy = LLT::scalar(32);
    } else {
      MidTy = LLT::fixed_vector(2, 64);
      ExtTy = LLT::scalar(64);
    }

    Register MidReg =
        MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
    Register ZeroReg =
        MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
    Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
                                     {MidReg, ZeroReg})
                          .getReg(0);

    if (DstTy.getScalarSizeInBits() < 32)
      MIB.buildTrunc(DstReg, ExtReg);
    else
      MIB.buildCopy(DstReg, ExtReg);

    MI.eraseFromParent();

    return true;
  }
  case Intrinsic::aarch64_neon_smax:
  case Intrinsic::aarch64_neon_smin:
  case Intrinsic::aarch64_neon_umax:
  case Intrinsic::aarch64_neon_umin:
  case Intrinsic::aarch64_neon_fmax:
  case Intrinsic::aarch64_neon_fmin:
  case Intrinsic::aarch64_neon_fmaxnm:
  case Intrinsic::aarch64_neon_fminnm: {
    MachineIRBuilder MIB(MI);
    if (IntrinsicID == Intrinsic::aarch64_neon_smax)
      MIB.buildSMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
    else if (IntrinsicID == Intrinsic::aarch64_neon_smin)
      MIB.buildSMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
    else if (IntrinsicID == Intrinsic::aarch64_neon_umax)
      MIB.buildUMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
    else if (IntrinsicID == Intrinsic::aarch64_neon_umin)
      MIB.buildUMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
    else if (IntrinsicID == Intrinsic::aarch64_neon_fmax)
      MIB.buildInstr(TargetOpcode::G_FMAXIMUM, {MI.getOperand(0)},
                     {MI.getOperand(2), MI.getOperand(3)});
    else if (IntrinsicID == Intrinsic::aarch64_neon_fmin)
      MIB.buildInstr(TargetOpcode::G_FMINIMUM, {MI.getOperand(0)},
                     {MI.getOperand(2), MI.getOperand(3)});
    else if (IntrinsicID == Intrinsic::aarch64_neon_fmaxnm)
      MIB.buildInstr(TargetOpcode::G_FMAXNUM, {MI.getOperand(0)},
                     {MI.getOperand(2), MI.getOperand(3)});
    else if (IntrinsicID == Intrinsic::aarch64_neon_fminnm)
      MIB.buildInstr(TargetOpcode::G_FMINNUM, {MI.getOperand(0)},
                     {MI.getOperand(2), MI.getOperand(3)});
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::vector_reverse:
    // TODO: Add support for vector_reverse
    return false;
  }

  return true;
}

bool AArch64LegalizerInfo::legalizeShlAshrLshr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
         MI.getOpcode() == TargetOpcode::G_LSHR ||
         MI.getOpcode() == TargetOpcode::G_SHL);
  // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
  // imported patterns can select it later. Either way, it will be legal.
  Register AmtReg = MI.getOperand(2).getReg();
  auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
  if (!VRegAndVal)
    return true;
  // Check the shift amount is in range for an immediate form.
  int64_t Amount = VRegAndVal->Value.getSExtValue();
  if (Amount > 31)
    return true; // This will have to remain a register variant.
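  // Illustrative example (simplified MIR, not taken from an actual test): a
  // shift such as
  //   %amt:_(s32) = G_CONSTANT i32 3
  //   %res:_(s32) = G_SHL %val, %amt(s32)
  // is rewritten below so that the amount operand refers to a fresh 64-bit
  //   %amt64:_(s64) = G_CONSTANT i64 3
  // which the immediate-form shift patterns expect.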
  auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(ExtCst.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
                                MachineRegisterInfo &MRI) {
  Base = Root;
  Offset = 0;

  Register NewBase;
  int64_t NewOffset;
  if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
      isShiftedInt<7, 3>(NewOffset)) {
    Base = NewBase;
    Offset = NewOffset;
  }
}

// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_STORE ||
         MI.getOpcode() == TargetOpcode::G_LOAD);
  // Here we just try to handle vector loads/stores where our value type might
  // have pointer elements, which the SelectionDAG importer can't handle. To
  // allow the existing patterns for s64 to fire for p0, we just try to bitcast
  // the value to use s64 types.

  // Custom legalization requires that the instruction, if not deleted, be
  // fully legalized. In order to allow further legalization of the inst, we
  // create a new instruction and erase the existing one.

  Register ValReg = MI.getOperand(0).getReg();
  const LLT ValTy = MRI.getType(ValReg);

  if (ValTy == LLT::scalar(128)) {
    AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
    bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
    bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
    bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
    bool IsRcpC3 =
        ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);

    LLT s64 = LLT::scalar(64);

    unsigned Opcode;
    if (IsRcpC3) {
      Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
    } else {
      // For LSE2, loads/stores should have been converted to monotonic and had
      // a fence inserted after them.
      assert(Ordering == AtomicOrdering::Monotonic ||
             Ordering == AtomicOrdering::Unordered);
      assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");

      Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
    }

    MachineInstrBuilder NewI;
    if (IsLoad) {
      NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
      MIRBuilder.buildMergeLikeInstr(
          ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
    } else {
      auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
      NewI = MIRBuilder.buildInstr(
          Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
    }

    if (IsRcpC3) {
      NewI.addUse(MI.getOperand(1).getReg());
    } else {
      Register Base;
      int Offset;
      matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
      NewI.addUse(Base);
      NewI.addImm(Offset / 8);
    }

    NewI.cloneMemRefs(MI);
    constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
                                     *MRI.getTargetRegisterInfo(),
                                     *ST->getRegBankInfo());
    MI.eraseFromParent();
    return true;
  }

  if (!ValTy.isPointerVector() ||
      ValTy.getElementType().getAddressSpace() != 0) {
    LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
    return false;
  }

  unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  auto &MMO = **MI.memoperands_begin();
  MMO.setType(NewTy);

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
    MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  } else {
    auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
    MIRBuilder.buildBitcast(ValReg, NewLoad);
  }
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  Align Alignment(MI.getOperand(2).getImm());
  Register Dst = MI.getOperand(0).getReg();
  Register ListPtr = MI.getOperand(1).getReg();

  LLT PtrTy = MRI.getType(ListPtr);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
  const Align PtrAlign = Align(PtrSize);
  auto List = MIRBuilder.buildLoad(
      PtrTy, ListPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               PtrTy, PtrAlign));

  MachineInstrBuilder DstPtr;
  if (Alignment > PtrAlign) {
    // Realign the list to the actual required alignment.
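    // For example, with a 16-byte alignment requirement this computes
    //   list = (list + 15) & ~15
    // by adding (Alignment - 1) and then masking off the Log2(Alignment)
    // low pointer bits.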
    auto AlignMinus1 =
        MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
    auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
    DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
  } else
    DstPtr = List;

  LLT ValTy = MRI.getType(Dst);
  uint64_t ValSize = ValTy.getSizeInBits() / 8;
  MIRBuilder.buildLoad(
      Dst, DstPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               ValTy, std::max(Alignment, PtrAlign)));

  auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));

  auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));

  MIRBuilder.buildStore(NewList, ListPtr,
                        *MF.getMachineMemOperand(MachinePointerInfo(),
                                                 MachineMemOperand::MOStore,
                                                 PtrTy, PtrAlign));

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeBitfieldExtract(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  // Only legal if we can select immediate forms.
  // TODO: Lower this otherwise.
  return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
         getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
}

bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // When there is no integer popcount instruction (FEAT_CSSC isn't available),
  // it can be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
  // registers are cheap.
  //  FMOV  D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT   V0.8B, V0.8B  // 8xbyte pop-counts
  //  ADDV  B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV  X0, V0.B[0]   // copy byte result back to integer reg
  //
  // For 128 bit vector popcounts, we lower to the following sequence:
  //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.4s v0, v0  //        v4s32, v2s64
  //  uaddlp.2d v0, v0  //               v2s64
  //
  // For 64 bit vector popcounts, we lower to the following sequence:
  //  cnt.8b    v0, v0  // v4s16, v2s32
  //  uaddlp.4h v0, v0  // v4s16, v2s32
  //  uaddlp.2s v0, v0  //        v2s32

  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Val);
  unsigned Size = Ty.getSizeInBits();

  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");

  if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
    LLT s64 = LLT::scalar(64);

    auto Split = MIRBuilder.buildUnmerge(s64, Val);
    auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
    auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
    auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);

    MIRBuilder.buildZExt(Dst, Add);
    MI.eraseFromParent();
    return true;
  }

  if (!ST->hasNEON() ||
      MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
    // Use generic lowering when custom lowering is not possible.
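    // That is: scalar s32/s64 G_CTPOP is handed to the target-independent
    // LegalizerHelper::lowerBitCount expansion; any other shape is reported
    // as not legalized here.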
    return Ty.isScalar() && (Size == 32 || Size == 64) &&
           Helper.lowerBitCount(MI) ==
               LegalizerHelper::LegalizeResult::Legalized;
  }

  // Pre-conditioning: widen Val up to the nearest vector type.
  // s32,s64,v4s16,v2s32 -> v8i8
  // v8s16,v4s32,v2s64 -> v16i8
  LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  if (Ty.isScalar()) {
    assert((Size == 32 || Size == 64 || Size == 128) &&
           "Expected only 32, 64, or 128 bit scalars!");
    if (Size == 32) {
      Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
    }
  }
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.

  if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
      Ty.getScalarSizeInBits() != 16) {
    LLT Dt = Ty == LLT::fixed_vector(2, 64) ? LLT::fixed_vector(4, 32) : Ty;
    auto Zeros = MIRBuilder.buildConstant(Dt, 0);
    auto Ones = MIRBuilder.buildConstant(VTy, 1);
    MachineInstrBuilder Sum;

    if (Ty == LLT::fixed_vector(2, 64)) {
      auto UDOT =
          MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
      Sum = MIRBuilder.buildInstr(AArch64::G_UADDLP, {Ty}, {UDOT});
    } else if (Ty == LLT::fixed_vector(4, 32)) {
      Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
    } else if (Ty == LLT::fixed_vector(2, 32)) {
      Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
    } else {
      llvm_unreachable("unexpected vector shape");
    }

    Sum->getOperand(0).setReg(Dst);
    MI.eraseFromParent();
    return true;
  }

  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
    HSum = UADD.getReg(0);
  }

  // Post-conditioning.
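  // UADDLV leaves the scalar sum in a 32-bit value, so s64/s128 popcounts are
  // zero-extended back to the width of Dst; in all other cases the last
  // horizontal add already has the right type and can define Dst directly.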
  if (Ty.isScalar() && (Size == 64 || Size == 128))
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up
    // with the rest of the MIR, so we must reassemble the extracted registers
    // into a 128-bit known-regclass one with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because
    // LDXP/STXP can take arbitrary registers, so it just has the normal GPR64
    // operands that the rest of AArch64 is expecting.
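    // The pseudo defines {DstLo, DstHi, Scratch} and takes
    // {Addr, DesiredLo, DesiredHi, NewLo, NewHi}; the buildInstr call below
    // mirrors that operand order.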
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
  }

  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
                                        LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
  auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
  MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
                                          LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;

  // The tagged variant (aarch64_mops_memset_tag) is legalized in
  // legalizeIntrinsic.
  if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
    // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
    // the instruction).
    auto &Value = MI.getOperand(1);
    Register ExtValueReg =
        MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ExtValueReg);
    return true;
  }

  return false;
}

bool AArch64LegalizerInfo::legalizeExtractVectorElt(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
  auto VRegAndVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (VRegAndVal)
    return true;
  return Helper.lowerExtractInsertVectorElt(MI) !=
         LegalizerHelper::LegalizeResult::UnableToLegalize;
}

bool AArch64LegalizerInfo::legalizeDynStackAlloc(
    MachineInstr &MI, LegalizerHelper &Helper) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  // If stack probing is not enabled for this function, use the default
  // lowering.
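  // Only "probe-stack"="inline-asm" (the value clang typically attaches for
  // -fstack-clash-protection) selects the probed lowering below; anything else
  // falls back to LegalizerHelper::lowerDynStackAlloc.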
  if (!MF.getFunction().hasFnAttribute("probe-stack") ||
      MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
          "inline-asm") {
    Helper.lowerDynStackAlloc(MI);
    return true;
  }

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
         "Unexpected type for dynamic alloca");
  assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
         "Unexpected type for dynamic alloca");

  LLT PtrTy = MRI.getType(Dst);
  Register SPReg =
      Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
  Register SPTmp =
      Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
  auto NewMI =
      MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
  MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
  MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
                                            LegalizerHelper &Helper) const {
  MachineIRBuilder &MIB = Helper.MIRBuilder;
  auto &AddrVal = MI.getOperand(0);

  int64_t IsWrite = MI.getOperand(1).getImm();
  int64_t Locality = MI.getOperand(2).getImm();
  int64_t IsData = MI.getOperand(3).getImm();

  bool IsStream = Locality == 0;
  if (Locality != 0) {
    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the inverse of the cache level to target: the
    // higher the requested locality, the closer (lower-numbered) the cache.
    // The encoding starts at 0 for L1, so flip the value.
    Locality = 3 - Locality;
  }

  unsigned PrfOp = (IsWrite << 4) |  // Load/Store bit
                   (!IsData << 3) |  // IsDataCache bit
                   (Locality << 1) | // Cache level bits
                   IsStream;         // Stream bit

  MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
  MI.eraseFromParent();
  return true;
}