1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64RegisterBankInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
20 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
21 #include "llvm/CodeGen/GlobalISel/Utils.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineRegisterInfo.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/IntrinsicsAArch64.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <initializer_list>
32
33 #define DEBUG_TYPE "aarch64-legalinfo"
34
35 using namespace llvm;
36 using namespace LegalizeActions;
37 using namespace LegalizeMutations;
38 using namespace LegalityPredicates;
39 using namespace MIPatternMatch;
40
41 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
42     : ST(&ST) {
43   using namespace TargetOpcode;
44   const LLT p0 = LLT::pointer(0, 64);
45   const LLT s8 = LLT::scalar(8);
46   const LLT s16 = LLT::scalar(16);
47   const LLT s32 = LLT::scalar(32);
48   const LLT s64 = LLT::scalar(64);
49   const LLT s128 = LLT::scalar(128);
50   const LLT v16s8 = LLT::fixed_vector(16, 8);
51   const LLT v8s8 = LLT::fixed_vector(8, 8);
52   const LLT v4s8 = LLT::fixed_vector(4, 8);
53   const LLT v8s16 = LLT::fixed_vector(8, 16);
54   const LLT v4s16 = LLT::fixed_vector(4, 16);
55   const LLT v2s16 = LLT::fixed_vector(2, 16);
56   const LLT v2s32 = LLT::fixed_vector(2, 32);
57   const LLT v4s32 = LLT::fixed_vector(4, 32);
58   const LLT v2s64 = LLT::fixed_vector(2, 64);
59   const LLT v2p0 = LLT::fixed_vector(2, p0);
60
61   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
62                                                         v16s8, v8s16, v4s32,
63                                                         v2s64, v2p0,
64                                                         /* End 128bit types */
65                                                         /* Begin 64bit types */
66                                                         v8s8, v4s16, v2s32};
67
68   const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
69
70   // FIXME: support subtargets which have neon/fp-armv8 disabled.
71   if (!ST.hasNEON() || !ST.hasFPARMv8()) {
72     getLegacyLegalizerInfo().computeTables();
73     return;
74   }
75
76   // Some instructions only support s16 if the subtarget has full 16-bit FP
77   // support.
78   const bool HasFP16 = ST.hasFullFP16();
79   const LLT &MinFPScalar = HasFP16 ? s16 : s32;
80
81   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
82       .legalFor({p0, s8, s16, s32, s64})
83       .legalFor(PackedVectorAllTypeList)
84       .widenScalarToNextPow2(0)
85       .clampScalar(0, s8, s64)
86       .fewerElementsIf(
87           [=](const LegalityQuery &Query) {
88             return Query.Types[0].isVector() &&
89                    (Query.Types[0].getElementType() != s64 ||
90                     Query.Types[0].getNumElements() != 2);
91           },
92           [=](const LegalityQuery &Query) {
93             LLT EltTy = Query.Types[0].getElementType();
94             if (EltTy == s64)
95               return std::make_pair(0, LLT::fixed_vector(2, 64));
96             return std::make_pair(0, EltTy);
97           });
98
99   getActionDefinitionsBuilder(G_PHI)
100       .legalFor({p0, s16, s32, s64})
101       .legalFor(PackedVectorAllTypeList)
102       .widenScalarToNextPow2(0)
103       .clampScalar(0, s16, s64)
104       // Maximum: sN * k = 128
105       .clampMaxNumElements(0, s8, 16)
106       .clampMaxNumElements(0, s16, 8)
107       .clampMaxNumElements(0, s32, 4)
108       .clampMaxNumElements(0, s64, 2)
109       .clampMaxNumElements(0, p0, 2);
110
111   getActionDefinitionsBuilder(G_BSWAP)
112       .legalFor({s32, s64, v4s32, v2s32, v2s64})
113       .widenScalarToNextPow2(0)
114       .clampScalar(0, s32, s64);
115
116   getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
117       .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
118       .scalarizeIf(
119           [=](const LegalityQuery &Query) {
120             return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
121           },
122           0)
123       .legalFor({v2s64})
124       .widenScalarToNextPow2(0)
125       .clampScalar(0, s32, s64)
126       .clampNumElements(0, v2s32, v4s32)
127       .clampNumElements(0, v2s64, v2s64)
128       .moreElementsToNextPow2(0);
129
130   getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
131       .customIf([=](const LegalityQuery &Query) {
132         const auto &SrcTy = Query.Types[0];
133         const auto &AmtTy = Query.Types[1];
134         return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
135                AmtTy.getSizeInBits() == 32;
136       })
137       .legalFor({
138           {s32, s32},
139           {s32, s64},
140           {s64, s64},
141           {v8s8, v8s8},
142           {v16s8, v16s8},
143           {v4s16, v4s16},
144           {v8s16, v8s16},
145           {v2s32, v2s32},
146           {v4s32, v4s32},
147           {v2s64, v2s64},
148       })
149       .widenScalarToNextPow2(0)
150       .clampScalar(1, s32, s64)
151       .clampScalar(0, s32, s64)
152       .clampNumElements(0, v2s32, v4s32)
153       .clampNumElements(0, v2s64, v2s64)
154       .moreElementsToNextPow2(0)
155       .minScalarSameAs(1, 0);
156
157   getActionDefinitionsBuilder(G_PTR_ADD)
158       .legalFor({{p0, s64}, {v2p0, v2s64}})
159       .clampScalar(1, s64, s64);
160
161   getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
162
163   getActionDefinitionsBuilder({G_SDIV, G_UDIV})
164       .legalFor({s32, s64})
165       .libcallFor({s128})
166       .clampScalar(0, s32, s64)
167       .widenScalarToNextPow2(0)
168       .scalarize(0);
169
170   getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
171       .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
172       .widenScalarOrEltToNextPow2(0)
173       .clampScalarOrElt(0, s32, s64)
174       .clampNumElements(0, v2s32, v4s32)
175       .clampNumElements(0, v2s64, v2s64)
176       .moreElementsToNextPow2(0);
177
178
179   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
180       .widenScalarToNextPow2(0, /*Min = */ 32)
181       .clampScalar(0, s32, s64)
182       .lower();
183
184   getActionDefinitionsBuilder({G_SMULH, G_UMULH})
185       .legalFor({s64, v8s16, v16s8, v4s32})
186       .lower();
187
188   getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
189       .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
190       .clampNumElements(0, v8s8, v16s8)
191       .clampNumElements(0, v4s16, v8s16)
192       .clampNumElements(0, v2s32, v4s32)
193       // FIXME: This shouldn't
be needed as v2s64 types are going to 194 // be expanded anyway, but G_ICMP doesn't support splitting vectors yet 195 .clampNumElements(0, v2s64, v2s64) 196 .lower(); 197 198 getActionDefinitionsBuilder( 199 {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO}) 200 .legalFor({{s32, s32}, {s64, s32}}) 201 .clampScalar(0, s32, s64) 202 .clampScalar(1, s32, s64) 203 .widenScalarToNextPow2(0); 204 205 getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG}) 206 .legalFor({MinFPScalar, s32, s64, v2s64, v4s32, v2s32}) 207 .clampScalar(0, MinFPScalar, s64) 208 .clampNumElements(0, v2s32, v4s32) 209 .clampNumElements(0, v2s64, v2s64); 210 211 getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64}); 212 213 getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT, 214 G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, 215 G_FNEARBYINT, G_INTRINSIC_LRINT}) 216 // If we don't have full FP16 support, then scalarize the elements of 217 // vectors containing fp16 types. 218 .fewerElementsIf( 219 [=, &ST](const LegalityQuery &Query) { 220 const auto &Ty = Query.Types[0]; 221 return Ty.isVector() && Ty.getElementType() == s16 && 222 !ST.hasFullFP16(); 223 }, 224 [=](const LegalityQuery &Query) { return std::make_pair(0, s16); }) 225 // If we don't have full FP16 support, then widen s16 to s32 if we 226 // encounter it. 227 .widenScalarIf( 228 [=, &ST](const LegalityQuery &Query) { 229 return Query.Types[0] == s16 && !ST.hasFullFP16(); 230 }, 231 [=](const LegalityQuery &Query) { return std::make_pair(0, s32); }) 232 .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16}); 233 234 getActionDefinitionsBuilder( 235 {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW}) 236 // We need a call for these, so we always need to scalarize. 237 .scalarize(0) 238 // Regardless of FP16 support, widen 16-bit elements to 32-bits. 239 .minScalar(0, s32) 240 .libcallFor({s32, s64, v2s32, v4s32, v2s64}); 241 242 getActionDefinitionsBuilder(G_INSERT) 243 .legalIf(all(typeInSet(0, {s32, s64, p0}), 244 typeInSet(1, {s8, s16, s32}), smallerThan(1, 0))) 245 .widenScalarToNextPow2(0) 246 .clampScalar(0, s32, s64) 247 .widenScalarToNextPow2(1) 248 .minScalar(1, s8) 249 .maxScalarIf(typeInSet(0, {s32}), 1, s16) 250 .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32); 251 252 getActionDefinitionsBuilder(G_EXTRACT) 253 .legalIf(all(typeInSet(0, {s16, s32, s64, p0}), 254 typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1))) 255 .widenScalarToNextPow2(1) 256 .clampScalar(1, s32, s128) 257 .widenScalarToNextPow2(0) 258 .minScalar(0, s16) 259 .maxScalarIf(typeInSet(1, {s32}), 0, s16) 260 .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32) 261 .maxScalarIf(typeInSet(1, {s128}), 0, s64); 262 263 264 for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) { 265 auto &Actions = getActionDefinitionsBuilder(Op); 266 267 if (Op == G_SEXTLOAD) 268 Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)); 269 270 // Atomics have zero extending behavior. 271 Actions 272 .legalForTypesWithMemDesc({{s32, p0, s8, 8}, 273 {s32, p0, s16, 8}, 274 {s32, p0, s32, 8}, 275 {s64, p0, s8, 2}, 276 {s64, p0, s16, 2}, 277 {s64, p0, s32, 4}, 278 {s64, p0, s64, 8}, 279 {p0, p0, s64, 8}, 280 {v2s32, p0, s64, 8}}) 281 .widenScalarToNextPow2(0) 282 .clampScalar(0, s32, s64) 283 // TODO: We could support sum-of-pow2's but the lowering code doesn't know 284 // how to do that yet. 
285 .unsupportedIfMemSizeNotPow2() 286 // Lower anything left over into G_*EXT and G_LOAD 287 .lower(); 288 } 289 290 auto IsPtrVecPred = [=](const LegalityQuery &Query) { 291 const LLT &ValTy = Query.Types[0]; 292 if (!ValTy.isVector()) 293 return false; 294 const LLT EltTy = ValTy.getElementType(); 295 return EltTy.isPointer() && EltTy.getAddressSpace() == 0; 296 }; 297 298 getActionDefinitionsBuilder(G_LOAD) 299 .customIf([=](const LegalityQuery &Query) { 300 return Query.Types[0] == s128 && 301 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; 302 }) 303 .legalForTypesWithMemDesc({{s8, p0, s8, 8}, 304 {s16, p0, s16, 8}, 305 {s32, p0, s32, 8}, 306 {s64, p0, s64, 8}, 307 {p0, p0, s64, 8}, 308 {s128, p0, s128, 8}, 309 {v8s8, p0, s64, 8}, 310 {v16s8, p0, s128, 8}, 311 {v4s16, p0, s64, 8}, 312 {v8s16, p0, s128, 8}, 313 {v2s32, p0, s64, 8}, 314 {v4s32, p0, s128, 8}, 315 {v2s64, p0, s128, 8}}) 316 // These extends are also legal 317 .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}}) 318 .widenScalarToNextPow2(0, /* MinSize = */8) 319 .lowerIfMemSizeNotByteSizePow2() 320 .clampScalar(0, s8, s64) 321 .narrowScalarIf([=](const LegalityQuery &Query) { 322 // Clamp extending load results to 32-bits. 323 return Query.Types[0].isScalar() && 324 Query.Types[0] != Query.MMODescrs[0].MemoryTy && 325 Query.Types[0].getSizeInBits() > 32; 326 }, 327 changeTo(0, s32)) 328 .clampMaxNumElements(0, s8, 16) 329 .clampMaxNumElements(0, s16, 8) 330 .clampMaxNumElements(0, s32, 4) 331 .clampMaxNumElements(0, s64, 2) 332 .clampMaxNumElements(0, p0, 2) 333 .customIf(IsPtrVecPred) 334 .scalarizeIf(typeIs(0, v2s16), 0); 335 336 getActionDefinitionsBuilder(G_STORE) 337 .customIf([=](const LegalityQuery &Query) { 338 return Query.Types[0] == s128 && 339 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; 340 }) 341 .legalForTypesWithMemDesc({{s8, p0, s8, 8}, 342 {s16, p0, s8, 8}, // truncstorei8 from s16 343 {s32, p0, s8, 8}, // truncstorei8 from s32 344 {s64, p0, s8, 8}, // truncstorei8 from s64 345 {s16, p0, s16, 8}, 346 {s32, p0, s16, 8}, // truncstorei16 from s32 347 {s64, p0, s16, 8}, // truncstorei16 from s64 348 {s32, p0, s8, 8}, 349 {s32, p0, s16, 8}, 350 {s32, p0, s32, 8}, 351 {s64, p0, s64, 8}, 352 {s64, p0, s32, 8}, // truncstorei32 from s64 353 {p0, p0, s64, 8}, 354 {s128, p0, s128, 8}, 355 {v16s8, p0, s128, 8}, 356 {v8s8, p0, s64, 8}, 357 {v4s16, p0, s64, 8}, 358 {v8s16, p0, s128, 8}, 359 {v2s32, p0, s64, 8}, 360 {v4s32, p0, s128, 8}, 361 {v2s64, p0, s128, 8}}) 362 .clampScalar(0, s8, s64) 363 .lowerIf([=](const LegalityQuery &Query) { 364 return Query.Types[0].isScalar() && 365 Query.Types[0] != Query.MMODescrs[0].MemoryTy; 366 }) 367 // Maximum: sN * k = 128 368 .clampMaxNumElements(0, s8, 16) 369 .clampMaxNumElements(0, s16, 8) 370 .clampMaxNumElements(0, s32, 4) 371 .clampMaxNumElements(0, s64, 2) 372 .clampMaxNumElements(0, p0, 2) 373 .lowerIfMemSizeNotPow2() 374 .customIf(IsPtrVecPred) 375 .scalarizeIf(typeIs(0, v2s16), 0); 376 377 // Constants 378 getActionDefinitionsBuilder(G_CONSTANT) 379 .legalFor({p0, s8, s16, s32, s64}) 380 .widenScalarToNextPow2(0) 381 .clampScalar(0, s8, s64); 382 getActionDefinitionsBuilder(G_FCONSTANT) 383 .legalIf([=](const LegalityQuery &Query) { 384 const auto &Ty = Query.Types[0]; 385 if (HasFP16 && Ty == s16) 386 return true; 387 return Ty == s32 || Ty == s64 || Ty == s128; 388 }) 389 .clampScalar(0, MinFPScalar, s128); 390 391 getActionDefinitionsBuilder({G_ICMP, G_FCMP}) 392 .legalFor({{s32, s32}, 393 {s32, s64}, 394 {s32, p0}, 395 {v4s32, 
v4s32}, 396 {v2s32, v2s32}, 397 {v2s64, v2s64}, 398 {v2s64, v2p0}, 399 {v4s16, v4s16}, 400 {v8s16, v8s16}, 401 {v8s8, v8s8}, 402 {v16s8, v16s8}}) 403 .widenScalarOrEltToNextPow2(1) 404 .clampScalar(1, s32, s64) 405 .clampScalar(0, s32, s32) 406 .minScalarEltSameAsIf( 407 [=](const LegalityQuery &Query) { 408 const LLT &Ty = Query.Types[0]; 409 const LLT &SrcTy = Query.Types[1]; 410 return Ty.isVector() && !SrcTy.getElementType().isPointer() && 411 Ty.getElementType() != SrcTy.getElementType(); 412 }, 413 0, 1) 414 .minScalarOrEltIf( 415 [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; }, 416 1, s32) 417 .minScalarOrEltIf( 418 [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0, 419 s64) 420 .clampNumElements(0, v2s32, v4s32); 421 422 // Extensions 423 auto ExtLegalFunc = [=](const LegalityQuery &Query) { 424 unsigned DstSize = Query.Types[0].getSizeInBits(); 425 426 if (DstSize == 128 && !Query.Types[0].isVector()) 427 return false; // Extending to a scalar s128 needs narrowing. 428 429 // Make sure that we have something that will fit in a register, and 430 // make sure it's a power of 2. 431 if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) 432 return false; 433 434 const LLT &SrcTy = Query.Types[1]; 435 436 // Make sure we fit in a register otherwise. Don't bother checking that 437 // the source type is below 128 bits. We shouldn't be allowing anything 438 // through which is wider than the destination in the first place. 439 unsigned SrcSize = SrcTy.getSizeInBits(); 440 if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) 441 return false; 442 443 return true; 444 }; 445 getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) 446 .legalIf(ExtLegalFunc) 447 .clampScalar(0, s64, s64); // Just for s128, others are handled above. 
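// Illustrative example (not exhaustive): a G_ZEXT from s32 to s64 passes
// ExtLegalFunc and is left alone, while an extension to a scalar s128 fails the
// DstSize == 128 check above, so clampScalar narrows the destination to s64 and
// the full-width result is roughly rebuilt from 64-bit pieces afterwards.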
448 449 getActionDefinitionsBuilder(G_TRUNC) 450 .minScalarOrEltIf( 451 [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); }, 452 0, s8) 453 .customIf([=](const LegalityQuery &Query) { 454 LLT DstTy = Query.Types[0]; 455 LLT SrcTy = Query.Types[1]; 456 return DstTy == v8s8 && SrcTy.getSizeInBits() > 128; 457 }) 458 .alwaysLegal(); 459 460 getActionDefinitionsBuilder(G_SEXT_INREG).legalFor({s32, s64}).lower(); 461 462 // FP conversions 463 getActionDefinitionsBuilder(G_FPTRUNC) 464 .legalFor( 465 {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}) 466 .clampMaxNumElements(0, s32, 2); 467 getActionDefinitionsBuilder(G_FPEXT) 468 .legalFor( 469 {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}) 470 .clampMaxNumElements(0, s64, 2); 471 472 // Conversions 473 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 474 .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) 475 .widenScalarToNextPow2(0) 476 .clampScalar(0, s32, s64) 477 .widenScalarToNextPow2(1) 478 .clampScalar(1, s32, s64); 479 480 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 481 .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) 482 .clampScalar(1, s32, s64) 483 .minScalarSameAs(1, 0) 484 .clampScalar(0, s32, s64) 485 .widenScalarToNextPow2(0); 486 487 // Control-flow 488 getActionDefinitionsBuilder(G_BRCOND) 489 .legalFor({s32}) 490 .clampScalar(0, s32, s32); 491 getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0}); 492 493 getActionDefinitionsBuilder(G_SELECT) 494 .legalFor({{s32, s32}, {s64, s32}, {p0, s32}}) 495 .widenScalarToNextPow2(0) 496 .clampScalar(0, s32, s64) 497 .clampScalar(1, s32, s32) 498 .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0) 499 .lowerIf(isVector(0)); 500 501 // Pointer-handling 502 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); 503 504 if (TM.getCodeModel() == CodeModel::Small) 505 getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom(); 506 else 507 getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); 508 509 getActionDefinitionsBuilder(G_PTRTOINT) 510 .legalForCartesianProduct({s8, s16, s32, s64}, {p0}) 511 .legalFor({{v2s64, v2p0}}) 512 .maxScalar(0, s64) 513 .widenScalarToNextPow2(0, /*Min*/ 8); 514 515 getActionDefinitionsBuilder(G_INTTOPTR) 516 .unsupportedIf([&](const LegalityQuery &Query) { 517 return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits(); 518 }) 519 .legalFor({{p0, s64}, {v2p0, v2s64}}); 520 521 // Casts for 32 and 64-bit width type are just copies. 522 // Same for 128-bit width type, except they are on the FPR bank. 523 getActionDefinitionsBuilder(G_BITCAST) 524 // FIXME: This is wrong since G_BITCAST is not allowed to change the 525 // number of bits but it's what the previous code described and fixing 526 // it breaks tests. 527 .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, 528 v8s16, v4s16, v2s16, v4s32, v2s32, v2s64, 529 v2p0}); 530 531 getActionDefinitionsBuilder(G_VASTART).legalFor({p0}); 532 533 // va_list must be a pointer, but most sized types are pretty easy to handle 534 // as the destination. 
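// G_VAARG is marked custom and handled in legalizeVaArg() below: roughly, it
// loads the current list pointer, re-aligns it when the type needs more than
// pointer alignment, loads the value, and stores back the pointer bumped by the
// slot size (e.g. an s32 vaarg still advances the list by 8 bytes on LP64).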
535 getActionDefinitionsBuilder(G_VAARG) 536 .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0}) 537 .clampScalar(0, s8, s64) 538 .widenScalarToNextPow2(0, /*Min*/ 8); 539 540 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 541 .lowerIf( 542 all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0))); 543 544 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 545 .customIf([](const LegalityQuery &Query) { 546 return Query.Types[0].getSizeInBits() == 128; 547 }) 548 .clampScalar(0, s32, s64) 549 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))); 550 551 getActionDefinitionsBuilder( 552 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, 553 G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, 554 G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) 555 .clampScalar(0, s32, s64) 556 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))); 557 558 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0}); 559 560 // Merge/Unmerge 561 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 562 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 563 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 564 getActionDefinitionsBuilder(Op) 565 .widenScalarToNextPow2(LitTyIdx, 8) 566 .widenScalarToNextPow2(BigTyIdx, 32) 567 .clampScalar(LitTyIdx, s8, s64) 568 .clampScalar(BigTyIdx, s32, s128) 569 .legalIf([=](const LegalityQuery &Q) { 570 switch (Q.Types[BigTyIdx].getSizeInBits()) { 571 case 32: 572 case 64: 573 case 128: 574 break; 575 default: 576 return false; 577 } 578 switch (Q.Types[LitTyIdx].getSizeInBits()) { 579 case 8: 580 case 16: 581 case 32: 582 case 64: 583 return true; 584 default: 585 return false; 586 } 587 }); 588 } 589 590 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 591 .unsupportedIf([=](const LegalityQuery &Query) { 592 const LLT &EltTy = Query.Types[1].getElementType(); 593 return Query.Types[0] != EltTy; 594 }) 595 .minScalar(2, s64) 596 .legalIf([=](const LegalityQuery &Query) { 597 const LLT &VecTy = Query.Types[1]; 598 return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 || 599 VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 || 600 VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s32 || 601 VecTy == v2p0; 602 }) 603 .minScalarOrEltIf( 604 [=](const LegalityQuery &Query) { 605 // We want to promote to <M x s1> to <M x s64> if that wouldn't 606 // cause the total vec size to be > 128b. 607 return Query.Types[1].getNumElements() <= 2; 608 }, 609 0, s64) 610 .minScalarOrEltIf( 611 [=](const LegalityQuery &Query) { 612 return Query.Types[1].getNumElements() <= 4; 613 }, 614 0, s32) 615 .minScalarOrEltIf( 616 [=](const LegalityQuery &Query) { 617 return Query.Types[1].getNumElements() <= 8; 618 }, 619 0, s16) 620 .minScalarOrEltIf( 621 [=](const LegalityQuery &Query) { 622 return Query.Types[1].getNumElements() <= 16; 623 }, 624 0, s8) 625 .minScalarOrElt(0, s8) // Worst case, we need at least s8. 
626 .clampMaxNumElements(1, s64, 2) 627 .clampMaxNumElements(1, s32, 4) 628 .clampMaxNumElements(1, s16, 8) 629 .clampMaxNumElements(1, p0, 2); 630 631 getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) 632 .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64})); 633 634 getActionDefinitionsBuilder(G_BUILD_VECTOR) 635 .legalFor({{v8s8, s8}, 636 {v16s8, s8}, 637 {v2s16, s16}, 638 {v4s16, s16}, 639 {v8s16, s16}, 640 {v2s32, s32}, 641 {v4s32, s32}, 642 {v2p0, p0}, 643 {v2s64, s64}}) 644 .clampNumElements(0, v4s32, v4s32) 645 .clampNumElements(0, v2s64, v2s64) 646 .minScalarOrElt(0, s8) 647 .minScalarSameAs(1, 0); 648 649 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower(); 650 651 getActionDefinitionsBuilder(G_CTLZ) 652 .legalForCartesianProduct( 653 {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) 654 .scalarize(1); 655 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower(); 656 657 // TODO: Custom lowering for v2s32, v4s32, v2s64. 658 getActionDefinitionsBuilder(G_BITREVERSE) 659 .legalFor({s32, s64, v8s8, v16s8}) 660 .widenScalarToNextPow2(0, /*Min = */ 32) 661 .clampScalar(0, s32, s64); 662 663 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower(); 664 665 getActionDefinitionsBuilder(G_CTTZ) 666 .lowerIf(isVector(0)) 667 .clampScalar(0, s32, s64) 668 .scalarSameSizeAs(1, 0) 669 .customFor({s32, s64}); 670 671 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 672 .legalIf([=](const LegalityQuery &Query) { 673 const LLT &DstTy = Query.Types[0]; 674 const LLT &SrcTy = Query.Types[1]; 675 // For now just support the TBL2 variant which needs the source vectors 676 // to be the same size as the dest. 677 if (DstTy != SrcTy) 678 return false; 679 for (auto &Ty : {v2s32, v4s32, v2s64, v2p0, v16s8, v8s16}) { 680 if (DstTy == Ty) 681 return true; 682 } 683 return false; 684 }) 685 // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we 686 // just want those lowered into G_BUILD_VECTOR 687 .lowerIf([=](const LegalityQuery &Query) { 688 return !Query.Types[1].isVector(); 689 }) 690 .moreElementsToNextPow2(0) 691 .clampNumElements(0, v4s32, v4s32) 692 .clampNumElements(0, v2s64, v2s64); 693 694 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 695 .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}}); 696 697 getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}}); 698 699 getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) { 700 return Query.Types[0] == p0 && Query.Types[1] == s64; 701 }); 702 703 getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); 704 705 if (ST.hasMOPS()) { 706 // G_BZERO is not supported. Currently it is only emitted by 707 // PreLegalizerCombiner for G_MEMSET with zero constant. 708 getActionDefinitionsBuilder(G_BZERO).unsupported(); 709 710 getActionDefinitionsBuilder(G_MEMSET) 711 .legalForCartesianProduct({p0}, {s64}, {s64}) 712 .customForCartesianProduct({p0}, {s8}, {s64}) 713 .immIdx(0); // Inform verifier imm idx 0 is handled. 714 715 getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE}) 716 .legalForCartesianProduct({p0}, {p0}, {s64}) 717 .immIdx(0); // Inform verifier imm idx 0 is handled. 718 719 // G_MEMCPY_INLINE does not have a tailcall immediate 720 getActionDefinitionsBuilder(G_MEMCPY_INLINE) 721 .legalForCartesianProduct({p0}, {p0}, {s64}); 722 723 } else { 724 getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET}) 725 .libcall(); 726 } 727 728 // FIXME: Legal types are only legal with NEON. 
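// Scalar G_ABS is lowered by the generic LegalizerHelper; one common expansion
// (sketch only, not necessarily the exact sequence emitted) is:
//   %sgn:_(s64) = G_ASHR %x, 63
//   %add:_(s64) = G_ADD %x, %sgn
//   %abs:_(s64) = G_XOR %add, %sgn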
729 getActionDefinitionsBuilder(G_ABS) 730 .lowerIf(isScalar(0)) 731 .legalFor(PackedVectorAllTypeList); 732 733 getActionDefinitionsBuilder(G_VECREDUCE_FADD) 734 // We only have FADDP to do reduction-like operations. Lower the rest. 735 .legalFor({{s32, v2s32}, {s64, v2s64}}) 736 .clampMaxNumElements(1, s64, 2) 737 .clampMaxNumElements(1, s32, 2) 738 .lower(); 739 740 getActionDefinitionsBuilder(G_VECREDUCE_ADD) 741 .legalFor( 742 {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}}) 743 .clampMaxNumElements(1, s64, 2) 744 .clampMaxNumElements(1, s32, 4) 745 .lower(); 746 747 getActionDefinitionsBuilder( 748 {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR}) 749 // Try to break down into smaller vectors as long as they're at least 64 750 // bits. This lets us use vector operations for some parts of the 751 // reduction. 752 .fewerElementsIf( 753 [=](const LegalityQuery &Q) { 754 LLT SrcTy = Q.Types[1]; 755 if (SrcTy.isScalar()) 756 return false; 757 if (!isPowerOf2_32(SrcTy.getNumElements())) 758 return false; 759 // We can usually perform 64b vector operations. 760 return SrcTy.getSizeInBits() > 64; 761 }, 762 [=](const LegalityQuery &Q) { 763 LLT SrcTy = Q.Types[1]; 764 return std::make_pair(1, SrcTy.divide(2)); 765 }) 766 .scalarize(1) 767 .lower(); 768 769 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 770 .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); }); 771 772 getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower(); 773 774 getActionDefinitionsBuilder(G_ROTR) 775 .legalFor({{s32, s64}, {s64, s64}}) 776 .customIf([=](const LegalityQuery &Q) { 777 return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64; 778 }) 779 .lower(); 780 getActionDefinitionsBuilder(G_ROTL).lower(); 781 782 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 783 .customFor({{s32, s32}, {s64, s64}}); 784 785 // TODO: Use generic lowering when custom lowering is not possible. 786 auto always = [=](const LegalityQuery &Q) { return true; }; 787 getActionDefinitionsBuilder(G_CTPOP) 788 .legalFor({{v8s8, v8s8}, {v16s8, v16s8}}) 789 .clampScalar(0, s32, s128) 790 .widenScalarToNextPow2(0) 791 .minScalarEltSameAsIf(always, 1, 0) 792 .maxScalarEltSameAsIf(always, 1, 0) 793 .customFor({{s32, s32}, 794 {s64, s64}, 795 {s128, s128}, 796 {v2s64, v2s64}, 797 {v2s32, v2s32}, 798 {v4s32, v4s32}, 799 {v4s16, v4s16}, 800 {v8s16, v8s16}}); 801 802 // TODO: Vector types. 803 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0)); 804 805 // TODO: Vector types. 806 getActionDefinitionsBuilder({G_FMAXNUM, G_FMINNUM}) 807 .legalFor({MinFPScalar, s32, s64}) 808 .libcallFor({s128}) 809 .minScalar(0, MinFPScalar); 810 811 // TODO: Vector types. 812 getActionDefinitionsBuilder({G_FMAXIMUM, G_FMINIMUM}) 813 .legalFor({MinFPScalar, s32, s64}) 814 .minScalar(0, MinFPScalar); 815 816 // TODO: Libcall support for s128. 817 // TODO: s16 should be legal with full FP16 support. 818 getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) 819 .legalFor({{s64, s32}, {s64, s64}}); 820 821 getLegacyLegalizerInfo().computeTables(); 822 verify(*ST.getInstrInfo()); 823 } 824 825 bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 826 MachineInstr &MI) const { 827 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 828 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 829 GISelChangeObserver &Observer = Helper.Observer; 830 switch (MI.getOpcode()) { 831 default: 832 // No idea what to do. 
833     return false;
834   case TargetOpcode::G_VAARG:
835     return legalizeVaArg(MI, MRI, MIRBuilder);
836   case TargetOpcode::G_LOAD:
837   case TargetOpcode::G_STORE:
838     return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
839   case TargetOpcode::G_SHL:
840   case TargetOpcode::G_ASHR:
841   case TargetOpcode::G_LSHR:
842     return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
843   case TargetOpcode::G_GLOBAL_VALUE:
844     return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
845   case TargetOpcode::G_TRUNC:
846     return legalizeVectorTrunc(MI, Helper);
847   case TargetOpcode::G_SBFX:
848   case TargetOpcode::G_UBFX:
849     return legalizeBitfieldExtract(MI, MRI, Helper);
850   case TargetOpcode::G_ROTR:
851     return legalizeRotate(MI, MRI, Helper);
852   case TargetOpcode::G_CTPOP:
853     return legalizeCTPOP(MI, MRI, Helper);
854   case TargetOpcode::G_ATOMIC_CMPXCHG:
855     return legalizeAtomicCmpxchg128(MI, MRI, Helper);
856   case TargetOpcode::G_CTTZ:
857     return legalizeCTTZ(MI, Helper);
858   case TargetOpcode::G_BZERO:
859   case TargetOpcode::G_MEMCPY:
860   case TargetOpcode::G_MEMMOVE:
861   case TargetOpcode::G_MEMSET:
862     return legalizeMemOps(MI, Helper);
863   }
864
865   llvm_unreachable("expected switch to return");
866 }
867
868 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
869                                           MachineRegisterInfo &MRI,
870                                           LegalizerHelper &Helper) const {
871   // To allow for imported patterns to match, we ensure that the rotate amount
872   // is 64b with an extension.
873   Register AmtReg = MI.getOperand(2).getReg();
874   LLT AmtTy = MRI.getType(AmtReg);
875   (void)AmtTy;
876   assert(AmtTy.isScalar() && "Expected a scalar rotate");
877   assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
878   auto NewAmt = Helper.MIRBuilder.buildSExt(LLT::scalar(64), AmtReg);
879   Helper.Observer.changingInstr(MI);
880   MI.getOperand(2).setReg(NewAmt.getReg(0));
881   Helper.Observer.changedInstr(MI);
882   return true;
883 }
884
885 static void extractParts(Register Reg, MachineRegisterInfo &MRI,
886                          MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
887                          SmallVectorImpl<Register> &VRegs) {
888   for (int I = 0; I < NumParts; ++I)
889     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
890   MIRBuilder.buildUnmerge(VRegs, Reg);
891 }
892
893 bool AArch64LegalizerInfo::legalizeVectorTrunc(
894     MachineInstr &MI, LegalizerHelper &Helper) const {
895   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
896   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
897   // Similar to how operand splitting is done in SelectionDAG, we can handle
898   // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
899   //   %inlo(<4 x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
900   //   %lo16(<4 x s16>) = G_TRUNC %inlo
901   //   %hi16(<4 x s16>) = G_TRUNC %inhi
902   //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
903   //   %res(<8 x s8>) = G_TRUNC %in16
904
905   Register DstReg = MI.getOperand(0).getReg();
906   Register SrcReg = MI.getOperand(1).getReg();
907   LLT DstTy = MRI.getType(DstReg);
908   LLT SrcTy = MRI.getType(SrcReg);
909   assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
910          isPowerOf2_32(SrcTy.getSizeInBits()));
911
912   // Split input type.
913   LLT SplitSrcTy =
914       SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
915   // First, split the source into two smaller vectors.
916   SmallVector<Register, 2> SplitSrcs;
917   extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);
918
919   // Truncate the splits into intermediate narrower elements.
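  // (For the v8s8 <- v8s32 example above, SplitSrcTy is <4 x s32> and InterTy is
  // <4 x s16>: each half is truncated to <4 x s16>, the halves are concatenated
  // into <8 x s16>, and the final trunc to <8 x s8> is left for the caller.)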
920   LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
921   for (unsigned I = 0; I < SplitSrcs.size(); ++I)
922     SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
923
924   auto Concat = MIRBuilder.buildConcatVectors(
925       DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);
926
927   Helper.Observer.changingInstr(MI);
928   MI.getOperand(1).setReg(Concat.getReg(0));
929   Helper.Observer.changedInstr(MI);
930   return true;
931 }
932
933 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
934     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
935     GISelChangeObserver &Observer) const {
936   assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
937   // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
938   // G_ADD_LOW instructions.
939   // By splitting this here, we can optimize accesses in the small code model by
940   // folding in the G_ADD_LOW into the load/store offset.
941   auto &GlobalOp = MI.getOperand(1);
942   const auto* GV = GlobalOp.getGlobal();
943   if (GV->isThreadLocal())
944     return true; // Don't want to modify TLS vars.
945
946   auto &TM = ST->getTargetLowering()->getTargetMachine();
947   unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
948
949   if (OpFlags & AArch64II::MO_GOT)
950     return true;
951
952   auto Offset = GlobalOp.getOffset();
953   Register DstReg = MI.getOperand(0).getReg();
954   auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
955                   .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
956   // Set the regclass on the dest reg too.
957   MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
958
959   // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
960   // by creating a MOVK that sets bits 48-63 of the register to (global address
961   // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
962   // prevent an incorrect tag being generated during relocation when the
963   // global appears before the code section. Without the offset, a global at
964   // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
965   // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
966   // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
967   // instead of `0xf`.
968   // This assumes that we're in the small code model so we can assume a binary
969   // size of <= 4GB, which makes the untagged PC relative offset positive. The
970   // binary must also be loaded into address range [0, 2^48). Both of these
971   // properties need to be ensured at runtime when using tagged addresses.
972   if (OpFlags & AArch64II::MO_TAGGED) {
973     assert(!Offset &&
974            "Should not have folded in an offset for a tagged global!");
975     ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
976                .addGlobalAddress(GV, 0x100000000,
977                                  AArch64II::MO_PREL | AArch64II::MO_G3)
978                .addImm(48);
979     MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
980   }
981
982   MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
983       .addGlobalAddress(GV, Offset,
984                         OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
985   MI.eraseFromParent();
986   return true;
987 }
988
989 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
990                                              MachineInstr &MI) const {
991   switch (MI.getIntrinsicID()) {
992   case Intrinsic::vacopy: {
993     unsigned PtrSize = ST->isTargetILP32() ?
4 : 8; 994 unsigned VaListSize = 995 (ST->isTargetDarwin() || ST->isTargetWindows()) 996 ? PtrSize 997 : ST->isTargetILP32() ? 20 : 32; 998 999 MachineFunction &MF = *MI.getMF(); 1000 auto Val = MF.getRegInfo().createGenericVirtualRegister( 1001 LLT::scalar(VaListSize * 8)); 1002 MachineIRBuilder MIB(MI); 1003 MIB.buildLoad(Val, MI.getOperand(2), 1004 *MF.getMachineMemOperand(MachinePointerInfo(), 1005 MachineMemOperand::MOLoad, 1006 VaListSize, Align(PtrSize))); 1007 MIB.buildStore(Val, MI.getOperand(1), 1008 *MF.getMachineMemOperand(MachinePointerInfo(), 1009 MachineMemOperand::MOStore, 1010 VaListSize, Align(PtrSize))); 1011 MI.eraseFromParent(); 1012 return true; 1013 } 1014 case Intrinsic::get_dynamic_area_offset: { 1015 MachineIRBuilder &MIB = Helper.MIRBuilder; 1016 MIB.buildConstant(MI.getOperand(0).getReg(), 0); 1017 MI.eraseFromParent(); 1018 return true; 1019 } 1020 case Intrinsic::aarch64_mops_memset_tag: { 1021 assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 1022 // Zext the value to 64 bit 1023 MachineIRBuilder MIB(MI); 1024 auto &Value = MI.getOperand(3); 1025 Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0); 1026 Value.setReg(ZExtValueReg); 1027 return true; 1028 } 1029 } 1030 1031 return true; 1032 } 1033 1034 bool AArch64LegalizerInfo::legalizeShlAshrLshr( 1035 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, 1036 GISelChangeObserver &Observer) const { 1037 assert(MI.getOpcode() == TargetOpcode::G_ASHR || 1038 MI.getOpcode() == TargetOpcode::G_LSHR || 1039 MI.getOpcode() == TargetOpcode::G_SHL); 1040 // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the 1041 // imported patterns can select it later. Either way, it will be legal. 1042 Register AmtReg = MI.getOperand(2).getReg(); 1043 auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI); 1044 if (!VRegAndVal) 1045 return true; 1046 // Check the shift amount is in range for an immediate form. 1047 int64_t Amount = VRegAndVal->Value.getSExtValue(); 1048 if (Amount > 31) 1049 return true; // This will have to remain a register variant. 1050 auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount); 1051 Observer.changingInstr(MI); 1052 MI.getOperand(2).setReg(ExtCst.getReg(0)); 1053 Observer.changedInstr(MI); 1054 return true; 1055 } 1056 1057 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset, 1058 MachineRegisterInfo &MRI) { 1059 Base = Root; 1060 Offset = 0; 1061 1062 Register NewBase; 1063 int64_t NewOffset; 1064 if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) && 1065 isShiftedInt<7, 3>(NewOffset)) { 1066 Base = NewBase; 1067 Offset = NewOffset; 1068 } 1069 } 1070 1071 // FIXME: This should be removed and replaced with the generic bitcast legalize 1072 // action. 1073 bool AArch64LegalizerInfo::legalizeLoadStore( 1074 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, 1075 GISelChangeObserver &Observer) const { 1076 assert(MI.getOpcode() == TargetOpcode::G_STORE || 1077 MI.getOpcode() == TargetOpcode::G_LOAD); 1078 // Here we just try to handle vector loads/stores where our value type might 1079 // have pointer elements, which the SelectionDAG importer can't handle. To 1080 // allow the existing patterns for s64 to fire for p0, we just try to bitcast 1081 // the value to use s64 types. 1082 1083 // Custom legalization requires the instruction, if not deleted, must be fully 1084 // legalized. 
In order to allow further legalization of the inst, we create 1085 // a new instruction and erase the existing one. 1086 1087 Register ValReg = MI.getOperand(0).getReg(); 1088 const LLT ValTy = MRI.getType(ValReg); 1089 1090 if (ValTy == LLT::scalar(128)) { 1091 assert((*MI.memoperands_begin())->getSuccessOrdering() == 1092 AtomicOrdering::Monotonic || 1093 (*MI.memoperands_begin())->getSuccessOrdering() == 1094 AtomicOrdering::Unordered); 1095 assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2"); 1096 LLT s64 = LLT::scalar(64); 1097 MachineInstrBuilder NewI; 1098 if (MI.getOpcode() == TargetOpcode::G_LOAD) { 1099 NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {}); 1100 MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)}); 1101 } else { 1102 auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0)); 1103 NewI = MIRBuilder.buildInstr( 1104 AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)}); 1105 } 1106 Register Base; 1107 int Offset; 1108 matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI); 1109 NewI.addUse(Base); 1110 NewI.addImm(Offset / 8); 1111 1112 NewI.cloneMemRefs(MI); 1113 constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(), 1114 *MRI.getTargetRegisterInfo(), 1115 *ST->getRegBankInfo()); 1116 MI.eraseFromParent(); 1117 return true; 1118 } 1119 1120 if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || 1121 ValTy.getElementType().getAddressSpace() != 0) { 1122 LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store"); 1123 return false; 1124 } 1125 1126 unsigned PtrSize = ValTy.getElementType().getSizeInBits(); 1127 const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize); 1128 auto &MMO = **MI.memoperands_begin(); 1129 MMO.setType(NewTy); 1130 1131 if (MI.getOpcode() == TargetOpcode::G_STORE) { 1132 auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg); 1133 MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO); 1134 } else { 1135 auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO); 1136 MIRBuilder.buildBitcast(ValReg, NewLoad); 1137 } 1138 MI.eraseFromParent(); 1139 return true; 1140 } 1141 1142 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, 1143 MachineRegisterInfo &MRI, 1144 MachineIRBuilder &MIRBuilder) const { 1145 MachineFunction &MF = MIRBuilder.getMF(); 1146 Align Alignment(MI.getOperand(2).getImm()); 1147 Register Dst = MI.getOperand(0).getReg(); 1148 Register ListPtr = MI.getOperand(1).getReg(); 1149 1150 LLT PtrTy = MRI.getType(ListPtr); 1151 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); 1152 1153 const unsigned PtrSize = PtrTy.getSizeInBits() / 8; 1154 const Align PtrAlign = Align(PtrSize); 1155 auto List = MIRBuilder.buildLoad( 1156 PtrTy, ListPtr, 1157 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, 1158 PtrTy, PtrAlign)); 1159 1160 MachineInstrBuilder DstPtr; 1161 if (Alignment > PtrAlign) { 1162 // Realign the list to the actual required alignment. 
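    // e.g. for a 16-byte aligned type this computes DstPtr = (List + 15) & ~15:
    // a G_PTR_ADD of Alignment - 1 followed by clearing the low Log2(Alignment)
    // bits of the pointer.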
1163 auto AlignMinus1 = 1164 MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1); 1165 auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0)); 1166 DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment)); 1167 } else 1168 DstPtr = List; 1169 1170 LLT ValTy = MRI.getType(Dst); 1171 uint64_t ValSize = ValTy.getSizeInBits() / 8; 1172 MIRBuilder.buildLoad( 1173 Dst, DstPtr, 1174 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, 1175 ValTy, std::max(Alignment, PtrAlign))); 1176 1177 auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign)); 1178 1179 auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0)); 1180 1181 MIRBuilder.buildStore(NewList, ListPtr, 1182 *MF.getMachineMemOperand(MachinePointerInfo(), 1183 MachineMemOperand::MOStore, 1184 PtrTy, PtrAlign)); 1185 1186 MI.eraseFromParent(); 1187 return true; 1188 } 1189 1190 bool AArch64LegalizerInfo::legalizeBitfieldExtract( 1191 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const { 1192 // Only legal if we can select immediate forms. 1193 // TODO: Lower this otherwise. 1194 return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) && 1195 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 1196 } 1197 1198 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI, 1199 MachineRegisterInfo &MRI, 1200 LegalizerHelper &Helper) const { 1201 // While there is no integer popcount instruction, it can 1202 // be more efficiently lowered to the following sequence that uses 1203 // AdvSIMD registers/instructions as long as the copies to/from 1204 // the AdvSIMD registers are cheap. 1205 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 1206 // CNT V0.8B, V0.8B // 8xbyte pop-counts 1207 // ADDV B0, V0.8B // sum 8xbyte pop-counts 1208 // UMOV X0, V0.B[0] // copy byte result back to integer reg 1209 // 1210 // For 128 bit vector popcounts, we lower to the following sequence: 1211 // cnt.16b v0, v0 // v8s16, v4s32, v2s64 1212 // uaddlp.8h v0, v0 // v8s16, v4s32, v2s64 1213 // uaddlp.4s v0, v0 // v4s32, v2s64 1214 // uaddlp.2d v0, v0 // v2s64 1215 // 1216 // For 64 bit vector popcounts, we lower to the following sequence: 1217 // cnt.8b v0, v0 // v4s16, v2s32 1218 // uaddlp.4h v0, v0 // v4s16, v2s32 1219 // uaddlp.2s v0, v0 // v2s32 1220 1221 if (!ST->hasNEON() || 1222 MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) 1223 return false; 1224 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1225 Register Dst = MI.getOperand(0).getReg(); 1226 Register Val = MI.getOperand(1).getReg(); 1227 LLT Ty = MRI.getType(Val); 1228 1229 assert(Ty == MRI.getType(Dst) && 1230 "Expected src and dst to have the same type!"); 1231 unsigned Size = Ty.getSizeInBits(); 1232 1233 // Pre-conditioning: widen Val up to the nearest vector type. 1234 // s32,s64,v4s16,v2s32 -> v8i8 1235 // v8s16,v4s32,v2s64 -> v16i8 1236 LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8); 1237 if (Ty.isScalar()) { 1238 assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!"); 1239 if (Size == 32) { 1240 Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0); 1241 } 1242 } 1243 Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0); 1244 1245 // Count bits in each byte-sized lane. 1246 auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val); 1247 1248 // Sum across lanes. 
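  // For scalar inputs a single UADDLV over the byte counts is enough; for vector
  // results we build a chain of UADDLP widenings (HAddTys below), e.g. a v4s32
  // popcount goes v16s8 -> v8s16 -> v4s32, matching the sequences in the comment
  // at the top of this function.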
1249 Register HSum = CTPOP.getReg(0); 1250 unsigned Opc; 1251 SmallVector<LLT> HAddTys; 1252 if (Ty.isScalar()) { 1253 Opc = Intrinsic::aarch64_neon_uaddlv; 1254 HAddTys.push_back(LLT::scalar(32)); 1255 } else if (Ty == LLT::fixed_vector(8, 16)) { 1256 Opc = Intrinsic::aarch64_neon_uaddlp; 1257 HAddTys.push_back(LLT::fixed_vector(8, 16)); 1258 } else if (Ty == LLT::fixed_vector(4, 32)) { 1259 Opc = Intrinsic::aarch64_neon_uaddlp; 1260 HAddTys.push_back(LLT::fixed_vector(8, 16)); 1261 HAddTys.push_back(LLT::fixed_vector(4, 32)); 1262 } else if (Ty == LLT::fixed_vector(2, 64)) { 1263 Opc = Intrinsic::aarch64_neon_uaddlp; 1264 HAddTys.push_back(LLT::fixed_vector(8, 16)); 1265 HAddTys.push_back(LLT::fixed_vector(4, 32)); 1266 HAddTys.push_back(LLT::fixed_vector(2, 64)); 1267 } else if (Ty == LLT::fixed_vector(4, 16)) { 1268 Opc = Intrinsic::aarch64_neon_uaddlp; 1269 HAddTys.push_back(LLT::fixed_vector(4, 16)); 1270 } else if (Ty == LLT::fixed_vector(2, 32)) { 1271 Opc = Intrinsic::aarch64_neon_uaddlp; 1272 HAddTys.push_back(LLT::fixed_vector(4, 16)); 1273 HAddTys.push_back(LLT::fixed_vector(2, 32)); 1274 } else 1275 llvm_unreachable("unexpected vector shape"); 1276 MachineInstrBuilder UADD; 1277 for (LLT HTy : HAddTys) { 1278 UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects =*/false) 1279 .addUse(HSum); 1280 HSum = UADD.getReg(0); 1281 } 1282 1283 // Post-conditioning. 1284 if (Ty.isScalar() && (Size == 64 || Size == 128)) 1285 MIRBuilder.buildZExt(Dst, UADD); 1286 else 1287 UADD->getOperand(0).setReg(Dst); 1288 MI.eraseFromParent(); 1289 return true; 1290 } 1291 1292 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128( 1293 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const { 1294 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1295 LLT s64 = LLT::scalar(64); 1296 auto Addr = MI.getOperand(1).getReg(); 1297 auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2)); 1298 auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3)); 1299 auto DstLo = MRI.createGenericVirtualRegister(s64); 1300 auto DstHi = MRI.createGenericVirtualRegister(s64); 1301 1302 MachineInstrBuilder CAS; 1303 if (ST->hasLSE()) { 1304 // We have 128-bit CASP instructions taking XSeqPair registers, which are 1305 // s128. We need the merge/unmerge to bracket the expansion and pair up with 1306 // the rest of the MIR so we must reassemble the extracted registers into a 1307 // 128-bit known-regclass one with code like this: 1308 // 1309 // %in1 = REG_SEQUENCE Lo, Hi ; One for each input 1310 // %out = CASP %in1, ... 
1311 // %OldLo = G_EXTRACT %out, 0 1312 // %OldHi = G_EXTRACT %out, 64 1313 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering(); 1314 unsigned Opcode; 1315 switch (Ordering) { 1316 case AtomicOrdering::Acquire: 1317 Opcode = AArch64::CASPAX; 1318 break; 1319 case AtomicOrdering::Release: 1320 Opcode = AArch64::CASPLX; 1321 break; 1322 case AtomicOrdering::AcquireRelease: 1323 case AtomicOrdering::SequentiallyConsistent: 1324 Opcode = AArch64::CASPALX; 1325 break; 1326 default: 1327 Opcode = AArch64::CASPX; 1328 break; 1329 } 1330 1331 LLT s128 = LLT::scalar(128); 1332 auto CASDst = MRI.createGenericVirtualRegister(s128); 1333 auto CASDesired = MRI.createGenericVirtualRegister(s128); 1334 auto CASNew = MRI.createGenericVirtualRegister(s128); 1335 MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {}) 1336 .addUse(DesiredI->getOperand(0).getReg()) 1337 .addImm(AArch64::sube64) 1338 .addUse(DesiredI->getOperand(1).getReg()) 1339 .addImm(AArch64::subo64); 1340 MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {}) 1341 .addUse(NewI->getOperand(0).getReg()) 1342 .addImm(AArch64::sube64) 1343 .addUse(NewI->getOperand(1).getReg()) 1344 .addImm(AArch64::subo64); 1345 1346 CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr}); 1347 1348 MIRBuilder.buildExtract({DstLo}, {CASDst}, 0); 1349 MIRBuilder.buildExtract({DstHi}, {CASDst}, 64); 1350 } else { 1351 // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP 1352 // can take arbitrary registers so it just has the normal GPR64 operands the 1353 // rest of AArch64 is expecting. 1354 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering(); 1355 unsigned Opcode; 1356 switch (Ordering) { 1357 case AtomicOrdering::Acquire: 1358 Opcode = AArch64::CMP_SWAP_128_ACQUIRE; 1359 break; 1360 case AtomicOrdering::Release: 1361 Opcode = AArch64::CMP_SWAP_128_RELEASE; 1362 break; 1363 case AtomicOrdering::AcquireRelease: 1364 case AtomicOrdering::SequentiallyConsistent: 1365 Opcode = AArch64::CMP_SWAP_128; 1366 break; 1367 default: 1368 Opcode = AArch64::CMP_SWAP_128_MONOTONIC; 1369 break; 1370 } 1371 1372 auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 1373 CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch}, 1374 {Addr, DesiredI->getOperand(0), 1375 DesiredI->getOperand(1), NewI->getOperand(0), 1376 NewI->getOperand(1)}); 1377 } 1378 1379 CAS.cloneMemRefs(MI); 1380 constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(), 1381 *MRI.getTargetRegisterInfo(), 1382 *ST->getRegBankInfo()); 1383 1384 MIRBuilder.buildMerge(MI.getOperand(0), {DstLo, DstHi}); 1385 MI.eraseFromParent(); 1386 return true; 1387 } 1388 1389 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI, 1390 LegalizerHelper &Helper) const { 1391 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1392 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 1393 LLT Ty = MRI.getType(MI.getOperand(1).getReg()); 1394 auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1)); 1395 MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse); 1396 MI.eraseFromParent(); 1397 return true; 1398 } 1399 1400 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI, 1401 LegalizerHelper &Helper) const { 1402 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1403 1404 // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic 1405 if (MI.getOpcode() == TargetOpcode::G_MEMSET) { 1406 // Zext the value operand to 64 bit 1407 auto &Value = MI.getOperand(1); 1408 Register ZExtValueReg = 1409 
MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0); 1410 Value.setReg(ZExtValueReg); 1411 return true; 1412 } 1413 1414 return false; 1415 } 1416