//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64LegalizerInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
#include <initializer_list>

#define DEBUG_TYPE "aarch64-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
    : ST(&ST) {
  using namespace TargetOpcode;
  const LLT p0 = LLT::pointer(0, 64);
  const LLT s1 = LLT::scalar(1);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s128 = LLT::scalar(128);
  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s8 = LLT::fixed_vector(8, 8);
  const LLT v4s8 = LLT::fixed_vector(4, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s16 = LLT::fixed_vector(4, 16);
  const LLT v2s16 = LLT::fixed_vector(2, 16);
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

  std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                        v16s8, v8s16, v4s32,
                                                        v2s64, v2p0,
                                                        /* End 128bit types */
                                                        /* Begin 64bit types */
                                                        v8s8, v4s16, v2s32};

  const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();

  // FIXME: support subtargets which have neon/fp-armv8 disabled.
  if (!ST.hasNEON() || !ST.hasFPARMv8()) {
    getLegacyLegalizerInfo().computeTables();
    return;
  }

  // Some instructions only support s16 if the subtarget has full 16-bit FP
  // support.
  const bool HasFP16 = ST.hasFullFP16();
  const LLT &MinFPScalar = HasFP16 ? s16 : s32;
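
  // Note on reading the definitions below: each getActionDefinitionsBuilder()
  // call attaches an ordered list of rules to one or more opcodes, and the
  // first rule whose predicate matches a given type combination decides the
  // action. As an illustrative example, an s8 G_ADD matches none of the
  // legalFor() sets in its definition and ends up widened to an s32 add by
  // the clampScalar(0, s32, s64) rule.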

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalFor({p0, s1, s8, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64)
      .fewerElementsIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].isVector() &&
                   (Query.Types[0].getElementType() != s64 ||
                    Query.Types[0].getNumElements() != 2);
          },
          [=](const LegalityQuery &Query) {
            LLT EltTy = Query.Types[0].getElementType();
            if (EltTy == s64)
              return std::make_pair(0, LLT::fixed_vector(2, 64));
            return std::make_pair(0, EltTy);
          });

  getActionDefinitionsBuilder(G_PHI)
      .legalFor({p0, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .clampScalar(0, s16, s64)
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2);

  getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({s32, s64, v4s32, v2s32, v2s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
      .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
      .scalarizeIf(
          [=](const LegalityQuery &Query) {
            return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
          },
          0)
      .legalFor({v2s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
      .customIf([=](const LegalityQuery &Query) {
        const auto &SrcTy = Query.Types[0];
        const auto &AmtTy = Query.Types[1];
        return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
               AmtTy.getSizeInBits() == 32;
      })
      .legalFor({
          {s32, s32},
          {s32, s64},
          {s64, s64},
          {v8s8, v8s8},
          {v16s8, v16s8},
          {v4s16, v4s16},
          {v8s16, v8s16},
          {v2s32, v2s32},
          {v4s32, v4s32},
          {v2s64, v2s64},
      })
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_PTR_ADD)
      .legalFor({{p0, s64}, {v2p0, v2s64}})
      .clampScalar(1, s64, s64);

  getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});

  getActionDefinitionsBuilder({G_SDIV, G_UDIV})
      .legalFor({s32, s64})
      .libcallFor({s128})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      .scalarize(0);

  getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .lowerFor({s1, s8, s16, s32, s64, v2s64, v4s32, v2s32})
      .widenScalarOrEltToNextPow2(0)
      .clampScalarOrElt(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64)
      .lowerIf(typeIs(1, s1));

  getActionDefinitionsBuilder({G_SMULH, G_UMULH})
      .legalFor({s64, v8s16, v16s8, v4s32})
      .lower();
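
  // Forms that are not listed as legal below (e.g. a plain scalar s32 G_SMIN)
  // reach the trailing lower() rule; the generic LegalizerHelper is expected
  // to expand such min/max operations into a compare + select sequence.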
  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      // FIXME: This shouldn't be needed as v2s64 types are going to
      // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
      .clampNumElements(0, v2s64, v2s64)
      .lower();

  getActionDefinitionsBuilder(
      {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
      .legalFor({{s32, s1}, {s64, s1}})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
      .legalFor({MinFPScalar, s32, s64, v2s64, v4s32, v2s32})
      .clampScalar(0, MinFPScalar, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});

  getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
                               G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
                               G_FNEARBYINT, G_INTRINSIC_LRINT})
      // If we don't have full FP16 support, then scalarize the elements of
      // vectors containing fp16 types.
      .fewerElementsIf(
          [=, &ST](const LegalityQuery &Query) {
            const auto &Ty = Query.Types[0];
            return Ty.isVector() && Ty.getElementType() == s16 &&
                   !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
      // If we don't have full FP16 support, then widen s16 to s32 if we
      // encounter it.
      .widenScalarIf(
          [=, &ST](const LegalityQuery &Query) {
            return Query.Types[0] == s16 && !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
      .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});

  getActionDefinitionsBuilder(
      {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW})
      // We need a call for these, so we always need to scalarize.
      .scalarize(0)
      // Regardless of FP16 support, widen 16-bit elements to 32-bits.
      .minScalar(0, s32)
      .libcallFor({s32, s64, v2s32, v4s32, v2s64});

  getActionDefinitionsBuilder(G_INSERT)
      .legalIf(all(typeInSet(0, {s32, s64, p0}),
                   typeInSet(1, {s1, s8, s16, s32}), smallerThan(1, 0)))
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .minScalar(1, s8)
      .maxScalarIf(typeInSet(0, {s32}), 1, s16)
      .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);

  getActionDefinitionsBuilder(G_EXTRACT)
      .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
                   typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalar(0, s16)
      .maxScalarIf(typeInSet(1, {s32}), 0, s16)
      .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
      .maxScalarIf(typeInSet(1, {s128}), 0, s64);
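
  // For example, a non-atomic "%v:_(s32) = G_SEXTLOAD %p:_(p0) :: (load (s16))"
  // matches the table below and is kept as-is; sext/zext loads with an atomic
  // ordering are instead lowered into a plain G_LOAD followed by an extend.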
  getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
      .lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalForTypesWithMemDesc({{s32, p0, s8, 8},
                                 {s32, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s8, 2},
                                 {s64, p0, s16, 2},
                                 {s64, p0, s32, 4},
                                 {s64, p0, s64, 8},
                                 {p0, p0, s64, 8},
                                 {v2s32, p0, s64, 8}})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      // TODO: We could support sum-of-pow2's but the lowering code doesn't know
      // how to do that yet.
      .unsupportedIfMemSizeNotPow2()
      // Lower anything left over into G_*EXT and G_LOAD
      .lower();

  auto IsPtrVecPred = [=](const LegalityQuery &Query) {
    const LLT &ValTy = Query.Types[0];
    if (!ValTy.isVector())
      return false;
    const LLT EltTy = ValTy.getElementType();
    return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
  };

  getActionDefinitionsBuilder(G_LOAD)
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s64, 8},
                                 {p0, p0, s64, 8},
                                 {s128, p0, s128, 8},
                                 {v8s8, p0, s64, 8},
                                 {v16s8, p0, s128, 8},
                                 {v4s16, p0, s64, 8},
                                 {v8s16, p0, s128, 8},
                                 {v2s32, p0, s64, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      // These extends are also legal
      .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
      .widenScalarToNextPow2(0, /* MinSize = */ 8)
      .lowerIfMemSizeNotPow2()
      .clampScalar(0, s8, s64)
      .narrowScalarIf(
          [=](const LegalityQuery &Query) {
            // Clamp extending load results to 32-bits.
            return Query.Types[0].isScalar() &&
                   Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
                   Query.Types[0].getSizeInBits() > 32;
          },
          changeTo(0, s32))
      // Lower any any-extending loads left into G_ANYEXT and G_LOAD
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeIs(0, v2s16), 0);

  getActionDefinitionsBuilder(G_STORE)
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s8, 8}, // truncstorei8 from s16
                                 {s32, p0, s8, 8}, // truncstorei8 from s32
                                 {s64, p0, s8, 8}, // truncstorei8 from s64
                                 {s16, p0, s16, 8},
                                 {s32, p0, s16, 8}, // truncstorei16 from s32
                                 {s64, p0, s16, 8}, // truncstorei16 from s64
                                 {s32, p0, s8, 8},
                                 {s32, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s64, 8},
                                 {s64, p0, s32, 8}, // truncstorei32 from s64
                                 {p0, p0, s64, 8},
                                 {s128, p0, s128, 8},
                                 {v16s8, p0, s128, 8},
                                 {v8s8, p0, s64, 8},
                                 {v4s16, p0, s64, 8},
                                 {v8s16, p0, s128, 8},
                                 {v2s32, p0, s64, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      .clampScalar(0, s8, s64)
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isScalar() &&
               Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .lowerIfMemSizeNotPow2()
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeIs(0, v2s16), 0);

  // Constants
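  // For example, an s1 G_CONSTANT (a boolean) is widened to s8 by the
  // clampScalar rule below, and without full FP16 an s16 G_FCONSTANT is
  // widened to s32 (MinFPScalar).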
  getActionDefinitionsBuilder(G_CONSTANT)
      .legalFor({p0, s8, s16, s32, s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64);
  getActionDefinitionsBuilder(G_FCONSTANT)
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[0];
        if (HasFP16 && Ty == s16)
          return true;
        return Ty == s32 || Ty == s64 || Ty == s128;
      })
      .clampScalar(0, MinFPScalar, s128);

  getActionDefinitionsBuilder({G_ICMP, G_FCMP})
      .legalFor({{s32, s32},
                 {s32, s64},
                 {s32, p0},
                 {v4s32, v4s32},
                 {v2s32, v2s32},
                 {v2s64, v2s64},
                 {v2s64, v2p0},
                 {v4s16, v4s16},
                 {v8s16, v8s16},
                 {v8s8, v8s8},
                 {v16s8, v16s8}})
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s32)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
                   Ty.getElementType() != SrcTy.getElementType();
          },
          0, 1)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
          1, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; },
          0, s64)
      .clampNumElements(0, v2s32, v4s32);

  // Extensions
  auto ExtLegalFunc = [=](const LegalityQuery &Query) {
    unsigned DstSize = Query.Types[0].getSizeInBits();

    if (DstSize == 128 && !Query.Types[0].isVector())
      return false; // Extending to a scalar s128 needs narrowing.

    // Make sure that we have something that will fit in a register, and
    // make sure it's a power of 2.
    if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
      return false;

    const LLT &SrcTy = Query.Types[1];

    // Special case for s1.
    if (SrcTy == s1)
      return true;

    // Make sure we fit in a register otherwise. Don't bother checking that
    // the source type is below 128 bits. We shouldn't be allowing anything
    // through which is wider than the destination in the first place.
    unsigned SrcSize = SrcTy.getSizeInBits();
    if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
      return false;

    return true;
  };
  getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
      .legalIf(ExtLegalFunc)
      .clampScalar(0, s64, s64); // Just for s128, others are handled above.
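  // For instance, "%d:_(s64) = G_ZEXT %c:_(s1)" is accepted by ExtLegalFunc,
  // while an extend to a scalar s128 is rejected there and is expected to be
  // narrowed into s64-sized pieces by the clampScalar above.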

  getActionDefinitionsBuilder(G_TRUNC)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
          0, s8)
      .customIf([=](const LegalityQuery &Query) {
        LLT DstTy = Query.Types[0];
        LLT SrcTy = Query.Types[1];
        return DstTy == v8s8 && SrcTy.getSizeInBits() > 128;
      })
      .alwaysLegal();

  getActionDefinitionsBuilder(G_SEXT_INREG).legalFor({s32, s64}).lower();

  // FP conversions
  getActionDefinitionsBuilder(G_FPTRUNC)
      .legalFor(
          {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
      .clampMaxNumElements(0, s32, 2);
  getActionDefinitionsBuilder(G_FPEXT)
      .legalFor(
          {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
      .clampMaxNumElements(0, s64, 2);

  // Conversions
  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s64);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
      .clampScalar(1, s32, s64)
      .minScalarSameAs(1, 0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0);

  // Control-flow
  getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32});
  getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});

  getActionDefinitionsBuilder(G_SELECT)
      .legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
      .lowerIf(isVector(0));

  // Pointer-handling
  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});

  if (TM.getCodeModel() == CodeModel::Small)
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
  else
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
      .legalFor({{v2s64, v2p0}})
      .maxScalar(0, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_INTTOPTR)
      .unsupportedIf([&](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
      })
      .legalFor({{p0, s64}, {v2p0, v2s64}});

  // Casts for 32 and 64-bit width types are just copies.
  // Same for 128-bit width types, except they are on the FPR bank.
  getActionDefinitionsBuilder(G_BITCAST)
      // FIXME: This is wrong since G_BITCAST is not allowed to change the
      // number of bits but it's what the previous code described and fixing
      // it breaks tests.
      .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
                                 v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
                                 v2p0});

  getActionDefinitionsBuilder(G_VASTART).legalFor({p0});

  // va_list must be a pointer, but most sized types are pretty easy to handle
  // as the destination.
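  // The custom rule below is resolved by legalizeVaArg(): it loads the current
  // pointer out of the va_list, realigns it if the requested alignment exceeds
  // the slot alignment, loads the value, and stores the advanced pointer back.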
  getActionDefinitionsBuilder(G_VAARG)
      .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
      .clampScalar(0, s8, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
      .lowerIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, s1),
                   typeIs(2, p0)));

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customIf([](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() == 128;
      })
      .clampScalar(0, s32, s64)
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));

  getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
       G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
      .clampScalar(0, s32, s64)
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    getActionDefinitionsBuilder(Op)
        .widenScalarToNextPow2(LitTyIdx, 8)
        .widenScalarToNextPow2(BigTyIdx, 32)
        .clampScalar(LitTyIdx, s8, s64)
        .clampScalar(BigTyIdx, s32, s128)
        .legalIf([=](const LegalityQuery &Q) {
          switch (Q.Types[BigTyIdx].getSizeInBits()) {
          case 32:
          case 64:
          case 128:
            break;
          default:
            return false;
          }
          switch (Q.Types[LitTyIdx].getSizeInBits()) {
          case 8:
          case 16:
          case 32:
          case 64:
            return true;
          default:
            return false;
          }
        });
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
      .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      })
      .minScalar(2, s64)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &VecTy = Query.Types[1];
        return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
               VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
               VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s32 ||
               VecTy == v2p0;
      })
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            // We want to promote <M x s1> to <M x s64> if that wouldn't
            // cause the total vec size to be > 128b.
            return Query.Types[1].getNumElements() <= 2;
          },
          0, s64)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 4;
          },
          0, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 8;
          },
          0, s16)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 16;
          },
          0, s8)
      .minScalarOrElt(0, s8) // Worst case, we need at least s8.
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, p0, 2);

  getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
      .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64}));

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
      .legalFor({{v8s8, s8},
                 {v16s8, s8},
                 {v2s16, s16},
                 {v4s16, s16},
                 {v8s16, s16},
                 {v2s32, s32},
                 {v4s32, s32},
                 {v2p0, p0},
                 {v2s64, s64}})
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarOrElt(0, s8)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();

  getActionDefinitionsBuilder(G_CTLZ)
      .legalForCartesianProduct(
          {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .scalarize(1);
  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();

  // TODO: Custom lowering for v2s32, v4s32, v2s64.
  getActionDefinitionsBuilder(G_BITREVERSE)
      .legalFor({s32, s64, v8s8, v16s8})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();

  // TODO: Handle vector types.
  getActionDefinitionsBuilder(G_CTTZ)
      .clampScalar(0, s32, s64)
      .scalarSameSizeAs(1, 0)
      .customFor({s32, s64});

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &DstTy = Query.Types[0];
        const LLT &SrcTy = Query.Types[1];
        // For now just support the TBL2 variant which needs the source vectors
        // to be the same size as the dest.
        if (DstTy != SrcTy)
          return false;
        for (auto &Ty : {v2s32, v4s32, v2s64, v2p0, v16s8, v8s16}) {
          if (DstTy == Ty)
            return true;
        }
        return false;
      })
      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors); we
      // just want those lowered into G_BUILD_VECTOR.
      .lowerIf([=](const LegalityQuery &Query) {
        return !Query.Types[1].isVector();
      })
      .moreElementsToNextPow2(0)
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});

  getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}});

  getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) {
    return Query.Types[0] == p0 && Query.Types[1] == s64;
  });

  getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();

  if (ST.hasMOPS()) {
    // G_BZERO is not supported. Currently it is only emitted by
    // PreLegalizerCombiner for G_MEMSET with zero constant.
    getActionDefinitionsBuilder(G_BZERO).unsupported();

    getActionDefinitionsBuilder(G_MEMSET)
        .legalForCartesianProduct({p0}, {s64}, {s64})
        .customForCartesianProduct({p0}, {s8}, {s64})
        .immIdx(0); // Inform verifier imm idx 0 is handled.
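    // The s8 value forms above are custom: legalizeMemOps() simply extends the
    // value operand to s64 so that the G_MEMSET matches the legal form.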

    getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
        .legalForCartesianProduct({p0}, {p0}, {s64})
        .immIdx(0); // Inform verifier imm idx 0 is handled.

    // G_MEMCPY_INLINE does not have a tailcall immediate
    getActionDefinitionsBuilder(G_MEMCPY_INLINE)
        .legalForCartesianProduct({p0}, {p0}, {s64});

  } else {
    getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
        .libcall();
  }

  // FIXME: Legal types are only legal with NEON.
  getActionDefinitionsBuilder(G_ABS)
      .lowerIf(isScalar(0))
      .legalFor(PackedVectorAllTypeList);

  getActionDefinitionsBuilder(G_VECREDUCE_FADD)
      // We only have FADDP to do reduction-like operations. Lower the rest.
      .legalFor({{s32, v2s32}, {s64, v2s64}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 2)
      .lower();

  getActionDefinitionsBuilder(G_VECREDUCE_ADD)
      .legalFor(
          {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .lower();

  getActionDefinitionsBuilder(
      {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
      // Try to break down into smaller vectors as long as they're at least 64
      // bits. This lets us use vector operations for some parts of the
      // reduction.
      .fewerElementsIf(
          [=](const LegalityQuery &Q) {
            LLT SrcTy = Q.Types[1];
            if (SrcTy.isScalar())
              return false;
            if (!isPowerOf2_32(SrcTy.getNumElements()))
              return false;
            // We can usually perform 64b vector operations.
            return SrcTy.getSizeInBits() > 64;
          },
          [=](const LegalityQuery &Q) {
            LLT SrcTy = Q.Types[1];
            return std::make_pair(1, SrcTy.divide(2));
          })
      .scalarize(1)
      .lower();

  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });

  getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();

  getActionDefinitionsBuilder(G_ROTR)
      .legalFor({{s32, s64}, {s64, s64}})
      .customIf([=](const LegalityQuery &Q) {
        return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
      })
      .lower();
  getActionDefinitionsBuilder(G_ROTL).lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .customFor({{s32, s32}, {s64, s64}});

  // TODO: Use generic lowering when custom lowering is not possible.
  auto always = [=](const LegalityQuery &Q) { return true; };
  getActionDefinitionsBuilder(G_CTPOP)
      .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
      .clampScalar(0, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalarEltSameAsIf(always, 1, 0)
      .maxScalarEltSameAsIf(always, 1, 0)
      .customFor({{s32, s32},
                  {s64, s64},
                  {s128, s128},
                  {v2s64, v2s64},
                  {v2s32, v2s32},
                  {v4s32, v4s32},
                  {v4s16, v4s16},
                  {v8s16, v8s16}});

  // TODO: Vector types.
  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));

  // TODO: Vector types.
  getActionDefinitionsBuilder({G_FMAXNUM, G_FMINNUM})
      .legalFor({MinFPScalar, s32, s64})
      .libcallFor({s128})
      .minScalar(0, MinFPScalar);

  // TODO: Vector types.
  getActionDefinitionsBuilder({G_FMAXIMUM, G_FMINIMUM})
      .legalFor({MinFPScalar, s32, s64})
      .minScalar(0, MinFPScalar);

  // TODO: Libcall support for s128.
  // TODO: s16 should be legal with full FP16 support.
  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .legalFor({{s64, s32}, {s64, s64}});

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}
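
// Entry point for target-specific ("custom") legalization. Returning true
// means the instruction was successfully legalized (it may have been rewritten
// or erased); returning false reports that it could not be legalized.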
bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                          MachineInstr &MI) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;
  switch (MI.getOpcode()) {
  default:
    // No idea what to do.
    return false;
  case TargetOpcode::G_VAARG:
    return legalizeVaArg(MI, MRI, MIRBuilder);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_STORE:
    return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR:
    return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_TRUNC:
    return legalizeVectorTrunc(MI, Helper);
  case TargetOpcode::G_SBFX:
  case TargetOpcode::G_UBFX:
    return legalizeBitfieldExtract(MI, MRI, Helper);
  case TargetOpcode::G_ROTR:
    return legalizeRotate(MI, MRI, Helper);
  case TargetOpcode::G_CTPOP:
    return legalizeCTPOP(MI, MRI, Helper);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpxchg128(MI, MRI, Helper);
  case TargetOpcode::G_CTTZ:
    return legalizeCTTZ(MI, Helper);
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET:
    return legalizeMemOps(MI, Helper);
  }

  llvm_unreachable("expected switch to return");
}

bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          LegalizerHelper &Helper) const {
  // To allow for imported patterns to match, we ensure that the rotate amount
  // is 64b with an extension.
  Register AmtReg = MI.getOperand(2).getReg();
  LLT AmtTy = MRI.getType(AmtReg);
  (void)AmtTy;
  assert(AmtTy.isScalar() && "Expected a scalar rotate");
  assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
  auto NewAmt = Helper.MIRBuilder.buildSExt(LLT::scalar(64), AmtReg);
  Helper.Observer.changingInstr(MI);
  MI.getOperand(2).setReg(NewAmt.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}
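
// Split \p Reg into \p NumParts registers of type \p Ty with a single
// G_UNMERGE_VALUES, appending the resulting registers to \p VRegs.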
static void extractParts(Register Reg, MachineRegisterInfo &MRI,
                         MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
                         SmallVectorImpl<Register> &VRegs) {
  for (int I = 0; I < NumParts; ++I)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool AArch64LegalizerInfo::legalizeVectorTrunc(
    MachineInstr &MI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectionDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
         isPowerOf2_32(SrcTy.getSizeInBits()));

  // Split input type.
  LLT SplitSrcTy =
      SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
  // First, split the source into two smaller vectors.
  SmallVector<Register, 2> SplitSrcs;
  extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);

  // Truncate the splits into intermediate narrower elements.
  LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
  for (unsigned I = 0; I < SplitSrcs.size(); ++I)
    SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);

  auto Concat = MIRBuilder.buildConcatVectors(
      DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);

  Helper.Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Concat.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}

bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
  // G_ADD_LOW instructions.
  // By splitting this here, we can optimize accesses in the small code model by
  // folding in the G_ADD_LOW into the load/store offset.
  auto &GlobalOp = MI.getOperand(1);
  const auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return true; // Don't want to modify TLS vars.

  auto &TM = ST->getTargetLowering()->getTargetMachine();
  unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);

  if (OpFlags & AArch64II::MO_GOT)
    return true;

  auto Offset = GlobalOp.getOffset();
  Register DstReg = MI.getOperand(0).getReg();
  auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
                  .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
  // Set the regclass on the dest reg too.
  MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);

  // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
  // by creating a MOVK that sets bits 48-63 of the register to (global address
  // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  // prevent an incorrect tag being generated during relocation when the
  // global appears before the code section. Without the offset, a global at
  // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
  // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
  // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
  // instead of `0xf`.
  // This assumes that we're in the small code model so we can assume a binary
  // size of <= 4GB, which makes the untagged PC relative offset positive. The
  // binary must also be loaded into address range [0, 2^48). Both of these
  // properties need to be ensured at runtime when using tagged addresses.
  if (OpFlags & AArch64II::MO_TAGGED) {
    assert(!Offset &&
           "Should not have folded in an offset for a tagged global!");
    ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
               .addGlobalAddress(GV, 0x100000000,
                                 AArch64II::MO_PREL | AArch64II::MO_G3)
               .addImm(48);
    MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  }

  MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
      .addGlobalAddress(GV, Offset,
                        OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  MI.eraseFromParent();
  return true;
}
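
// Target-specific legalization of intrinsics. Intrinsics that are not handled
// explicitly here are treated as already legal (the function returns true).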
bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::vacopy: {
    unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
    unsigned VaListSize =
        (ST->isTargetDarwin() || ST->isTargetWindows())
            ? PtrSize
            : ST->isTargetILP32() ? 20 : 32;

    MachineFunction &MF = *MI.getMF();
    auto Val = MF.getRegInfo().createGenericVirtualRegister(
        LLT::scalar(VaListSize * 8));
    MachineIRBuilder MIB(MI);
    MIB.buildLoad(Val, MI.getOperand(2),
                  *MF.getMachineMemOperand(MachinePointerInfo(),
                                           MachineMemOperand::MOLoad,
                                           VaListSize, Align(PtrSize)));
    MIB.buildStore(Val, MI.getOperand(1),
                   *MF.getMachineMemOperand(MachinePointerInfo(),
                                            MachineMemOperand::MOStore,
                                            VaListSize, Align(PtrSize)));
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::get_dynamic_area_offset: {
    MachineIRBuilder &MIB = Helper.MIRBuilder;
    MIB.buildConstant(MI.getOperand(0).getReg(), 0);
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
    // Anyext the value to 64 bits
    MachineIRBuilder MIB(MI);
    auto &Value = MI.getOperand(3);
    Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ZExtValueReg);
    return true;
  }
  }

  return true;
}

bool AArch64LegalizerInfo::legalizeShlAshrLshr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
         MI.getOpcode() == TargetOpcode::G_LSHR ||
         MI.getOpcode() == TargetOpcode::G_SHL);
  // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
  // imported patterns can select it later. Either way, it will be legal.
  Register AmtReg = MI.getOperand(2).getReg();
  auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
  if (!VRegAndVal)
    return true;
  // Check the shift amount is in range for an immediate form.
  int64_t Amount = VRegAndVal->Value.getSExtValue();
  if (Amount > 31)
    return true; // This will have to remain a register variant.
  auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(ExtCst.getReg(0));
  Observer.changedInstr(MI);
  return true;
}
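
// Try to fold a G_PTR_ADD of \p Root and a constant offset into an LDP/STP
// base + immediate addressing mode. The offset must be a multiple of 8 that
// fits the scaled signed 7-bit immediate (i.e. in [-512, 504]); otherwise
// fall back to \p Root itself with an offset of 0.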
static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
                                MachineRegisterInfo &MRI) {
  Base = Root;
  Offset = 0;

  Register NewBase;
  int64_t NewOffset;
  if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
      isShiftedInt<7, 3>(NewOffset)) {
    Base = NewBase;
    Offset = NewOffset;
  }
}

// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_STORE ||
         MI.getOpcode() == TargetOpcode::G_LOAD);
  // Here we just try to handle vector loads/stores where our value type might
  // have pointer elements, which the SelectionDAG importer can't handle. To
  // allow the existing patterns for s64 to fire for p0, we just try to bitcast
  // the value to use s64 types.

  // Custom legalization requires the instruction, if not deleted, to be fully
  // legalized. In order to allow further legalization of the instruction, we
  // create a new instruction and erase the existing one.

  Register ValReg = MI.getOperand(0).getReg();
  const LLT ValTy = MRI.getType(ValReg);

  if (ValTy == LLT::scalar(128)) {
    assert((*MI.memoperands_begin())->getSuccessOrdering() ==
               AtomicOrdering::Monotonic ||
           (*MI.memoperands_begin())->getSuccessOrdering() ==
               AtomicOrdering::Unordered);
    assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
    LLT s64 = LLT::scalar(64);
    MachineInstrBuilder NewI;
    if (MI.getOpcode() == TargetOpcode::G_LOAD) {
      NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
      MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
    } else {
      auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
      NewI = MIRBuilder.buildInstr(
          AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
    }
    Register Base;
    int Offset;
    matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
    NewI.addUse(Base);
    NewI.addImm(Offset / 8);

    NewI.cloneMemRefs(MI);
    constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
                                     *MRI.getTargetRegisterInfo(),
                                     *ST->getRegBankInfo());
    MI.eraseFromParent();
    return true;
  }

  if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
      ValTy.getElementType().getAddressSpace() != 0) {
    LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
    return false;
  }

  unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  auto &MMO = **MI.memoperands_begin();
  MMO.setType(NewTy);

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
    MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  } else {
    auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
    MIRBuilder.buildBitcast(ValReg, NewLoad);
  }
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  Align Alignment(MI.getOperand(2).getImm());
  Register Dst = MI.getOperand(0).getReg();
  Register ListPtr = MI.getOperand(1).getReg();

  LLT PtrTy = MRI.getType(ListPtr);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
  const Align PtrAlign = Align(PtrSize);
  auto List = MIRBuilder.buildLoad(
      PtrTy, ListPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               PtrTy, PtrAlign));

  MachineInstrBuilder DstPtr;
  if (Alignment > PtrAlign) {
    // Realign the list to the actual required alignment.
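    // Conceptually Ptr = (Ptr + Align - 1) & ~(Align - 1): add Align - 1 and
    // then clear the low Log2(Align) bits of the pointer.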
    auto AlignMinus1 =
        MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
    auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
    DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
  } else
    DstPtr = List;

  LLT ValTy = MRI.getType(Dst);
  uint64_t ValSize = ValTy.getSizeInBits() / 8;
  MIRBuilder.buildLoad(
      Dst, DstPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               ValTy, std::max(Alignment, PtrAlign)));

  auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));

  auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));

  MIRBuilder.buildStore(NewList, ListPtr,
                        *MF.getMachineMemOperand(MachinePointerInfo(),
                                                 MachineMemOperand::MOStore,
                                                 PtrTy, PtrAlign));

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeBitfieldExtract(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  // Only legal if we can select immediate forms.
  // TODO: Lower this otherwise.
  return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
         getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
}

bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // While there is no integer popcount instruction, it can
  // be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from
  // the AdvSIMD registers are cheap.
  //  FMOV  D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT   V0.8B, V0.8B  // 8xbyte pop-counts
  //  ADDV  B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV  X0, V0.B[0]   // copy byte result back to integer reg
  //
  // For 128 bit vector popcounts, we lower to the following sequence:
  //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.4s v0, v0  //        v4s32, v2s64
  //  uaddlp.2d v0, v0  //               v2s64
  //
  // For 64 bit vector popcounts, we lower to the following sequence:
  //  cnt.8b    v0, v0  // v4s16, v2s32
  //  uaddlp.4h v0, v0  // v4s16, v2s32
  //  uaddlp.2s v0, v0  //        v2s32

  if (!ST->hasNEON() ||
      MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat))
    return false;
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Val);

  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");
  unsigned Size = Ty.getSizeInBits();

  // Pre-conditioning: widen Val up to the nearest vector type.
  // s32,s64,v4s16,v2s32 -> v8i8
  // v8s16,v4s32,v2s64 -> v16i8
  LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  if (Ty.isScalar()) {
    assert((Size == 32 || Size == 64 || Size == 128) &&
           "Expected only 32, 64, or 128 bit scalars!");
    if (Size == 32) {
      Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
    }
  }
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.
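  // For scalar inputs a single UADDLV horizontal add reduces the byte counts
  // into an s32; for vector inputs a chain of pairwise UADDLP widening adds
  // rebuilds the original element width step by step.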
  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects =*/false)
               .addUse(HSum);
    HSum = UADD.getReg(0);
  }

  // Post-conditioning.
  if (Ty.isScalar() && (Size == 64 || Size == 128))
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up with
    // the rest of the MIR so we must reassemble the extracted registers into a
    // 128-bit known-regclass one with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
    // can take arbitrary registers so it just has the normal GPR64 operands the
    // rest of AArch64 is expecting.
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
  }

  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  MIRBuilder.buildMerge(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
                                        LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
  auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
  MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
                                          LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;

  // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
  if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
    // Anyext the value operand to 64 bits
    auto &Value = MI.getOperand(1);
    Register ZExtValueReg =
        MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ZExtValueReg);
    return true;
  }

  return false;
}