//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64LegalizerInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
#include <initializer_list>

#define DEBUG_TYPE "aarch64-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
    : ST(&ST) {
  using namespace TargetOpcode;
  const LLT p0 = LLT::pointer(0, 64);
  const LLT s1 = LLT::scalar(1);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s128 = LLT::scalar(128);
  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s8 = LLT::fixed_vector(8, 8);
  const LLT v4s8 = LLT::fixed_vector(4, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s16 = LLT::fixed_vector(4, 16);
  const LLT v2s16 = LLT::fixed_vector(2, 16);
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

  std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                        v16s8, v8s16, v4s32,
                                                        v2s64, v2p0,
                                                        /* End 128bit types */
                                                        /* Begin 64bit types */
                                                        v8s8, v4s16, v2s32};

  const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();

  // FIXME: support subtargets which have neon/fp-armv8 disabled.
  if (!ST.hasNEON() || !ST.hasFPARMv8()) {
    getLegacyLegalizerInfo().computeTables();
    return;
  }

  // Some instructions only support s16 if the subtarget has full 16-bit FP
  // support.
  const bool HasFP16 = ST.hasFullFP16();
  const LLT &MinFPScalar = HasFP16 ? s16 : s32;

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalFor({p0, s1, s8, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64)
      .fewerElementsIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].isVector() &&
                   (Query.Types[0].getElementType() != s64 ||
                    Query.Types[0].getNumElements() != 2);
          },
          [=](const LegalityQuery &Query) {
            LLT EltTy = Query.Types[0].getElementType();
            if (EltTy == s64)
              return std::make_pair(0, LLT::fixed_vector(2, 64));
            return std::make_pair(0, EltTy);
          });

  getActionDefinitionsBuilder(G_PHI)
      .legalFor({p0, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .clampScalar(0, s16, s64)
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2);

  getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({s32, s64, v4s32, v2s32, v2s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
      .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
      .scalarizeIf(
          [=](const LegalityQuery &Query) {
            return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
          },
          0)
      .legalFor({v2s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
      .customIf([=](const LegalityQuery &Query) {
        const auto &SrcTy = Query.Types[0];
        const auto &AmtTy = Query.Types[1];
        return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
               AmtTy.getSizeInBits() == 32;
      })
      .legalFor({
          {s32, s32},
          {s32, s64},
          {s64, s64},
          {v8s8, v8s8},
          {v16s8, v16s8},
          {v4s16, v4s16},
          {v8s16, v8s16},
          {v2s32, v2s32},
          {v4s32, v4s32},
          {v2s64, v2s64},
      })
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_PTR_ADD)
      .legalFor({{p0, s64}, {v2p0, v2s64}})
      .clampScalar(1, s64, s64);

  getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});

  getActionDefinitionsBuilder({G_SDIV, G_UDIV})
      .legalFor({s32, s64})
      .libcallFor({s128})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      .scalarize(0);

  getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .lowerFor({s1, s8, s16, s32, s64, v2s64, v4s32, v2s32})
      .widenScalarOrEltToNextPow2(0)
      .clampScalarOrElt(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64)
      .lowerIf(typeIs(1, s1));

  getActionDefinitionsBuilder({G_SMULH, G_UMULH})
      .legalFor({s64, v8s16, v16s8, v4s32})
      .lower();

  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      // FIXME: This shouldn't be needed as v2s64 types are going to
      // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
      .clampNumElements(0, v2s64, v2s64)
      .lower();

  getActionDefinitionsBuilder(
      {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
      .legalFor({{s32, s1}, {s64, s1}})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
      .legalFor({MinFPScalar, s32, s64, v2s64, v4s32, v2s32})
      .clampScalar(0, MinFPScalar, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});

  getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
                               G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
                               G_FNEARBYINT, G_INTRINSIC_LRINT})
      // If we don't have full FP16 support, then scalarize the elements of
      // vectors containing fp16 types.
      .fewerElementsIf(
          [=, &ST](const LegalityQuery &Query) {
            const auto &Ty = Query.Types[0];
            return Ty.isVector() && Ty.getElementType() == s16 &&
                   !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
      // If we don't have full FP16 support, then widen s16 to s32 if we
      // encounter it.
      .widenScalarIf(
          [=, &ST](const LegalityQuery &Query) {
            return Query.Types[0] == s16 && !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
      .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});

  getActionDefinitionsBuilder(
      {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW})
      // We need a call for these, so we always need to scalarize.
      .scalarize(0)
      // Regardless of FP16 support, widen 16-bit elements to 32-bits.
      .minScalar(0, s32)
      .libcallFor({s32, s64, v2s32, v4s32, v2s64});

  getActionDefinitionsBuilder(G_INSERT)
      .legalIf(all(typeInSet(0, {s32, s64, p0}),
                   typeInSet(1, {s1, s8, s16, s32}), smallerThan(1, 0)))
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .minScalar(1, s8)
      .maxScalarIf(typeInSet(0, {s32}), 1, s16)
      .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);

  getActionDefinitionsBuilder(G_EXTRACT)
      .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
                   typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalar(0, s16)
      .maxScalarIf(typeInSet(1, {s32}), 0, s16)
      .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
      .maxScalarIf(typeInSet(1, {s128}), 0, s64);

  getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
      .lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalForTypesWithMemDesc({{s32, p0, s8, 8},
                                 {s32, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s8, 2},
                                 {s64, p0, s16, 2},
                                 {s64, p0, s32, 4},
                                 {s64, p0, s64, 8},
                                 {p0, p0, s64, 8},
                                 {v2s32, p0, s64, 8}})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      // TODO: We could support sum-of-pow2's but the lowering code doesn't know
      //       how to do that yet.
      .unsupportedIfMemSizeNotPow2()
      // Lower anything left over into G_*EXT and G_LOAD
      .lower();

  auto IsPtrVecPred = [=](const LegalityQuery &Query) {
    const LLT &ValTy = Query.Types[0];
    if (!ValTy.isVector())
      return false;
    const LLT EltTy = ValTy.getElementType();
    return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
  };

  getActionDefinitionsBuilder(G_LOAD)
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s64, 8},
                                 {p0, p0, s64, 8},
                                 {s128, p0, s128, 8},
                                 {v8s8, p0, s64, 8},
                                 {v16s8, p0, s128, 8},
                                 {v4s16, p0, s64, 8},
                                 {v8s16, p0, s128, 8},
                                 {v2s32, p0, s64, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      // These extends are also legal
      .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
      .widenScalarToNextPow2(0, /* MinSize = */ 8)
      .lowerIfMemSizeNotPow2()
      .clampScalar(0, s8, s64)
      .narrowScalarIf(
          [=](const LegalityQuery &Query) {
            // Clamp extending load results to 32-bits.
            return Query.Types[0].isScalar() &&
                   Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
                   Query.Types[0].getSizeInBits() > 32;
          },
          changeTo(0, s32))
      // Lower any any-extending loads left into G_ANYEXT and G_LOAD
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeIs(0, v2s16), 0);

  getActionDefinitionsBuilder(G_STORE)
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s8, 8}, // truncstorei8 from s16
                                 {s32, p0, s8, 8}, // truncstorei8 from s32
                                 {s64, p0, s8, 8}, // truncstorei8 from s64
                                 {s16, p0, s16, 8},
                                 {s32, p0, s16, 8}, // truncstorei16 from s32
                                 {s64, p0, s16, 8}, // truncstorei16 from s64
                                 {s32, p0, s8, 8},
                                 {s32, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s64, 8},
                                 {s64, p0, s32, 8}, // truncstorei32 from s64
                                 {p0, p0, s64, 8},
                                 {s128, p0, s128, 8},
                                 {v16s8, p0, s128, 8},
                                 {v8s8, p0, s64, 8},
                                 {v4s16, p0, s64, 8},
                                 {v8s16, p0, s128, 8},
                                 {v2s32, p0, s64, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      .clampScalar(0, s8, s64)
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isScalar() &&
               Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .lowerIfMemSizeNotPow2()
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeIs(0, v2s16), 0);

  // Constants
  getActionDefinitionsBuilder(G_CONSTANT)
      .legalFor({p0, s8, s16, s32, s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64);
  getActionDefinitionsBuilder(G_FCONSTANT)
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[0];
        if (HasFP16 && Ty == s16)
          return true;
        return Ty == s32 || Ty == s64 || Ty == s128;
      })
      .clampScalar(0, MinFPScalar, s128);

  getActionDefinitionsBuilder({G_ICMP, G_FCMP})
      .legalFor({{s32, s32},
                 {s32, s64},
                 {s32, p0},
                 {v4s32, v4s32},
                 {v2s32, v2s32},
                 {v2s64, v2s64},
                 {v2s64, v2p0},
                 {v4s16, v4s16},
                 {v8s16, v8s16},
                 {v8s8, v8s8},
                 {v16s8, v16s8}})
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s32)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
                   Ty.getElementType() != SrcTy.getElementType();
          },
          0, 1)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
          1, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
          s64)
      .clampNumElements(0, v2s32, v4s32);

  // Extensions
  auto ExtLegalFunc = [=](const LegalityQuery &Query) {
    unsigned DstSize = Query.Types[0].getSizeInBits();

    if (DstSize == 128 && !Query.Types[0].isVector())
      return false; // Extending to a scalar s128 needs narrowing.

    // Make sure that we have something that will fit in a register, and
    // make sure it's a power of 2.
    if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
      return false;

    const LLT &SrcTy = Query.Types[1];

    // Special case for s1.
    if (SrcTy == s1)
      return true;

    // Make sure we fit in a register otherwise. Don't bother checking that
    // the source type is below 128 bits. We shouldn't be allowing anything
    // through which is wider than the destination in the first place.
    unsigned SrcSize = SrcTy.getSizeInBits();
    if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
      return false;

    return true;
  };
  getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
      .legalIf(ExtLegalFunc)
      .clampScalar(0, s64, s64); // Just for s128, others are handled above.

  getActionDefinitionsBuilder(G_TRUNC)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
          0, s8)
      .customIf([=](const LegalityQuery &Query) {
        LLT DstTy = Query.Types[0];
        LLT SrcTy = Query.Types[1];
        return DstTy == v8s8 && SrcTy.getSizeInBits() > 128;
      })
      .alwaysLegal();

  getActionDefinitionsBuilder(G_SEXT_INREG).legalFor({s32, s64}).lower();

  // FP conversions
  getActionDefinitionsBuilder(G_FPTRUNC)
      .legalFor(
          {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
      .clampMaxNumElements(0, s32, 2);
  getActionDefinitionsBuilder(G_FPEXT)
      .legalFor(
          {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
      .clampMaxNumElements(0, s64, 2);

  // Conversions
  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s64);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
      .clampScalar(1, s32, s64)
      .minScalarSameAs(1, 0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0);

  // Control-flow
  getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32});
  getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});

  getActionDefinitionsBuilder(G_SELECT)
      .legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
      .lowerIf(isVector(0));

  // Pointer-handling
  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});

  if (TM.getCodeModel() == CodeModel::Small)
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
  else
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
      .legalFor({{v2s64, v2p0}})
      .maxScalar(0, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_INTTOPTR)
      .unsupportedIf([&](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
      })
      .legalFor({{p0, s64}, {v2p0, v2s64}});

  // Casts for 32 and 64-bit width types are just copies.
  // Same for 128-bit width types, except they are on the FPR bank.
  getActionDefinitionsBuilder(G_BITCAST)
      // FIXME: This is wrong since G_BITCAST is not allowed to change the
      // number of bits but it's what the previous code described and fixing
      // it breaks tests.
      .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
                                 v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
                                 v2p0});

  getActionDefinitionsBuilder(G_VASTART).legalFor({p0});

  // va_list must be a pointer, but most sized types are pretty easy to handle
  // as the destination.
  getActionDefinitionsBuilder(G_VAARG)
      .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
      .clampScalar(0, s8, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
      .lowerIf(
          all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, s1),
              typeIs(2, p0)));

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customIf([](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() == 128;
      })
      .clampScalar(0, s32, s64)
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));

  getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
       G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
      .clampScalar(0, s32, s64)
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    getActionDefinitionsBuilder(Op)
        .widenScalarToNextPow2(LitTyIdx, 8)
        .widenScalarToNextPow2(BigTyIdx, 32)
        .clampScalar(LitTyIdx, s8, s64)
        .clampScalar(BigTyIdx, s32, s128)
        .legalIf([=](const LegalityQuery &Q) {
          switch (Q.Types[BigTyIdx].getSizeInBits()) {
          case 32:
          case 64:
          case 128:
            break;
          default:
            return false;
          }
          switch (Q.Types[LitTyIdx].getSizeInBits()) {
          case 8:
          case 16:
          case 32:
          case 64:
            return true;
          default:
            return false;
          }
        });
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
      .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      })
      .minScalar(2, s64)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &VecTy = Query.Types[1];
        return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
               VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
               VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s32 ||
               VecTy == v2p0;
      })
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            // We want to promote <M x s1> to <M x s64> if that wouldn't
            // cause the total vec size to be > 128b.
            return Query.Types[1].getNumElements() <= 2;
          },
          0, s64)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 4;
          },
          0, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 8;
          },
          0, s16)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 16;
          },
          0, s8)
      .minScalarOrElt(0, s8) // Worst case, we need at least s8.
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, p0, 2);

  getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
      .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64}));

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
      .legalFor({{v8s8, s8},
                 {v16s8, s8},
                 {v2s16, s16},
                 {v4s16, s16},
                 {v8s16, s16},
                 {v2s32, s32},
                 {v4s32, s32},
                 {v2p0, p0},
                 {v2s64, s64}})
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarOrElt(0, s8)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();

  getActionDefinitionsBuilder(G_CTLZ)
      .legalForCartesianProduct(
          {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .scalarize(1);
  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();

  // TODO: Custom lowering for v2s32, v4s32, v2s64.
  getActionDefinitionsBuilder(G_BITREVERSE)
      .legalFor({s32, s64, v8s8, v16s8})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();

  // TODO: Handle vector types.
  getActionDefinitionsBuilder(G_CTTZ)
      .clampScalar(0, s32, s64)
      .scalarSameSizeAs(1, 0)
      .customFor({s32, s64});

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &DstTy = Query.Types[0];
        const LLT &SrcTy = Query.Types[1];
        // For now just support the TBL2 variant which needs the source vectors
        // to be the same size as the dest.
        if (DstTy != SrcTy)
          return false;
        for (auto &Ty : {v2s32, v4s32, v2s64, v2p0, v16s8, v8s16}) {
          if (DstTy == Ty)
            return true;
        }
        return false;
      })
      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
      // just want those lowered into G_BUILD_VECTOR
      .lowerIf([=](const LegalityQuery &Query) {
        return !Query.Types[1].isVector();
      })
      .moreElementsToNextPow2(0)
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});

  getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}});

  getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) {
    return Query.Types[0] == p0 && Query.Types[1] == s64;
  });

  getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();

  getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
      .libcall();

  // FIXME: Legal types are only legal with NEON.
  getActionDefinitionsBuilder(G_ABS)
      .lowerIf(isScalar(0))
      .legalFor(PackedVectorAllTypeList);

  getActionDefinitionsBuilder(G_VECREDUCE_FADD)
      // We only have FADDP to do reduction-like operations. Lower the rest.
      .legalFor({{s32, v2s32}, {s64, v2s64}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 2)
      .lower();

  getActionDefinitionsBuilder(G_VECREDUCE_ADD)
      .legalFor(
          {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .lower();

  getActionDefinitionsBuilder(
      {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
      // Try to break down into smaller vectors as long as they're at least 64
      // bits. This lets us use vector operations for some parts of the
      // reduction.
      .fewerElementsIf(
          [=](const LegalityQuery &Q) {
            LLT SrcTy = Q.Types[1];
            if (SrcTy.isScalar())
              return false;
            if (!isPowerOf2_32(SrcTy.getNumElements()))
              return false;
            // We can usually perform 64b vector operations.
            return SrcTy.getSizeInBits() > 64;
          },
          [=](const LegalityQuery &Q) {
            LLT SrcTy = Q.Types[1];
            return std::make_pair(1, SrcTy.divide(2));
          })
      .scalarize(1)
      .lower();

  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });

  getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();

  getActionDefinitionsBuilder(G_ROTR)
      .legalFor({{s32, s64}, {s64, s64}})
      .customIf([=](const LegalityQuery &Q) {
        return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
      })
      .lower();
  getActionDefinitionsBuilder(G_ROTL).lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .customFor({{s32, s32}, {s64, s64}});

  // TODO: Use generic lowering when custom lowering is not possible.
  auto always = [=](const LegalityQuery &Q) { return true; };
  getActionDefinitionsBuilder(G_CTPOP)
      .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
      .clampScalar(0, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalarEltSameAsIf(always, 1, 0)
      .maxScalarEltSameAsIf(always, 1, 0)
      .customFor({{s32, s32},
                  {s64, s64},
                  {s128, s128},
                  {v2s64, v2s64},
                  {v2s32, v2s32},
                  {v4s32, v4s32},
                  {v4s16, v4s16},
                  {v8s16, v8s16}});

  // TODO: Vector types.
  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));

  // TODO: Vector types.
  getActionDefinitionsBuilder({G_FMAXNUM, G_FMINNUM})
      .legalFor({MinFPScalar, s32, s64})
      .libcallFor({s128})
      .minScalar(0, MinFPScalar);

  // TODO: Libcall support for s128.
  // TODO: s16 should be legal with full FP16 support.
  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .legalFor({{s64, s32}, {s64, s64}});

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                          MachineInstr &MI) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;
  switch (MI.getOpcode()) {
  default:
    // No idea what to do.
    return false;
  case TargetOpcode::G_VAARG:
    return legalizeVaArg(MI, MRI, MIRBuilder);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_STORE:
    return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR:
    return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_TRUNC:
    return legalizeVectorTrunc(MI, Helper);
  case TargetOpcode::G_SBFX:
  case TargetOpcode::G_UBFX:
    return legalizeBitfieldExtract(MI, MRI, Helper);
  case TargetOpcode::G_ROTR:
    return legalizeRotate(MI, MRI, Helper);
  case TargetOpcode::G_CTPOP:
    return legalizeCTPOP(MI, MRI, Helper);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpxchg128(MI, MRI, Helper);
  case TargetOpcode::G_CTTZ:
    return legalizeCTTZ(MI, Helper);
  }

  llvm_unreachable("expected switch to return");
}

bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          LegalizerHelper &Helper) const {
  // To allow for imported patterns to match, we ensure that the rotate amount
  // is 64b with an extension.
  Register AmtReg = MI.getOperand(2).getReg();
  LLT AmtTy = MRI.getType(AmtReg);
  (void)AmtTy;
  assert(AmtTy.isScalar() && "Expected a scalar rotate");
  assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
  auto NewAmt = Helper.MIRBuilder.buildSExt(LLT::scalar(64), AmtReg);
  Helper.Observer.changingInstr(MI);
  MI.getOperand(2).setReg(NewAmt.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}

static void extractParts(Register Reg, MachineRegisterInfo &MRI,
                         MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
                         SmallVectorImpl<Register> &VRegs) {
  for (int I = 0; I < NumParts; ++I)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool AArch64LegalizerInfo::legalizeVectorTrunc(
    MachineInstr &MI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectionDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4 x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
         isPowerOf2_32(SrcTy.getSizeInBits()));

  // Split input type.
  LLT SplitSrcTy =
      SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
  // First, split the source into two smaller vectors.
  SmallVector<Register, 2> SplitSrcs;
  extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);

  // Truncate the splits into intermediate narrower elements.
  LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
  for (unsigned I = 0; I < SplitSrcs.size(); ++I)
    SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);

  auto Concat = MIRBuilder.buildConcatVectors(
      DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);

  Helper.Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Concat.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}

bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
  // G_ADD_LOW instructions.
  // By splitting this here, we can optimize accesses in the small code model by
  // folding in the G_ADD_LOW into the load/store offset.
  auto &GlobalOp = MI.getOperand(1);
  const auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return true; // Don't want to modify TLS vars.

  auto &TM = ST->getTargetLowering()->getTargetMachine();
  unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);

  if (OpFlags & AArch64II::MO_GOT)
    return true;

  auto Offset = GlobalOp.getOffset();
  Register DstReg = MI.getOperand(0).getReg();
  auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
                  .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
  // Set the regclass on the dest reg too.
  MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);

  // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
  // by creating a MOVK that sets bits 48-63 of the register to (global address
  // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  // prevent an incorrect tag being generated during relocation when the
  // global appears before the code section. Without the offset, a global at
  // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
  // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
  // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
  // instead of `0xf`.
  // This assumes that we're in the small code model so we can assume a binary
  // size of <= 4GB, which makes the untagged PC relative offset positive. The
  // binary must also be loaded into address range [0, 2^48). Both of these
  // properties need to be ensured at runtime when using tagged addresses.
  if (OpFlags & AArch64II::MO_TAGGED) {
    assert(!Offset &&
           "Should not have folded in an offset for a tagged global!");
    ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
               .addGlobalAddress(GV, 0x100000000,
                                 AArch64II::MO_PREL | AArch64II::MO_G3)
               .addImm(48);
    MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  }

  MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
      .addGlobalAddress(GV, Offset,
                        OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::vacopy: {
    unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
    unsigned VaListSize =
        (ST->isTargetDarwin() || ST->isTargetWindows())
            ? PtrSize
            : ST->isTargetILP32() ? 20 : 32;

    MachineFunction &MF = *MI.getMF();
    auto Val = MF.getRegInfo().createGenericVirtualRegister(
        LLT::scalar(VaListSize * 8));
    MachineIRBuilder MIB(MI);
    MIB.buildLoad(Val, MI.getOperand(2),
                  *MF.getMachineMemOperand(MachinePointerInfo(),
                                           MachineMemOperand::MOLoad,
                                           VaListSize, Align(PtrSize)));
    MIB.buildStore(Val, MI.getOperand(1),
                   *MF.getMachineMemOperand(MachinePointerInfo(),
                                            MachineMemOperand::MOStore,
                                            VaListSize, Align(PtrSize)));
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::get_dynamic_area_offset: {
    MachineIRBuilder &MIB = Helper.MIRBuilder;
    MIB.buildConstant(MI.getOperand(0).getReg(), 0);
    MI.eraseFromParent();
    return true;
  }
  }

  return true;
}

bool AArch64LegalizerInfo::legalizeShlAshrLshr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
         MI.getOpcode() == TargetOpcode::G_LSHR ||
         MI.getOpcode() == TargetOpcode::G_SHL);
  // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
  // imported patterns can select it later. Either way, it will be legal.
  Register AmtReg = MI.getOperand(2).getReg();
  auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
  if (!VRegAndVal)
    return true;
  // Check the shift amount is in range for an immediate form.
  int64_t Amount = VRegAndVal->Value.getSExtValue();
  if (Amount > 31)
    return true; // This will have to remain a register variant.
  auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(ExtCst.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
                                MachineRegisterInfo &MRI) {
  Base = Root;
  Offset = 0;

  Register NewBase;
  int64_t NewOffset;
  if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
      isShiftedInt<7, 3>(NewOffset)) {
    Base = NewBase;
    Offset = NewOffset;
  }
}

// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_STORE ||
         MI.getOpcode() == TargetOpcode::G_LOAD);
  // Here we just try to handle vector loads/stores where our value type might
  // have pointer elements, which the SelectionDAG importer can't handle. To
  // allow the existing patterns for s64 to fire for p0, we just try to bitcast
  // the value to use s64 types.

  // Custom legalization requires that the instruction, if not deleted, be
  // fully legalized. In order to allow further legalization of the inst, we
  // create a new instruction and erase the existing one.

  Register ValReg = MI.getOperand(0).getReg();
  const LLT ValTy = MRI.getType(ValReg);

  if (ValTy == LLT::scalar(128)) {
    assert((*MI.memoperands_begin())->getSuccessOrdering() ==
               AtomicOrdering::Monotonic ||
           (*MI.memoperands_begin())->getSuccessOrdering() ==
               AtomicOrdering::Unordered);
    assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
    LLT s64 = LLT::scalar(64);
    MachineInstrBuilder NewI;
    if (MI.getOpcode() == TargetOpcode::G_LOAD) {
      NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
      MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
    } else {
      auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
      NewI = MIRBuilder.buildInstr(
          AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
    }
    Register Base;
    int Offset;
    matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
    NewI.addUse(Base);
    NewI.addImm(Offset / 8);

    NewI.cloneMemRefs(MI);
    constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
                                     *MRI.getTargetRegisterInfo(),
                                     *ST->getRegBankInfo());
    MI.eraseFromParent();
    return true;
  }

  if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
      ValTy.getElementType().getAddressSpace() != 0) {
    LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
    return false;
  }

  unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  auto &MMO = **MI.memoperands_begin();
  MMO.setType(NewTy);

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
    MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  } else {
    auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
    MIRBuilder.buildBitcast(ValReg, NewLoad);
  }
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  Align Alignment(MI.getOperand(2).getImm());
  Register Dst = MI.getOperand(0).getReg();
  Register ListPtr = MI.getOperand(1).getReg();

  LLT PtrTy = MRI.getType(ListPtr);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
  const Align PtrAlign = Align(PtrSize);
  auto List = MIRBuilder.buildLoad(
      PtrTy, ListPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               PtrTy, PtrAlign));

  MachineInstrBuilder DstPtr;
  if (Alignment > PtrAlign) {
    // Realign the list to the actual required alignment.
    auto AlignMinus1 =
        MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
    auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
    DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
  } else
    DstPtr = List;

  LLT ValTy = MRI.getType(Dst);
  uint64_t ValSize = ValTy.getSizeInBits() / 8;
  MIRBuilder.buildLoad(
      Dst, DstPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               ValTy, std::max(Alignment, PtrAlign)));

  auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));

  auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));

  MIRBuilder.buildStore(NewList, ListPtr,
                        *MF.getMachineMemOperand(MachinePointerInfo(),
                                                 MachineMemOperand::MOStore,
                                                 PtrTy, PtrAlign));

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeBitfieldExtract(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  // Only legal if we can select immediate forms.
  // TODO: Lower this otherwise.
  return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
         getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
}

bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // While there is no integer popcount instruction, it can
  // be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from
  // the AdvSIMD registers are cheap.
  //  FMOV  D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT   V0.8B, V0.8B  // 8xbyte pop-counts
  //  ADDV  B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV  X0, V0.B[0]   // copy byte result back to integer reg
  //
  // For 128 bit vector popcounts, we lower to the following sequence:
  //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.4s v0, v0  //        v4s32, v2s64
  //  uaddlp.2d v0, v0  //               v2s64
  //
  // For 64 bit vector popcounts, we lower to the following sequence:
  //  cnt.8b    v0, v0  // v4s16, v2s32
  //  uaddlp.4h v0, v0  // v4s16, v2s32
  //  uaddlp.2s v0, v0  //        v2s32

  if (!ST->hasNEON() ||
      MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat))
    return false;
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Val);

  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");
  unsigned Size = Ty.getSizeInBits();

  // Pre-conditioning: widen Val up to the nearest vector type.
  // s32,s64,v4s16,v2s32 -> v8i8
  // v8s16,v4s32,v2s64 -> v16i8
  LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  if (Ty.isScalar()) {
    assert((Size == 32 || Size == 64 || Size == 128) &&
           "Expected only 32, 64, or 128 bit scalars!");
    if (Size == 32) {
      Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
    }
  }
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.
  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects =*/false)
               .addUse(HSum);
    HSum = UADD.getReg(0);
  }

  // Post-conditioning.
  if (Ty.isScalar() && (Size == 64 || Size == 128))
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up with
    // the rest of the MIR so we must reassemble the extracted registers into a
    // 128-bit known-regclass one with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
    // can take arbitrary registers so it just has the normal GPR64 operands the
    // rest of AArch64 is expecting.
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
  }

  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  MIRBuilder.buildMerge(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
                                        LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
  // CTTZ(x) is equivalent to CTLZ(BITREVERSE(x)).
  auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
  MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
  MI.eraseFromParent();
  return true;
}