//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64LegalizerInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
#include <initializer_list>

#define DEBUG_TYPE "aarch64-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
    : ST(&ST) {
  using namespace TargetOpcode;
  const LLT p0 = LLT::pointer(0, 64);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s128 = LLT::scalar(128);
  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s8 = LLT::fixed_vector(8, 8);
  const LLT v4s8 = LLT::fixed_vector(4, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s16 = LLT::fixed_vector(4, 16);
  const LLT v2s16 = LLT::fixed_vector(2, 16);
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

  std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                        v16s8, v8s16, v4s32,
                                                        v2s64, v2p0,
                                                        /* End 128bit types */
                                                        /* Begin 64bit types */
                                                        v8s8, v4s16, v2s32};

  const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();

  // FIXME: support subtargets which have neon/fp-armv8 disabled.
  if (!ST.hasNEON() || !ST.hasFPARMv8()) {
    getLegacyLegalizerInfo().computeTables();
    return;
  }

  // Some instructions only support s16 if the subtarget has full 16-bit FP
  // support.
  const bool HasFP16 = ST.hasFullFP16();
  const LLT &MinFPScalar = HasFP16 ?
s16 : s32; 80 81 const bool HasCSSC = ST.hasCSSC(); 82 83 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 84 .legalFor({p0, s8, s16, s32, s64}) 85 .legalFor(PackedVectorAllTypeList) 86 .widenScalarToNextPow2(0) 87 .clampScalar(0, s8, s64) 88 .fewerElementsIf( 89 [=](const LegalityQuery &Query) { 90 return Query.Types[0].isVector() && 91 (Query.Types[0].getElementType() != s64 || 92 Query.Types[0].getNumElements() != 2); 93 }, 94 [=](const LegalityQuery &Query) { 95 LLT EltTy = Query.Types[0].getElementType(); 96 if (EltTy == s64) 97 return std::make_pair(0, LLT::fixed_vector(2, 64)); 98 return std::make_pair(0, EltTy); 99 }); 100 101 getActionDefinitionsBuilder(G_PHI) 102 .legalFor({p0, s16, s32, s64}) 103 .legalFor(PackedVectorAllTypeList) 104 .widenScalarToNextPow2(0) 105 .clampScalar(0, s16, s64) 106 // Maximum: sN * k = 128 107 .clampMaxNumElements(0, s8, 16) 108 .clampMaxNumElements(0, s16, 8) 109 .clampMaxNumElements(0, s32, 4) 110 .clampMaxNumElements(0, s64, 2) 111 .clampMaxNumElements(0, p0, 2); 112 113 getActionDefinitionsBuilder(G_BSWAP) 114 .legalFor({s32, s64, v4s32, v2s32, v2s64}) 115 .widenScalarToNextPow2(0) 116 .clampScalar(0, s32, s64); 117 118 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) 119 .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8}) 120 .scalarizeIf( 121 [=](const LegalityQuery &Query) { 122 return Query.Opcode == G_MUL && Query.Types[0] == v2s64; 123 }, 124 0) 125 .legalFor({v2s64}) 126 .widenScalarToNextPow2(0) 127 .clampScalar(0, s32, s64) 128 .clampNumElements(0, v2s32, v4s32) 129 .clampNumElements(0, v2s64, v2s64) 130 .moreElementsToNextPow2(0); 131 132 getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) 133 .customIf([=](const LegalityQuery &Query) { 134 const auto &SrcTy = Query.Types[0]; 135 const auto &AmtTy = Query.Types[1]; 136 return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && 137 AmtTy.getSizeInBits() == 32; 138 }) 139 .legalFor({ 140 {s32, s32}, 141 {s32, s64}, 142 {s64, s64}, 143 {v8s8, v8s8}, 144 {v16s8, v16s8}, 145 {v4s16, v4s16}, 146 {v8s16, v8s16}, 147 {v2s32, v2s32}, 148 {v4s32, v4s32}, 149 {v2s64, v2s64}, 150 }) 151 .widenScalarToNextPow2(0) 152 .clampScalar(1, s32, s64) 153 .clampScalar(0, s32, s64) 154 .clampNumElements(0, v2s32, v4s32) 155 .clampNumElements(0, v2s64, v2s64) 156 .moreElementsToNextPow2(0) 157 .minScalarSameAs(1, 0); 158 159 getActionDefinitionsBuilder(G_PTR_ADD) 160 .legalFor({{p0, s64}, {v2p0, v2s64}}) 161 .clampScalar(1, s64, s64); 162 163 getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}}); 164 165 getActionDefinitionsBuilder({G_SDIV, G_UDIV}) 166 .legalFor({s32, s64}) 167 .libcallFor({s128}) 168 .clampScalar(0, s32, s64) 169 .widenScalarToNextPow2(0) 170 .scalarize(0); 171 172 getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 173 .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32}) 174 .widenScalarOrEltToNextPow2(0) 175 .clampScalarOrElt(0, s32, s64) 176 .clampNumElements(0, v2s32, v4s32) 177 .clampNumElements(0, v2s64, v2s64) 178 .moreElementsToNextPow2(0); 179 180 181 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 182 .widenScalarToNextPow2(0, /*Min = */ 32) 183 .clampScalar(0, s32, s64) 184 .lower(); 185 186 getActionDefinitionsBuilder({G_SMULH, G_UMULH}) 187 .legalFor({s64, v8s16, v16s8, v4s32}) 188 .lower(); 189 190 auto &MinMaxActions = getActionDefinitionsBuilder( 191 {G_SMIN, G_SMAX, G_UMIN, G_UMAX}); 192 if (HasCSSC) 193 MinMaxActions 194 .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) 195 // Making clamping 
conditional on CSSC extension as without legal types we
        // lower to CMP which can fold one of the two sxtb's we'd otherwise need
        // if we detect a type smaller than 32-bit.
        .minScalar(0, s32);
  else
    MinMaxActions
        .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
  MinMaxActions
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      // FIXME: This shouldn't be needed as v2s64 types are going to
      // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
      .clampNumElements(0, v2s64, v2s64)
      .lower();

  getActionDefinitionsBuilder(
      {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
      .legalFor({{s32, s32}, {s64, s32}})
      .clampScalar(0, s32, s64)
      .clampScalar(1, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
      .legalFor({MinFPScalar, s32, s64, v2s64, v4s32, v2s32})
      .clampScalar(0, MinFPScalar, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});

  getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
                               G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
                               G_FNEARBYINT, G_INTRINSIC_LRINT})
      // If we don't have full FP16 support, then scalarize the elements of
      // vectors containing fp16 types.
      .fewerElementsIf(
          [=, &ST](const LegalityQuery &Query) {
            const auto &Ty = Query.Types[0];
            return Ty.isVector() && Ty.getElementType() == s16 &&
                   !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
      // If we don't have full FP16 support, then widen s16 to s32 if we
      // encounter it.
      .widenScalarIf(
          [=, &ST](const LegalityQuery &Query) {
            return Query.Types[0] == s16 && !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
      .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});

  getActionDefinitionsBuilder(
      {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW})
      // We need a call for these, so we always need to scalarize.
      .scalarize(0)
      // Regardless of FP16 support, widen 16-bit elements to 32-bits.
      .minScalar(0, s32)
      .libcallFor({s32, s64, v2s32, v4s32, v2s64});

  getActionDefinitionsBuilder(G_INSERT)
      .legalIf(all(typeInSet(0, {s32, s64, p0}),
                   typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .minScalar(1, s8)
      .maxScalarIf(typeInSet(0, {s32}), 1, s16)
      .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);

  getActionDefinitionsBuilder(G_EXTRACT)
      .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
                   typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalar(0, s16)
      .maxScalarIf(typeInSet(1, {s32}), 0, s16)
      .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
      .maxScalarIf(typeInSet(1, {s128}), 0, s64);

  for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
    auto &Actions = getActionDefinitionsBuilder(Op);

    if (Op == G_SEXTLOAD)
      Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));

    // Atomics have zero extending behavior.
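    // Illustrative example (not from the original source): an acquire
    // G_SEXTLOAD cannot use the zero-extending atomic loads (LDARB/LDARH)
    // directly, so the lowerIf above turns it into an ordinary atomic load
    // plus an explicit sign extension, roughly:
    //   %v:_(s32) = G_SEXTLOAD %p(p0) :: (load acquire (s8))
    // becomes approximately
    //   %t:_(s32) = G_LOAD %p(p0) :: (load acquire (s8))
    //   %v:_(s32) = G_SEXT_INREG %t, 8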
284 Actions 285 .legalForTypesWithMemDesc({{s32, p0, s8, 8}, 286 {s32, p0, s16, 8}, 287 {s32, p0, s32, 8}, 288 {s64, p0, s8, 2}, 289 {s64, p0, s16, 2}, 290 {s64, p0, s32, 4}, 291 {s64, p0, s64, 8}, 292 {p0, p0, s64, 8}, 293 {v2s32, p0, s64, 8}}) 294 .widenScalarToNextPow2(0) 295 .clampScalar(0, s32, s64) 296 // TODO: We could support sum-of-pow2's but the lowering code doesn't know 297 // how to do that yet. 298 .unsupportedIfMemSizeNotPow2() 299 // Lower anything left over into G_*EXT and G_LOAD 300 .lower(); 301 } 302 303 auto IsPtrVecPred = [=](const LegalityQuery &Query) { 304 const LLT &ValTy = Query.Types[0]; 305 if (!ValTy.isVector()) 306 return false; 307 const LLT EltTy = ValTy.getElementType(); 308 return EltTy.isPointer() && EltTy.getAddressSpace() == 0; 309 }; 310 311 getActionDefinitionsBuilder(G_LOAD) 312 .customIf([=](const LegalityQuery &Query) { 313 return Query.Types[0] == s128 && 314 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; 315 }) 316 .legalForTypesWithMemDesc({{s8, p0, s8, 8}, 317 {s16, p0, s16, 8}, 318 {s32, p0, s32, 8}, 319 {s64, p0, s64, 8}, 320 {p0, p0, s64, 8}, 321 {s128, p0, s128, 8}, 322 {v8s8, p0, s64, 8}, 323 {v16s8, p0, s128, 8}, 324 {v4s16, p0, s64, 8}, 325 {v8s16, p0, s128, 8}, 326 {v2s32, p0, s64, 8}, 327 {v4s32, p0, s128, 8}, 328 {v2s64, p0, s128, 8}}) 329 // These extends are also legal 330 .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}}) 331 .widenScalarToNextPow2(0, /* MinSize = */8) 332 .lowerIfMemSizeNotByteSizePow2() 333 .clampScalar(0, s8, s64) 334 .narrowScalarIf([=](const LegalityQuery &Query) { 335 // Clamp extending load results to 32-bits. 336 return Query.Types[0].isScalar() && 337 Query.Types[0] != Query.MMODescrs[0].MemoryTy && 338 Query.Types[0].getSizeInBits() > 32; 339 }, 340 changeTo(0, s32)) 341 .clampMaxNumElements(0, s8, 16) 342 .clampMaxNumElements(0, s16, 8) 343 .clampMaxNumElements(0, s32, 4) 344 .clampMaxNumElements(0, s64, 2) 345 .clampMaxNumElements(0, p0, 2) 346 .customIf(IsPtrVecPred) 347 .scalarizeIf(typeIs(0, v2s16), 0); 348 349 getActionDefinitionsBuilder(G_STORE) 350 .customIf([=](const LegalityQuery &Query) { 351 return Query.Types[0] == s128 && 352 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; 353 }) 354 .legalForTypesWithMemDesc({{s8, p0, s8, 8}, 355 {s16, p0, s8, 8}, // truncstorei8 from s16 356 {s32, p0, s8, 8}, // truncstorei8 from s32 357 {s64, p0, s8, 8}, // truncstorei8 from s64 358 {s16, p0, s16, 8}, 359 {s32, p0, s16, 8}, // truncstorei16 from s32 360 {s64, p0, s16, 8}, // truncstorei16 from s64 361 {s32, p0, s8, 8}, 362 {s32, p0, s16, 8}, 363 {s32, p0, s32, 8}, 364 {s64, p0, s64, 8}, 365 {s64, p0, s32, 8}, // truncstorei32 from s64 366 {p0, p0, s64, 8}, 367 {s128, p0, s128, 8}, 368 {v16s8, p0, s128, 8}, 369 {v8s8, p0, s64, 8}, 370 {v4s16, p0, s64, 8}, 371 {v8s16, p0, s128, 8}, 372 {v2s32, p0, s64, 8}, 373 {v4s32, p0, s128, 8}, 374 {v2s64, p0, s128, 8}}) 375 .clampScalar(0, s8, s64) 376 .lowerIf([=](const LegalityQuery &Query) { 377 return Query.Types[0].isScalar() && 378 Query.Types[0] != Query.MMODescrs[0].MemoryTy; 379 }) 380 // Maximum: sN * k = 128 381 .clampMaxNumElements(0, s8, 16) 382 .clampMaxNumElements(0, s16, 8) 383 .clampMaxNumElements(0, s32, 4) 384 .clampMaxNumElements(0, s64, 2) 385 .clampMaxNumElements(0, p0, 2) 386 .lowerIfMemSizeNotPow2() 387 .customIf(IsPtrVecPred) 388 .scalarizeIf(typeIs(0, v2s16), 0); 389 390 // Constants 391 getActionDefinitionsBuilder(G_CONSTANT) 392 .legalFor({p0, s8, s16, s32, s64}) 393 .widenScalarToNextPow2(0) 394 
.clampScalar(0, s8, s64); 395 getActionDefinitionsBuilder(G_FCONSTANT) 396 .legalIf([=](const LegalityQuery &Query) { 397 const auto &Ty = Query.Types[0]; 398 if (HasFP16 && Ty == s16) 399 return true; 400 return Ty == s32 || Ty == s64 || Ty == s128; 401 }) 402 .clampScalar(0, MinFPScalar, s128); 403 404 getActionDefinitionsBuilder({G_ICMP, G_FCMP}) 405 .legalFor({{s32, s32}, 406 {s32, s64}, 407 {s32, p0}, 408 {v4s32, v4s32}, 409 {v2s32, v2s32}, 410 {v2s64, v2s64}, 411 {v2s64, v2p0}, 412 {v4s16, v4s16}, 413 {v8s16, v8s16}, 414 {v8s8, v8s8}, 415 {v16s8, v16s8}}) 416 .widenScalarOrEltToNextPow2(1) 417 .clampScalar(1, s32, s64) 418 .clampScalar(0, s32, s32) 419 .minScalarEltSameAsIf( 420 [=](const LegalityQuery &Query) { 421 const LLT &Ty = Query.Types[0]; 422 const LLT &SrcTy = Query.Types[1]; 423 return Ty.isVector() && !SrcTy.getElementType().isPointer() && 424 Ty.getElementType() != SrcTy.getElementType(); 425 }, 426 0, 1) 427 .minScalarOrEltIf( 428 [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; }, 429 1, s32) 430 .minScalarOrEltIf( 431 [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0, 432 s64) 433 .clampNumElements(0, v2s32, v4s32); 434 435 // Extensions 436 auto ExtLegalFunc = [=](const LegalityQuery &Query) { 437 unsigned DstSize = Query.Types[0].getSizeInBits(); 438 439 if (DstSize == 128 && !Query.Types[0].isVector()) 440 return false; // Extending to a scalar s128 needs narrowing. 441 442 // Make sure that we have something that will fit in a register, and 443 // make sure it's a power of 2. 444 if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) 445 return false; 446 447 const LLT &SrcTy = Query.Types[1]; 448 449 // Make sure we fit in a register otherwise. Don't bother checking that 450 // the source type is below 128 bits. We shouldn't be allowing anything 451 // through which is wider than the destination in the first place. 452 unsigned SrcSize = SrcTy.getSizeInBits(); 453 if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) 454 return false; 455 456 return true; 457 }; 458 getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) 459 .legalIf(ExtLegalFunc) 460 .clampScalar(0, s64, s64); // Just for s128, others are handled above. 
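  // Rough worked example of the extension rules above (illustrative):
  //   %a:_(s64)  = G_ZEXT %b:_(s32)   ; accepted by ExtLegalFunc -> legal
  //   %c:_(s128) = G_ZEXT %d:_(s64)   ; scalar s128 destination is rejected,
  //                                   ; so clampScalar narrows the result to
  //                                   ; s64 and the generic legalizer produces
  //                                   ; the remaining pieces.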
461 462 getActionDefinitionsBuilder(G_TRUNC) 463 .minScalarOrEltIf( 464 [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); }, 465 0, s8) 466 .customIf([=](const LegalityQuery &Query) { 467 LLT DstTy = Query.Types[0]; 468 LLT SrcTy = Query.Types[1]; 469 return DstTy == v8s8 && SrcTy.getSizeInBits() > 128; 470 }) 471 .alwaysLegal(); 472 473 getActionDefinitionsBuilder(G_SEXT_INREG) 474 .legalFor({s32, s64}) 475 .legalFor(PackedVectorAllTypeList) 476 .lower(); 477 478 // FP conversions 479 getActionDefinitionsBuilder(G_FPTRUNC) 480 .legalFor( 481 {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}) 482 .clampMaxNumElements(0, s32, 2); 483 getActionDefinitionsBuilder(G_FPEXT) 484 .legalFor( 485 {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}) 486 .clampMaxNumElements(0, s64, 2); 487 488 // Conversions 489 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 490 .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) 491 .widenScalarToNextPow2(0) 492 .clampScalar(0, s32, s64) 493 .widenScalarToNextPow2(1) 494 .clampScalar(1, s32, s64); 495 496 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 497 .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) 498 .clampScalar(1, s32, s64) 499 .minScalarSameAs(1, 0) 500 .clampScalar(0, s32, s64) 501 .widenScalarToNextPow2(0); 502 503 // Control-flow 504 getActionDefinitionsBuilder(G_BRCOND) 505 .legalFor({s32}) 506 .clampScalar(0, s32, s32); 507 getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0}); 508 509 getActionDefinitionsBuilder(G_SELECT) 510 .legalFor({{s32, s32}, {s64, s32}, {p0, s32}}) 511 .widenScalarToNextPow2(0) 512 .clampScalar(0, s32, s64) 513 .clampScalar(1, s32, s32) 514 .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0) 515 .lowerIf(isVector(0)); 516 517 // Pointer-handling 518 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); 519 520 if (TM.getCodeModel() == CodeModel::Small) 521 getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom(); 522 else 523 getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); 524 525 getActionDefinitionsBuilder(G_PTRTOINT) 526 .legalFor({{s64, p0}, {v2s64, v2p0}}) 527 .widenScalarToNextPow2(0, 64) 528 .clampScalar(0, s64, s64); 529 530 getActionDefinitionsBuilder(G_INTTOPTR) 531 .unsupportedIf([&](const LegalityQuery &Query) { 532 return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits(); 533 }) 534 .legalFor({{p0, s64}, {v2p0, v2s64}}); 535 536 // Casts for 32 and 64-bit width type are just copies. 537 // Same for 128-bit width type, except they are on the FPR bank. 538 getActionDefinitionsBuilder(G_BITCAST) 539 // FIXME: This is wrong since G_BITCAST is not allowed to change the 540 // number of bits but it's what the previous code described and fixing 541 // it breaks tests. 542 .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, 543 v8s16, v4s16, v2s16, v4s32, v2s32, v2s64, 544 v2p0}); 545 546 getActionDefinitionsBuilder(G_VASTART).legalFor({p0}); 547 548 // va_list must be a pointer, but most sized types are pretty easy to handle 549 // as the destination. 
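  // As a rough sketch (illustrative) of what the custom lowering in
  // legalizeVaArg() below produces:
  //   %list:_(p0) = G_LOAD %listptr(p0)      ; current va_list cursor
  //   ... optional realignment of %list ...
  //   %val        = G_LOAD %list(p0)         ; the vararg value itself
  //   %next:_(p0) = G_PTR_ADD %list, <slot size>
  //   G_STORE %next:_(p0), %listptr(p0)      ; bump the cursor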
550 getActionDefinitionsBuilder(G_VAARG) 551 .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0}) 552 .clampScalar(0, s8, s64) 553 .widenScalarToNextPow2(0, /*Min*/ 8); 554 555 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 556 .lowerIf( 557 all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0))); 558 559 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 560 .customIf([](const LegalityQuery &Query) { 561 return Query.Types[0].getSizeInBits() == 128; 562 }) 563 .clampScalar(0, s32, s64) 564 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))); 565 566 getActionDefinitionsBuilder( 567 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, 568 G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, 569 G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) 570 .clampScalar(0, s32, s64) 571 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))); 572 573 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0}); 574 575 // Merge/Unmerge 576 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 577 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 578 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 579 getActionDefinitionsBuilder(Op) 580 .widenScalarToNextPow2(LitTyIdx, 8) 581 .widenScalarToNextPow2(BigTyIdx, 32) 582 .clampScalar(LitTyIdx, s8, s64) 583 .clampScalar(BigTyIdx, s32, s128) 584 .legalIf([=](const LegalityQuery &Q) { 585 switch (Q.Types[BigTyIdx].getSizeInBits()) { 586 case 32: 587 case 64: 588 case 128: 589 break; 590 default: 591 return false; 592 } 593 switch (Q.Types[LitTyIdx].getSizeInBits()) { 594 case 8: 595 case 16: 596 case 32: 597 case 64: 598 return true; 599 default: 600 return false; 601 } 602 }); 603 } 604 605 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 606 .unsupportedIf([=](const LegalityQuery &Query) { 607 const LLT &EltTy = Query.Types[1].getElementType(); 608 return Query.Types[0] != EltTy; 609 }) 610 .minScalar(2, s64) 611 .legalIf([=](const LegalityQuery &Query) { 612 const LLT &VecTy = Query.Types[1]; 613 return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 || 614 VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 || 615 VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s32 || 616 VecTy == v2p0; 617 }) 618 .minScalarOrEltIf( 619 [=](const LegalityQuery &Query) { 620 // We want to promote to <M x s1> to <M x s64> if that wouldn't 621 // cause the total vec size to be > 128b. 622 return Query.Types[1].getNumElements() <= 2; 623 }, 624 0, s64) 625 .minScalarOrEltIf( 626 [=](const LegalityQuery &Query) { 627 return Query.Types[1].getNumElements() <= 4; 628 }, 629 0, s32) 630 .minScalarOrEltIf( 631 [=](const LegalityQuery &Query) { 632 return Query.Types[1].getNumElements() <= 8; 633 }, 634 0, s16) 635 .minScalarOrEltIf( 636 [=](const LegalityQuery &Query) { 637 return Query.Types[1].getNumElements() <= 16; 638 }, 639 0, s8) 640 .minScalarOrElt(0, s8) // Worst case, we need at least s8. 
641 .clampMaxNumElements(1, s64, 2) 642 .clampMaxNumElements(1, s32, 4) 643 .clampMaxNumElements(1, s16, 8) 644 .clampMaxNumElements(1, p0, 2); 645 646 getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) 647 .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64})); 648 649 getActionDefinitionsBuilder(G_BUILD_VECTOR) 650 .legalFor({{v8s8, s8}, 651 {v16s8, s8}, 652 {v2s16, s16}, 653 {v4s16, s16}, 654 {v8s16, s16}, 655 {v2s32, s32}, 656 {v4s32, s32}, 657 {v2p0, p0}, 658 {v2s64, s64}}) 659 .clampNumElements(0, v4s32, v4s32) 660 .clampNumElements(0, v2s64, v2s64) 661 .minScalarOrElt(0, s8) 662 .minScalarSameAs(1, 0); 663 664 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower(); 665 666 getActionDefinitionsBuilder(G_CTLZ) 667 .legalForCartesianProduct( 668 {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) 669 .scalarize(1); 670 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower(); 671 672 // TODO: Custom lowering for v2s32, v4s32, v2s64. 673 getActionDefinitionsBuilder(G_BITREVERSE) 674 .legalFor({s32, s64, v8s8, v16s8}) 675 .widenScalarToNextPow2(0, /*Min = */ 32) 676 .clampScalar(0, s32, s64); 677 678 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower(); 679 680 getActionDefinitionsBuilder(G_CTTZ) 681 .lowerIf(isVector(0)) 682 .clampScalar(0, s32, s64) 683 .scalarSameSizeAs(1, 0) 684 .legalIf([=](const LegalityQuery &Query) { 685 return (HasCSSC && typeInSet(0, {s32, s64})(Query)); 686 }) 687 .customIf([=](const LegalityQuery &Query) { 688 return (!HasCSSC && typeInSet(0, {s32, s64})(Query)); 689 }); 690 691 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 692 .legalIf([=](const LegalityQuery &Query) { 693 const LLT &DstTy = Query.Types[0]; 694 const LLT &SrcTy = Query.Types[1]; 695 // For now just support the TBL2 variant which needs the source vectors 696 // to be the same size as the dest. 697 if (DstTy != SrcTy) 698 return false; 699 return llvm::is_contained({v2s32, v4s32, v2s64, v2p0, v16s8, v8s16}, 700 DstTy); 701 }) 702 // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we 703 // just want those lowered into G_BUILD_VECTOR 704 .lowerIf([=](const LegalityQuery &Query) { 705 return !Query.Types[1].isVector(); 706 }) 707 .moreElementsIf( 708 [](const LegalityQuery &Query) { 709 return Query.Types[0].isVector() && Query.Types[1].isVector() && 710 Query.Types[0].getNumElements() > 711 Query.Types[1].getNumElements(); 712 }, 713 changeTo(1, 0)) 714 .moreElementsToNextPow2(0) 715 .clampNumElements(0, v4s32, v4s32) 716 .clampNumElements(0, v2s64, v2s64); 717 718 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 719 .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}}); 720 721 getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}}); 722 723 getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) { 724 return Query.Types[0] == p0 && Query.Types[1] == s64; 725 }); 726 727 getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); 728 729 if (ST.hasMOPS()) { 730 // G_BZERO is not supported. Currently it is only emitted by 731 // PreLegalizerCombiner for G_MEMSET with zero constant. 732 getActionDefinitionsBuilder(G_BZERO).unsupported(); 733 734 getActionDefinitionsBuilder(G_MEMSET) 735 .legalForCartesianProduct({p0}, {s64}, {s64}) 736 .customForCartesianProduct({p0}, {s8}, {s64}) 737 .immIdx(0); // Inform verifier imm idx 0 is handled. 738 739 getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE}) 740 .legalForCartesianProduct({p0}, {p0}, {s64}) 741 .immIdx(0); // Inform verifier imm idx 0 is handled. 
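    // For example (illustrative): a G_MEMSET whose value operand is s8 is
    // marked custom above, and legalizeMemOps() later widens that operand,
    // roughly turning
    //   G_MEMSET %dst(p0), %val:_(s8), %len:_(s64)
    // into
    //   %wide:_(s64) = G_ANYEXT %val:_(s8)
    //   G_MEMSET %dst(p0), %wide:_(s64), %len:_(s64)
    // so the MOPS instructions can take the value in a 64-bit register.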
742 743 // G_MEMCPY_INLINE does not have a tailcall immediate 744 getActionDefinitionsBuilder(G_MEMCPY_INLINE) 745 .legalForCartesianProduct({p0}, {p0}, {s64}); 746 747 } else { 748 getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET}) 749 .libcall(); 750 } 751 752 // FIXME: Legal vector types are only legal with NEON. 753 auto &ABSActions = getActionDefinitionsBuilder(G_ABS); 754 if (HasCSSC) 755 ABSActions 756 .legalFor({s32, s64}); 757 ABSActions 758 .legalFor(PackedVectorAllTypeList) 759 .lowerIf(isScalar(0)); 760 761 getActionDefinitionsBuilder(G_VECREDUCE_FADD) 762 // We only have FADDP to do reduction-like operations. Lower the rest. 763 .legalFor({{s32, v2s32}, {s64, v2s64}}) 764 .clampMaxNumElements(1, s64, 2) 765 .clampMaxNumElements(1, s32, 2) 766 .lower(); 767 768 getActionDefinitionsBuilder(G_VECREDUCE_ADD) 769 .legalFor( 770 {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}}) 771 .clampMaxNumElements(1, s64, 2) 772 .clampMaxNumElements(1, s32, 4) 773 .lower(); 774 775 getActionDefinitionsBuilder( 776 {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR}) 777 // Try to break down into smaller vectors as long as they're at least 64 778 // bits. This lets us use vector operations for some parts of the 779 // reduction. 780 .fewerElementsIf( 781 [=](const LegalityQuery &Q) { 782 LLT SrcTy = Q.Types[1]; 783 if (SrcTy.isScalar()) 784 return false; 785 if (!isPowerOf2_32(SrcTy.getNumElements())) 786 return false; 787 // We can usually perform 64b vector operations. 788 return SrcTy.getSizeInBits() > 64; 789 }, 790 [=](const LegalityQuery &Q) { 791 LLT SrcTy = Q.Types[1]; 792 return std::make_pair(1, SrcTy.divide(2)); 793 }) 794 .scalarize(1) 795 .lower(); 796 797 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 798 .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); }); 799 800 getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower(); 801 802 getActionDefinitionsBuilder(G_ROTR) 803 .legalFor({{s32, s64}, {s64, s64}}) 804 .customIf([=](const LegalityQuery &Q) { 805 return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64; 806 }) 807 .lower(); 808 getActionDefinitionsBuilder(G_ROTL).lower(); 809 810 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 811 .customFor({{s32, s32}, {s64, s64}}); 812 813 auto always = [=](const LegalityQuery &Q) { return true; }; 814 auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP); 815 if (HasCSSC) 816 CTPOPActions 817 .legalFor({{s32, s32}, 818 {s64, s64}, 819 {v8s8, v8s8}, 820 {v16s8, v16s8}}) 821 .customFor({{s128, s128}, 822 {v2s64, v2s64}, 823 {v2s32, v2s32}, 824 {v4s32, v4s32}, 825 {v4s16, v4s16}, 826 {v8s16, v8s16}}); 827 else 828 CTPOPActions 829 .legalFor({{v8s8, v8s8}, 830 {v16s8, v16s8}}) 831 .customFor({{s32, s32}, 832 {s64, s64}, 833 {s128, s128}, 834 {v2s64, v2s64}, 835 {v2s32, v2s32}, 836 {v4s32, v4s32}, 837 {v4s16, v4s16}, 838 {v8s16, v8s16}}); 839 CTPOPActions 840 .clampScalar(0, s32, s128) 841 .widenScalarToNextPow2(0) 842 .minScalarEltSameAsIf(always, 1, 0) 843 .maxScalarEltSameAsIf(always, 1, 0); 844 845 // TODO: Vector types. 846 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0)); 847 848 // TODO: Vector types. 
849 getActionDefinitionsBuilder({G_FMAXNUM, G_FMINNUM}) 850 .legalFor({MinFPScalar, s32, s64}) 851 .libcallFor({s128}) 852 .minScalar(0, MinFPScalar); 853 854 getActionDefinitionsBuilder({G_FMAXIMUM, G_FMINIMUM}) 855 .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64}) 856 .legalIf([=](const LegalityQuery &Query) { 857 const auto &Ty = Query.Types[0]; 858 return (Ty == v8s16 || Ty == v4s16) && HasFP16; 859 }) 860 .minScalar(0, MinFPScalar) 861 .clampNumElements(0, v4s16, v8s16) 862 .clampNumElements(0, v2s32, v4s32) 863 .clampNumElements(0, v2s64, v2s64); 864 865 // TODO: Libcall support for s128. 866 // TODO: s16 should be legal with full FP16 support. 867 getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) 868 .legalFor({{s64, s32}, {s64, s64}}); 869 870 // TODO: Custom legalization for vector types. 871 // TODO: Custom legalization for mismatched types. 872 // TODO: s16 support. 873 getActionDefinitionsBuilder(G_FCOPYSIGN).customFor({{s32, s32}, {s64, s64}}); 874 875 getActionDefinitionsBuilder(G_FMAD).lower(); 876 877 getLegacyLegalizerInfo().computeTables(); 878 verify(*ST.getInstrInfo()); 879 } 880 881 bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 882 MachineInstr &MI) const { 883 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 884 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 885 GISelChangeObserver &Observer = Helper.Observer; 886 switch (MI.getOpcode()) { 887 default: 888 // No idea what to do. 889 return false; 890 case TargetOpcode::G_VAARG: 891 return legalizeVaArg(MI, MRI, MIRBuilder); 892 case TargetOpcode::G_LOAD: 893 case TargetOpcode::G_STORE: 894 return legalizeLoadStore(MI, MRI, MIRBuilder, Observer); 895 case TargetOpcode::G_SHL: 896 case TargetOpcode::G_ASHR: 897 case TargetOpcode::G_LSHR: 898 return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer); 899 case TargetOpcode::G_GLOBAL_VALUE: 900 return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer); 901 case TargetOpcode::G_TRUNC: 902 return legalizeVectorTrunc(MI, Helper); 903 case TargetOpcode::G_SBFX: 904 case TargetOpcode::G_UBFX: 905 return legalizeBitfieldExtract(MI, MRI, Helper); 906 case TargetOpcode::G_ROTR: 907 return legalizeRotate(MI, MRI, Helper); 908 case TargetOpcode::G_CTPOP: 909 return legalizeCTPOP(MI, MRI, Helper); 910 case TargetOpcode::G_ATOMIC_CMPXCHG: 911 return legalizeAtomicCmpxchg128(MI, MRI, Helper); 912 case TargetOpcode::G_CTTZ: 913 return legalizeCTTZ(MI, Helper); 914 case TargetOpcode::G_BZERO: 915 case TargetOpcode::G_MEMCPY: 916 case TargetOpcode::G_MEMMOVE: 917 case TargetOpcode::G_MEMSET: 918 return legalizeMemOps(MI, Helper); 919 case TargetOpcode::G_FCOPYSIGN: 920 return legalizeFCopySign(MI, Helper); 921 } 922 923 llvm_unreachable("expected switch to return"); 924 } 925 926 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI, 927 MachineRegisterInfo &MRI, 928 LegalizerHelper &Helper) const { 929 // To allow for imported patterns to match, we ensure that the rotate amount 930 // is 64b with an extension. 
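  // Roughly (illustrative):
  //   %r:_(s32) = G_ROTR %x:_(s32), %amt:_(s32)
  // becomes
  //   %amt64:_(s64) = G_ZEXT %amt:_(s32)
  //   %r:_(s32) = G_ROTR %x:_(s32), %amt64:_(s64)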
  Register AmtReg = MI.getOperand(2).getReg();
  LLT AmtTy = MRI.getType(AmtReg);
  (void)AmtTy;
  assert(AmtTy.isScalar() && "Expected a scalar rotate");
  assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
  auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
  Helper.Observer.changingInstr(MI);
  MI.getOperand(2).setReg(NewAmt.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}

static void extractParts(Register Reg, MachineRegisterInfo &MRI,
                         MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
                         SmallVectorImpl<Register> &VRegs) {
  for (int I = 0; I < NumParts; ++I)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool AArch64LegalizerInfo::legalizeVectorTrunc(
    MachineInstr &MI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectionDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
         isPowerOf2_32(SrcTy.getSizeInBits()));

  // Split input type.
  LLT SplitSrcTy =
      SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
  // First, split the source into two smaller vectors.
  SmallVector<Register, 2> SplitSrcs;
  extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);

  // Truncate the splits into intermediate narrower elements.
  LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
  for (unsigned I = 0; I < SplitSrcs.size(); ++I)
    SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);

  auto Concat = MIRBuilder.buildConcatVectors(
      DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);

  Helper.Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Concat.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}

bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
  // G_ADD_LOW instructions.
  // By splitting this here, we can optimize accesses in the small code model by
  // folding the G_ADD_LOW into the load/store offset.
  auto &GlobalOp = MI.getOperand(1);
  const auto* GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return true; // Don't want to modify TLS vars.
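  // The non-GOT path below splits the address computation in two, roughly
  // (illustrative):
  //   %g:_(p0) = G_GLOBAL_VALUE @var
  // becomes
  //   %page:_(p0) = ADRP @var              ; 4KB page of the global
  //   %g:_(p0)    = G_ADD_LOW %page, @var  ; low 12 bits of the address
  // so that later combines can fold the G_ADD_LOW into load/store addressing
  // modes.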

  auto &TM = ST->getTargetLowering()->getTargetMachine();
  unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);

  if (OpFlags & AArch64II::MO_GOT)
    return true;

  auto Offset = GlobalOp.getOffset();
  Register DstReg = MI.getOperand(0).getReg();
  auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
                  .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
  // Set the regclass on the dest reg too.
  MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);

  // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
  // by creating a MOVK that sets bits 48-63 of the register to (global address
  // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  // prevent an incorrect tag being generated during relocation when the
  // global appears before the code section. Without the offset, a global at
  // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
  // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
  // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
  // instead of `0xf`.
  // This assumes that we're in the small code model so we can assume a binary
  // size of <= 4GB, which makes the untagged PC relative offset positive. The
  // binary must also be loaded into address range [0, 2^48). Both of these
  // properties need to be ensured at runtime when using tagged addresses.
  if (OpFlags & AArch64II::MO_TAGGED) {
    assert(!Offset &&
           "Should not have folded in an offset for a tagged global!");
    ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
               .addGlobalAddress(GV, 0x100000000,
                                 AArch64II::MO_PREL | AArch64II::MO_G3)
               .addImm(48);
    MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  }

  MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
      .addGlobalAddress(GV, Offset,
                        OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::vacopy: {
    unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
    unsigned VaListSize =
        (ST->isTargetDarwin() || ST->isTargetWindows())
            ? PtrSize
            : ST->isTargetILP32() ?
20 : 32; 1056 1057 MachineFunction &MF = *MI.getMF(); 1058 auto Val = MF.getRegInfo().createGenericVirtualRegister( 1059 LLT::scalar(VaListSize * 8)); 1060 MachineIRBuilder MIB(MI); 1061 MIB.buildLoad(Val, MI.getOperand(2), 1062 *MF.getMachineMemOperand(MachinePointerInfo(), 1063 MachineMemOperand::MOLoad, 1064 VaListSize, Align(PtrSize))); 1065 MIB.buildStore(Val, MI.getOperand(1), 1066 *MF.getMachineMemOperand(MachinePointerInfo(), 1067 MachineMemOperand::MOStore, 1068 VaListSize, Align(PtrSize))); 1069 MI.eraseFromParent(); 1070 return true; 1071 } 1072 case Intrinsic::get_dynamic_area_offset: { 1073 MachineIRBuilder &MIB = Helper.MIRBuilder; 1074 MIB.buildConstant(MI.getOperand(0).getReg(), 0); 1075 MI.eraseFromParent(); 1076 return true; 1077 } 1078 case Intrinsic::aarch64_mops_memset_tag: { 1079 assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 1080 // Zext the value to 64 bit 1081 MachineIRBuilder MIB(MI); 1082 auto &Value = MI.getOperand(3); 1083 Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0); 1084 Value.setReg(ZExtValueReg); 1085 return true; 1086 } 1087 case Intrinsic::prefetch: { 1088 MachineIRBuilder MIB(MI); 1089 auto &AddrVal = MI.getOperand(1); 1090 1091 int64_t IsWrite = MI.getOperand(2).getImm(); 1092 int64_t Locality = MI.getOperand(3).getImm(); 1093 int64_t IsData = MI.getOperand(4).getImm(); 1094 1095 bool IsStream = Locality == 0; 1096 if (Locality != 0) { 1097 assert(Locality <= 3 && "Prefetch locality out-of-range"); 1098 // The locality degree is the opposite of the cache speed. 1099 // Put the number the other way around. 1100 // The encoding starts at 0 for level 1 1101 Locality = 3 - Locality; 1102 } 1103 1104 unsigned PrfOp = 1105 (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream; 1106 1107 MIB.buildInstr(AArch64::G_PREFETCH).addImm(PrfOp).add(AddrVal); 1108 MI.eraseFromParent(); 1109 return true; 1110 } 1111 case Intrinsic::aarch64_prefetch: { 1112 MachineIRBuilder MIB(MI); 1113 auto &AddrVal = MI.getOperand(1); 1114 1115 int64_t IsWrite = MI.getOperand(2).getImm(); 1116 int64_t Target = MI.getOperand(3).getImm(); 1117 int64_t IsStream = MI.getOperand(4).getImm(); 1118 int64_t IsData = MI.getOperand(5).getImm(); 1119 1120 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 1121 (!IsData << 3) | // IsDataCache bit 1122 (Target << 1) | // Cache level bits 1123 (unsigned)IsStream; // Stream bit 1124 1125 MIB.buildInstr(AArch64::G_PREFETCH).addImm(PrfOp).add(AddrVal); 1126 MI.eraseFromParent(); 1127 return true; 1128 } 1129 } 1130 1131 return true; 1132 } 1133 1134 bool AArch64LegalizerInfo::legalizeShlAshrLshr( 1135 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, 1136 GISelChangeObserver &Observer) const { 1137 assert(MI.getOpcode() == TargetOpcode::G_ASHR || 1138 MI.getOpcode() == TargetOpcode::G_LSHR || 1139 MI.getOpcode() == TargetOpcode::G_SHL); 1140 // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the 1141 // imported patterns can select it later. Either way, it will be legal. 1142 Register AmtReg = MI.getOperand(2).getReg(); 1143 auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI); 1144 if (!VRegAndVal) 1145 return true; 1146 // Check the shift amount is in range for an immediate form. 1147 int64_t Amount = VRegAndVal->Value.getSExtValue(); 1148 if (Amount > 31) 1149 return true; // This will have to remain a register variant. 
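  // Otherwise rebuild the amount as a 64-bit constant so the imported
  // immediate-form patterns can match, e.g. (illustrative):
  //   %d:_(s32) = G_SHL %x:_(s32), %c:_(s32)   ; %c = G_CONSTANT i32 3
  // becomes
  //   %c64:_(s64) = G_CONSTANT i64 3
  //   %d:_(s32)   = G_SHL %x:_(s32), %c64:_(s64)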
1150 auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount); 1151 Observer.changingInstr(MI); 1152 MI.getOperand(2).setReg(ExtCst.getReg(0)); 1153 Observer.changedInstr(MI); 1154 return true; 1155 } 1156 1157 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset, 1158 MachineRegisterInfo &MRI) { 1159 Base = Root; 1160 Offset = 0; 1161 1162 Register NewBase; 1163 int64_t NewOffset; 1164 if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) && 1165 isShiftedInt<7, 3>(NewOffset)) { 1166 Base = NewBase; 1167 Offset = NewOffset; 1168 } 1169 } 1170 1171 // FIXME: This should be removed and replaced with the generic bitcast legalize 1172 // action. 1173 bool AArch64LegalizerInfo::legalizeLoadStore( 1174 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, 1175 GISelChangeObserver &Observer) const { 1176 assert(MI.getOpcode() == TargetOpcode::G_STORE || 1177 MI.getOpcode() == TargetOpcode::G_LOAD); 1178 // Here we just try to handle vector loads/stores where our value type might 1179 // have pointer elements, which the SelectionDAG importer can't handle. To 1180 // allow the existing patterns for s64 to fire for p0, we just try to bitcast 1181 // the value to use s64 types. 1182 1183 // Custom legalization requires the instruction, if not deleted, must be fully 1184 // legalized. In order to allow further legalization of the inst, we create 1185 // a new instruction and erase the existing one. 1186 1187 Register ValReg = MI.getOperand(0).getReg(); 1188 const LLT ValTy = MRI.getType(ValReg); 1189 1190 if (ValTy == LLT::scalar(128)) { 1191 assert((*MI.memoperands_begin())->getSuccessOrdering() == 1192 AtomicOrdering::Monotonic || 1193 (*MI.memoperands_begin())->getSuccessOrdering() == 1194 AtomicOrdering::Unordered); 1195 assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2"); 1196 LLT s64 = LLT::scalar(64); 1197 MachineInstrBuilder NewI; 1198 if (MI.getOpcode() == TargetOpcode::G_LOAD) { 1199 NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {}); 1200 MIRBuilder.buildMergeLikeInstr( 1201 ValReg, {NewI->getOperand(0), NewI->getOperand(1)}); 1202 } else { 1203 auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0)); 1204 NewI = MIRBuilder.buildInstr( 1205 AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)}); 1206 } 1207 Register Base; 1208 int Offset; 1209 matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI); 1210 NewI.addUse(Base); 1211 NewI.addImm(Offset / 8); 1212 1213 NewI.cloneMemRefs(MI); 1214 constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(), 1215 *MRI.getTargetRegisterInfo(), 1216 *ST->getRegBankInfo()); 1217 MI.eraseFromParent(); 1218 return true; 1219 } 1220 1221 if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || 1222 ValTy.getElementType().getAddressSpace() != 0) { 1223 LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store"); 1224 return false; 1225 } 1226 1227 unsigned PtrSize = ValTy.getElementType().getSizeInBits(); 1228 const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize); 1229 auto &MMO = **MI.memoperands_begin(); 1230 MMO.setType(NewTy); 1231 1232 if (MI.getOpcode() == TargetOpcode::G_STORE) { 1233 auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg); 1234 MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO); 1235 } else { 1236 auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO); 1237 MIRBuilder.buildBitcast(ValReg, NewLoad); 1238 } 1239 MI.eraseFromParent(); 1240 return true; 1241 } 
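// For the pointer-vector case handled above, the net effect is approximately
// (illustrative):
//   G_STORE %v:_(<2 x p0>), %addr(p0) :: (store (<2 x p0>))
// becomes
//   %cast:_(<2 x s64>) = G_BITCAST %v:_(<2 x p0>)
//   G_STORE %cast:_(<2 x s64>), %addr(p0) :: (store (<2 x s64>))
// (with the MMO retyped to match), which lets the existing s64 vector
// patterns select the store.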
1242 1243 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, 1244 MachineRegisterInfo &MRI, 1245 MachineIRBuilder &MIRBuilder) const { 1246 MachineFunction &MF = MIRBuilder.getMF(); 1247 Align Alignment(MI.getOperand(2).getImm()); 1248 Register Dst = MI.getOperand(0).getReg(); 1249 Register ListPtr = MI.getOperand(1).getReg(); 1250 1251 LLT PtrTy = MRI.getType(ListPtr); 1252 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); 1253 1254 const unsigned PtrSize = PtrTy.getSizeInBits() / 8; 1255 const Align PtrAlign = Align(PtrSize); 1256 auto List = MIRBuilder.buildLoad( 1257 PtrTy, ListPtr, 1258 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, 1259 PtrTy, PtrAlign)); 1260 1261 MachineInstrBuilder DstPtr; 1262 if (Alignment > PtrAlign) { 1263 // Realign the list to the actual required alignment. 1264 auto AlignMinus1 = 1265 MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1); 1266 auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0)); 1267 DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment)); 1268 } else 1269 DstPtr = List; 1270 1271 LLT ValTy = MRI.getType(Dst); 1272 uint64_t ValSize = ValTy.getSizeInBits() / 8; 1273 MIRBuilder.buildLoad( 1274 Dst, DstPtr, 1275 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, 1276 ValTy, std::max(Alignment, PtrAlign))); 1277 1278 auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign)); 1279 1280 auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0)); 1281 1282 MIRBuilder.buildStore(NewList, ListPtr, 1283 *MF.getMachineMemOperand(MachinePointerInfo(), 1284 MachineMemOperand::MOStore, 1285 PtrTy, PtrAlign)); 1286 1287 MI.eraseFromParent(); 1288 return true; 1289 } 1290 1291 bool AArch64LegalizerInfo::legalizeBitfieldExtract( 1292 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const { 1293 // Only legal if we can select immediate forms. 1294 // TODO: Lower this otherwise. 1295 return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) && 1296 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 1297 } 1298 1299 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI, 1300 MachineRegisterInfo &MRI, 1301 LegalizerHelper &Helper) const { 1302 // When there is no integer popcount instruction (FEAT_CSSC isn't available), 1303 // it can be more efficiently lowered to the following sequence that uses 1304 // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD 1305 // registers are cheap. 
1306 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 1307 // CNT V0.8B, V0.8B // 8xbyte pop-counts 1308 // ADDV B0, V0.8B // sum 8xbyte pop-counts 1309 // UMOV X0, V0.B[0] // copy byte result back to integer reg 1310 // 1311 // For 128 bit vector popcounts, we lower to the following sequence: 1312 // cnt.16b v0, v0 // v8s16, v4s32, v2s64 1313 // uaddlp.8h v0, v0 // v8s16, v4s32, v2s64 1314 // uaddlp.4s v0, v0 // v4s32, v2s64 1315 // uaddlp.2d v0, v0 // v2s64 1316 // 1317 // For 64 bit vector popcounts, we lower to the following sequence: 1318 // cnt.8b v0, v0 // v4s16, v2s32 1319 // uaddlp.4h v0, v0 // v4s16, v2s32 1320 // uaddlp.2s v0, v0 // v2s32 1321 1322 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1323 Register Dst = MI.getOperand(0).getReg(); 1324 Register Val = MI.getOperand(1).getReg(); 1325 LLT Ty = MRI.getType(Val); 1326 unsigned Size = Ty.getSizeInBits(); 1327 1328 assert(Ty == MRI.getType(Dst) && 1329 "Expected src and dst to have the same type!"); 1330 1331 if (ST->hasCSSC() && Ty.isScalar() && Size == 128) { 1332 LLT s64 = LLT::scalar(64); 1333 1334 auto Split = MIRBuilder.buildUnmerge(s64, Val); 1335 auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0)); 1336 auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1)); 1337 auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2); 1338 1339 MIRBuilder.buildZExt(Dst, Add); 1340 MI.eraseFromParent(); 1341 return true; 1342 } 1343 1344 if (!ST->hasNEON() || 1345 MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) { 1346 // Use generic lowering when custom lowering is not possible. 1347 return Ty.isScalar() && (Size == 32 || Size == 64) && 1348 Helper.lowerBitCount(MI) == 1349 LegalizerHelper::LegalizeResult::Legalized; 1350 } 1351 1352 // Pre-conditioning: widen Val up to the nearest vector type. 1353 // s32,s64,v4s16,v2s32 -> v8i8 1354 // v8s16,v4s32,v2s64 -> v16i8 1355 LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8); 1356 if (Ty.isScalar()) { 1357 assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!"); 1358 if (Size == 32) { 1359 Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0); 1360 } 1361 } 1362 Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0); 1363 1364 // Count bits in each byte-sized lane. 1365 auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val); 1366 1367 // Sum across lanes. 
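  // For scalar results a single UADDLV reduction over the byte lanes is used;
  // for vector results we chain UADDLP widening steps, e.g. for a v4s32
  // result (illustrative): 16 x s8 -> 8 x s16 -> 4 x s32.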
1368 Register HSum = CTPOP.getReg(0); 1369 unsigned Opc; 1370 SmallVector<LLT> HAddTys; 1371 if (Ty.isScalar()) { 1372 Opc = Intrinsic::aarch64_neon_uaddlv; 1373 HAddTys.push_back(LLT::scalar(32)); 1374 } else if (Ty == LLT::fixed_vector(8, 16)) { 1375 Opc = Intrinsic::aarch64_neon_uaddlp; 1376 HAddTys.push_back(LLT::fixed_vector(8, 16)); 1377 } else if (Ty == LLT::fixed_vector(4, 32)) { 1378 Opc = Intrinsic::aarch64_neon_uaddlp; 1379 HAddTys.push_back(LLT::fixed_vector(8, 16)); 1380 HAddTys.push_back(LLT::fixed_vector(4, 32)); 1381 } else if (Ty == LLT::fixed_vector(2, 64)) { 1382 Opc = Intrinsic::aarch64_neon_uaddlp; 1383 HAddTys.push_back(LLT::fixed_vector(8, 16)); 1384 HAddTys.push_back(LLT::fixed_vector(4, 32)); 1385 HAddTys.push_back(LLT::fixed_vector(2, 64)); 1386 } else if (Ty == LLT::fixed_vector(4, 16)) { 1387 Opc = Intrinsic::aarch64_neon_uaddlp; 1388 HAddTys.push_back(LLT::fixed_vector(4, 16)); 1389 } else if (Ty == LLT::fixed_vector(2, 32)) { 1390 Opc = Intrinsic::aarch64_neon_uaddlp; 1391 HAddTys.push_back(LLT::fixed_vector(4, 16)); 1392 HAddTys.push_back(LLT::fixed_vector(2, 32)); 1393 } else 1394 llvm_unreachable("unexpected vector shape"); 1395 MachineInstrBuilder UADD; 1396 for (LLT HTy : HAddTys) { 1397 UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects =*/false) 1398 .addUse(HSum); 1399 HSum = UADD.getReg(0); 1400 } 1401 1402 // Post-conditioning. 1403 if (Ty.isScalar() && (Size == 64 || Size == 128)) 1404 MIRBuilder.buildZExt(Dst, UADD); 1405 else 1406 UADD->getOperand(0).setReg(Dst); 1407 MI.eraseFromParent(); 1408 return true; 1409 } 1410 1411 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128( 1412 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const { 1413 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1414 LLT s64 = LLT::scalar(64); 1415 auto Addr = MI.getOperand(1).getReg(); 1416 auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2)); 1417 auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3)); 1418 auto DstLo = MRI.createGenericVirtualRegister(s64); 1419 auto DstHi = MRI.createGenericVirtualRegister(s64); 1420 1421 MachineInstrBuilder CAS; 1422 if (ST->hasLSE()) { 1423 // We have 128-bit CASP instructions taking XSeqPair registers, which are 1424 // s128. We need the merge/unmerge to bracket the expansion and pair up with 1425 // the rest of the MIR so we must reassemble the extracted registers into a 1426 // 128-bit known-regclass one with code like this: 1427 // 1428 // %in1 = REG_SEQUENCE Lo, Hi ; One for each input 1429 // %out = CASP %in1, ... 
1430 // %OldLo = G_EXTRACT %out, 0 1431 // %OldHi = G_EXTRACT %out, 64 1432 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering(); 1433 unsigned Opcode; 1434 switch (Ordering) { 1435 case AtomicOrdering::Acquire: 1436 Opcode = AArch64::CASPAX; 1437 break; 1438 case AtomicOrdering::Release: 1439 Opcode = AArch64::CASPLX; 1440 break; 1441 case AtomicOrdering::AcquireRelease: 1442 case AtomicOrdering::SequentiallyConsistent: 1443 Opcode = AArch64::CASPALX; 1444 break; 1445 default: 1446 Opcode = AArch64::CASPX; 1447 break; 1448 } 1449 1450 LLT s128 = LLT::scalar(128); 1451 auto CASDst = MRI.createGenericVirtualRegister(s128); 1452 auto CASDesired = MRI.createGenericVirtualRegister(s128); 1453 auto CASNew = MRI.createGenericVirtualRegister(s128); 1454 MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {}) 1455 .addUse(DesiredI->getOperand(0).getReg()) 1456 .addImm(AArch64::sube64) 1457 .addUse(DesiredI->getOperand(1).getReg()) 1458 .addImm(AArch64::subo64); 1459 MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {}) 1460 .addUse(NewI->getOperand(0).getReg()) 1461 .addImm(AArch64::sube64) 1462 .addUse(NewI->getOperand(1).getReg()) 1463 .addImm(AArch64::subo64); 1464 1465 CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr}); 1466 1467 MIRBuilder.buildExtract({DstLo}, {CASDst}, 0); 1468 MIRBuilder.buildExtract({DstHi}, {CASDst}, 64); 1469 } else { 1470 // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP 1471 // can take arbitrary registers so it just has the normal GPR64 operands the 1472 // rest of AArch64 is expecting. 1473 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering(); 1474 unsigned Opcode; 1475 switch (Ordering) { 1476 case AtomicOrdering::Acquire: 1477 Opcode = AArch64::CMP_SWAP_128_ACQUIRE; 1478 break; 1479 case AtomicOrdering::Release: 1480 Opcode = AArch64::CMP_SWAP_128_RELEASE; 1481 break; 1482 case AtomicOrdering::AcquireRelease: 1483 case AtomicOrdering::SequentiallyConsistent: 1484 Opcode = AArch64::CMP_SWAP_128; 1485 break; 1486 default: 1487 Opcode = AArch64::CMP_SWAP_128_MONOTONIC; 1488 break; 1489 } 1490 1491 auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 1492 CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch}, 1493 {Addr, DesiredI->getOperand(0), 1494 DesiredI->getOperand(1), NewI->getOperand(0), 1495 NewI->getOperand(1)}); 1496 } 1497 1498 CAS.cloneMemRefs(MI); 1499 constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(), 1500 *MRI.getTargetRegisterInfo(), 1501 *ST->getRegBankInfo()); 1502 1503 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi}); 1504 MI.eraseFromParent(); 1505 return true; 1506 } 1507 1508 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI, 1509 LegalizerHelper &Helper) const { 1510 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1511 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 1512 LLT Ty = MRI.getType(MI.getOperand(1).getReg()); 1513 auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1)); 1514 MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse); 1515 MI.eraseFromParent(); 1516 return true; 1517 } 1518 1519 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI, 1520 LegalizerHelper &Helper) const { 1521 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1522 1523 // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic 1524 if (MI.getOpcode() == TargetOpcode::G_MEMSET) { 1525 // Zext the value operand to 64 bit 1526 auto &Value = MI.getOperand(1); 1527 Register 
ZExtValueReg = 1528 MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0); 1529 Value.setReg(ZExtValueReg); 1530 return true; 1531 } 1532 1533 return false; 1534 } 1535 1536 bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI, 1537 LegalizerHelper &Helper) const { 1538 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1539 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 1540 Register Dst = MI.getOperand(0).getReg(); 1541 LLT DstTy = MRI.getType(Dst); 1542 assert(DstTy.isScalar() && "Only expected scalars right now!"); 1543 const unsigned DstSize = DstTy.getSizeInBits(); 1544 assert((DstSize == 32 || DstSize == 64) && "Unexpected dst type!"); 1545 assert(MRI.getType(MI.getOperand(2).getReg()) == DstTy && 1546 "Expected homogeneous types!"); 1547 1548 // We want to materialize a mask with the high bit set. 1549 uint64_t EltMask; 1550 LLT VecTy; 1551 1552 // TODO: s16 support. 1553 switch (DstSize) { 1554 default: 1555 llvm_unreachable("Unexpected type for G_FCOPYSIGN!"); 1556 case 64: { 1557 // AdvSIMD immediate moves cannot materialize out mask in a single 1558 // instruction for 64-bit elements. Instead, materialize zero and then 1559 // negate it. 1560 EltMask = 0; 1561 VecTy = LLT::fixed_vector(2, DstTy); 1562 break; 1563 } 1564 case 32: 1565 EltMask = 0x80000000ULL; 1566 VecTy = LLT::fixed_vector(4, DstTy); 1567 break; 1568 } 1569 1570 // Widen In1 and In2 to 128 bits. We want these to eventually become 1571 // INSERT_SUBREGs. 1572 auto Undef = MIRBuilder.buildUndef(VecTy); 1573 auto Zero = MIRBuilder.buildConstant(DstTy, 0); 1574 auto Ins1 = MIRBuilder.buildInsertVectorElement( 1575 VecTy, Undef, MI.getOperand(1).getReg(), Zero); 1576 auto Ins2 = MIRBuilder.buildInsertVectorElement( 1577 VecTy, Undef, MI.getOperand(2).getReg(), Zero); 1578 1579 // Construct the mask. 1580 auto Mask = MIRBuilder.buildConstant(VecTy, EltMask); 1581 if (DstSize == 64) 1582 Mask = MIRBuilder.buildFNeg(VecTy, Mask); 1583 1584 auto Sel = MIRBuilder.buildInstr(AArch64::G_BIT, {VecTy}, {Ins1, Ins2, Mask}); 1585 1586 // Build an unmerge whose 0th elt is the original G_FCOPYSIGN destination. We 1587 // want this to eventually become an EXTRACT_SUBREG. 1588 SmallVector<Register, 2> DstRegs(1, Dst); 1589 for (unsigned I = 1, E = VecTy.getNumElements(); I < E; ++I) 1590 DstRegs.push_back(MRI.createGenericVirtualRegister(DstTy)); 1591 MIRBuilder.buildUnmerge(DstRegs, Sel); 1592 MI.eraseFromParent(); 1593 return true; 1594 } 1595
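// Rough shape of the custom G_FCOPYSIGN expansion above (illustrative):
//   %d:_(s64) = G_FCOPYSIGN %x:_(s64), %y:_(s64)
// becomes, working on 128-bit vectors so it can later select to
// INSERT_SUBREG / BIT / EXTRACT_SUBREG:
//   %v1:_(<2 x s64>)  = G_INSERT_VECTOR_ELT %undef, %x:_(s64), 0
//   %v2:_(<2 x s64>)  = G_INSERT_VECTOR_ELT %undef, %y:_(s64), 0
//   %m:_(<2 x s64>)   = sign-bit mask (zero negated with G_FNEG for 64-bit)
//   %sel:_(<2 x s64>) = G_BIT %v1, %v2, %m
//   %d:_(s64), %dead:_(s64) = G_UNMERGE_VALUES %sel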