1 //===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file defines the WebAssembly-specific TargetTransformInfo
11 /// implementation.
12 ///
13 //===----------------------------------------------------------------------===//
14
15 #include "WebAssemblyTargetTransformInfo.h"
16
17 #include "llvm/CodeGen/CostTable.h"
18 using namespace llvm;
19
20 #define DEBUG_TYPE "wasmtti"
21
22 TargetTransformInfo::PopcntSupportKind
getPopcntSupport(unsigned TyWidth) const23 WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
24 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
25 return TargetTransformInfo::PSK_FastHardware;
26 }
27
getNumberOfRegisters(unsigned ClassID) const28 unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
29 unsigned Result = BaseT::getNumberOfRegisters(ClassID);
30
31 // For SIMD, use at least 16 registers, as a rough guess.
32 bool Vector = (ClassID == 1);
33 if (Vector)
34 Result = std::max(Result, 16u);
35
36 return Result;
37 }
38
getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const39 TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
40 TargetTransformInfo::RegisterKind K) const {
41 switch (K) {
42 case TargetTransformInfo::RGK_Scalar:
43 return TypeSize::getFixed(64);
44 case TargetTransformInfo::RGK_FixedWidthVector:
45 return TypeSize::getFixed(getST()->hasSIMD128() ? 128 : 64);
46 case TargetTransformInfo::RGK_ScalableVector:
47 return TypeSize::getScalable(0);
48 }
49
50 llvm_unreachable("Unsupported register kind");
51 }
52
getArithmeticInstrCost(unsigned Opcode,Type * Ty,TTI::TargetCostKind CostKind,TTI::OperandValueInfo Op1Info,TTI::OperandValueInfo Op2Info,ArrayRef<const Value * > Args,const Instruction * CxtI) const53 InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
54 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
55 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
56 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
57
58 InstructionCost Cost =
59 BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
60 Opcode, Ty, CostKind, Op1Info, Op2Info);
61
62 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
63 switch (Opcode) {
64 case Instruction::LShr:
65 case Instruction::AShr:
66 case Instruction::Shl:
67 // SIMD128's shifts currently only accept a scalar shift count. For each
68 // element, we'll need to extract, op, insert. The following is a rough
69 // approximation.
70 if (!Op2Info.isUniform())
71 Cost =
72 cast<FixedVectorType>(VTy)->getNumElements() *
73 (TargetTransformInfo::TCC_Basic +
74 getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind) +
75 TargetTransformInfo::TCC_Basic);
76 break;
77 }
78 }
79 return Cost;
80 }
81
getCastInstrCost(unsigned Opcode,Type * Dst,Type * Src,TTI::CastContextHint CCH,TTI::TargetCostKind CostKind,const Instruction * I) const82 InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
83 unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
84 TTI::TargetCostKind CostKind, const Instruction *I) const {
85 int ISD = TLI->InstructionOpcodeToISD(Opcode);
86 auto SrcTy = TLI->getValueType(DL, Src);
87 auto DstTy = TLI->getValueType(DL, Dst);
88
89 if (!SrcTy.isSimple() || !DstTy.isSimple()) {
90 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
91 }
92
93 if (!ST->hasSIMD128()) {
94 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
95 }
96
97 auto DstVT = DstTy.getSimpleVT();
98 auto SrcVT = SrcTy.getSimpleVT();
99
100 if (I && I->hasOneUser()) {
101 auto *SingleUser = cast<Instruction>(*I->user_begin());
102 int UserISD = TLI->InstructionOpcodeToISD(SingleUser->getOpcode());
103
104 // extmul_low support
105 if (UserISD == ISD::MUL &&
106 (ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND)) {
107 // Free low extensions.
108 if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) ||
109 (SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) ||
110 (SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) {
111 return 0;
112 }
113 // Will require an additional extlow operation for the intermediate
114 // i16/i32 value.
115 if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) ||
116 (SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) {
117 return 1;
118 }
119 }
120 }
121
122 // extend_low
123 static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
124 {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1},
125 {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1},
126 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
127 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
128 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
129 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
130 {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2},
131 {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2},
132 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
133 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
134 };
135
136 if (const auto *Entry =
137 ConvertCostTableLookup(ConversionTbl, ISD, DstVT, SrcVT)) {
138 return Entry->Cost;
139 }
140
141 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
142 }
143
getMemoryOpCost(unsigned Opcode,Type * Ty,Align Alignment,unsigned AddressSpace,TTI::TargetCostKind CostKind,TTI::OperandValueInfo OpInfo,const Instruction * I) const144 InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
145 unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
146 TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
147 const Instruction *I) const {
148 if (!ST->hasSIMD128() || !isa<FixedVectorType>(Ty)) {
149 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
150 CostKind);
151 }
152
153 int ISD = TLI->InstructionOpcodeToISD(Opcode);
154 if (ISD != ISD::LOAD) {
155 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
156 CostKind);
157 }
158
159 EVT VT = TLI->getValueType(DL, Ty, true);
160 // Type legalization can't handle structs
161 if (VT == MVT::Other)
162 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
163 CostKind);
164
165 auto LT = getTypeLegalizationCost(Ty);
166 if (!LT.first.isValid())
167 return InstructionCost::getInvalid();
168
169 // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can
170 // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads
171 // are twice as expensive as scalar.
172 unsigned width = VT.getSizeInBits();
173 switch (width) {
174 default:
175 break;
176 case 32:
177 case 64:
178 case 128:
179 return 2;
180 }
181
182 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
183 }
184
getVectorInstrCost(unsigned Opcode,Type * Val,TTI::TargetCostKind CostKind,unsigned Index,const Value * Op0,const Value * Op1) const185 InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
186 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
187 const Value *Op0, const Value *Op1) const {
188 InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
189 Opcode, Val, CostKind, Index, Op0, Op1);
190
191 // SIMD128's insert/extract currently only take constant indices.
192 if (Index == -1u)
193 return Cost + 25 * TargetTransformInfo::TCC_Expensive;
194
195 return Cost;
196 }
197
getPartialReductionCost(unsigned Opcode,Type * InputTypeA,Type * InputTypeB,Type * AccumType,ElementCount VF,TTI::PartialReductionExtendKind OpAExtend,TTI::PartialReductionExtendKind OpBExtend,std::optional<unsigned> BinOp,TTI::TargetCostKind CostKind) const198 InstructionCost WebAssemblyTTIImpl::getPartialReductionCost(
199 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
200 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
201 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
202 TTI::TargetCostKind CostKind) const {
203 InstructionCost Invalid = InstructionCost::getInvalid();
204 if (!VF.isFixed() || !ST->hasSIMD128())
205 return Invalid;
206
207 if (CostKind != TTI::TCK_RecipThroughput)
208 return Invalid;
209
210 InstructionCost Cost(TTI::TCC_Basic);
211
212 // Possible options:
213 // - i16x8.extadd_pairwise_i8x16_sx
214 // - i32x4.extadd_pairwise_i16x8_sx
215 // - i32x4.dot_i16x8_s
216 // Only try to support dot, for now.
217
218 if (Opcode != Instruction::Add)
219 return Invalid;
220
221 if (!BinOp || *BinOp != Instruction::Mul)
222 return Invalid;
223
224 if (InputTypeA != InputTypeB)
225 return Invalid;
226
227 if (OpAExtend != OpBExtend)
228 return Invalid;
229
230 EVT InputEVT = EVT::getEVT(InputTypeA);
231 EVT AccumEVT = EVT::getEVT(AccumType);
232
233 // TODO: Add i64 accumulator.
234 if (AccumEVT != MVT::i32)
235 return Invalid;
236
237 // Signed inputs can lower to dot
238 if (InputEVT == MVT::i16 && VF.getFixedValue() == 8)
239 return OpAExtend == TTI::PR_SignExtend ? Cost : Cost * 2;
240
241 // Double the size of the lowered sequence.
242 if (InputEVT == MVT::i8 && VF.getFixedValue() == 16)
243 return OpAExtend == TTI::PR_SignExtend ? Cost * 2 : Cost * 4;
244
245 return Invalid;
246 }
247
getPreferredExpandedReductionShuffle(const IntrinsicInst * II) const248 TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
249 const IntrinsicInst *II) const {
250
251 switch (II->getIntrinsicID()) {
252 default:
253 break;
254 case Intrinsic::vector_reduce_fadd:
255 return TTI::ReductionShuffle::Pairwise;
256 }
257 return TTI::ReductionShuffle::SplitHalf;
258 }
259
getUnrollingPreferences(Loop * L,ScalarEvolution & SE,TTI::UnrollingPreferences & UP,OptimizationRemarkEmitter * ORE) const260 void WebAssemblyTTIImpl::getUnrollingPreferences(
261 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
262 OptimizationRemarkEmitter *ORE) const {
263 // Scan the loop: don't unroll loops with calls. This is a standard approach
264 // for most (all?) targets.
265 for (BasicBlock *BB : L->blocks())
266 for (Instruction &I : *BB)
267 if (isa<CallInst>(I) || isa<InvokeInst>(I))
268 if (const Function *F = cast<CallBase>(I).getCalledFunction())
269 if (isLoweredToCall(F))
270 return;
271
272 // The chosen threshold is within the range of 'LoopMicroOpBufferSize' of
273 // the various microarchitectures that use the BasicTTI implementation and
274 // has been selected through heuristics across multiple cores and runtimes.
275 UP.Partial = UP.Runtime = UP.UpperBound = true;
276 UP.PartialThreshold = 30;
277
278 // Avoid unrolling when optimizing for size.
279 UP.OptSizeThreshold = 0;
280 UP.PartialOptSizeThreshold = 0;
281
282 // Set number of instructions optimized when "back edge"
283 // becomes "fall through" to default value of 2.
284 UP.BEInsns = 2;
285 }
286
supportsTailCalls() const287 bool WebAssemblyTTIImpl::supportsTailCalls() const {
288 return getST()->hasTailCall();
289 }
290
isProfitableToSinkOperands(Instruction * I,SmallVectorImpl<Use * > & Ops) const291 bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
292 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
293 using namespace llvm::PatternMatch;
294
295 if (!I->getType()->isVectorTy() || !I->isShift())
296 return false;
297
298 Value *V = I->getOperand(1);
299 // We dont need to sink constant splat.
300 if (isa<Constant>(V))
301 return false;
302
303 if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
304 m_Value(), m_ZeroMask()))) {
305 // Sink insert
306 Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
307 // Sink shuffle
308 Ops.push_back(&I->getOperandUse(1));
309 return true;
310 }
311
312 return false;
313 }
314