xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86FixupInstTuning.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
//===-- X86FixupInstTuning.cpp - replace instructions ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file does a tuning pass replacing slower machine instructions
10 // with faster ones. We do this here, as opposed to during normal ISel, as
11 // attempting to get the "right" instruction can break patterns. This pass
// is not meant to search for special cases where an instruction can be
// transformed to another, it is only meant to do transformations where the old
// instruction is always replaceable with the new instruction. For example:
15 //
//      `vpermq ymm` -> `vpshufd ymm`
//          -- BAD, not always valid (lane cross/non-repeated mask)
//
//      `vpermilps ymm` -> `vpshufd ymm`
//          -- GOOD, always replaceable
21 //
22 //===----------------------------------------------------------------------===//
23 
24 #include "X86.h"
25 #include "X86InstrInfo.h"
26 #include "X86Subtarget.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/CodeGen/MachineFunctionPass.h"
29 #include "llvm/CodeGen/MachineInstrBuilder.h"
30 
31 using namespace llvm;
32 
// Identifier for this pass. It is reused by INITIALIZE_PASS further down as
// both of the pass's registration strings; presumably it is also what the
// LLVM_DEBUG output in this file is keyed on.
#define DEBUG_TYPE "x86-fixup-inst-tuning"

// Bumped once by runOnMachineFunction for every instruction that
// processInstruction reports as rewritten.
STATISTIC(NumInstChanges, "Number of instructions changes");
36 
37 namespace {
38 class X86FixupInstTuningPass : public MachineFunctionPass {
39 public:
40   static char ID;
41 
X86FixupInstTuningPass()42   X86FixupInstTuningPass() : MachineFunctionPass(ID) {}
43 
getPassName() const44   StringRef getPassName() const override { return "X86 Fixup Inst Tuning"; }
45 
46   bool runOnMachineFunction(MachineFunction &MF) override;
47   bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
48                           MachineBasicBlock::iterator &I);
49 
50   // This pass runs after regalloc and doesn't support VReg operands.
getRequiredProperties() const51   MachineFunctionProperties getRequiredProperties() const override {
52     return MachineFunctionProperties().setNoVRegs();
53   }
54 
55 private:
56   const X86InstrInfo *TII = nullptr;
57   const X86Subtarget *ST = nullptr;
58   const MCSchedModel *SM = nullptr;
59 };
60 } // end anonymous namespace
61 
// Out-of-class definition of the static member that the constructor hands to
// MachineFunctionPass.
char X86FixupInstTuningPass::ID = 0;

// Pass registration. DEBUG_TYPE ("x86-fixup-inst-tuning") is supplied for both
// string arguments and both trailing boolean flags are false.
INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false)
65 
66 FunctionPass *llvm::createX86FixupInstTuning() {
67   return new X86FixupInstTuningPass();
68 }
69 
/// Three-way "is the new value strictly better (smaller)?" helper.
///
/// \p NewVal and \p CurVal are optional-like values. The result is
///   - std::nullopt when either side is missing or both hold equal values
///     (i.e. the comparison cannot decide), and
///   - otherwise whether `*NewVal < *CurVal`.
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  // Without both measurements there is nothing to compare.
  if (!NewVal.has_value() || !CurVal.has_value())
    return std::nullopt;

  // Only a real difference yields a verdict; ties stay undecided.
  if (*NewVal != *CurVal)
    return *NewVal < *CurVal;

  return std::nullopt;
}
77 
processInstruction(MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator & I)78 bool X86FixupInstTuningPass::processInstruction(
79     MachineFunction &MF, MachineBasicBlock &MBB,
80     MachineBasicBlock::iterator &I) {
81   MachineInstr &MI = *I;
82   unsigned Opc = MI.getOpcode();
83   unsigned NumOperands = MI.getDesc().getNumOperands();
84   bool OptSize = MF.getFunction().hasOptSize();
85 
86   auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
87     // We already checked that SchedModel exists in `NewOpcPreferable`.
88     return MCSchedModel::getReciprocalThroughput(
89         *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
90   };
91 
92   auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
93     // We already checked that SchedModel exists in `NewOpcPreferable`.
94     return MCSchedModel::computeInstrLatency(
95         *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
96   };
97 
98   auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
99     if (unsigned Size = TII->get(Opcode).getSize())
100       return Size;
101     // Zero size means we where unable to compute it.
102     return std::nullopt;
103   };
104 
105   auto NewOpcPreferable = [&](unsigned NewOpc,
106                               bool ReplaceInTie = true) -> bool {
107     std::optional<bool> Res;
108     if (SM->hasInstrSchedModel()) {
109       // Compare tput -> lat -> code size.
110       Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
111       if (Res.has_value())
112         return *Res;
113 
114       Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
115       if (Res.has_value())
116         return *Res;
117     }
118 
119     Res = CmpOptionals(GetInstSize(Opc), GetInstSize(NewOpc));
120     if (Res.has_value())
121       return *Res;
122 
123     // We either have either were unable to get tput/lat/codesize or all values
124     // were equal. Return specified option for a tie.
125     return ReplaceInTie;
126   };
127 
128   // `vpermilpd r, i` -> `vshufpd r, r, i`
129   // `vpermilpd r, i, k` -> `vshufpd r, r, i, k`
130   // `vshufpd` is always as fast or faster than `vpermilpd` and takes
131   // 1 less byte of code size for VEX and EVEX encoding.
132   auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool {
133     if (!NewOpcPreferable(NewOpc))
134       return false;
135     LLVM_DEBUG(dbgs() << "Replacing: " << MI);
136     {
137       unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
138       MI.removeOperand(NumOperands - 1);
139       MI.addOperand(MI.getOperand(NumOperands - 2));
140       MI.setDesc(TII->get(NewOpc));
141       MI.addOperand(MachineOperand::CreateImm(MaskImm));
142     }
143     LLVM_DEBUG(dbgs() << "     With: " << MI);
144     return true;
145   };
146 
147   // `vpermilps r, i` -> `vshufps r, r, i`
148   // `vpermilps r, i, k` -> `vshufps r, r, i, k`
149   // `vshufps` is always as fast or faster than `vpermilps` and takes
150   // 1 less byte of code size for VEX and EVEX encoding.
151   auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
152     if (!NewOpcPreferable(NewOpc))
153       return false;
154     LLVM_DEBUG(dbgs() << "Replacing: " << MI);
155     {
156       unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
157       MI.removeOperand(NumOperands - 1);
158       MI.addOperand(MI.getOperand(NumOperands - 2));
159       MI.setDesc(TII->get(NewOpc));
160       MI.addOperand(MachineOperand::CreateImm(MaskImm));
161     }
162     LLVM_DEBUG(dbgs() << "     With: " << MI);
163     return true;
164   };
165 
166   // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles.
167   // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less
168   // byte of code size.
169   auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
170     // TODO: Might be work adding bypass delay if -Os/-Oz is enabled as
171     // `vpshufd` saves a byte of code size.
172     if (!ST->hasNoDomainDelayShuffle() ||
173         !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
174       return false;
175     LLVM_DEBUG(dbgs() << "Replacing: " << MI);
176     {
177       MI.setDesc(TII->get(NewOpc));
178     }
179     LLVM_DEBUG(dbgs() << "     With: " << MI);
180     return true;
181   };
182 
183   // `vunpcklpd/vmovlhps r, r` -> `vunpcklqdq r, r`/`vshufpd r, r, 0x00`
184   // `vunpckhpd/vmovlhps r, r` -> `vunpckhqdq r, r`/`vshufpd r, r, 0xff`
185   // `vunpcklpd r, r, k` -> `vunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00`
186   // `vunpckhpd r, r, k` -> `vunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff`
187   // `vunpcklpd r, m` -> `vunpcklqdq r, m, k`
188   // `vunpckhpd r, m` -> `vunpckhqdq r, m, k`
189   // `vunpcklpd r, m, k` -> `vunpcklqdq r, m, k`
190   // `vunpckhpd r, m, k` -> `vunpckhqdq r, m, k`
191   // 1) If no bypass delay and `vunpck{l|h}qdq` faster than `vunpck{l|h}pd`
192   //        -> `vunpck{l|h}qdq`
193   // 2) If `vshufpd` faster than `vunpck{l|h}pd`
194   //        -> `vshufpd`
195   //
196   // `vunpcklps` -> `vunpckldq` (for all operand types if no bypass delay)
197   auto ProcessUNPCK = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
198     if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
199       return false;
200     LLVM_DEBUG(dbgs() << "Replacing: " << MI);
201     {
202       MI.setDesc(TII->get(NewOpc));
203       MI.addOperand(MachineOperand::CreateImm(MaskImm));
204     }
205     LLVM_DEBUG(dbgs() << "     With: " << MI);
206     return true;
207   };
208 
209   auto ProcessUNPCKToIntDomain = [&](unsigned NewOpc) -> bool {
210     // TODO it may be worth it to set ReplaceInTie to `true` as there is no real
211     // downside to the integer unpck, but if someone doesn't specify exact
212     // target we won't find it faster.
213     if (!ST->hasNoDomainDelayShuffle() ||
214         !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
215       return false;
216     LLVM_DEBUG(dbgs() << "Replacing: " << MI);
217     {
218       MI.setDesc(TII->get(NewOpc));
219     }
220     LLVM_DEBUG(dbgs() << "     With: " << MI);
221     return true;
222   };
223 
224   auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain,
225                                unsigned NewOpc) -> bool {
226     if (ProcessUNPCKToIntDomain(NewOpcIntDomain))
227       return true;
228     return ProcessUNPCK(NewOpc, 0x00);
229   };
230   auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain,
231                                unsigned NewOpc) -> bool {
232     if (ProcessUNPCKToIntDomain(NewOpcIntDomain))
233       return true;
234     return ProcessUNPCK(NewOpc, 0xff);
235   };
236 
237   auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool {
238     return ProcessUNPCKToIntDomain(NewOpcIntDomain);
239   };
240 
241   auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool {
242     return ProcessUNPCKToIntDomain(NewOpc);
243   };
244 
245   auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool {
246     if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc))
247       return false;
248     // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits.
249     APInt MaskW =
250         APInt(8, MI.getOperand(NumOperands - 1).getImm(), /*IsSigned=*/false);
251     APInt MaskD = APIntOps::ScaleBitMask(MaskW, 4, /*MatchAllBits=*/true);
252     if (MaskW != APIntOps::ScaleBitMask(MaskD, 8, /*MatchAllBits=*/true))
253       return false;
254     APInt NewMaskD = APInt::getSplat(NumElts, MaskD);
255     LLVM_DEBUG(dbgs() << "Replacing: " << MI);
256     {
257       MI.setDesc(TII->get(MovOpc));
258       MI.removeOperand(NumOperands - 1);
259       MI.addOperand(MachineOperand::CreateImm(NewMaskD.getZExtValue()));
260     }
261     LLVM_DEBUG(dbgs() << "     With: " << MI);
262     return true;
263   };
264 
265   auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
266                                unsigned MovImm) -> bool {
267     if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
268       return false;
269     if (!OptSize && !NewOpcPreferable(MovOpc))
270       return false;
271     LLVM_DEBUG(dbgs() << "Replacing: " << MI);
272     {
273       MI.setDesc(TII->get(MovOpc));
274       MI.removeOperand(NumOperands - 1);
275     }
276     LLVM_DEBUG(dbgs() << "     With: " << MI);
277     return true;
278   };
279 
280   switch (Opc) {
281   case X86::BLENDPDrri:
282     return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
283   case X86::VBLENDPDrri:
284     return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1);
285 
286   case X86::BLENDPSrri:
287     return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) ||
288            ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3);
289   case X86::VBLENDPSrri:
290     return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
291            ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);
292 
293   case X86::VPBLENDWrri:
294     // TODO: Add X86::VPBLENDWrmi handling
295     // TODO: Add X86::VPBLENDWYrri handling
296     // TODO: Add X86::VPBLENDWYrmi handling
297     return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);
298 
299   case X86::VPERMILPDri:
300     return ProcessVPERMILPDri(X86::VSHUFPDrri);
301   case X86::VPERMILPDYri:
302     return ProcessVPERMILPDri(X86::VSHUFPDYrri);
303   case X86::VPERMILPDZ128ri:
304     return ProcessVPERMILPDri(X86::VSHUFPDZ128rri);
305   case X86::VPERMILPDZ256ri:
306     return ProcessVPERMILPDri(X86::VSHUFPDZ256rri);
307   case X86::VPERMILPDZri:
308     return ProcessVPERMILPDri(X86::VSHUFPDZrri);
309   case X86::VPERMILPDZ128rikz:
310     return ProcessVPERMILPDri(X86::VSHUFPDZ128rrikz);
311   case X86::VPERMILPDZ256rikz:
312     return ProcessVPERMILPDri(X86::VSHUFPDZ256rrikz);
313   case X86::VPERMILPDZrikz:
314     return ProcessVPERMILPDri(X86::VSHUFPDZrrikz);
315   case X86::VPERMILPDZ128rik:
316     return ProcessVPERMILPDri(X86::VSHUFPDZ128rrik);
317   case X86::VPERMILPDZ256rik:
318     return ProcessVPERMILPDri(X86::VSHUFPDZ256rrik);
319   case X86::VPERMILPDZrik:
320     return ProcessVPERMILPDri(X86::VSHUFPDZrrik);
321 
322   case X86::VPERMILPSri:
323     return ProcessVPERMILPSri(X86::VSHUFPSrri);
324   case X86::VPERMILPSYri:
325     return ProcessVPERMILPSri(X86::VSHUFPSYrri);
326   case X86::VPERMILPSZ128ri:
327     return ProcessVPERMILPSri(X86::VSHUFPSZ128rri);
328   case X86::VPERMILPSZ256ri:
329     return ProcessVPERMILPSri(X86::VSHUFPSZ256rri);
330   case X86::VPERMILPSZri:
331     return ProcessVPERMILPSri(X86::VSHUFPSZrri);
332   case X86::VPERMILPSZ128rikz:
333     return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz);
334   case X86::VPERMILPSZ256rikz:
335     return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz);
336   case X86::VPERMILPSZrikz:
337     return ProcessVPERMILPSri(X86::VSHUFPSZrrikz);
338   case X86::VPERMILPSZ128rik:
339     return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik);
340   case X86::VPERMILPSZ256rik:
341     return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik);
342   case X86::VPERMILPSZrik:
343     return ProcessVPERMILPSri(X86::VSHUFPSZrrik);
344   case X86::VPERMILPSmi:
345     return ProcessVPERMILPSmi(X86::VPSHUFDmi);
346   case X86::VPERMILPSYmi:
347     // TODO: See if there is a more generic way we can test if the replacement
348     // instruction is supported.
349     return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false;
350   case X86::VPERMILPSZ128mi:
351     return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi);
352   case X86::VPERMILPSZ256mi:
353     return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
354   case X86::VPERMILPSZmi:
355     return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
356   case X86::VPERMILPSZ128mikz:
357     return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz);
358   case X86::VPERMILPSZ256mikz:
359     return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz);
360   case X86::VPERMILPSZmikz:
361     return ProcessVPERMILPSmi(X86::VPSHUFDZmikz);
362   case X86::VPERMILPSZ128mik:
363     return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik);
364   case X86::VPERMILPSZ256mik:
365     return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik);
366   case X86::VPERMILPSZmik:
367     return ProcessVPERMILPSmi(X86::VPSHUFDZmik);
368 
369   case X86::MOVLHPSrr:
370   case X86::UNPCKLPDrr:
371     return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri);
372   case X86::VMOVLHPSrr:
373   case X86::VUNPCKLPDrr:
374     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri);
375   case X86::VUNPCKLPDYrr:
376     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri);
377     // VMOVLHPS is always 128 bits.
378   case X86::VMOVLHPSZrr:
379   case X86::VUNPCKLPDZ128rr:
380     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri);
381   case X86::VUNPCKLPDZ256rr:
382     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri);
383   case X86::VUNPCKLPDZrr:
384     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri);
385   case X86::VUNPCKLPDZ128rrk:
386     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik);
387   case X86::VUNPCKLPDZ256rrk:
388     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik);
389   case X86::VUNPCKLPDZrrk:
390     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik);
391   case X86::VUNPCKLPDZ128rrkz:
392     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
393   case X86::VUNPCKLPDZ256rrkz:
394     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
395   case X86::VUNPCKLPDZrrkz:
396     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz);
397   case X86::UNPCKHPDrr:
398     return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri);
399   case X86::VUNPCKHPDrr:
400     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri);
401   case X86::VUNPCKHPDYrr:
402     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri);
403   case X86::VUNPCKHPDZ128rr:
404     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri);
405   case X86::VUNPCKHPDZ256rr:
406     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri);
407   case X86::VUNPCKHPDZrr:
408     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri);
409   case X86::VUNPCKHPDZ128rrk:
410     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik);
411   case X86::VUNPCKHPDZ256rrk:
412     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik);
413   case X86::VUNPCKHPDZrrk:
414     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik);
415   case X86::VUNPCKHPDZ128rrkz:
416     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
417   case X86::VUNPCKHPDZ256rrkz:
418     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
419   case X86::VUNPCKHPDZrrkz:
420     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz);
421   case X86::UNPCKLPDrm:
422     return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm);
423   case X86::VUNPCKLPDrm:
424     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm);
425   case X86::VUNPCKLPDYrm:
426     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm);
427   case X86::VUNPCKLPDZ128rm:
428     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm);
429   case X86::VUNPCKLPDZ256rm:
430     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm);
431   case X86::VUNPCKLPDZrm:
432     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm);
433   case X86::VUNPCKLPDZ128rmk:
434     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk);
435   case X86::VUNPCKLPDZ256rmk:
436     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk);
437   case X86::VUNPCKLPDZrmk:
438     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk);
439   case X86::VUNPCKLPDZ128rmkz:
440     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz);
441   case X86::VUNPCKLPDZ256rmkz:
442     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz);
443   case X86::VUNPCKLPDZrmkz:
444     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz);
445   case X86::UNPCKHPDrm:
446     return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm);
447   case X86::VUNPCKHPDrm:
448     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm);
449   case X86::VUNPCKHPDYrm:
450     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm);
451   case X86::VUNPCKHPDZ128rm:
452     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm);
453   case X86::VUNPCKHPDZ256rm:
454     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm);
455   case X86::VUNPCKHPDZrm:
456     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm);
457   case X86::VUNPCKHPDZ128rmk:
458     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk);
459   case X86::VUNPCKHPDZ256rmk:
460     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk);
461   case X86::VUNPCKHPDZrmk:
462     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk);
463   case X86::VUNPCKHPDZ128rmkz:
464     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz);
465   case X86::VUNPCKHPDZ256rmkz:
466     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz);
467   case X86::VUNPCKHPDZrmkz:
468     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz);
469 
470   case X86::UNPCKLPSrr:
471     return ProcessUNPCKPS(X86::PUNPCKLDQrr);
472   case X86::VUNPCKLPSrr:
473     return ProcessUNPCKPS(X86::VPUNPCKLDQrr);
474   case X86::VUNPCKLPSYrr:
475     return ProcessUNPCKPS(X86::VPUNPCKLDQYrr);
476   case X86::VUNPCKLPSZ128rr:
477     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rr);
478   case X86::VUNPCKLPSZ256rr:
479     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rr);
480   case X86::VUNPCKLPSZrr:
481     return ProcessUNPCKPS(X86::VPUNPCKLDQZrr);
482   case X86::VUNPCKLPSZ128rrk:
483     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrk);
484   case X86::VUNPCKLPSZ256rrk:
485     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrk);
486   case X86::VUNPCKLPSZrrk:
487     return ProcessUNPCKPS(X86::VPUNPCKLDQZrrk);
488   case X86::VUNPCKLPSZ128rrkz:
489     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrkz);
490   case X86::VUNPCKLPSZ256rrkz:
491     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrkz);
492   case X86::VUNPCKLPSZrrkz:
493     return ProcessUNPCKPS(X86::VPUNPCKLDQZrrkz);
494   case X86::UNPCKHPSrr:
495     return ProcessUNPCKPS(X86::PUNPCKHDQrr);
496   case X86::VUNPCKHPSrr:
497     return ProcessUNPCKPS(X86::VPUNPCKHDQrr);
498   case X86::VUNPCKHPSYrr:
499     return ProcessUNPCKPS(X86::VPUNPCKHDQYrr);
500   case X86::VUNPCKHPSZ128rr:
501     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rr);
502   case X86::VUNPCKHPSZ256rr:
503     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rr);
504   case X86::VUNPCKHPSZrr:
505     return ProcessUNPCKPS(X86::VPUNPCKHDQZrr);
506   case X86::VUNPCKHPSZ128rrk:
507     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrk);
508   case X86::VUNPCKHPSZ256rrk:
509     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrk);
510   case X86::VUNPCKHPSZrrk:
511     return ProcessUNPCKPS(X86::VPUNPCKHDQZrrk);
512   case X86::VUNPCKHPSZ128rrkz:
513     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrkz);
514   case X86::VUNPCKHPSZ256rrkz:
515     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrkz);
516   case X86::VUNPCKHPSZrrkz:
517     return ProcessUNPCKPS(X86::VPUNPCKHDQZrrkz);
518   case X86::UNPCKLPSrm:
519     return ProcessUNPCKPS(X86::PUNPCKLDQrm);
520   case X86::VUNPCKLPSrm:
521     return ProcessUNPCKPS(X86::VPUNPCKLDQrm);
522   case X86::VUNPCKLPSYrm:
523     return ProcessUNPCKPS(X86::VPUNPCKLDQYrm);
524   case X86::VUNPCKLPSZ128rm:
525     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rm);
526   case X86::VUNPCKLPSZ256rm:
527     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rm);
528   case X86::VUNPCKLPSZrm:
529     return ProcessUNPCKPS(X86::VPUNPCKLDQZrm);
530   case X86::VUNPCKLPSZ128rmk:
531     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmk);
532   case X86::VUNPCKLPSZ256rmk:
533     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmk);
534   case X86::VUNPCKLPSZrmk:
535     return ProcessUNPCKPS(X86::VPUNPCKLDQZrmk);
536   case X86::VUNPCKLPSZ128rmkz:
537     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmkz);
538   case X86::VUNPCKLPSZ256rmkz:
539     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmkz);
540   case X86::VUNPCKLPSZrmkz:
541     return ProcessUNPCKPS(X86::VPUNPCKLDQZrmkz);
542   case X86::UNPCKHPSrm:
543     return ProcessUNPCKPS(X86::PUNPCKHDQrm);
544   case X86::VUNPCKHPSrm:
545     return ProcessUNPCKPS(X86::VPUNPCKHDQrm);
546   case X86::VUNPCKHPSYrm:
547     return ProcessUNPCKPS(X86::VPUNPCKHDQYrm);
548   case X86::VUNPCKHPSZ128rm:
549     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rm);
550   case X86::VUNPCKHPSZ256rm:
551     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rm);
552   case X86::VUNPCKHPSZrm:
553     return ProcessUNPCKPS(X86::VPUNPCKHDQZrm);
554   case X86::VUNPCKHPSZ128rmk:
555     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmk);
556   case X86::VUNPCKHPSZ256rmk:
557     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmk);
558   case X86::VUNPCKHPSZrmk:
559     return ProcessUNPCKPS(X86::VPUNPCKHDQZrmk);
560   case X86::VUNPCKHPSZ128rmkz:
561     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmkz);
562   case X86::VUNPCKHPSZ256rmkz:
563     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
564   case X86::VUNPCKHPSZrmkz:
565     return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
566   default:
567     return false;
568   }
569 }
570 
runOnMachineFunction(MachineFunction & MF)571 bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
572   LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n";);
573   bool Changed = false;
574   ST = &MF.getSubtarget<X86Subtarget>();
575   TII = ST->getInstrInfo();
576   SM = &ST->getSchedModel();
577 
578   for (MachineBasicBlock &MBB : MF) {
579     for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
580       if (processInstruction(MF, MBB, I)) {
581         ++NumInstChanges;
582         Changed = true;
583       }
584     }
585   }
586   LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n";);
587   return Changed;
588 }
589