//===- X86CompressEVEX.cpp ------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
// when possible in order to reduce code size or facilitate HW decoding.
//
// Possible compressions:
//   a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
//   b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
//   c. NDD (EVEX) -> non-NDD (legacy)
//   d. NF_ND (EVEX) -> NF (EVEX)
//
// Compressions a, b and c almost always reduce code size; a rare exception is
// the promoted 16-bit CRC32, which is as long as the legacy version:
//
// legacy:
//   crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
// promoted:
//   crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
//
// From a performance perspective, the two encodings should behave the same
// (same uops and same execution ports). From a function multi-versioning (FMV)
// perspective, the older legacy encoding is preferred because it can execute
// on more hardware (broader install base), so we still perform the
// compression.
//
// Compression d can help hardware decoding (HW may skip reading the NDD
// register) although the instruction length remains unchanged.
//===----------------------------------------------------------------------===//

#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86InstComments.h"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include <atomic>
#include <cassert>
#include <cstdint>

using namespace llvm;

// Table entry for the generated EVEX compression tables included below: each
// entry maps an EVEX opcode (OldOpc) to its compressed counterpart (NewOpc).
struct X86CompressEVEXTableEntry {
  uint16_t OldOpc;
  uint16_t NewOpc;

  bool operator<(const X86CompressEVEXTableEntry &RHS) const {
    return OldOpc < RHS.OldOpc;
  }

  friend bool operator<(const X86CompressEVEXTableEntry &TE, unsigned Opc) {
    return TE.OldOpc < Opc;
  }
};
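// The generated table is sorted by OldOpc; the operator< overloads above let
// llvm::is_sorted verify the ordering and llvm::lower_bound look up an opcode.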
#include "X86GenCompressEVEXTables.inc"

#define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
#define COMP_EVEX_NAME "x86-compress-evex"

#define DEBUG_TYPE COMP_EVEX_NAME

namespace {

class CompressEVEXPass : public MachineFunctionPass {
public:
  static char ID;
  CompressEVEXPass() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override { return COMP_EVEX_DESC; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  // This pass runs after regalloc and doesn't support VReg operands.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }
};

} // end anonymous namespace

char CompressEVEXPass::ID = 0;

static bool usesExtendedRegister(const MachineInstr &MI) {
  auto isHiRegIdx = [](unsigned Reg) {
    // Check for XMM registers with indices 16-31.
    if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
      return true;
    // Check for YMM registers with indices 16-31.
    if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
      return true;
    // Check for APX extended GPRs (R16-R31).
    if (X86II::isApxExtendedReg(Reg))
      return true;
    return false;
  };

  // Check whether any explicit operand is an XMM/YMM register or a GPR with a
  // high index (16-31). ZMM operands must never appear here at all.
  for (const MachineOperand &MO : MI.explicit_operands()) {
    if (!MO.isReg())
      continue;

    Register Reg = MO.getReg();
    assert(!X86II::isZMMReg(Reg) &&
           "ZMM instructions should not be in the EVEX->VEX tables");
    if (isHiRegIdx(Reg))
      return true;
  }

  return false;
}

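// The VEX forms of the instructions below belong to separate non-AVX512 ISA
// extensions (AVX-NE-CONVERT, AVX-VNNI, AVX-IFMA), so compression is only
// legal when the subtarget also has the matching VEX-encoded feature.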
static bool checkVEXInstPredicate(unsigned OldOpc, const X86Subtarget &ST) {
  switch (OldOpc) {
  default:
    return true;
  case X86::VCVTNEPS2BF16Z128rm:
  case X86::VCVTNEPS2BF16Z128rr:
  case X86::VCVTNEPS2BF16Z256rm:
  case X86::VCVTNEPS2BF16Z256rr:
    return ST.hasAVXNECONVERT();
  case X86::VPDPBUSDSZ128m:
  case X86::VPDPBUSDSZ128r:
  case X86::VPDPBUSDSZ256m:
  case X86::VPDPBUSDSZ256r:
  case X86::VPDPBUSDZ128m:
  case X86::VPDPBUSDZ128r:
  case X86::VPDPBUSDZ256m:
  case X86::VPDPBUSDZ256r:
  case X86::VPDPWSSDSZ128m:
  case X86::VPDPWSSDSZ128r:
  case X86::VPDPWSSDSZ256m:
  case X86::VPDPWSSDSZ256r:
  case X86::VPDPWSSDZ128m:
  case X86::VPDPWSSDZ128r:
  case X86::VPDPWSSDZ256m:
  case X86::VPDPWSSDZ256r:
    return ST.hasAVXVNNI();
  case X86::VPMADD52HUQZ128m:
  case X86::VPMADD52HUQZ128r:
  case X86::VPMADD52HUQZ256m:
  case X86::VPMADD52HUQZ256r:
  case X86::VPMADD52LUQZ128m:
  case X86::VPMADD52LUQZ128r:
  case X86::VPMADD52LUQZ256m:
  case X86::VPMADD52LUQZ256r:
    return ST.hasAVXIFMA();
  }
}

// Do any custom cleanup needed to finalize the conversion.
static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
  (void)NewOpc;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case X86::VALIGNDZ128rri:
  case X86::VALIGNDZ128rmi:
  case X86::VALIGNQZ128rri:
  case X86::VALIGNQZ128rmi: {
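    // VALIGND/Q expresses the shift count in elements while VPALIGNR counts
    // bytes, so scale the immediate by the element size (4 or 8 bytes).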
    assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
           "Unexpected new opcode!");
    unsigned Scale =
        (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    Imm.setImm(Imm.getImm() * Scale);
    break;
  }
  case X86::VSHUFF32X4Z256rmi:
  case X86::VSHUFF32X4Z256rri:
  case X86::VSHUFF64X2Z256rmi:
  case X86::VSHUFF64X2Z256rri:
  case X86::VSHUFI32X4Z256rmi:
  case X86::VSHUFI32X4Z256rri:
  case X86::VSHUFI64X2Z256rmi:
  case X86::VSHUFI64X2Z256rri: {
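    // VSHUF*Z256 selects a 128-bit half of each source with imm bits 0 and 1;
    // VPERM2F128/VPERM2I128 encodes its selectors in imm bits 1:0 (low lane)
    // and 5:4 (high lane), where selector values 2-3 read the second source.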
    assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
            NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
           "Unexpected new opcode!");
    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    int64_t ImmVal = Imm.getImm();
    // Set bit 5, move bit 1 to bit 4, copy bit 0.
    Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
    break;
  }
  case X86::VRNDSCALEPDZ128rri:
  case X86::VRNDSCALEPDZ128rmi:
  case X86::VRNDSCALEPSZ128rri:
  case X86::VRNDSCALEPSZ128rmi:
  case X86::VRNDSCALEPDZ256rri:
  case X86::VRNDSCALEPDZ256rmi:
  case X86::VRNDSCALEPSZ256rri:
  case X86::VRNDSCALEPSZ256rmi:
  case X86::VRNDSCALESDZr:
  case X86::VRNDSCALESDZm:
  case X86::VRNDSCALESSZr:
  case X86::VRNDSCALESSZm:
  case X86::VRNDSCALESDZr_Int:
  case X86::VRNDSCALESDZm_Int:
  case X86::VRNDSCALESSZr_Int:
  case X86::VRNDSCALESSZm_Int:
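    // Bits 7:4 of the VRNDSCALE* immediate encode an extra scaling factor that
    // the VEX replacement cannot express, so only immediates confined to bits
    // 3:0 can be compressed.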
    const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    int64_t ImmVal = Imm.getImm();
    // Ensure that only bits 3:0 of the immediate are used.
    if ((ImmVal & 0xf) != ImmVal)
      return false;
    break;
  }

  return true;
}

static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
  uint64_t TSFlags = MI.getDesc().TSFlags;

  // Check for EVEX instructions only.
  if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
    return false;

  // Instructions with a mask register or 512-bit vector operands can't be
  // converted to VEX.
  if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
    return false;

  // EVEX_B has several meanings.
  // AVX512:
  //  register form: rounding control or SAE
  //  memory form: broadcast
  //
  // APX:
  //  MAP4: NDD
  //
  // For the AVX512 cases, the EVEX prefix is needed to carry this information,
  // which prevents the transformation to VEX encoding.
  if (TSFlags & X86II::EVEX_B)
    return false;

  ArrayRef<X86CompressEVEXTableEntry> Table = ArrayRef(X86CompressEVEXTable);

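  // Binary-search the sorted compression table for an entry whose OldOpc
  // matches this opcode; if there is none, leave the instruction alone.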
  unsigned Opc = MI.getOpcode();
  const auto *I = llvm::lower_bound(Table, Opc);
  if (I == Table.end() || I->OldOpc != Opc)
    return false;

  if (usesExtendedRegister(MI) || !checkVEXInstPredicate(Opc, ST) ||
      !performCustomAdjustments(MI, I->NewOpc))
    return false;

  const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(I->NewOpc);
  MI.setDesc(NewDesc);
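  // Record how the instruction was compressed so the asm printer can emit an
  // explanatory comment (e.g. "EVEX TO VEX Compression") next to it.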
  uint64_t Encoding = NewDesc.TSFlags & X86II::EncodingMask;
  auto AsmComment =
      (Encoding == X86II::VEX) ? X86::AC_EVEX_2_VEX : X86::AC_EVEX_2_LEGACY;
  MI.setAsmPrinterFlag(AsmComment);
  return true;
}

bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
#ifndef NDEBUG
  // Make sure the tables are sorted.
  static std::atomic<bool> TableChecked(false);
  if (!TableChecked.load(std::memory_order_relaxed)) {
    assert(llvm::is_sorted(X86CompressEVEXTable) &&
           "X86CompressEVEXTable is not sorted!");
    TableChecked.store(true, std::memory_order_relaxed);
  }
#endif
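  // Without AVX512, APX EGPR or NDD support, no EVEX instructions should
  // appear in this function, so there is nothing to compress.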
  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD())
    return false;

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    // Traverse the basic block.
    for (MachineInstr &MI : MBB)
      Changed |= CompressEVEXImpl(MI, ST);
  }

  return Changed;
}

INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false)

FunctionPass *llvm::createX86CompressEVEXPass() {
  return new CompressEVEXPass();
}