xref: /freebsd/contrib/llvm-project/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 //===- llvm/MC/MCDisassembler.h - Disassembler interface --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
10 #define LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
11 
12 #include "llvm/ADT/StringRef.h"
13 #include "llvm/BinaryFormat/XCOFF.h"
14 #include "llvm/MC/MCDisassembler/MCSymbolizer.h"
15 #include "llvm/Support/Error.h"
16 #include <cstdint>
17 #include <memory>
18 #include <vector>
19 
20 namespace llvm {
21 
/// XCOFF-specific attributes of a symbol. An instance is embedded in
/// SymbolInfoTy and takes part in the ordering of XCOFF symbols that carry no
/// Type.
///
/// This is a plain aggregate: SymbolInfoTy brace-initializes it positionally
/// as {storage mapping class, index, is-label}, so the order of the data
/// members below must not be changed.
struct XCOFFSymbolInfoTy {
  // Storage mapping class of the symbol; empty when none was supplied.
  std::optional<XCOFF::StorageMappingClass> StorageMappingClass;
  // Index of the symbol; empty when none was supplied.
  // NOTE(review): presumably the index into the XCOFF symbol table - confirm.
  std::optional<uint32_t> Index;
  // Whether the symbol is a label. Defaults to false.
  bool IsLabel = false;
  // Ordering between two XCOFF symbol descriptions. Only declared here; the
  // definition (and therefore the exact ordering rule) lives out of line.
  bool operator<(const XCOFFSymbolInfoTy &SymInfo) const;
};
28 
29 struct SymbolInfoTy {
30   uint64_t Addr;
31   StringRef Name;
32   // XCOFF uses XCOFFSymInfo. Other targets use Type.
33   XCOFFSymbolInfoTy XCOFFSymInfo;
34   uint8_t Type;
35   // Used by ELF to describe a mapping symbol that is usually not displayed.
36   bool IsMappingSymbol;
37 
38 private:
39   bool IsXCOFF;
40   bool HasType;
41 
42 public:
43   SymbolInfoTy(std::optional<XCOFF::StorageMappingClass> Smc, uint64_t Addr,
44                StringRef Name, std::optional<uint32_t> Idx, bool Label)
45       : Addr(Addr), Name(Name), XCOFFSymInfo{Smc, Idx, Label}, Type(0),
46         IsMappingSymbol(false), IsXCOFF(true), HasType(false) {}
47   SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type,
48                bool IsMappingSymbol = false, bool IsXCOFF = false)
49       : Addr(Addr), Name(Name), Type(Type), IsMappingSymbol(IsMappingSymbol),
50         IsXCOFF(IsXCOFF), HasType(true) {}
51   bool isXCOFF() const { return IsXCOFF; }
52 
53 private:
54   friend bool operator<(const SymbolInfoTy &P1, const SymbolInfoTy &P2) {
55     assert((P1.IsXCOFF == P2.IsXCOFF && P1.HasType == P2.HasType) &&
56            "The value of IsXCOFF and HasType in P1 and P2 should be the same "
57            "respectively.");
58 
59     if (P1.IsXCOFF && P1.HasType)
60       return std::tie(P1.Addr, P1.Type, P1.Name) <
61              std::tie(P2.Addr, P2.Type, P2.Name);
62 
63     if (P1.IsXCOFF)
64       return std::tie(P1.Addr, P1.XCOFFSymInfo, P1.Name) <
65              std::tie(P2.Addr, P2.XCOFFSymInfo, P2.Name);
66 
67     // With the same address, place mapping symbols first.
68     bool MS1 = !P1.IsMappingSymbol, MS2 = !P2.IsMappingSymbol;
69     return std::tie(P1.Addr, MS1, P1.Name, P1.Type) <
70            std::tie(P2.Addr, MS2, P2.Name, P2.Type);
71   }
72 };
73 
/// A list of SymbolInfoTy entries (going by the name, the symbols that belong
/// to one section).
using SectionSymbolsTy = std::vector<SymbolInfoTy>;

// Forward declarations. The declarations that follow only name these types in
// function signatures or through references/pointers, so their full
// definitions are not needed in this header.
template <typename T> class ArrayRef;
class MCContext;
class MCInst;
class MCSubtargetInfo;
class raw_ostream;
81 
82 /// Superclass for all disassemblers. Consumes a memory region and provides an
83 /// array of assembly instructions.
class MCDisassembler {
public:
  /// Ternary decode status. Most backends will just use Fail and
  /// Success, however some have a concept of an instruction with
  /// understandable semantics but which is architecturally
  /// incorrect. An example of this is ARM UNPREDICTABLE instructions
  /// which are disassemblable but cause undefined behaviour.
  ///
  /// Because it makes sense to disassemble these instructions, there
  /// is a "soft fail" failure mode that indicates the MCInst& is
  /// valid but architecturally incorrect.
  ///
  /// The enum numbers are deliberately chosen such that reduction
  /// from Success -> SoftFail -> Fail can be done with a simple
  /// bitwise-AND:
  ///
  ///   LEFT & TOP =  | Success       SoftFail        Fail
  ///   --------------+-----------------------------------
  ///   Success       | Success       SoftFail        Fail
  ///   SoftFail      | SoftFail      SoftFail        Fail
  ///   Fail          | Fail          Fail            Fail
  ///
  /// An easy way of encoding this is as 0b11, 0b01, 0b00 for
  /// Success, SoftFail, Fail respectively.
  enum DecodeStatus {
    Fail = 0,
    SoftFail = 1,
    Success = 3
  };

  /// Creates a disassembler bound to \p STI and \p Ctx. Both are held by
  /// reference rather than copied, so the caller must keep them alive for as
  /// long as this object is used.
  MCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
    : Ctx(Ctx), STI(STI) {}

  virtual ~MCDisassembler();

  /// Returns the disassembly of a single instruction. Every concrete
  /// disassembler must implement this.
  ///
  /// \param Instr    - An MCInst to populate with the contents of the
  ///                   instruction.
  /// \param Size     - A value to populate with the size of the instruction, or
  ///                   the number of bytes consumed while attempting to decode
  ///                   an invalid instruction.
  /// \param Bytes    - A reference to the actual bytes of the instruction.
  /// \param Address  - The address, in the memory space of region, of the first
  ///                   byte of the instruction.
  /// \param CStream  - The stream to print comments and annotations on.
  /// \return         - MCDisassembler::Success if the instruction is valid,
  ///                   MCDisassembler::SoftFail if the instruction was
  ///                                            disassemblable but invalid,
  ///                   MCDisassembler::Fail if the instruction was invalid.
  virtual DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
                                      ArrayRef<uint8_t> Bytes, uint64_t Address,
                                      raw_ostream &CStream) const = 0;

  /// Used to perform separate target specific disassembly for a particular
  /// symbol. May parse any prelude that precedes instructions after the
  /// start of a symbol, or the entire symbol.
  /// This is used for example by WebAssembly to decode preludes.
  ///
  /// Base implementation returns false. So all targets by default decline to
  /// treat symbols separately.
  ///
  /// \param Symbol   - The symbol.
  /// \param Size     - The number of bytes consumed.
  /// \param Bytes    - A reference to the actual bytes at the symbol location.
  /// \param Address  - The address, in the memory space of region, of the first
  ///                   byte of the symbol.
  /// \return         - True if this symbol triggered some target specific
  ///                   disassembly for this symbol. Size must be set with the
  ///                   number of bytes consumed.
  ///                 - Error if this symbol triggered some target specific
  ///                   disassembly for this symbol, but an error was found with
  ///                   it. Size must be set with the number of bytes consumed.
  ///                 - False if the target doesn't want to handle the symbol
  ///                   separately. The value of Size is ignored in this case,
  ///                   and no Error may be returned.
  virtual Expected<bool> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
                                       ArrayRef<uint8_t> Bytes,
                                       uint64_t Address) const;
  // TODO:
  // Implement similar hooks that can be used at other points during
  // disassembly. Something along the following lines:
  // - onBeforeInstructionDecode()
  // - onAfterInstructionDecode()
  // - onSymbolEnd()
  // It should help move much of the target specific code from llvm-objdump to
  // respective target disassemblers.

  /// Suggest a distance to skip in a buffer of data to find the next
  /// place to look for the start of an instruction. For example, if
  /// all instructions have a fixed alignment, this might advance to
  /// the next multiple of that alignment.
  ///
  /// If not overridden, the default is 1.
  ///
  /// \param Bytes    - A reference to the actual bytes at Address. May be
  ///                   needed in order to determine the width of an
  ///                   unrecognized instruction (e.g. in Thumb this is a simple
  ///                   consistent criterion that doesn't require knowing the
  ///                   specific instruction). The caller can pass as much data
  ///                   as they have available, and the function is required to
  ///                   make a reasonable default choice if not enough data is
  ///                   available to make a better one.
  /// \param Address  - The address, in the memory space of region, of the
  ///                   starting point (typically the first byte of something
  ///                   that did not decode as a valid instruction at all).
  /// \return         - A number of bytes to skip. Must always be greater than
  ///                   zero. May be greater than the size of Bytes.
  virtual uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
                                      uint64_t Address) const;

private:
  // The context passed to the constructor; exposed through getContext().
  MCContext &Ctx;

protected:
  // Subtarget information, for instruction decoding predicates if required.
  const MCSubtargetInfo &STI;
  // The currently installed symbolizer, owned by this object. Replaced via
  // setSymbolizer().
  std::unique_ptr<MCSymbolizer> Symbolizer;

public:
  // Helpers around MCSymbolizer. Both are only declared here; their
  // definitions are out of line.
  // NOTE(review): presumably they forward to Symbolizer when one is set -
  // confirm against the implementation.
  bool tryAddingSymbolicOperand(MCInst &Inst, int64_t Value, uint64_t Address,
                                bool IsBranch, uint64_t Offset, uint64_t OpSize,
                                uint64_t InstSize) const;

  void tryAddingPcLoadReferenceComment(int64_t Value, uint64_t Address) const;

  /// Set \p Symzer as the current symbolizer.
  /// This takes ownership of \p Symzer, and deletes the previously set one.
  void setSymbolizer(std::unique_ptr<MCSymbolizer> Symzer);

  /// Returns the context this disassembler was constructed with. The
  /// reference is non-const even though the accessor itself is const.
  MCContext& getContext() const { return Ctx; }

  /// Returns the subtarget information this disassembler was constructed with.
  const MCSubtargetInfo& getSubtargetInfo() const { return STI; }

  /// ELF-specific, set the ABI version from the object header.
  /// The default implementation ignores \p Version; targets that care
  /// override it.
  virtual void setABIVersion(unsigned Version) {}

  // Marked mutable because we cache it inside the disassembler, rather than
  // having to pass it around as an argument through all the autogenerated code.
  // Starts out null.
  mutable raw_ostream *CommentStream = nullptr;
};
226 
227 } // end namespace llvm
228 
229 #endif // LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
230