xref: /freebsd/contrib/llvm-project/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h (revision 59144db3fca192c4637637dfe6b5a5d98632cd47)
1 //===- llvm/MC/MCDisassembler.h - Disassembler interface --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
10 #define LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
11 
12 #include "llvm/ADT/StringRef.h"
13 #include "llvm/BinaryFormat/XCOFF.h"
14 #include "llvm/MC/MCDisassembler/MCSymbolizer.h"
15 #include <cstdint>
16 #include <memory>
17 #include <vector>
18 
19 namespace llvm {
20 
21 struct XCOFFSymbolInfoTy {
22   std::optional<XCOFF::StorageMappingClass> StorageMappingClass;
23   std::optional<uint32_t> Index;
24   bool IsLabel = false;
25   bool operator<(const XCOFFSymbolInfoTy &SymInfo) const;
26 };
27 
28 struct SymbolInfoTy {
29   uint64_t Addr;
30   StringRef Name;
31   // XCOFF uses XCOFFSymInfo. Other targets use Type.
32   XCOFFSymbolInfoTy XCOFFSymInfo;
33   uint8_t Type;
34   // Used by ELF to describe a mapping symbol that is usually not displayed.
35   bool IsMappingSymbol;
36 
37 private:
38   bool IsXCOFF;
39   bool HasType;
40 
41 public:
42   SymbolInfoTy(std::optional<XCOFF::StorageMappingClass> Smc, uint64_t Addr,
43                StringRef Name, std::optional<uint32_t> Idx, bool Label)
44       : Addr(Addr), Name(Name), XCOFFSymInfo{Smc, Idx, Label}, Type(0),
45         IsMappingSymbol(false), IsXCOFF(true), HasType(false) {}
46   SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type,
47                bool IsMappingSymbol = false, bool IsXCOFF = false)
48       : Addr(Addr), Name(Name), Type(Type), IsMappingSymbol(IsMappingSymbol),
49         IsXCOFF(IsXCOFF), HasType(true) {}
50   bool isXCOFF() const { return IsXCOFF; }
51 
52 private:
53   friend bool operator<(const SymbolInfoTy &P1, const SymbolInfoTy &P2) {
54     assert((P1.IsXCOFF == P2.IsXCOFF && P1.HasType == P2.HasType) &&
55            "The value of IsXCOFF and HasType in P1 and P2 should be the same "
56            "respectively.");
57 
58     if (P1.IsXCOFF && P1.HasType)
59       return std::tie(P1.Addr, P1.Type, P1.Name) <
60              std::tie(P2.Addr, P2.Type, P2.Name);
61 
62     if (P1.IsXCOFF)
63       return std::tie(P1.Addr, P1.XCOFFSymInfo, P1.Name) <
64              std::tie(P2.Addr, P2.XCOFFSymInfo, P2.Name);
65 
66     // With the same address, place mapping symbols first.
67     bool MS1 = !P1.IsMappingSymbol, MS2 = !P2.IsMappingSymbol;
68     return std::tie(P1.Addr, MS1, P1.Name, P1.Type) <
69            std::tie(P2.Addr, MS2, P2.Name, P2.Type);
70   }
71 };
72 
73 using SectionSymbolsTy = std::vector<SymbolInfoTy>;
74 
75 template <typename T> class ArrayRef;
76 class MCContext;
77 class MCInst;
78 class MCSubtargetInfo;
79 class raw_ostream;
80 
81 /// Superclass for all disassemblers. Consumes a memory region and provides an
82 /// array of assembly instructions.
83 class MCDisassembler {
84 public:
85   /// Ternary decode status. Most backends will just use Fail and
86   /// Success, however some have a concept of an instruction with
87   /// understandable semantics but which is architecturally
88   /// incorrect. An example of this is ARM UNPREDICTABLE instructions
89   /// which are disassemblable but cause undefined behaviour.
90   ///
91   /// Because it makes sense to disassemble these instructions, there
92   /// is a "soft fail" failure mode that indicates the MCInst& is
93   /// valid but architecturally incorrect.
94   ///
95   /// The enum numbers are deliberately chosen such that reduction
96   /// from Success->SoftFail ->Fail can be done with a simple
97   /// bitwise-AND:
98   ///
99   ///   LEFT & TOP =  | Success       Unpredictable   Fail
100   ///   --------------+-----------------------------------
101   ///   Success       | Success       Unpredictable   Fail
102   ///   Unpredictable | Unpredictable Unpredictable   Fail
103   ///   Fail          | Fail          Fail            Fail
104   ///
105   /// An easy way of encoding this is as 0b11, 0b01, 0b00 for
106   /// Success, SoftFail, Fail respectively.
107   enum DecodeStatus {
108     Fail = 0,
109     SoftFail = 1,
110     Success = 3
111   };
112 
113   MCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
114     : Ctx(Ctx), STI(STI) {}
115 
116   virtual ~MCDisassembler();
117 
118   /// Returns the disassembly of a single instruction.
119   ///
120   /// \param Instr    - An MCInst to populate with the contents of the
121   ///                   instruction.
122   /// \param Size     - A value to populate with the size of the instruction, or
123   ///                   the number of bytes consumed while attempting to decode
124   ///                   an invalid instruction.
125   /// \param Address  - The address, in the memory space of region, of the first
126   ///                   byte of the instruction.
127   /// \param Bytes    - A reference to the actual bytes of the instruction.
128   /// \param CStream  - The stream to print comments and annotations on.
129   /// \return         - MCDisassembler::Success if the instruction is valid,
130   ///                   MCDisassembler::SoftFail if the instruction was
131   ///                                            disassemblable but invalid,
132   ///                   MCDisassembler::Fail if the instruction was invalid.
133   virtual DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
134                                       ArrayRef<uint8_t> Bytes, uint64_t Address,
135                                       raw_ostream &CStream) const = 0;
136 
137   /// Used to perform separate target specific disassembly for a particular
138   /// symbol. May parse any prelude that precedes instructions after the
139   /// start of a symbol, or the entire symbol.
140   /// This is used for example by WebAssembly to decode preludes.
141   ///
142   /// Base implementation returns std::nullopt. So all targets by default ignore
143   /// to treat symbols separately.
144   ///
145   /// \param Symbol   - The symbol.
146   /// \param Size     - The number of bytes consumed.
147   /// \param Address  - The address, in the memory space of region, of the first
148   ///                   byte of the symbol.
149   /// \param Bytes    - A reference to the actual bytes at the symbol location.
150   /// \param CStream  - The stream to print comments and annotations on.
151   /// \return         - MCDisassembler::Success if bytes are decoded
152   ///                   successfully. Size must hold the number of bytes that
153   ///                   were decoded.
154   ///                 - MCDisassembler::Fail if the bytes are invalid. Size
155   ///                   must hold the number of bytes that were decoded before
156   ///                   failing. The target must print nothing. This can be
157   ///                   done by buffering the output if needed.
158   ///                 - std::nullopt if the target doesn't want to handle the
159   ///                   symbol separately. Value of Size is ignored in this
160   ///                   case.
161   virtual std::optional<DecodeStatus>
162   onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, ArrayRef<uint8_t> Bytes,
163                 uint64_t Address, raw_ostream &CStream) const;
164   // TODO:
165   // Implement similar hooks that can be used at other points during
166   // disassembly. Something along the following lines:
167   // - onBeforeInstructionDecode()
168   // - onAfterInstructionDecode()
169   // - onSymbolEnd()
170   // It should help move much of the target specific code from llvm-objdump to
171   // respective target disassemblers.
172 
173   /// Suggest a distance to skip in a buffer of data to find the next
174   /// place to look for the start of an instruction. For example, if
175   /// all instructions have a fixed alignment, this might advance to
176   /// the next multiple of that alignment.
177   ///
178   /// If not overridden, the default is 1.
179   ///
180   /// \param Address  - The address, in the memory space of region, of the
181   ///                   starting point (typically the first byte of something
182   ///                   that did not decode as a valid instruction at all).
183   /// \param Bytes    - A reference to the actual bytes at Address. May be
184   ///                   needed in order to determine the width of an
185   ///                   unrecognized instruction (e.g. in Thumb this is a simple
186   ///                   consistent criterion that doesn't require knowing the
187   ///                   specific instruction). The caller can pass as much data
188   ///                   as they have available, and the function is required to
189   ///                   make a reasonable default choice if not enough data is
190   ///                   available to make a better one.
191   /// \return         - A number of bytes to skip. Must always be greater than
192   ///                   zero. May be greater than the size of Bytes.
193   virtual uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
194                                       uint64_t Address) const;
195 
196 private:
197   MCContext &Ctx;
198 
199 protected:
200   // Subtarget information, for instruction decoding predicates if required.
201   const MCSubtargetInfo &STI;
202   std::unique_ptr<MCSymbolizer> Symbolizer;
203 
204 public:
205   // Helpers around MCSymbolizer
206   bool tryAddingSymbolicOperand(MCInst &Inst, int64_t Value, uint64_t Address,
207                                 bool IsBranch, uint64_t Offset, uint64_t OpSize,
208                                 uint64_t InstSize) const;
209 
210   void tryAddingPcLoadReferenceComment(int64_t Value, uint64_t Address) const;
211 
212   /// Set \p Symzer as the current symbolizer.
213   /// This takes ownership of \p Symzer, and deletes the previously set one.
214   void setSymbolizer(std::unique_ptr<MCSymbolizer> Symzer);
215 
216   MCContext& getContext() const { return Ctx; }
217 
218   const MCSubtargetInfo& getSubtargetInfo() const { return STI; }
219 
220   // Marked mutable because we cache it inside the disassembler, rather than
221   // having to pass it around as an argument through all the autogenerated code.
222   mutable raw_ostream *CommentStream = nullptr;
223 };
224 
225 } // end namespace llvm
226 
227 #endif // LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
228