1 //===- DataAccessProf.h - Data access profile format support ---------*- C++ 2 //-*-===// 3 // 4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5 // See https://llvm.org/LICENSE.txt for license information. 6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file contains support to construct and use data access profiles. 11 // 12 // For the original RFC of this pass please see 13 // https://discourse.llvm.org/t/rfc-profile-guided-static-data-partitioning/83744 14 // 15 //===----------------------------------------------------------------------===// 16 17 #ifndef LLVM_PROFILEDATA_DATAACCESSPROF_H_ 18 #define LLVM_PROFILEDATA_DATAACCESSPROF_H_ 19 20 #include "llvm/ADT/DenseMapInfoVariant.h" 21 #include "llvm/ADT/MapVector.h" 22 #include "llvm/ADT/SetVector.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/ADT/StringRef.h" 25 #include "llvm/ProfileData/InstrProf.h" 26 #include "llvm/Support/Allocator.h" 27 #include "llvm/Support/Compiler.h" 28 #include "llvm/Support/Error.h" 29 #include "llvm/Support/StringSaver.h" 30 31 #include <cstdint> 32 #include <optional> 33 #include <variant> 34 35 namespace llvm { 36 37 namespace memprof { 38 39 /// The location of data in the source code. Used by profile lookup API. 40 struct SourceLocation { SourceLocationSourceLocation41 SourceLocation(StringRef FileNameRef, uint32_t Line) 42 : FileName(FileNameRef.str()), Line(Line) {} 43 44 // Empty constructor is used in yaml conversion. SourceLocationSourceLocation45 SourceLocation() {} 46 /// The filename where the data is located. 47 std::string FileName; 48 /// The line number in the source code. 49 uint32_t Line; 50 }; 51 52 namespace internal { 53 54 // Conceptually similar to SourceLocation except that FileNames are StringRef of 55 // which strings are owned by `DataAccessProfData`. Used by `DataAccessProfData` 56 // to represent data locations internally. 57 struct SourceLocationRef { SourceLocationRefSourceLocationRef58 SourceLocationRef(StringRef FileNameRef, uint32_t Line) 59 : FileName(FileNameRef), Line(Line) {} 60 // The filename where the data is located. 61 StringRef FileName; 62 // The line number in the source code. 63 uint32_t Line; 64 }; 65 66 // The data access profiles for a symbol. Used by `DataAccessProfData` 67 // to represent records internally. 68 struct DataAccessProfRecordRef { DataAccessProfRecordRefDataAccessProfRecordRef69 DataAccessProfRecordRef(uint64_t SymbolID, uint64_t AccessCount, 70 bool IsStringLiteral) 71 : SymbolID(SymbolID), AccessCount(AccessCount), 72 IsStringLiteral(IsStringLiteral) {} 73 74 // Represents a data symbol. The semantic comes in two forms: a symbol index 75 // for symbol name if `IsStringLiteral` is false, or the hash of a string 76 // content if `IsStringLiteral` is true. For most of the symbolizable static 77 // data, the mangled symbol names remain stable relative to the source code 78 // and therefore used to identify symbols across binary releases. String 79 // literals have unstable name patterns like `.str.N[.llvm.hash]`, so we use 80 // the content hash instead. This is a required field. 81 uint64_t SymbolID; 82 83 // The access count of symbol. Required. 84 uint64_t AccessCount; 85 86 // True iff this is a record for string literal (symbols with name pattern 87 // `.str.*` in the symbol table). Required. 88 bool IsStringLiteral; 89 90 // The locations of data in the source code. Optional. 91 llvm::SmallVector<SourceLocationRef, 0> Locations; 92 }; 93 } // namespace internal 94 95 // SymbolID is either a string representing symbol name if the symbol has 96 // stable mangled name relative to source code, or a uint64_t representing the 97 // content hash of a string literal (with unstable name patterns like 98 // `.str.N[.llvm.hash]`). The StringRef is owned by the class's saver object. 99 using SymbolHandleRef = std::variant<StringRef, uint64_t>; 100 101 // The senamtic is the same as `SymbolHandleRef` above. The strings are owned. 102 using SymbolHandle = std::variant<std::string, uint64_t>; 103 104 /// The data access profiles for a symbol. 105 struct DataAccessProfRecord { 106 public: DataAccessProfRecordDataAccessProfRecord107 DataAccessProfRecord(SymbolHandleRef SymHandleRef, uint64_t AccessCount, 108 ArrayRef<internal::SourceLocationRef> LocRefs) 109 : AccessCount(AccessCount) { 110 if (std::holds_alternative<StringRef>(SymHandleRef)) { 111 SymHandle = std::get<StringRef>(SymHandleRef).str(); 112 } else 113 SymHandle = std::get<uint64_t>(SymHandleRef); 114 115 for (auto Loc : LocRefs) 116 Locations.emplace_back(Loc.FileName, Loc.Line); 117 } 118 // Empty constructor is used in yaml conversion. DataAccessProfRecordDataAccessProfRecord119 DataAccessProfRecord() : AccessCount(0) {} 120 SymbolHandle SymHandle; 121 uint64_t AccessCount; 122 // The locations of data in the source code. Optional. 123 SmallVector<SourceLocation> Locations; 124 }; 125 126 /// Encapsulates the data access profile data and the methods to operate on 127 /// it. This class provides profile look-up, serialization and 128 /// deserialization. 129 class DataAccessProfData { 130 public: 131 // Use MapVector to keep input order of strings for serialization and 132 // deserialization. 133 using StringToIndexMap = llvm::MapVector<StringRef, uint64_t>; 134 DataAccessProfData()135 DataAccessProfData() : Saver(Allocator) {} 136 137 /// Serialize profile data to the output stream. 138 /// Storage layout: 139 /// - Serialized strings. 140 /// - The encoded hashes. 141 /// - Records. 142 LLVM_ABI Error serialize(ProfOStream &OS) const; 143 144 /// Deserialize this class from the given buffer. 145 LLVM_ABI Error deserialize(const unsigned char *&Ptr); 146 147 /// Returns a profile record for \p SymbolID, or std::nullopt if there 148 /// isn't a record. Internally, this function will canonicalize the symbol 149 /// name before the lookup. 150 LLVM_ABI std::optional<DataAccessProfRecord> 151 getProfileRecord(const SymbolHandleRef SymID) const; 152 153 /// Returns true if \p SymID is seen in profiled binaries and cold. 154 LLVM_ABI bool isKnownColdSymbol(const SymbolHandleRef SymID) const; 155 156 /// Methods to set symbolized data access profile. Returns error if 157 /// duplicated symbol names or content hashes are seen. The user of this 158 /// class should aggregate counters that correspond to the same symbol name 159 /// or with the same string literal hash before calling 'set*' methods. 160 LLVM_ABI Error setDataAccessProfile(SymbolHandleRef SymbolID, 161 uint64_t AccessCount); 162 /// Similar to the method above, for records with \p Locations representing 163 /// the `filename:line` where this symbol shows up. Note because of linker's 164 /// merge of identical symbols (e.g., unnamed_addr string literals), one 165 /// symbol is likely to have multiple locations. 166 LLVM_ABI Error setDataAccessProfile(SymbolHandleRef SymbolID, 167 uint64_t AccessCount, 168 ArrayRef<SourceLocation> Locations); 169 /// Add a symbol that's seen in the profiled binary without samples. 170 LLVM_ABI Error addKnownSymbolWithoutSamples(SymbolHandleRef SymbolID); 171 172 /// The following methods return array reference for various internal data 173 /// structures. getStrToIndexMapRef()174 ArrayRef<StringToIndexMap::value_type> getStrToIndexMapRef() const { 175 return StrToIndexMap.getArrayRef(); 176 } 177 ArrayRef< 178 MapVector<SymbolHandleRef, internal::DataAccessProfRecordRef>::value_type> getRecords()179 getRecords() const { 180 return Records.getArrayRef(); 181 } getKnownColdSymbols()182 ArrayRef<StringRef> getKnownColdSymbols() const { 183 return KnownColdSymbols.getArrayRef(); 184 } getKnownColdHashes()185 ArrayRef<uint64_t> getKnownColdHashes() const { 186 return KnownColdHashes.getArrayRef(); 187 } 188 189 private: 190 /// Serialize the symbol strings into the output stream. 191 Error serializeSymbolsAndFilenames(ProfOStream &OS) const; 192 193 /// Deserialize the symbol strings from \p Ptr and increment \p Ptr to the 194 /// start of the next payload. 195 Error deserializeSymbolsAndFilenames(const unsigned char *&Ptr, 196 const uint64_t NumSampledSymbols, 197 const uint64_t NumColdKnownSymbols); 198 199 /// Decode the records and increment \p Ptr to the start of the next 200 /// payload. 201 Error deserializeRecords(const unsigned char *&Ptr); 202 203 /// A helper function to compute a storage index for \p SymbolID. 204 uint64_t getEncodedIndex(const SymbolHandleRef SymbolID) const; 205 206 // Keeps owned copies of the input strings. 207 // NOTE: Keep `Saver` initialized before other class members that reference 208 // its string copies and destructed after they are destructed. 209 llvm::BumpPtrAllocator Allocator; 210 llvm::UniqueStringSaver Saver; 211 212 // `Records` stores the records. 213 MapVector<SymbolHandleRef, internal::DataAccessProfRecordRef> Records; 214 215 StringToIndexMap StrToIndexMap; 216 llvm::SetVector<uint64_t> KnownColdHashes; 217 llvm::SetVector<StringRef> KnownColdSymbols; 218 }; 219 220 } // namespace memprof 221 } // namespace llvm 222 223 #endif // LLVM_PROFILEDATA_DATAACCESSPROF_H_ 224