1 //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file provides a hash table data structure suitable for incremental and 10 // distributed storage across a set of files. 11 // 12 // Multiple hash tables from different files are implicitly merged to improve 13 // performance, and on reload the merged table will override those from other 14 // files. 15 // 16 //===----------------------------------------------------------------------===// 17 18 #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 19 #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 20 21 #include "llvm/ADT/DenseMap.h" 22 #include "llvm/ADT/DenseSet.h" 23 #include "llvm/ADT/PointerUnion.h" 24 #include "llvm/ADT/STLExtras.h" 25 #include "llvm/ADT/SmallVector.h" 26 #include "llvm/ADT/TinyPtrVector.h" 27 #include "llvm/ADT/iterator_range.h" 28 #include "llvm/Support/Endian.h" 29 #include "llvm/Support/EndianStream.h" 30 #include "llvm/Support/OnDiskHashTable.h" 31 #include "llvm/Support/raw_ostream.h" 32 #include <algorithm> 33 #include <cstdint> 34 #include <vector> 35 36 namespace clang { 37 namespace serialization { 38 39 /// A collection of on-disk hash tables, merged when relevant for performance. 40 template<typename Info> class MultiOnDiskHashTable { 41 public: 42 /// A handle to a file, used when overriding tables. 43 using file_type = typename Info::file_type; 44 45 /// A pointer to an on-disk representation of the hash table. 46 using storage_type = const unsigned char *; 47 48 using external_key_type = typename Info::external_key_type; 49 using internal_key_type = typename Info::internal_key_type; 50 using data_type = typename Info::data_type; 51 using data_type_builder = typename Info::data_type_builder; 52 using hash_value_type = unsigned; 53 54 private: 55 /// The generator is permitted to read our merged table. 56 template<typename ReaderInfo, typename WriterInfo> 57 friend class MultiOnDiskHashTableGenerator; 58 59 /// A hash table stored on disk. 60 struct OnDiskTable { 61 using HashTable = llvm::OnDiskIterableChainedHashTable<Info>; 62 63 file_type File; 64 HashTable Table; 65 66 OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries, 67 storage_type Buckets, storage_type Payload, storage_type Base, 68 const Info &InfoObj) 69 : File(File), 70 Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {} 71 }; 72 73 struct MergedTable { 74 std::vector<file_type> Files; 75 llvm::DenseMap<internal_key_type, data_type> Data; 76 }; 77 78 using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>; 79 using TableVector = llvm::TinyPtrVector<void *>; 80 81 /// The current set of on-disk and merged tables. 82 /// We manually store the opaque value of the Table because TinyPtrVector 83 /// can't cope with holding a PointerUnion directly. 84 /// There can be at most one MergedTable in this vector, and if present, 85 /// it is the first table. 86 TableVector Tables; 87 88 /// Files corresponding to overridden tables that we've not yet 89 /// discarded. 90 llvm::TinyPtrVector<file_type> PendingOverrides; 91 92 struct AsOnDiskTable { 93 using result_type = OnDiskTable *; 94 95 result_type operator()(void *P) const { 96 return llvm::cast<OnDiskTable *>(Table::getFromOpaqueValue(P)); 97 } 98 }; 99 100 using table_iterator = 101 llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>; 102 using table_range = llvm::iterator_range<table_iterator>; 103 104 /// The current set of on-disk tables. 105 table_range tables() { 106 unsigned DropBegin = getMergedTable() ? 1 : 0; 107 return llvm::map_range(llvm::drop_begin(Tables, DropBegin), 108 AsOnDiskTable()); 109 } 110 111 MergedTable *getMergedTable() const { 112 // If we already have a merged table, it's the first one. 113 return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin()) 114 .template dyn_cast<MergedTable*>(); 115 } 116 117 /// Delete all our current on-disk tables. 118 void clear() { 119 for (auto *T : tables()) 120 delete T; 121 if (auto *M = getMergedTable()) 122 delete M; 123 Tables.clear(); 124 } 125 126 void removeOverriddenTables() { 127 llvm::DenseSet<file_type> Files; 128 Files.insert_range(PendingOverrides); 129 // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug. 130 auto ShouldRemove = [&Files](void *T) -> bool { 131 auto *ODT = llvm::cast<OnDiskTable *>(Table::getFromOpaqueValue(T)); 132 bool Remove = Files.count(ODT->File); 133 if (Remove) 134 delete ODT; 135 return Remove; 136 }; 137 Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(), 138 ShouldRemove), 139 Tables.end()); 140 PendingOverrides.clear(); 141 } 142 143 void condense() { 144 MergedTable *Merged = getMergedTable(); 145 if (!Merged) 146 Merged = new MergedTable; 147 148 // Read in all the tables and merge them together. 149 // FIXME: Be smarter about which tables we merge. 150 for (auto *ODT : tables()) { 151 auto &HT = ODT->Table; 152 Info &InfoObj = HT.getInfoObj(); 153 154 for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { 155 auto *LocalPtr = I.getItem(); 156 157 // FIXME: Don't rely on the OnDiskHashTable format here. 158 auto L = InfoObj.ReadKeyDataLength(LocalPtr); 159 const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); 160 data_type_builder ValueBuilder(Merged->Data[Key]); 161 InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, 162 ValueBuilder); 163 } 164 165 Merged->Files.push_back(ODT->File); 166 delete ODT; 167 } 168 169 Tables.clear(); 170 Tables.push_back(Table(Merged).getOpaqueValue()); 171 } 172 173 public: 174 MultiOnDiskHashTable() = default; 175 176 MultiOnDiskHashTable(MultiOnDiskHashTable &&O) 177 : Tables(std::move(O.Tables)), 178 PendingOverrides(std::move(O.PendingOverrides)) { 179 O.Tables.clear(); 180 } 181 182 MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) { 183 if (&O == this) 184 return *this; 185 clear(); 186 Tables = std::move(O.Tables); 187 O.Tables.clear(); 188 PendingOverrides = std::move(O.PendingOverrides); 189 return *this; 190 } 191 192 ~MultiOnDiskHashTable() { clear(); } 193 194 /// Add the table \p Data loaded from file \p File. 195 void add(file_type File, storage_type Data, Info InfoObj = Info()) { 196 using namespace llvm::support; 197 198 storage_type Ptr = Data; 199 200 uint32_t BucketOffset = 201 endian::readNext<uint32_t, llvm::endianness::little>(Ptr); 202 203 // Read the list of overridden files. 204 uint32_t NumFiles = 205 endian::readNext<uint32_t, llvm::endianness::little>(Ptr); 206 // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make 207 // an additional copy. 208 llvm::SmallVector<file_type, 16> OverriddenFiles; 209 OverriddenFiles.reserve(NumFiles); 210 for (/**/; NumFiles != 0; --NumFiles) 211 OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr)); 212 llvm::append_range(PendingOverrides, OverriddenFiles); 213 214 // Read the OnDiskChainedHashTable header. 215 storage_type Buckets = Data + BucketOffset; 216 auto NumBucketsAndEntries = 217 OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets); 218 219 // Register the table. 220 Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first, 221 NumBucketsAndEntries.second, 222 Buckets, Ptr, Data, std::move(InfoObj)); 223 Tables.push_back(NewTable.getOpaqueValue()); 224 } 225 226 /// Find and read the lookup results for \p EKey. 227 data_type find(const external_key_type &EKey) { 228 data_type Result; 229 230 if (!PendingOverrides.empty()) 231 removeOverriddenTables(); 232 233 if (Tables.size() > static_cast<unsigned>(Info::MaxTables)) 234 condense(); 235 236 internal_key_type Key = Info::GetInternalKey(EKey); 237 auto KeyHash = Info::ComputeHash(Key); 238 239 if (MergedTable *M = getMergedTable()) { 240 auto It = M->Data.find(Key); 241 if (It != M->Data.end()) 242 Result = It->second; 243 } 244 245 data_type_builder ResultBuilder(Result); 246 247 for (auto *ODT : tables()) { 248 auto &HT = ODT->Table; 249 auto It = HT.find_hashed(Key, KeyHash); 250 if (It != HT.end()) 251 HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(), 252 ResultBuilder); 253 } 254 255 return Result; 256 } 257 258 /// Read all the lookup results into a single value. This only makes 259 /// sense if merging values across keys is meaningful. 260 data_type findAll() { 261 data_type Result; 262 data_type_builder ResultBuilder(Result); 263 264 if (!PendingOverrides.empty()) 265 removeOverriddenTables(); 266 267 if (MergedTable *M = getMergedTable()) { 268 for (auto &KV : M->Data) 269 Info::MergeDataInto(KV.second, ResultBuilder); 270 } 271 272 for (auto *ODT : tables()) { 273 auto &HT = ODT->Table; 274 Info &InfoObj = HT.getInfoObj(); 275 for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { 276 auto *LocalPtr = I.getItem(); 277 278 // FIXME: Don't rely on the OnDiskHashTable format here. 279 auto L = InfoObj.ReadKeyDataLength(LocalPtr); 280 const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); 281 InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder); 282 } 283 } 284 285 return Result; 286 } 287 }; 288 289 /// Writer for the on-disk hash table. 290 template<typename ReaderInfo, typename WriterInfo> 291 class MultiOnDiskHashTableGenerator { 292 using BaseTable = MultiOnDiskHashTable<ReaderInfo>; 293 using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>; 294 295 Generator Gen; 296 297 public: 298 MultiOnDiskHashTableGenerator() : Gen() {} 299 300 void insert(typename WriterInfo::key_type_ref Key, 301 typename WriterInfo::data_type_ref Data, WriterInfo &Info) { 302 Gen.insert(Key, Data, Info); 303 } 304 305 void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info, 306 const BaseTable *Base) { 307 using namespace llvm::support; 308 309 llvm::raw_svector_ostream OutStream(Out); 310 311 // Write our header information. 312 { 313 endian::Writer Writer(OutStream, llvm::endianness::little); 314 315 // Reserve four bytes for the bucket offset. 316 Writer.write<uint32_t>(0); 317 318 if (auto *Merged = Base ? Base->getMergedTable() : nullptr) { 319 // Write list of overridden files. 320 Writer.write<uint32_t>(Merged->Files.size()); 321 for (const auto &F : Merged->Files) 322 Info.EmitFileRef(OutStream, F); 323 324 // Add all merged entries from Base to the generator. 325 for (auto &KV : Merged->Data) { 326 if (!Gen.contains(KV.first, Info)) 327 Gen.insert(KV.first, Info.ImportData(KV.second), Info); 328 } 329 } else { 330 Writer.write<uint32_t>(0); 331 } 332 } 333 334 // Write the table itself. 335 uint32_t BucketOffset = Gen.Emit(OutStream, Info); 336 337 // Replace the first four bytes with the bucket offset. 338 endian::write32le(Out.data(), BucketOffset); 339 } 340 }; 341 342 } // namespace serialization 343 } // namespace clang 344 345 #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 346