10b57cec5SDimitry Andric //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric // This file provides a hash table data structure suitable for incremental and 100b57cec5SDimitry Andric // distributed storage across a set of files. 110b57cec5SDimitry Andric // 120b57cec5SDimitry Andric // Multiple hash tables from different files are implicitly merged to improve 130b57cec5SDimitry Andric // performance, and on reload the merged table will override those from other 140b57cec5SDimitry Andric // files. 150b57cec5SDimitry Andric // 160b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 170b57cec5SDimitry Andric 180b57cec5SDimitry Andric #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 190b57cec5SDimitry Andric #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 200b57cec5SDimitry Andric 210b57cec5SDimitry Andric #include "llvm/ADT/DenseMap.h" 220b57cec5SDimitry Andric #include "llvm/ADT/DenseSet.h" 230b57cec5SDimitry Andric #include "llvm/ADT/PointerUnion.h" 240b57cec5SDimitry Andric #include "llvm/ADT/STLExtras.h" 250b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h" 260b57cec5SDimitry Andric #include "llvm/ADT/TinyPtrVector.h" 270b57cec5SDimitry Andric #include "llvm/ADT/iterator_range.h" 280b57cec5SDimitry Andric #include "llvm/Support/Endian.h" 290b57cec5SDimitry Andric #include "llvm/Support/EndianStream.h" 300b57cec5SDimitry Andric #include "llvm/Support/OnDiskHashTable.h" 310b57cec5SDimitry Andric #include "llvm/Support/raw_ostream.h" 320b57cec5SDimitry Andric #include <algorithm> 330b57cec5SDimitry Andric #include <cstdint> 340b57cec5SDimitry Andric #include <vector> 350b57cec5SDimitry Andric 360b57cec5SDimitry Andric namespace clang { 370b57cec5SDimitry Andric namespace serialization { 380b57cec5SDimitry Andric 390b57cec5SDimitry Andric /// A collection of on-disk hash tables, merged when relevant for performance. 400b57cec5SDimitry Andric template<typename Info> class MultiOnDiskHashTable { 410b57cec5SDimitry Andric public: 420b57cec5SDimitry Andric /// A handle to a file, used when overriding tables. 430b57cec5SDimitry Andric using file_type = typename Info::file_type; 440b57cec5SDimitry Andric 450b57cec5SDimitry Andric /// A pointer to an on-disk representation of the hash table. 460b57cec5SDimitry Andric using storage_type = const unsigned char *; 470b57cec5SDimitry Andric 480b57cec5SDimitry Andric using external_key_type = typename Info::external_key_type; 490b57cec5SDimitry Andric using internal_key_type = typename Info::internal_key_type; 500b57cec5SDimitry Andric using data_type = typename Info::data_type; 510b57cec5SDimitry Andric using data_type_builder = typename Info::data_type_builder; 520b57cec5SDimitry Andric using hash_value_type = unsigned; 530b57cec5SDimitry Andric 540b57cec5SDimitry Andric private: 550b57cec5SDimitry Andric /// The generator is permitted to read our merged table. 560b57cec5SDimitry Andric template<typename ReaderInfo, typename WriterInfo> 570b57cec5SDimitry Andric friend class MultiOnDiskHashTableGenerator; 580b57cec5SDimitry Andric 590b57cec5SDimitry Andric /// A hash table stored on disk. 600b57cec5SDimitry Andric struct OnDiskTable { 610b57cec5SDimitry Andric using HashTable = llvm::OnDiskIterableChainedHashTable<Info>; 620b57cec5SDimitry Andric 630b57cec5SDimitry Andric file_type File; 640b57cec5SDimitry Andric HashTable Table; 650b57cec5SDimitry Andric 660b57cec5SDimitry Andric OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries, 670b57cec5SDimitry Andric storage_type Buckets, storage_type Payload, storage_type Base, 680b57cec5SDimitry Andric const Info &InfoObj) 690b57cec5SDimitry Andric : File(File), 700b57cec5SDimitry Andric Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {} 710b57cec5SDimitry Andric }; 720b57cec5SDimitry Andric 730b57cec5SDimitry Andric struct MergedTable { 740b57cec5SDimitry Andric std::vector<file_type> Files; 750b57cec5SDimitry Andric llvm::DenseMap<internal_key_type, data_type> Data; 760b57cec5SDimitry Andric }; 770b57cec5SDimitry Andric 780b57cec5SDimitry Andric using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>; 790b57cec5SDimitry Andric using TableVector = llvm::TinyPtrVector<void *>; 800b57cec5SDimitry Andric 810b57cec5SDimitry Andric /// The current set of on-disk and merged tables. 820b57cec5SDimitry Andric /// We manually store the opaque value of the Table because TinyPtrVector 830b57cec5SDimitry Andric /// can't cope with holding a PointerUnion directly. 840b57cec5SDimitry Andric /// There can be at most one MergedTable in this vector, and if present, 850b57cec5SDimitry Andric /// it is the first table. 860b57cec5SDimitry Andric TableVector Tables; 870b57cec5SDimitry Andric 880b57cec5SDimitry Andric /// Files corresponding to overridden tables that we've not yet 890b57cec5SDimitry Andric /// discarded. 900b57cec5SDimitry Andric llvm::TinyPtrVector<file_type> PendingOverrides; 910b57cec5SDimitry Andric 920b57cec5SDimitry Andric struct AsOnDiskTable { 930b57cec5SDimitry Andric using result_type = OnDiskTable *; 940b57cec5SDimitry Andric 950b57cec5SDimitry Andric result_type operator()(void *P) const { 960b57cec5SDimitry Andric return Table::getFromOpaqueValue(P).template get<OnDiskTable *>(); 970b57cec5SDimitry Andric } 980b57cec5SDimitry Andric }; 990b57cec5SDimitry Andric 1000b57cec5SDimitry Andric using table_iterator = 1010b57cec5SDimitry Andric llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>; 1020b57cec5SDimitry Andric using table_range = llvm::iterator_range<table_iterator>; 1030b57cec5SDimitry Andric 1040b57cec5SDimitry Andric /// The current set of on-disk tables. 1050b57cec5SDimitry Andric table_range tables() { 1060b57cec5SDimitry Andric auto Begin = Tables.begin(), End = Tables.end(); 1070b57cec5SDimitry Andric if (getMergedTable()) 1080b57cec5SDimitry Andric ++Begin; 1090b57cec5SDimitry Andric return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()), 1100b57cec5SDimitry Andric llvm::map_iterator(End, AsOnDiskTable())); 1110b57cec5SDimitry Andric } 1120b57cec5SDimitry Andric 1130b57cec5SDimitry Andric MergedTable *getMergedTable() const { 1140b57cec5SDimitry Andric // If we already have a merged table, it's the first one. 1150b57cec5SDimitry Andric return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin()) 1160b57cec5SDimitry Andric .template dyn_cast<MergedTable*>(); 1170b57cec5SDimitry Andric } 1180b57cec5SDimitry Andric 1190b57cec5SDimitry Andric /// Delete all our current on-disk tables. 1200b57cec5SDimitry Andric void clear() { 1210b57cec5SDimitry Andric for (auto *T : tables()) 1220b57cec5SDimitry Andric delete T; 1230b57cec5SDimitry Andric if (auto *M = getMergedTable()) 1240b57cec5SDimitry Andric delete M; 1250b57cec5SDimitry Andric Tables.clear(); 1260b57cec5SDimitry Andric } 1270b57cec5SDimitry Andric 1280b57cec5SDimitry Andric void removeOverriddenTables() { 1290b57cec5SDimitry Andric llvm::DenseSet<file_type> Files; 1300b57cec5SDimitry Andric Files.insert(PendingOverrides.begin(), PendingOverrides.end()); 1310b57cec5SDimitry Andric // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug. 1320b57cec5SDimitry Andric auto ShouldRemove = [&Files](void *T) -> bool { 1330b57cec5SDimitry Andric auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>(); 1340b57cec5SDimitry Andric bool Remove = Files.count(ODT->File); 1350b57cec5SDimitry Andric if (Remove) 1360b57cec5SDimitry Andric delete ODT; 1370b57cec5SDimitry Andric return Remove; 1380b57cec5SDimitry Andric }; 1390b57cec5SDimitry Andric Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(), 1400b57cec5SDimitry Andric ShouldRemove), 1410b57cec5SDimitry Andric Tables.end()); 1420b57cec5SDimitry Andric PendingOverrides.clear(); 1430b57cec5SDimitry Andric } 1440b57cec5SDimitry Andric 1450b57cec5SDimitry Andric void condense() { 1460b57cec5SDimitry Andric MergedTable *Merged = getMergedTable(); 1470b57cec5SDimitry Andric if (!Merged) 1480b57cec5SDimitry Andric Merged = new MergedTable; 1490b57cec5SDimitry Andric 1500b57cec5SDimitry Andric // Read in all the tables and merge them together. 1510b57cec5SDimitry Andric // FIXME: Be smarter about which tables we merge. 1520b57cec5SDimitry Andric for (auto *ODT : tables()) { 1530b57cec5SDimitry Andric auto &HT = ODT->Table; 1540b57cec5SDimitry Andric Info &InfoObj = HT.getInfoObj(); 1550b57cec5SDimitry Andric 1560b57cec5SDimitry Andric for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { 1570b57cec5SDimitry Andric auto *LocalPtr = I.getItem(); 1580b57cec5SDimitry Andric 1590b57cec5SDimitry Andric // FIXME: Don't rely on the OnDiskHashTable format here. 1600b57cec5SDimitry Andric auto L = InfoObj.ReadKeyDataLength(LocalPtr); 1610b57cec5SDimitry Andric const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); 1620b57cec5SDimitry Andric data_type_builder ValueBuilder(Merged->Data[Key]); 1630b57cec5SDimitry Andric InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, 1640b57cec5SDimitry Andric ValueBuilder); 1650b57cec5SDimitry Andric } 1660b57cec5SDimitry Andric 1670b57cec5SDimitry Andric Merged->Files.push_back(ODT->File); 1680b57cec5SDimitry Andric delete ODT; 1690b57cec5SDimitry Andric } 1700b57cec5SDimitry Andric 1710b57cec5SDimitry Andric Tables.clear(); 1720b57cec5SDimitry Andric Tables.push_back(Table(Merged).getOpaqueValue()); 1730b57cec5SDimitry Andric } 1740b57cec5SDimitry Andric 1750b57cec5SDimitry Andric public: 1760b57cec5SDimitry Andric MultiOnDiskHashTable() = default; 1770b57cec5SDimitry Andric 1780b57cec5SDimitry Andric MultiOnDiskHashTable(MultiOnDiskHashTable &&O) 1790b57cec5SDimitry Andric : Tables(std::move(O.Tables)), 1800b57cec5SDimitry Andric PendingOverrides(std::move(O.PendingOverrides)) { 1810b57cec5SDimitry Andric O.Tables.clear(); 1820b57cec5SDimitry Andric } 1830b57cec5SDimitry Andric 1840b57cec5SDimitry Andric MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) { 1850b57cec5SDimitry Andric if (&O == this) 1860b57cec5SDimitry Andric return *this; 1870b57cec5SDimitry Andric clear(); 1880b57cec5SDimitry Andric Tables = std::move(O.Tables); 1890b57cec5SDimitry Andric O.Tables.clear(); 1900b57cec5SDimitry Andric PendingOverrides = std::move(O.PendingOverrides); 1910b57cec5SDimitry Andric return *this; 1920b57cec5SDimitry Andric } 1930b57cec5SDimitry Andric 1940b57cec5SDimitry Andric ~MultiOnDiskHashTable() { clear(); } 1950b57cec5SDimitry Andric 1960b57cec5SDimitry Andric /// Add the table \p Data loaded from file \p File. 1970b57cec5SDimitry Andric void add(file_type File, storage_type Data, Info InfoObj = Info()) { 1980b57cec5SDimitry Andric using namespace llvm::support; 1990b57cec5SDimitry Andric 2000b57cec5SDimitry Andric storage_type Ptr = Data; 2010b57cec5SDimitry Andric 202*5f757f3fSDimitry Andric uint32_t BucketOffset = 203*5f757f3fSDimitry Andric endian::readNext<uint32_t, llvm::endianness::little, unaligned>(Ptr); 2040b57cec5SDimitry Andric 2050b57cec5SDimitry Andric // Read the list of overridden files. 206*5f757f3fSDimitry Andric uint32_t NumFiles = 207*5f757f3fSDimitry Andric endian::readNext<uint32_t, llvm::endianness::little, unaligned>(Ptr); 2080b57cec5SDimitry Andric // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make 2090b57cec5SDimitry Andric // an additional copy. 2100b57cec5SDimitry Andric llvm::SmallVector<file_type, 16> OverriddenFiles; 2110b57cec5SDimitry Andric OverriddenFiles.reserve(NumFiles); 2120b57cec5SDimitry Andric for (/**/; NumFiles != 0; --NumFiles) 2130b57cec5SDimitry Andric OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr)); 2140b57cec5SDimitry Andric PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(), 2150b57cec5SDimitry Andric OverriddenFiles.end()); 2160b57cec5SDimitry Andric 2170b57cec5SDimitry Andric // Read the OnDiskChainedHashTable header. 2180b57cec5SDimitry Andric storage_type Buckets = Data + BucketOffset; 2190b57cec5SDimitry Andric auto NumBucketsAndEntries = 2200b57cec5SDimitry Andric OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets); 2210b57cec5SDimitry Andric 2220b57cec5SDimitry Andric // Register the table. 2230b57cec5SDimitry Andric Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first, 2240b57cec5SDimitry Andric NumBucketsAndEntries.second, 2250b57cec5SDimitry Andric Buckets, Ptr, Data, std::move(InfoObj)); 2260b57cec5SDimitry Andric Tables.push_back(NewTable.getOpaqueValue()); 2270b57cec5SDimitry Andric } 2280b57cec5SDimitry Andric 2290b57cec5SDimitry Andric /// Find and read the lookup results for \p EKey. 2300b57cec5SDimitry Andric data_type find(const external_key_type &EKey) { 2310b57cec5SDimitry Andric data_type Result; 2320b57cec5SDimitry Andric 2330b57cec5SDimitry Andric if (!PendingOverrides.empty()) 2340b57cec5SDimitry Andric removeOverriddenTables(); 2350b57cec5SDimitry Andric 2360b57cec5SDimitry Andric if (Tables.size() > static_cast<unsigned>(Info::MaxTables)) 2370b57cec5SDimitry Andric condense(); 2380b57cec5SDimitry Andric 2390b57cec5SDimitry Andric internal_key_type Key = Info::GetInternalKey(EKey); 2400b57cec5SDimitry Andric auto KeyHash = Info::ComputeHash(Key); 2410b57cec5SDimitry Andric 2420b57cec5SDimitry Andric if (MergedTable *M = getMergedTable()) { 2430b57cec5SDimitry Andric auto It = M->Data.find(Key); 2440b57cec5SDimitry Andric if (It != M->Data.end()) 2450b57cec5SDimitry Andric Result = It->second; 2460b57cec5SDimitry Andric } 2470b57cec5SDimitry Andric 2480b57cec5SDimitry Andric data_type_builder ResultBuilder(Result); 2490b57cec5SDimitry Andric 2500b57cec5SDimitry Andric for (auto *ODT : tables()) { 2510b57cec5SDimitry Andric auto &HT = ODT->Table; 2520b57cec5SDimitry Andric auto It = HT.find_hashed(Key, KeyHash); 2530b57cec5SDimitry Andric if (It != HT.end()) 2540b57cec5SDimitry Andric HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(), 2550b57cec5SDimitry Andric ResultBuilder); 2560b57cec5SDimitry Andric } 2570b57cec5SDimitry Andric 2580b57cec5SDimitry Andric return Result; 2590b57cec5SDimitry Andric } 2600b57cec5SDimitry Andric 2610b57cec5SDimitry Andric /// Read all the lookup results into a single value. This only makes 2620b57cec5SDimitry Andric /// sense if merging values across keys is meaningful. 2630b57cec5SDimitry Andric data_type findAll() { 2640b57cec5SDimitry Andric data_type Result; 2650b57cec5SDimitry Andric data_type_builder ResultBuilder(Result); 2660b57cec5SDimitry Andric 2670b57cec5SDimitry Andric if (!PendingOverrides.empty()) 2680b57cec5SDimitry Andric removeOverriddenTables(); 2690b57cec5SDimitry Andric 2700b57cec5SDimitry Andric if (MergedTable *M = getMergedTable()) { 2710b57cec5SDimitry Andric for (auto &KV : M->Data) 2720b57cec5SDimitry Andric Info::MergeDataInto(KV.second, ResultBuilder); 2730b57cec5SDimitry Andric } 2740b57cec5SDimitry Andric 2750b57cec5SDimitry Andric for (auto *ODT : tables()) { 2760b57cec5SDimitry Andric auto &HT = ODT->Table; 2770b57cec5SDimitry Andric Info &InfoObj = HT.getInfoObj(); 2780b57cec5SDimitry Andric for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { 2790b57cec5SDimitry Andric auto *LocalPtr = I.getItem(); 2800b57cec5SDimitry Andric 2810b57cec5SDimitry Andric // FIXME: Don't rely on the OnDiskHashTable format here. 2820b57cec5SDimitry Andric auto L = InfoObj.ReadKeyDataLength(LocalPtr); 2830b57cec5SDimitry Andric const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); 2840b57cec5SDimitry Andric InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder); 2850b57cec5SDimitry Andric } 2860b57cec5SDimitry Andric } 2870b57cec5SDimitry Andric 2880b57cec5SDimitry Andric return Result; 2890b57cec5SDimitry Andric } 2900b57cec5SDimitry Andric }; 2910b57cec5SDimitry Andric 2920b57cec5SDimitry Andric /// Writer for the on-disk hash table. 2930b57cec5SDimitry Andric template<typename ReaderInfo, typename WriterInfo> 2940b57cec5SDimitry Andric class MultiOnDiskHashTableGenerator { 2950b57cec5SDimitry Andric using BaseTable = MultiOnDiskHashTable<ReaderInfo>; 2960b57cec5SDimitry Andric using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>; 2970b57cec5SDimitry Andric 2980b57cec5SDimitry Andric Generator Gen; 2990b57cec5SDimitry Andric 3000b57cec5SDimitry Andric public: 3010b57cec5SDimitry Andric MultiOnDiskHashTableGenerator() : Gen() {} 3020b57cec5SDimitry Andric 3030b57cec5SDimitry Andric void insert(typename WriterInfo::key_type_ref Key, 3040b57cec5SDimitry Andric typename WriterInfo::data_type_ref Data, WriterInfo &Info) { 3050b57cec5SDimitry Andric Gen.insert(Key, Data, Info); 3060b57cec5SDimitry Andric } 3070b57cec5SDimitry Andric 3080b57cec5SDimitry Andric void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info, 3090b57cec5SDimitry Andric const BaseTable *Base) { 3100b57cec5SDimitry Andric using namespace llvm::support; 3110b57cec5SDimitry Andric 3120b57cec5SDimitry Andric llvm::raw_svector_ostream OutStream(Out); 3130b57cec5SDimitry Andric 3140b57cec5SDimitry Andric // Write our header information. 3150b57cec5SDimitry Andric { 316*5f757f3fSDimitry Andric endian::Writer Writer(OutStream, llvm::endianness::little); 3170b57cec5SDimitry Andric 3180b57cec5SDimitry Andric // Reserve four bytes for the bucket offset. 3190b57cec5SDimitry Andric Writer.write<uint32_t>(0); 3200b57cec5SDimitry Andric 3210b57cec5SDimitry Andric if (auto *Merged = Base ? Base->getMergedTable() : nullptr) { 3220b57cec5SDimitry Andric // Write list of overridden files. 3230b57cec5SDimitry Andric Writer.write<uint32_t>(Merged->Files.size()); 3240b57cec5SDimitry Andric for (const auto &F : Merged->Files) 3250b57cec5SDimitry Andric Info.EmitFileRef(OutStream, F); 3260b57cec5SDimitry Andric 3270b57cec5SDimitry Andric // Add all merged entries from Base to the generator. 3280b57cec5SDimitry Andric for (auto &KV : Merged->Data) { 3290b57cec5SDimitry Andric if (!Gen.contains(KV.first, Info)) 3300b57cec5SDimitry Andric Gen.insert(KV.first, Info.ImportData(KV.second), Info); 3310b57cec5SDimitry Andric } 3320b57cec5SDimitry Andric } else { 3330b57cec5SDimitry Andric Writer.write<uint32_t>(0); 3340b57cec5SDimitry Andric } 3350b57cec5SDimitry Andric } 3360b57cec5SDimitry Andric 3370b57cec5SDimitry Andric // Write the table itself. 3380b57cec5SDimitry Andric uint32_t BucketOffset = Gen.Emit(OutStream, Info); 3390b57cec5SDimitry Andric 3400b57cec5SDimitry Andric // Replace the first four bytes with the bucket offset. 3410b57cec5SDimitry Andric endian::write32le(Out.data(), BucketOffset); 3420b57cec5SDimitry Andric } 3430b57cec5SDimitry Andric }; 3440b57cec5SDimitry Andric 3450b57cec5SDimitry Andric } // namespace serialization 3460b57cec5SDimitry Andric } // namespace clang 3470b57cec5SDimitry Andric 3480b57cec5SDimitry Andric #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 349