//===-- Resizable Monotonic HashTable ---------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H
#define LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H

#include "hdr/types/ENTRY.h"
#include "src/__support/CPP/bit.h" // bit_ceil
#include "src/__support/CPP/new.h"
#include "src/__support/HashTable/bitmask.h"
#include "src/__support/hash.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h"
#include "src/__support/memory_size.h"
#include "src/string/memory_utils/inline_strcmp.h"
#include "src/string/string_utils.h"
#include <stddef.h>
#include <stdint.h>

namespace LIBC_NAMESPACE_DECL {
namespace internal {

LIBC_INLINE uint8_t secondary_hash(uint64_t hash) {
  // top 7 bits of the hash.
  return static_cast<uint8_t>(hash >> 57);
}

// Probe sequence based on triangular numbers, which is guaranteed (since our
// table size is a power of two) to visit every group of elements exactly once.
//
// A triangular probe has us jump by 1 more group every time. So first we
// jump by 1 group (meaning we just continue our linear scan), then 2 groups
// (skipping over 1 group), then 3 groups (skipping over 2 groups), and so on.
//
// If we set sizeof(Group) to be one unit:
//   T[k] = 1 + 2 + ... + k = k * (k + 1) / 2
// It is provable that T[k] mod 2^m generates a permutation of
//   0, 1, 2, 3, ..., 2^m - 2, 2^m - 1
// A detailed proof is available at:
// https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
struct ProbeSequence {
  size_t position;
  size_t stride;
  size_t entries_mask;

  LIBC_INLINE size_t next() {
    position += stride;
    position &= entries_mask;
    stride += sizeof(Group);
    return position;
  }
};

// The number of entries is at least the group width: we do not
// need to do the fixup when we set the control bytes.
// The number of entries is at least 8: we don't have to worry
// about special sizes when checking the fullness of the table.
LIBC_INLINE size_t capacity_to_entries(size_t cap) {
  if (8 >= sizeof(Group) && cap < 8)
    return 8;
  if (16 >= sizeof(Group) && cap < 15)
    return 16;
  if (cap < sizeof(Group))
    cap = sizeof(Group);
  // overflow is always checked in allocate()
  return cpp::bit_ceil(cap * 8 / 7);
}

// The heap memory layout for an N-bucket HashTable is as follows:
//
//   ==============================
//   |         N * Entry          |
//   ============================== <- align boundary
//   |           Header           |
//   ============================== <- align boundary (for fast resize)
//   | (N + sizeof(Group)) * Byte |
//   ==============================
//
// The trailing group part is to make sure we can always load
// a whole group of control bytes.

struct HashTable {
  HashState state;
  size_t entries_mask;    // number of buckets - 1
  size_t available_slots; // less than capacity

private:
  // The number of entries (buckets) in the table.
  LIBC_INLINE size_t num_of_entries() const { return entries_mask + 1; }

  // How many entries we can store in the table before resizing.
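  // Worked example (illustrative numbers only): with entries_mask == 15 there
  // are 16 buckets, so full_capacity() == 16 / 8 * 7 == 14. available_slots
  // starts at 14, and the table grows once it drops to 0.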
  LIBC_INLINE size_t full_capacity() const { return num_of_entries() / 8 * 7; }

  // The alignment of the whole memory area is the maximum of the alignment
  // among the following types:
  // - HashTable
  // - ENTRY
  // - Group
  LIBC_INLINE constexpr static size_t table_alignment() {
    size_t left_align = alignof(HashTable) > alignof(ENTRY) ? alignof(HashTable)
                                                            : alignof(ENTRY);
    return left_align > alignof(Group) ? left_align : alignof(Group);
  }

  LIBC_INLINE bool is_full() const { return available_slots == 0; }

  LIBC_INLINE size_t offset_from_entries() const {
    size_t entries_size = num_of_entries() * sizeof(ENTRY);
    return entries_size +
           SafeMemSize::offset_to(entries_size, table_alignment());
  }

  LIBC_INLINE constexpr static size_t offset_to_groups() {
    size_t header_size = sizeof(HashTable);
    return header_size + SafeMemSize::offset_to(header_size, table_alignment());
  }

  LIBC_INLINE ENTRY &entry(size_t i) {
    return reinterpret_cast<ENTRY *>(this)[-i - 1];
  }

  LIBC_INLINE const ENTRY &entry(size_t i) const {
    return reinterpret_cast<const ENTRY *>(this)[-i - 1];
  }

  LIBC_INLINE uint8_t &control(size_t i) {
    uint8_t *ptr = reinterpret_cast<uint8_t *>(this) + offset_to_groups();
    return ptr[i];
  }

  LIBC_INLINE const uint8_t &control(size_t i) const {
    const uint8_t *ptr =
        reinterpret_cast<const uint8_t *>(this) + offset_to_groups();
    return ptr[i];
  }

  // We duplicate a group of control bytes to the end. Thus, it is possible
  // that we need to set two control bytes at the same time.
  LIBC_INLINE void set_ctrl(size_t index, uint8_t value) {
    size_t index2 = ((index - sizeof(Group)) & entries_mask) + sizeof(Group);
    control(index) = value;
    control(index2) = value;
  }

  LIBC_INLINE size_t find(const char *key, uint64_t primary) {
    uint8_t secondary = secondary_hash(primary);
    ProbeSequence sequence{static_cast<size_t>(primary), 0, entries_mask};
    while (true) {
      size_t pos = sequence.next();
      Group ctrls = Group::load(&control(pos));
      IteratableBitMask masks = ctrls.match_byte(secondary);
      for (size_t i : masks) {
        size_t index = (pos + i) & entries_mask;
        ENTRY &entry = this->entry(index);
        auto comp = [](char l, char r) -> int { return l - r; };
        if (LIBC_LIKELY(entry.key != nullptr &&
                        inline_strcmp(entry.key, key, comp) == 0))
          return index;
      }
      BitMask available = ctrls.mask_available();
      // Since there is no deletion, the first time we find an available slot
      // it is also ready to be used as an insertion point. Therefore, we also
      // return the first available slot we find. If such an entry is empty,
      // its key will be nullptr.
      if (LIBC_LIKELY(available.any_bit_set())) {
        size_t index =
            (pos + available.lowest_set_bit_nonzero()) & entries_mask;
        return index;
      }
    }
  }

  LIBC_INLINE uint64_t oneshot_hash(const char *key) const {
    LIBC_NAMESPACE::internal::HashState hasher = state;
    hasher.update(key, internal::string_length(key));
    return hasher.finish();
  }

  // A fast insertion routine that checks neither whether the key already
  // exists nor whether the table is full. This is only to be used in grow(),
  // where we insert all existing entries into a new table; hence, the
  // requirements are naturally satisfied.
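  // Illustrative probe trace (assuming sizeof(Group) == 16 and 64 buckets):
  // starting from position p, the loop below visits the groups at offsets
  // 0, 16, 48, and 96 == 32 (mod 64) from p, i.e. each of the 4 groups
  // exactly once before any offset repeats.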
  LIBC_INLINE ENTRY *unsafe_insert(ENTRY item) {
    uint64_t primary = oneshot_hash(item.key);
    uint8_t secondary = secondary_hash(primary);
    ProbeSequence sequence{static_cast<size_t>(primary), 0, entries_mask};
    while (true) {
      size_t pos = sequence.next();
      Group ctrls = Group::load(&control(pos));
      BitMask available = ctrls.mask_available();
      if (available.any_bit_set()) {
        size_t index =
            (pos + available.lowest_set_bit_nonzero()) & entries_mask;
        set_ctrl(index, secondary);
        entry(index).key = item.key;
        entry(index).data = item.data;
        available_slots--;
        return &entry(index);
      }
    }
  }

  LIBC_INLINE HashTable *grow() const {
    size_t hint = full_capacity() + 1;
    HashState state = this->state;
    // migrate to a new random state
    state.update(&hint, sizeof(hint));
    HashTable *new_table = allocate(hint, state.finish());
    // It is safe to call unsafe_insert() because we know that:
    // - the new table has enough capacity to hold all the entries
    // - there is no duplicate key in the old table
    if (new_table != nullptr)
      for (ENTRY e : *this)
        new_table->unsafe_insert(e);
    return new_table;
  }

  LIBC_INLINE static ENTRY *insert(HashTable *&table, ENTRY item,
                                   uint64_t primary) {
    auto index = table->find(item.key, primary);
    auto slot = &table->entry(index);
    // SVr4 and POSIX.1-2001 specify that action is significant only for
    // unsuccessful searches, so that an ENTER should not do anything
    // for a successful search.
    if (slot->key != nullptr)
      return slot;

    // if the table is full, we try to grow the table
    if (table->is_full()) {
      HashTable *new_table = table->grow();
      // allocation failed, return nullptr to indicate failure
      if (new_table == nullptr)
        return nullptr;
      // resized successfully: clean up the old table and use the new one
      deallocate(table);
      table = new_table;
      // it is still valid to use the fast-path insertion:
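      // the search above already proved that the key is absent, and the
      // table was just grown, so unsafe_insert()'s preconditions hold.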
      return table->unsafe_insert(item);
    }

    table->set_ctrl(index, secondary_hash(primary));
    slot->key = item.key;
    slot->data = item.data;
    table->available_slots--;
    return slot;
  }

public:
  LIBC_INLINE static void deallocate(HashTable *table) {
    if (table) {
      void *ptr =
          reinterpret_cast<uint8_t *>(table) - table->offset_from_entries();
      operator delete(ptr, std::align_val_t{table_alignment()});
    }
  }

  LIBC_INLINE static HashTable *allocate(size_t capacity, uint64_t randomness) {
    // check if capacity_to_entries overflows MAX_MEM_SIZE
    if (capacity > size_t{1} << (8 * sizeof(size_t) - 1 - 3))
      return nullptr;
    SafeMemSize entries{capacity_to_entries(capacity)};
    SafeMemSize entries_size = entries * SafeMemSize{sizeof(ENTRY)};
    SafeMemSize align_boundary = entries_size.align_up(table_alignment());
    SafeMemSize ctrl_sizes = entries + SafeMemSize{sizeof(Group)};
    SafeMemSize header_size{offset_to_groups()};
    SafeMemSize total_size =
        (align_boundary + header_size + ctrl_sizes).align_up(table_alignment());
    if (!total_size.valid())
      return nullptr;

    AllocChecker ac;
    void *mem =
        operator new(total_size, std::align_val_t{table_alignment()}, ac);
    // return nullptr on allocation failure rather than a dangling pointer
    // computed from a null base.
    if (!ac)
      return nullptr;

    HashTable *table = reinterpret_cast<HashTable *>(
        static_cast<uint8_t *>(mem) + align_boundary);
    table->entries_mask = entries - 1u;
    table->available_slots = entries / 8 * 7;
    table->state = HashState{randomness};
    // 0x80 marks a control byte as empty.
    __builtin_memset(&table->control(0), 0x80, ctrl_sizes);
    __builtin_memset(mem, 0, table->offset_from_entries());
    return table;
  }

  struct FullTableIterator {
    size_t current_offset;
    size_t remaining;
    IteratableBitMask current_mask;
    const HashTable &table;

    // It is fine to use remaining to represent the iterator:
    // - this comparison only happens with the same table
    // - the hashtable will not be mutated during the iteration
    LIBC_INLINE bool operator==(const FullTableIterator &other) const {
      return remaining == other.remaining;
    }
    LIBC_INLINE bool operator!=(const FullTableIterator &other) const {
      return remaining != other.remaining;
    }

    LIBC_INLINE FullTableIterator &operator++() {
      this->ensure_valid_group();
      current_mask.remove_lowest_bit();
      remaining--;
      return *this;
    }
    LIBC_INLINE const ENTRY &operator*() {
      this->ensure_valid_group();
      return table.entry(
          (current_offset + current_mask.lowest_set_bit_nonzero()) &
          table.entries_mask);
    }

  private:
    LIBC_INLINE void ensure_valid_group() {
      while (!current_mask.any_bit_set()) {
        current_offset += sizeof(Group);
        // It is ensured that the load will only happen at aligned boundaries.
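        // (current_offset starts at 0 and advances only in sizeof(Group)
        // steps, and the control byte area itself begins at an alignment
        // boundary, so &table.control(current_offset) is Group-aligned.)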
        current_mask =
            Group::load_aligned(&table.control(current_offset)).occupied();
      }
    }
  };

  using value_type = ENTRY;
  using iterator = FullTableIterator;
  iterator begin() const {
    return {0, full_capacity() - available_slots,
            Group::load_aligned(&control(0)).occupied(), *this};
  }
  iterator end() const { return {0, 0, {BitMask{0}}, *this}; }

  LIBC_INLINE ENTRY *find(const char *key) {
    uint64_t primary = oneshot_hash(key);
    ENTRY &entry = this->entry(find(key, primary));
    if (entry.key == nullptr)
      return nullptr;
    return &entry;
  }

  LIBC_INLINE static ENTRY *insert(HashTable *&table, ENTRY item) {
    uint64_t primary = table->oneshot_hash(item.key);
    return insert(table, item, primary);
  }
};
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H