// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "sparse-cache.h"

#include <linux/cache.h>
#include <linux/delay.h>
#include <linux/dm-bufio.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

#include "chapter-index.h"
#include "config.h"
#include "index.h"

/*
 * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a
 * specific virtual chapter is implemented as a linear search. The cache replacement policy is
 * least-recently-used (LRU). Again, the small size of the cache allows the LRU order to be
 * maintained by shifting entries in an array list.
 *
 * Changing the contents of the cache requires the coordinated participation of all zone threads
 * via the careful use of barrier messages sent to all the index zones by the triage queue worker
 * thread. The critical invariant for coordination is that the cache membership must not change
 * between updates, so that all calls to uds_sparse_cache_contains() from the zone threads must
 * receive the same results for every virtual chapter number. To ensure that critical invariant,
 * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that
 * chapter because it has had too many cache misses" are represented separately from the cache
 * membership information (the virtual chapter number).
 *
 * As a result of this invariant, we have the guarantee that every zone thread will call
 * uds_update_sparse_cache() once and exactly once to request a chapter that is not in the cache,
 * and the serialization of the barrier requests from the triage queue ensures they will all
 * request the same chapter number. This means the only synchronization we need can be provided by
 * a pair of thread barriers used only in the uds_update_sparse_cache() call, providing a critical
 * section where a single zone thread can drive the cache update while all the other zone threads
 * are known to be blocked, waiting in the second barrier. Outside that critical section, all the
 * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an
 * exclusive lock. No other threads may access or modify the cache entries.
 *
 * Chapter statistics must only be modified by a single thread, which is also the zone zero thread.
 * All fields that might be frequently updated by that thread are kept in separate cache-aligned
 * structures so they will not cause cache contention via "false sharing" with the fields that are
 * frequently accessed by all of the zone threads.
 *
 * The LRU order is managed independently by each zone thread, and each zone uses its own list for
 * searching and cache membership queries. The zone zero list is used to decide which chapter to
 * evict when the cache is updated, and its search list is copied to the other threads at that
 * time.
 *
 * The virtual chapter number field of the cache entry is the single field indicating whether a
 * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or
 * undefined chapter number. When present in the virtual chapter number field of a
 * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of
 * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any
 * cache entry that is not marked as dead is fully defined and a member of the cache, and
 * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears
 * in any of the cache entries.
 *
 * A chapter index that is a member of the cache may be excluded from searches between calls to
 * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the
 * volume, its virtual chapter number will be less than the oldest virtual chapter number. Since
 * that chapter is no longer part of the volume, there's no point in continuing to search that
 * chapter index. Once invalidated, that virtual chapter will still be considered a member of the
 * cache, but it will no longer be searched for matching names.
 *
 * The second mechanism is a heuristic based on keeping track of the number of consecutive search
 * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will
 * be set to true, causing the chapter to be skipped when searching the entire cache, but still
 * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will
 * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry.
 * Again, regardless of the state of the skip_search flag, the virtual chapter must still be
 * considered a member of the cache for uds_sparse_cache_contains().
 */

#define SKIP_SEARCH_THRESHOLD 20000
#define ZONE_ZERO 0

/*
 * These counters are essentially fields of the struct cached_chapter_index, but are segregated
 * into this structure because they are frequently modified. They are grouped and aligned to keep
 * them on different cache lines from the chapter fields that are accessed far more often than they
 * are updated.
 */
struct __aligned(L1_CACHE_BYTES) cached_index_counters {
	u64 consecutive_misses;
};

struct __aligned(L1_CACHE_BYTES) cached_chapter_index {
	/*
	 * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache
	 * entry is unused. This field must only be modified in the critical section in
	 * uds_update_sparse_cache().
	 */
	u64 virtual_chapter;

	u32 index_pages_count;

	/*
	 * These pointers are immutable during the life of the cache. The contents of the arrays
	 * change when the cache entry is replaced.
	 */
	struct delta_index_page *index_pages;
	struct dm_buffer **page_buffers;

	/*
	 * If set, skip the chapter when searching the entire cache. This flag is just a
	 * performance optimization. This flag is mutable between cache updates, but it rarely
	 * changes and is frequently accessed, so it groups with the immutable fields.
	 */
	bool skip_search;

	/*
	 * The cache-aligned counters change often and are placed at the end of the structure to
	 * prevent false sharing with the more stable fields above.
	 */
	struct cached_index_counters counters;
};

/*
 * A search_list represents an ordering of the sparse chapter index cache entry array, from most
 * recently accessed to least recently accessed, which is the order in which the indexes should be
 * searched and the reverse order in which they should be evicted from the cache.
 *
 * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even
 * iterate over them to search, and ensuring that dead entries are replaced before any live entries
 * are evicted.
 *
 * The search list is instantiated for each zone thread, avoiding any need for synchronization. The
 * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between
 * zone threads.
 */
struct search_list {
	u8 capacity;
	u8 first_dead_entry;
	struct cached_chapter_index *entries[];
};

struct threads_barrier {
	/* Lock for this barrier object */
	struct semaphore lock;
	/* Semaphore for threads waiting at this barrier */
	struct semaphore wait;
	/* Number of threads which have arrived */
	int arrived;
	/* Total number of threads using this barrier */
	int thread_count;
};

struct sparse_cache {
	const struct index_geometry *geometry;
	unsigned int capacity;
	unsigned int zone_count;

	unsigned int skip_threshold;
	struct search_list *search_lists[MAX_ZONES];
	struct cached_chapter_index **scratch_entries;

	struct threads_barrier begin_update_barrier;
	struct threads_barrier end_update_barrier;

	struct cached_chapter_index chapters[];
};

static void initialize_threads_barrier(struct threads_barrier *barrier,
				       unsigned int thread_count)
{
	sema_init(&barrier->lock, 1);
	barrier->arrived = 0;
	barrier->thread_count = thread_count;
	sema_init(&barrier->wait, 0);
}

static inline void __down(struct semaphore *semaphore)
{
	/*
	 * Do not use down(semaphore). Instead use down_interruptible so that
	 * we do not get 120 second stall messages in kern.log.
	 */
	while (down_interruptible(semaphore) != 0) {
		/*
		 * If we're called from a user-mode process (e.g., "dmsetup
		 * remove") while waiting for an operation that may take a
		 * while (e.g., UDS index save), and a signal is sent (SIGINT,
		 * SIGUSR2), then down_interruptible will not block. If that
		 * happens, sleep briefly to avoid keeping the CPU locked up in
		 * this loop. We could just call cond_resched, but then we'd
		 * still keep consuming CPU time slices and swamp other threads
		 * trying to do computational work.
		 */
		fsleep(1000);
	}
}

static void enter_threads_barrier(struct threads_barrier *barrier)
{
	__down(&barrier->lock);
	if (++barrier->arrived == barrier->thread_count) {
		/* last thread */
		int i;

		for (i = 1; i < barrier->thread_count; i++)
			up(&barrier->wait);

		barrier->arrived = 0;
		up(&barrier->lock);
	} else {
		up(&barrier->lock);
		__down(&barrier->wait);
	}
}
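
/*
 * Usage sketch for the barrier helpers above (illustrative only; the real and
 * only users are the uds_update_sparse_cache() barriers below). Every
 * participating thread makes the same call, and none of them returns until
 * all thread_count threads have arrived; the last arrival releases the
 * waiters and resets the count, so the barrier can be reused for the next
 * update:
 *
 *	initialize_threads_barrier(&barrier, thread_count);
 *	...
 *	enter_threads_barrier(&barrier);	(called once by each thread)
 */
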
static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter,
							 const struct index_geometry *geometry)
{
	int result;

	chapter->virtual_chapter = NO_CHAPTER;
	chapter->index_pages_count = geometry->index_pages_per_chapter;

	result = vdo_allocate(chapter->index_pages_count, struct delta_index_page,
			      __func__, &chapter->index_pages);
	if (result != VDO_SUCCESS)
		return result;

	return vdo_allocate(chapter->index_pages_count, struct dm_buffer *,
			    "sparse index volume pages", &chapter->page_buffers);
}

static int __must_check make_search_list(struct sparse_cache *cache,
					 struct search_list **list_ptr)
{
	struct search_list *list;
	unsigned int bytes;
	u8 i;
	int result;

	bytes = (sizeof(struct search_list) +
		 (cache->capacity * sizeof(struct cached_chapter_index *)));
	result = vdo_allocate_cache_aligned(bytes, "search list", &list);
	if (result != VDO_SUCCESS)
		return result;

	list->capacity = cache->capacity;
	list->first_dead_entry = 0;

	for (i = 0; i < list->capacity; i++)
		list->entries[i] = &cache->chapters[i];

	*list_ptr = list;
	return UDS_SUCCESS;
}

int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity,
			  unsigned int zone_count, struct sparse_cache **cache_ptr)
{
	int result;
	unsigned int i;
	struct sparse_cache *cache;
	unsigned int bytes;

	bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index)));
	result = vdo_allocate_cache_aligned(bytes, "sparse cache", &cache);
	if (result != VDO_SUCCESS)
		return result;

	cache->geometry = geometry;
	cache->capacity = capacity;
	cache->zone_count = zone_count;

	/*
	 * Scale down the skip threshold since the cache only counts cache misses in zone zero, but
	 * requests are being handled in all zones.
	 */
	cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count);

	initialize_threads_barrier(&cache->begin_update_barrier, zone_count);
	initialize_threads_barrier(&cache->end_update_barrier, zone_count);

	for (i = 0; i < capacity; i++) {
		result = initialize_cached_chapter_index(&cache->chapters[i], geometry);
		if (result != UDS_SUCCESS)
			goto out;
	}

	for (i = 0; i < zone_count; i++) {
		result = make_search_list(cache, &cache->search_lists[i]);
		if (result != UDS_SUCCESS)
			goto out;
	}

	/* purge_search_list() needs some temporary lists for sorting. */
	result = vdo_allocate(capacity * 2, struct cached_chapter_index *,
			      "scratch entries", &cache->scratch_entries);
	if (result != VDO_SUCCESS)
		goto out;

	*cache_ptr = cache;
	return UDS_SUCCESS;
out:
	uds_free_sparse_cache(cache);
	return result;
}
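
/*
 * Construction sketch (illustrative only; the real cache is created elsewhere
 * and reached through index->volume->sparse_cache, and "capacity" and
 * "zone_count" here stand in for whatever the configuration supplies):
 *
 *	struct sparse_cache *cache;
 *	int result;
 *
 *	result = uds_make_sparse_cache(geometry, capacity, zone_count, &cache);
 *	if (result != UDS_SUCCESS)
 *		return result;
 *	...
 *	uds_free_sparse_cache(cache);
 */
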
static inline void set_skip_search(struct cached_chapter_index *chapter,
				   bool skip_search)
{
	/* Check before setting to reduce cache line contention. */
	if (READ_ONCE(chapter->skip_search) != skip_search)
		WRITE_ONCE(chapter->skip_search, skip_search);
}

static void score_search_hit(struct cached_chapter_index *chapter)
{
	chapter->counters.consecutive_misses = 0;
	set_skip_search(chapter, false);
}

static void score_search_miss(struct sparse_cache *cache,
			      struct cached_chapter_index *chapter)
{
	chapter->counters.consecutive_misses++;
	if (chapter->counters.consecutive_misses > cache->skip_threshold)
		set_skip_search(chapter, true);
}

static void release_cached_chapter_index(struct cached_chapter_index *chapter)
{
	unsigned int i;

	chapter->virtual_chapter = NO_CHAPTER;
	if (chapter->page_buffers == NULL)
		return;

	for (i = 0; i < chapter->index_pages_count; i++) {
		if (chapter->page_buffers[i] != NULL)
			dm_bufio_release(vdo_forget(chapter->page_buffers[i]));
	}
}

void uds_free_sparse_cache(struct sparse_cache *cache)
{
	unsigned int i;

	if (cache == NULL)
		return;

	vdo_free(cache->scratch_entries);

	for (i = 0; i < cache->zone_count; i++)
		vdo_free(cache->search_lists[i]);

	for (i = 0; i < cache->capacity; i++) {
		release_cached_chapter_index(&cache->chapters[i]);
		vdo_free(cache->chapters[i].index_pages);
		vdo_free(cache->chapters[i].page_buffers);
	}

	vdo_free(cache);
}

/*
 * Take the indicated element of the search list and move it to the start, pushing the pointers
 * previously before it back down the list.
 */
static inline void set_newest_entry(struct search_list *search_list, u8 index)
{
	struct cached_chapter_index *newest;

	if (index > 0) {
		newest = search_list->entries[index];
		memmove(&search_list->entries[1], &search_list->entries[0],
			index * sizeof(struct cached_chapter_index *));
		search_list->entries[0] = newest;
	}

	/*
	 * This function may have moved a dead chapter to the front of the list for reuse, in which
	 * case the set of dead chapters becomes smaller.
	 */
	if (search_list->first_dead_entry <= index)
		search_list->first_dead_entry++;
}

bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
			       unsigned int zone_number)
{
	struct search_list *search_list;
	struct cached_chapter_index *chapter;
	u8 i;

	/*
	 * The correctness of the barriers depends on the invariant that between calls to
	 * uds_update_sparse_cache(), the answers this function returns must never vary: the result
	 * for a given chapter must be identical across zones. That invariant must be maintained
	 * even if the chapter falls off the end of the volume, or if searching it is disabled
	 * because of too many search misses.
	 */
	search_list = cache->search_lists[zone_number];
	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];

		if (virtual_chapter == chapter->virtual_chapter) {
			if (zone_number == ZONE_ZERO)
				score_search_hit(chapter);

			set_newest_entry(search_list, i);
			return true;
		}
	}

	return false;
}

/*
 * Re-sort cache entries into three sets (active, skippable, and dead) while maintaining the LRU
 * ordering that already existed. This operation must only be called during the critical section in
 * uds_update_sparse_cache().
 */
static void purge_search_list(struct search_list *search_list,
			      struct sparse_cache *cache, u64 oldest_virtual_chapter)
{
	struct cached_chapter_index **entries;
	struct cached_chapter_index **skipped;
	struct cached_chapter_index **dead;
	struct cached_chapter_index *chapter;
	unsigned int next_alive = 0;
	unsigned int next_skipped = 0;
	unsigned int next_dead = 0;
	unsigned int i;

	entries = &search_list->entries[0];
	skipped = &cache->scratch_entries[0];
	dead = &cache->scratch_entries[search_list->capacity];

	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];
		if ((chapter->virtual_chapter < oldest_virtual_chapter) ||
		    (chapter->virtual_chapter == NO_CHAPTER))
			dead[next_dead++] = chapter;
		else if (chapter->skip_search)
			skipped[next_skipped++] = chapter;
		else
			entries[next_alive++] = chapter;
	}

	memcpy(&entries[next_alive], skipped,
	       next_skipped * sizeof(struct cached_chapter_index *));
	memcpy(&entries[next_alive + next_skipped], dead,
	       next_dead * sizeof(struct cached_chapter_index *));
	search_list->first_dead_entry = next_alive + next_skipped;
}

static int __must_check cache_chapter_index(struct cached_chapter_index *chapter,
					    u64 virtual_chapter,
					    const struct volume *volume)
{
	int result;

	release_cached_chapter_index(chapter);

	result = uds_read_chapter_index_from_volume(volume, virtual_chapter,
						    chapter->page_buffers,
						    chapter->index_pages);
	if (result != UDS_SUCCESS)
		return result;

	chapter->counters.consecutive_misses = 0;
	chapter->virtual_chapter = virtual_chapter;
	chapter->skip_search = false;

	return UDS_SUCCESS;
}

static inline void copy_search_list(const struct search_list *source,
				    struct search_list *target)
{
	*target = *source;
	memcpy(target->entries, source->entries,
	       source->capacity * sizeof(struct cached_chapter_index *));
}

/*
 * Update the sparse cache to contain a chapter index. This function must be called by all the zone
 * threads with the same chapter number to correctly enter the thread barriers used to synchronize
 * the cache updates.
 */
int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter)
{
	int result = UDS_SUCCESS;
	const struct uds_index *index = zone->index;
	struct sparse_cache *cache = index->volume->sparse_cache;

	if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id))
		return UDS_SUCCESS;

	/*
	 * Wait for every zone thread to reach its corresponding barrier request and invoke this
	 * function before starting to modify the cache.
	 */
	enter_threads_barrier(&cache->begin_update_barrier);

	/*
	 * This is the start of the critical section: the zone zero thread is captain, effectively
	 * holding an exclusive lock on the sparse cache. All the other zone threads must do
	 * nothing between the two barriers. They will wait at the end_update_barrier again for the
	 * captain to finish the update.
	 */

	if (zone->id == ZONE_ZERO) {
		unsigned int z;
		struct search_list *list = cache->search_lists[ZONE_ZERO];

		purge_search_list(list, cache, zone->oldest_virtual_chapter);

		if (virtual_chapter >= index->oldest_virtual_chapter) {
			set_newest_entry(list, list->capacity - 1);
			result = cache_chapter_index(list->entries[0], virtual_chapter,
						     index->volume);
		}

		for (z = 1; z < cache->zone_count; z++)
			copy_search_list(list, cache->search_lists[z]);
	}

	/*
	 * This is the end of the critical section. All cache invariants must have been restored.
	 */
	enter_threads_barrier(&cache->end_update_barrier);
	return result;
}
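
/*
 * Caller-side sketch (illustrative; the actual dispatch lives in the index
 * zone code, which is not shown here): when the triage queue worker broadcasts
 * a barrier message naming a chapter, every zone thread makes the identical
 * call, which is what allows the two barriers above to line up. Zone zero does
 * the work inside the critical section; the other zones simply rendezvous:
 *
 *	result = uds_update_sparse_cache(zone, virtual_chapter);
 *	if (result != UDS_SUCCESS)
 *		return result;
 */
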
void uds_invalidate_sparse_cache(struct sparse_cache *cache)
{
	unsigned int i;

	for (i = 0; i < cache->capacity; i++)
		release_cached_chapter_index(&cache->chapters[i]);
}

static inline bool should_skip_chapter(struct cached_chapter_index *chapter,
				       u64 oldest_chapter, u64 requested_chapter)
{
	if ((chapter->virtual_chapter == NO_CHAPTER) ||
	    (chapter->virtual_chapter < oldest_chapter))
		return true;

	if (requested_chapter != NO_CHAPTER)
		return requested_chapter != chapter->virtual_chapter;
	else
		return READ_ONCE(chapter->skip_search);
}

static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter,
						     const struct index_geometry *geometry,
						     const struct index_page_map *index_page_map,
						     const struct uds_record_name *name,
						     u16 *record_page_ptr)
{
	u32 physical_chapter =
		uds_map_to_physical_chapter(geometry, chapter->virtual_chapter);
	u32 index_page_number =
		uds_find_index_page_number(index_page_map, name, physical_chapter);
	struct delta_index_page *index_page =
		&chapter->index_pages[index_page_number];

	return uds_search_chapter_index_page(index_page, geometry, name,
					     record_page_ptr);
}

int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name,
			    u64 *virtual_chapter_ptr, u16 *record_page_ptr)
{
	int result;
	struct volume *volume = zone->index->volume;
	struct sparse_cache *cache = volume->sparse_cache;
	struct cached_chapter_index *chapter;
	struct search_list *search_list;
	u8 i;
	/* Search the entire cache unless a specific chapter was requested. */
	bool search_one = (*virtual_chapter_ptr != NO_CHAPTER);

	*record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
	search_list = cache->search_lists[zone->id];
	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];

		if (should_skip_chapter(chapter, zone->oldest_virtual_chapter,
					*virtual_chapter_ptr))
			continue;

		result = search_cached_chapter_index(chapter, cache->geometry,
						     volume->index_page_map, name,
						     record_page_ptr);
		if (result != UDS_SUCCESS)
			return result;

		if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) {
			/*
			 * In theory, this might be a false match while a true match exists in
			 * another chapter, but that's a very rare case and not worth the extra
			 * search complexity.
			 */
			set_newest_entry(search_list, i);
			if (zone->id == ZONE_ZERO)
				score_search_hit(chapter);

			*virtual_chapter_ptr = chapter->virtual_chapter;
			return UDS_SUCCESS;
		}

		if (zone->id == ZONE_ZERO)
			score_search_miss(cache, chapter);

		if (search_one)
			break;
	}

	return UDS_SUCCESS;
}