// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "sparse-cache.h"

#include <linux/cache.h>
#include <linux/delay.h>
#include <linux/dm-bufio.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

#include "chapter-index.h"
#include "config.h"
#include "index.h"

/*
 * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a
 * specific virtual chapter is implemented as a linear search. The cache replacement policy is
 * least-recently-used (LRU). Again, the small size of the cache allows the LRU order to be
 * maintained by shifting entries in an array list.
 *
 * Changing the contents of the cache requires the coordinated participation of all zone threads
 * via the careful use of barrier messages sent to all the index zones by the triage queue worker
 * thread. The critical invariant for coordination is that the cache membership must not change
 * between updates, so that all calls to uds_sparse_cache_contains() from the zone threads must
 * receive the same results for every virtual chapter number. To preserve that critical invariant,
 * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that
 * chapter because it has had too many cache misses" are represented separately from the cache
 * membership information (the virtual chapter number).
 *
 * As a result of this invariant, we have the guarantee that every zone thread will call
 * uds_update_sparse_cache() exactly once to request a chapter that is not in the cache, and the
 * serialization of the barrier requests from the triage queue ensures they will all request the
 * same chapter number. This means the only synchronization we need can be provided by a pair of
 * thread barriers used only in the uds_update_sparse_cache() call, providing a critical section
 * where a single zone thread can drive the cache update while all the other zone threads are
 * known to be blocked, waiting in the second barrier. Outside that critical section, all the
 * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an
 * exclusive lock. No other threads may access or modify the cache entries.
 *
 * Chapter statistics must only be modified by a single thread, which is also the zone zero thread.
 * All fields that might be frequently updated by that thread are kept in separate cache-aligned
 * structures so they will not cause cache contention via "false sharing" with the fields that are
 * frequently accessed by all of the zone threads.
 *
 * The LRU order is managed independently by each zone thread, and each zone uses its own list for
 * searching and cache membership queries. The zone zero list is used to decide which chapter to
 * evict when the cache is updated, and its search list is copied to the other threads at that
 * time.
 *
 * The virtual chapter number field of the cache entry is the single field indicating whether a
 * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or
 * undefined chapter number. When present in the virtual chapter number field of a
 * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of
 * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any
 * cache entry that is not marked as dead is fully defined and a member of the cache, and
 * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears
 * in any of the cache entries.
 *
 * A chapter index that is a member of the cache may be excluded from searches between calls to
 * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the
 * volume, its virtual chapter number will be less than the oldest virtual chapter number. Since
 * that chapter is no longer part of the volume, there's no point in continuing to search that
 * chapter index. Once invalidated, that virtual chapter will still be considered a member of the
 * cache, but it will no longer be searched for matching names.
 *
 * The second mechanism is a heuristic based on keeping track of the number of consecutive search
 * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will
 * be set to true, causing the chapter to be skipped when searching the entire cache, but still
 * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will
 * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry.
 * Again, regardless of the state of the skip_search flag, the virtual chapter must still be
 * considered a member of the cache for uds_sparse_cache_contains().
 */

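/*
 * A rough sketch of the update flow described above, as seen from a single zone thread; the
 * surrounding message-handling context is illustrative, only the call itself is defined here:
 *
 *	u64 virtual_chapter = barrier_message_chapter;	// the same value in every zone
 *	int result = uds_update_sparse_cache(zone, virtual_chapter);
 *
 * Zone zero performs the cache update inside the critical section while the other zone threads
 * wait in the end_update_barrier; every zone then resumes with an identical view of the cache.
 */
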
#define SKIP_SEARCH_THRESHOLD 20000
#define ZONE_ZERO 0

/*
 * These counters are essentially fields of the struct cached_chapter_index, but are segregated
 * into this structure because they are frequently modified. They are grouped and aligned to keep
 * them on different cache lines from the chapter fields that are accessed far more often than they
 * are updated.
 */
struct __aligned(L1_CACHE_BYTES) cached_index_counters {
	u64 consecutive_misses;
};

struct __aligned(L1_CACHE_BYTES) cached_chapter_index {
	/*
	 * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache
	 * entry is unused. This field must only be modified in the critical section in
	 * uds_update_sparse_cache().
	 */
	u64 virtual_chapter;

	u32 index_pages_count;

	/*
	 * These pointers are immutable during the life of the cache. The contents of the arrays
	 * change when the cache entry is replaced.
	 */
	struct delta_index_page *index_pages;
	struct dm_buffer **page_buffers;

	/*
	 * If set, skip the chapter when searching the entire cache. This flag is just a
	 * performance optimization. It is mutable between cache updates, but it rarely changes
	 * and is frequently accessed, so it is grouped with the immutable fields.
	 */
	bool skip_search;

	/*
	 * The cache-aligned counters change often and are placed at the end of the structure to
	 * prevent false sharing with the more stable fields above.
	 */
	struct cached_index_counters counters;
};

/*
 * A search_list represents an ordering of the sparse chapter index cache entry array, from most
 * recently accessed to least recently accessed, which is the order in which the indexes should be
 * searched and the reverse order in which they should be evicted from the cache.
 *
 * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even
 * iterate over them to search, and ensuring that dead entries are replaced before any live entries
 * are evicted.
 *
 * The search list is instantiated for each zone thread, avoiding any need for synchronization. The
 * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between
 * zone threads.
 */
struct search_list {
	u8 capacity;
	u8 first_dead_entry;
	struct cached_chapter_index *entries[];
};

struct threads_barrier {
	/* Lock for this barrier object */
	struct semaphore lock;
	/* Semaphore for threads waiting at this barrier */
	struct semaphore wait;
	/* Number of threads which have arrived */
	int arrived;
	/* Total number of threads using this barrier */
	int thread_count;
};

struct sparse_cache {
	const struct index_geometry *geometry;
	unsigned int capacity;
	unsigned int zone_count;

	unsigned int skip_threshold;
	struct search_list *search_lists[MAX_ZONES];
	struct cached_chapter_index **scratch_entries;

	struct threads_barrier begin_update_barrier;
	struct threads_barrier end_update_barrier;

	struct cached_chapter_index chapters[];
};

static void initialize_threads_barrier(struct threads_barrier *barrier,
				       unsigned int thread_count)
{
	sema_init(&barrier->lock, 1);
	barrier->arrived = 0;
	barrier->thread_count = thread_count;
	sema_init(&barrier->wait, 0);
}

static inline void __down(struct semaphore *semaphore)
{
	/*
	 * Do not use down(semaphore). Instead use down_interruptible so that
	 * we do not get 120 second stall messages in kern.log.
	 */
	while (down_interruptible(semaphore) != 0) {
		/*
		 * If we're called from a user-mode process (e.g., "dmsetup
		 * remove") while waiting for an operation that may take a
		 * while (e.g., UDS index save), and a signal is sent (SIGINT,
		 * SIGUSR2), then down_interruptible will not block. If that
		 * happens, sleep briefly to avoid keeping the CPU locked up in
		 * this loop. We could just call cond_resched, but then we'd
		 * still keep consuming CPU time slices and swamp other threads
		 * trying to do computational work.
		 */
		fsleep(1000);
	}
}

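/*
 * Wait at the barrier until all zone threads have arrived. The last thread to arrive releases
 * every waiting thread and resets the barrier for reuse.
 */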
static void enter_threads_barrier(struct threads_barrier *barrier)
{
	__down(&barrier->lock);
	if (++barrier->arrived == barrier->thread_count) {
		/* last thread */
		int i;

		for (i = 1; i < barrier->thread_count; i++)
			up(&barrier->wait);

		barrier->arrived = 0;
		up(&barrier->lock);
	} else {
		up(&barrier->lock);
		__down(&barrier->wait);
	}
}

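/* Allocate the page arrays for a cache entry and mark it dead (not holding any chapter). */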
static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter,
							 const struct index_geometry *geometry)
{
	int result;

	chapter->virtual_chapter = NO_CHAPTER;
	chapter->index_pages_count = geometry->index_pages_per_chapter;

	result = vdo_allocate(chapter->index_pages_count, struct delta_index_page,
			      __func__, &chapter->index_pages);
	if (result != VDO_SUCCESS)
		return result;

	return vdo_allocate(chapter->index_pages_count, struct dm_buffer *,
			    "sparse index volume pages", &chapter->page_buffers);
}

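/*
 * Allocate one zone's search list and point it at every cache entry. All entries start out dead
 * (first_dead_entry is zero), so nothing is searchable until the cache is populated.
 */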
static int __must_check make_search_list(struct sparse_cache *cache,
					 struct search_list **list_ptr)
{
	struct search_list *list;
	unsigned int bytes;
	u8 i;
	int result;

	bytes = (sizeof(struct search_list) +
		 (cache->capacity * sizeof(struct cached_chapter_index *)));
	result = vdo_allocate_cache_aligned(bytes, "search list", &list);
	if (result != VDO_SUCCESS)
		return result;

	list->capacity = cache->capacity;
	list->first_dead_entry = 0;

	for (i = 0; i < list->capacity; i++)
		list->entries[i] = &cache->chapters[i];

	*list_ptr = list;
	return UDS_SUCCESS;
}

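/*
 * Allocate and initialize the sparse cache: the chapter entries, one search list per zone, the
 * update barriers, and the scratch space used when purging the zone zero search list.
 */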
int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity,
			  unsigned int zone_count, struct sparse_cache **cache_ptr)
{
	int result;
	unsigned int i;
	struct sparse_cache *cache;
	unsigned int bytes;

	bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index)));
	result = vdo_allocate_cache_aligned(bytes, "sparse cache", &cache);
	if (result != VDO_SUCCESS)
		return result;

	cache->geometry = geometry;
	cache->capacity = capacity;
	cache->zone_count = zone_count;

	/*
	 * Scale down the skip threshold since the cache only counts cache misses in zone zero, but
	 * requests are being handled in all zones.
	 */
	cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count);

	initialize_threads_barrier(&cache->begin_update_barrier, zone_count);
	initialize_threads_barrier(&cache->end_update_barrier, zone_count);

	for (i = 0; i < capacity; i++) {
		result = initialize_cached_chapter_index(&cache->chapters[i], geometry);
		if (result != UDS_SUCCESS)
			goto out;
	}

	for (i = 0; i < zone_count; i++) {
		result = make_search_list(cache, &cache->search_lists[i]);
		if (result != UDS_SUCCESS)
			goto out;
	}

	/* purge_search_list() needs some temporary lists for sorting. */
	result = vdo_allocate(capacity * 2, struct cached_chapter_index *,
			      "scratch entries", &cache->scratch_entries);
	if (result != VDO_SUCCESS)
		goto out;

	*cache_ptr = cache;
	return UDS_SUCCESS;
out:
	uds_free_sparse_cache(cache);
	return result;
}

static inline void set_skip_search(struct cached_chapter_index *chapter,
				   bool skip_search)
{
	/* Check before setting to reduce cache line contention. */
	if (READ_ONCE(chapter->skip_search) != skip_search)
		WRITE_ONCE(chapter->skip_search, skip_search);
}

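/* Record a successful search of this chapter, re-enabling it for full-cache searches. */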
static void score_search_hit(struct cached_chapter_index *chapter)
{
	chapter->counters.consecutive_misses = 0;
	set_skip_search(chapter, false);
}

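/*
 * Record a failed search of this chapter. Too many consecutive misses will cause it to be
 * skipped when searching the entire cache.
 */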
static void score_search_miss(struct sparse_cache *cache,
			      struct cached_chapter_index *chapter)
{
	chapter->counters.consecutive_misses++;
	if (chapter->counters.consecutive_misses > cache->skip_threshold)
		set_skip_search(chapter, true);
}

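/* Mark a cache entry as dead and release any chapter index pages it holds. */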
static void release_cached_chapter_index(struct cached_chapter_index *chapter)
{
	unsigned int i;

	chapter->virtual_chapter = NO_CHAPTER;
	if (chapter->page_buffers == NULL)
		return;

	for (i = 0; i < chapter->index_pages_count; i++) {
		if (chapter->page_buffers[i] != NULL)
			dm_bufio_release(vdo_forget(chapter->page_buffers[i]));
	}
}

void uds_free_sparse_cache(struct sparse_cache *cache)
{
	unsigned int i;

	if (cache == NULL)
		return;

	vdo_free(cache->scratch_entries);

	for (i = 0; i < cache->zone_count; i++)
		vdo_free(cache->search_lists[i]);

	for (i = 0; i < cache->capacity; i++) {
		release_cached_chapter_index(&cache->chapters[i]);
		vdo_free(cache->chapters[i].index_pages);
		vdo_free(cache->chapters[i].page_buffers);
	}

	vdo_free(cache);
}

/*
 * Take the indicated element of the search list and move it to the start, pushing the pointers
 * previously before it back down the list.
 */
static inline void set_newest_entry(struct search_list *search_list, u8 index)
{
	struct cached_chapter_index *newest;

	if (index > 0) {
		newest = search_list->entries[index];
		memmove(&search_list->entries[1], &search_list->entries[0],
			index * sizeof(struct cached_chapter_index *));
		search_list->entries[0] = newest;
	}

	/*
	 * This function may have moved a dead chapter to the front of the list for reuse, in which
	 * case the set of dead chapters becomes smaller.
	 */
	if (search_list->first_dead_entry <= index)
		search_list->first_dead_entry++;
}

bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
			       unsigned int zone_number)
{
	struct search_list *search_list;
	struct cached_chapter_index *chapter;
	u8 i;

	/*
	 * The correctness of the barriers depends on the invariant that between calls to
	 * uds_update_sparse_cache(), the answers this function returns must never vary: the result
	 * for a given chapter must be identical across zones. That invariant must be maintained
	 * even if the chapter falls off the end of the volume, or if searching it is disabled
	 * because of too many search misses.
	 */
	search_list = cache->search_lists[zone_number];
	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];

		if (virtual_chapter == chapter->virtual_chapter) {
			if (zone_number == ZONE_ZERO)
				score_search_hit(chapter);

			set_newest_entry(search_list, i);
			return true;
		}
	}

	return false;
}

/*
 * Re-sort cache entries into three sets (active, skippable, and dead) while maintaining the LRU
 * ordering that already existed. This operation must only be called during the critical section in
 * uds_update_sparse_cache().
 */
static void purge_search_list(struct search_list *search_list,
			      struct sparse_cache *cache, u64 oldest_virtual_chapter)
{
	struct cached_chapter_index **entries;
	struct cached_chapter_index **skipped;
	struct cached_chapter_index **dead;
	struct cached_chapter_index *chapter;
	unsigned int next_alive = 0;
	unsigned int next_skipped = 0;
	unsigned int next_dead = 0;
	unsigned int i;

	entries = &search_list->entries[0];
	skipped = &cache->scratch_entries[0];
	dead = &cache->scratch_entries[search_list->capacity];

	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];
		if ((chapter->virtual_chapter < oldest_virtual_chapter) ||
		    (chapter->virtual_chapter == NO_CHAPTER))
			dead[next_dead++] = chapter;
		else if (chapter->skip_search)
			skipped[next_skipped++] = chapter;
		else
			entries[next_alive++] = chapter;
	}

	memcpy(&entries[next_alive], skipped,
	       next_skipped * sizeof(struct cached_chapter_index *));
	memcpy(&entries[next_alive + next_skipped], dead,
	       next_dead * sizeof(struct cached_chapter_index *));
	search_list->first_dead_entry = next_alive + next_skipped;
}

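/*
 * Replace the contents of a cache entry by reading the index pages of the named virtual chapter
 * from the volume. On failure the entry is left dead.
 */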
static int __must_check cache_chapter_index(struct cached_chapter_index *chapter,
					    u64 virtual_chapter,
					    const struct volume *volume)
{
	int result;

	release_cached_chapter_index(chapter);

	result = uds_read_chapter_index_from_volume(volume, virtual_chapter,
						    chapter->page_buffers,
						    chapter->index_pages);
	if (result != UDS_SUCCESS)
		return result;

	chapter->counters.consecutive_misses = 0;
	chapter->virtual_chapter = virtual_chapter;
	chapter->skip_search = false;

	return UDS_SUCCESS;
}

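/* Copy one zone's search list ordering to another zone's list. */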
static inline void copy_search_list(const struct search_list *source,
				    struct search_list *target)
{
	*target = *source;
	memcpy(target->entries, source->entries,
	       source->capacity * sizeof(struct cached_chapter_index *));
}

/*
 * Update the sparse cache to contain a chapter index. This function must be called by all the zone
 * threads with the same chapter number to correctly enter the thread barriers used to synchronize
 * the cache updates.
 */
int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter)
{
	int result = UDS_SUCCESS;
	const struct uds_index *index = zone->index;
	struct sparse_cache *cache = index->volume->sparse_cache;

	if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id))
		return UDS_SUCCESS;

	/*
	 * Wait for every zone thread to reach its corresponding barrier request and invoke this
	 * function before starting to modify the cache.
	 */
	enter_threads_barrier(&cache->begin_update_barrier);

	/*
	 * This is the start of the critical section: the zone zero thread is captain, effectively
	 * holding an exclusive lock on the sparse cache. All the other zone threads must do
	 * nothing between the two barriers. They will wait at the end_update_barrier again for the
	 * captain to finish the update.
	 */

	if (zone->id == ZONE_ZERO) {
		unsigned int z;
		struct search_list *list = cache->search_lists[ZONE_ZERO];

		purge_search_list(list, cache, zone->oldest_virtual_chapter);

		if (virtual_chapter >= index->oldest_virtual_chapter) {
			set_newest_entry(list, list->capacity - 1);
			result = cache_chapter_index(list->entries[0], virtual_chapter,
						     index->volume);
		}

		for (z = 1; z < cache->zone_count; z++)
			copy_search_list(list, cache->search_lists[z]);
	}

	/*
	 * This is the end of the critical section. All cache invariants must have been restored.
	 */
	enter_threads_barrier(&cache->end_update_barrier);
	return result;
}

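/* Mark every cache entry dead and release all cached chapter index pages. */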
void uds_invalidate_sparse_cache(struct sparse_cache *cache)
{
	unsigned int i;

	for (i = 0; i < cache->capacity; i++)
		release_cached_chapter_index(&cache->chapters[i]);
}

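/*
 * Decide whether to skip a cache entry during a search: skip dead or expired chapters, and when a
 * specific chapter was requested, skip everything else; otherwise honor the skip_search flag.
 */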
static inline bool should_skip_chapter(struct cached_chapter_index *chapter,
				       u64 oldest_chapter, u64 requested_chapter)
{
	if ((chapter->virtual_chapter == NO_CHAPTER) ||
	    (chapter->virtual_chapter < oldest_chapter))
		return true;

	if (requested_chapter != NO_CHAPTER)
		return requested_chapter != chapter->virtual_chapter;
	else
		return READ_ONCE(chapter->skip_search);
}

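/*
 * Search one cached chapter index for a record name, using the index page map to select the
 * single index page that could contain it.
 */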
static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter,
						    const struct index_geometry *geometry,
						    const struct index_page_map *index_page_map,
						    const struct uds_record_name *name,
						    u16 *record_page_ptr)
{
	u32 physical_chapter =
		uds_map_to_physical_chapter(geometry, chapter->virtual_chapter);
	u32 index_page_number =
		uds_find_index_page_number(index_page_map, name, physical_chapter);
	struct delta_index_page *index_page =
		&chapter->index_pages[index_page_number];

	return uds_search_chapter_index_page(index_page, geometry, name,
					     record_page_ptr);
}

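/*
 * Search the cached sparse chapter indexes for a record name. If *virtual_chapter_ptr names a
 * specific chapter, only that chapter is searched; otherwise every searchable entry in the zone's
 * list is tried. On a match, *virtual_chapter_ptr and *record_page_ptr identify where to look for
 * the record.
 */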
int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name,
			    u64 *virtual_chapter_ptr, u16 *record_page_ptr)
{
	int result;
	struct volume *volume = zone->index->volume;
	struct sparse_cache *cache = volume->sparse_cache;
	struct cached_chapter_index *chapter;
	struct search_list *search_list;
	u8 i;
	/* Search the entire cache unless a specific chapter was requested. */
	bool search_one = (*virtual_chapter_ptr != NO_CHAPTER);

	*record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
	search_list = cache->search_lists[zone->id];
	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];

		if (should_skip_chapter(chapter, zone->oldest_virtual_chapter,
					*virtual_chapter_ptr))
			continue;

		result = search_cached_chapter_index(chapter, cache->geometry,
						     volume->index_page_map, name,
						     record_page_ptr);
		if (result != UDS_SUCCESS)
			return result;

		if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) {
			/*
			 * In theory, this might be a false match while a true match exists in
			 * another chapter, but that's a very rare case and not worth the extra
			 * search complexity.
			 */
			set_newest_entry(search_list, i);
			if (zone->id == ZONE_ZERO)
				score_search_hit(chapter);

			*virtual_chapter_ptr = chapter->virtual_chapter;
			return UDS_SUCCESS;
		}

		if (zone->id == ZONE_ZERO)
			score_search_miss(cache, chapter);

		if (search_one)
			break;
	}

	return UDS_SUCCESS;
}