xref: /linux/drivers/md/dm-vdo/indexer/sparse-cache.c (revision aec2f682d47c54ef434b2d440992626d80b1ebdc)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2023 Red Hat
4  */
5 
6 #include "sparse-cache.h"
7 
8 #include <linux/cache.h>
9 #include <linux/delay.h>
10 #include <linux/dm-bufio.h>
11 
12 #include "logger.h"
13 #include "memory-alloc.h"
14 #include "permassert.h"
15 
16 #include "chapter-index.h"
17 #include "config.h"
18 #include "index.h"
19 
20 /*
21  * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a
22  * specific virtual chapter is implemented as a linear search. The cache replacement policy is
23  * least-recently-used (LRU). Again, the small size of the cache allows the LRU order to be
24  * maintained by shifting entries in an array list.
25  *
26  * Changing the contents of the cache requires the coordinated participation of all zone threads
27  * via the careful use of barrier messages sent to all the index zones by the triage queue worker
28  * thread. The critical invariant for coordination is that the cache membership must not change
29  * between updates, so that all calls to uds_sparse_cache_contains() from the zone threads must all
30  * receive the same results for every virtual chapter number. To ensure that critical invariant,
31  * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that
32  * chapter because it has had too many cache misses" are represented separately from the cache
33  * membership information (the virtual chapter number).
34  *
35  * As a result of this invariant, we have the guarantee that every zone thread will call
36  * uds_update_sparse_cache() once and exactly once to request a chapter that is not in the cache,
37  * and the serialization of the barrier requests from the triage queue ensures they will all
38  * request the same chapter number. This means the only synchronization we need can be provided by
39  * a pair of thread barriers used only in the uds_update_sparse_cache() call, providing a critical
40  * section where a single zone thread can drive the cache update while all the other zone threads
41  * are known to be blocked, waiting in the second barrier. Outside that critical section, all the
42  * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an
43  * exclusive lock. No other threads may access or modify the cache entries.
44  *
45  * Chapter statistics must only be modified by a single thread, which is also the zone zero thread.
46  * All fields that might be frequently updated by that thread are kept in separate cache-aligned
47  * structures so they will not cause cache contention via "false sharing" with the fields that are
48  * frequently accessed by all of the zone threads.
49  *
50  * The LRU order is managed independently by each zone thread, and each zone uses its own list for
51  * searching and cache membership queries. The zone zero list is used to decide which chapter to
52  * evict when the cache is updated, and its search list is copied to the other threads at that
53  * time.
54  *
55  * The virtual chapter number field of the cache entry is the single field indicating whether a
56  * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or
57  * undefined chapter number. When present in the virtual chapter number field of a
58  * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of
59  * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any
60  * cache entry that is not marked as dead is fully defined and a member of the cache, and
61  * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears
62  * in any of the cache entries.
63  *
64  * A chapter index that is a member of the cache may be excluded from searches between calls to
65  * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the
 * volume, its virtual chapter number will be less than the oldest virtual chapter number. Since
67  * that chapter is no longer part of the volume, there's no point in continuing to search that
68  * chapter index. Once invalidated, that virtual chapter will still be considered a member of the
69  * cache, but it will no longer be searched for matching names.
70  *
71  * The second mechanism is a heuristic based on keeping track of the number of consecutive search
72  * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will
73  * be set to true, causing the chapter to be skipped when searching the entire cache, but still
74  * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will
75  * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry.
 * Again, regardless of the state of the skip_search flag, the virtual chapter must still be
 * considered to be a member of the cache for uds_sparse_cache_contains().
78  */
79 
/* Consecutive search misses (cache-wide) before a chapter is skipped in full-cache searches. */
#define SKIP_SEARCH_THRESHOLD 20000
/* Zone zero drives cache updates and is the only zone allowed to modify statistics. */
#define ZONE_ZERO 0
82 
/*
 * These counters are essentially fields of the struct cached_chapter_index, but are segregated
 * into this structure because they are frequently modified. They are grouped and aligned to keep
 * them on different cache lines from the chapter fields that are accessed far more often than they
 * are updated.
 */
struct __aligned(L1_CACHE_BYTES) cached_index_counters {
	/* Misses since the last hit; compared against sparse_cache->skip_threshold */
	u64 consecutive_misses;
};
92 
struct __aligned(L1_CACHE_BYTES) cached_chapter_index {
	/*
	 * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache
	 * entry is unused. This field must only be modified in the critical section in
	 * uds_update_sparse_cache().
	 */
	u64 virtual_chapter;

	/* The number of index pages in a chapter, fixed by the volume geometry */
	u32 index_pages_count;

	/*
	 * These pointers are immutable during the life of the cache. The contents of the arrays
	 * change when the cache entry is replaced.
	 */
	struct delta_index_page *index_pages;
	struct dm_buffer **page_buffers;

	/*
	 * If set, skip the chapter when searching the entire cache. This flag is just a
	 * performance optimization. This flag is mutable between cache updates, but it rarely
	 * changes and is frequently accessed, so it groups with the immutable fields.
	 */
	bool skip_search;

	/*
	 * The cache-aligned counters change often and are placed at the end of the structure to
	 * prevent false sharing with the more stable fields above.
	 */
	struct cached_index_counters counters;
};
123 
/*
 * A search_list represents an ordering of the sparse chapter index cache entry array, from most
 * recently accessed to least recently accessed, which is the order in which the indexes should be
 * searched and the reverse order in which they should be evicted from the cache.
 *
 * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even
 * iterate over them to search, and ensuring that dead entries are replaced before any live entries
 * are evicted.
 *
 * The search list is instantiated for each zone thread, avoiding any need for synchronization. The
 * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between
 * zone threads.
 */
struct search_list {
	/* The total number of entries in the entries array */
	u8 capacity;
	/* The index of the first dead entry; all live entries precede it */
	u8 first_dead_entry;
	/* Cache entry pointers ordered from most to least recently used */
	struct cached_chapter_index *entries[];
};
142 
/* A simple reusable counting barrier built from a pair of semaphores. */
struct threads_barrier {
	/* Lock for this barrier object */
	struct semaphore lock;
	/* Semaphore for threads waiting at this barrier */
	struct semaphore wait;
	/* Number of threads which have arrived */
	int arrived;
	/* Total number of threads using this barrier */
	int thread_count;
};
153 
struct sparse_cache {
	/* The geometry governing the chapters being cached */
	const struct index_geometry *geometry;
	/* The maximum number of chapter indexes the cache can hold */
	unsigned int capacity;
	/* The number of zone threads sharing this cache */
	unsigned int zone_count;

	/* The per-zone consecutive-miss count that disables searching a chapter */
	unsigned int skip_threshold;
	/* One independently-ordered LRU search list per zone */
	struct search_list *search_lists[MAX_ZONES];
	/* Temporary space used by purge_search_list() for sorting */
	struct cached_chapter_index **scratch_entries;

	/* Barriers bracketing the critical section in uds_update_sparse_cache() */
	struct threads_barrier begin_update_barrier;
	struct threads_barrier end_update_barrier;

	/* The cache entries themselves */
	struct cached_chapter_index chapters[];
};
168 
169 static void initialize_threads_barrier(struct threads_barrier *barrier,
170 				       unsigned int thread_count)
171 {
172 	sema_init(&barrier->lock, 1);
173 	barrier->arrived = 0;
174 	barrier->thread_count = thread_count;
175 	sema_init(&barrier->wait, 0);
176 }
177 
static inline void __down(struct semaphore *semaphore)
{
	/*
	 * Avoid a plain down(semaphore) here so that long waits do not
	 * produce 120 second "task blocked" stall messages in kern.log.
	 */
	for (;;) {
		if (down_interruptible(semaphore) == 0)
			return;

		/*
		 * A signal delivered to a user-mode caller (e.g. SIGINT to
		 * "dmsetup remove" during a lengthy UDS index save) makes
		 * down_interruptible() return immediately without blocking.
		 * Sleep briefly rather than spinning; merely calling
		 * cond_resched() would still consume CPU time slices and
		 * swamp other threads trying to do computational work.
		 */
		fsleep(1000);
	}
}
198 
/*
 * Block until all thread_count threads have arrived. The last thread to
 * arrive releases all the waiters and resets the barrier for reuse.
 */
static void enter_threads_barrier(struct threads_barrier *barrier)
{
	__down(&barrier->lock);
	if (++barrier->arrived == barrier->thread_count) {
		/* last thread */
		int i;

		/* Release every waiter; the last thread never waits itself. */
		for (i = 1; i < barrier->thread_count; i++)
			up(&barrier->wait);

		/* Reset the count before dropping the lock so the barrier can
		 * be reused immediately. */
		barrier->arrived = 0;
		up(&barrier->lock);
	} else {
		up(&barrier->lock);
		__down(&barrier->wait);
	}
}
216 
217 static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter,
218 							const struct index_geometry *geometry)
219 {
220 	int result;
221 
222 	chapter->virtual_chapter = NO_CHAPTER;
223 	chapter->index_pages_count = geometry->index_pages_per_chapter;
224 
225 	result = vdo_allocate(chapter->index_pages_count, __func__, &chapter->index_pages);
226 	if (result != VDO_SUCCESS)
227 		return result;
228 
229 	return vdo_allocate(chapter->index_pages_count, "sparse index volume pages",
230 			    &chapter->page_buffers);
231 }
232 
233 static int __must_check make_search_list(struct sparse_cache *cache,
234 					 struct search_list **list_ptr)
235 {
236 	struct search_list *list;
237 	unsigned int bytes;
238 	u8 i;
239 	int result;
240 
241 	bytes = (sizeof(struct search_list) +
242 		 (cache->capacity * sizeof(struct cached_chapter_index *)));
243 	result = vdo_allocate_cache_aligned(bytes, "search list", &list);
244 	if (result != VDO_SUCCESS)
245 		return result;
246 
247 	list->capacity = cache->capacity;
248 	list->first_dead_entry = 0;
249 
250 	for (i = 0; i < list->capacity; i++)
251 		list->entries[i] = &cache->chapters[i];
252 
253 	*list_ptr = list;
254 	return UDS_SUCCESS;
255 }
256 
257 int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity,
258 			  unsigned int zone_count, struct sparse_cache **cache_ptr)
259 {
260 	int result;
261 	unsigned int i;
262 	struct sparse_cache *cache;
263 	unsigned int bytes;
264 
265 	bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index)));
266 	result = vdo_allocate_cache_aligned(bytes, "sparse cache", &cache);
267 	if (result != VDO_SUCCESS)
268 		return result;
269 
270 	cache->geometry = geometry;
271 	cache->capacity = capacity;
272 	cache->zone_count = zone_count;
273 
274 	/*
275 	 * Scale down the skip threshold since the cache only counts cache misses in zone zero, but
276 	 * requests are being handled in all zones.
277 	 */
278 	cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count);
279 
280 	initialize_threads_barrier(&cache->begin_update_barrier, zone_count);
281 	initialize_threads_barrier(&cache->end_update_barrier, zone_count);
282 
283 	for (i = 0; i < capacity; i++) {
284 		result = initialize_cached_chapter_index(&cache->chapters[i], geometry);
285 		if (result != UDS_SUCCESS)
286 			goto out;
287 	}
288 
289 	for (i = 0; i < zone_count; i++) {
290 		result = make_search_list(cache, &cache->search_lists[i]);
291 		if (result != UDS_SUCCESS)
292 			goto out;
293 	}
294 
295 	/* purge_search_list() needs some temporary lists for sorting. */
296 	result = vdo_allocate(capacity * 2, "scratch entries", &cache->scratch_entries);
297 	if (result != VDO_SUCCESS)
298 		goto out;
299 
300 	*cache_ptr = cache;
301 	return UDS_SUCCESS;
302 out:
303 	uds_free_sparse_cache(cache);
304 	return result;
305 }
306 
307 static inline void set_skip_search(struct cached_chapter_index *chapter,
308 				   bool skip_search)
309 {
310 	/* Check before setting to reduce cache line contention. */
311 	if (READ_ONCE(chapter->skip_search) != skip_search)
312 		WRITE_ONCE(chapter->skip_search, skip_search);
313 }
314 
315 static void score_search_hit(struct cached_chapter_index *chapter)
316 {
317 	chapter->counters.consecutive_misses = 0;
318 	set_skip_search(chapter, false);
319 }
320 
321 static void score_search_miss(struct sparse_cache *cache,
322 			      struct cached_chapter_index *chapter)
323 {
324 	chapter->counters.consecutive_misses++;
325 	if (chapter->counters.consecutive_misses > cache->skip_threshold)
326 		set_skip_search(chapter, true);
327 }
328 
329 static void release_cached_chapter_index(struct cached_chapter_index *chapter)
330 {
331 	unsigned int i;
332 
333 	chapter->virtual_chapter = NO_CHAPTER;
334 	if (chapter->page_buffers == NULL)
335 		return;
336 
337 	for (i = 0; i < chapter->index_pages_count; i++) {
338 		if (chapter->page_buffers[i] != NULL)
339 			dm_bufio_release(vdo_forget(chapter->page_buffers[i]));
340 	}
341 }
342 
343 void uds_free_sparse_cache(struct sparse_cache *cache)
344 {
345 	unsigned int i;
346 
347 	if (cache == NULL)
348 		return;
349 
350 	vdo_free(cache->scratch_entries);
351 
352 	for (i = 0; i < cache->zone_count; i++)
353 		vdo_free(cache->search_lists[i]);
354 
355 	for (i = 0; i < cache->capacity; i++) {
356 		release_cached_chapter_index(&cache->chapters[i]);
357 		vdo_free(cache->chapters[i].index_pages);
358 		vdo_free(cache->chapters[i].page_buffers);
359 	}
360 
361 	vdo_free(cache);
362 }
363 
364 /*
365  * Take the indicated element of the search list and move it to the start, pushing the pointers
366  * previously before it back down the list.
367  */
368 static inline void set_newest_entry(struct search_list *search_list, u8 index)
369 {
370 	struct cached_chapter_index *newest;
371 
372 	if (index > 0) {
373 		newest = search_list->entries[index];
374 		memmove(&search_list->entries[1], &search_list->entries[0],
375 			index * sizeof(struct cached_chapter_index *));
376 		search_list->entries[0] = newest;
377 	}
378 
379 	/*
380 	 * This function may have moved a dead chapter to the front of the list for reuse, in which
381 	 * case the set of dead chapters becomes smaller.
382 	 */
383 	if (search_list->first_dead_entry <= index)
384 		search_list->first_dead_entry++;
385 }
386 
387 bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
388 			       unsigned int zone_number)
389 {
390 	struct search_list *search_list;
391 	struct cached_chapter_index *chapter;
392 	u8 i;
393 
394 	/*
395 	 * The correctness of the barriers depends on the invariant that between calls to
396 	 * uds_update_sparse_cache(), the answers this function returns must never vary: the result
397 	 * for a given chapter must be identical across zones. That invariant must be maintained
398 	 * even if the chapter falls off the end of the volume, or if searching it is disabled
399 	 * because of too many search misses.
400 	 */
401 	search_list = cache->search_lists[zone_number];
402 	for (i = 0; i < search_list->first_dead_entry; i++) {
403 		chapter = search_list->entries[i];
404 
405 		if (virtual_chapter == chapter->virtual_chapter) {
406 			if (zone_number == ZONE_ZERO)
407 				score_search_hit(chapter);
408 
409 			set_newest_entry(search_list, i);
410 			return true;
411 		}
412 	}
413 
414 	return false;
415 }
416 
/*
 * Re-sort cache entries into three sets (active, skippable, and dead) while maintaining the LRU
 * ordering that already existed. This operation must only be called during the critical section in
 * uds_update_sparse_cache().
 */
static void purge_search_list(struct search_list *search_list,
			      struct sparse_cache *cache, u64 oldest_virtual_chapter)
{
	struct cached_chapter_index **entries;
	struct cached_chapter_index **skipped;
	struct cached_chapter_index **dead;
	struct cached_chapter_index *chapter;
	unsigned int next_alive = 0;
	unsigned int next_skipped = 0;
	unsigned int next_dead = 0;
	unsigned int i;

	/*
	 * Active entries are compacted in place at the front of the list;
	 * skipped and dead entries are staged in the two halves of the
	 * scratch array. Writing entries[next_alive] is safe because
	 * next_alive can never exceed i.
	 */
	entries = &search_list->entries[0];
	skipped = &cache->scratch_entries[0];
	dead = &cache->scratch_entries[search_list->capacity];

	/* Entries at or past first_dead_entry are already known to be dead. */
	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];
		if ((chapter->virtual_chapter < oldest_virtual_chapter) ||
		    (chapter->virtual_chapter == NO_CHAPTER))
			dead[next_dead++] = chapter;
		else if (chapter->skip_search)
			skipped[next_skipped++] = chapter;
		else
			entries[next_alive++] = chapter;
	}

	/* Reassemble the list: active, then skipped, then dead. */
	memcpy(&entries[next_alive], skipped,
	       next_skipped * sizeof(struct cached_chapter_index *));
	memcpy(&entries[next_alive + next_skipped], dead,
	       next_dead * sizeof(struct cached_chapter_index *));
	search_list->first_dead_entry = next_alive + next_skipped;
}
455 
456 static int __must_check cache_chapter_index(struct cached_chapter_index *chapter,
457 					    u64 virtual_chapter,
458 					    const struct volume *volume)
459 {
460 	int result;
461 
462 	release_cached_chapter_index(chapter);
463 
464 	result = uds_read_chapter_index_from_volume(volume, virtual_chapter,
465 						    chapter->page_buffers,
466 						    chapter->index_pages);
467 	if (result != UDS_SUCCESS)
468 		return result;
469 
470 	chapter->counters.consecutive_misses = 0;
471 	chapter->virtual_chapter = virtual_chapter;
472 	chapter->skip_search = false;
473 
474 	return UDS_SUCCESS;
475 }
476 
477 static inline void copy_search_list(const struct search_list *source,
478 				    struct search_list *target)
479 {
480 	*target = *source;
481 	memcpy(target->entries, source->entries,
482 	       source->capacity * sizeof(struct cached_chapter_index *));
483 }
484 
/*
 * Update the sparse cache to contain a chapter index. This function must be called by all the zone
 * threads with the same chapter number to correctly enter the thread barriers used to synchronize
 * the cache updates.
 */
int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter)
{
	int result = UDS_SUCCESS;
	const struct uds_index *index = zone->index;
	struct sparse_cache *cache = index->volume->sparse_cache;

	/* By the invariant, every zone gets the same answer here. */
	if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id))
		return UDS_SUCCESS;

	/*
	 * Wait for every zone thread to reach its corresponding barrier request and invoke this
	 * function before starting to modify the cache.
	 */
	enter_threads_barrier(&cache->begin_update_barrier);

	/*
	 * This is the start of the critical section: the zone zero thread is captain, effectively
	 * holding an exclusive lock on the sparse cache. All the other zone threads must do
	 * nothing between the two barriers. They will wait at the end_update_barrier again for the
	 * captain to finish the update.
	 */

	if (zone->id == ZONE_ZERO) {
		unsigned int z;
		struct search_list *list = cache->search_lists[ZONE_ZERO];

		/* Sort live, skipped, and dead entries; dead entries end up last. */
		purge_search_list(list, cache, zone->oldest_virtual_chapter);

		/* Chapters already off the end of the volume are not cached. */
		if (virtual_chapter >= index->oldest_virtual_chapter) {
			/*
			 * Promote the last entry (a dead one if any exist after
			 * the purge, otherwise the least recently used) to the
			 * front so it can be replaced.
			 */
			set_newest_entry(list, list->capacity - 1);
			result = cache_chapter_index(list->entries[0], virtual_chapter,
						     index->volume);
		}

		/* Propagate zone zero's ordering to every other zone's list. */
		for (z = 1; z < cache->zone_count; z++)
			copy_search_list(list, cache->search_lists[z]);
	}

	/*
	 * This is the end of the critical section. All cache invariants must have been restored.
	 */
	enter_threads_barrier(&cache->end_update_barrier);
	return result;
}
534 
535 void uds_invalidate_sparse_cache(struct sparse_cache *cache)
536 {
537 	unsigned int i;
538 
539 	for (i = 0; i < cache->capacity; i++)
540 		release_cached_chapter_index(&cache->chapters[i]);
541 }
542 
543 static inline bool should_skip_chapter(struct cached_chapter_index *chapter,
544 				       u64 oldest_chapter, u64 requested_chapter)
545 {
546 	if ((chapter->virtual_chapter == NO_CHAPTER) ||
547 	    (chapter->virtual_chapter < oldest_chapter))
548 		return true;
549 
550 	if (requested_chapter != NO_CHAPTER)
551 		return requested_chapter != chapter->virtual_chapter;
552 	else
553 		return READ_ONCE(chapter->skip_search);
554 }
555 
556 static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter,
557 						    const struct index_geometry *geometry,
558 						    const struct index_page_map *index_page_map,
559 						    const struct uds_record_name *name,
560 						    u16 *record_page_ptr)
561 {
562 	u32 physical_chapter =
563 		uds_map_to_physical_chapter(geometry, chapter->virtual_chapter);
564 	u32 index_page_number =
565 		uds_find_index_page_number(index_page_map, name, physical_chapter);
566 	struct delta_index_page *index_page =
567 		&chapter->index_pages[index_page_number];
568 
569 	return uds_search_chapter_index_page(index_page, geometry, name,
570 					     record_page_ptr);
571 }
572 
/*
 * Search the cached chapter indexes for a record name. If *virtual_chapter_ptr names a specific
 * chapter, only that chapter is searched; otherwise all live, non-skipped chapters are searched
 * in LRU order. On a match, *record_page_ptr identifies the record page and *virtual_chapter_ptr
 * is set to the chapter that matched; otherwise *record_page_ptr is NO_CHAPTER_INDEX_ENTRY.
 */
int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name,
			    u64 *virtual_chapter_ptr, u16 *record_page_ptr)
{
	int result;
	struct volume *volume = zone->index->volume;
	struct sparse_cache *cache = volume->sparse_cache;
	struct cached_chapter_index *chapter;
	struct search_list *search_list;
	u8 i;
	/* Search the entire cache unless a specific chapter was requested. */
	bool search_one = (*virtual_chapter_ptr != NO_CHAPTER);

	*record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
	/* Each zone searches using its own LRU ordering. */
	search_list = cache->search_lists[zone->id];
	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];

		if (should_skip_chapter(chapter, zone->oldest_virtual_chapter,
					*virtual_chapter_ptr))
			continue;

		result = search_cached_chapter_index(chapter, cache->geometry,
						     volume->index_page_map, name,
						     record_page_ptr);
		if (result != UDS_SUCCESS)
			return result;

		if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) {
			/*
			 * In theory, this might be a false match while a true match exists in
			 * another chapter, but that's a very rare case and not worth the extra
			 * search complexity.
			 */
			set_newest_entry(search_list, i);
			/* Only zone zero may modify chapter statistics. */
			if (zone->id == ZONE_ZERO)
				score_search_hit(chapter);

			*virtual_chapter_ptr = chapter->virtual_chapter;
			return UDS_SUCCESS;
		}

		/* Only zone zero may modify chapter statistics. */
		if (zone->id == ZONE_ZERO)
			score_search_miss(cache, chapter);

		if (search_one)
			break;
	}

	return UDS_SUCCESS;
}
623