xref: /linux/drivers/md/dm-vdo/block-map.c (revision 6e7fd890f1d6ac83805409e9c346240de2705584)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2023 Red Hat
4  */
5 
6 #include "block-map.h"
7 
8 #include <linux/bio.h>
9 #include <linux/ratelimit.h>
10 
11 #include "errors.h"
12 #include "logger.h"
13 #include "memory-alloc.h"
14 #include "permassert.h"
15 
16 #include "action-manager.h"
17 #include "admin-state.h"
18 #include "completion.h"
19 #include "constants.h"
20 #include "data-vio.h"
21 #include "encodings.h"
22 #include "io-submitter.h"
23 #include "physical-zone.h"
24 #include "recovery-journal.h"
25 #include "slab-depot.h"
26 #include "status-codes.h"
27 #include "types.h"
28 #include "vdo.h"
29 #include "vio.h"
30 #include "wait-queue.h"
31 
32 /**
33  * DOC: Block map eras
34  *
35  * The block map era, or maximum age, is used as follows:
36  *
37  * Each block map page, when dirty, records the earliest recovery journal block sequence number of
38  * the changes reflected in that dirty block. Sequence numbers are classified into eras: every
39  * @maximum_age sequence numbers, we switch to a new era. Block map pages are assigned to eras
40  * according to the sequence number they record.
41  *
42  * In the current (newest) era, block map pages are not written unless there is cache pressure. In
43  * the next oldest era, each time a new journal block is written 1/@maximum_age of the pages in
44  * this era are issued for write. In all older eras, pages are issued for write immediately.
45  */
46 
/*
 * Describes a position in the block map forest by four coordinates. The structure is packed so
 * that it contains no padding; union page_key overlays it with a single u64.
 *
 * NOTE(review): the per-field meanings below are inferred from the field names and types only;
 * confirm against the code which builds and consumes these descriptors.
 */
struct page_descriptor {
	/* Presumably the index of the tree root. */
	root_count_t root_index;
	/* Presumably the height of the page within that tree. */
	height_t height;
	/* Presumably the index of the page at that height. */
	page_number_t page_index;
	/* Presumably the slot within the page. */
	slot_number_t slot;
} __packed;
53 
/*
 * Overlays a packed page_descriptor with a u64 so the same bytes can be read either as the
 * individual descriptor fields or as one integer.
 *
 * NOTE(review): presumably the integer form is used as a map key; confirm at the use sites, and
 * confirm that the packed descriptor is no larger than 8 bytes.
 */
union page_key {
	struct page_descriptor descriptor;
	u64 key;
};
58 
/*
 * A context pairing a block map zone with a generation number.
 *
 * NOTE(review): the consumer of this context is not visible in this part of the file; from the
 * name it is presumably handed to a waiter callback which writes pages that have not been
 * re-dirtied since the given generation. Confirm at the use site.
 */
struct write_if_not_dirtied_context {
	struct block_map_zone *zone;
	u8 generation;
};
63 
/*
 * One segment of a block map tree: an array holding one tree_page pointer for each of the
 * VDO_BLOCK_MAP_TREE_HEIGHT levels of the tree.
 *
 * NOTE(review): presumably each pointer refers to the array of pages at that level; confirm.
 */
struct block_map_tree_segment {
	struct tree_page *levels[VDO_BLOCK_MAP_TREE_HEIGHT];
};
67 
/*
 * A single block map tree, represented as a pointer to its segments.
 *
 * NOTE(review): presumably this is an array with forest.segments entries; confirm at the
 * allocation site.
 */
struct block_map_tree {
	struct block_map_tree_segment *segments;
};
71 
/*
 * The collection of block map trees. The trees themselves are stored in the flexible array
 * member at the end of the structure, so a forest must be allocated with room for them.
 *
 * NOTE(review): the per-field descriptions are inferred from names and types; confirm against
 * the code which builds the forest.
 */
struct forest {
	/* The block map this forest belongs to. */
	struct block_map *map;
	/* A count of segments (presumably the length of the arrays below). */
	size_t segments;
	/* Presumably one boundary per segment. */
	struct boundary *boundaries;
	/* Presumably one tree_page array per segment. */
	struct tree_page **pages;
	/* The trees of the forest (flexible array member). */
	struct block_map_tree trees[];
};
79 
/*
 * The position of a cursor at one level of a tree: a page index and a slot number.
 *
 * NOTE(review): presumably the slot is a position within the page named by page_index; confirm.
 */
struct cursor_level {
	page_number_t page_index;
	slot_number_t slot;
};
84 
85 struct cursors;
86 
/*
 * The state of a traversal of a single block map tree. A cursor embeds a vdo_waiter, so it can
 * be placed on a wait queue, and keeps one cursor_level for each of the
 * VDO_BLOCK_MAP_TREE_HEIGHT levels of the tree.
 *
 * NOTE(review): the per-field descriptions are inferred from names and types; the traversal code
 * is not visible in this part of the file.
 */
struct cursor {
	/* Allows the cursor to wait on a wait queue. */
	struct vdo_waiter waiter;
	/* The tree being traversed. */
	struct block_map_tree *tree;
	/* Presumably the height the cursor is currently at. */
	height_t height;
	/* The set of cursors this cursor belongs to. */
	struct cursors *parent;
	/* Presumably the limits of the traversal at each level. */
	struct boundary boundary;
	/* The per-level position of the cursor. */
	struct cursor_level levels[VDO_BLOCK_MAP_TREE_HEIGHT];
	/* A vio from a vio pool (presumably the one used to read tree pages). */
	struct pooled_vio *vio;
};
96 
/*
 * The shared state of a set of cursors. The cursors themselves are stored in the flexible array
 * member at the end of the structure.
 *
 * NOTE(review): the per-field descriptions are inferred from names and types; the traversal code
 * is not visible in this part of the file.
 */
struct cursors {
	/* The block map zone on whose behalf the traversal runs. */
	struct block_map_zone *zone;
	/* The pool which supplies the cursors' vios. */
	struct vio_pool *pool;
	/* The function to invoke on entries (presumably once per tree entry visited). */
	vdo_entry_callback_fn entry_callback;
	/* The completion associated with the traversal (presumably notified when it ends). */
	struct vdo_completion *completion;
	/* A count of roots (presumably those whose traversal has not yet finished). */
	root_count_t active_roots;
	/* The cursors (flexible array member). */
	struct cursor cursors[];
};
105 
/*
 * The pbn stored in a page_info which has no page assigned to it (see initialize_info() and
 * set_info_pbn()).
 */
static const physical_block_number_t NO_PAGE = 0xFFFFFFFFFFFFFFFF;

/*
 * Used to indicate that the page holding the location of a tree root has been "loaded". Note that
 * this has the same numeric value as NO_PAGE.
 */
static const physical_block_number_t VDO_INVALID_PBN = 0xFFFFFFFFFFFFFFFF;

/*
 * The encoded block map entry for an unmapped block: the unmapped mapping state (masked to its low
 * four bits) with a physical block number of VDO_ZERO_BLOCK, whose low 32 bits are stored
 * little-endian.
 */
const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY = {
	.mapping_state = VDO_MAPPING_STATE_UNMAPPED & 0x0F,
	.pbn_high_nibble = 0,
	.pbn_low_word = __cpu_to_le32(VDO_ZERO_BLOCK & UINT_MAX),
};
116 
/*
 * Throttling constants for report_cache_pressure(): a message is logged whenever the cache's
 * pressure_report counter is a multiple of LOG_INTERVAL, and that counter is reset to zero when
 * it reaches DISPLAY_INTERVAL.
 */
#define LOG_INTERVAL 4000
#define DISPLAY_INTERVAL 100000

/*
 * For adjusting VDO page cache statistic fields which are only mutated on the logical zone thread.
 * Prevents any compiler shenanigans from affecting other threads reading those stats.
 *
 * The read of the old value is a plain read; only the store uses WRITE_ONCE(). The value argument
 * appears twice in the expansion, so it must not have side effects.
 */
#define ADD_ONCE(value, delta) WRITE_ONCE(value, (value) + (delta))
125 
126 static inline bool is_dirty(const struct page_info *info)
127 {
128 	return info->state == PS_DIRTY;
129 }
130 
131 static inline bool is_present(const struct page_info *info)
132 {
133 	return (info->state == PS_RESIDENT) || (info->state == PS_DIRTY);
134 }
135 
136 static inline bool is_in_flight(const struct page_info *info)
137 {
138 	return (info->state == PS_INCOMING) || (info->state == PS_OUTGOING);
139 }
140 
141 static inline bool is_incoming(const struct page_info *info)
142 {
143 	return info->state == PS_INCOMING;
144 }
145 
146 static inline bool is_outgoing(const struct page_info *info)
147 {
148 	return info->state == PS_OUTGOING;
149 }
150 
151 static inline bool is_valid(const struct page_info *info)
152 {
153 	return is_present(info) || is_outgoing(info);
154 }
155 
156 static char *get_page_buffer(struct page_info *info)
157 {
158 	struct vdo_page_cache *cache = info->cache;
159 
160 	return &cache->pages[(info - cache->infos) * VDO_BLOCK_SIZE];
161 }
162 
163 static inline struct vdo_page_completion *page_completion_from_waiter(struct vdo_waiter *waiter)
164 {
165 	struct vdo_page_completion *completion;
166 
167 	if (waiter == NULL)
168 		return NULL;
169 
170 	completion = container_of(waiter, struct vdo_page_completion, waiter);
171 	vdo_assert_completion_type(&completion->completion, VDO_PAGE_COMPLETION);
172 	return completion;
173 }
174 
175 /**
176  * initialize_info() - Initialize all page info structures and put them on the free list.
177  *
178  * Return: VDO_SUCCESS or an error.
179  */
180 static int initialize_info(struct vdo_page_cache *cache)
181 {
182 	struct page_info *info;
183 
184 	INIT_LIST_HEAD(&cache->free_list);
185 	for (info = cache->infos; info < cache->infos + cache->page_count; info++) {
186 		int result;
187 
188 		info->cache = cache;
189 		info->state = PS_FREE;
190 		info->pbn = NO_PAGE;
191 
192 		result = create_metadata_vio(cache->vdo, VIO_TYPE_BLOCK_MAP,
193 					     VIO_PRIORITY_METADATA, info,
194 					     get_page_buffer(info), &info->vio);
195 		if (result != VDO_SUCCESS)
196 			return result;
197 
198 		/* The thread ID should never change. */
199 		info->vio->completion.callback_thread_id = cache->zone->thread_id;
200 
201 		INIT_LIST_HEAD(&info->state_entry);
202 		list_add_tail(&info->state_entry, &cache->free_list);
203 		INIT_LIST_HEAD(&info->lru_entry);
204 	}
205 
206 	return VDO_SUCCESS;
207 }
208 
209 /**
210  * allocate_cache_components() - Allocate components of the cache which require their own
211  *                               allocation.
212  * @maximum_age: The number of journal blocks before a dirtied page is considered old and must be
213  *               written out.
214  *
215  * The caller is responsible for all clean up on errors.
216  *
217  * Return: VDO_SUCCESS or an error code.
218  */
219 static int __must_check allocate_cache_components(struct vdo_page_cache *cache)
220 {
221 	u64 size = cache->page_count * (u64) VDO_BLOCK_SIZE;
222 	int result;
223 
224 	result = vdo_allocate(cache->page_count, struct page_info, "page infos",
225 			      &cache->infos);
226 	if (result != VDO_SUCCESS)
227 		return result;
228 
229 	result = vdo_allocate_memory(size, VDO_BLOCK_SIZE, "cache pages", &cache->pages);
230 	if (result != VDO_SUCCESS)
231 		return result;
232 
233 	result = vdo_int_map_create(cache->page_count, &cache->page_map);
234 	if (result != VDO_SUCCESS)
235 		return result;
236 
237 	return initialize_info(cache);
238 }
239 
240 /**
241  * assert_on_cache_thread() - Assert that a function has been called on the VDO page cache's
242  *                            thread.
243  */
244 static inline void assert_on_cache_thread(struct vdo_page_cache *cache,
245 					  const char *function_name)
246 {
247 	thread_id_t thread_id = vdo_get_callback_thread_id();
248 
249 	VDO_ASSERT_LOG_ONLY((thread_id == cache->zone->thread_id),
250 			    "%s() must only be called on cache thread %d, not thread %d",
251 			    function_name, cache->zone->thread_id, thread_id);
252 }
253 
/**
 * assert_io_allowed() - Assert that a page cache may issue I/O.
 * @cache: The page cache which is about to issue I/O.
 *
 * The assertion fails if the admin state of the cache's zone is quiescent. Nothing is returned to
 * the caller; NOTE(review): per its name, VDO_ASSERT_LOG_ONLY presumably only logs a failure.
 */
static inline void assert_io_allowed(struct vdo_page_cache *cache)
{
	VDO_ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&cache->zone->state),
			    "VDO page cache may issue I/O");
}
260 
261 /** report_cache_pressure() - Log and, if enabled, report cache pressure. */
262 static void report_cache_pressure(struct vdo_page_cache *cache)
263 {
264 	ADD_ONCE(cache->stats.cache_pressure, 1);
265 	if (cache->waiter_count > cache->page_count) {
266 		if ((cache->pressure_report % LOG_INTERVAL) == 0)
267 			vdo_log_info("page cache pressure %u", cache->stats.cache_pressure);
268 
269 		if (++cache->pressure_report >= DISPLAY_INTERVAL)
270 			cache->pressure_report = 0;
271 	}
272 }
273 
274 /**
275  * get_page_state_name() - Return the name of a page state.
276  *
277  * If the page state is invalid a static string is returned and the invalid state is logged.
278  *
279  * Return: A pointer to a static page state name.
280  */
281 static const char * __must_check get_page_state_name(enum vdo_page_buffer_state state)
282 {
283 	int result;
284 	static const char * const state_names[] = {
285 		"FREE", "INCOMING", "FAILED", "RESIDENT", "DIRTY", "OUTGOING"
286 	};
287 
288 	BUILD_BUG_ON(ARRAY_SIZE(state_names) != PAGE_STATE_COUNT);
289 
290 	result = VDO_ASSERT(state < ARRAY_SIZE(state_names),
291 			    "Unknown page_state value %d", state);
292 	if (result != VDO_SUCCESS)
293 		return "[UNKNOWN PAGE STATE]";
294 
295 	return state_names[state];
296 }
297 
298 /**
299  * update_counter() - Update the counter associated with a given state.
300  * @info: The page info to count.
301  * @delta: The delta to apply to the counter.
302  */
303 static void update_counter(struct page_info *info, s32 delta)
304 {
305 	struct block_map_statistics *stats = &info->cache->stats;
306 
307 	switch (info->state) {
308 	case PS_FREE:
309 		ADD_ONCE(stats->free_pages, delta);
310 		return;
311 
312 	case PS_INCOMING:
313 		ADD_ONCE(stats->incoming_pages, delta);
314 		return;
315 
316 	case PS_OUTGOING:
317 		ADD_ONCE(stats->outgoing_pages, delta);
318 		return;
319 
320 	case PS_FAILED:
321 		ADD_ONCE(stats->failed_pages, delta);
322 		return;
323 
324 	case PS_RESIDENT:
325 		ADD_ONCE(stats->clean_pages, delta);
326 		return;
327 
328 	case PS_DIRTY:
329 		ADD_ONCE(stats->dirty_pages, delta);
330 		return;
331 
332 	default:
333 		return;
334 	}
335 }
336 
337 /** update_lru() - Update the lru information for an active page. */
338 static void update_lru(struct page_info *info)
339 {
340 	if (info->cache->lru_list.prev != &info->lru_entry)
341 		list_move_tail(&info->lru_entry, &info->cache->lru_list);
342 }
343 
344 /**
345  * set_info_state() - Set the state of a page_info and put it on the right list, adjusting
346  *                    counters.
347  */
348 static void set_info_state(struct page_info *info, enum vdo_page_buffer_state new_state)
349 {
350 	if (new_state == info->state)
351 		return;
352 
353 	update_counter(info, -1);
354 	info->state = new_state;
355 	update_counter(info, 1);
356 
357 	switch (info->state) {
358 	case PS_FREE:
359 	case PS_FAILED:
360 		list_move_tail(&info->state_entry, &info->cache->free_list);
361 		return;
362 
363 	case PS_OUTGOING:
364 		list_move_tail(&info->state_entry, &info->cache->outgoing_list);
365 		return;
366 
367 	case PS_DIRTY:
368 		return;
369 
370 	default:
371 		list_del_init(&info->state_entry);
372 	}
373 }
374 
375 /** set_info_pbn() - Set the pbn for an info, updating the map as needed. */
376 static int __must_check set_info_pbn(struct page_info *info, physical_block_number_t pbn)
377 {
378 	struct vdo_page_cache *cache = info->cache;
379 
380 	/* Either the new or the old page number must be NO_PAGE. */
381 	int result = VDO_ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE),
382 				"Must free a page before reusing it.");
383 	if (result != VDO_SUCCESS)
384 		return result;
385 
386 	if (info->pbn != NO_PAGE)
387 		vdo_int_map_remove(cache->page_map, info->pbn);
388 
389 	info->pbn = pbn;
390 
391 	if (pbn != NO_PAGE) {
392 		result = vdo_int_map_put(cache->page_map, pbn, info, true, NULL);
393 		if (result != VDO_SUCCESS)
394 			return result;
395 	}
396 	return VDO_SUCCESS;
397 }
398 
399 /** reset_page_info() - Reset page info to represent an unallocated page. */
400 static int reset_page_info(struct page_info *info)
401 {
402 	int result;
403 
404 	result = VDO_ASSERT(info->busy == 0, "VDO Page must not be busy");
405 	if (result != VDO_SUCCESS)
406 		return result;
407 
408 	result = VDO_ASSERT(!vdo_waitq_has_waiters(&info->waiting),
409 			    "VDO Page must not have waiters");
410 	if (result != VDO_SUCCESS)
411 		return result;
412 
413 	result = set_info_pbn(info, NO_PAGE);
414 	set_info_state(info, PS_FREE);
415 	list_del_init(&info->lru_entry);
416 	return result;
417 }
418 
419 /**
420  * find_free_page() - Find a free page.
421  *
422  * Return: A pointer to the page info structure (if found), NULL otherwise.
423  */
424 static struct page_info * __must_check find_free_page(struct vdo_page_cache *cache)
425 {
426 	struct page_info *info;
427 
428 	info = list_first_entry_or_null(&cache->free_list, struct page_info,
429 					state_entry);
430 	if (info != NULL)
431 		list_del_init(&info->state_entry);
432 
433 	return info;
434 }
435 
436 /**
437  * find_page() - Find the page info (if any) associated with a given pbn.
438  * @pbn: The absolute physical block number of the page.
439  *
440  * Return: The page info for the page if available, or NULL if not.
441  */
442 static struct page_info * __must_check find_page(struct vdo_page_cache *cache,
443 						 physical_block_number_t pbn)
444 {
445 	if ((cache->last_found != NULL) && (cache->last_found->pbn == pbn))
446 		return cache->last_found;
447 
448 	cache->last_found = vdo_int_map_get(cache->page_map, pbn);
449 	return cache->last_found;
450 }
451 
452 /**
453  * select_lru_page() - Determine which page is least recently used.
454  *
455  * Picks the least recently used from among the non-busy entries at the front of each of the lru
456  * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely
457  * that the entries at the front are busy unless the queue is very short, but not impossible.
458  *
459  * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be
460  *         found. The page can be dirty or resident.
461  */
462 static struct page_info * __must_check select_lru_page(struct vdo_page_cache *cache)
463 {
464 	struct page_info *info;
465 
466 	list_for_each_entry(info, &cache->lru_list, lru_entry)
467 		if ((info->busy == 0) && !is_in_flight(info))
468 			return info;
469 
470 	return NULL;
471 }
472 
473 /* ASYNCHRONOUS INTERFACE BEYOND THIS POINT */
474 
475 /**
476  * complete_with_page() - Helper to complete the VDO Page Completion request successfully.
477  * @info: The page info representing the result page.
478  * @vdo_page_comp: The VDO page completion to complete.
479  */
480 static void complete_with_page(struct page_info *info,
481 			       struct vdo_page_completion *vdo_page_comp)
482 {
483 	bool available = vdo_page_comp->writable ? is_present(info) : is_valid(info);
484 
485 	if (!available) {
486 		vdo_log_error_strerror(VDO_BAD_PAGE,
487 				       "Requested cache page %llu in state %s is not %s",
488 				       (unsigned long long) info->pbn,
489 				       get_page_state_name(info->state),
490 				       vdo_page_comp->writable ? "present" : "valid");
491 		vdo_fail_completion(&vdo_page_comp->completion, VDO_BAD_PAGE);
492 		return;
493 	}
494 
495 	vdo_page_comp->info = info;
496 	vdo_page_comp->ready = true;
497 	vdo_finish_completion(&vdo_page_comp->completion);
498 }
499 
500 /**
501  * complete_waiter_with_error() - Complete a page completion with an error code.
502  * @waiter: The page completion, as a waiter.
503  * @result_ptr: A pointer to the error code.
504  *
505  * Implements waiter_callback_fn.
506  */
507 static void complete_waiter_with_error(struct vdo_waiter *waiter, void *result_ptr)
508 {
509 	int *result = result_ptr;
510 
511 	vdo_fail_completion(&page_completion_from_waiter(waiter)->completion, *result);
512 }
513 
/**
 * complete_waiter_with_page() - Hand a page to a waiting page completion.
 * @waiter: The page completion, as a waiter.
 * @page_info: The page info (a struct page_info) to complete with.
 *
 * Implements waiter_callback_fn.
 */
static void complete_waiter_with_page(struct vdo_waiter *waiter, void *page_info)
{
	struct vdo_page_completion *page_completion = page_completion_from_waiter(waiter);
	struct page_info *info = page_info;

	complete_with_page(info, page_completion);
}
525 
526 /**
527  * distribute_page_over_waitq() - Complete a waitq of VDO page completions with a page result.
528  *
529  * Upon completion the waitq will be empty.
530  *
531  * Return: The number of pages distributed.
532  */
533 static unsigned int distribute_page_over_waitq(struct page_info *info,
534 					       struct vdo_wait_queue *waitq)
535 {
536 	size_t num_pages;
537 
538 	update_lru(info);
539 	num_pages = vdo_waitq_num_waiters(waitq);
540 
541 	/*
542 	 * Increment the busy count once for each pending completion so that this page does not
543 	 * stop being busy until all completions have been processed.
544 	 */
545 	info->busy += num_pages;
546 
547 	vdo_waitq_notify_all_waiters(waitq, complete_waiter_with_page, info);
548 	return num_pages;
549 }
550 
551 /**
552  * set_persistent_error() - Set a persistent error which all requests will receive in the future.
553  * @context: A string describing what triggered the error.
554  *
555  * Once triggered, all enqueued completions will get this error. Any future requests will result in
556  * this error as well.
557  */
558 static void set_persistent_error(struct vdo_page_cache *cache, const char *context,
559 				 int result)
560 {
561 	struct page_info *info;
562 	/* If we're already read-only, there's no need to log. */
563 	struct vdo *vdo = cache->vdo;
564 
565 	if ((result != VDO_READ_ONLY) && !vdo_is_read_only(vdo)) {
566 		vdo_log_error_strerror(result, "VDO Page Cache persistent error: %s",
567 				       context);
568 		vdo_enter_read_only_mode(vdo, result);
569 	}
570 
571 	assert_on_cache_thread(cache, __func__);
572 
573 	vdo_waitq_notify_all_waiters(&cache->free_waiters,
574 				     complete_waiter_with_error, &result);
575 	cache->waiter_count = 0;
576 
577 	for (info = cache->infos; info < cache->infos + cache->page_count; info++) {
578 		vdo_waitq_notify_all_waiters(&info->waiting,
579 					     complete_waiter_with_error, &result);
580 	}
581 }
582 
583 /**
584  * validate_completed_page() - Check that a page completion which is being freed to the cache
585  *                             referred to a valid page and is in a valid state.
586  * @writable: Whether a writable page is required.
587  *
588  * Return: VDO_SUCCESS if the page was valid, otherwise as error
589  */
590 static int __must_check validate_completed_page(struct vdo_page_completion *completion,
591 						bool writable)
592 {
593 	int result;
594 
595 	result = VDO_ASSERT(completion->ready, "VDO Page completion not ready");
596 	if (result != VDO_SUCCESS)
597 		return result;
598 
599 	result = VDO_ASSERT(completion->info != NULL,
600 			    "VDO Page Completion must be complete");
601 	if (result != VDO_SUCCESS)
602 		return result;
603 
604 	result = VDO_ASSERT(completion->info->pbn == completion->pbn,
605 			    "VDO Page Completion pbn must be consistent");
606 	if (result != VDO_SUCCESS)
607 		return result;
608 
609 	result = VDO_ASSERT(is_valid(completion->info),
610 			    "VDO Page Completion page must be valid");
611 	if (result != VDO_SUCCESS)
612 		return result;
613 
614 	if (writable) {
615 		result = VDO_ASSERT(completion->writable,
616 				    "VDO Page Completion must be writable");
617 		if (result != VDO_SUCCESS)
618 			return result;
619 	}
620 
621 	return VDO_SUCCESS;
622 }
623 
624 static void check_for_drain_complete(struct block_map_zone *zone)
625 {
626 	if (vdo_is_state_draining(&zone->state) &&
627 	    (zone->active_lookups == 0) &&
628 	    !vdo_waitq_has_waiters(&zone->flush_waiters) &&
629 	    !is_vio_pool_busy(zone->vio_pool) &&
630 	    (zone->page_cache.outstanding_reads == 0) &&
631 	    (zone->page_cache.outstanding_writes == 0)) {
632 		vdo_finish_draining_with_result(&zone->state,
633 						(vdo_is_read_only(zone->block_map->vdo) ?
634 						 VDO_READ_ONLY : VDO_SUCCESS));
635 	}
636 }
637 
638 static void enter_zone_read_only_mode(struct block_map_zone *zone, int result)
639 {
640 	vdo_enter_read_only_mode(zone->block_map->vdo, result);
641 
642 	/*
643 	 * We are in read-only mode, so we won't ever write any page out.
644 	 * Just take all waiters off the waitq so the zone can drain.
645 	 */
646 	vdo_waitq_init(&zone->flush_waiters);
647 	check_for_drain_complete(zone);
648 }
649 
650 static bool __must_check
651 validate_completed_page_or_enter_read_only_mode(struct vdo_page_completion *completion,
652 						bool writable)
653 {
654 	int result = validate_completed_page(completion, writable);
655 
656 	if (result == VDO_SUCCESS)
657 		return true;
658 
659 	enter_zone_read_only_mode(completion->info->cache->zone, result);
660 	return false;
661 }
662 
663 /**
664  * handle_load_error() - Handle page load errors.
665  * @completion: The page read vio.
666  */
667 static void handle_load_error(struct vdo_completion *completion)
668 {
669 	int result = completion->result;
670 	struct page_info *info = completion->parent;
671 	struct vdo_page_cache *cache = info->cache;
672 
673 	assert_on_cache_thread(cache, __func__);
674 	vio_record_metadata_io_error(as_vio(completion));
675 	vdo_enter_read_only_mode(cache->zone->block_map->vdo, result);
676 	ADD_ONCE(cache->stats.failed_reads, 1);
677 	set_info_state(info, PS_FAILED);
678 	vdo_waitq_notify_all_waiters(&info->waiting, complete_waiter_with_error, &result);
679 	reset_page_info(info);
680 
681 	/*
682 	 * Don't decrement until right before calling check_for_drain_complete() to
683 	 * ensure that the above work can't cause the page cache to be freed out from under us.
684 	 */
685 	cache->outstanding_reads--;
686 	check_for_drain_complete(cache->zone);
687 }
688 
689 /**
690  * page_is_loaded() - Callback used when a page has been loaded.
691  * @completion: The vio which has loaded the page. Its parent is the page_info.
692  */
693 static void page_is_loaded(struct vdo_completion *completion)
694 {
695 	struct page_info *info = completion->parent;
696 	struct vdo_page_cache *cache = info->cache;
697 	nonce_t nonce = info->cache->zone->block_map->nonce;
698 	struct block_map_page *page;
699 	enum block_map_page_validity validity;
700 
701 	assert_on_cache_thread(cache, __func__);
702 
703 	page = (struct block_map_page *) get_page_buffer(info);
704 	validity = vdo_validate_block_map_page(page, nonce, info->pbn);
705 	if (validity == VDO_BLOCK_MAP_PAGE_BAD) {
706 		physical_block_number_t pbn = vdo_get_block_map_page_pbn(page);
707 		int result = vdo_log_error_strerror(VDO_BAD_PAGE,
708 						    "Expected page %llu but got page %llu instead",
709 						    (unsigned long long) info->pbn,
710 						    (unsigned long long) pbn);
711 
712 		vdo_continue_completion(completion, result);
713 		return;
714 	}
715 
716 	if (validity == VDO_BLOCK_MAP_PAGE_INVALID)
717 		vdo_format_block_map_page(page, nonce, info->pbn, false);
718 
719 	info->recovery_lock = 0;
720 	set_info_state(info, PS_RESIDENT);
721 	distribute_page_over_waitq(info, &info->waiting);
722 
723 	/*
724 	 * Don't decrement until right before calling check_for_drain_complete() to
725 	 * ensure that the above work can't cause the page cache to be freed out from under us.
726 	 */
727 	cache->outstanding_reads--;
728 	check_for_drain_complete(cache->zone);
729 }
730 
731 /**
732  * handle_rebuild_read_error() - Handle a read error during a read-only rebuild.
733  * @completion: The page load completion.
734  */
735 static void handle_rebuild_read_error(struct vdo_completion *completion)
736 {
737 	struct page_info *info = completion->parent;
738 	struct vdo_page_cache *cache = info->cache;
739 
740 	assert_on_cache_thread(cache, __func__);
741 
742 	/*
743 	 * We are doing a read-only rebuild, so treat this as a successful read
744 	 * of an uninitialized page.
745 	 */
746 	vio_record_metadata_io_error(as_vio(completion));
747 	ADD_ONCE(cache->stats.failed_reads, 1);
748 	memset(get_page_buffer(info), 0, VDO_BLOCK_SIZE);
749 	vdo_reset_completion(completion);
750 	page_is_loaded(completion);
751 }
752 
753 static void load_cache_page_endio(struct bio *bio)
754 {
755 	struct vio *vio = bio->bi_private;
756 	struct page_info *info = vio->completion.parent;
757 
758 	continue_vio_after_io(vio, page_is_loaded, info->cache->zone->thread_id);
759 }
760 
761 /**
762  * launch_page_load() - Begin the process of loading a page.
763  *
764  * Return: VDO_SUCCESS or an error code.
765  */
766 static int __must_check launch_page_load(struct page_info *info,
767 					 physical_block_number_t pbn)
768 {
769 	int result;
770 	vdo_action_fn callback;
771 	struct vdo_page_cache *cache = info->cache;
772 
773 	assert_io_allowed(cache);
774 
775 	result = set_info_pbn(info, pbn);
776 	if (result != VDO_SUCCESS)
777 		return result;
778 
779 	result = VDO_ASSERT((info->busy == 0), "Page is not busy before loading.");
780 	if (result != VDO_SUCCESS)
781 		return result;
782 
783 	set_info_state(info, PS_INCOMING);
784 	cache->outstanding_reads++;
785 	ADD_ONCE(cache->stats.pages_loaded, 1);
786 	callback = (cache->rebuilding ? handle_rebuild_read_error : handle_load_error);
787 	vdo_submit_metadata_vio(info->vio, pbn, load_cache_page_endio,
788 				callback, REQ_OP_READ | REQ_PRIO);
789 	return VDO_SUCCESS;
790 }
791 
792 static void write_pages(struct vdo_completion *completion);
793 
794 /** handle_flush_error() - Handle errors flushing the layer. */
795 static void handle_flush_error(struct vdo_completion *completion)
796 {
797 	struct page_info *info = completion->parent;
798 
799 	vio_record_metadata_io_error(as_vio(completion));
800 	set_persistent_error(info->cache, "flush failed", completion->result);
801 	write_pages(completion);
802 }
803 
804 static void flush_endio(struct bio *bio)
805 {
806 	struct vio *vio = bio->bi_private;
807 	struct page_info *info = vio->completion.parent;
808 
809 	continue_vio_after_io(vio, write_pages, info->cache->zone->thread_id);
810 }
811 
812 /** save_pages() - Attempt to save the outgoing pages by first flushing the layer. */
813 static void save_pages(struct vdo_page_cache *cache)
814 {
815 	struct page_info *info;
816 	struct vio *vio;
817 
818 	if ((cache->pages_in_flush > 0) || (cache->pages_to_flush == 0))
819 		return;
820 
821 	assert_io_allowed(cache);
822 
823 	info = list_first_entry(&cache->outgoing_list, struct page_info, state_entry);
824 
825 	cache->pages_in_flush = cache->pages_to_flush;
826 	cache->pages_to_flush = 0;
827 	ADD_ONCE(cache->stats.flush_count, 1);
828 
829 	vio = info->vio;
830 
831 	/*
832 	 * We must make sure that the recovery journal entries that changed these pages were
833 	 * successfully persisted, and thus must issue a flush before each batch of pages is
834 	 * written to ensure this.
835 	 */
836 	vdo_submit_flush_vio(vio, flush_endio, handle_flush_error);
837 }
838 
839 /**
840  * schedule_page_save() - Add a page to the outgoing list of pages waiting to be saved.
841  *
842  * Once in the list, a page may not be used until it has been written out.
843  */
844 static void schedule_page_save(struct page_info *info)
845 {
846 	if (info->busy > 0) {
847 		info->write_status = WRITE_STATUS_DEFERRED;
848 		return;
849 	}
850 
851 	info->cache->pages_to_flush++;
852 	info->cache->outstanding_writes++;
853 	set_info_state(info, PS_OUTGOING);
854 }
855 
856 /**
857  * launch_page_save() - Add a page to outgoing pages waiting to be saved, and then start saving
858  * pages if another save is not in progress.
859  */
860 static void launch_page_save(struct page_info *info)
861 {
862 	schedule_page_save(info);
863 	save_pages(info->cache);
864 }
865 
866 /**
867  * completion_needs_page() - Determine whether a given vdo_page_completion (as a waiter) is
868  *                           requesting a given page number.
869  * @context: A pointer to the pbn of the desired page.
870  *
871  * Implements waiter_match_fn.
872  *
873  * Return: true if the page completion is for the desired page number.
874  */
875 static bool completion_needs_page(struct vdo_waiter *waiter, void *context)
876 {
877 	physical_block_number_t *pbn = context;
878 
879 	return (page_completion_from_waiter(waiter)->pbn == *pbn);
880 }
881 
882 /**
883  * allocate_free_page() - Allocate a free page to the first completion in the waiting queue, and
884  *                        any other completions that match it in page number.
885  */
886 static void allocate_free_page(struct page_info *info)
887 {
888 	int result;
889 	struct vdo_waiter *oldest_waiter;
890 	physical_block_number_t pbn;
891 	struct vdo_page_cache *cache = info->cache;
892 
893 	assert_on_cache_thread(cache, __func__);
894 
895 	if (!vdo_waitq_has_waiters(&cache->free_waiters)) {
896 		if (cache->stats.cache_pressure > 0) {
897 			vdo_log_info("page cache pressure relieved");
898 			WRITE_ONCE(cache->stats.cache_pressure, 0);
899 		}
900 
901 		return;
902 	}
903 
904 	result = reset_page_info(info);
905 	if (result != VDO_SUCCESS) {
906 		set_persistent_error(cache, "cannot reset page info", result);
907 		return;
908 	}
909 
910 	oldest_waiter = vdo_waitq_get_first_waiter(&cache->free_waiters);
911 	pbn = page_completion_from_waiter(oldest_waiter)->pbn;
912 
913 	/*
914 	 * Remove all entries which match the page number in question and push them onto the page
915 	 * info's waitq.
916 	 */
917 	vdo_waitq_dequeue_matching_waiters(&cache->free_waiters, completion_needs_page,
918 					   &pbn, &info->waiting);
919 	cache->waiter_count -= vdo_waitq_num_waiters(&info->waiting);
920 
921 	result = launch_page_load(info, pbn);
922 	if (result != VDO_SUCCESS) {
923 		vdo_waitq_notify_all_waiters(&info->waiting,
924 					     complete_waiter_with_error, &result);
925 	}
926 }
927 
928 /**
929  * discard_a_page() - Begin the process of discarding a page.
930  *
931  * If no page is discardable, increments a count of deferred frees so that the next release of a
932  * page which is no longer busy will kick off another discard cycle. This is an indication that the
933  * cache is not big enough.
934  *
935  * If the selected page is not dirty, immediately allocates the page to the oldest completion
936  * waiting for a free page.
937  */
938 static void discard_a_page(struct vdo_page_cache *cache)
939 {
940 	struct page_info *info = select_lru_page(cache);
941 
942 	if (info == NULL) {
943 		report_cache_pressure(cache);
944 		return;
945 	}
946 
947 	if (!is_dirty(info)) {
948 		allocate_free_page(info);
949 		return;
950 	}
951 
952 	VDO_ASSERT_LOG_ONLY(!is_in_flight(info),
953 			    "page selected for discard is not in flight");
954 
955 	cache->discard_count++;
956 	info->write_status = WRITE_STATUS_DISCARD;
957 	launch_page_save(info);
958 }
959 
960 /**
961  * discard_page_for_completion() - Helper used to trigger a discard so that the completion can get
962  *                                 a different page.
963  */
964 static void discard_page_for_completion(struct vdo_page_completion *vdo_page_comp)
965 {
966 	struct vdo_page_cache *cache = vdo_page_comp->cache;
967 
968 	cache->waiter_count++;
969 	vdo_waitq_enqueue_waiter(&cache->free_waiters, &vdo_page_comp->waiter);
970 	discard_a_page(cache);
971 }
972 
973 /**
974  * discard_page_if_needed() - Helper used to trigger a discard if the cache needs another free
975  *                            page.
976  * @cache: The page cache.
977  */
978 static void discard_page_if_needed(struct vdo_page_cache *cache)
979 {
980 	if (cache->waiter_count > cache->discard_count)
981 		discard_a_page(cache);
982 }
983 
984 /**
985  * write_has_finished() - Inform the cache that a write has finished (possibly with an error).
986  * @info: The info structure for the page whose write just completed.
987  *
988  * Return: true if the page write was a discard.
989  */
990 static bool write_has_finished(struct page_info *info)
991 {
992 	bool was_discard = (info->write_status == WRITE_STATUS_DISCARD);
993 
994 	assert_on_cache_thread(info->cache, __func__);
995 	info->cache->outstanding_writes--;
996 
997 	info->write_status = WRITE_STATUS_NORMAL;
998 	return was_discard;
999 }
1000 
1001 /**
1002  * handle_page_write_error() - Handler for page write errors.
1003  * @completion: The page write vio.
1004  */
1005 static void handle_page_write_error(struct vdo_completion *completion)
1006 {
1007 	int result = completion->result;
1008 	struct page_info *info = completion->parent;
1009 	struct vdo_page_cache *cache = info->cache;
1010 
1011 	vio_record_metadata_io_error(as_vio(completion));
1012 
1013 	/* If we're already read-only, write failures are to be expected. */
1014 	if (result != VDO_READ_ONLY) {
1015 		vdo_log_ratelimit(vdo_log_error,
1016 				  "failed to write block map page %llu",
1017 				  (unsigned long long) info->pbn);
1018 	}
1019 
1020 	set_info_state(info, PS_DIRTY);
1021 	ADD_ONCE(cache->stats.failed_writes, 1);
1022 	set_persistent_error(cache, "cannot write page", result);
1023 
1024 	if (!write_has_finished(info))
1025 		discard_page_if_needed(cache);
1026 
1027 	check_for_drain_complete(cache->zone);
1028 }
1029 
1030 static void page_is_written_out(struct vdo_completion *completion);
1031 
1032 static void write_cache_page_endio(struct bio *bio)
1033 {
1034 	struct vio *vio = bio->bi_private;
1035 	struct page_info *info = vio->completion.parent;
1036 
1037 	continue_vio_after_io(vio, page_is_written_out, info->cache->zone->thread_id);
1038 }
1039 
/**
 * page_is_written_out() - Callback used when a page has been written out.
 * @completion: The vio which wrote the page. Its parent is a page_info.
 *
 * If the page buffer is not yet marked initialized, this marks it and issues a second write of
 * the same page (with a preflush) and returns; the rest of the processing happens when that
 * second write completes. Otherwise the page's recovery journal reference is released, the page
 * becomes resident, and it is either handed to those waiting for it or allocated as a free page.
 */
static void page_is_written_out(struct vdo_completion *completion)
{
	bool was_discard, reclaimed;
	u32 reclamations;
	struct page_info *info = completion->parent;
	struct vdo_page_cache *cache = info->cache;
	struct block_map_page *page = (struct block_map_page *) get_page_buffer(info);

	if (!page->header.initialized) {
		/* The write just finished carried the uninitialized flag; write the page again. */
		page->header.initialized = true;
		vdo_submit_metadata_vio(info->vio, info->pbn,
					write_cache_page_endio,
					handle_page_write_error,
					REQ_OP_WRITE | REQ_PRIO | REQ_PREFLUSH);
		return;
	}

	/* Handle journal updates and torn write protection. */
	vdo_release_recovery_journal_block_reference(cache->zone->block_map->journal,
						     info->recovery_lock,
						     VDO_ZONE_TYPE_LOGICAL,
						     cache->zone->zone_number);
	info->recovery_lock = 0;
	was_discard = write_has_finished(info);
	/*
	 * The page stays in use if the write was not a discard, or if it is busy or has waiters.
	 * This must be decided before the waiters are distributed below.
	 */
	reclaimed = (!was_discard || (info->busy > 0) || vdo_waitq_has_waiters(&info->waiting));

	set_info_state(info, PS_RESIDENT);

	reclamations = distribute_page_over_waitq(info, &info->waiting);
	ADD_ONCE(cache->stats.reclaimed, reclamations);

	if (was_discard)
		cache->discard_count--;

	if (reclaimed)
		discard_page_if_needed(cache);
	else
		allocate_free_page(info);

	check_for_drain_complete(cache->zone);
}
1085 
1086 /**
1087  * write_pages() - Write the batch of pages which were covered by the layer flush which just
1088  *                 completed.
1089  * @flush_completion: The flush vio.
1090  *
1091  * This callback is registered in save_pages().
1092  */
1093 static void write_pages(struct vdo_completion *flush_completion)
1094 {
1095 	struct vdo_page_cache *cache = ((struct page_info *) flush_completion->parent)->cache;
1096 
1097 	/*
1098 	 * We need to cache these two values on the stack since it is possible for the last
1099 	 * page info to cause the page cache to get freed. Hence once we launch the last page,
1100 	 * it may be unsafe to dereference the cache.
1101 	 */
1102 	bool has_unflushed_pages = (cache->pages_to_flush > 0);
1103 	page_count_t pages_in_flush = cache->pages_in_flush;
1104 
1105 	cache->pages_in_flush = 0;
1106 	while (pages_in_flush-- > 0) {
1107 		struct page_info *info =
1108 			list_first_entry(&cache->outgoing_list, struct page_info,
1109 					 state_entry);
1110 
1111 		list_del_init(&info->state_entry);
1112 		if (vdo_is_read_only(info->cache->vdo)) {
1113 			struct vdo_completion *completion = &info->vio->completion;
1114 
1115 			vdo_reset_completion(completion);
1116 			completion->callback = page_is_written_out;
1117 			completion->error_handler = handle_page_write_error;
1118 			vdo_fail_completion(completion, VDO_READ_ONLY);
1119 			continue;
1120 		}
1121 		ADD_ONCE(info->cache->stats.pages_saved, 1);
1122 		vdo_submit_metadata_vio(info->vio, info->pbn, write_cache_page_endio,
1123 					handle_page_write_error, REQ_OP_WRITE | REQ_PRIO);
1124 	}
1125 
1126 	if (has_unflushed_pages) {
1127 		/*
1128 		 * If there are unflushed pages, the cache can't have been freed, so this call is
1129 		 * safe.
1130 		 */
1131 		save_pages(cache);
1132 	}
1133 }
1134 
1135 /**
1136  * vdo_release_page_completion() - Release a VDO Page Completion.
1137  *
1138  * The page referenced by this completion (if any) will no longer be held busy by this completion.
1139  * If a page becomes discardable and there are completions awaiting free pages then a new round of
1140  * page discarding is started.
1141  */
1142 void vdo_release_page_completion(struct vdo_completion *completion)
1143 {
1144 	struct page_info *discard_info = NULL;
1145 	struct vdo_page_completion *page_completion = as_vdo_page_completion(completion);
1146 	struct vdo_page_cache *cache;
1147 
1148 	if (completion->result == VDO_SUCCESS) {
1149 		if (!validate_completed_page_or_enter_read_only_mode(page_completion, false))
1150 			return;
1151 
1152 		if (--page_completion->info->busy == 0)
1153 			discard_info = page_completion->info;
1154 	}
1155 
1156 	VDO_ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL),
1157 			    "Page being released after leaving all queues");
1158 
1159 	page_completion->info = NULL;
1160 	cache = page_completion->cache;
1161 	assert_on_cache_thread(cache, __func__);
1162 
1163 	if (discard_info != NULL) {
1164 		if (discard_info->write_status == WRITE_STATUS_DEFERRED) {
1165 			discard_info->write_status = WRITE_STATUS_NORMAL;
1166 			launch_page_save(discard_info);
1167 		}
1168 
1169 		/*
1170 		 * if there are excess requests for pages (that have not already started discards)
1171 		 * we need to discard some page (which may be this one)
1172 		 */
1173 		discard_page_if_needed(cache);
1174 	}
1175 }
1176 
1177 /**
1178  * load_page_for_completion() - Helper function to load a page as described by a VDO Page
1179  *                              Completion.
1180  */
1181 static void load_page_for_completion(struct page_info *info,
1182 				     struct vdo_page_completion *vdo_page_comp)
1183 {
1184 	int result;
1185 
1186 	vdo_waitq_enqueue_waiter(&info->waiting, &vdo_page_comp->waiter);
1187 	result = launch_page_load(info, vdo_page_comp->pbn);
1188 	if (result != VDO_SUCCESS) {
1189 		vdo_waitq_notify_all_waiters(&info->waiting,
1190 					     complete_waiter_with_error, &result);
1191 	}
1192 }
1193 
/**
 * vdo_get_page() - Initialize a page completion and get a block map page.
 * @page_completion: The vdo_page_completion to initialize.
 * @zone: The block map zone of the desired page.
 * @pbn: The absolute physical block of the desired page.
 * @writable: Whether the page can be modified.
 * @parent: The object to notify when the fetch is complete.
 * @callback: The notification callback.
 * @error_handler: The handler for fetch errors.
 * @requeue: Whether we must requeue when notifying the parent.
 *
 * May cause another page to be discarded (potentially writing a dirty page) and the one nominated
 * by the completion to be loaded from disk. When the callback is invoked, the page will be
 * resident in the cache and marked busy. All callers must call vdo_release_page_completion()
 * when they are done with the page to clear the busy mark.
 */
void vdo_get_page(struct vdo_page_completion *page_completion,
		  struct block_map_zone *zone, physical_block_number_t pbn,
		  bool writable, void *parent, vdo_action_fn callback,
		  vdo_action_fn error_handler, bool requeue)
{
	struct vdo_page_cache *cache = &zone->page_cache;
	struct vdo_completion *completion = &page_completion->completion;
	struct page_info *info;

	assert_on_cache_thread(cache, __func__);
	VDO_ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL),
			    "New page completion was not already on a wait queue");

	/* Reset the whole page completion; fields not named here are zeroed. */
	*page_completion = (struct vdo_page_completion) {
		.pbn = pbn,
		.writable = writable,
		.cache = cache,
	};

	vdo_initialize_completion(completion, cache->vdo, VDO_PAGE_COMPLETION);
	vdo_prepare_completion(completion, callback, error_handler,
			       cache->zone->thread_id, parent);
	completion->requeue = requeue;

	/* Writable pages are refused once the vdo is read-only. */
	if (page_completion->writable && vdo_is_read_only(cache->vdo)) {
		vdo_fail_completion(completion, VDO_READ_ONLY);
		return;
	}

	if (page_completion->writable)
		ADD_ONCE(cache->stats.write_count, 1);
	else
		ADD_ONCE(cache->stats.read_count, 1);

	info = find_page(cache, page_completion->pbn);
	if (info != NULL) {
		/* The page is in the cache already. */
		if ((info->write_status == WRITE_STATUS_DEFERRED) ||
		    is_incoming(info) ||
		    (is_outgoing(info) && page_completion->writable)) {
			/* The page is unusable until it has finished I/O. */
			ADD_ONCE(cache->stats.wait_for_page, 1);
			vdo_waitq_enqueue_waiter(&info->waiting, &page_completion->waiter);
			return;
		}

		if (is_valid(info)) {
			/* The page is usable. */
			ADD_ONCE(cache->stats.found_in_cache, 1);
			if (!is_present(info))
				ADD_ONCE(cache->stats.read_outgoing, 1);
			update_lru(info);
			/* Hold the page busy until vdo_release_page_completion() is called. */
			info->busy++;
			complete_with_page(info, page_completion);
			return;
		}

		/* Something horrible has gone wrong. */
		VDO_ASSERT_LOG_ONLY(false, "Info found in a usable state.");
		/* After logging, execution falls through and the page is fetched afresh. */
	}

	/* The page must be fetched. */
	info = find_free_page(cache);
	if (info != NULL) {
		ADD_ONCE(cache->stats.fetch_required, 1);
		load_page_for_completion(info, page_completion);
		return;
	}

	/* The page must wait for a page to be discarded. */
	ADD_ONCE(cache->stats.discard_required, 1);
	discard_page_for_completion(page_completion);
}
1283 
1284 /**
1285  * vdo_request_page_write() - Request that a VDO page be written out as soon as it is not busy.
1286  * @completion: The vdo_page_completion containing the page.
1287  */
1288 void vdo_request_page_write(struct vdo_completion *completion)
1289 {
1290 	struct page_info *info;
1291 	struct vdo_page_completion *vdo_page_comp = as_vdo_page_completion(completion);
1292 
1293 	if (!validate_completed_page_or_enter_read_only_mode(vdo_page_comp, true))
1294 		return;
1295 
1296 	info = vdo_page_comp->info;
1297 	set_info_state(info, PS_DIRTY);
1298 	launch_page_save(info);
1299 }
1300 
1301 /**
1302  * vdo_get_cached_page() - Get the block map page from a page completion.
1303  * @completion: A vdo page completion whose callback has been called.
1304  * @page_ptr: A pointer to hold the page
1305  *
1306  * Return: VDO_SUCCESS or an error
1307  */
1308 int vdo_get_cached_page(struct vdo_completion *completion,
1309 			struct block_map_page **page_ptr)
1310 {
1311 	int result;
1312 	struct vdo_page_completion *vpc;
1313 
1314 	vpc = as_vdo_page_completion(completion);
1315 	result = validate_completed_page(vpc, true);
1316 	if (result == VDO_SUCCESS)
1317 		*page_ptr = (struct block_map_page *) get_page_buffer(vpc->info);
1318 
1319 	return result;
1320 }
1321 
1322 /**
1323  * vdo_invalidate_page_cache() - Invalidate all entries in the VDO page cache.
1324  *
1325  * There must not be any dirty pages in the cache.
1326  *
1327  * Return: A success or error code.
1328  */
1329 int vdo_invalidate_page_cache(struct vdo_page_cache *cache)
1330 {
1331 	struct page_info *info;
1332 
1333 	assert_on_cache_thread(cache, __func__);
1334 
1335 	/* Make sure we don't throw away any dirty pages. */
1336 	for (info = cache->infos; info < cache->infos + cache->page_count; info++) {
1337 		int result = VDO_ASSERT(!is_dirty(info), "cache must have no dirty pages");
1338 
1339 		if (result != VDO_SUCCESS)
1340 			return result;
1341 	}
1342 
1343 	/* Reset the page map by re-allocating it. */
1344 	vdo_int_map_free(vdo_forget(cache->page_map));
1345 	return vdo_int_map_create(cache->page_count, &cache->page_map);
1346 }
1347 
1348 /**
1349  * get_tree_page_by_index() - Get the tree page for a given height and page index.
1350  *
1351  * Return: The requested page.
1352  */
1353 static struct tree_page * __must_check get_tree_page_by_index(struct forest *forest,
1354 							      root_count_t root_index,
1355 							      height_t height,
1356 							      page_number_t page_index)
1357 {
1358 	page_number_t offset = 0;
1359 	size_t segment;
1360 
1361 	for (segment = 0; segment < forest->segments; segment++) {
1362 		page_number_t border = forest->boundaries[segment].levels[height - 1];
1363 
1364 		if (page_index < border) {
1365 			struct block_map_tree *tree = &forest->trees[root_index];
1366 
1367 			return &(tree->segments[segment].levels[height - 1][page_index - offset]);
1368 		}
1369 
1370 		offset = border;
1371 	}
1372 
1373 	return NULL;
1374 }
1375 
1376 /* Get the page referred to by the lock's tree slot at its current height. */
1377 static inline struct tree_page *get_tree_page(const struct block_map_zone *zone,
1378 					      const struct tree_lock *lock)
1379 {
1380 	return get_tree_page_by_index(zone->block_map->forest, lock->root_index,
1381 				      lock->height,
1382 				      lock->tree_slots[lock->height].page_index);
1383 }
1384 
1385 /** vdo_copy_valid_page() - Validate and copy a buffer to a page. */
1386 bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
1387 			 physical_block_number_t pbn,
1388 			 struct block_map_page *page)
1389 {
1390 	struct block_map_page *loaded = (struct block_map_page *) buffer;
1391 	enum block_map_page_validity validity =
1392 		vdo_validate_block_map_page(loaded, nonce, pbn);
1393 
1394 	if (validity == VDO_BLOCK_MAP_PAGE_VALID) {
1395 		memcpy(page, loaded, VDO_BLOCK_SIZE);
1396 		return true;
1397 	}
1398 
1399 	if (validity == VDO_BLOCK_MAP_PAGE_BAD) {
1400 		vdo_log_error_strerror(VDO_BAD_PAGE,
1401 				       "Expected page %llu but got page %llu instead",
1402 				       (unsigned long long) pbn,
1403 				       (unsigned long long) vdo_get_block_map_page_pbn(loaded));
1404 	}
1405 
1406 	return false;
1407 }
1408 
/**
 * in_cyclic_range() - Check whether the given value is between the lower and upper bounds, within
 *                     a cyclic range of values from 0 to (modulus - 1).
 * @lower: The lowest value to accept.
 * @value: The value to check.
 * @upper: The highest value to accept.
 * @modulus: The size of the cyclic space, no more than 2^15.
 *
 * The value and both bounds must be smaller than the modulus. Anything below @lower is treated as
 * having wrapped around, so it is shifted up by @modulus before the comparison.
 *
 * Return: true if the value is in range.
 */
static bool in_cyclic_range(u16 lower, u16 value, u16 upper, u16 modulus)
{
	u16 unwrapped_value = (value < lower) ? (value + modulus) : value;
	u16 unwrapped_upper = (upper < lower) ? (upper + modulus) : upper;

	return (unwrapped_value <= unwrapped_upper);
}
1429 
1430 /**
1431  * is_not_older() - Check whether a generation is strictly older than some other generation in the
1432  *                  context of a zone's current generation range.
1433  * @zone: The zone in which to do the comparison.
1434  * @a: The generation in question.
1435  * @b: The generation to compare to.
1436  *
1437  * Return: true if generation @a is not strictly older than generation @b in the context of @zone
1438  */
1439 static bool __must_check is_not_older(struct block_map_zone *zone, u8 a, u8 b)
1440 {
1441 	int result;
1442 
1443 	result = VDO_ASSERT((in_cyclic_range(zone->oldest_generation, a, zone->generation, 1 << 8) &&
1444 			     in_cyclic_range(zone->oldest_generation, b, zone->generation, 1 << 8)),
1445 			    "generation(s) %u, %u are out of range [%u, %u]",
1446 			    a, b, zone->oldest_generation, zone->generation);
1447 	if (result != VDO_SUCCESS) {
1448 		enter_zone_read_only_mode(zone, result);
1449 		return true;
1450 	}
1451 
1452 	return in_cyclic_range(b, a, zone->generation, 1 << 8);
1453 }
1454 
1455 static void release_generation(struct block_map_zone *zone, u8 generation)
1456 {
1457 	int result;
1458 
1459 	result = VDO_ASSERT((zone->dirty_page_counts[generation] > 0),
1460 			    "dirty page count underflow for generation %u", generation);
1461 	if (result != VDO_SUCCESS) {
1462 		enter_zone_read_only_mode(zone, result);
1463 		return;
1464 	}
1465 
1466 	zone->dirty_page_counts[generation]--;
1467 	while ((zone->dirty_page_counts[zone->oldest_generation] == 0) &&
1468 	       (zone->oldest_generation != zone->generation))
1469 		zone->oldest_generation++;
1470 }
1471 
1472 static void set_generation(struct block_map_zone *zone, struct tree_page *page,
1473 			   u8 new_generation)
1474 {
1475 	u32 new_count;
1476 	int result;
1477 	bool decrement_old = vdo_waiter_is_waiting(&page->waiter);
1478 	u8 old_generation = page->generation;
1479 
1480 	if (decrement_old && (old_generation == new_generation))
1481 		return;
1482 
1483 	page->generation = new_generation;
1484 	new_count = ++zone->dirty_page_counts[new_generation];
1485 	result = VDO_ASSERT((new_count != 0), "dirty page count overflow for generation %u",
1486 			    new_generation);
1487 	if (result != VDO_SUCCESS) {
1488 		enter_zone_read_only_mode(zone, result);
1489 		return;
1490 	}
1491 
1492 	if (decrement_old)
1493 		release_generation(zone, old_generation);
1494 }
1495 
1496 static void write_page(struct tree_page *tree_page, struct pooled_vio *vio);
1497 
1498 /* Implements waiter_callback_fn */
1499 static void write_page_callback(struct vdo_waiter *waiter, void *context)
1500 {
1501 	write_page(container_of(waiter, struct tree_page, waiter), context);
1502 }
1503 
/*
 * Request a vio from the zone's vio pool on behalf of a waiter. The waiter's callback is set to
 * write_page_callback() before the request is made.
 */
static void acquire_vio(struct vdo_waiter *waiter, struct block_map_zone *zone)
{
	waiter->callback = write_page_callback;
	acquire_vio_from_pool(zone->vio_pool, waiter);
}
1509 
1510 /* Return: true if all possible generations were not already active */
1511 static bool attempt_increment(struct block_map_zone *zone)
1512 {
1513 	u8 generation = zone->generation + 1;
1514 
1515 	if (zone->oldest_generation == generation)
1516 		return false;
1517 
1518 	zone->generation = generation;
1519 	return true;
1520 }
1521 
1522 /* Launches a flush if one is not already in progress. */
1523 static void enqueue_page(struct tree_page *page, struct block_map_zone *zone)
1524 {
1525 	if ((zone->flusher == NULL) && attempt_increment(zone)) {
1526 		zone->flusher = page;
1527 		acquire_vio(&page->waiter, zone);
1528 		return;
1529 	}
1530 
1531 	vdo_waitq_enqueue_waiter(&zone->flush_waiters, &page->waiter);
1532 }
1533 
1534 static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context)
1535 {
1536 	struct tree_page *page = container_of(waiter, struct tree_page, waiter);
1537 	struct write_if_not_dirtied_context *write_context = context;
1538 
1539 	if (page->generation == write_context->generation) {
1540 		acquire_vio(waiter, write_context->zone);
1541 		return;
1542 	}
1543 
1544 	enqueue_page(page, write_context->zone);
1545 }
1546 
/* Return a vio to the zone's vio pool, then check whether the zone has finished draining. */
static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio)
{
	return_vio_to_pool(zone->vio_pool, vio);
	check_for_drain_complete(zone);
}
1552 
/*
 * This callback is registered in write_initialized_page().
 *
 * Finishes the write of a tree page: releases the recovery journal reference held for the write
 * and the generation which was written, then decides what the vio should do next. The vio is
 * either reused immediately to write another page or returned to the pool.
 */
static void finish_page_write(struct vdo_completion *completion)
{
	bool dirty;
	struct vio *vio = as_vio(completion);
	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
	struct tree_page *page = completion->parent;
	struct block_map_zone *zone = pooled->context;

	vdo_release_recovery_journal_block_reference(zone->block_map->journal,
						     page->writing_recovery_lock,
						     VDO_ZONE_TYPE_LOGICAL,
						     zone->zone_number);

	/* The page is dirty again if its generation changed after the write was launched. */
	dirty = (page->writing_generation != page->generation);
	release_generation(zone, page->writing_generation);
	page->writing = false;

	if (zone->flusher == page) {
		struct write_if_not_dirtied_context context = {
			.zone = zone,
			.generation = page->writing_generation,
		};

		/* Pages still at the generation just written get a vio; others are re-enqueued. */
		vdo_waitq_notify_all_waiters(&zone->flush_waiters,
					     write_page_if_not_dirtied, &context);
		if (dirty && attempt_increment(zone)) {
			/* This page remains the flusher and is rewritten with the same vio. */
			write_page(page, pooled);
			return;
		}

		zone->flusher = NULL;
	}

	if (dirty) {
		enqueue_page(page, zone);
	} else if ((zone->flusher == NULL) && vdo_waitq_has_waiters(&zone->flush_waiters) &&
		   attempt_increment(zone)) {
		/* Promote the first flush waiter to flusher and write it with this vio. */
		zone->flusher = container_of(vdo_waitq_dequeue_waiter(&zone->flush_waiters),
					     struct tree_page, waiter);
		write_page(zone->flusher, pooled);
		return;
	}

	return_to_pool(zone, pooled);
}
1599 
1600 static void handle_write_error(struct vdo_completion *completion)
1601 {
1602 	int result = completion->result;
1603 	struct vio *vio = as_vio(completion);
1604 	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
1605 	struct block_map_zone *zone = pooled->context;
1606 
1607 	vio_record_metadata_io_error(vio);
1608 	enter_zone_read_only_mode(zone, result);
1609 	return_to_pool(zone, pooled);
1610 }
1611 
1612 static void write_page_endio(struct bio *bio);
1613 
1614 static void write_initialized_page(struct vdo_completion *completion)
1615 {
1616 	struct vio *vio = as_vio(completion);
1617 	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
1618 	struct block_map_zone *zone = pooled->context;
1619 	struct tree_page *tree_page = completion->parent;
1620 	struct block_map_page *page = (struct block_map_page *) vio->data;
1621 	blk_opf_t operation = REQ_OP_WRITE | REQ_PRIO;
1622 
1623 	/*
1624 	 * Now that we know the page has been written at least once, mark the copy we are writing
1625 	 * as initialized.
1626 	 */
1627 	page->header.initialized = true;
1628 
1629 	if (zone->flusher == tree_page)
1630 		operation |= REQ_PREFLUSH;
1631 
1632 	vdo_submit_metadata_vio(vio, vdo_get_block_map_page_pbn(page),
1633 				write_page_endio, handle_write_error,
1634 				operation);
1635 }
1636 
1637 static void write_page_endio(struct bio *bio)
1638 {
1639 	struct pooled_vio *vio = bio->bi_private;
1640 	struct block_map_zone *zone = vio->context;
1641 	struct block_map_page *page = (struct block_map_page *) vio->vio.data;
1642 
1643 	continue_vio_after_io(&vio->vio,
1644 			      (page->header.initialized ?
1645 			       finish_page_write : write_initialized_page),
1646 			      zone->thread_id);
1647 }
1648 
/*
 * Write a tree page using the supplied pooled vio. If the page is not the zone's flusher and its
 * generation is not older than the zone's current generation, the page is enqueued for another
 * flush and the vio is returned to the pool instead.
 */
static void write_page(struct tree_page *tree_page, struct pooled_vio *vio)
{
	struct vdo_completion *completion = &vio->vio.completion;
	struct block_map_zone *zone = vio->context;
	struct block_map_page *page = vdo_as_block_map_page(tree_page);

	if ((zone->flusher != tree_page) &&
	    is_not_older(zone, tree_page->generation, zone->generation)) {
		/*
		 * This page was re-dirtied after the last flush was issued, hence we need to do
		 * another flush.
		 */
		enqueue_page(tree_page, zone);
		return_to_pool(zone, vio);
		return;
	}

	completion->parent = tree_page;
	/* Snapshot the page into the vio; the write is done from this copy. */
	memcpy(vio->vio.data, tree_page->page_buffer, VDO_BLOCK_SIZE);
	completion->callback_thread_id = zone->thread_id;

	/* Record what is being written so that finish_page_write() can release it. */
	tree_page->writing = true;
	tree_page->writing_generation = tree_page->generation;
	tree_page->writing_recovery_lock = tree_page->recovery_lock;

	/* Clear this now so that we know this page is not on any dirty list. */
	tree_page->recovery_lock = 0;

	/*
	 * We've already copied the page into the vio which will write it, so if it was not yet
	 * initialized, the first write will indicate that (for torn write protection). It is now
	 * safe to mark it as initialized in memory since if the write fails, the in memory state
	 * will become irrelevant.
	 */
	if (page->header.initialized) {
		write_initialized_page(completion);
		return;
	}

	page->header.initialized = true;
	vdo_submit_metadata_vio(&vio->vio, vdo_get_block_map_page_pbn(page),
				write_page_endio, handle_write_error,
				REQ_OP_WRITE | REQ_PRIO);
}
1693 
1694 /* Release a lock on a page which was being loaded or allocated. */
1695 static void release_page_lock(struct data_vio *data_vio, char *what)
1696 {
1697 	struct block_map_zone *zone;
1698 	struct tree_lock *lock_holder;
1699 	struct tree_lock *lock = &data_vio->tree_lock;
1700 
1701 	VDO_ASSERT_LOG_ONLY(lock->locked,
1702 			    "release of unlocked block map page %s for key %llu in tree %u",
1703 			    what, (unsigned long long) lock->key, lock->root_index);
1704 
1705 	zone = data_vio->logical.zone->block_map_zone;
1706 	lock_holder = vdo_int_map_remove(zone->loading_pages, lock->key);
1707 	VDO_ASSERT_LOG_ONLY((lock_holder == lock),
1708 			    "block map page %s mismatch for key %llu in tree %u",
1709 			    what, (unsigned long long) lock->key, lock->root_index);
1710 	lock->locked = false;
1711 }
1712 
1713 static void finish_lookup(struct data_vio *data_vio, int result)
1714 {
1715 	data_vio->tree_lock.height = 0;
1716 
1717 	--data_vio->logical.zone->block_map_zone->active_lookups;
1718 
1719 	set_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot);
1720 	data_vio->vio.completion.error_handler = handle_data_vio_error;
1721 	continue_data_vio_with_error(data_vio, result);
1722 }
1723 
1724 static void abort_lookup_for_waiter(struct vdo_waiter *waiter, void *context)
1725 {
1726 	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
1727 	int result = *((int *) context);
1728 
1729 	if (!data_vio->write) {
1730 		if (result == VDO_NO_SPACE)
1731 			result = VDO_SUCCESS;
1732 	} else if (result != VDO_NO_SPACE) {
1733 		result = VDO_READ_ONLY;
1734 	}
1735 
1736 	finish_lookup(data_vio, result);
1737 }
1738 
1739 static void abort_lookup(struct data_vio *data_vio, int result, char *what)
1740 {
1741 	if (result != VDO_NO_SPACE)
1742 		enter_zone_read_only_mode(data_vio->logical.zone->block_map_zone, result);
1743 
1744 	if (data_vio->tree_lock.locked) {
1745 		release_page_lock(data_vio, what);
1746 		vdo_waitq_notify_all_waiters(&data_vio->tree_lock.waiters,
1747 					     abort_lookup_for_waiter,
1748 					     &result);
1749 	}
1750 
1751 	finish_lookup(data_vio, result);
1752 }
1753 
/* Abort a lookup which failed while loading a page; identifies the operation as "load". */
static void abort_load(struct data_vio *data_vio, int result)
{
	abort_lookup(data_vio, result, "load");
}
1758 
1759 static bool __must_check is_invalid_tree_entry(const struct vdo *vdo,
1760 					       const struct data_location *mapping,
1761 					       height_t height)
1762 {
1763 	if (!vdo_is_valid_location(mapping) ||
1764 	    vdo_is_state_compressed(mapping->state) ||
1765 	    (vdo_is_mapped_location(mapping) && (mapping->pbn == VDO_ZERO_BLOCK)))
1766 		return true;
1767 
1768 	/* Roots aren't physical data blocks, so we can't check their PBNs. */
1769 	if (height == VDO_BLOCK_MAP_TREE_HEIGHT)
1770 		return false;
1771 
1772 	return !vdo_is_physical_data_block(vdo->depot, mapping->pbn);
1773 }
1774 
1775 static void load_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio);
1776 static void allocate_block_map_page(struct block_map_zone *zone,
1777 				    struct data_vio *data_vio);
1778 
1779 static void continue_with_loaded_page(struct data_vio *data_vio,
1780 				      struct block_map_page *page)
1781 {
1782 	struct tree_lock *lock = &data_vio->tree_lock;
1783 	struct block_map_tree_slot slot = lock->tree_slots[lock->height];
1784 	struct data_location mapping =
1785 		vdo_unpack_block_map_entry(&page->entries[slot.block_map_slot.slot]);
1786 
1787 	if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, lock->height)) {
1788 		vdo_log_error_strerror(VDO_BAD_MAPPING,
1789 				       "Invalid block map tree PBN: %llu with state %u for page index %u at height %u",
1790 				       (unsigned long long) mapping.pbn, mapping.state,
1791 				       lock->tree_slots[lock->height - 1].page_index,
1792 				       lock->height - 1);
1793 		abort_load(data_vio, VDO_BAD_MAPPING);
1794 		return;
1795 	}
1796 
1797 	if (!vdo_is_mapped_location(&mapping)) {
1798 		/* The page we need is unallocated */
1799 		allocate_block_map_page(data_vio->logical.zone->block_map_zone,
1800 					data_vio);
1801 		return;
1802 	}
1803 
1804 	lock->tree_slots[lock->height - 1].block_map_slot.pbn = mapping.pbn;
1805 	if (lock->height == 1) {
1806 		finish_lookup(data_vio, VDO_SUCCESS);
1807 		return;
1808 	}
1809 
1810 	/* We know what page we need to load next */
1811 	load_block_map_page(data_vio->logical.zone->block_map_zone, data_vio);
1812 }
1813 
1814 static void continue_load_for_waiter(struct vdo_waiter *waiter, void *context)
1815 {
1816 	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
1817 
1818 	data_vio->tree_lock.height--;
1819 	continue_with_loaded_page(data_vio, context);
1820 }
1821 
/*
 * The callback for a completed read of a block map tree page. The data read is copied into the
 * in-memory tree page if it is valid; otherwise the tree page is formatted afresh. The vio is
 * returned to its pool, the page lock is released, and the lookup continues for this data_vio
 * and for every data_vio which was waiting on the load.
 */
static void finish_block_map_page_load(struct vdo_completion *completion)
{
	physical_block_number_t pbn;
	struct tree_page *tree_page;
	struct block_map_page *page;
	nonce_t nonce;
	struct vio *vio = as_vio(completion);
	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
	struct data_vio *data_vio = completion->parent;
	struct block_map_zone *zone = pooled->context;
	struct tree_lock *tree_lock = &data_vio->tree_lock;

	/* Move down to the level just loaded; the lookups below depend on the new height. */
	tree_lock->height--;
	pbn = tree_lock->tree_slots[tree_lock->height].block_map_slot.pbn;
	tree_page = get_tree_page(zone, tree_lock);
	page = (struct block_map_page *) tree_page->page_buffer;
	nonce = zone->block_map->nonce;

	/* If what was read is not a valid page for this pbn, start from a newly formatted page. */
	if (!vdo_copy_valid_page(vio->data, nonce, pbn, page))
		vdo_format_block_map_page(page, nonce, pbn, false);
	return_vio_to_pool(zone->vio_pool, pooled);

	/* Release our claim to the load and wake any waiters */
	release_page_lock(data_vio, "load");
	vdo_waitq_notify_all_waiters(&tree_lock->waiters, continue_load_for_waiter, page);
	continue_with_loaded_page(data_vio, page);
}
1849 
1850 static void handle_io_error(struct vdo_completion *completion)
1851 {
1852 	int result = completion->result;
1853 	struct vio *vio = as_vio(completion);
1854 	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
1855 	struct data_vio *data_vio = completion->parent;
1856 	struct block_map_zone *zone = pooled->context;
1857 
1858 	vio_record_metadata_io_error(vio);
1859 	return_vio_to_pool(zone->vio_pool, pooled);
1860 	abort_load(data_vio, result);
1861 }
1862 
1863 static void load_page_endio(struct bio *bio)
1864 {
1865 	struct vio *vio = bio->bi_private;
1866 	struct data_vio *data_vio = vio->completion.parent;
1867 
1868 	continue_vio_after_io(vio, finish_block_map_page_load,
1869 			      data_vio->logical.zone->thread_id);
1870 }
1871 
1872 static void load_page(struct vdo_waiter *waiter, void *context)
1873 {
1874 	struct pooled_vio *pooled = context;
1875 	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
1876 	struct tree_lock *lock = &data_vio->tree_lock;
1877 	physical_block_number_t pbn = lock->tree_slots[lock->height - 1].block_map_slot.pbn;
1878 
1879 	pooled->vio.completion.parent = data_vio;
1880 	vdo_submit_metadata_vio(&pooled->vio, pbn, load_page_endio,
1881 				handle_io_error, REQ_OP_READ | REQ_PRIO);
1882 }
1883 
1884 /*
1885  * If the page is already locked, queue up to wait for the lock to be released. If the lock is
1886  * acquired, @data_vio->tree_lock.locked will be true.
1887  */
1888 static int attempt_page_lock(struct block_map_zone *zone, struct data_vio *data_vio)
1889 {
1890 	int result;
1891 	struct tree_lock *lock_holder;
1892 	struct tree_lock *lock = &data_vio->tree_lock;
1893 	height_t height = lock->height;
1894 	struct block_map_tree_slot tree_slot = lock->tree_slots[height];
1895 	union page_key key;
1896 
1897 	key.descriptor = (struct page_descriptor) {
1898 		.root_index = lock->root_index,
1899 		.height = height,
1900 		.page_index = tree_slot.page_index,
1901 		.slot = tree_slot.block_map_slot.slot,
1902 	};
1903 	lock->key = key.key;
1904 
1905 	result = vdo_int_map_put(zone->loading_pages, lock->key,
1906 				 lock, false, (void **) &lock_holder);
1907 	if (result != VDO_SUCCESS)
1908 		return result;
1909 
1910 	if (lock_holder == NULL) {
1911 		/* We got the lock */
1912 		data_vio->tree_lock.locked = true;
1913 		return VDO_SUCCESS;
1914 	}
1915 
1916 	/* Someone else is loading or allocating the page we need */
1917 	vdo_waitq_enqueue_waiter(&lock_holder->waiters, &data_vio->waiter);
1918 	return VDO_SUCCESS;
1919 }
1920 
1921 /* Load a block map tree page from disk, for the next level in the data vio tree lock. */
1922 static void load_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio)
1923 {
1924 	int result;
1925 
1926 	result = attempt_page_lock(zone, data_vio);
1927 	if (result != VDO_SUCCESS) {
1928 		abort_load(data_vio, result);
1929 		return;
1930 	}
1931 
1932 	if (data_vio->tree_lock.locked) {
1933 		data_vio->waiter.callback = load_page;
1934 		acquire_vio_from_pool(zone->vio_pool, &data_vio->waiter);
1935 	}
1936 }
1937 
1938 static void allocation_failure(struct vdo_completion *completion)
1939 {
1940 	struct data_vio *data_vio = as_data_vio(completion);
1941 
1942 	if (vdo_requeue_completion_if_needed(completion,
1943 					     data_vio->logical.zone->thread_id))
1944 		return;
1945 
1946 	abort_lookup(data_vio, completion->result, "allocation");
1947 }
1948 
1949 static void continue_allocation_for_waiter(struct vdo_waiter *waiter, void *context)
1950 {
1951 	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
1952 	struct tree_lock *tree_lock = &data_vio->tree_lock;
1953 	physical_block_number_t pbn = *((physical_block_number_t *) context);
1954 
1955 	tree_lock->height--;
1956 	data_vio->tree_lock.tree_slots[tree_lock->height].block_map_slot.pbn = pbn;
1957 
1958 	if (tree_lock->height == 0) {
1959 		finish_lookup(data_vio, VDO_SUCCESS);
1960 		return;
1961 	}
1962 
1963 	allocate_block_map_page(data_vio->logical.zone->block_map_zone, data_vio);
1964 }
1965 
1966 /** expire_oldest_list() - Expire the oldest list. */
1967 static void expire_oldest_list(struct dirty_lists *dirty_lists)
1968 {
1969 	block_count_t i = dirty_lists->offset++;
1970 
1971 	dirty_lists->oldest_period++;
1972 	if (!list_empty(&dirty_lists->eras[i][VDO_TREE_PAGE])) {
1973 		list_splice_tail_init(&dirty_lists->eras[i][VDO_TREE_PAGE],
1974 				      &dirty_lists->expired[VDO_TREE_PAGE]);
1975 	}
1976 
1977 	if (!list_empty(&dirty_lists->eras[i][VDO_CACHE_PAGE])) {
1978 		list_splice_tail_init(&dirty_lists->eras[i][VDO_CACHE_PAGE],
1979 				      &dirty_lists->expired[VDO_CACHE_PAGE]);
1980 	}
1981 
1982 	if (dirty_lists->offset == dirty_lists->maximum_age)
1983 		dirty_lists->offset = 0;
1984 }
1985 
1986 
1987 /** update_period() - Update the dirty_lists period if necessary. */
1988 static void update_period(struct dirty_lists *dirty, sequence_number_t period)
1989 {
1990 	while (dirty->next_period <= period) {
1991 		if ((dirty->next_period - dirty->oldest_period) == dirty->maximum_age)
1992 			expire_oldest_list(dirty);
1993 		dirty->next_period++;
1994 	}
1995 }
1996 
1997 /** write_expired_elements() - Write out the expired list. */
1998 static void write_expired_elements(struct block_map_zone *zone)
1999 {
2000 	struct tree_page *page, *ttmp;
2001 	struct page_info *info, *ptmp;
2002 	struct list_head *expired;
2003 	u8 generation = zone->generation;
2004 
2005 	expired = &zone->dirty_lists->expired[VDO_TREE_PAGE];
2006 	list_for_each_entry_safe(page, ttmp, expired, entry) {
2007 		int result;
2008 
2009 		list_del_init(&page->entry);
2010 
2011 		result = VDO_ASSERT(!vdo_waiter_is_waiting(&page->waiter),
2012 				    "Newly expired page not already waiting to write");
2013 		if (result != VDO_SUCCESS) {
2014 			enter_zone_read_only_mode(zone, result);
2015 			continue;
2016 		}
2017 
2018 		set_generation(zone, page, generation);
2019 		if (!page->writing)
2020 			enqueue_page(page, zone);
2021 	}
2022 
2023 	expired = &zone->dirty_lists->expired[VDO_CACHE_PAGE];
2024 	list_for_each_entry_safe(info, ptmp, expired, state_entry) {
2025 		list_del_init(&info->state_entry);
2026 		schedule_page_save(info);
2027 	}
2028 
2029 	save_pages(&zone->page_cache);
2030 }
2031 
2032 /**
2033  * add_to_dirty_lists() - Add an element to the dirty lists.
2034  * @zone: The zone in which we are operating.
2035  * @entry: The list entry of the element to add.
2036  * @type: The type of page.
2037  * @old_period: The period in which the element was previously dirtied, or 0 if it was not dirty.
2038  * @new_period: The period in which the element has now been dirtied, or 0 if it does not hold a
2039  *              lock.
2040  */
2041 static void add_to_dirty_lists(struct block_map_zone *zone,
2042 			       struct list_head *entry,
2043 			       enum block_map_page_type type,
2044 			       sequence_number_t old_period,
2045 			       sequence_number_t new_period)
2046 {
2047 	struct dirty_lists *dirty_lists = zone->dirty_lists;
2048 
2049 	if ((old_period == new_period) || ((old_period != 0) && (old_period < new_period)))
2050 		return;
2051 
2052 	if (new_period < dirty_lists->oldest_period) {
2053 		list_move_tail(entry, &dirty_lists->expired[type]);
2054 	} else {
2055 		update_period(dirty_lists, new_period);
2056 		list_move_tail(entry,
2057 			       &dirty_lists->eras[new_period % dirty_lists->maximum_age][type]);
2058 	}
2059 
2060 	write_expired_elements(zone);
2061 }
2062 
/*
 * Record the allocation in the tree and wake any waiters now that the write lock has been
 * released.
 *
 * Runs as the logical zone callback launched by release_block_map_write_lock(). On entry,
 * tree_lock->height is the height of the parent page whose entry is being filled in; the newly
 * allocated PBN was stored one level below that by allocate_block(). On exit, either the lookup
 * has finished (height reached 0) or the allocation of the next level down has been started.
 */
static void finish_block_map_allocation(struct vdo_completion *completion)
{
	physical_block_number_t pbn;
	struct tree_page *tree_page;
	struct block_map_page *page;
	sequence_number_t old_lock;
	struct data_vio *data_vio = as_data_vio(completion);
	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
	struct tree_lock *tree_lock = &data_vio->tree_lock;
	/* Remember the parent's height; tree_lock->height is decremented below. */
	height_t height = tree_lock->height;

	assert_data_vio_in_logical_zone(data_vio);

	/* The parent page, and the PBN just allocated for its child. */
	tree_page = get_tree_page(zone, tree_lock);
	pbn = tree_lock->tree_slots[height - 1].block_map_slot.pbn;

	/* Record the allocation. */
	page = (struct block_map_page *) tree_page->page_buffer;
	/* Keep the previous recovery lock so the dirty lists can tell whether it changed. */
	old_lock = tree_page->recovery_lock;
	vdo_update_block_map_page(page, data_vio, pbn,
				  VDO_MAPPING_STATE_UNCOMPRESSED,
				  &tree_page->recovery_lock);

	if (vdo_waiter_is_waiting(&tree_page->waiter)) {
		/* This page is waiting to be written out. */
		if (zone->flusher != tree_page) {
			/*
			 * The outstanding flush won't cover the update we just made,
			 * so mark the page as needing another flush.
			 */
			set_generation(zone, tree_page, zone->generation);
		}
	} else {
		/* Put the page on a dirty list */
		/*
		 * An old lock of 0 means the page was not previously dirty, so its list entry
		 * is initialized before add_to_dirty_lists() moves it.
		 */
		if (old_lock == 0)
			INIT_LIST_HEAD(&tree_page->entry);
		add_to_dirty_lists(zone, &tree_page->entry, VDO_TREE_PAGE,
				   old_lock, tree_page->recovery_lock);
	}

	/* Step down to the newly allocated page's level. */
	tree_lock->height--;
	if (height > 1) {
		/* Format the interior node we just allocated (in memory). */
		tree_page = get_tree_page(zone, tree_lock);
		vdo_format_block_map_page(tree_page->page_buffer,
					  zone->block_map->nonce,
					  pbn, false);
	}

	/* Release our claim to the allocation and wake any waiters */
	release_page_lock(data_vio, "allocation");
	/* Each waiter receives a pointer to the PBN; see continue_allocation_for_waiter(). */
	vdo_waitq_notify_all_waiters(&tree_lock->waiters,
				     continue_allocation_for_waiter, &pbn);
	if (tree_lock->height == 0) {
		finish_lookup(data_vio, VDO_SUCCESS);
		return;
	}

	/* There are still lower levels to allocate. */
	allocate_block_map_page(zone, data_vio);
}
2127 
2128 static void release_block_map_write_lock(struct vdo_completion *completion)
2129 {
2130 	struct data_vio *data_vio = as_data_vio(completion);
2131 
2132 	assert_data_vio_in_allocated_zone(data_vio);
2133 
2134 	release_data_vio_allocation_lock(data_vio, true);
2135 	launch_data_vio_logical_callback(data_vio, finish_block_map_allocation);
2136 }
2137 
2138 /*
2139  * Newly allocated block map pages are set to have to MAXIMUM_REFERENCES after they are journaled,
2140  * to prevent deduplication against the block after we release the write lock on it, but before we
2141  * write out the page.
2142  */
2143 static void set_block_map_page_reference_count(struct vdo_completion *completion)
2144 {
2145 	struct data_vio *data_vio = as_data_vio(completion);
2146 
2147 	assert_data_vio_in_allocated_zone(data_vio);
2148 
2149 	completion->callback = release_block_map_write_lock;
2150 	vdo_modify_reference_count(completion, &data_vio->increment_updater);
2151 }
2152 
2153 static void journal_block_map_allocation(struct vdo_completion *completion)
2154 {
2155 	struct data_vio *data_vio = as_data_vio(completion);
2156 
2157 	assert_data_vio_in_journal_zone(data_vio);
2158 
2159 	set_data_vio_allocated_zone_callback(data_vio,
2160 					     set_block_map_page_reference_count);
2161 	vdo_add_recovery_journal_entry(completion->vdo->recovery_journal, data_vio);
2162 }
2163 
2164 static void allocate_block(struct vdo_completion *completion)
2165 {
2166 	struct data_vio *data_vio = as_data_vio(completion);
2167 	struct tree_lock *lock = &data_vio->tree_lock;
2168 	physical_block_number_t pbn;
2169 
2170 	assert_data_vio_in_allocated_zone(data_vio);
2171 
2172 	if (!vdo_allocate_block_in_zone(data_vio))
2173 		return;
2174 
2175 	pbn = data_vio->allocation.pbn;
2176 	lock->tree_slots[lock->height - 1].block_map_slot.pbn = pbn;
2177 	data_vio->increment_updater = (struct reference_updater) {
2178 		.operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING,
2179 		.increment = true,
2180 		.zpbn = {
2181 			.pbn = pbn,
2182 			.state = VDO_MAPPING_STATE_UNCOMPRESSED,
2183 		},
2184 		.lock = data_vio->allocation.lock,
2185 	};
2186 
2187 	launch_data_vio_journal_callback(data_vio, journal_block_map_allocation);
2188 }
2189 
2190 static void allocate_block_map_page(struct block_map_zone *zone,
2191 				    struct data_vio *data_vio)
2192 {
2193 	int result;
2194 
2195 	if (!data_vio->write || data_vio->is_discard) {
2196 		/* This is a pure read or a discard, so there's nothing left to do here. */
2197 		finish_lookup(data_vio, VDO_SUCCESS);
2198 		return;
2199 	}
2200 
2201 	result = attempt_page_lock(zone, data_vio);
2202 	if (result != VDO_SUCCESS) {
2203 		abort_lookup(data_vio, result, "allocation");
2204 		return;
2205 	}
2206 
2207 	if (!data_vio->tree_lock.locked)
2208 		return;
2209 
2210 	data_vio_allocate_data_block(data_vio, VIO_BLOCK_MAP_WRITE_LOCK,
2211 				     allocate_block, allocation_failure);
2212 }
2213 
/**
 * vdo_find_block_map_slot() - Find the block map slot in which the block map entry for a data_vio
 *                             resides and cache that result in the data_vio.
 * @data_vio: The data_vio whose block map slot is to be found; its tree_lock records the
 *            progress and result of the search.
 *
 * All ancestors in the tree will be allocated or loaded, as needed.
 *
 * The search walks up from height 1 until it reaches a tree page which is already in memory
 * (its recorded PBN is not VDO_ZERO_BLOCK), then works back down from there by loading or
 * allocating the missing levels.
 */
void vdo_find_block_map_slot(struct data_vio *data_vio)
{
	page_number_t page_index;
	struct block_map_tree_slot tree_slot;
	struct data_location mapping;
	struct block_map_page *page = NULL;
	struct tree_lock *lock = &data_vio->tree_lock;
	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;

	/* Count the lookup before the draining check; finish_lookup() is used on every exit. */
	zone->active_lookups++;
	if (vdo_is_state_draining(&zone->state)) {
		finish_lookup(data_vio, VDO_SHUTTING_DOWN);
		return;
	}

	/* The slot within the leaf page comes directly from the logical block number. */
	lock->tree_slots[0].block_map_slot.slot =
		data_vio->logical.lbn % VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
	/* Convert the leaf page's index into an index within a single root's tree. */
	page_index = (lock->tree_slots[0].page_index / zone->block_map->root_count);
	tree_slot = (struct block_map_tree_slot) {
		.page_index = page_index / VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
		.block_map_slot = {
			.pbn = 0,
			.slot = page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
		},
	};

	/* Climb until we find a page which has a PBN recorded for it. */
	for (lock->height = 1; lock->height <= VDO_BLOCK_MAP_TREE_HEIGHT; lock->height++) {
		physical_block_number_t pbn;

		lock->tree_slots[lock->height] = tree_slot;
		page = (struct block_map_page *) (get_tree_page(zone, lock)->page_buffer);
		pbn = vdo_get_block_map_page_pbn(page);
		if (pbn != VDO_ZERO_BLOCK) {
			lock->tree_slots[lock->height].block_map_slot.pbn = pbn;
			break;
		}

		/* Calculate the index and slot for the next level. */
		tree_slot.block_map_slot.slot =
			tree_slot.page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
		tree_slot.page_index = tree_slot.page_index / VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
	}

	/* The page at this height has been allocated and loaded. */
	mapping = vdo_unpack_block_map_entry(&page->entries[tree_slot.block_map_slot.slot]);
	if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, lock->height)) {
		vdo_log_error_strerror(VDO_BAD_MAPPING,
				       "Invalid block map tree PBN: %llu with state %u for page index %u at height %u",
				       (unsigned long long) mapping.pbn, mapping.state,
				       lock->tree_slots[lock->height - 1].page_index,
				       lock->height - 1);
		abort_load(data_vio, VDO_BAD_MAPPING);
		return;
	}

	if (!vdo_is_mapped_location(&mapping)) {
		/* The page we want one level down has not been allocated, so allocate it. */
		allocate_block_map_page(zone, data_vio);
		return;
	}

	/* The child exists on disk; remember where it is. */
	lock->tree_slots[lock->height - 1].block_map_slot.pbn = mapping.pbn;
	if (lock->height == 1) {
		/* This is the ultimate block map page, so we're done */
		finish_lookup(data_vio, VDO_SUCCESS);
		return;
	}

	/* We know what page we need to load. */
	load_block_map_page(zone, data_vio);
}
2291 
2292 /*
2293  * Find the PBN of a leaf block map page. This method may only be used after all allocated tree
2294  * pages have been loaded, otherwise, it may give the wrong answer (0).
2295  */
2296 physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
2297 						    page_number_t page_number)
2298 {
2299 	struct data_location mapping;
2300 	struct tree_page *tree_page;
2301 	struct block_map_page *page;
2302 	root_count_t root_index = page_number % map->root_count;
2303 	page_number_t page_index = page_number / map->root_count;
2304 	slot_number_t slot = page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
2305 
2306 	page_index /= VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
2307 
2308 	tree_page = get_tree_page_by_index(map->forest, root_index, 1, page_index);
2309 	page = (struct block_map_page *) tree_page->page_buffer;
2310 	if (!page->header.initialized)
2311 		return VDO_ZERO_BLOCK;
2312 
2313 	mapping = vdo_unpack_block_map_entry(&page->entries[slot]);
2314 	if (!vdo_is_valid_location(&mapping) || vdo_is_state_compressed(mapping.state))
2315 		return VDO_ZERO_BLOCK;
2316 	return mapping.pbn;
2317 }
2318 
2319 /*
2320  * Write a tree page or indicate that it has been re-dirtied if it is already being written. This
2321  * method is used when correcting errors in the tree during read-only rebuild.
2322  */
2323 void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone)
2324 {
2325 	bool waiting = vdo_waiter_is_waiting(&page->waiter);
2326 
2327 	if (waiting && (zone->flusher == page))
2328 		return;
2329 
2330 	set_generation(zone, page, zone->generation);
2331 	if (waiting || page->writing)
2332 		return;
2333 
2334 	enqueue_page(page, zone);
2335 }
2336 
2337 static int make_segment(struct forest *old_forest, block_count_t new_pages,
2338 			struct boundary *new_boundary, struct forest *forest)
2339 {
2340 	size_t index = (old_forest == NULL) ? 0 : old_forest->segments;
2341 	struct tree_page *page_ptr;
2342 	page_count_t segment_sizes[VDO_BLOCK_MAP_TREE_HEIGHT];
2343 	height_t height;
2344 	root_count_t root;
2345 	int result;
2346 
2347 	forest->segments = index + 1;
2348 
2349 	result = vdo_allocate(forest->segments, struct boundary,
2350 			      "forest boundary array", &forest->boundaries);
2351 	if (result != VDO_SUCCESS)
2352 		return result;
2353 
2354 	result = vdo_allocate(forest->segments, struct tree_page *,
2355 			      "forest page pointers", &forest->pages);
2356 	if (result != VDO_SUCCESS)
2357 		return result;
2358 
2359 	result = vdo_allocate(new_pages, struct tree_page,
2360 			      "new forest pages", &forest->pages[index]);
2361 	if (result != VDO_SUCCESS)
2362 		return result;
2363 
2364 	if (index > 0) {
2365 		memcpy(forest->boundaries, old_forest->boundaries,
2366 		       index * sizeof(struct boundary));
2367 		memcpy(forest->pages, old_forest->pages,
2368 		       index * sizeof(struct tree_page *));
2369 	}
2370 
2371 	memcpy(&(forest->boundaries[index]), new_boundary, sizeof(struct boundary));
2372 
2373 	for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) {
2374 		segment_sizes[height] = new_boundary->levels[height];
2375 		if (index > 0)
2376 			segment_sizes[height] -= old_forest->boundaries[index - 1].levels[height];
2377 	}
2378 
2379 	page_ptr = forest->pages[index];
2380 	for (root = 0; root < forest->map->root_count; root++) {
2381 		struct block_map_tree_segment *segment;
2382 		struct block_map_tree *tree = &(forest->trees[root]);
2383 		height_t height;
2384 
2385 		int result = vdo_allocate(forest->segments,
2386 					  struct block_map_tree_segment,
2387 					  "tree root segments", &tree->segments);
2388 		if (result != VDO_SUCCESS)
2389 			return result;
2390 
2391 		if (index > 0) {
2392 			memcpy(tree->segments, old_forest->trees[root].segments,
2393 			       index * sizeof(struct block_map_tree_segment));
2394 		}
2395 
2396 		segment = &(tree->segments[index]);
2397 		for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) {
2398 			if (segment_sizes[height] == 0)
2399 				continue;
2400 
2401 			segment->levels[height] = page_ptr;
2402 			if (height == (VDO_BLOCK_MAP_TREE_HEIGHT - 1)) {
2403 				/* Record the root. */
2404 				struct block_map_page *page =
2405 					vdo_format_block_map_page(page_ptr->page_buffer,
2406 								  forest->map->nonce,
2407 								  VDO_INVALID_PBN, true);
2408 				page->entries[0] =
2409 					vdo_pack_block_map_entry(forest->map->root_origin + root,
2410 								 VDO_MAPPING_STATE_UNCOMPRESSED);
2411 			}
2412 			page_ptr += segment_sizes[height];
2413 		}
2414 	}
2415 
2416 	return VDO_SUCCESS;
2417 }
2418 
2419 static void deforest(struct forest *forest, size_t first_page_segment)
2420 {
2421 	root_count_t root;
2422 
2423 	if (forest->pages != NULL) {
2424 		size_t segment;
2425 
2426 		for (segment = first_page_segment; segment < forest->segments; segment++)
2427 			vdo_free(forest->pages[segment]);
2428 		vdo_free(forest->pages);
2429 	}
2430 
2431 	for (root = 0; root < forest->map->root_count; root++)
2432 		vdo_free(forest->trees[root].segments);
2433 
2434 	vdo_free(forest->boundaries);
2435 	vdo_free(forest);
2436 }
2437 
2438 /**
2439  * make_forest() - Make a collection of trees for a block_map, expanding the existing forest if
2440  *                 there is one.
2441  * @entries: The number of entries the block map will hold.
2442  *
2443  * Return: VDO_SUCCESS or an error.
2444  */
2445 static int make_forest(struct block_map *map, block_count_t entries)
2446 {
2447 	struct forest *forest, *old_forest = map->forest;
2448 	struct boundary new_boundary, *old_boundary = NULL;
2449 	block_count_t new_pages;
2450 	int result;
2451 
2452 	if (old_forest != NULL)
2453 		old_boundary = &(old_forest->boundaries[old_forest->segments - 1]);
2454 
2455 	new_pages = vdo_compute_new_forest_pages(map->root_count, old_boundary,
2456 						 entries, &new_boundary);
2457 	if (new_pages == 0) {
2458 		map->next_entry_count = entries;
2459 		return VDO_SUCCESS;
2460 	}
2461 
2462 	result = vdo_allocate_extended(struct forest, map->root_count,
2463 				       struct block_map_tree, __func__,
2464 				       &forest);
2465 	if (result != VDO_SUCCESS)
2466 		return result;
2467 
2468 	forest->map = map;
2469 	result = make_segment(old_forest, new_pages, &new_boundary, forest);
2470 	if (result != VDO_SUCCESS) {
2471 		deforest(forest, forest->segments - 1);
2472 		return result;
2473 	}
2474 
2475 	map->next_forest = forest;
2476 	map->next_entry_count = entries;
2477 	return VDO_SUCCESS;
2478 }
2479 
2480 /**
2481  * replace_forest() - Replace a block_map's forest with the already-prepared larger forest.
2482  */
2483 static void replace_forest(struct block_map *map)
2484 {
2485 	if (map->next_forest != NULL) {
2486 		if (map->forest != NULL)
2487 			deforest(map->forest, map->forest->segments);
2488 		map->forest = vdo_forget(map->next_forest);
2489 	}
2490 
2491 	map->entry_count = map->next_entry_count;
2492 	map->next_entry_count = 0;
2493 }
2494 
2495 /**
2496  * finish_cursor() - Finish the traversal of a single tree. If it was the last cursor, finish the
2497  *                   traversal.
2498  */
2499 static void finish_cursor(struct cursor *cursor)
2500 {
2501 	struct cursors *cursors = cursor->parent;
2502 	struct vdo_completion *completion = cursors->completion;
2503 
2504 	return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio));
2505 	if (--cursors->active_roots > 0)
2506 		return;
2507 
2508 	vdo_free(cursors);
2509 
2510 	vdo_finish_completion(completion);
2511 }
2512 
2513 static void traverse(struct cursor *cursor);
2514 
2515 /**
2516  * continue_traversal() - Continue traversing a block map tree.
2517  * @completion: The VIO doing a read or write.
2518  */
2519 static void continue_traversal(struct vdo_completion *completion)
2520 {
2521 	vio_record_metadata_io_error(as_vio(completion));
2522 	traverse(completion->parent);
2523 }
2524 
2525 /**
2526  * finish_traversal_load() - Continue traversing a block map tree now that a page has been loaded.
2527  * @completion: The VIO doing the read.
2528  */
2529 static void finish_traversal_load(struct vdo_completion *completion)
2530 {
2531 	struct cursor *cursor = completion->parent;
2532 	height_t height = cursor->height;
2533 	struct cursor_level *level = &cursor->levels[height];
2534 	struct tree_page *tree_page =
2535 		&(cursor->tree->segments[0].levels[height][level->page_index]);
2536 	struct block_map_page *page = (struct block_map_page *) tree_page->page_buffer;
2537 
2538 	vdo_copy_valid_page(cursor->vio->vio.data,
2539 			    cursor->parent->zone->block_map->nonce,
2540 			    pbn_from_vio_bio(cursor->vio->vio.bio), page);
2541 	traverse(cursor);
2542 }
2543 
2544 static void traversal_endio(struct bio *bio)
2545 {
2546 	struct vio *vio = bio->bi_private;
2547 	struct cursor *cursor = vio->completion.parent;
2548 
2549 	continue_vio_after_io(vio, finish_traversal_load,
2550 			      cursor->parent->zone->thread_id);
2551 }
2552 
/**
 * traverse() - Traverse a single block map tree.
 * @cursor: The cursor tracking the position (height, and page index and slot at each height)
 *          of the traversal of one tree.
 *
 * This is the recursive heart of the traversal process. The recursion is not by direct call:
 * to descend into a child page, this function saves its position in the cursor, submits a read
 * of the child, and returns. It is then called again from finish_traversal_load() or
 * continue_traversal() and picks up from the saved position.
 */
static void traverse(struct cursor *cursor)
{
	/* Each pass of the outer loop handles one page; incrementing height returns to its parent. */
	for (; cursor->height < VDO_BLOCK_MAP_TREE_HEIGHT; cursor->height++) {
		height_t height = cursor->height;
		struct cursor_level *level = &cursor->levels[height];
		struct tree_page *tree_page =
			&(cursor->tree->segments[0].levels[height][level->page_index]);
		struct block_map_page *page = (struct block_map_page *) tree_page->page_buffer;

		/* There is nothing to examine in a page which is not initialized. */
		if (!page->header.initialized)
			continue;

		for (; level->slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; level->slot++) {
			struct cursor_level *next_level;
			/* The index, among all pages one level down, of the page this entry names. */
			page_number_t entry_index =
				(VDO_BLOCK_MAP_ENTRIES_PER_PAGE * level->page_index) + level->slot;
			struct data_location location =
				vdo_unpack_block_map_entry(&page->entries[level->slot]);

			if (!vdo_is_valid_location(&location)) {
				/* This entry is invalid, so remove it from the page. */
				page->entries[level->slot] = UNMAPPED_BLOCK_MAP_ENTRY;
				vdo_write_tree_page(tree_page, cursor->parent->zone);
				continue;
			}

			if (!vdo_is_mapped_location(&location))
				continue;

			/* Erase mapped entries past the end of the logical space. */
			if (entry_index >= cursor->boundary.levels[height]) {
				page->entries[level->slot] = UNMAPPED_BLOCK_MAP_ENTRY;
				vdo_write_tree_page(tree_page, cursor->parent->zone);
				continue;
			}

			/* Entries in the top-level page are not reported to the entry callback. */
			if (cursor->height < VDO_BLOCK_MAP_TREE_HEIGHT - 1) {
				int result = cursor->parent->entry_callback(location.pbn,
									    cursor->parent->completion);
				if (result != VDO_SUCCESS) {
					/* The callback rejected the entry, so unmap it. */
					page->entries[level->slot] = UNMAPPED_BLOCK_MAP_ENTRY;
					vdo_write_tree_page(tree_page, cursor->parent->zone);
					continue;
				}
			}

			/* At height 0 there is no lower level to descend into. */
			if (cursor->height == 0)
				continue;

			/*
			 * Descend: set up the child's level, advance this level's slot so that the
			 * scan resumes after this entry, and read the child page. The traversal
			 * continues when that read completes.
			 */
			cursor->height--;
			next_level = &cursor->levels[cursor->height];
			next_level->page_index = entry_index;
			next_level->slot = 0;
			level->slot++;
			vdo_submit_metadata_vio(&cursor->vio->vio, location.pbn,
						traversal_endio, continue_traversal,
						REQ_OP_READ | REQ_PRIO);
			return;
		}
	}

	/* Every level has been exhausted, so this tree is done. */
	finish_cursor(cursor);
}
2621 
2622 /**
2623  * launch_cursor() - Start traversing a single block map tree now that the cursor has a VIO with
2624  *                   which to load pages.
2625  * @context: The pooled_vio just acquired.
2626  *
2627  * Implements waiter_callback_fn.
2628  */
2629 static void launch_cursor(struct vdo_waiter *waiter, void *context)
2630 {
2631 	struct cursor *cursor = container_of(waiter, struct cursor, waiter);
2632 	struct pooled_vio *pooled = context;
2633 
2634 	cursor->vio = pooled;
2635 	pooled->vio.completion.parent = cursor;
2636 	pooled->vio.completion.callback_thread_id = cursor->parent->zone->thread_id;
2637 	traverse(cursor);
2638 }
2639 
2640 /**
2641  * compute_boundary() - Compute the number of pages used at each level of the given root's tree.
2642  *
2643  * Return: The list of page counts as a boundary structure.
2644  */
2645 static struct boundary compute_boundary(struct block_map *map, root_count_t root_index)
2646 {
2647 	struct boundary boundary;
2648 	height_t height;
2649 	page_count_t leaf_pages = vdo_compute_block_map_page_count(map->entry_count);
2650 	/*
2651 	 * Compute the leaf pages for this root. If the number of leaf pages does not distribute
2652 	 * evenly, we must determine if this root gets an extra page. Extra pages are assigned to
2653 	 * roots starting from tree 0.
2654 	 */
2655 	page_count_t last_tree_root = (leaf_pages - 1) % map->root_count;
2656 	page_count_t level_pages = leaf_pages / map->root_count;
2657 
2658 	if (root_index <= last_tree_root)
2659 		level_pages++;
2660 
2661 	for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT - 1; height++) {
2662 		boundary.levels[height] = level_pages;
2663 		level_pages = DIV_ROUND_UP(level_pages, VDO_BLOCK_MAP_ENTRIES_PER_PAGE);
2664 	}
2665 
2666 	/* The root node always exists, even if the root is otherwise unused. */
2667 	boundary.levels[VDO_BLOCK_MAP_TREE_HEIGHT - 1] = 1;
2668 
2669 	return boundary;
2670 }
2671 
2672 /**
2673  * vdo_traverse_forest() - Walk the entire forest of a block map.
2674  * @callback: A function to call with the pbn of each allocated node in the forest.
2675  * @completion: The completion to notify on each traversed PBN, and when traversal completes.
2676  */
2677 void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
2678 			 struct vdo_completion *completion)
2679 {
2680 	root_count_t root;
2681 	struct cursors *cursors;
2682 	int result;
2683 
2684 	result = vdo_allocate_extended(struct cursors, map->root_count,
2685 				       struct cursor, __func__, &cursors);
2686 	if (result != VDO_SUCCESS) {
2687 		vdo_fail_completion(completion, result);
2688 		return;
2689 	}
2690 
2691 	cursors->zone = &map->zones[0];
2692 	cursors->pool = cursors->zone->vio_pool;
2693 	cursors->entry_callback = callback;
2694 	cursors->completion = completion;
2695 	cursors->active_roots = map->root_count;
2696 	for (root = 0; root < map->root_count; root++) {
2697 		struct cursor *cursor = &cursors->cursors[root];
2698 
2699 		*cursor = (struct cursor) {
2700 			.tree = &map->forest->trees[root],
2701 			.height = VDO_BLOCK_MAP_TREE_HEIGHT - 1,
2702 			.parent = cursors,
2703 			.boundary = compute_boundary(map, root),
2704 		};
2705 
2706 		cursor->waiter.callback = launch_cursor;
2707 		acquire_vio_from_pool(cursors->pool, &cursor->waiter);
2708 	}
2709 }
2710 
/**
 * initialize_block_map_zone() - Initialize the per-zone portions of the block map.
 * @map: The block map which contains the zone.
 * @zone_number: The index of the zone to initialize.
 * @cache_size: The total number of page cache pages; each zone gets an equal share
 *              (cache_size / map->zone_count).
 * @maximum_age: The number of journal blocks before a dirtied page is considered old and must be
 *               written out.
 *
 * Return: VDO_SUCCESS or an error. On error, anything already allocated is left in place in the
 *         zone; nothing is released here.
 */
static int __must_check initialize_block_map_zone(struct block_map *map,
						  zone_count_t zone_number,
						  page_count_t cache_size,
						  block_count_t maximum_age)
{
	int result;
	block_count_t i;
	struct vdo *vdo = map->vdo;
	struct block_map_zone *zone = &map->zones[zone_number];

	/* attempt_page_lock() uses a page_descriptor as a 64-bit map key via union page_key. */
	BUILD_BUG_ON(sizeof(struct page_descriptor) != sizeof(u64));

	zone->zone_number = zone_number;
	zone->thread_id = vdo->thread_config.logical_threads[zone_number];
	zone->block_map = map;

	/* The dirty lists have one era (a pair of list heads) per unit of maximum age. */
	result = vdo_allocate_extended(struct dirty_lists, maximum_age,
				       dirty_era_t, __func__,
				       &zone->dirty_lists);
	if (result != VDO_SUCCESS)
		return result;

	zone->dirty_lists->maximum_age = maximum_age;
	INIT_LIST_HEAD(&zone->dirty_lists->expired[VDO_TREE_PAGE]);
	INIT_LIST_HEAD(&zone->dirty_lists->expired[VDO_CACHE_PAGE]);

	for (i = 0; i < maximum_age; i++) {
		INIT_LIST_HEAD(&zone->dirty_lists->eras[i][VDO_TREE_PAGE]);
		INIT_LIST_HEAD(&zone->dirty_lists->eras[i][VDO_CACHE_PAGE]);
	}

	/* The map of tree pages which are currently locked for loading or allocation. */
	result = vdo_int_map_create(VDO_LOCK_MAP_CAPACITY, &zone->loading_pages);
	if (result != VDO_SUCCESS)
		return result;

	/* The zone itself is the context of every vio in the pool. */
	result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE,
			       zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR,
			       VIO_PRIORITY_METADATA, zone, &zone->vio_pool);
	if (result != VDO_SUCCESS)
		return result;

	vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION);

	zone->page_cache.zone = zone;
	zone->page_cache.vdo = vdo;
	zone->page_cache.page_count = cache_size / map->zone_count;
	/* Every page in the cache starts out free. */
	zone->page_cache.stats.free_pages = zone->page_cache.page_count;

	result = allocate_cache_components(&zone->page_cache);
	if (result != VDO_SUCCESS)
		return result;

	/* initialize empty circular queues */
	INIT_LIST_HEAD(&zone->page_cache.lru_list);
	INIT_LIST_HEAD(&zone->page_cache.outgoing_list);

	return VDO_SUCCESS;
}
2774 
2775 /* Implements vdo_zone_thread_getter_fn */
2776 static thread_id_t get_block_map_zone_thread_id(void *context, zone_count_t zone_number)
2777 {
2778 	struct block_map *map = context;
2779 
2780 	return map->zones[zone_number].thread_id;
2781 }
2782 
2783 /* Implements vdo_action_preamble_fn */
2784 static void prepare_for_era_advance(void *context, struct vdo_completion *parent)
2785 {
2786 	struct block_map *map = context;
2787 
2788 	map->current_era_point = map->pending_era_point;
2789 	vdo_finish_completion(parent);
2790 }
2791 
2792 /* Implements vdo_zone_action_fn */
2793 static void advance_block_map_zone_era(void *context, zone_count_t zone_number,
2794 				       struct vdo_completion *parent)
2795 {
2796 	struct block_map *map = context;
2797 	struct block_map_zone *zone = &map->zones[zone_number];
2798 
2799 	update_period(zone->dirty_lists, map->current_era_point);
2800 	write_expired_elements(zone);
2801 	vdo_finish_completion(parent);
2802 }
2803 
2804 /*
2805  * Schedule an era advance if necessary. This method should not be called directly. Rather, call
2806  * vdo_schedule_default_action() on the block map's action manager.
2807  *
2808  * Implements vdo_action_scheduler_fn.
2809  */
2810 static bool schedule_era_advance(void *context)
2811 {
2812 	struct block_map *map = context;
2813 
2814 	if (map->current_era_point == map->pending_era_point)
2815 		return false;
2816 
2817 	return vdo_schedule_action(map->action_manager, prepare_for_era_advance,
2818 				   advance_block_map_zone_era, NULL, NULL);
2819 }
2820 
2821 static void uninitialize_block_map_zone(struct block_map_zone *zone)
2822 {
2823 	struct vdo_page_cache *cache = &zone->page_cache;
2824 
2825 	vdo_free(vdo_forget(zone->dirty_lists));
2826 	free_vio_pool(vdo_forget(zone->vio_pool));
2827 	vdo_int_map_free(vdo_forget(zone->loading_pages));
2828 	if (cache->infos != NULL) {
2829 		struct page_info *info;
2830 
2831 		for (info = cache->infos; info < cache->infos + cache->page_count; info++)
2832 			free_vio(vdo_forget(info->vio));
2833 	}
2834 
2835 	vdo_int_map_free(vdo_forget(cache->page_map));
2836 	vdo_free(vdo_forget(cache->infos));
2837 	vdo_free(vdo_forget(cache->pages));
2838 }
2839 
2840 void vdo_free_block_map(struct block_map *map)
2841 {
2842 	zone_count_t zone;
2843 
2844 	if (map == NULL)
2845 		return;
2846 
2847 	for (zone = 0; zone < map->zone_count; zone++)
2848 		uninitialize_block_map_zone(&map->zones[zone]);
2849 
2850 	vdo_abandon_block_map_growth(map);
2851 	if (map->forest != NULL)
2852 		deforest(vdo_forget(map->forest), 0);
2853 	vdo_free(vdo_forget(map->action_manager));
2854 	vdo_free(map);
2855 }
2856 
2857 /* @journal may be NULL. */
2858 int vdo_decode_block_map(struct block_map_state_2_0 state, block_count_t logical_blocks,
2859 			 struct vdo *vdo, struct recovery_journal *journal,
2860 			 nonce_t nonce, page_count_t cache_size, block_count_t maximum_age,
2861 			 struct block_map **map_ptr)
2862 {
2863 	struct block_map *map;
2864 	int result;
2865 	zone_count_t zone = 0;
2866 
2867 	BUILD_BUG_ON(VDO_BLOCK_MAP_ENTRIES_PER_PAGE !=
2868 		     ((VDO_BLOCK_SIZE - sizeof(struct block_map_page)) /
2869 		      sizeof(struct block_map_entry)));
2870 	result = VDO_ASSERT(cache_size > 0, "block map cache size is specified");
2871 	if (result != VDO_SUCCESS)
2872 		return result;
2873 
2874 	result = vdo_allocate_extended(struct block_map,
2875 				       vdo->thread_config.logical_zone_count,
2876 				       struct block_map_zone, __func__, &map);
2877 	if (result != VDO_SUCCESS)
2878 		return result;
2879 
2880 	map->vdo = vdo;
2881 	map->root_origin = state.root_origin;
2882 	map->root_count = state.root_count;
2883 	map->entry_count = logical_blocks;
2884 	map->journal = journal;
2885 	map->nonce = nonce;
2886 
2887 	result = make_forest(map, map->entry_count);
2888 	if (result != VDO_SUCCESS) {
2889 		vdo_free_block_map(map);
2890 		return result;
2891 	}
2892 
2893 	replace_forest(map);
2894 
2895 	map->zone_count = vdo->thread_config.logical_zone_count;
2896 	for (zone = 0; zone < map->zone_count; zone++) {
2897 		result = initialize_block_map_zone(map, zone, cache_size, maximum_age);
2898 		if (result != VDO_SUCCESS) {
2899 			vdo_free_block_map(map);
2900 			return result;
2901 		}
2902 	}
2903 
2904 	result = vdo_make_action_manager(map->zone_count, get_block_map_zone_thread_id,
2905 					 vdo_get_recovery_journal_thread_id(journal),
2906 					 map, schedule_era_advance, vdo,
2907 					 &map->action_manager);
2908 	if (result != VDO_SUCCESS) {
2909 		vdo_free_block_map(map);
2910 		return result;
2911 	}
2912 
2913 	*map_ptr = map;
2914 	return VDO_SUCCESS;
2915 }
2916 
2917 struct block_map_state_2_0 vdo_record_block_map(const struct block_map *map)
2918 {
2919 	return (struct block_map_state_2_0) {
2920 		.flat_page_origin = VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
2921 		/* This is the flat page count, which has turned out to always be 0. */
2922 		.flat_page_count = 0,
2923 		.root_origin = map->root_origin,
2924 		.root_count = map->root_count,
2925 	};
2926 }
2927 
2928 /* The block map needs to know the journals' sequence number to initialize the eras. */
2929 void vdo_initialize_block_map_from_journal(struct block_map *map,
2930 					   struct recovery_journal *journal)
2931 {
2932 	zone_count_t z = 0;
2933 
2934 	map->current_era_point = vdo_get_recovery_journal_current_sequence_number(journal);
2935 	map->pending_era_point = map->current_era_point;
2936 
2937 	for (z = 0; z < map->zone_count; z++) {
2938 		struct dirty_lists *dirty_lists = map->zones[z].dirty_lists;
2939 
2940 		VDO_ASSERT_LOG_ONLY(dirty_lists->next_period == 0, "current period not set");
2941 		dirty_lists->oldest_period = map->current_era_point;
2942 		dirty_lists->next_period = map->current_era_point + 1;
2943 		dirty_lists->offset = map->current_era_point % dirty_lists->maximum_age;
2944 	}
2945 }
2946 
2947 /* Compute the logical zone for the LBN of a data vio. */
2948 zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio)
2949 {
2950 	struct block_map *map = vdo_from_data_vio(data_vio)->block_map;
2951 	struct tree_lock *tree_lock = &data_vio->tree_lock;
2952 	page_number_t page_number = data_vio->logical.lbn / VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
2953 
2954 	tree_lock->tree_slots[0].page_index = page_number;
2955 	tree_lock->root_index = page_number % map->root_count;
2956 	return (tree_lock->root_index % map->zone_count);
2957 }
2958 
2959 void vdo_advance_block_map_era(struct block_map *map,
2960 			       sequence_number_t recovery_block_number)
2961 {
2962 	if (map == NULL)
2963 		return;
2964 
2965 	map->pending_era_point = recovery_block_number;
2966 	vdo_schedule_default_action(map->action_manager);
2967 }
2968 
2969 /* Implements vdo_admin_initiator_fn */
2970 static void initiate_drain(struct admin_state *state)
2971 {
2972 	struct block_map_zone *zone = container_of(state, struct block_map_zone, state);
2973 
2974 	VDO_ASSERT_LOG_ONLY((zone->active_lookups == 0),
2975 			    "%s() called with no active lookups", __func__);
2976 
2977 	if (!vdo_is_state_suspending(state)) {
2978 		while (zone->dirty_lists->oldest_period < zone->dirty_lists->next_period)
2979 			expire_oldest_list(zone->dirty_lists);
2980 		write_expired_elements(zone);
2981 	}
2982 
2983 	check_for_drain_complete(zone);
2984 }
2985 
2986 /* Implements vdo_zone_action_fn. */
2987 static void drain_zone(void *context, zone_count_t zone_number,
2988 		       struct vdo_completion *parent)
2989 {
2990 	struct block_map *map = context;
2991 	struct block_map_zone *zone = &map->zones[zone_number];
2992 
2993 	vdo_start_draining(&zone->state,
2994 			   vdo_get_current_manager_operation(map->action_manager),
2995 			   parent, initiate_drain);
2996 }
2997 
2998 void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
2999 			 struct vdo_completion *parent)
3000 {
3001 	vdo_schedule_operation(map->action_manager, operation, NULL, drain_zone, NULL,
3002 			       parent);
3003 }
3004 
3005 /* Implements vdo_zone_action_fn. */
3006 static void resume_block_map_zone(void *context, zone_count_t zone_number,
3007 				  struct vdo_completion *parent)
3008 {
3009 	struct block_map *map = context;
3010 	struct block_map_zone *zone = &map->zones[zone_number];
3011 
3012 	vdo_fail_completion(parent, vdo_resume_if_quiescent(&zone->state));
3013 }
3014 
3015 void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent)
3016 {
3017 	vdo_schedule_operation(map->action_manager, VDO_ADMIN_STATE_RESUMING,
3018 			       NULL, resume_block_map_zone, NULL, parent);
3019 }
3020 
3021 /* Allocate an expanded collection of trees, for a future growth. */
3022 int vdo_prepare_to_grow_block_map(struct block_map *map,
3023 				  block_count_t new_logical_blocks)
3024 {
3025 	if (map->next_entry_count == new_logical_blocks)
3026 		return VDO_SUCCESS;
3027 
3028 	if (map->next_entry_count > 0)
3029 		vdo_abandon_block_map_growth(map);
3030 
3031 	if (new_logical_blocks < map->entry_count) {
3032 		map->next_entry_count = map->entry_count;
3033 		return VDO_SUCCESS;
3034 	}
3035 
3036 	return make_forest(map, new_logical_blocks);
3037 }
3038 
/*
 * Replace the forest of the block map passed as @context, then finish @completion.
 *
 * Implements vdo_action_preamble_fn.
 */
static void grow_forest(void *context, struct vdo_completion *completion)
{
	struct block_map *block_map = context;

	replace_forest(block_map);
	vdo_finish_completion(completion);
}
3045 
3046 /* Requires vdo_prepare_to_grow_block_map() to have been previously called. */
3047 void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent)
3048 {
3049 	vdo_schedule_operation(map->action_manager,
3050 			       VDO_ADMIN_STATE_SUSPENDED_OPERATION,
3051 			       grow_forest, NULL, NULL, parent);
3052 }
3053 
3054 void vdo_abandon_block_map_growth(struct block_map *map)
3055 {
3056 	struct forest *forest = vdo_forget(map->next_forest);
3057 
3058 	if (forest != NULL)
3059 		deforest(forest, forest->segments - 1);
3060 
3061 	map->next_entry_count = 0;
3062 }
3063 
3064 /* Release the page completion and then continue the requester. */
3065 static inline void finish_processing_page(struct vdo_completion *completion, int result)
3066 {
3067 	struct vdo_completion *parent = completion->parent;
3068 
3069 	vdo_release_page_completion(completion);
3070 	vdo_continue_completion(parent, result);
3071 }
3072 
3073 static void handle_page_error(struct vdo_completion *completion)
3074 {
3075 	finish_processing_page(completion, completion->result);
3076 }
3077 
3078 /* Fetch the mapping page for a block map update, and call the provided handler when fetched. */
3079 static void fetch_mapping_page(struct data_vio *data_vio, bool modifiable,
3080 			       vdo_action_fn action)
3081 {
3082 	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
3083 
3084 	if (vdo_is_state_draining(&zone->state)) {
3085 		continue_data_vio_with_error(data_vio, VDO_SHUTTING_DOWN);
3086 		return;
3087 	}
3088 
3089 	vdo_get_page(&data_vio->page_completion, zone,
3090 		     data_vio->tree_lock.tree_slots[0].block_map_slot.pbn,
3091 		     modifiable, &data_vio->vio.completion,
3092 		     action, handle_page_error, false);
3093 }
3094 
3095 /**
3096  * clear_mapped_location() - Clear a data_vio's mapped block location, setting it to be unmapped.
3097  *
3098  * This indicates the block map entry for the logical block is either unmapped or corrupted.
3099  */
3100 static void clear_mapped_location(struct data_vio *data_vio)
3101 {
3102 	data_vio->mapped = (struct zoned_pbn) {
3103 		.state = VDO_MAPPING_STATE_UNMAPPED,
3104 	};
3105 }
3106 
3107 /**
3108  * set_mapped_location() - Decode and validate a block map entry, and set the mapped location of a
3109  *                         data_vio.
3110  *
3111  * Return: VDO_SUCCESS or VDO_BAD_MAPPING if the map entry is invalid or an error code for any
3112  *         other failure
3113  */
3114 static int __must_check set_mapped_location(struct data_vio *data_vio,
3115 					    const struct block_map_entry *entry)
3116 {
3117 	/* Unpack the PBN for logging purposes even if the entry is invalid. */
3118 	struct data_location mapped = vdo_unpack_block_map_entry(entry);
3119 
3120 	if (vdo_is_valid_location(&mapped)) {
3121 		int result;
3122 
3123 		result = vdo_get_physical_zone(vdo_from_data_vio(data_vio),
3124 					       mapped.pbn, &data_vio->mapped.zone);
3125 		if (result == VDO_SUCCESS) {
3126 			data_vio->mapped.pbn = mapped.pbn;
3127 			data_vio->mapped.state = mapped.state;
3128 			return VDO_SUCCESS;
3129 		}
3130 
3131 		/*
3132 		 * Return all errors not specifically known to be errors from validating the
3133 		 * location.
3134 		 */
3135 		if ((result != VDO_OUT_OF_RANGE) && (result != VDO_BAD_MAPPING))
3136 			return result;
3137 	}
3138 
3139 	/*
3140 	 * Log the corruption even if we wind up ignoring it for write VIOs, converting all cases
3141 	 * to VDO_BAD_MAPPING.
3142 	 */
3143 	vdo_log_error_strerror(VDO_BAD_MAPPING,
3144 			       "PBN %llu with state %u read from the block map was invalid",
3145 			       (unsigned long long) mapped.pbn, mapped.state);
3146 
3147 	/*
3148 	 * A read VIO has no option but to report the bad mapping--reading zeros would be hiding
3149 	 * known data loss.
3150 	 */
3151 	if (!data_vio->write)
3152 		return VDO_BAD_MAPPING;
3153 
3154 	/*
3155 	 * A write VIO only reads this mapping to decref the old block. Treat this as an unmapped
3156 	 * entry rather than fail the write.
3157 	 */
3158 	clear_mapped_location(data_vio);
3159 	return VDO_SUCCESS;
3160 }
3161 
3162 /* This callback is registered in vdo_get_mapped_block(). */
3163 static void get_mapping_from_fetched_page(struct vdo_completion *completion)
3164 {
3165 	int result;
3166 	struct vdo_page_completion *vpc = as_vdo_page_completion(completion);
3167 	const struct block_map_page *page;
3168 	const struct block_map_entry *entry;
3169 	struct data_vio *data_vio = as_data_vio(completion->parent);
3170 	struct block_map_tree_slot *tree_slot;
3171 
3172 	if (completion->result != VDO_SUCCESS) {
3173 		finish_processing_page(completion, completion->result);
3174 		return;
3175 	}
3176 
3177 	result = validate_completed_page(vpc, false);
3178 	if (result != VDO_SUCCESS) {
3179 		finish_processing_page(completion, result);
3180 		return;
3181 	}
3182 
3183 	page = (const struct block_map_page *) get_page_buffer(vpc->info);
3184 	tree_slot = &data_vio->tree_lock.tree_slots[0];
3185 	entry = &page->entries[tree_slot->block_map_slot.slot];
3186 
3187 	result = set_mapped_location(data_vio, entry);
3188 	finish_processing_page(completion, result);
3189 }
3190 
3191 void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
3192 			       physical_block_number_t pbn,
3193 			       enum block_mapping_state mapping_state,
3194 			       sequence_number_t *recovery_lock)
3195 {
3196 	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
3197 	struct block_map *block_map = zone->block_map;
3198 	struct recovery_journal *journal = block_map->journal;
3199 	sequence_number_t old_locked, new_locked;
3200 	struct tree_lock *tree_lock = &data_vio->tree_lock;
3201 
3202 	/* Encode the new mapping. */
3203 	page->entries[tree_lock->tree_slots[tree_lock->height].block_map_slot.slot] =
3204 		vdo_pack_block_map_entry(pbn, mapping_state);
3205 
3206 	/* Adjust references on the recovery journal blocks. */
3207 	old_locked = *recovery_lock;
3208 	new_locked = data_vio->recovery_sequence_number;
3209 
3210 	if ((old_locked == 0) || (old_locked > new_locked)) {
3211 		vdo_acquire_recovery_journal_block_reference(journal, new_locked,
3212 							     VDO_ZONE_TYPE_LOGICAL,
3213 							     zone->zone_number);
3214 
3215 		if (old_locked > 0) {
3216 			vdo_release_recovery_journal_block_reference(journal, old_locked,
3217 								     VDO_ZONE_TYPE_LOGICAL,
3218 								     zone->zone_number);
3219 		}
3220 
3221 		*recovery_lock = new_locked;
3222 	}
3223 
3224 	/*
3225 	 * FIXME: explain this more
3226 	 * Release the transferred lock from the data_vio.
3227 	 */
3228 	vdo_release_journal_entry_lock(journal, new_locked);
3229 	data_vio->recovery_sequence_number = 0;
3230 }
3231 
3232 static void put_mapping_in_fetched_page(struct vdo_completion *completion)
3233 {
3234 	struct data_vio *data_vio = as_data_vio(completion->parent);
3235 	sequence_number_t old_lock;
3236 	struct vdo_page_completion *vpc;
3237 	struct page_info *info;
3238 	int result;
3239 
3240 	if (completion->result != VDO_SUCCESS) {
3241 		finish_processing_page(completion, completion->result);
3242 		return;
3243 	}
3244 
3245 	vpc = as_vdo_page_completion(completion);
3246 	result = validate_completed_page(vpc, true);
3247 	if (result != VDO_SUCCESS) {
3248 		finish_processing_page(completion, result);
3249 		return;
3250 	}
3251 
3252 	info = vpc->info;
3253 	old_lock = info->recovery_lock;
3254 	vdo_update_block_map_page((struct block_map_page *) get_page_buffer(info),
3255 				  data_vio, data_vio->new_mapped.pbn,
3256 				  data_vio->new_mapped.state, &info->recovery_lock);
3257 	set_info_state(info, PS_DIRTY);
3258 	add_to_dirty_lists(info->cache->zone, &info->state_entry,
3259 			   VDO_CACHE_PAGE, old_lock, info->recovery_lock);
3260 	finish_processing_page(completion, VDO_SUCCESS);
3261 }
3262 
3263 /* Read a stored block mapping into a data_vio. */
3264 void vdo_get_mapped_block(struct data_vio *data_vio)
3265 {
3266 	if (data_vio->tree_lock.tree_slots[0].block_map_slot.pbn == VDO_ZERO_BLOCK) {
3267 		/*
3268 		 * We know that the block map page for this LBN has not been allocated, so the
3269 		 * block must be unmapped.
3270 		 */
3271 		clear_mapped_location(data_vio);
3272 		continue_data_vio(data_vio);
3273 		return;
3274 	}
3275 
3276 	fetch_mapping_page(data_vio, false, get_mapping_from_fetched_page);
3277 }
3278 
3279 /* Update a stored block mapping to reflect a data_vio's new mapping. */
3280 void vdo_put_mapped_block(struct data_vio *data_vio)
3281 {
3282 	fetch_mapping_page(data_vio, true, put_mapping_in_fetched_page);
3283 }
3284 
3285 struct block_map_statistics vdo_get_block_map_statistics(struct block_map *map)
3286 {
3287 	zone_count_t zone = 0;
3288 	struct block_map_statistics totals;
3289 
3290 	memset(&totals, 0, sizeof(struct block_map_statistics));
3291 	for (zone = 0; zone < map->zone_count; zone++) {
3292 		const struct block_map_statistics *stats =
3293 			&(map->zones[zone].page_cache.stats);
3294 
3295 		totals.dirty_pages += READ_ONCE(stats->dirty_pages);
3296 		totals.clean_pages += READ_ONCE(stats->clean_pages);
3297 		totals.free_pages += READ_ONCE(stats->free_pages);
3298 		totals.failed_pages += READ_ONCE(stats->failed_pages);
3299 		totals.incoming_pages += READ_ONCE(stats->incoming_pages);
3300 		totals.outgoing_pages += READ_ONCE(stats->outgoing_pages);
3301 		totals.cache_pressure += READ_ONCE(stats->cache_pressure);
3302 		totals.read_count += READ_ONCE(stats->read_count);
3303 		totals.write_count += READ_ONCE(stats->write_count);
3304 		totals.failed_reads += READ_ONCE(stats->failed_reads);
3305 		totals.failed_writes += READ_ONCE(stats->failed_writes);
3306 		totals.reclaimed += READ_ONCE(stats->reclaimed);
3307 		totals.read_outgoing += READ_ONCE(stats->read_outgoing);
3308 		totals.found_in_cache += READ_ONCE(stats->found_in_cache);
3309 		totals.discard_required += READ_ONCE(stats->discard_required);
3310 		totals.wait_for_page += READ_ONCE(stats->wait_for_page);
3311 		totals.fetch_required += READ_ONCE(stats->fetch_required);
3312 		totals.pages_loaded += READ_ONCE(stats->pages_loaded);
3313 		totals.pages_saved += READ_ONCE(stats->pages_saved);
3314 		totals.flush_count += READ_ONCE(stats->flush_count);
3315 	}
3316 
3317 	return totals;
3318 }
3319