xref: /linux/drivers/md/dm-vdo/block-map.h (revision 8e1bb4a41aa78d6105e59186af3dcd545fc66e70)
1 /* SPDX-License-Identifier: GPL-2.0-only */
2 /*
3  * Copyright 2023 Red Hat
4  */
5 
6 #ifndef VDO_BLOCK_MAP_H
7 #define VDO_BLOCK_MAP_H
8 
9 #include <linux/list.h>
10 
11 #include "numeric.h"
12 
13 #include "admin-state.h"
14 #include "completion.h"
15 #include "encodings.h"
16 #include "int-map.h"
17 #include "statistics.h"
18 #include "types.h"
19 #include "vio.h"
20 #include "wait-queue.h"
21 
22 /*
23  * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
24  * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
25  * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
26  * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
27  *
28  * Each logical zone has a single dedicated queue and thread for performing all updates to the
29  * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
30  * allow the code to omit more fine-grained locking for the block map structures.
31  *
32  * Load operations must be performed on the admin thread. Normal operations, such as reading and
33  * updating mappings, must be performed on the appropriate logical zone thread. Save operations
34  * must be launched from the same admin thread as the original load operation.
35  */
36 
/*
 * NOTE(review): presumably the number of vios in each zone's vio_pool (see the vio_pool field of
 * struct block_map_zone) -- confirm against the site where the pool is created.
 */
enum {
	BLOCK_MAP_VIO_POOL_SIZE = 64,
};
40 
/*
 * Generation counter for page references.
 *
 * NOTE(review): nothing else in this header uses this typedef; confirm it still has users.
 */
typedef u32 vdo_page_generation;
45 
/*
 * Declaration only; the definition lives outside this header.
 * NOTE(review): presumably the entry value that represents an unmapped logical block -- confirm
 * at the definition.
 */
extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;
47 
/*
 * The VDO Page Cache abstraction. One cache is embedded in each struct block_map_zone (its
 * page_cache field); the zone field at the end of this structure points back to that zone.
 */
struct vdo_page_cache {
	/* The VDO which owns this cache */
	struct vdo *vdo;
	/* The number of pages in the cache */
	page_count_t page_count;
	/* The number of pages to write in the current batch */
	page_count_t pages_in_batch;
	/* Whether the VDO is doing a read-only rebuild */
	bool rebuilding;

	/* The array of page information entries */
	struct page_info *infos;
	/* The raw memory for the pages */
	char *pages;
	/* A cache of the most recently found page info */
	struct page_info *last_found;
	/* The map of page number to info */
	struct int_map *page_map;
	/* The main LRU list (all infos) */
	struct list_head lru_list;
	/* The free page list (oldest first) */
	struct list_head free_list;
	/* The outgoing page list */
	struct list_head outgoing_list;
	/* The number of read I/O operations pending */
	page_count_t outstanding_reads;
	/* The number of write I/O operations pending */
	page_count_t outstanding_writes;
	/* The number of pages covered by the current flush */
	page_count_t pages_in_flush;
	/* The number of pages waiting to be included in the next flush */
	page_count_t pages_to_flush;
	/* The number of discards in progress */
	unsigned int discard_count;
	/* How many VPCs (presumably vdo_page_completions) are waiting for a free page */
	unsigned int waiter_count;
	/* The queue of waiters who want a free page */
	struct vdo_wait_queue free_waiters;
	/*
	 * Statistics are only updated on the logical zone thread, but are accessed from other
	 * threads.
	 */
	struct block_map_statistics stats;
	/* The counter for pressure reports */
	u32 pressure_report;
	/* The block map zone to which this cache belongs */
	struct block_map_zone *zone;
};
97 
/*
 * The state of a page buffer. If the page buffer is free, no particular page is bound to it;
 * otherwise the page buffer is bound to a particular page whose absolute pbn is in the pbn field.
 * If the page is resident or dirty, the page data is stable and may be accessed. Otherwise the
 * page is in flight (incoming or outgoing) and its data should not be accessed.
 *
 * The enumeration is declared __packed, and is stored by value in struct page_info.
 *
 * @note Update the static data in get_page_state_name() if you change this enumeration.
 */
enum vdo_page_buffer_state {
	/* this page buffer is not being used */
	PS_FREE,
	/* this page is being read from store */
	PS_INCOMING,
	/* attempt to load this page failed */
	PS_FAILED,
	/* this page is valid and un-modified */
	PS_RESIDENT,
	/* this page is valid and modified */
	PS_DIRTY,
	/* this page is being written and should not be used */
	PS_OUTGOING,
	/* not a state; as the last enumerator, its value is the number of states above */
	PAGE_STATE_COUNT,
} __packed;
122 
/*
 * The write status of a page, stored in the write_status field of struct page_info.
 *
 * NOTE(review): the meaning of each value is not documented in this header; see the users of
 * write_status for the exact semantics of "discard" and "deferred".
 */
enum vdo_page_write_status {
	WRITE_STATUS_NORMAL,
	WRITE_STATUS_DISCARD,
	WRITE_STATUS_DEFERRED,
} __packed;
131 
/*
 * Per-page-slot information. struct vdo_page_cache holds an array of these (its infos field), and
 * each one points back to its cache.
 */
struct page_info {
	/* The preallocated struct vio for this page */
	struct vio *vio;
	/* The back-link to the owning cache, for references */
	struct vdo_page_cache *cache;
	/* The pbn of the page */
	physical_block_number_t pbn;
	/* Non-zero while the page is busy (temporarily locked) */
	u16 busy;
	/* The write status of the page */
	enum vdo_page_write_status write_status;
	/* The page state */
	enum vdo_page_buffer_state state;
	/* The queue of completions awaiting this item */
	struct vdo_wait_queue waiting;
	/* The state linked list entry */
	struct list_head state_entry;
	/* The LRU entry */
	struct list_head lru_entry;
	/*
	 * The earliest recovery journal block containing uncommitted updates to the block map page
	 * associated with this page_info. A reference (lock) is held on that block to prevent it
	 * from being reaped. When this value changes, the reference on the old value must be
	 * released and a reference on the new value must be acquired.
	 */
	sequence_number_t recovery_lock;
};
160 
/*
 * A completion awaiting a specific page. Also a live reference into the page once completed, until
 * freed.
 *
 * as_vdo_page_completion() recovers this structure from a pointer to its embedded completion
 * field.
 */
struct vdo_page_completion {
	/* The generic completion */
	struct vdo_completion completion;
	/* The cache involved */
	struct vdo_page_cache *cache;
	/* The waiter for the pending list */
	struct vdo_waiter waiter;
	/* The absolute physical block number of the page on disk */
	physical_block_number_t pbn;
	/* Whether the page may be modified */
	bool writable;
	/* Whether the page is available */
	bool ready;
	/* The info structure for the page, only valid when ready */
	struct page_info *info;
};
181 
/* Forward declaration; this header only uses pointers to struct forest (see struct block_map). */
struct forest;
183 
/*
 * An in-memory tree page. The raw page contents live in page_buffer, which
 * vdo_as_block_map_page() reinterprets as a struct block_map_page.
 */
struct tree_page {
	/* NOTE(review): presumably used to queue this page on a zone's wait queues -- confirm */
	struct vdo_waiter waiter;

	/* Dirty list entry */
	struct list_head entry;

	/* If dirty, the tree zone flush generation in which it was last dirtied. */
	u8 generation;

	/* Whether this page is an interior tree page being written out. */
	bool writing;

	/* If writing, the tree zone flush generation of the copy being written. */
	u8 writing_generation;

	/*
	 * Sequence number of the earliest recovery journal block containing uncommitted updates to
	 * this page
	 */
	sequence_number_t recovery_lock;

	/* The value of recovery_lock when this page last started writing */
	sequence_number_t writing_recovery_lock;

	/* The page contents, one VDO block in size */
	char page_buffer[VDO_BLOCK_SIZE];
};
210 
/*
 * The two kinds of block map page: tree pages and cache pages.
 *
 * NOTE(review): with values 0 and 1 these presumably index the two lists of a dirty_era_t --
 * confirm against the dirty list code.
 */
enum block_map_page_type {
	VDO_TREE_PAGE,
	VDO_CACHE_PAGE,
};
215 
/*
 * A pair of list heads making up one era of dirty pages.
 * NOTE(review): presumably one list per enum block_map_page_type value -- confirm at the users.
 */
typedef struct list_head dirty_era_t[2];
217 
/*
 * The dirty pages of a block map zone, grouped by era. The structure ends in a flexible array, so
 * it must be allocated with room for the desired number of eras.
 */
struct dirty_lists {
	/* The number of periods after which an element will be expired */
	block_count_t maximum_age;
	/* The oldest period which has unexpired elements */
	sequence_number_t oldest_period;
	/* One more than the current period */
	sequence_number_t next_period;
	/* The offset in the array of lists of the oldest period */
	block_count_t offset;
	/* Expired pages */
	dirty_era_t expired;
	/* The lists of dirty pages (NOTE(review): presumably maximum_age entries -- confirm) */
	dirty_era_t eras[];
};
232 
/*
 * The per-zone state of the block map. struct block_map holds an array of these in its zones
 * field.
 */
struct block_map_zone {
	/* NOTE(review): presumably this zone's index in block_map->zones -- confirm */
	zone_count_t zone_number;
	/* NOTE(review): presumably the id of the thread which services this zone -- confirm */
	thread_id_t thread_id;
	/* The administrative state of this zone */
	struct admin_state state;
	/* The block map to which this zone belongs */
	struct block_map *block_map;
	/* Dirty pages, by era */
	struct dirty_lists *dirty_lists;
	/* This zone's page cache (embedded, not a pointer) */
	struct vdo_page_cache page_cache;
	/* NOTE(review): presumably the number of lookups currently in progress -- confirm */
	data_vio_count_t active_lookups;
	/* NOTE(review): presumably tracks pages currently being loaded, keyed by number -- confirm */
	struct int_map *loading_pages;
	/* The pool of vios for this zone */
	struct vio_pool *vio_pool;
	/* The tree page which has issued or will be issuing a flush */
	struct tree_page *flusher;
	/* NOTE(review): presumably the waiters queued behind the flusher's flush -- confirm */
	struct vdo_wait_queue flush_waiters;
	/* The generation after the most recent flush */
	u8 generation;
	/* NOTE(review): presumably the oldest generation which still has dirty pages -- confirm */
	u8 oldest_generation;
	/* The counts of dirty pages in each generation (one slot per possible u8 generation) */
	u32 dirty_page_counts[256];
};
253 
/*
 * The block map itself. The structure ends in a flexible array of zones, so it must be allocated
 * with room for the desired number of zones.
 */
struct block_map {
	/* The VDO to which this block map belongs */
	struct vdo *vdo;
	/* NOTE(review): presumably schedules actions across all of the zones -- confirm */
	struct action_manager *action_manager;
	/* The absolute PBN of the first root of the tree part of the block map */
	physical_block_number_t root_origin;
	/* NOTE(review): presumably the number of tree roots starting at root_origin -- confirm */
	block_count_t root_count;

	/* The era point we are currently distributing to the zones */
	sequence_number_t current_era_point;
	/* The next era point */
	sequence_number_t pending_era_point;

	/* The number of entries in block map */
	block_count_t entry_count;
	/* NOTE(review): presumably the nonce used to validate block map pages -- confirm */
	nonce_t nonce;
	/* The recovery journal associated with this block map */
	struct recovery_journal *journal;

	/* The trees for finding block map pages */
	struct forest *forest;
	/* The expanded trees awaiting growth */
	struct forest *next_forest;
	/* The number of entries after growth */
	block_count_t next_entry_count;

	/* NOTE(review): presumably the number of elements in zones[] -- confirm */
	zone_count_t zone_count;
	/* The per-zone state */
	struct block_map_zone zones[];
};
281 
/**
 * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
 *                                 the forest.
 * @pbn: A PBN of a tree node.
 * @completion: The parent completion of the traversal.
 *
 * This is the callback type accepted by vdo_traverse_forest().
 *
 * Return: VDO_SUCCESS or an error.
 */
typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
				     struct vdo_completion *completion);
292 
293 static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
294 {
295 	vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
296 	return container_of(completion, struct vdo_page_completion, completion);
297 }
298 
/*
 * Page cache entry points. Only the declarations are visible in this header, so the notes below
 * are limited to what the signatures show; see the definitions for the full contracts.
 */

/* NOTE(review): @completion is presumably the one embedded in a vdo_page_completion -- confirm. */
void vdo_release_page_completion(struct vdo_completion *completion);

/*
 * NOTE(review): @pbn and @writable presumably correspond to the same-named fields of
 * @page_completion, and @zone selects the page cache to use; the exact roles of @parent,
 * @callback, @error_handler and @requeue need to be confirmed at the definition.
 */
void vdo_get_page(struct vdo_page_completion *page_completion,
		  struct block_map_zone *zone, physical_block_number_t pbn,
		  bool writable, void *parent, vdo_action_fn callback,
		  vdo_action_fn error_handler, bool requeue);

/* NOTE(review): @completion is presumably the one embedded in a vdo_page_completion -- confirm. */
void vdo_request_page_write(struct vdo_completion *completion);

/*
 * Returns an int status which the caller must check. @page_ptr is an output parameter
 * (NOTE(review): presumably only set on success -- confirm).
 */
int __must_check vdo_get_cached_page(struct vdo_completion *completion,
				     struct block_map_page **page_ptr);

/* Returns an int status which the caller must check. */
int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);
312 
313 static inline struct block_map_page * __must_check
314 vdo_as_block_map_page(struct tree_page *tree_page)
315 {
316 	return (struct block_map_page *) tree_page->page_buffer;
317 }
318 
/*
 * Block map entry points. Only the declarations are visible in this header, so the notes below
 * are limited to what the signatures and the structures above show; see the definitions for the
 * full contracts. Per the comment at the top of this file, load operations belong on the admin
 * thread and normal mapping operations on the appropriate logical zone thread.
 */

/* NOTE(review): presumably reports whether @buffer held a valid page for @nonce/@pbn -- confirm. */
bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
			 physical_block_number_t pbn,
			 struct block_map_page *page);

void vdo_find_block_map_slot(struct data_vio *data_vio);

/* Returns a physical block number for the block map page identified by @page_number. */
physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
						    page_number_t page_number);

void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);

/*
 * @callback has type vdo_entry_callback_fn, which is documented as being called for each
 * allocated PBN with the parent completion of the traversal.
 */
void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
			 struct vdo_completion *completion);

/*
 * Returns an int status which the caller must check. @map_ptr is an output parameter
 * (NOTE(review): presumably receives the new block map on success -- confirm).
 */
int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
				      block_count_t logical_blocks, struct vdo *vdo,
				      struct recovery_journal *journal, nonce_t nonce,
				      page_count_t cache_size, block_count_t maximum_age,
				      struct block_map **map_ptr);

void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
			 struct vdo_completion *parent);

void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);

/*
 * Growth interface. NOTE(review): these presumably manage the next_forest and next_entry_count
 * fields of struct block_map -- confirm at the definitions.
 */
int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
					       block_count_t new_logical_blocks);

void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);

void vdo_abandon_block_map_growth(struct block_map *map);

void vdo_free_block_map(struct block_map *map);

/* Returns a struct block_map_state_2_0 by value; @map is not modified (it is const). */
struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);

void vdo_initialize_block_map_from_journal(struct block_map *map,
					   struct recovery_journal *journal);

zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);

/*
 * NOTE(review): presumably updates the era points tracked in struct block_map
 * (current_era_point, pending_era_point) -- confirm at the definition.
 */
void vdo_advance_block_map_era(struct block_map *map,
			       sequence_number_t recovery_block_number);

/*
 * NOTE(review): @recovery_lock is presumably a pointer to the recovery_lock field of the
 * page_info or tree_page which owns @page, and may be updated through it -- confirm.
 */
void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
			       physical_block_number_t pbn,
			       enum block_mapping_state mapping_state,
			       sequence_number_t *recovery_lock);

void vdo_get_mapped_block(struct data_vio *data_vio);

void vdo_put_mapped_block(struct data_vio *data_vio);

/* Returns a struct block_map_statistics by value; the caller must use the result. */
struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);
373 
374 /**
375  * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format
376  * @age: The configured maximum age
377  *
378  * Return: The converted age
379  *
380  * In the old recovery journal format, each journal block held 311 entries, and every write bio
381  * made two entries. The old maximum age was half the usable journal length. In the new format,
382  * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
383  * age so that the number of writes in a block map era is the same in the old and new formats. This
384  * keeps the bound on the amount of work required to recover the block map from the recovery
385  * journal the same across the format change. It also keeps the amortization of block map page
386  * writes to write bios the same.
387  */
388 static inline block_count_t vdo_convert_maximum_age(block_count_t age)
389 {
390 	return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
391 			    2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
392 }
393 
394 #endif /* VDO_BLOCK_MAP_H */
395