/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_BLOCK_MAP_H
#define VDO_BLOCK_MAP_H

#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "encodings.h"
#include "int-map.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"

/*
 * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
 * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
 * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
 * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
 *
 * Each logical zone has a single dedicated queue and thread for performing all updates to the
 * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
 * allow the code to omit more fine-grained locking for the block map structures.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as reading and
 * updating mappings, must be performed on the appropriate logical zone thread. Save operations
 * must be launched from the same admin thread as the original load operation.
 */

enum {
	BLOCK_MAP_VIO_POOL_SIZE = 64,
};

/*
 * Generation counter for page references.
 */
typedef u32 vdo_page_generation;

extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;

/* The VDO Page Cache abstraction. */
struct vdo_page_cache {
	/* the VDO which owns this cache */
	struct vdo *vdo;
	/* number of pages in cache */
	page_count_t page_count;
	/* number of pages to write in the current batch */
	page_count_t pages_in_batch;
	/* Whether the VDO is doing a read-only rebuild */
	bool rebuilding;

	/* array of page information entries */
	struct page_info *infos;
	/* raw memory for pages */
	char *pages;
	/* cache of the most recently found page info */
	struct page_info *last_found;
	/* map of page number to info */
	struct int_map *page_map;
	/* main LRU list (all infos) */
	struct list_head lru_list;
	/* free page list (oldest first) */
	struct list_head free_list;
	/* outgoing page list */
	struct list_head outgoing_list;
	/* number of read I/O operations pending */
	page_count_t outstanding_reads;
	/* number of write I/O operations pending */
	page_count_t outstanding_writes;
	/* number of pages covered by the current flush */
	page_count_t pages_in_flush;
	/* number of pages waiting to be included in the next flush */
	page_count_t pages_to_flush;
	/* number of discards in progress */
	unsigned int discard_count;
	/* how many VPCs are waiting for a free page */
	unsigned int waiter_count;
	/* queue of waiters who want a free page */
	struct vdo_wait_queue free_waiters;
	/*
	 * Statistics are only updated on the logical zone thread, but are accessed from other
	 * threads.
	 */
	struct block_map_statistics stats;
	/* counter for pressure reports */
	u32 pressure_report;
	/* the block map zone to which this cache belongs */
	struct block_map_zone *zone;
};

/*
 * The state of a page buffer. If the page buffer is free, no particular page is bound to it;
 * otherwise the page buffer is bound to a particular page whose absolute pbn is in the pbn field.
 * If the page is resident or dirty, the page data is stable and may be accessed. Otherwise, the
 * page is in flight (incoming or outgoing) and its data should not be accessed.
 *
 * @note Update the static data in get_page_state_name() if you change this enumeration.
 */
enum vdo_page_buffer_state {
	/* this page buffer is not being used */
	PS_FREE,
	/* this page is being read from store */
	PS_INCOMING,
	/* attempt to load this page failed */
	PS_FAILED,
	/* this page is valid and un-modified */
	PS_RESIDENT,
	/* this page is valid and modified */
	PS_DIRTY,
	/* this page is being written and should not be used */
	PS_OUTGOING,
	/* not a state */
	PAGE_STATE_COUNT,
} __packed;
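
/*
 * Informal summary of the typical buffer lifecycle implied by the states above (an
 * illustrative reading, not an exhaustive transition diagram): PS_FREE -> PS_INCOMING while
 * the page is read in, then PS_RESIDENT on success or PS_FAILED on error; a resident page
 * becomes PS_DIRTY when modified and PS_OUTGOING while its write is in flight, returning to
 * PS_RESIDENT once the write completes.
 */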

/*
 * The write status of a page.
 */
enum vdo_page_write_status {
	WRITE_STATUS_NORMAL,
	WRITE_STATUS_DISCARD,
	WRITE_STATUS_DEFERRED,
} __packed;

/* Per-page-slot information. */
struct page_info {
	/* Preallocated page struct vio */
	struct vio *vio;
	/* back-link for references */
	struct vdo_page_cache *cache;
	/* the pbn of the page */
	physical_block_number_t pbn;
	/* page is busy (temporarily locked) */
	u16 busy;
	/* the write status of the page */
	enum vdo_page_write_status write_status;
	/* page state */
	enum vdo_page_buffer_state state;
	/* queue of completions awaiting this item */
	struct vdo_wait_queue waiting;
	/* state linked list entry */
	struct list_head state_entry;
	/* LRU entry */
	struct list_head lru_entry;
	/*
	 * The earliest recovery journal block containing uncommitted updates to the block map page
	 * associated with this page_info. A reference (lock) is held on that block to prevent it
	 * from being reaped. When this value changes, the reference on the old value must be
	 * released and a reference on the new value must be acquired.
	 */
	sequence_number_t recovery_lock;
};

/*
 * A completion awaiting a specific page. Also a live reference into the page once completed, until
 * freed.
 */
struct vdo_page_completion {
	/* The generic completion */
	struct vdo_completion completion;
	/* The cache involved */
	struct vdo_page_cache *cache;
	/* The waiter for the pending list */
	struct vdo_waiter waiter;
	/* The absolute physical block number of the page on disk */
	physical_block_number_t pbn;
	/* Whether the page may be modified */
	bool writable;
	/* Whether the page is available */
	bool ready;
	/* The info structure for the page, only valid when ready */
	struct page_info *info;
};
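
/*
 * Illustrative usage sketch. The vdo_* calls are declared later in this header; all other
 * names (holder, parent, zone, pbn, example_page_ready, example_page_error) are hypothetical
 * and stand in for the caller's own context. A caller typically embeds a vdo_page_completion,
 * requests the page, and releases the completion from its callback:
 *
 *	static void example_page_ready(struct vdo_completion *completion)
 *	{
 *		struct block_map_page *page;
 *		int result = vdo_get_cached_page(completion, &page);
 *
 *		if (result == VDO_SUCCESS) {
 *			// ... read or modify the page entries here ...
 *			vdo_request_page_write(completion);
 *		}
 *
 *		vdo_release_page_completion(completion);
 *	}
 *
 *	// Launched on the appropriate logical zone thread:
 *	vdo_get_page(&holder->page_completion, zone, pbn, true, parent,
 *		     example_page_ready, example_page_error, false);
 */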

struct forest;

struct tree_page {
	struct vdo_waiter waiter;

	/* Dirty list entry */
	struct list_head entry;

	/* If dirty, the tree zone flush generation in which it was last dirtied. */
	u8 generation;

	/* Whether this page is an interior tree page being written out. */
	bool writing;

	/* If writing, the tree zone flush generation of the copy being written. */
	u8 writing_generation;

	/*
	 * Sequence number of the earliest recovery journal block containing uncommitted updates to
	 * this page.
	 */
	sequence_number_t recovery_lock;

	/* The value of recovery_lock when this page last started writing */
	sequence_number_t writing_recovery_lock;

	char page_buffer[VDO_BLOCK_SIZE];
};

enum block_map_page_type {
	VDO_TREE_PAGE,
	VDO_CACHE_PAGE,
};

typedef struct list_head dirty_era_t[2];

struct dirty_lists {
	/* The number of periods after which an element will be expired */
	block_count_t maximum_age;
	/* The oldest period which has unexpired elements */
	sequence_number_t oldest_period;
	/* One more than the current period */
	sequence_number_t next_period;
	/* The offset in the array of lists of the oldest period */
	block_count_t offset;
	/* Expired pages */
	dirty_era_t expired;
	/* The lists of dirty pages */
	dirty_era_t eras[];
};
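
/*
 * Informal illustration (an assumption about intent, not a normative description of the
 * implementation): each element of eras[] is one dirty_era_t, i.e. a pair of lists presumably
 * indexed by enum block_map_page_type, holding the pages dirtied during a single recovery
 * journal period. With a maximum_age of 4, a page dirtied in period 10 would be expected to
 * move to the expired lists once the current period has advanced by roughly 4 periods, at
 * which point it must be written out so its recovery journal lock can be released.
 */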

struct block_map_zone {
	zone_count_t zone_number;
	thread_id_t thread_id;
	struct admin_state state;
	struct block_map *block_map;
	/* Dirty pages, by era */
	struct dirty_lists *dirty_lists;
	struct vdo_page_cache page_cache;
	data_vio_count_t active_lookups;
	struct int_map *loading_pages;
	struct vio_pool *vio_pool;
	/* The tree page which has issued or will be issuing a flush */
	struct tree_page *flusher;
	struct vdo_wait_queue flush_waiters;
	/* The generation after the most recent flush */
	u8 generation;
	u8 oldest_generation;
	/* The counts of dirty pages in each generation */
	u32 dirty_page_counts[256];
};

struct block_map {
	struct vdo *vdo;
	struct action_manager *action_manager;
	/* The absolute PBN of the first root of the tree part of the block map */
	physical_block_number_t root_origin;
	block_count_t root_count;

	/* The era point we are currently distributing to the zones */
	sequence_number_t current_era_point;
	/* The next era point */
	sequence_number_t pending_era_point;

	/* The number of entries in the block map */
	block_count_t entry_count;
	nonce_t nonce;
	struct recovery_journal *journal;

	/* The trees for finding block map pages */
	struct forest *forest;
	/* The expanded trees awaiting growth */
	struct forest *next_forest;
	/* The number of entries after growth */
	block_count_t next_entry_count;

	zone_count_t zone_count;
	struct block_map_zone zones[];
};

/**
 * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
 *                                 the forest.
 * @pbn: A PBN of a tree node.
 * @completion: The parent completion of the traversal.
 *
 * Return: VDO_SUCCESS or an error.
 */
typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
				     struct vdo_completion *completion);

static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
	return container_of(completion, struct vdo_page_completion, completion);
}

void vdo_release_page_completion(struct vdo_completion *completion);

void vdo_get_page(struct vdo_page_completion *page_completion,
		  struct block_map_zone *zone, physical_block_number_t pbn,
		  bool writable, void *parent, vdo_action_fn callback,
		  vdo_action_fn error_handler, bool requeue);

void vdo_request_page_write(struct vdo_completion *completion);

int __must_check vdo_get_cached_page(struct vdo_completion *completion,
				     struct block_map_page **page_ptr);

int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);

static inline struct block_map_page * __must_check
vdo_as_block_map_page(struct tree_page *tree_page)
{
	return (struct block_map_page *) tree_page->page_buffer;
}

bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
			 physical_block_number_t pbn,
			 struct block_map_page *page);

void vdo_find_block_map_slot(struct data_vio *data_vio);

physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
						    page_number_t page_number);

void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);

void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
			 struct vdo_completion *completion);

int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
				      block_count_t logical_blocks, struct vdo *vdo,
				      struct recovery_journal *journal, nonce_t nonce,
				      page_count_t cache_size, block_count_t maximum_age,
				      struct block_map **map_ptr);

void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
			 struct vdo_completion *parent);

void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);

int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
					       block_count_t new_logical_blocks);

void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);

void vdo_abandon_block_map_growth(struct block_map *map);

void vdo_free_block_map(struct block_map *map);

struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);

void vdo_initialize_block_map_from_journal(struct block_map *map,
					   struct recovery_journal *journal);

zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);

void vdo_advance_block_map_era(struct block_map *map,
			       sequence_number_t recovery_block_number);

void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
			       physical_block_number_t pbn,
			       enum block_mapping_state mapping_state,
			       sequence_number_t *recovery_lock);

void vdo_get_mapped_block(struct data_vio *data_vio);

void vdo_put_mapped_block(struct data_vio *data_vio);

struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);

/**
 * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format.
 * @age: The configured maximum age.
 *
 * Return: The converted age.
 *
 * In the old recovery journal format, each journal block held 311 entries, and every write bio
 * made two entries. The old maximum age was half the usable journal length. In the new format,
 * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
 * age so that the number of writes in a block map era is the same in the old and new formats. This
 * keeps the bound on the amount of work required to recover the block map from the recovery
 * journal the same across the format change. It also keeps the amortization of block map page
 * writes to write bios the same.
 */
static inline block_count_t vdo_convert_maximum_age(block_count_t age)
{
	return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
			    2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
}
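
/*
 * Worked example, assuming RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK is 311 and
 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK is 217, as the comment above describes: a configured age
 * of 10 old-format blocks represents 10 * 311 / 2 = 1555 write bios, so the converted age is
 * DIV_ROUND_UP(10 * 311, 2 * 217) = DIV_ROUND_UP(3110, 434) = 8 new-format blocks, the
 * smallest number of blocks holding at least that many writes (8 * 217 = 1736).
 */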

#endif /* VDO_BLOCK_MAP_H */