// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "repair.h"

#include <linux/min_heap.h>
#include <linux/minmax.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

#include "block-map.h"
#include "completion.h"
#include "constants.h"
#include "encodings.h"
#include "int-map.h"
#include "io-submitter.h"
#include "recovery-journal.h"
#include "slab-depot.h"
#include "types.h"
#include "vdo.h"
#include "wait-queue.h"

/*
 * An explicitly numbered block mapping. Numbering the mappings allows them to be sorted by logical
 * block number during repair while still preserving the relative order of journal entries with
 * the same logical block number.
 */
struct numbered_block_mapping {
	struct block_map_slot block_map_slot;
	struct block_map_entry block_map_entry;
	/* A serial number to use during replay */
	u32 number;
} __packed;

/*
 * The absolute position of an entry in the recovery journal, including the sector number and the
 * entry number within the sector.
 */
struct recovery_point {
	/* Block sequence number */
	sequence_number_t sequence_number;
	/* Sector number */
	u8 sector_count;
	/* Entry number */
	journal_entry_count_t entry_count;
	/* Whether or not the increment portion of the current entry has been applied */
	bool increment_applied;
};

DEFINE_MIN_HEAP(struct numbered_block_mapping, replay_heap);

struct repair_completion {
	/* The completion header */
	struct vdo_completion completion;

	/* A buffer to hold the data read off disk */
	char *journal_data;

	/* For loading the journal */
	data_vio_count_t vio_count;
	data_vio_count_t vios_complete;
	struct vio *vios;

	/* The number of entries to be applied to the block map */
	size_t block_map_entry_count;
	/* The sequence number of the first valid block for block map recovery */
	sequence_number_t block_map_head;
	/* The sequence number of the first valid block for slab journal replay */
	sequence_number_t slab_journal_head;
	/* The sequence number of the last valid block of the journal (if known) */
	sequence_number_t tail;
	/*
	 * The highest sequence number of the journal. During recovery (vs read-only rebuild), not
	 * the same as the tail, since the tail ignores blocks after the first hole.
	 */
	sequence_number_t highest_tail;

	/* The number of logical blocks currently known to be in use */
	block_count_t logical_blocks_used;
	/* The number of block map data blocks known to be allocated */
	block_count_t block_map_data_blocks;

	/* These fields are for playing the journal into the block map */
	/* The entry data for the block map recovery */
	struct numbered_block_mapping *entries;
	/* The number of entries in the entry array */
	size_t entry_count;
	/* number of pending (non-ready) requests */
	page_count_t outstanding;
	/* number of page completions */
	page_count_t page_count;
	bool launching;
	/*
	 * a heap wrapping journal_entries. It re-orders and sorts journal entries in ascending LBN
	 * order, then original journal order. This permits efficient iteration over the journal
	 * entries in order.
	 */
	struct replay_heap replay_heap;
	/* Fields tracking progress through the journal entries. */
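	/*
	 * The replay heap pops each next mapping into the tail of the entries array, so
	 * current_entry and current_unfetched_entry start at the highest array index and
	 * walk toward index 0 as the sorted entries are consumed.
	 */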
	struct numbered_block_mapping *current_entry;
	struct numbered_block_mapping *current_unfetched_entry;
	/* Current requested page's PBN */
	physical_block_number_t pbn;

	/* These fields are only used during recovery. */
	/* A location just beyond the last valid entry of the journal */
	struct recovery_point tail_recovery_point;
	/* The location of the next recovery journal entry to apply */
	struct recovery_point next_recovery_point;
	/* The journal point to give to the next synthesized decref */
	struct journal_point next_journal_point;
	/* The number of entries played into slab journals */
	size_t entries_added_to_slab_journals;

	/* These fields are only used during read-only rebuild */
	page_count_t page_to_fetch;
	/* the number of leaf pages in the block map */
	page_count_t leaf_pages;
	/* the last slot of the block map */
	struct block_map_slot last_slot;

	/*
	 * The page completions used for playing the journal into the block map, and, during
	 * read-only rebuild, for rebuilding the reference counts from the block map.
	 */
	struct vdo_page_completion page_completions[];
};

/*
 * This is a min_heap callback function that orders numbered_block_mappings using the
 * 'block_map_slot' field as the primary key and the mapping 'number' field as the secondary key.
 * Using the mapping number preserves the journal order of entries for the same slot, allowing us
 * to sort by slot while still ensuring we replay all entries with the same slot in the exact order
 * as they appeared in the journal.
 */
static bool mapping_is_less_than(const void *item1, const void *item2, void __always_unused *args)
{
	const struct numbered_block_mapping *mapping1 =
		(const struct numbered_block_mapping *) item1;
	const struct numbered_block_mapping *mapping2 =
		(const struct numbered_block_mapping *) item2;

	if (mapping1->block_map_slot.pbn != mapping2->block_map_slot.pbn)
		return mapping1->block_map_slot.pbn < mapping2->block_map_slot.pbn;

	if (mapping1->block_map_slot.slot != mapping2->block_map_slot.slot)
		return mapping1->block_map_slot.slot < mapping2->block_map_slot.slot;

	if (mapping1->number != mapping2->number)
		return mapping1->number < mapping2->number;

	return 0;
}

static void swap_mappings(void *item1, void *item2, void __always_unused *args)
{
	struct numbered_block_mapping *mapping1 = item1;
	struct numbered_block_mapping *mapping2 = item2;

	swap(*mapping1, *mapping2);
}

static const struct min_heap_callbacks repair_min_heap = {
	.less = mapping_is_less_than,
	.swp = swap_mappings,
};

static struct numbered_block_mapping *sort_next_heap_element(struct repair_completion *repair)
{
	struct replay_heap *heap = &repair->replay_heap;
	struct numbered_block_mapping *last;

	if (heap->nr == 0)
		return NULL;

	/*
	 * Swap the next heap element with the last one on the heap, popping it off the heap,
	 * restore the heap invariant, and return a pointer to the popped element.
	 */
	last = &repair->entries[--heap->nr];
	swap_mappings(heap->data, last, NULL);
	min_heap_sift_down(heap, 0, &repair_min_heap, NULL);
	return last;
}

/**
 * as_repair_completion() - Convert a generic completion to a repair_completion.
 * @completion: The completion to convert.
 *
 * Return: The repair_completion.
 */
static inline struct repair_completion * __must_check
as_repair_completion(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_REPAIR_COMPLETION);
	return container_of(completion, struct repair_completion, completion);
}

static void prepare_repair_completion(struct repair_completion *repair,
				      vdo_action_fn callback, enum vdo_zone_type zone_type)
{
	struct vdo_completion *completion = &repair->completion;
	const struct thread_config *thread_config = &completion->vdo->thread_config;
	thread_id_t thread_id;

	/* All block map access is done on a single thread, so use logical zone 0. */
	thread_id = ((zone_type == VDO_ZONE_TYPE_LOGICAL) ?
		     thread_config->logical_threads[0] :
		     thread_config->admin_thread);
	vdo_reset_completion(completion);
	vdo_set_completion_callback(completion, callback, thread_id);
}

static void launch_repair_completion(struct repair_completion *repair,
				     vdo_action_fn callback, enum vdo_zone_type zone_type)
{
	prepare_repair_completion(repair, callback, zone_type);
	vdo_launch_completion(&repair->completion);
}

static void uninitialize_vios(struct repair_completion *repair)
{
	while (repair->vio_count > 0)
		free_vio_components(&repair->vios[--repair->vio_count]);

	vdo_free(vdo_forget(repair->vios));
}

static void free_repair_completion(struct repair_completion *repair)
{
	if (repair == NULL)
		return;

	/*
	 * We do this here because this function is the only common bottleneck for all clean up
	 * paths.
	 */
	repair->completion.vdo->block_map->zones[0].page_cache.rebuilding = false;

	uninitialize_vios(repair);
	vdo_free(vdo_forget(repair->journal_data));
	vdo_free(vdo_forget(repair->entries));
	vdo_free(repair);
}

static void finish_repair(struct vdo_completion *completion)
{
	struct vdo_completion *parent = completion->parent;
	struct vdo *vdo = completion->vdo;
	struct repair_completion *repair = as_repair_completion(completion);

	vdo_assert_on_admin_thread(vdo, __func__);

	if (vdo->load_state != VDO_REBUILD_FOR_UPGRADE)
		vdo->states.vdo.complete_recoveries++;

	vdo_initialize_recovery_journal_post_repair(vdo->recovery_journal,
						    vdo->states.vdo.complete_recoveries,
						    repair->highest_tail,
						    repair->logical_blocks_used,
						    repair->block_map_data_blocks);
	free_repair_completion(vdo_forget(repair));

	if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
		vdo_log_info("Read-only rebuild complete");
		vdo_launch_completion(parent);
		return;
	}

	/* FIXME: shouldn't this say either "recovery" or "repair"? */
	vdo_log_info("Rebuild complete");

	/*
	 * Now that we've freed the repair completion and its vast array of journal entries, we
	 * can allocate refcounts.
	 */
	vdo_continue_completion(parent, vdo_allocate_reference_counters(vdo->depot));
}

/**
 * abort_repair() - Handle a repair error.
 * @completion: The repair completion.
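 *
 * Logs whether a read-only rebuild or a recovery was aborted, frees the repair
 * completion, and passes the error on to the parent completion.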
 */
static void abort_repair(struct vdo_completion *completion)
{
	struct vdo_completion *parent = completion->parent;
	int result = completion->result;
	struct repair_completion *repair = as_repair_completion(completion);

	if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state))
		vdo_log_info("Read-only rebuild aborted");
	else
		vdo_log_warning("Recovery aborted");

	free_repair_completion(vdo_forget(repair));
	vdo_continue_completion(parent, result);
}

/**
 * abort_on_error() - Abort a repair if there is an error.
 * @result: The result to check.
 * @repair: The repair completion.
 *
 * Return: true if the result was an error.
 */
static bool __must_check abort_on_error(int result, struct repair_completion *repair)
{
	if (result == VDO_SUCCESS)
		return false;

	vdo_fail_completion(&repair->completion, result);
	return true;
}

/**
 * drain_slab_depot() - Flush out all dirty refcount blocks now that they have been rebuilt or
 *                      recovered.
 * @completion: The repair completion.
 */
static void drain_slab_depot(struct vdo_completion *completion)
{
	struct vdo *vdo = completion->vdo;
	struct repair_completion *repair = as_repair_completion(completion);
	const struct admin_state_code *operation;

	vdo_assert_on_admin_thread(vdo, __func__);

	prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
	if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
		vdo_log_info("Saving rebuilt state");
		operation = VDO_ADMIN_STATE_REBUILDING;
	} else {
		vdo_log_info("Replayed %zu journal entries into slab journals",
			     repair->entries_added_to_slab_journals);
		operation = VDO_ADMIN_STATE_RECOVERING;
	}

	vdo_drain_slab_depot(vdo->depot, operation, completion);
}

/**
 * flush_block_map_updates() - Flush the block map now that all the reference counts are rebuilt.
 * @completion: The repair completion.
 *
 * This callback is registered in finish_if_done().
 */
static void flush_block_map_updates(struct vdo_completion *completion)
{
	vdo_assert_on_admin_thread(completion->vdo, __func__);

	vdo_log_info("Flushing block map changes");
	prepare_repair_completion(as_repair_completion(completion), drain_slab_depot,
				  VDO_ZONE_TYPE_ADMIN);
	vdo_drain_block_map(completion->vdo->block_map, VDO_ADMIN_STATE_RECOVERING,
			    completion);
}

static bool fetch_page(struct repair_completion *repair,
		       struct vdo_completion *completion);

/**
 * handle_page_load_error() - Handle an error loading a page.
 * @completion: The vdo_page_completion.
 */
static void handle_page_load_error(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	repair->outstanding--;
	vdo_set_completion_result(&repair->completion, completion->result);
	vdo_release_page_completion(completion);
	fetch_page(repair, completion);
}

/**
 * unmap_entry() - Unmap an invalid entry and indicate that its page must be written out.
 * @page: The page containing the entries
 * @completion: The page_completion for writing the page
 * @slot: The slot to unmap
 */
static void unmap_entry(struct block_map_page *page, struct vdo_completion *completion,
			slot_number_t slot)
{
	page->entries[slot] = UNMAPPED_BLOCK_MAP_ENTRY;
	vdo_request_page_write(completion);
}

/**
 * remove_out_of_bounds_entries() - Unmap entries which are outside the logical space.
 * @page: The page containing the entries
 * @completion: The page_completion for writing the page
 * @start: The first slot to check
 */
static void remove_out_of_bounds_entries(struct block_map_page *page,
					 struct vdo_completion *completion,
					 slot_number_t start)
{
	slot_number_t slot;

	for (slot = start; slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; slot++) {
		struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);

		if (vdo_is_mapped_location(&mapping))
			unmap_entry(page, completion, slot);
	}
}

/**
 * process_slot() - Update the reference counts for a single entry.
 * @page: The page containing the entries
 * @completion: The page_completion for writing the page
 * @slot: The slot to check
 *
 * Return: true if the entry was a valid mapping
 */
static bool process_slot(struct block_map_page *page, struct vdo_completion *completion,
			 slot_number_t slot)
{
	struct slab_depot *depot = completion->vdo->depot;
	int result;
	struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);

	if (!vdo_is_valid_location(&mapping)) {
		/* This entry is invalid, so remove it from the page. */
		unmap_entry(page, completion, slot);
		return false;
	}

	if (!vdo_is_mapped_location(&mapping))
		return false;

	if (mapping.pbn == VDO_ZERO_BLOCK)
		return true;

	if (!vdo_is_physical_data_block(depot, mapping.pbn)) {
		/*
		 * This is a nonsense mapping. Remove it from the map so we're at least consistent
		 * and mark the page dirty.
		 */
		unmap_entry(page, completion, slot);
		return false;
	}

	result = vdo_adjust_reference_count_for_rebuild(depot, mapping.pbn,
							VDO_JOURNAL_DATA_REMAPPING);
	if (result == VDO_SUCCESS)
		return true;

	vdo_log_error_strerror(result,
			       "Could not adjust reference count for PBN %llu, slot %u mapped to PBN %llu",
			       (unsigned long long) vdo_get_block_map_page_pbn(page),
			       slot, (unsigned long long) mapping.pbn);
	unmap_entry(page, completion, slot);
	return false;
}

/**
 * rebuild_reference_counts_from_page() - Rebuild reference counts from a block map page.
 * @repair: The repair completion.
 * @completion: The page completion holding the page.
 */
static void rebuild_reference_counts_from_page(struct repair_completion *repair,
					       struct vdo_completion *completion)
{
	slot_number_t slot, last_slot;
	struct block_map_page *page;
	int result;

	result = vdo_get_cached_page(completion, &page);
	if (result != VDO_SUCCESS) {
		vdo_set_completion_result(&repair->completion, result);
		return;
	}

	if (!page->header.initialized)
		return;

	/* Remove any bogus entries which exist beyond the end of the logical space. */
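	/* Only the final leaf page, identified by repair->last_slot, can contain them. */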
	if (vdo_get_block_map_page_pbn(page) == repair->last_slot.pbn) {
		last_slot = repair->last_slot.slot;
		remove_out_of_bounds_entries(page, completion, last_slot);
	} else {
		last_slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
	}

	/* Inform the slab depot of all entries on this page. */
	for (slot = 0; slot < last_slot; slot++) {
		if (process_slot(page, completion, slot))
			repair->logical_blocks_used++;
	}
}

/**
 * page_loaded() - Process a page which has just been loaded.
 * @completion: The vdo_page_completion for the fetched page.
 *
 * This callback is registered by fetch_page().
 */
static void page_loaded(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	repair->outstanding--;
	rebuild_reference_counts_from_page(repair, completion);
	vdo_release_page_completion(completion);

	/* Advance progress to the next page, and fetch the next page we haven't yet requested. */
	fetch_page(repair, completion);
}

static physical_block_number_t get_pbn_to_fetch(struct repair_completion *repair,
						struct block_map *block_map)
{
	physical_block_number_t pbn = VDO_ZERO_BLOCK;

	if (repair->completion.result != VDO_SUCCESS)
		return VDO_ZERO_BLOCK;

	while ((pbn == VDO_ZERO_BLOCK) && (repair->page_to_fetch < repair->leaf_pages))
		pbn = vdo_find_block_map_page_pbn(block_map, repair->page_to_fetch++);

	if (vdo_is_physical_data_block(repair->completion.vdo->depot, pbn))
		return pbn;

	vdo_set_completion_result(&repair->completion, VDO_BAD_MAPPING);
	return VDO_ZERO_BLOCK;
}

/**
 * fetch_page() - Fetch a page from the block map.
 * @repair: The repair_completion.
 * @completion: The page completion to use.
 *
 * Return: true if the rebuild is complete.
 */
static bool fetch_page(struct repair_completion *repair,
		       struct vdo_completion *completion)
{
	struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;
	struct block_map *block_map = repair->completion.vdo->block_map;
	physical_block_number_t pbn = get_pbn_to_fetch(repair, block_map);

	if (pbn != VDO_ZERO_BLOCK) {
		repair->outstanding++;
		/*
		 * We must set the requeue flag here to ensure that we don't blow the stack if all
		 * the requested pages are already in the cache or get load errors.
		 */
		vdo_get_page(page_completion, &block_map->zones[0], pbn, true, repair,
			     page_loaded, handle_page_load_error, true);
	}

	if (repair->outstanding > 0)
		return false;

	launch_repair_completion(repair, flush_block_map_updates, VDO_ZONE_TYPE_ADMIN);
	return true;
}

/**
 * rebuild_from_leaves() - Rebuild reference counts from the leaf block map pages.
 * @completion: The repair completion.
 *
 * Rebuilds reference counts from the leaf block map pages now that reference counts have been
 * rebuilt from the interior tree pages (which have been loaded in the process). This callback is
 * registered in rebuild_reference_counts().
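 * Each of the repair completion's page completions fetches one leaf page at a time, and
 * page_loaded() keeps requesting further pages until every leaf has been processed.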
 */
static void rebuild_from_leaves(struct vdo_completion *completion)
{
	page_count_t i;
	struct repair_completion *repair = as_repair_completion(completion);
	struct block_map *map = completion->vdo->block_map;

	repair->logical_blocks_used = 0;

	/*
	 * The PBN calculation doesn't work until the tree pages have been loaded, so we can't set
	 * this value at the start of repair.
	 */
	repair->leaf_pages = vdo_compute_block_map_page_count(map->entry_count);
	repair->last_slot = (struct block_map_slot) {
		.slot = map->entry_count % VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
		.pbn = vdo_find_block_map_page_pbn(map, repair->leaf_pages - 1),
	};
	if (repair->last_slot.slot == 0)
		repair->last_slot.slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;

	for (i = 0; i < repair->page_count; i++) {
		if (fetch_page(repair, &repair->page_completions[i].completion)) {
			/*
			 * The rebuild has already moved on, so it isn't safe nor is there a need
			 * to launch any more fetches.
			 */
			return;
		}
	}
}

/**
 * process_entry() - Process a single entry from the block map tree.
 * @pbn: A pbn which holds a block map tree page.
 * @completion: The parent completion of the traversal.
 *
 * Implements vdo_entry_callback_fn.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int process_entry(physical_block_number_t pbn, struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	struct slab_depot *depot = completion->vdo->depot;
	int result;

	if ((pbn == VDO_ZERO_BLOCK) || !vdo_is_physical_data_block(depot, pbn)) {
		return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
					      "PBN %llu out of range",
					      (unsigned long long) pbn);
	}

	result = vdo_adjust_reference_count_for_rebuild(depot, pbn,
							VDO_JOURNAL_BLOCK_MAP_REMAPPING);
	if (result != VDO_SUCCESS) {
		return vdo_log_error_strerror(result,
					      "Could not adjust reference count for block map tree PBN %llu",
					      (unsigned long long) pbn);
	}

	repair->block_map_data_blocks++;
	return VDO_SUCCESS;
}

static void rebuild_reference_counts(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	struct vdo *vdo = completion->vdo;
	struct vdo_page_cache *cache = &vdo->block_map->zones[0].page_cache;

	/* We must allocate ref_counts before we can rebuild them. */
	if (abort_on_error(vdo_allocate_reference_counters(vdo->depot), repair))
		return;

	/*
	 * Completion chaining from page cache hits can lead to stack overflow during the rebuild,
	 * so clear out the cache before this rebuild phase.
	 */
	if (abort_on_error(vdo_invalidate_page_cache(cache), repair))
		return;

	prepare_repair_completion(repair, rebuild_from_leaves, VDO_ZONE_TYPE_LOGICAL);
	vdo_traverse_forest(vdo->block_map, process_entry, completion);
}

static void increment_recovery_point(struct recovery_point *point)
{
	if (++point->entry_count < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
		return;

	point->entry_count = 0;
	if (point->sector_count < (VDO_SECTORS_PER_BLOCK - 1)) {
		point->sector_count++;
		return;
	}

	point->sequence_number++;
	point->sector_count = 1;
}

/**
 * advance_points() - Advance the current recovery and journal points.
 * @repair: The repair_completion whose points are to be advanced.
 * @entries_per_block: The number of entries in a recovery journal block.
 */
static void advance_points(struct repair_completion *repair,
			   journal_entry_count_t entries_per_block)
{
	if (!repair->next_recovery_point.increment_applied) {
		repair->next_recovery_point.increment_applied = true;
		return;
	}

	increment_recovery_point(&repair->next_recovery_point);
	vdo_advance_journal_point(&repair->next_journal_point, entries_per_block);
	repair->next_recovery_point.increment_applied = false;
}

/**
 * before_recovery_point() - Check whether the first point precedes the second point.
 * @first: The first recovery point.
 * @second: The second recovery point.
 *
 * Return: true if the first point precedes the second point.
 */
static bool __must_check before_recovery_point(const struct recovery_point *first,
					       const struct recovery_point *second)
{
	if (first->sequence_number < second->sequence_number)
		return true;

	if (first->sequence_number > second->sequence_number)
		return false;

	if (first->sector_count < second->sector_count)
		return true;

	return ((first->sector_count == second->sector_count) &&
		(first->entry_count < second->entry_count));
}

static struct packed_journal_sector * __must_check get_sector(struct recovery_journal *journal,
							      char *journal_data,
							      sequence_number_t sequence,
							      u8 sector_number)
{
	off_t offset;

	offset = ((vdo_get_recovery_journal_block_number(journal, sequence) * VDO_BLOCK_SIZE) +
		  (VDO_SECTOR_SIZE * sector_number));
	return (struct packed_journal_sector *) (journal_data + offset);
}

/**
 * get_entry() - Unpack the recovery journal entry associated with the given recovery point.
 * @repair: The repair completion.
 * @point: The recovery point.
 *
 * Return: The unpacked contents of the matching recovery journal entry.
 */
static struct recovery_journal_entry get_entry(const struct repair_completion *repair,
					       const struct recovery_point *point)
{
	struct packed_journal_sector *sector;

	sector = get_sector(repair->completion.vdo->recovery_journal,
			    repair->journal_data, point->sequence_number,
			    point->sector_count);
	return vdo_unpack_recovery_journal_entry(&sector->entries[point->entry_count]);
}

/**
 * validate_recovery_journal_entry() - Validate a recovery journal entry.
 * @vdo: The vdo.
 * @entry: The entry to validate.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int validate_recovery_journal_entry(const struct vdo *vdo,
					   const struct recovery_journal_entry *entry)
{
	if ((entry->slot.pbn >= vdo->states.vdo.config.physical_blocks) ||
	    (entry->slot.slot >= VDO_BLOCK_MAP_ENTRIES_PER_PAGE) ||
	    !vdo_is_valid_location(&entry->mapping) ||
	    !vdo_is_valid_location(&entry->unmapping) ||
	    !vdo_is_physical_data_block(vdo->depot, entry->mapping.pbn) ||
	    !vdo_is_physical_data_block(vdo->depot, entry->unmapping.pbn)) {
		return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not within bounds",
					      vdo_get_journal_operation_name(entry->operation),
					      (unsigned long long) entry->slot.pbn,
					      entry->slot.slot,
					      (unsigned long long) entry->unmapping.pbn,
					      (unsigned long long) entry->mapping.pbn);
	}

	if ((entry->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) &&
	    (vdo_is_state_compressed(entry->mapping.state) ||
	     (entry->mapping.pbn == VDO_ZERO_BLOCK) ||
	     (entry->unmapping.state != VDO_MAPPING_STATE_UNMAPPED) ||
	     (entry->unmapping.pbn != VDO_ZERO_BLOCK))) {
		return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not a valid tree mapping",
					      vdo_get_journal_operation_name(entry->operation),
					      (unsigned long long) entry->slot.pbn,
					      entry->slot.slot,
					      (unsigned long long) entry->unmapping.pbn,
					      (unsigned long long) entry->mapping.pbn);
	}

	return VDO_SUCCESS;
}

/**
 * add_slab_journal_entries() - Replay recovery journal entries into the slab journals of the
 *                              allocator currently being recovered.
 * @completion: The allocator completion.
 *
 * Waits for slab journal tailblock space when necessary. This method is its own callback.
 */
static void add_slab_journal_entries(struct vdo_completion *completion)
{
	struct recovery_point *recovery_point;
	struct repair_completion *repair = completion->parent;
	struct vdo *vdo = completion->vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	/* Get ready in case we need to enqueue again. */
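	/*
	 * If vdo_attempt_replay_into_slab() must wait for space in a slab journal, this
	 * completion is re-queued and re-enters here; next_recovery_point records how far
	 * the replay has already progressed.
	 */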
	vdo_prepare_completion(completion, add_slab_journal_entries,
			       vdo_notify_slab_journals_are_recovered,
			       completion->callback_thread_id, repair);
	for (recovery_point = &repair->next_recovery_point;
	     before_recovery_point(recovery_point, &repair->tail_recovery_point);
	     advance_points(repair, journal->entries_per_block)) {
		int result;
		physical_block_number_t pbn;
		struct vdo_slab *slab;
		struct recovery_journal_entry entry = get_entry(repair, recovery_point);
		bool increment = !repair->next_recovery_point.increment_applied;

		if (increment) {
			result = validate_recovery_journal_entry(vdo, &entry);
			if (result != VDO_SUCCESS) {
				vdo_enter_read_only_mode(vdo, result);
				vdo_fail_completion(completion, result);
				return;
			}

			pbn = entry.mapping.pbn;
		} else {
			pbn = entry.unmapping.pbn;
		}

		if (pbn == VDO_ZERO_BLOCK)
			continue;

		slab = vdo_get_slab(vdo->depot, pbn);
		if (slab->allocator != allocator)
			continue;

		if (!vdo_attempt_replay_into_slab(slab, pbn, entry.operation, increment,
						  &repair->next_journal_point,
						  completion))
			return;

		repair->entries_added_to_slab_journals++;
	}

	vdo_notify_slab_journals_are_recovered(completion);
}

/**
 * vdo_replay_into_slab_journals() - Replay recovery journal entries in the slab journals of slabs
 *                                   owned by a given block_allocator.
 * @allocator: The allocator whose slab journals are to be recovered.
 * @context: The slab depot load context supplied by a recovery when it loads the depot.
 */
void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *context)
{
	struct vdo_completion *completion = &allocator->completion;
	struct repair_completion *repair = context;
	struct vdo *vdo = completion->vdo;

	vdo_assert_on_physical_zone_thread(vdo, allocator->zone_number, __func__);
	if (repair->entry_count == 0) {
		/* there's nothing to replay */
		repair->logical_blocks_used = vdo->recovery_journal->logical_blocks_used;
		repair->block_map_data_blocks = vdo->recovery_journal->block_map_data_blocks;
		vdo_notify_slab_journals_are_recovered(completion);
		return;
	}

	repair->next_recovery_point = (struct recovery_point) {
		.sequence_number = repair->slab_journal_head,
		.sector_count = 1,
		.entry_count = 0,
	};

	repair->next_journal_point = (struct journal_point) {
		.sequence_number = repair->slab_journal_head,
		.entry_count = 0,
	};

	vdo_log_info("Replaying entries into slab journals for zone %u",
		     allocator->zone_number);
	completion->parent = repair;
	add_slab_journal_entries(completion);
}

static void load_slab_depot(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	const struct admin_state_code *operation;

	vdo_assert_on_admin_thread(completion->vdo, __func__);

	if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state)) {
		prepare_repair_completion(repair, rebuild_reference_counts,
					  VDO_ZONE_TYPE_LOGICAL);
		operation = VDO_ADMIN_STATE_LOADING_FOR_REBUILD;
	} else {
		prepare_repair_completion(repair, drain_slab_depot, VDO_ZONE_TYPE_ADMIN);
		operation = VDO_ADMIN_STATE_LOADING_FOR_RECOVERY;
	}

	vdo_load_slab_depot(completion->vdo->depot, operation, completion, repair);
}

static void flush_block_map(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	const struct admin_state_code *operation;

	vdo_assert_on_admin_thread(completion->vdo, __func__);

	vdo_log_info("Flushing block map changes");
	prepare_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
	operation = (vdo_state_requires_read_only_rebuild(completion->vdo->load_state) ?
		     VDO_ADMIN_STATE_REBUILDING :
		     VDO_ADMIN_STATE_RECOVERING);
	vdo_drain_block_map(completion->vdo->block_map, operation, completion);
}

static bool finish_if_done(struct repair_completion *repair)
{
	/* Pages are still being launched or there is still work to do */
	if (repair->launching || (repair->outstanding > 0))
		return false;

	if (repair->completion.result != VDO_SUCCESS) {
		page_count_t i;

		for (i = 0; i < repair->page_count; i++) {
			struct vdo_page_completion *page_completion =
				&repair->page_completions[i];

			if (page_completion->ready)
				vdo_release_page_completion(&page_completion->completion);
		}

		vdo_launch_completion(&repair->completion);
		return true;
	}

	if (repair->current_entry >= repair->entries)
		return false;

	launch_repair_completion(repair, flush_block_map, VDO_ZONE_TYPE_ADMIN);
	return true;
}

static void abort_block_map_recovery(struct repair_completion *repair, int result)
{
	vdo_set_completion_result(&repair->completion, result);
	finish_if_done(repair);
}

/**
 * find_entry_starting_next_page() - Find the first journal entry after a given entry which is not
 *                                   on the same block map page.
 * @repair: The repair completion.
 * @current_entry: The entry to search from.
 * @needs_sort: Whether sorting is needed to proceed.
 *
 * Return: Pointer to the first later journal entry on a different block map page, or a pointer to
 *         just before the journal entries if no subsequent entry is on a different block map page.
 */
static struct numbered_block_mapping *
find_entry_starting_next_page(struct repair_completion *repair,
			      struct numbered_block_mapping *current_entry, bool needs_sort)
{
	size_t current_page;

	/* If current_entry is invalid, return immediately. */
	if (current_entry < repair->entries)
		return current_entry;

	current_page = current_entry->block_map_slot.pbn;

	/* Decrement current_entry until it's out of bounds or on a different page. */
	while ((current_entry >= repair->entries) &&
	       (current_entry->block_map_slot.pbn == current_page)) {
		if (needs_sort) {
			struct numbered_block_mapping *just_sorted_entry =
				sort_next_heap_element(repair);
			VDO_ASSERT_LOG_ONLY(just_sorted_entry < current_entry,
					    "heap is returning elements in an unexpected order");
		}

		current_entry--;
	}

	return current_entry;
}

/*
 * Apply a range of journal entries [starting_entry, ending_entry) to a block map page.
 */
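/*
 * The entries array is sorted with the smallest (slot, journal number) at the highest
 * index, so walking backward from starting_entry applies the entries for each slot in
 * journal order and leaves the newest mapping in the page.
 */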
static void apply_journal_entries_to_page(struct block_map_page *page,
					  struct numbered_block_mapping *starting_entry,
					  struct numbered_block_mapping *ending_entry)
{
	struct numbered_block_mapping *current_entry = starting_entry;

	while (current_entry != ending_entry) {
		page->entries[current_entry->block_map_slot.slot] = current_entry->block_map_entry;
		current_entry--;
	}
}

static void recover_ready_pages(struct repair_completion *repair,
				struct vdo_completion *completion);

static void block_map_page_loaded(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion->parent);

	repair->outstanding--;
	if (!repair->launching)
		recover_ready_pages(repair, completion);
}

static void handle_block_map_page_load_error(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion->parent);

	repair->outstanding--;
	abort_block_map_recovery(repair, completion->result);
}

static void fetch_block_map_page(struct repair_completion *repair,
				 struct vdo_completion *completion)
{
	physical_block_number_t pbn;

	if (repair->current_unfetched_entry < repair->entries)
		/* Nothing left to fetch. */
		return;

	/* Fetch the next page we haven't yet requested. */
	pbn = repair->current_unfetched_entry->block_map_slot.pbn;
	repair->current_unfetched_entry =
		find_entry_starting_next_page(repair, repair->current_unfetched_entry,
					      true);
	repair->outstanding++;
	vdo_get_page(((struct vdo_page_completion *) completion),
		     &repair->completion.vdo->block_map->zones[0], pbn, true,
		     &repair->completion, block_map_page_loaded,
		     handle_block_map_page_load_error, false);
}

static struct vdo_page_completion *get_next_page_completion(struct repair_completion *repair,
							     struct vdo_page_completion *completion)
{
	completion++;
	if (completion == (&repair->page_completions[repair->page_count]))
		completion = &repair->page_completions[0];
	return completion;
}

static void recover_ready_pages(struct repair_completion *repair,
				struct vdo_completion *completion)
{
	struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;

	if (finish_if_done(repair))
		return;

	if (repair->pbn != page_completion->pbn)
		return;

	while (page_completion->ready) {
		struct numbered_block_mapping *start_of_next_page;
		struct block_map_page *page;
		int result;

		result = vdo_get_cached_page(completion, &page);
		if (result != VDO_SUCCESS) {
			abort_block_map_recovery(repair, result);
			return;
		}

		start_of_next_page =
			find_entry_starting_next_page(repair, repair->current_entry,
						      false);
		apply_journal_entries_to_page(page, repair->current_entry,
					      start_of_next_page);
		repair->current_entry = start_of_next_page;
		vdo_request_page_write(completion);
		vdo_release_page_completion(completion);

		if (finish_if_done(repair))
			return;

		repair->pbn = repair->current_entry->block_map_slot.pbn;
		fetch_block_map_page(repair, completion);
		page_completion = get_next_page_completion(repair, page_completion);
		completion = &page_completion->completion;
	}
}

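/**
 * recover_block_map() - Replay the sorted journal mappings into the block map.
 * @completion: The repair completion.
 *
 * This callback is registered in finish_journal_load().
 */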
static void recover_block_map(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	struct vdo *vdo = completion->vdo;
	struct numbered_block_mapping *first_sorted_entry;
	page_count_t i;

	vdo_assert_on_logical_zone_thread(vdo, 0, __func__);

	/* Suppress block map errors. */
	vdo->block_map->zones[0].page_cache.rebuilding =
		vdo_state_requires_read_only_rebuild(vdo->load_state);

	if (repair->block_map_entry_count == 0) {
		vdo_log_info("Replaying 0 recovery entries into block map");
		vdo_free(vdo_forget(repair->journal_data));
		launch_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
		return;
	}

	/*
	 * Organize the journal entries into a binary heap so we can iterate over them in sorted
	 * order incrementally, avoiding an expensive sort call.
	 */
	repair->replay_heap = (struct replay_heap) {
		.data = repair->entries,
		.nr = repair->block_map_entry_count,
		.size = repair->block_map_entry_count,
	};
	min_heapify_all(&repair->replay_heap, &repair_min_heap, NULL);

	vdo_log_info("Replaying %zu recovery entries into block map",
		     repair->block_map_entry_count);

	repair->current_entry = &repair->entries[repair->block_map_entry_count - 1];
	first_sorted_entry = sort_next_heap_element(repair);
	VDO_ASSERT_LOG_ONLY(first_sorted_entry == repair->current_entry,
			    "heap is returning elements in an unexpected order");

	/* Prevent any page from being processed until all pages have been launched. */
	repair->launching = true;
	repair->pbn = repair->current_entry->block_map_slot.pbn;
	repair->current_unfetched_entry = repair->current_entry;
	for (i = 0; i < repair->page_count; i++) {
		if (repair->current_unfetched_entry < repair->entries)
			break;

		fetch_block_map_page(repair, &repair->page_completions[i].completion);
	}
	repair->launching = false;

	/* Process any ready pages. */
	recover_ready_pages(repair, &repair->page_completions[0].completion);
}

/**
 * get_recovery_journal_block_header() - Get the block header for a block at a position in the
 *                                       journal data and unpack it.
 * @journal: The recovery journal.
 * @data: The recovery journal data.
 * @sequence: The sequence number.
 *
 * Return: The unpacked header.
 */
static struct recovery_block_header __must_check
get_recovery_journal_block_header(struct recovery_journal *journal, char *data,
				  sequence_number_t sequence)
{
	physical_block_number_t pbn =
		vdo_get_recovery_journal_block_number(journal, sequence);
	char *header = &data[pbn * VDO_BLOCK_SIZE];

	return vdo_unpack_recovery_block_header((struct packed_journal_header *) header);
}

/**
 * is_valid_recovery_journal_block() - Determine whether the given header describes a valid block
 *                                     for the given journal.
 * @journal: The journal to use.
 * @header: The unpacked block header to check.
 * @old_ok: Whether an old format header is valid.
 *
 * A block is not valid if it is unformatted, or if it is older than the last successful recovery
 * or reformat.
 *
 * Return: True if the header is valid.
 */
static bool __must_check is_valid_recovery_journal_block(const struct recovery_journal *journal,
							 const struct recovery_block_header *header,
							 bool old_ok)
{
	if ((header->nonce != journal->nonce) ||
	    (header->recovery_count != journal->recovery_count))
		return false;

	if (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL_2)
		return (header->entry_count <= journal->entries_per_block);

	return (old_ok &&
		(header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL) &&
		(header->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK));
}

/**
 * is_exact_recovery_journal_block() - Determine whether the given header describes the exact block
 *                                     indicated.
 * @journal: The journal to use.
 * @header: The unpacked block header to check.
 * @sequence: The expected sequence number.
 * @type: The expected metadata type.
 *
 * Return: True if the block matches.
 */
static bool __must_check is_exact_recovery_journal_block(const struct recovery_journal *journal,
							 const struct recovery_block_header *header,
							 sequence_number_t sequence,
							 enum vdo_metadata_type type)
{
	return ((header->metadata_type == type) &&
		(header->sequence_number == sequence) &&
		(is_valid_recovery_journal_block(journal, header, true)));
}

/**
 * find_recovery_journal_head_and_tail() - Find the tail and head of the journal.
 * @repair: The repair completion.
 *
 * Return: True if there were valid journal blocks.
 */
static bool find_recovery_journal_head_and_tail(struct repair_completion *repair)
{
	struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
	bool found_entries = false;
	physical_block_number_t i;

	/*
	 * Ensure that we don't replay old entries since we know the tail recorded in the super
	 * block must be a lower bound. Not doing so can result in extra data loss by setting the
	 * tail too early.
	 */
	repair->highest_tail = journal->tail;
	for (i = 0; i < journal->size; i++) {
		struct recovery_block_header header =
			get_recovery_journal_block_header(journal, repair->journal_data, i);

		if (!is_valid_recovery_journal_block(journal, &header, true)) {
			/* This block is old or incorrectly formatted */
			continue;
		}

		if (vdo_get_recovery_journal_block_number(journal, header.sequence_number) != i) {
			/* This block is in the wrong location */
			continue;
		}

		if (header.sequence_number >= repair->highest_tail) {
			found_entries = true;
			repair->highest_tail = header.sequence_number;
		}

		if (!found_entries)
			continue;

		if (header.block_map_head > repair->block_map_head)
			repair->block_map_head = header.block_map_head;

		if (header.slab_journal_head > repair->slab_journal_head)
			repair->slab_journal_head = header.slab_journal_head;
	}

	return found_entries;
}

/**
 * unpack_entry() - Unpack a recovery journal entry in either format.
 * @vdo: The vdo.
 * @packed: The entry to unpack.
 * @format: The expected format of the entry.
 * @entry: The unpacked entry.
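 *
 * Old-format (version 1) entries are converted to the current in-memory representation;
 * entries with an unrecognized operation are rejected.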
 *
 * Return: true if the entry should be applied.
 */
static bool unpack_entry(struct vdo *vdo, char *packed, enum vdo_metadata_type format,
			 struct recovery_journal_entry *entry)
{
	if (format == VDO_METADATA_RECOVERY_JOURNAL_2) {
		struct packed_recovery_journal_entry *packed_entry =
			(struct packed_recovery_journal_entry *) packed;

		*entry = vdo_unpack_recovery_journal_entry(packed_entry);
	} else {
		physical_block_number_t low32, high4;

		struct packed_recovery_journal_entry_1 *packed_entry =
			(struct packed_recovery_journal_entry_1 *) packed;

		if (packed_entry->operation == VDO_JOURNAL_DATA_INCREMENT)
			entry->operation = VDO_JOURNAL_DATA_REMAPPING;
		else if (packed_entry->operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT)
			entry->operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING;
		else
			return false;

		low32 = __le32_to_cpu(packed_entry->pbn_low_word);
		high4 = packed_entry->pbn_high_nibble;
		entry->slot = (struct block_map_slot) {
			.pbn = ((high4 << 32) | low32),
			.slot = (packed_entry->slot_low | (packed_entry->slot_high << 6)),
		};
		entry->mapping = vdo_unpack_block_map_entry(&packed_entry->block_map_entry);
		entry->unmapping = (struct data_location) {
			.pbn = VDO_ZERO_BLOCK,
			.state = VDO_MAPPING_STATE_UNMAPPED,
		};
	}

	return (validate_recovery_journal_entry(vdo, entry) == VDO_SUCCESS);
}

/**
 * append_sector_entries() - Append an array of recovery journal entries from a journal block
 *                           sector to the array of numbered mappings in the repair completion,
 *                           numbering each entry in the order they are appended.
 * @repair: The repair completion.
 * @entries: The entries in the sector.
 * @format: The format of the sector.
 * @entry_count: The number of entries to append.
 */
static void append_sector_entries(struct repair_completion *repair, char *entries,
				  enum vdo_metadata_type format,
				  journal_entry_count_t entry_count)
{
	journal_entry_count_t i;
	struct vdo *vdo = repair->completion.vdo;
	off_t increment = ((format == VDO_METADATA_RECOVERY_JOURNAL_2)
			   ? sizeof(struct packed_recovery_journal_entry)
			   : sizeof(struct packed_recovery_journal_entry_1));

	for (i = 0; i < entry_count; i++, entries += increment) {
		struct recovery_journal_entry entry;

		if (!unpack_entry(vdo, entries, format, &entry))
			/* When recovering from read-only mode, ignore damaged entries. */
			continue;

		repair->entries[repair->block_map_entry_count] =
			(struct numbered_block_mapping) {
			.block_map_slot = entry.slot,
			.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
								    entry.mapping.state),
			.number = repair->block_map_entry_count,
		};
		repair->block_map_entry_count++;
	}
}

static journal_entry_count_t entries_per_sector(enum vdo_metadata_type format,
						u8 sector_number)
{
	if (format == VDO_METADATA_RECOVERY_JOURNAL_2)
		return RECOVERY_JOURNAL_ENTRIES_PER_SECTOR;

	return ((sector_number == (VDO_SECTORS_PER_BLOCK - 1))
		? RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR
		: RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR);
}

static void extract_entries_from_block(struct repair_completion *repair,
				       struct recovery_journal *journal,
				       sequence_number_t sequence,
				       enum vdo_metadata_type format,
				       journal_entry_count_t entries)
{
	sector_count_t i;
	struct recovery_block_header header =
		get_recovery_journal_block_header(journal, repair->journal_data,
						  sequence);

	if (!is_exact_recovery_journal_block(journal, &header, sequence, format)) {
		/* This block is invalid, so skip it. */
		return;
	}

	entries = min(entries, header.entry_count);
	for (i = 1; i < VDO_SECTORS_PER_BLOCK; i++) {
		struct packed_journal_sector *sector =
			get_sector(journal, repair->journal_data, sequence, i);
		journal_entry_count_t sector_entries =
			min(entries, entries_per_sector(format, i));

		if (vdo_is_valid_recovery_journal_sector(&header, sector, i)) {
			/* Only extract as many as the block header calls for. */
			append_sector_entries(repair, (char *) sector->entries, format,
					      min_t(journal_entry_count_t,
						    sector->entry_count,
						    sector_entries));
		}

		/*
		 * Even if the sector wasn't full, count it as full when counting up to the
		 * entry count the block header claims.
		 */
		entries -= sector_entries;
	}
}

static int parse_journal_for_rebuild(struct repair_completion *repair)
{
	int result;
	sequence_number_t i;
	block_count_t count;
	enum vdo_metadata_type format;
	struct vdo *vdo = repair->completion.vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	journal_entry_count_t entries_per_block = journal->entries_per_block;

	format = get_recovery_journal_block_header(journal, repair->journal_data,
						   repair->highest_tail).metadata_type;
	if (format == VDO_METADATA_RECOVERY_JOURNAL)
		entries_per_block = RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK;

	/*
	 * Allocate an array of numbered_block_mapping structures large enough to transcribe every
	 * packed_recovery_journal_entry from every valid journal block.
	 */
	count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block);
	result = vdo_allocate(count, struct numbered_block_mapping, __func__,
			      &repair->entries);
	if (result != VDO_SUCCESS)
		return result;

	for (i = repair->block_map_head; i <= repair->highest_tail; i++)
		extract_entries_from_block(repair, journal, i, format, entries_per_block);

	return VDO_SUCCESS;
}

static int validate_heads(struct repair_completion *repair)
{
	/* Both reap heads must be behind the tail. */
	if ((repair->block_map_head <= repair->tail) &&
	    (repair->slab_journal_head <= repair->tail))
		return VDO_SUCCESS;

	return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
				      "Journal tail too early. block map head: %llu, slab journal head: %llu, tail: %llu",
				      (unsigned long long) repair->block_map_head,
				      (unsigned long long) repair->slab_journal_head,
				      (unsigned long long) repair->tail);
}

/**
 * extract_new_mappings() - Find all valid new mappings to be applied to the block map.
 * @repair: The repair completion.
 *
 * The mappings are extracted from the journal and stored in a sortable array so that all of the
 * mappings to be applied to a given block map page can be done in a single page fetch.
 */
static int extract_new_mappings(struct repair_completion *repair)
{
	int result;
	struct vdo *vdo = repair->completion.vdo;
	struct recovery_point recovery_point = {
		.sequence_number = repair->block_map_head,
		.sector_count = 1,
		.entry_count = 0,
	};

	/*
	 * Allocate an array of numbered_block_mapping structs just large enough to transcribe
	 * every packed_recovery_journal_entry from every valid journal block.
	 */
	result = vdo_allocate(repair->entry_count, struct numbered_block_mapping,
			      __func__, &repair->entries);
	if (result != VDO_SUCCESS)
		return result;

	for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point);
	     increment_recovery_point(&recovery_point)) {
		struct recovery_journal_entry entry = get_entry(repair, &recovery_point);

		result = validate_recovery_journal_entry(vdo, &entry);
		if (result != VDO_SUCCESS) {
			vdo_enter_read_only_mode(vdo, result);
			return result;
		}

		repair->entries[repair->block_map_entry_count] =
			(struct numbered_block_mapping) {
			.block_map_slot = entry.slot,
			.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
								    entry.mapping.state),
			.number = repair->block_map_entry_count,
		};
		repair->block_map_entry_count++;
	}

	result = VDO_ASSERT((repair->block_map_entry_count <= repair->entry_count),
			    "approximate entry count is an upper bound");
	if (result != VDO_SUCCESS)
		vdo_enter_read_only_mode(vdo, result);

	return result;
}

/**
 * compute_usages() - Compute the LBNs in use and block map data block counts from the tail of
 *                    the journal.
 * @repair: The repair completion.
 */
static noinline int compute_usages(struct repair_completion *repair)
{
	/*
	 * This function is declared noinline to avoid a spurious valgrind error regarding the
	 * following structure being uninitialized.
	 */
	struct recovery_point recovery_point = {
		.sequence_number = repair->tail,
		.sector_count = 1,
		.entry_count = 0,
	};

	struct vdo *vdo = repair->completion.vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	struct recovery_block_header header =
		get_recovery_journal_block_header(journal, repair->journal_data,
						  repair->tail);

	repair->logical_blocks_used = header.logical_blocks_used;
	repair->block_map_data_blocks = header.block_map_data_blocks;

	for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point);
	     increment_recovery_point(&recovery_point)) {
		struct recovery_journal_entry entry = get_entry(repair, &recovery_point);
		int result;

		result = validate_recovery_journal_entry(vdo, &entry);
		if (result != VDO_SUCCESS) {
			vdo_enter_read_only_mode(vdo, result);
			return result;
		}

		if (entry.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
			repair->block_map_data_blocks++;
			continue;
		}

		if (vdo_is_mapped_location(&entry.mapping))
			repair->logical_blocks_used++;

		if (vdo_is_mapped_location(&entry.unmapping))
			repair->logical_blocks_used--;
	}

	return VDO_SUCCESS;
}

static int parse_journal_for_recovery(struct repair_completion *repair)
{
	int result;
	sequence_number_t i, head;
	bool found_entries = false;
	struct recovery_journal *journal = repair->completion.vdo->recovery_journal;

	head = min(repair->block_map_head, repair->slab_journal_head);
	for (i = head; i <= repair->highest_tail; i++) {
		struct recovery_block_header header;
		journal_entry_count_t block_entries;
		u8 j;

		repair->tail = i;
		repair->tail_recovery_point = (struct recovery_point) {
			.sequence_number = i,
			.sector_count = 0,
			.entry_count = 0,
		};

		header = get_recovery_journal_block_header(journal, repair->journal_data, i);
		if (header.metadata_type == VDO_METADATA_RECOVERY_JOURNAL) {
			/* This is an old format block, so we need to upgrade */
			vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
					       "Recovery journal is in the old format, a read-only rebuild is required.");
			vdo_enter_read_only_mode(repair->completion.vdo,
						 VDO_UNSUPPORTED_VERSION);
			return VDO_UNSUPPORTED_VERSION;
		}

		if (!is_exact_recovery_journal_block(journal, &header, i,
						     VDO_METADATA_RECOVERY_JOURNAL_2)) {
			/* A bad block header was found so this must be the end of the journal. */
			break;
		}

		block_entries = header.entry_count;

		/* Examine each sector in turn to determine the last valid sector. */
		for (j = 1; j < VDO_SECTORS_PER_BLOCK; j++) {
			struct packed_journal_sector *sector =
				get_sector(journal, repair->journal_data, i, j);
			journal_entry_count_t sector_entries =
				min_t(journal_entry_count_t, sector->entry_count,
				      block_entries);

			/* A bad sector means that this block was torn. */
			if (!vdo_is_valid_recovery_journal_sector(&header, sector, j))
				break;

			if (sector_entries > 0) {
				found_entries = true;
				repair->tail_recovery_point.sector_count++;
				repair->tail_recovery_point.entry_count = sector_entries;
				block_entries -= sector_entries;
				repair->entry_count += sector_entries;
			}

			/* If this sector is short, the later sectors can't matter. */
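			/*
			 * Either the write tore at this sector, or the entry count claimed
			 * by the block header has been exhausted.
			 */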
			if ((sector_entries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) ||
			    (block_entries == 0))
				break;
		}

		/* If this block was not filled, or if it tore, no later block can matter. */
		if ((header.entry_count != journal->entries_per_block) || (block_entries > 0))
			break;
	}

	if (!found_entries)
		return validate_heads(repair);

	/* Set the tail to the last valid tail block, if there is one. */
	if (repair->tail_recovery_point.sector_count == 0)
		repair->tail--;

	result = validate_heads(repair);
	if (result != VDO_SUCCESS)
		return result;

	vdo_log_info("Highest-numbered recovery journal block has sequence number %llu, and the highest-numbered usable block is %llu",
		     (unsigned long long) repair->highest_tail,
		     (unsigned long long) repair->tail);

	result = extract_new_mappings(repair);
	if (result != VDO_SUCCESS)
		return result;

	return compute_usages(repair);
}

static int parse_journal(struct repair_completion *repair)
{
	if (!find_recovery_journal_head_and_tail(repair))
		return VDO_SUCCESS;

	return (vdo_state_requires_read_only_rebuild(repair->completion.vdo->load_state) ?
		parse_journal_for_rebuild(repair) :
		parse_journal_for_recovery(repair));
}

static void finish_journal_load(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	if (++repair->vios_complete != repair->vio_count)
		return;

	vdo_log_info("Finished reading recovery journal");
	uninitialize_vios(repair);
	prepare_repair_completion(repair, recover_block_map, VDO_ZONE_TYPE_LOGICAL);
	vdo_continue_completion(&repair->completion, parse_journal(repair));
}

static void handle_journal_load_error(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	/* Preserve the error */
	vdo_set_completion_result(&repair->completion, completion->result);
	vio_record_metadata_io_error(as_vio(completion));
	completion->callback(completion);
}

static void read_journal_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct vdo *vdo = vio->completion.vdo;

	continue_vio_after_io(vio, finish_journal_load, vdo->thread_config.admin_thread);
}

/**
 * vdo_repair() - Load the recovery journal and then recover or rebuild a vdo.
 * @parent: The completion to notify when the operation is complete.
 */
void vdo_repair(struct vdo_completion *parent)
{
	int result;
	char *ptr;
	struct repair_completion *repair;
	struct vdo *vdo = parent->vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	physical_block_number_t pbn = journal->origin;
	block_count_t remaining = journal->size;
	block_count_t vio_count = DIV_ROUND_UP(remaining, MAX_BLOCKS_PER_VIO);
	page_count_t page_count = min_t(page_count_t,
					vdo->device_config->cache_size >> 1,
					MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS);

	vdo_assert_on_admin_thread(vdo, __func__);

	if (vdo->load_state == VDO_FORCE_REBUILD) {
		vdo_log_warning("Rebuilding reference counts to clear read-only mode");
		vdo->states.vdo.read_only_recoveries++;
	} else if (vdo->load_state == VDO_REBUILD_FOR_UPGRADE) {
		vdo_log_warning("Rebuilding reference counts for upgrade");
	} else {
		vdo_log_warning("Device was dirty, rebuilding reference counts");
	}

	result = vdo_allocate_extended(struct repair_completion, page_count,
				       struct vdo_page_completion, __func__,
				       &repair);
	if (result != VDO_SUCCESS) {
		vdo_fail_completion(parent, result);
		return;
	}

	vdo_initialize_completion(&repair->completion, vdo, VDO_REPAIR_COMPLETION);
	repair->completion.error_handler = abort_repair;
	repair->completion.parent = parent;
	prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
	repair->page_count = page_count;

	result = vdo_allocate(remaining * VDO_BLOCK_SIZE, char, __func__,
			      &repair->journal_data);
	if (abort_on_error(result, repair))
		return;

	result = vdo_allocate(vio_count, struct vio, __func__, &repair->vios);
	if (abort_on_error(result, repair))
		return;

	ptr = repair->journal_data;
	for (repair->vio_count = 0; repair->vio_count < vio_count; repair->vio_count++) {
		block_count_t blocks = min_t(block_count_t, remaining,
					     MAX_BLOCKS_PER_VIO);

		result = allocate_vio_components(vdo, VIO_TYPE_RECOVERY_JOURNAL,
						 VIO_PRIORITY_METADATA,
						 repair, blocks, ptr,
						 &repair->vios[repair->vio_count]);
		if (abort_on_error(result, repair))
			return;

		ptr += (blocks * VDO_BLOCK_SIZE);
		remaining -= blocks;
	}

	for (vio_count = 0; vio_count < repair->vio_count;
	     vio_count++, pbn += MAX_BLOCKS_PER_VIO) {
		vdo_submit_metadata_vio(&repair->vios[vio_count], pbn, read_journal_endio,
					handle_journal_load_error, REQ_OP_READ);
	}
}