// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec_handover.c - kexec handover metadata processing
 * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
 * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
 * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
 */

#define pr_fmt(fmt) "KHO: " fmt

#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/debugfs.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/page-isolation.h>

#include <asm/early_ioremap.h>

/*
 * KHO is tightly coupled with mm init and needs access to some of mm
 * internal APIs.
 */
#include "../mm/internal.h"
#include "kexec_internal.h"

#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
#define PROP_SUB_FDT "fdt"

static bool kho_enable __ro_after_init;

bool kho_is_enabled(void)
{
	return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);

static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);

/*
 * Keep track of memory that is to be preserved across KHO.
 *
 * The serializing side uses two levels of xarrays to manage chunks of per-order
 * 512 byte bitmaps. For instance, if PAGE_SIZE = 4096, the entire 1G order of a
 * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
 * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
 * 512K of bitmap memory will be needed for order 0.
 *
 * This approach is fully incremental: as the serialization progresses, folios
 * can continue to be aggregated to the tracker. The final step, immediately
 * prior to kexec, serializes the xarray information into a linked list for the
 * successor kernel to parse.
 */

#define PRESERVE_BITS (512 * 8)

struct kho_mem_phys_bits {
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit covers
	 * one page of the order this entry is indexed by.
	 */
	struct xarray phys_bits;
};

struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};
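/*
 * Illustrative helper for the arithmetic behind the scheme above. It is not
 * used at runtime (hence __maybe_unused) and the name is made up for this
 * sketch: one 512-byte bitmap tracks PRESERVE_BITS pages of a given order, so
 * with 4 KiB pages an order-0 bitmap spans 16 MiB of physical address space
 * and an order-18 (1 GiB folio) bitmap spans 4 TiB.
 */
static inline u64 __maybe_unused kho_bitmap_coverage(unsigned int order)
{
	/* Bytes of physical address space described by a single bitmap. */
	return (u64)PRESERVE_BITS << (order + PAGE_SHIFT);
}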
struct khoser_mem_chunk;

struct kho_serialization {
	struct page *fdt;
	struct list_head fdt_list;
	struct dentry *sub_fdt_dir;
	struct kho_mem_track track;
	/* First chunk of serialized preserved memory map */
	struct khoser_mem_chunk *preserved_mem_map;
};

static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
{
	void *elm, *res;

	elm = xa_load(xa, index);
	if (elm)
		return elm;

	elm = kzalloc(sz, GFP_KERNEL);
	if (!elm)
		return ERR_PTR(-ENOMEM);

	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
	if (xa_is_err(res))
		res = ERR_PTR(xa_err(res));

	if (res) {
		kfree(elm);
		return res;
	}

	return elm;
}

static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
			     unsigned long end_pfn)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		const unsigned long pfn_high = pfn >> order;

		physxa = xa_load(&track->orders, order);
		if (physxa) {
			bits = xa_load(&physxa->phys_bits,
				       pfn_high / PRESERVE_BITS);
			if (bits)
				clear_bit(pfn_high % PRESERVE_BITS,
					  bits->preserve);
		}

		/*
		 * Advance even when no bitmap exists for this order;
		 * otherwise the loop would never terminate.
		 */
		pfn += 1 << order;
	}
}

static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
				unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;
	const unsigned long pfn_high = pfn >> order;

	might_sleep();

	physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa));
	if (IS_ERR(physxa))
		return PTR_ERR(physxa);

	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
				sizeof(*bits));
	if (IS_ERR(bits))
		return PTR_ERR(bits);

	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);

	return 0;
}

/* Almost like free_reserved_page(), just don't free the page */
static void kho_restore_page(struct page *page, unsigned int order)
{
	unsigned int nr_pages = (1 << order);

	/* Head page gets refcount of 1. */
	set_page_count(page, 1);

	/* For higher order folios, tail pages get a page count of zero. */
	for (unsigned int i = 1; i < nr_pages; i++)
		set_page_count(page + i, 0);

	if (order > 0)
		prep_compound_page(page, order);

	adjust_managed_page_count(page, nr_pages);
}

/**
 * kho_restore_folio - recreates the folio from the preserved memory.
 * @phys: physical address of the folio.
 *
 * Return: pointer to the struct folio on success, NULL on failure.
 */
struct folio *kho_restore_folio(phys_addr_t phys)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	unsigned long order;

	if (!page)
		return NULL;

	order = page->private;
	if (order > MAX_PAGE_ORDER)
		return NULL;

	kho_restore_page(page, order);
	return page_folio(page);
}
EXPORT_SYMBOL_GPL(kho_restore_folio);
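/*
 * Example consumer of kho_restore_folio() (illustrative sketch only): in the
 * new kernel, a subsystem that recorded the physical address of a preserved
 * folio before kexec can turn it back into a usable folio. The function name
 * is hypothetical, and how @phys was recorded is up to the caller; typically
 * it comes from a sub FDT retrieved with kho_retrieve_subtree().
 */
static __maybe_unused struct folio *kho_example_revive_folio(phys_addr_t phys)
{
	struct folio *folio = kho_restore_folio(phys);

	if (!folio)
		return NULL;

	/* The folio comes back with the order it was preserved with. */
	pr_debug("revived order-%u folio at %pa\n", folio_order(folio), &phys);
	return folio;
}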
/* Serialize and deserialize struct kho_mem_phys across kexec
 *
 * Record all the bitmaps in a linked list of pages for the next kernel to
 * process. Each chunk holds bitmaps of the same order and each block of bitmaps
 * starts at a given physical address. This allows the bitmaps to be sparse. The
 * xarray is used to store them in a tree while building up the data structure,
 * but the KHO successor kernel only needs to process them once in order.
 *
 * All of this memory is normal kmalloc() memory and is not marked for
 * preservation. The successor kernel will remain isolated to the scratch space
 * until it completes processing this list. Once processed, all the memory
 * storing these ranges will be marked as free.
 */

struct khoser_mem_bitmap_ptr {
	phys_addr_t phys_start;
	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
};

struct khoser_mem_chunk_hdr {
	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
	unsigned int order;
	unsigned int num_elms;
};

#define KHOSER_BITMAP_SIZE \
	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
	 sizeof(struct khoser_mem_bitmap_ptr))

struct khoser_mem_chunk {
	struct khoser_mem_chunk_hdr hdr;
	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
};

static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);

static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
					  unsigned long order)
{
	struct khoser_mem_chunk *chunk;

	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!chunk)
		return NULL;
	chunk->hdr.order = order;
	if (cur_chunk)
		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
	return chunk;
}

static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
{
	struct khoser_mem_chunk *chunk = first_chunk;

	while (chunk) {
		struct khoser_mem_chunk *tmp = chunk;

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		kfree(tmp);
	}
}

static int kho_mem_serialize(struct kho_serialization *ser)
{
	struct khoser_mem_chunk *first_chunk = NULL;
	struct khoser_mem_chunk *chunk = NULL;
	struct kho_mem_phys *physxa;
	unsigned long order;

	xa_for_each(&ser->track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		chunk = new_chunk(chunk, order);
		if (!chunk)
			goto err_free;

		if (!first_chunk)
			first_chunk = chunk;

		xa_for_each(&physxa->phys_bits, phys, bits) {
			struct khoser_mem_bitmap_ptr *elm;

			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
				chunk = new_chunk(chunk, order);
				if (!chunk)
					goto err_free;
			}

			elm = &chunk->bitmaps[chunk->hdr.num_elms];
			chunk->hdr.num_elms++;
			elm->phys_start = (phys * PRESERVE_BITS)
					  << (order + PAGE_SHIFT);
			KHOSER_STORE_PTR(elm->bitmap, bits);
		}
	}

	ser->preserved_mem_map = first_chunk;

	return 0;

err_free:
	kho_mem_ser_free(first_chunk);
	return -ENOMEM;
}

static void deserialize_bitmap(unsigned int order,
			       struct khoser_mem_bitmap_ptr *elm)
{
	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
	unsigned long bit;

	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
		int sz = 1 << (order + PAGE_SHIFT);
		phys_addr_t phys =
			elm->phys_start + (bit << (order + PAGE_SHIFT));
		struct page *page = phys_to_page(phys);

		memblock_reserve(phys, sz);
		memblock_reserved_mark_noinit(phys, sz);
		page->private = order;
	}
}

static void __init kho_mem_deserialize(const void *fdt)
{
	struct khoser_mem_chunk *chunk;
	const phys_addr_t *mem;
	int len;

	mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);

	if (!mem || len != sizeof(*mem)) {
		pr_err("failed to get preserved memory bitmaps\n");
		return;
	}

	chunk = *mem ? phys_to_virt(*mem) : NULL;
	while (chunk) {
		unsigned int i;

		for (i = 0; i != chunk->hdr.num_elms; i++)
			deserialize_bitmap(chunk->hdr.order,
					   &chunk->bitmaps[i]);
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
	}
}
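/*
 * Illustrative decoding of the serialized layout above; this helper is not
 * used by this file and its name is made up. Bit @bit in the bitmap
 * referenced by @elm describes one order-@order page starting at the
 * returned physical address, which is the same computation
 * deserialize_bitmap() performs while walking the chunk list.
 */
static inline phys_addr_t __maybe_unused
khoser_bit_to_phys(const struct khoser_mem_bitmap_ptr *elm, unsigned long bit,
		   unsigned int order)
{
	return elm->phys_start + ((phys_addr_t)bit << (order + PAGE_SHIFT));
}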
/*
 * With KHO enabled, memory can become fragmented because KHO regions may
 * be anywhere in physical address space. The scratch regions give us safe
 * zones that we will never see KHO allocations from. This is where we
 * can later safely load our new kexec images into and then use the scratch
 * area for early allocations that happen before the page allocator is
 * initialized.
 */
static struct kho_scratch *kho_scratch;
static unsigned int kho_scratch_cnt;

/*
 * The scratch areas are scaled by default as a percentage of memory allocated
 * from memblock. A user can override the scale with the command line
 * parameter:
 *
 * kho_scratch=N%
 *
 * It is also possible to explicitly define sizes for the lowmem, global and
 * per-node scratch areas:
 *
 * kho_scratch=l[KMG],n[KMG],m[KMG]
 *
 * The explicit size definition takes precedence over the scale definition.
 */
static unsigned int scratch_scale __initdata = 200;
static phys_addr_t scratch_size_global __initdata;
static phys_addr_t scratch_size_pernode __initdata;
static phys_addr_t scratch_size_lowmem __initdata;

static int __init kho_parse_scratch_size(char *p)
{
	size_t len;
	unsigned long sizes[3];
	int i;

	if (!p)
		return -EINVAL;

	len = strlen(p);
	if (!len)
		return -EINVAL;

	/* parse nn% */
	if (p[len - 1] == '%') {
		/* unsigned int max is 4,294,967,295, 10 chars */
		char s_scale[11] = {};
		int ret = 0;

		if (len > ARRAY_SIZE(s_scale))
			return -EINVAL;

		memcpy(s_scale, p, len - 1);
		ret = kstrtouint(s_scale, 10, &scratch_scale);
		if (!ret)
			pr_notice("scratch scale is %d%%\n", scratch_scale);
		return ret;
	}

	/* parse ll[KMG],mm[KMG],nn[KMG] */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		char *endp = p;

		if (i > 0) {
			if (*p != ',')
				return -EINVAL;
			p += 1;
		}

		sizes[i] = memparse(p, &endp);
		if (!sizes[i] || endp == p)
			return -EINVAL;
		p = endp;
	}

	scratch_size_lowmem = sizes[0];
	scratch_size_global = sizes[1];
	scratch_size_pernode = sizes[2];
	scratch_scale = 0;

	pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lluMiB\n",
		  (u64)(scratch_size_lowmem >> 20),
		  (u64)(scratch_size_global >> 20),
		  (u64)(scratch_size_pernode >> 20));

	return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);
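/*
 * Examples of the two forms handled by kho_parse_scratch_size() above
 * (values are illustrative only):
 *
 *   kho_scratch=25%            scale every scratch area to 25% of the
 *                              memblock-reserved kernel memory it is
 *                              derived from
 *
 *   kho_scratch=256M,1G,512M   disable scaling; reserve a 256 MiB lowmem
 *                              area, a 1 GiB global area and 512 MiB on
 *                              each online NUMA node
 */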
static void __init scratch_size_update(void)
{
	phys_addr_t size;

	if (!scratch_scale)
		return;

	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100;
	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100 - scratch_size_lowmem;
	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

static phys_addr_t __init scratch_size_node(int nid)
{
	phys_addr_t size;

	if (scratch_scale) {
		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
						   nid);
		size = size * scratch_scale / 100;
	} else {
		size = scratch_size_pernode;
	}

	return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

/**
 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
 *
 * With KHO we can preserve arbitrary pages in the system. To ensure we still
 * have a large contiguous region of memory when we search the physical address
 * space for target memory, let's make sure we always have a large CMA region
 * active. This CMA region will only be used for movable pages which are not a
 * problem for us during KHO because we can just move them somewhere else.
 */
static void __init kho_reserve_scratch(void)
{
	phys_addr_t addr, size;
	int nid, i = 0;

	if (!kho_enable)
		return;

	scratch_size_update();

	/* FIXME: deal with node hot-plug/remove */
	kho_scratch_cnt = num_online_nodes() + 2;
	size = kho_scratch_cnt * sizeof(*kho_scratch);
	kho_scratch = memblock_alloc(size, PAGE_SIZE);
	if (!kho_scratch)
		goto err_disable_kho;

	/*
	 * reserve scratch area in low memory for lowmem allocations in the
	 * next kernel
	 */
	size = scratch_size_lowmem;
	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
					 ARCH_LOW_ADDRESS_LIMIT);
	if (!addr)
		goto err_free_scratch_desc;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/* reserve large contiguous area for allocations without nid */
	size = scratch_size_global;
	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
	if (!addr)
		goto err_free_scratch_areas;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	for_each_online_node(nid) {
		size = scratch_size_node(nid);
		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
						0, MEMBLOCK_ALLOC_ACCESSIBLE,
						nid, true);
		if (!addr)
			goto err_free_scratch_areas;

		kho_scratch[i].addr = addr;
		kho_scratch[i].size = size;
		i++;
	}

	return;

err_free_scratch_areas:
	for (i--; i >= 0; i--)
		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
	kho_enable = false;
}

struct fdt_debugfs {
	struct list_head list;
	struct debugfs_blob_wrapper wrapper;
	struct dentry *file;
};

static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
			       const char *name, const void *fdt)
{
	struct fdt_debugfs *f;
	struct dentry *file;

	f = kmalloc(sizeof(*f), GFP_KERNEL);
	if (!f)
		return -ENOMEM;

	f->wrapper.data = (void *)fdt;
	f->wrapper.size = fdt_totalsize(fdt);

	file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
	if (IS_ERR(file)) {
		kfree(f);
		return PTR_ERR(file);
	}

	f->file = file;
	list_add(&f->list, list);

	return 0;
}

/**
 * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
 * @ser: serialization control object passed by KHO notifiers.
 * @name: name of the sub tree.
 * @fdt: the sub tree blob.
 *
 * Creates a new child node named @name in KHO root FDT and records
 * the physical address of @fdt. The pages of @fdt must also be preserved
 * by KHO for the new kernel to retrieve it after kexec.
 *
 * A debugfs blob entry is also created at
 * ``/sys/kernel/debug/kho/out/sub_fdts/@name``.
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
{
	int err = 0;
	u64 phys = (u64)virt_to_phys(fdt);
	void *root = page_to_virt(ser->fdt);

	err |= fdt_begin_node(root, name);
	err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
	err |= fdt_end_node(root);

	if (err)
		return err;

	return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
}
EXPORT_SYMBOL_GPL(kho_add_subtree);
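/*
 * Sketch of a typical producer (illustrative only; the "example" node and all
 * names prefixed with example_ are hypothetical). On KEXEC_KHO_FINALIZE the
 * callback builds a small sub FDT, preserves the page backing it, and records
 * it in the root tree with kho_add_subtree(). A real notifier would also
 * handle KEXEC_KHO_ABORT and undo these steps.
 */
static int __maybe_unused example_kho_notifier(struct notifier_block *nb,
					       unsigned long action, void *data)
{
	struct kho_serialization *ser = data;
	u64 state_phys = 0;	/* hypothetical state handed to the successor */
	struct folio *folio;
	void *fdt;
	int err = 0;

	if (action != KEXEC_KHO_FINALIZE)
		return NOTIFY_DONE;

	folio = folio_alloc(GFP_KERNEL, 0);
	if (!folio)
		return notifier_from_errno(-ENOMEM);
	fdt = folio_address(folio);

	err |= fdt_create(fdt, PAGE_SIZE);
	err |= fdt_finish_reservemap(fdt);
	err |= fdt_begin_node(fdt, "");
	err |= fdt_property(fdt, "state", &state_phys, sizeof(state_phys));
	err |= fdt_end_node(fdt);
	err |= fdt_finish(fdt);

	/* The sub FDT itself must survive kexec, too. */
	if (!err)
		err = kho_preserve_folio(folio);
	if (!err)
		err = kho_add_subtree(ser, "example", fdt);

	return err ? notifier_from_errno(err) : NOTIFY_DONE;
}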
struct kho_out {
	struct blocking_notifier_head chain_head;

	struct dentry *dir;

	struct mutex lock; /* protects KHO FDT finalization */

	struct kho_serialization ser;
	bool finalized;
};

static struct kho_out kho_out = {
	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.ser = {
		.fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
		.track = {
			.orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
		},
	},
	.finalized = false,
};

int register_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(register_kho_notifier);

int unregister_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(unregister_kho_notifier);

/**
 * kho_preserve_folio - preserve a folio across kexec.
 * @folio: folio to preserve.
 *
 * Instructs KHO to preserve the whole folio across kexec. The order
 * will be preserved as well.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.ser.track;

	if (kho_out.finalized)
		return -EBUSY;

	return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
/**
 * kho_preserve_phys - preserve a physically contiguous range across kexec.
 * @phys: physical address of the range.
 * @size: size of the range.
 *
 * Instructs KHO to preserve the memory range from @phys to @phys + @size
 * across kexec.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_phys(phys_addr_t phys, size_t size)
{
	unsigned long pfn = PHYS_PFN(phys);
	unsigned long failed_pfn = 0;
	const unsigned long start_pfn = pfn;
	const unsigned long end_pfn = PHYS_PFN(phys + size);
	int err = 0;
	struct kho_mem_track *track = &kho_out.ser.track;

	if (kho_out.finalized)
		return -EBUSY;

	if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
		return -EINVAL;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		err = __kho_preserve_order(track, pfn, order);
		if (err) {
			failed_pfn = pfn;
			break;
		}

		pfn += 1 << order;
	}

	if (err)
		__kho_unpreserve(track, start_pfn, failed_pfn);

	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_phys);

/* Handling for debug/kho/out */

static struct dentry *debugfs_root;

static int kho_out_update_debugfs_fdt(void)
{
	int err = 0;
	struct fdt_debugfs *ff, *tmp;

	if (kho_out.finalized) {
		err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
					  "fdt", page_to_virt(kho_out.ser.fdt));
	} else {
		list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
			debugfs_remove(ff->file);
			list_del(&ff->list);
			kfree(ff);
		}
	}

	return err;
}

static int kho_abort(void)
{
	int err;
	unsigned long order;
	struct kho_mem_phys *physxa;

	xa_for_each(&kho_out.ser.track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		xa_for_each(&physxa->phys_bits, phys, bits)
			kfree(bits);

		xa_destroy(&physxa->phys_bits);
		kfree(physxa);
	}
	xa_destroy(&kho_out.ser.track.orders);

	if (kho_out.ser.preserved_mem_map) {
		kho_mem_ser_free(kho_out.ser.preserved_mem_map);
		kho_out.ser.preserved_mem_map = NULL;
	}

	err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
					   NULL);
	err = notifier_to_errno(err);

	if (err)
		pr_err("Failed to abort KHO finalization: %d\n", err);

	return err;
}
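/*
 * For reference, the root tree that kho_finalize() below produces looks
 * roughly like this (DTS-style sketch; the "example" subnode stands in for
 * whatever sub trees notifier callbacks registered via kho_add_subtree(),
 * and both properties hold a u64 physical address):
 *
 *	/ {
 *		compatible = "kho-v1";
 *		preserved-memory-map = <phys of the first khoser_mem_chunk>;
 *
 *		example {
 *			fdt = <phys of that sub FDT>;
 *		};
 *	};
 */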
static int kho_finalize(void)
{
	int err = 0;
	u64 *preserved_mem_map;
	void *fdt = page_to_virt(kho_out.ser.fdt);

	err |= fdt_create(fdt, PAGE_SIZE);
	err |= fdt_finish_reservemap(fdt);
	err |= fdt_begin_node(fdt, "");
	err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
	/*
	 * Reserve the preserved-memory-map property in the root FDT, so
	 * that all property definitions will precede subnodes created by
	 * KHO callers.
	 */
	err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
					sizeof(*preserved_mem_map),
					(void **)&preserved_mem_map);
	if (err)
		goto abort;

	err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
	if (err)
		goto abort;

	err = blocking_notifier_call_chain(&kho_out.chain_head,
					   KEXEC_KHO_FINALIZE, &kho_out.ser);
	err = notifier_to_errno(err);
	if (err)
		goto abort;

	err = kho_mem_serialize(&kho_out.ser);
	if (err)
		goto abort;

	*preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);

	err |= fdt_end_node(fdt);
	err |= fdt_finish(fdt);

abort:
	if (err) {
		pr_err("Failed to convert KHO state tree: %d\n", err);
		kho_abort();
	}

	return err;
}

static int kho_out_finalize_get(void *data, u64 *val)
{
	mutex_lock(&kho_out.lock);
	*val = kho_out.finalized;
	mutex_unlock(&kho_out.lock);

	return 0;
}

static int kho_out_finalize_set(void *data, u64 _val)
{
	int ret = 0;
	bool val = !!_val;

	mutex_lock(&kho_out.lock);

	if (val == kho_out.finalized) {
		if (kho_out.finalized)
			ret = -EEXIST;
		else
			ret = -ENOENT;
		goto unlock;
	}

	if (val)
		ret = kho_finalize();
	else
		ret = kho_abort();

	if (ret)
		goto unlock;

	kho_out.finalized = val;
	ret = kho_out_update_debugfs_fdt();

unlock:
	mutex_unlock(&kho_out.lock);
	return ret;
}

DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
			 kho_out_finalize_set, "%llu\n");

static int scratch_phys_show(struct seq_file *m, void *v)
{
	for (int i = 0; i < kho_scratch_cnt; i++)
		seq_printf(m, "0x%llx\n", kho_scratch[i].addr);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_phys);

static int scratch_len_show(struct seq_file *m, void *v)
{
	for (int i = 0; i < kho_scratch_cnt; i++)
		seq_printf(m, "0x%llx\n", kho_scratch[i].size);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_len);

static __init int kho_out_debugfs_init(void)
{
	struct dentry *dir, *f, *sub_fdt_dir;

	dir = debugfs_create_dir("out", debugfs_root);
	if (IS_ERR(dir))
		return -ENOMEM;

	sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
	if (IS_ERR(sub_fdt_dir))
		goto err_rmdir;

	f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
				&scratch_phys_fops);
	if (IS_ERR(f))
		goto err_rmdir;

	f = debugfs_create_file("scratch_len", 0400, dir, NULL,
				&scratch_len_fops);
	if (IS_ERR(f))
		goto err_rmdir;

	f = debugfs_create_file("finalize", 0600, dir, NULL,
				&fops_kho_out_finalize);
	if (IS_ERR(f))
		goto err_rmdir;

	kho_out.dir = dir;
	kho_out.ser.sub_fdt_dir = sub_fdt_dir;
	return 0;

err_rmdir:
	debugfs_remove_recursive(dir);
	return -ENOENT;
}

struct kho_in {
	struct dentry *dir;
	phys_addr_t fdt_phys;
	phys_addr_t scratch_phys;
	struct list_head fdt_list;
};

static struct kho_in kho_in = {
	.fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
};

static const void *kho_get_fdt(void)
{
	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
}

/**
 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
 * @name: the name of the sub FDT passed to kho_add_subtree().
 * @phys: if found, the physical address of the sub FDT is stored in @phys.
 *
 * Retrieve a preserved sub FDT named @name and store its physical
 * address in @phys.
 *
 * Return: 0 on success, error code on failure
 */
int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
{
	const void *fdt = kho_get_fdt();
	const u64 *val;
	int offset, len;

	if (!fdt)
		return -ENOENT;

	if (!phys)
		return -EINVAL;

	offset = fdt_subnode_offset(fdt, 0, name);
	if (offset < 0)
		return -ENOENT;

	val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
	if (!val || len != sizeof(*val))
		return -EINVAL;

	*phys = (phys_addr_t)*val;

	return 0;
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
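/*
 * Example consumer of kho_retrieve_subtree() (illustrative sketch only; the
 * "example" node and its "state" property mirror the hypothetical producer
 * sketch near kho_add_subtree() and are not defined by this file).
 */
static int __maybe_unused example_kho_retrieve(void)
{
	phys_addr_t fdt_phys;
	const u64 *state_phys;
	const void *fdt;
	int len, err;

	err = kho_retrieve_subtree("example", &fdt_phys);
	if (err)
		return err;

	fdt = phys_to_virt(fdt_phys);
	state_phys = fdt_getprop(fdt, 0, "state", &len);
	if (!state_phys || len != sizeof(*state_phys))
		return -EINVAL;

	/* e.g. pass *state_phys to kho_restore_folio() to revive the data */
	return 0;
}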
/* Handling for debugfs/kho/in */

static __init int kho_in_debugfs_init(const void *fdt)
{
	struct dentry *sub_fdt_dir;
	int err, child;

	kho_in.dir = debugfs_create_dir("in", debugfs_root);
	if (IS_ERR(kho_in.dir))
		return PTR_ERR(kho_in.dir);

	sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
	if (IS_ERR(sub_fdt_dir)) {
		err = PTR_ERR(sub_fdt_dir);
		goto err_rmdir;
	}

	err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
	if (err)
		goto err_rmdir;

	fdt_for_each_subnode(child, fdt, 0) {
		int len = 0;
		const char *name = fdt_get_name(fdt, child, NULL);
		const u64 *fdt_phys;

		fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
		if (!fdt_phys)
			continue;
		if (len != sizeof(*fdt_phys)) {
			pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
				name, len);
			continue;
		}
		err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
					  phys_to_virt(*fdt_phys));
		if (err) {
			pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
				err);
			continue;
		}
	}

	return 0;

err_rmdir:
	debugfs_remove_recursive(kho_in.dir);
	return err;
}
static __init int kho_init(void)
{
	int err = 0;
	const void *fdt = kho_get_fdt();

	if (!kho_enable)
		return 0;

	kho_out.ser.fdt = alloc_page(GFP_KERNEL);
	if (!kho_out.ser.fdt) {
		err = -ENOMEM;
		goto err_free_scratch;
	}

	debugfs_root = debugfs_create_dir("kho", NULL);
	if (IS_ERR(debugfs_root)) {
		err = -ENOENT;
		goto err_free_fdt;
	}

	err = kho_out_debugfs_init();
	if (err)
		goto err_free_fdt;

	if (fdt) {
		err = kho_in_debugfs_init(fdt);
		/*
		 * Failure to create /sys/kernel/debug/kho/in does not prevent
		 * reviving state from KHO and setting up KHO for the next
		 * kexec.
		 */
		if (err)
			pr_err("failed exposing handover FDT in debugfs: %d\n",
			       err);

		return 0;
	}

	for (int i = 0; i < kho_scratch_cnt; i++) {
		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
		unsigned long pfn;

		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}

	return 0;

err_free_fdt:
	put_page(kho_out.ser.fdt);
	kho_out.ser.fdt = NULL;
err_free_scratch:
	for (int i = 0; i < kho_scratch_cnt; i++) {
		void *start = __va(kho_scratch[i].addr);
		void *end = start + kho_scratch[i].size;

		free_reserved_area(start, end, -1, "");
	}
	kho_enable = false;
	return err;
}
late_initcall(kho_init);

static void __init kho_release_scratch(void)
{
	phys_addr_t start, end;
	u64 i;

	memmap_init_kho_scratch_pages();

	/*
	 * Mark scratch mem as CMA before we return it. That way we
	 * ensure that no kernel allocations happen on it. That means
	 * we can reuse it as scratch memory again later.
	 */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
		ulong end_pfn = pageblock_align(PFN_UP(end));
		ulong pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
			set_pageblock_migratetype(pfn_to_page(pfn),
						  MIGRATE_CMA);
	}
}

void __init kho_memory_init(void)
{
	struct folio *folio;

	if (kho_in.scratch_phys) {
		kho_scratch = phys_to_virt(kho_in.scratch_phys);
		kho_release_scratch();

		kho_mem_deserialize(kho_get_fdt());
		folio = kho_restore_folio(kho_in.fdt_phys);
		if (!folio)
			pr_warn("failed to restore folio for KHO fdt\n");
	} else {
		kho_reserve_scratch();
	}
}
void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
			 phys_addr_t scratch_phys, u64 scratch_len)
{
	void *fdt = NULL;
	struct kho_scratch *scratch = NULL;
	int err = 0;
	unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);

	/* Validate the input FDT */
	fdt = early_memremap(fdt_phys, fdt_len);
	if (!fdt) {
		pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
		err = -EFAULT;
		goto out;
	}
	err = fdt_check_header(fdt);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
			fdt_phys, err);
		err = -EINVAL;
		goto out;
	}
	err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
			fdt_phys, KHO_FDT_COMPATIBLE, err);
		err = -EINVAL;
		goto out;
	}

	scratch = early_memremap(scratch_phys, scratch_len);
	if (!scratch) {
		pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
			scratch_phys, scratch_len);
		err = -EFAULT;
		goto out;
	}

	/*
	 * We pass safe contiguous blocks of memory to use for early boot
	 * purposes from the previous kernel so that we can resize the
	 * memblock array as needed.
	 */
	for (int i = 0; i < scratch_cnt; i++) {
		struct kho_scratch *area = &scratch[i];
		u64 size = area->size;

		memblock_add(area->addr, size);
		err = memblock_mark_kho_scratch(area->addr, size);
		if (WARN_ON(err)) {
			pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d",
				&area->addr, &size, err);
			goto out;
		}
		pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
	}

	memblock_reserve(scratch_phys, scratch_len);

	/*
	 * Now that we have a viable region of scratch memory, let's tell
	 * the memblock allocator to only use that for any allocations.
	 * That way we ensure that nothing scribbles over in-use data while
	 * we initialize the page tables, which we will need to ingest all
	 * memory reservations from the previous kernel.
	 */
	memblock_set_kho_scratch_only();

	kho_in.fdt_phys = fdt_phys;
	kho_in.scratch_phys = scratch_phys;
	kho_scratch_cnt = scratch_cnt;
	pr_info("found kexec handover data. Will skip init for some devices\n");

out:
	if (fdt)
		early_memunmap(fdt, fdt_len);
	if (scratch)
		early_memunmap(scratch, scratch_len);
	if (err)
		pr_warn("disabling KHO revival: %d\n", err);
}

/* Helper functions for kexec_file_load */

int kho_fill_kimage(struct kimage *image)
{
	ssize_t scratch_size;
	int err = 0;
	struct kexec_buf scratch;

	if (!kho_enable)
		return 0;

	image->kho.fdt = page_to_phys(kho_out.ser.fdt);

	scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
	scratch = (struct kexec_buf){
		.image = image,
		.buffer = kho_scratch,
		.bufsz = scratch_size,
		.mem = KEXEC_BUF_MEM_UNKNOWN,
		.memsz = scratch_size,
		.buf_align = SZ_64K, /* Makes it easier to map */
		.buf_max = ULONG_MAX,
		.top_down = true,
	};
	err = kexec_add_buffer(&scratch);
	if (err)
		return err;
	image->kho.scratch = &image->segment[image->nr_segments - 1];

	return 0;
}

static int kho_walk_scratch(struct kexec_buf *kbuf,
			    int (*func)(struct resource *, void *))
{
	int ret = 0;
	int i;

	for (i = 0; i < kho_scratch_cnt; i++) {
		struct resource res = {
			.start = kho_scratch[i].addr,
			.end = kho_scratch[i].addr + kho_scratch[i].size - 1,
		};

		/* Try to fit the kimage into our KHO scratch region */
		ret = func(&res, kbuf);
		if (ret)
			break;
	}

	return ret;
}

int kho_locate_mem_hole(struct kexec_buf *kbuf,
			int (*func)(struct resource *, void *))
{
	int ret;

	if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
		return 1;

	ret = kho_walk_scratch(kbuf, func);

	return ret == 1 ? 0 : -EADDRNOTAVAIL;
}