// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec_handover.c - kexec handover metadata processing
 * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
 * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
 * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
 */

#define pr_fmt(fmt) "KHO: " fmt

#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/debugfs.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/page-isolation.h>

#include <asm/early_ioremap.h>

/*
 * KHO is tightly coupled with mm init and needs access to some of mm
 * internal APIs.
 */
#include "../mm/internal.h"
#include "kexec_internal.h"

#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
#define PROP_SUB_FDT "fdt"

static bool kho_enable __ro_after_init;

bool kho_is_enabled(void)
{
        return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);

static int __init kho_parse_enable(char *p)
{
        return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);

/*
 * Keep track of memory that is to be preserved across KHO.
 *
 * The serializing side uses two levels of xarrays to manage chunks of per-order
 * 512 byte bitmaps. For instance, if PAGE_SIZE = 4096, the entire 1G order of a
 * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
 * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
 * 512K of bitmap memory will be needed for order 0.
 *
 * This approach is fully incremental: as serialization progresses, folios can
 * continue to be added to the tracker. The final step, immediately prior to
 * kexec, serializes the xarray information into a linked list for the
 * successor kernel to parse.
 */

#define PRESERVE_BITS (512 * 8)

struct kho_mem_phys_bits {
        DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

struct kho_mem_phys {
        /*
         * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit
         * represents one block of the tracked order.
         */
        struct xarray phys_bits;
};

struct kho_mem_track {
        /* Points to kho_mem_phys, each order gets its own bitmap tree */
        struct xarray orders;
};
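
/*
 * Worked example of the two-level lookup above (assuming PAGE_SIZE == 4096,
 * so PRESERVE_BITS == 4096): preserving the order-0 page at PFN 0x12345
 * results in
 *
 *      orders[0] -> phys_bits[0x12345 / 4096 = 0x12] -> bit 0x12345 % 4096 = 0x345
 *
 * An order-0 bitmap therefore covers 4096 pages (16M of address space), and
 * an order-N bitmap covers 16M << N.
 */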

struct khoser_mem_chunk;

struct kho_serialization {
        struct page *fdt;
        struct list_head fdt_list;
        struct dentry *sub_fdt_dir;
        struct kho_mem_track track;
        /* First chunk of serialized preserved memory map */
        struct khoser_mem_chunk *preserved_mem_map;
};

static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
{
        void *elm, *res;

        elm = xa_load(xa, index);
        if (elm)
                return elm;

        elm = kzalloc(sz, GFP_KERNEL);
        if (!elm)
                return ERR_PTR(-ENOMEM);

        res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
        if (xa_is_err(res))
                res = ERR_PTR(xa_err(res));

        if (res) {
                kfree(elm);
                return res;
        }

        return elm;
}

static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
                             unsigned long end_pfn)
{
        struct kho_mem_phys_bits *bits;
        struct kho_mem_phys *physxa;

        while (pfn < end_pfn) {
                const unsigned int order =
                        min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
                const unsigned long pfn_high = pfn >> order;

                /*
                 * Always advance pfn, even when no bitmap exists for this
                 * block, so that a sparse or partially preserved range cannot
                 * stall the loop.
                 */
                pfn += 1 << order;

                physxa = xa_load(&track->orders, order);
                if (!physxa)
                        continue;

                bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
                if (!bits)
                        continue;

                clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
        }
}

static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
                                unsigned int order)
{
        struct kho_mem_phys_bits *bits;
        struct kho_mem_phys *physxa, *new_physxa;
        const unsigned long pfn_high = pfn >> order;

        might_sleep();

        physxa = xa_load(&track->orders, order);
        if (!physxa) {
                int err;

                new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
                if (!new_physxa)
                        return -ENOMEM;

                xa_init(&new_physxa->phys_bits);
                physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
                                    GFP_KERNEL);

                err = xa_err(physxa);
                if (err || physxa) {
                        xa_destroy(&new_physxa->phys_bits);
                        kfree(new_physxa);

                        if (err)
                                return err;
                } else {
                        physxa = new_physxa;
                }
        }

        bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
                                sizeof(*bits));
        if (IS_ERR(bits))
                return PTR_ERR(bits);

        set_bit(pfn_high % PRESERVE_BITS, bits->preserve);

        return 0;
}

/* Almost like free_reserved_page(), except the page is not freed. */
static void kho_restore_page(struct page *page, unsigned int order)
{
        unsigned int nr_pages = (1 << order);

        /* Head page gets refcount of 1. */
        set_page_count(page, 1);

        /* For higher order folios, tail pages get a page count of zero. */
        for (unsigned int i = 1; i < nr_pages; i++)
                set_page_count(page + i, 0);

        if (order > 0)
                prep_compound_page(page, order);

        adjust_managed_page_count(page, nr_pages);
}

/**
 * kho_restore_folio - recreates the folio from the preserved memory.
 * @phys: physical address of the folio.
 *
 * Return: pointer to the struct folio on success, NULL on failure.
 */
struct folio *kho_restore_folio(phys_addr_t phys)
{
        struct page *page = pfn_to_online_page(PHYS_PFN(phys));
        unsigned long order;

        if (!page)
                return NULL;

        order = page->private;
        if (order > MAX_PAGE_ORDER)
                return NULL;

        kho_restore_page(page, order);
        return page_folio(page);
}
EXPORT_SYMBOL_GPL(kho_restore_folio);

/*
 * Serialize and deserialize struct kho_mem_phys across kexec
 *
 * Record all the bitmaps in a linked list of pages for the next kernel to
 * process. Each chunk holds bitmaps of the same order and each block of bitmaps
 * starts at a given physical address. This allows the bitmaps to be sparse. The
 * xarray is used to store them in a tree while building up the data structure,
 * but the KHO successor kernel only needs to process them once in order.
 *
 * All of this memory is normal kmalloc() memory and is not marked for
 * preservation. The successor kernel will remain isolated to the scratch space
 * until it completes processing this list. Once processed, all the memory
 * storing these ranges will be marked as free.
 */

struct khoser_mem_bitmap_ptr {
        phys_addr_t phys_start;
        DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
};

struct khoser_mem_chunk_hdr {
        DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
        unsigned int order;
        unsigned int num_elms;
};

#define KHOSER_BITMAP_SIZE \
        ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
         sizeof(struct khoser_mem_bitmap_ptr))

struct khoser_mem_chunk {
        struct khoser_mem_chunk_hdr hdr;
        struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
};

static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);

static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
                                          unsigned long order)
{
        struct khoser_mem_chunk *chunk;

        chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
        if (!chunk)
                return NULL;
        chunk->hdr.order = order;
        if (cur_chunk)
                KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
        return chunk;
}

static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
{
        struct khoser_mem_chunk *chunk = first_chunk;

        while (chunk) {
                struct khoser_mem_chunk *tmp = chunk;

                chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
                kfree(tmp);
        }
}

static int kho_mem_serialize(struct kho_serialization *ser)
{
        struct khoser_mem_chunk *first_chunk = NULL;
        struct khoser_mem_chunk *chunk = NULL;
        struct kho_mem_phys *physxa;
        unsigned long order;

        xa_for_each(&ser->track.orders, order, physxa) {
                struct kho_mem_phys_bits *bits;
                unsigned long phys;

                chunk = new_chunk(chunk, order);
                if (!chunk)
                        goto err_free;

                if (!first_chunk)
                        first_chunk = chunk;

                xa_for_each(&physxa->phys_bits, phys, bits) {
                        struct khoser_mem_bitmap_ptr *elm;

                        if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
                                chunk = new_chunk(chunk, order);
                                if (!chunk)
                                        goto err_free;
                        }

                        elm = &chunk->bitmaps[chunk->hdr.num_elms];
                        chunk->hdr.num_elms++;
                        elm->phys_start = (phys * PRESERVE_BITS)
                                          << (order + PAGE_SHIFT);
                        KHOSER_STORE_PTR(elm->bitmap, bits);
                }
        }

        ser->preserved_mem_map = first_chunk;

        return 0;

err_free:
        kho_mem_ser_free(first_chunk);
        return -ENOMEM;
}
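
/*
 * Sizing note for the chunk list built above (illustrative, assuming a 64-bit
 * kernel, 4 KiB pages and 8-byte KHOSER pointers): the chunk header takes
 * 16 bytes and each struct khoser_mem_bitmap_ptr takes 16 bytes, so
 * KHOSER_BITMAP_SIZE evaluates to (4096 - 16) / 16 = 255 bitmap pointers per
 * page-sized chunk, and 16 + 255 * 16 == 4096 is what the static_assert() on
 * struct khoser_mem_chunk checks.
 */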

static void __init deserialize_bitmap(unsigned int order,
                                      struct khoser_mem_bitmap_ptr *elm)
{
        struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
        unsigned long bit;

        for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
                int sz = 1 << (order + PAGE_SHIFT);
                phys_addr_t phys =
                        elm->phys_start + (bit << (order + PAGE_SHIFT));
                struct page *page = phys_to_page(phys);

                memblock_reserve(phys, sz);
                memblock_reserved_mark_noinit(phys, sz);
                page->private = order;
        }
}

static void __init kho_mem_deserialize(const void *fdt)
{
        struct khoser_mem_chunk *chunk;
        const phys_addr_t *mem;
        int len;

        mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);

        if (!mem || len != sizeof(*mem)) {
                pr_err("failed to get preserved memory bitmaps\n");
                return;
        }

        chunk = *mem ? phys_to_virt(*mem) : NULL;
        while (chunk) {
                unsigned int i;

                for (i = 0; i != chunk->hdr.num_elms; i++)
                        deserialize_bitmap(chunk->hdr.order,
                                           &chunk->bitmaps[i]);
                chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
        }
}

/*
 * With KHO enabled, memory can become fragmented because preserved KHO
 * regions may be anywhere in the physical address space. The scratch regions
 * are safe zones that will never contain KHO allocations: we can later safely
 * load the new kexec image into them and then use them for early allocations
 * that happen before the page allocator is initialized.
 */
static struct kho_scratch *kho_scratch;
static unsigned int kho_scratch_cnt;

/*
 * The scratch areas are scaled by default as a percentage of the memory
 * allocated from memblock. A user can override the scale with the command
 * line parameter:
 *
 * kho_scratch=N%
 *
 * It is also possible to explicitly define the sizes of the lowmem, global
 * and per-node scratch areas:
 *
 * kho_scratch=l[KMG],n[KMG],m[KMG]
 *
 * The explicit size definition takes precedence over the scale definition.
 */
static unsigned int scratch_scale __initdata = 200;
static phys_addr_t scratch_size_global __initdata;
static phys_addr_t scratch_size_pernode __initdata;
static phys_addr_t scratch_size_lowmem __initdata;

static int __init kho_parse_scratch_size(char *p)
{
        size_t len;
        unsigned long sizes[3];
        size_t total_size = 0;
        int i;

        if (!p)
                return -EINVAL;

        len = strlen(p);
        if (!len)
                return -EINVAL;

        /* parse nn% */
        if (p[len - 1] == '%') {
                /* unsigned int max is 4,294,967,295, 10 chars */
                char s_scale[11] = {};
                int ret = 0;

                if (len > ARRAY_SIZE(s_scale))
                        return -EINVAL;

                memcpy(s_scale, p, len - 1);
                ret = kstrtouint(s_scale, 10, &scratch_scale);
                if (!ret)
                        pr_notice("scratch scale is %d%%\n", scratch_scale);
                return ret;
        }

        /* parse ll[KMG],mm[KMG],nn[KMG] */
        for (i = 0; i < ARRAY_SIZE(sizes); i++) {
                char *endp = p;

                if (i > 0) {
                        if (*p != ',')
                                return -EINVAL;
                        p += 1;
                }

                sizes[i] = memparse(p, &endp);
                if (endp == p)
                        return -EINVAL;
                p = endp;
                total_size += sizes[i];
        }

        if (!total_size)
                return -EINVAL;

        /* The string should be fully consumed by now. */
        if (*p)
                return -EINVAL;

        scratch_size_lowmem = sizes[0];
        scratch_size_global = sizes[1];
        scratch_size_pernode = sizes[2];
        scratch_scale = 0;

        pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lluMiB\n",
                  (u64)(scratch_size_lowmem >> 20),
                  (u64)(scratch_size_global >> 20),
                  (u64)(scratch_size_pernode >> 20));

        return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);
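
/*
 * Example command lines accepted by the parser above (the values are purely
 * illustrative):
 *
 *      kho_scratch=30%           scale all scratch areas to 30% of the memory
 *                                memblock accounts as reserved for the kernel
 *      kho_scratch=512M,1G,256M  512 MiB lowmem, 1 GiB global, 256 MiB per node
 */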

static void __init scratch_size_update(void)
{
        phys_addr_t size;

        if (!scratch_scale)
                return;

        size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
                                           NUMA_NO_NODE);
        size = size * scratch_scale / 100;
        scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

        size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
                                           NUMA_NO_NODE);
        size = size * scratch_scale / 100 - scratch_size_lowmem;
        scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

static phys_addr_t __init scratch_size_node(int nid)
{
        phys_addr_t size;

        if (scratch_scale) {
                size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
                                                   nid);
                size = size * scratch_scale / 100;
        } else {
                size = scratch_size_pernode;
        }

        return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}
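
/*
 * With the default scratch_scale of 200, the helpers above size each scratch
 * area to roughly twice the kernel memory that memblock reports as reserved
 * for the corresponding scope (lowmem, global or per node), rounded up to
 * CMA_MIN_ALIGNMENT_BYTES. For example, a node with 512 MiB of reserved
 * kernel memory ends up with a 1 GiB per-node scratch area.
 */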

/**
 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
 *
 * With KHO we can preserve arbitrary pages in the system. To ensure we still
 * have a large contiguous region of memory when we search the physical address
 * space for target memory, let's make sure we always have a large CMA region
 * active. This CMA region will only be used for movable pages, which are not a
 * problem for us during KHO because we can just move them somewhere else.
 */
static void __init kho_reserve_scratch(void)
{
        phys_addr_t addr, size;
        int nid, i = 0;

        if (!kho_enable)
                return;

        scratch_size_update();

        /* FIXME: deal with node hot-plug/remove */
        kho_scratch_cnt = num_online_nodes() + 2;
        size = kho_scratch_cnt * sizeof(*kho_scratch);
        kho_scratch = memblock_alloc(size, PAGE_SIZE);
        if (!kho_scratch)
                goto err_disable_kho;

        /*
         * reserve scratch area in low memory for lowmem allocations in the
         * next kernel
         */
        size = scratch_size_lowmem;
        addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
                                         ARCH_LOW_ADDRESS_LIMIT);
        if (!addr)
                goto err_free_scratch_desc;

        kho_scratch[i].addr = addr;
        kho_scratch[i].size = size;
        i++;

        /* reserve large contiguous area for allocations without nid */
        size = scratch_size_global;
        addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
        if (!addr)
                goto err_free_scratch_areas;

        kho_scratch[i].addr = addr;
        kho_scratch[i].size = size;
        i++;

        for_each_online_node(nid) {
                size = scratch_size_node(nid);
                addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
                                                0, MEMBLOCK_ALLOC_ACCESSIBLE,
                                                nid, true);
                if (!addr)
                        goto err_free_scratch_areas;

                kho_scratch[i].addr = addr;
                kho_scratch[i].size = size;
                i++;
        }

        return;

err_free_scratch_areas:
        for (i--; i >= 0; i--)
                memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
        memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
        pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
        kho_enable = false;
}

struct fdt_debugfs {
        struct list_head list;
        struct debugfs_blob_wrapper wrapper;
        struct dentry *file;
};

static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
                               const char *name, const void *fdt)
{
        struct fdt_debugfs *f;
        struct dentry *file;

        f = kmalloc(sizeof(*f), GFP_KERNEL);
        if (!f)
                return -ENOMEM;

        f->wrapper.data = (void *)fdt;
        f->wrapper.size = fdt_totalsize(fdt);

        file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
        if (IS_ERR(file)) {
                kfree(f);
                return PTR_ERR(file);
        }

        f->file = file;
        list_add(&f->list, list);

        return 0;
}

/**
 * kho_add_subtree - record the physical address of a sub FDT in the KHO root tree.
 * @ser: serialization control object passed by KHO notifiers.
 * @name: name of the sub tree.
 * @fdt: the sub tree blob.
 *
 * Creates a new child node named @name in the KHO root FDT and records
 * the physical address of @fdt. The pages of @fdt must also be preserved
 * by KHO for the new kernel to retrieve it after kexec.
 *
 * A debugfs blob entry is also created at
 * ``/sys/kernel/debug/kho/out/sub_fdts/@name``.
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
{
        int err = 0;
        u64 phys = (u64)virt_to_phys(fdt);
        void *root = page_to_virt(ser->fdt);

        err |= fdt_begin_node(root, name);
        err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
        err |= fdt_end_node(root);

        if (err)
                return err;

        return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
}
EXPORT_SYMBOL_GPL(kho_add_subtree);
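
/*
 * After finalization, the KHO root FDT built by kho_finalize() and extended
 * by kho_add_subtree() callers looks roughly like this ("myko" is a
 * hypothetical client name used only for illustration):
 *
 *      / {
 *              compatible = "kho-v1";
 *              preserved-memory-map = <...>;   // phys of the first khoser_mem_chunk
 *              myko {
 *                      fdt = <...>;            // phys of myko's sub FDT
 *              };
 *      };
 *
 * with one subnode per kho_add_subtree() call. The same blobs are exposed in
 * debugfs under kho/out/fdt and kho/out/sub_fdts/.
 */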

struct kho_out {
        struct blocking_notifier_head chain_head;

        struct dentry *dir;

        struct mutex lock; /* protects KHO FDT finalization */

        struct kho_serialization ser;
        bool finalized;
};

static struct kho_out kho_out = {
        .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
        .lock = __MUTEX_INITIALIZER(kho_out.lock),
        .ser = {
                .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
                .track = {
                        .orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
                },
        },
        .finalized = false,
};

int register_kho_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(register_kho_notifier);

int unregister_kho_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(unregister_kho_notifier);

/**
 * kho_preserve_folio - preserve a folio across kexec.
 * @folio: folio to preserve.
 *
 * Instructs KHO to preserve the whole folio across kexec. The order
 * will be preserved as well.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_folio(struct folio *folio)
{
        const unsigned long pfn = folio_pfn(folio);
        const unsigned int order = folio_order(folio);
        struct kho_mem_track *track = &kho_out.ser.track;

        if (kho_out.finalized)
                return -EBUSY;

        return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
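
/*
 * Illustrative sketch of a KHO client, assuming a hypothetical "myko"
 * subsystem (none of the myko_* names below exist in the kernel). On
 * KEXEC_KHO_FINALIZE the notifier builds a small sub FDT, preserves the folio
 * backing it and publishes it with kho_add_subtree(); after kexec the new
 * kernel retrieves it again via kho_retrieve_subtree() and
 * kho_restore_folio(). A complete client would also react to KEXEC_KHO_ABORT
 * and free the folio again.
 */
static int myko_kho_notifier(struct notifier_block *nb, unsigned long action,
                             void *data)
{
        struct kho_serialization *ser = data;
        struct folio *folio;
        u64 state = 42;         /* whatever needs to survive kexec */
        void *fdt;
        int err = 0;

        if (action != KEXEC_KHO_FINALIZE)
                return NOTIFY_DONE;

        folio = folio_alloc(GFP_KERNEL, 0);
        if (!folio)
                return notifier_from_errno(-ENOMEM);
        fdt = folio_address(folio);

        /* Describe the handed-over state as a tiny FDT blob. */
        err |= fdt_create(fdt, PAGE_SIZE);
        err |= fdt_finish_reservemap(fdt);
        err |= fdt_begin_node(fdt, "");
        err |= fdt_property_string(fdt, "compatible", "myko-v1");
        err |= fdt_property(fdt, "state", &state, sizeof(state));
        err |= fdt_end_node(fdt);
        err |= fdt_finish(fdt);
        if (err) {
                folio_put(folio);
                return notifier_from_errno(-EINVAL);
        }

        /* Keep the blob's memory across kexec and hook it into the root FDT. */
        err = kho_preserve_folio(folio);
        if (!err)
                err = kho_add_subtree(ser, "myko", fdt);
        if (err)
                folio_put(folio);

        return notifier_from_errno(err);
}

static struct notifier_block myko_kho_nb = {
        .notifier_call = myko_kho_notifier,
};

static int __init myko_init(void)
{
        phys_addr_t fdt_phys;
        struct folio *folio;

        /* Restore side: pick up state a previous kernel handed over, if any. */
        if (!kho_retrieve_subtree("myko", &fdt_phys)) {
                folio = kho_restore_folio(fdt_phys);
                if (folio && !fdt_node_check_compatible(folio_address(folio),
                                                        0, "myko-v1")) {
                        /* ... read the "state" property back ... */
                }
        }

        /* Save side: participate in the next handover. */
        return register_kho_notifier(&myko_kho_nb);
}
late_initcall(myko_init);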

/**
 * kho_preserve_phys - preserve a physically contiguous range across kexec.
 * @phys: physical address of the range.
 * @size: size of the range.
 *
 * Instructs KHO to preserve the memory range from @phys to @phys + @size
 * across kexec.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_phys(phys_addr_t phys, size_t size)
{
        unsigned long pfn = PHYS_PFN(phys);
        unsigned long failed_pfn = 0;
        const unsigned long start_pfn = pfn;
        const unsigned long end_pfn = PHYS_PFN(phys + size);
        int err = 0;
        struct kho_mem_track *track = &kho_out.ser.track;

        if (kho_out.finalized)
                return -EBUSY;

        if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
                return -EINVAL;

        /*
         * Greedily split the range into naturally aligned power-of-two
         * blocks, e.g. PFNs [0x1234, 0x1240) become one order-2 and one
         * order-3 block.
         */
        while (pfn < end_pfn) {
                const unsigned int order =
                        min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

                err = __kho_preserve_order(track, pfn, order);
                if (err) {
                        failed_pfn = pfn;
                        break;
                }

                pfn += 1 << order;
        }

        if (err)
                __kho_unpreserve(track, start_pfn, failed_pfn);

        return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_phys);

/* Handling for debugfs/kho/out */

static struct dentry *debugfs_root;

static int kho_out_update_debugfs_fdt(void)
{
        int err = 0;
        struct fdt_debugfs *ff, *tmp;

        if (kho_out.finalized) {
                err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
                                          "fdt", page_to_virt(kho_out.ser.fdt));
        } else {
                list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
                        debugfs_remove(ff->file);
                        list_del(&ff->list);
                        kfree(ff);
                }
        }

        return err;
}

static int kho_abort(void)
{
        int err;
        unsigned long order;
        struct kho_mem_phys *physxa;

        xa_for_each(&kho_out.ser.track.orders, order, physxa) {
                struct kho_mem_phys_bits *bits;
                unsigned long phys;

                xa_for_each(&physxa->phys_bits, phys, bits)
                        kfree(bits);

                xa_destroy(&physxa->phys_bits);
                kfree(physxa);
        }
        xa_destroy(&kho_out.ser.track.orders);

        if (kho_out.ser.preserved_mem_map) {
                kho_mem_ser_free(kho_out.ser.preserved_mem_map);
                kho_out.ser.preserved_mem_map = NULL;
        }

        err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
                                           NULL);
        err = notifier_to_errno(err);

        if (err)
                pr_err("Failed to abort KHO finalization: %d\n", err);

        return err;
}

static int kho_finalize(void)
{
        int err = 0;
        u64 *preserved_mem_map;
        void *fdt = page_to_virt(kho_out.ser.fdt);

        err |= fdt_create(fdt, PAGE_SIZE);
        err |= fdt_finish_reservemap(fdt);
        err |= fdt_begin_node(fdt, "");
        err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
        /*
         * Reserve the preserved-memory-map property in the root FDT, so
         * that all property definitions will precede subnodes created by
         * KHO callers.
         */
        err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
                                        sizeof(*preserved_mem_map),
                                        (void **)&preserved_mem_map);
        if (err)
                goto abort;

        err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
        if (err)
                goto abort;

        err = blocking_notifier_call_chain(&kho_out.chain_head,
                                           KEXEC_KHO_FINALIZE, &kho_out.ser);
        err = notifier_to_errno(err);
        if (err)
                goto abort;

        err = kho_mem_serialize(&kho_out.ser);
        if (err)
                goto abort;

        *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);

        err |= fdt_end_node(fdt);
        err |= fdt_finish(fdt);

abort:
        if (err) {
                pr_err("Failed to convert KHO state tree: %d\n", err);
                kho_abort();
        }

        return err;
}

static int kho_out_finalize_get(void *data, u64 *val)
{
        mutex_lock(&kho_out.lock);
        *val = kho_out.finalized;
        mutex_unlock(&kho_out.lock);

        return 0;
}

static int kho_out_finalize_set(void *data, u64 _val)
{
        int ret = 0;
        bool val = !!_val;

        mutex_lock(&kho_out.lock);

        if (val == kho_out.finalized) {
                if (kho_out.finalized)
                        ret = -EEXIST;
                else
                        ret = -ENOENT;
                goto unlock;
        }

        if (val)
                ret = kho_finalize();
        else
                ret = kho_abort();

        if (ret)
                goto unlock;

        kho_out.finalized = val;
        ret = kho_out_update_debugfs_fdt();

unlock:
        mutex_unlock(&kho_out.lock);
        return ret;
}

DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
                         kho_out_finalize_set, "%llu\n");

static int scratch_phys_show(struct seq_file *m, void *v)
{
        for (int i = 0; i < kho_scratch_cnt; i++)
                seq_printf(m, "0x%llx\n", kho_scratch[i].addr);

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_phys);

static int scratch_len_show(struct seq_file *m, void *v)
{
        for (int i = 0; i < kho_scratch_cnt; i++)
                seq_printf(m, "0x%llx\n", kho_scratch[i].size);

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_len);

static __init int kho_out_debugfs_init(void)
{
        struct dentry *dir, *f, *sub_fdt_dir;

        dir = debugfs_create_dir("out", debugfs_root);
        if (IS_ERR(dir))
                return -ENOMEM;

        sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
        if (IS_ERR(sub_fdt_dir))
                goto err_rmdir;

        f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
                                &scratch_phys_fops);
        if (IS_ERR(f))
                goto err_rmdir;

        f = debugfs_create_file("scratch_len", 0400, dir, NULL,
                                &scratch_len_fops);
        if (IS_ERR(f))
                goto err_rmdir;

        f = debugfs_create_file("finalize", 0600, dir, NULL,
                                &fops_kho_out_finalize);
        if (IS_ERR(f))
                goto err_rmdir;

        kho_out.dir = dir;
        kho_out.ser.sub_fdt_dir = sub_fdt_dir;
        return 0;

err_rmdir:
        debugfs_remove_recursive(dir);
        return -ENOENT;
}

struct kho_in {
        struct dentry *dir;
        phys_addr_t fdt_phys;
        phys_addr_t scratch_phys;
        struct list_head fdt_list;
};

static struct kho_in kho_in = {
        .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
};

static const void *kho_get_fdt(void)
{
        return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
}

/**
 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
 * @name: the name of the sub FDT passed to kho_add_subtree().
 * @phys: if found, the physical address of the sub FDT is stored in @phys.
 *
 * Retrieve a preserved sub FDT named @name and store its physical
 * address in @phys.
 *
 * Return: 0 on success, error code on failure
 */
int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
{
        const void *fdt = kho_get_fdt();
        const u64 *val;
        int offset, len;

        if (!fdt)
                return -ENOENT;

        if (!phys)
                return -EINVAL;

        offset = fdt_subnode_offset(fdt, 0, name);
        if (offset < 0)
                return -ENOENT;

        val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
        if (!val || len != sizeof(*val))
                return -EINVAL;

        *phys = (phys_addr_t)*val;

        return 0;
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);

/* Handling for debugfs/kho/in */

static __init int kho_in_debugfs_init(const void *fdt)
{
        struct dentry *sub_fdt_dir;
        int err, child;

        kho_in.dir = debugfs_create_dir("in", debugfs_root);
        if (IS_ERR(kho_in.dir))
                return PTR_ERR(kho_in.dir);

        sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
        if (IS_ERR(sub_fdt_dir)) {
                err = PTR_ERR(sub_fdt_dir);
                goto err_rmdir;
        }

        err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
        if (err)
                goto err_rmdir;

        fdt_for_each_subnode(child, fdt, 0) {
                int len = 0;
                const char *name = fdt_get_name(fdt, child, NULL);
                const u64 *fdt_phys;

                fdt_phys = fdt_getprop(fdt, child, PROP_SUB_FDT, &len);
                if (!fdt_phys)
                        continue;
                if (len != sizeof(*fdt_phys)) {
                        pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
                                name, len);
                        continue;
                }
                err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
                                          phys_to_virt(*fdt_phys));
                if (err) {
                        pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
                                err);
                        continue;
                }
        }

        return 0;

err_rmdir:
        debugfs_remove_recursive(kho_in.dir);
        return err;
}

static __init int kho_init(void)
{
        int err = 0;
        const void *fdt = kho_get_fdt();

        if (!kho_enable)
                return 0;

        kho_out.ser.fdt = alloc_page(GFP_KERNEL);
        if (!kho_out.ser.fdt) {
                err = -ENOMEM;
                goto err_free_scratch;
        }

        debugfs_root = debugfs_create_dir("kho", NULL);
        if (IS_ERR(debugfs_root)) {
                err = -ENOENT;
                goto err_free_fdt;
        }

        err = kho_out_debugfs_init();
        if (err)
                goto err_free_fdt;

        if (fdt) {
                err = kho_in_debugfs_init(fdt);
                /*
                 * Failure to create /sys/kernel/debug/kho/in does not prevent
                 * reviving state from KHO and setting up KHO for the next
                 * kexec.
                 */
                if (err)
                        pr_err("failed exposing handover FDT in debugfs: %d\n",
                               err);

                return 0;
        }

        for (int i = 0; i < kho_scratch_cnt; i++) {
                unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
                unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
                unsigned long pfn;

                for (pfn = base_pfn; pfn < base_pfn + count;
                     pfn += pageblock_nr_pages)
                        init_cma_reserved_pageblock(pfn_to_page(pfn));
        }

        return 0;

err_free_fdt:
        put_page(kho_out.ser.fdt);
        kho_out.ser.fdt = NULL;
err_free_scratch:
        for (int i = 0; i < kho_scratch_cnt; i++) {
                void *start = __va(kho_scratch[i].addr);
                void *end = start + kho_scratch[i].size;

                free_reserved_area(start, end, -1, "");
        }
        kho_enable = false;
        return err;
}
late_initcall(kho_init);

static void __init kho_release_scratch(void)
{
        phys_addr_t start, end;
        u64 i;

        memmap_init_kho_scratch_pages();

        /*
         * Mark scratch mem as CMA before we return it. That way we
         * ensure that no kernel allocations happen on it. That means
         * we can reuse it as scratch memory again later.
         */
        __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
                             MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
                ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
                ulong end_pfn = pageblock_align(PFN_UP(end));
                ulong pfn;

                for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
                        init_pageblock_migratetype(pfn_to_page(pfn),
                                                   MIGRATE_CMA, false);
        }
}

void __init kho_memory_init(void)
{
        struct folio *folio;

        if (kho_in.scratch_phys) {
                kho_scratch = phys_to_virt(kho_in.scratch_phys);
                kho_release_scratch();

                kho_mem_deserialize(kho_get_fdt());
                folio = kho_restore_folio(kho_in.fdt_phys);
                if (!folio)
                        pr_warn("failed to restore folio for KHO fdt\n");
        } else {
                kho_reserve_scratch();
        }
}

void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
                         phys_addr_t scratch_phys, u64 scratch_len)
{
        void *fdt = NULL;
        struct kho_scratch *scratch = NULL;
        int err = 0;
        unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);

        /* Validate the input FDT */
        fdt = early_memremap(fdt_phys, fdt_len);
        if (!fdt) {
                pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
                err = -EFAULT;
                goto out;
        }
        err = fdt_check_header(fdt);
        if (err) {
                pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
                        fdt_phys, err);
                err = -EINVAL;
                goto out;
        }
        err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
        if (err) {
                pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
                        fdt_phys, KHO_FDT_COMPATIBLE, err);
                err = -EINVAL;
                goto out;
        }

        scratch = early_memremap(scratch_phys, scratch_len);
        if (!scratch) {
                pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%llu)\n",
                        scratch_phys, scratch_len);
                err = -EFAULT;
                goto out;
        }

        /*
         * The previous kernel passed us safe, contiguous blocks of memory to
         * use for early boot purposes, so that we can resize the memblock
         * array as needed.
         */
        for (int i = 0; i < scratch_cnt; i++) {
                struct kho_scratch *area = &scratch[i];
                u64 size = area->size;

                memblock_add(area->addr, size);
                err = memblock_mark_kho_scratch(area->addr, size);
                if (WARN_ON(err)) {
                        pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d\n",
                                &area->addr, &size, err);
                        goto out;
                }
                pr_debug("Marked 0x%pa+0x%pa as scratch\n", &area->addr, &size);
        }

        memblock_reserve(scratch_phys, scratch_len);

        /*
         * Now that we have a viable region of scratch memory, let's tell
         * the memblock allocator to only use that for any allocations.
         * That way we ensure that nothing scribbles over in-use data while
         * we initialize the page tables, which we will need to ingest all
         * memory reservations from the previous kernel.
         */
        memblock_set_kho_scratch_only();

        kho_in.fdt_phys = fdt_phys;
        kho_in.scratch_phys = scratch_phys;
        kho_scratch_cnt = scratch_cnt;
        pr_info("found kexec handover data. Will skip init for some devices\n");

out:
        if (fdt)
                early_memunmap(fdt, fdt_len);
        if (scratch)
                early_memunmap(scratch, scratch_len);
        if (err)
                pr_warn("disabling KHO revival: %d\n", err);
}

/* Helper functions for kexec_file_load */

int kho_fill_kimage(struct kimage *image)
{
        ssize_t scratch_size;
        int err = 0;
        struct kexec_buf scratch;

        if (!kho_enable)
                return 0;

        image->kho.fdt = page_to_phys(kho_out.ser.fdt);

        scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
        scratch = (struct kexec_buf){
                .image = image,
                .buffer = kho_scratch,
                .bufsz = scratch_size,
                .mem = KEXEC_BUF_MEM_UNKNOWN,
                .memsz = scratch_size,
                .buf_align = SZ_64K, /* Makes it easier to map */
                .buf_max = ULONG_MAX,
                .top_down = true,
        };
        err = kexec_add_buffer(&scratch);
        if (err)
                return err;
        image->kho.scratch = &image->segment[image->nr_segments - 1];

        return 0;
}

static int kho_walk_scratch(struct kexec_buf *kbuf,
                            int (*func)(struct resource *, void *))
{
        int ret = 0;
        int i;

        for (i = 0; i < kho_scratch_cnt; i++) {
                struct resource res = {
                        .start = kho_scratch[i].addr,
                        .end = kho_scratch[i].addr + kho_scratch[i].size - 1,
                };

                /* Try to fit the kimage into our KHO scratch region */
                ret = func(&res, kbuf);
                if (ret)
                        break;
        }

        return ret;
}

int kho_locate_mem_hole(struct kexec_buf *kbuf,
                        int (*func)(struct resource *, void *))
{
        int ret;

        if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
                return 1;

        ret = kho_walk_scratch(kbuf, func);

        return ret == 1 ? 0 : -EADDRNOTAVAIL;
}