// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec_handover.c - kexec handover metadata processing
 * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
 * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
 * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
 */

#define pr_fmt(fmt) "KHO: " fmt

#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/debugfs.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/page-isolation.h>

#include <asm/early_ioremap.h>

/*
 * KHO is tightly coupled with mm init and needs access to some of mm
 * internal APIs.
 */
#include "../mm/internal.h"
#include "kexec_internal.h"

#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
#define PROP_SUB_FDT "fdt"

#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */

/*
 * KHO uses page->private, which is an unsigned long, to store page metadata.
 * Use it to store both the magic and the order.
 */
union kho_page_info {
	unsigned long page_private;
	struct {
		unsigned int order;
		unsigned int magic;
	};
};

static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));

static bool kho_enable __ro_after_init;

bool kho_is_enabled(void)
{
	return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);

static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);

/*
 * Keep track of memory that is to be preserved across KHO.
 *
 * The serializing side uses two levels of xarrays to manage chunks of per-order
 * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
 * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
 * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
 * 512K of bitmap memory will be needed for order 0.
 *
 * This approach is fully incremental: as the serialization progresses, folios
 * can continue to be aggregated into the tracker. The final step, immediately
 * prior to kexec, serializes the xarray information into a linked list for the
 * successor kernel to parse.
 */

#define PRESERVE_BITS (512 * 8)
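/*
 * Index math used by the tracker (illustrative): a PFN is first shifted right
 * by its order; (pfn >> order) / PRESERVE_BITS then selects the bitmap in that
 * order's xarray and (pfn >> order) % PRESERVE_BITS the bit within it. With
 * PAGE_SIZE == 4096, preserving the order-0 page at PFN 0x12345 therefore sets
 * bit 0x345 in the bitmap stored at index 0x12.
 */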
struct kho_mem_phys_bits {
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
	 * to order.
	 */
	struct xarray phys_bits;
};

struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};

struct khoser_mem_chunk;

struct kho_serialization {
	struct page *fdt;
	struct list_head fdt_list;
	struct dentry *sub_fdt_dir;
	struct kho_mem_track track;
	/* First chunk of serialized preserved memory map */
	struct khoser_mem_chunk *preserved_mem_map;
};

static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
{
	void *elm, *res;

	elm = xa_load(xa, index);
	if (elm)
		return elm;

	elm = kzalloc(sz, GFP_KERNEL);
	if (!elm)
		return ERR_PTR(-ENOMEM);

	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
	if (xa_is_err(res))
		res = ERR_PTR(xa_err(res));

	if (res) {
		kfree(elm);
		return res;
	}

	return elm;
}

static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
			     unsigned long end_pfn)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		const unsigned long pfn_high = pfn >> order;

		physxa = xa_load(&track->orders, order);
		if (!physxa)
			goto next_block;

		bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
		if (!bits)
			goto next_block;

		clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);

next_block:
		/* Always advance, even if this block was never tracked. */
		pfn += 1 << order;
	}
}

static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
				unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa, *new_physxa;
	const unsigned long pfn_high = pfn >> order;

	might_sleep();

	physxa = xa_load(&track->orders, order);
	if (!physxa) {
		int err;

		new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
		if (!new_physxa)
			return -ENOMEM;

		xa_init(&new_physxa->phys_bits);
		physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
				    GFP_KERNEL);

		err = xa_err(physxa);
		if (err || physxa) {
			xa_destroy(&new_physxa->phys_bits);
			kfree(new_physxa);

			if (err)
				return err;
		} else {
			physxa = new_physxa;
		}
	}

	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
				sizeof(*bits));
	if (IS_ERR(bits))
		return PTR_ERR(bits);

	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);

	return 0;
}
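/*
 * Rebuild the struct page state of a page (or a higher-order block of pages)
 * that the previous kernel preserved. Intended to be called at most once per
 * preserved address: clearing page->private below makes any repeated restore
 * fail the magic check.
 */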
static struct page *kho_restore_page(phys_addr_t phys)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	union kho_page_info info;
	unsigned int nr_pages;

	if (!page)
		return NULL;

	info.page_private = page->private;
	/*
	 * deserialize_bitmap() only sets the magic on the head page. This magic
	 * check also implicitly makes sure phys is order-aligned since for
	 * non-order-aligned phys addresses, magic will never be set.
	 */
	if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER))
		return NULL;
	nr_pages = (1 << info.order);

	/* Clear private to make sure later restores on this page error out. */
	page->private = 0;
	/* Head page gets refcount of 1. */
	set_page_count(page, 1);

	/* For higher order folios, tail pages get a page count of zero. */
	for (unsigned int i = 1; i < nr_pages; i++)
		set_page_count(page + i, 0);

	if (info.order > 0)
		prep_compound_page(page, info.order);

	adjust_managed_page_count(page, nr_pages);
	return page;
}

/**
 * kho_restore_folio - recreates the folio from the preserved memory.
 * @phys: physical address of the folio.
 *
 * Return: pointer to the struct folio on success, NULL on failure.
 */
struct folio *kho_restore_folio(phys_addr_t phys)
{
	struct page *page = kho_restore_page(phys);

	return page ? page_folio(page) : NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_folio);

/* Serialize and deserialize struct kho_mem_phys across kexec
 *
 * Record all the bitmaps in a linked list of pages for the next kernel to
 * process. Each chunk holds bitmaps of the same order and each block of bitmaps
 * starts at a given physical address. This allows the bitmaps to be sparse. The
 * xarray is used to store them in a tree while building up the data structure,
 * but the KHO successor kernel only needs to process them once in order.
 *
 * All of this memory is normal kmalloc() memory and is not marked for
 * preservation. The successor kernel will remain isolated to the scratch space
 * until it completes processing this list. Once processed, all the memory
 * storing these ranges will be marked as free.
 */

struct khoser_mem_bitmap_ptr {
	phys_addr_t phys_start;
	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
};

struct khoser_mem_chunk_hdr {
	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
	unsigned int order;
	unsigned int num_elms;
};

#define KHOSER_BITMAP_SIZE \
	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
	 sizeof(struct khoser_mem_bitmap_ptr))

struct khoser_mem_chunk {
	struct khoser_mem_chunk_hdr hdr;
	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
};

static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);

static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
					  unsigned long order)
{
	struct khoser_mem_chunk *chunk;

	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!chunk)
		return NULL;
	chunk->hdr.order = order;
	if (cur_chunk)
		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
	return chunk;
}

static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
{
	struct khoser_mem_chunk *chunk = first_chunk;

	while (chunk) {
		struct khoser_mem_chunk *tmp = chunk;

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		kfree(tmp);
	}
}

static int kho_mem_serialize(struct kho_serialization *ser)
{
	struct khoser_mem_chunk *first_chunk = NULL;
	struct khoser_mem_chunk *chunk = NULL;
	struct kho_mem_phys *physxa;
	unsigned long order;

	xa_for_each(&ser->track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		chunk = new_chunk(chunk, order);
		if (!chunk)
			goto err_free;

		if (!first_chunk)
			first_chunk = chunk;

		xa_for_each(&physxa->phys_bits, phys, bits) {
			struct khoser_mem_bitmap_ptr *elm;

			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
				chunk = new_chunk(chunk, order);
				if (!chunk)
					goto err_free;
			}

			elm = &chunk->bitmaps[chunk->hdr.num_elms];
			chunk->hdr.num_elms++;
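			/*
			 * Recover the physical address covered by bit 0 of
			 * this bitmap from its xarray index: the index counts
			 * groups of PRESERVE_BITS order-sized blocks.
			 */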
			elm->phys_start = (phys * PRESERVE_BITS)
					  << (order + PAGE_SHIFT);
			KHOSER_STORE_PTR(elm->bitmap, bits);
		}
	}

	ser->preserved_mem_map = first_chunk;

	return 0;

err_free:
	kho_mem_ser_free(first_chunk);
	return -ENOMEM;
}

static void __init deserialize_bitmap(unsigned int order,
				      struct khoser_mem_bitmap_ptr *elm)
{
	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
	unsigned long bit;

	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
		int sz = 1 << (order + PAGE_SHIFT);
		phys_addr_t phys =
			elm->phys_start + (bit << (order + PAGE_SHIFT));
		struct page *page = phys_to_page(phys);
		union kho_page_info info;

		memblock_reserve(phys, sz);
		memblock_reserved_mark_noinit(phys, sz);
		info.magic = KHO_PAGE_MAGIC;
		info.order = order;
		page->private = info.page_private;
	}
}

static void __init kho_mem_deserialize(const void *fdt)
{
	struct khoser_mem_chunk *chunk;
	const phys_addr_t *mem;
	int len;

	mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);

	if (!mem || len != sizeof(*mem)) {
		pr_err("failed to get preserved memory bitmaps\n");
		return;
	}

	chunk = *mem ? phys_to_virt(*mem) : NULL;
	while (chunk) {
		unsigned int i;

		for (i = 0; i != chunk->hdr.num_elms; i++)
			deserialize_bitmap(chunk->hdr.order,
					   &chunk->bitmaps[i]);
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
	}
}

/*
 * With KHO enabled, memory can become fragmented because KHO regions may
 * be anywhere in physical address space. The scratch regions give us safe
 * zones that we will never see KHO allocations from. This is where we
 * can later safely load our new kexec images into and then use the scratch
 * area for early allocations that happen before the page allocator is
 * initialized.
 */
static struct kho_scratch *kho_scratch;
static unsigned int kho_scratch_cnt;

/*
 * The scratch areas are scaled by default as percent of memory allocated from
 * memblock. A user can override the scale with command line parameter:
 *
 * kho_scratch=N%
 *
 * It is also possible to explicitly define sizes for the lowmem, global and
 * per-node scratch areas:
 *
 * kho_scratch=l[KMG],n[KMG],m[KMG]
 *
 * The explicit size definition takes precedence over the scale definition.
 */
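/*
 * Example (illustrative values): "kho_scratch=512M,1G,256M" requests a 512M
 * lowmem area, a 1G global area and 256M per NUMA node, while
 * "kho_scratch=30%" scales all areas to 30% of the memory memblock has
 * allocated for the kernel.
 */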
425 */ 426 static unsigned int scratch_scale __initdata = 200; 427 static phys_addr_t scratch_size_global __initdata; 428 static phys_addr_t scratch_size_pernode __initdata; 429 static phys_addr_t scratch_size_lowmem __initdata; 430 431 static int __init kho_parse_scratch_size(char *p) 432 { 433 size_t len; 434 unsigned long sizes[3]; 435 size_t total_size = 0; 436 int i; 437 438 if (!p) 439 return -EINVAL; 440 441 len = strlen(p); 442 if (!len) 443 return -EINVAL; 444 445 /* parse nn% */ 446 if (p[len - 1] == '%') { 447 /* unsigned int max is 4,294,967,295, 10 chars */ 448 char s_scale[11] = {}; 449 int ret = 0; 450 451 if (len > ARRAY_SIZE(s_scale)) 452 return -EINVAL; 453 454 memcpy(s_scale, p, len - 1); 455 ret = kstrtouint(s_scale, 10, &scratch_scale); 456 if (!ret) 457 pr_notice("scratch scale is %d%%\n", scratch_scale); 458 return ret; 459 } 460 461 /* parse ll[KMG],mm[KMG],nn[KMG] */ 462 for (i = 0; i < ARRAY_SIZE(sizes); i++) { 463 char *endp = p; 464 465 if (i > 0) { 466 if (*p != ',') 467 return -EINVAL; 468 p += 1; 469 } 470 471 sizes[i] = memparse(p, &endp); 472 if (endp == p) 473 return -EINVAL; 474 p = endp; 475 total_size += sizes[i]; 476 } 477 478 if (!total_size) 479 return -EINVAL; 480 481 /* The string should be fully consumed by now. */ 482 if (*p) 483 return -EINVAL; 484 485 scratch_size_lowmem = sizes[0]; 486 scratch_size_global = sizes[1]; 487 scratch_size_pernode = sizes[2]; 488 scratch_scale = 0; 489 490 pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n", 491 (u64)(scratch_size_lowmem >> 20), 492 (u64)(scratch_size_global >> 20), 493 (u64)(scratch_size_pernode >> 20)); 494 495 return 0; 496 } 497 early_param("kho_scratch", kho_parse_scratch_size); 498 499 static void __init scratch_size_update(void) 500 { 501 phys_addr_t size; 502 503 if (!scratch_scale) 504 return; 505 506 size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT, 507 NUMA_NO_NODE); 508 size = size * scratch_scale / 100; 509 scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES); 510 511 size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, 512 NUMA_NO_NODE); 513 size = size * scratch_scale / 100 - scratch_size_lowmem; 514 scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES); 515 } 516 517 static phys_addr_t __init scratch_size_node(int nid) 518 { 519 phys_addr_t size; 520 521 if (scratch_scale) { 522 size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, 523 nid); 524 size = size * scratch_scale / 100; 525 } else { 526 size = scratch_size_pernode; 527 } 528 529 return round_up(size, CMA_MIN_ALIGNMENT_BYTES); 530 } 531 532 /** 533 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec 534 * 535 * With KHO we can preserve arbitrary pages in the system. To ensure we still 536 * have a large contiguous region of memory when we search the physical address 537 * space for target memory, let's make sure we always have a large CMA region 538 * active. This CMA region will only be used for movable pages which are not a 539 * problem for us during KHO because we can just move them somewhere else. 
540 */ 541 static void __init kho_reserve_scratch(void) 542 { 543 phys_addr_t addr, size; 544 int nid, i = 0; 545 546 if (!kho_enable) 547 return; 548 549 scratch_size_update(); 550 551 /* FIXME: deal with node hot-plug/remove */ 552 kho_scratch_cnt = num_online_nodes() + 2; 553 size = kho_scratch_cnt * sizeof(*kho_scratch); 554 kho_scratch = memblock_alloc(size, PAGE_SIZE); 555 if (!kho_scratch) 556 goto err_disable_kho; 557 558 /* 559 * reserve scratch area in low memory for lowmem allocations in the 560 * next kernel 561 */ 562 size = scratch_size_lowmem; 563 addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0, 564 ARCH_LOW_ADDRESS_LIMIT); 565 if (!addr) 566 goto err_free_scratch_desc; 567 568 kho_scratch[i].addr = addr; 569 kho_scratch[i].size = size; 570 i++; 571 572 /* reserve large contiguous area for allocations without nid */ 573 size = scratch_size_global; 574 addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES); 575 if (!addr) 576 goto err_free_scratch_areas; 577 578 kho_scratch[i].addr = addr; 579 kho_scratch[i].size = size; 580 i++; 581 582 for_each_online_node(nid) { 583 size = scratch_size_node(nid); 584 addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES, 585 0, MEMBLOCK_ALLOC_ACCESSIBLE, 586 nid, true); 587 if (!addr) 588 goto err_free_scratch_areas; 589 590 kho_scratch[i].addr = addr; 591 kho_scratch[i].size = size; 592 i++; 593 } 594 595 return; 596 597 err_free_scratch_areas: 598 for (i--; i >= 0; i--) 599 memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size); 600 err_free_scratch_desc: 601 memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch)); 602 err_disable_kho: 603 pr_warn("Failed to reserve scratch area, disabling kexec handover\n"); 604 kho_enable = false; 605 } 606 607 struct fdt_debugfs { 608 struct list_head list; 609 struct debugfs_blob_wrapper wrapper; 610 struct dentry *file; 611 }; 612 613 static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, 614 const char *name, const void *fdt) 615 { 616 struct fdt_debugfs *f; 617 struct dentry *file; 618 619 f = kmalloc(sizeof(*f), GFP_KERNEL); 620 if (!f) 621 return -ENOMEM; 622 623 f->wrapper.data = (void *)fdt; 624 f->wrapper.size = fdt_totalsize(fdt); 625 626 file = debugfs_create_blob(name, 0400, dir, &f->wrapper); 627 if (IS_ERR(file)) { 628 kfree(f); 629 return PTR_ERR(file); 630 } 631 632 f->file = file; 633 list_add(&f->list, list); 634 635 return 0; 636 } 637 638 /** 639 * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. 640 * @ser: serialization control object passed by KHO notifiers. 641 * @name: name of the sub tree. 642 * @fdt: the sub tree blob. 643 * 644 * Creates a new child node named @name in KHO root FDT and records 645 * the physical address of @fdt. The pages of @fdt must also be preserved 646 * by KHO for the new kernel to retrieve it after kexec. 647 * 648 * A debugfs blob entry is also created at 649 * ``/sys/kernel/debug/kho/out/sub_fdts/@name``. 
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
{
	int err = 0;
	u64 phys = (u64)virt_to_phys(fdt);
	void *root = page_to_virt(ser->fdt);

	err |= fdt_begin_node(root, name);
	err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
	err |= fdt_end_node(root);

	if (err)
		return err;

	return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
}
EXPORT_SYMBOL_GPL(kho_add_subtree);

struct kho_out {
	struct blocking_notifier_head chain_head;

	struct dentry *dir;

	struct mutex lock; /* protects KHO FDT finalization */

	struct kho_serialization ser;
	bool finalized;
};

static struct kho_out kho_out = {
	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.ser = {
		.fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
		.track = {
			.orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
		},
	},
	.finalized = false,
};

int register_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(register_kho_notifier);

int unregister_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(unregister_kho_notifier);

/**
 * kho_preserve_folio - preserve a folio across kexec.
 * @folio: folio to preserve.
 *
 * Instructs KHO to preserve the whole folio across kexec. The order
 * will be preserved as well.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.ser.track;

	if (kho_out.finalized)
		return -EBUSY;

	return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);

/**
 * kho_preserve_phys - preserve a physically contiguous range across kexec.
 * @phys: physical address of the range.
 * @size: size of the range.
 *
 * Instructs KHO to preserve the memory range from @phys to @phys + @size
 * across kexec.
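 *
 * Both @phys and @size must be page aligned. Internally the range is split
 * into the largest naturally aligned blocks that fit, so a large range costs
 * one tracker update per block rather than one per page.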
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_phys(phys_addr_t phys, size_t size)
{
	unsigned long pfn = PHYS_PFN(phys);
	unsigned long failed_pfn = 0;
	const unsigned long start_pfn = pfn;
	const unsigned long end_pfn = PHYS_PFN(phys + size);
	int err = 0;
	struct kho_mem_track *track = &kho_out.ser.track;

	if (kho_out.finalized)
		return -EBUSY;

	if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
		return -EINVAL;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		err = __kho_preserve_order(track, pfn, order);
		if (err) {
			failed_pfn = pfn;
			break;
		}

		pfn += 1 << order;
	}

	if (err)
		__kho_unpreserve(track, start_pfn, failed_pfn);

	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_phys);

/* Handling for debugfs/kho/out */

static struct dentry *debugfs_root;

static int kho_out_update_debugfs_fdt(void)
{
	int err = 0;
	struct fdt_debugfs *ff, *tmp;

	if (kho_out.finalized) {
		err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
					  "fdt", page_to_virt(kho_out.ser.fdt));
	} else {
		list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
			debugfs_remove(ff->file);
			list_del(&ff->list);
			kfree(ff);
		}
	}

	return err;
}

static int kho_abort(void)
{
	int err;
	unsigned long order;
	struct kho_mem_phys *physxa;

	xa_for_each(&kho_out.ser.track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		xa_for_each(&physxa->phys_bits, phys, bits)
			kfree(bits);

		xa_destroy(&physxa->phys_bits);
		kfree(physxa);
	}
	xa_destroy(&kho_out.ser.track.orders);

	if (kho_out.ser.preserved_mem_map) {
		kho_mem_ser_free(kho_out.ser.preserved_mem_map);
		kho_out.ser.preserved_mem_map = NULL;
	}

	err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
					   NULL);
	err = notifier_to_errno(err);

	if (err)
		pr_err("Failed to abort KHO finalization: %d\n", err);

	return err;
}

static int kho_finalize(void)
{
	int err = 0;
	u64 *preserved_mem_map;
	void *fdt = page_to_virt(kho_out.ser.fdt);

	err |= fdt_create(fdt, PAGE_SIZE);
	err |= fdt_finish_reservemap(fdt);
	err |= fdt_begin_node(fdt, "");
	err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
	/*
	 * Reserve the preserved-memory-map property in the root FDT, so
	 * that all property definitions will precede subnodes created by
	 * KHO callers.
	 */
	err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
					sizeof(*preserved_mem_map),
					(void **)&preserved_mem_map);
	if (err)
		goto abort;

	err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
	if (err)
		goto abort;

	err = blocking_notifier_call_chain(&kho_out.chain_head,
					   KEXEC_KHO_FINALIZE, &kho_out.ser);
	err = notifier_to_errno(err);
	if (err)
		goto abort;

	err = kho_mem_serialize(&kho_out.ser);
	if (err)
		goto abort;

	*preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);

	err |= fdt_end_node(fdt);
	err |= fdt_finish(fdt);

abort:
	if (err) {
		pr_err("Failed to convert KHO state tree: %d\n", err);
		kho_abort();
	}

	return err;
}

static int kho_out_finalize_get(void *data, u64 *val)
{
	mutex_lock(&kho_out.lock);
	*val = kho_out.finalized;
	mutex_unlock(&kho_out.lock);

	return 0;
}

static int kho_out_finalize_set(void *data, u64 _val)
{
	int ret = 0;
	bool val = !!_val;

	mutex_lock(&kho_out.lock);

	if (val == kho_out.finalized) {
		if (kho_out.finalized)
			ret = -EEXIST;
		else
			ret = -ENOENT;
		goto unlock;
	}

	if (val)
		ret = kho_finalize();
	else
		ret = kho_abort();

	if (ret)
		goto unlock;

	kho_out.finalized = val;
	ret = kho_out_update_debugfs_fdt();

unlock:
	mutex_unlock(&kho_out.lock);
	return ret;
}

DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
			 kho_out_finalize_set, "%llu\n");

static int scratch_phys_show(struct seq_file *m, void *v)
{
	for (int i = 0; i < kho_scratch_cnt; i++)
		seq_printf(m, "0x%llx\n", kho_scratch[i].addr);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_phys);

static int scratch_len_show(struct seq_file *m, void *v)
{
	for (int i = 0; i < kho_scratch_cnt; i++)
		seq_printf(m, "0x%llx\n", kho_scratch[i].size);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_len);

static __init int kho_out_debugfs_init(void)
{
	struct dentry *dir, *f, *sub_fdt_dir;

	dir = debugfs_create_dir("out", debugfs_root);
	if (IS_ERR(dir))
		return -ENOMEM;

	sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
	if (IS_ERR(sub_fdt_dir))
		goto err_rmdir;

	f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
				&scratch_phys_fops);
	if (IS_ERR(f))
		goto err_rmdir;

	f = debugfs_create_file("scratch_len", 0400, dir, NULL,
				&scratch_len_fops);
	if (IS_ERR(f))
		goto err_rmdir;

	f = debugfs_create_file("finalize", 0600, dir, NULL,
				&fops_kho_out_finalize);
	if (IS_ERR(f))
		goto err_rmdir;

	kho_out.dir = dir;
	kho_out.ser.sub_fdt_dir = sub_fdt_dir;
	return 0;

err_rmdir:
	debugfs_remove_recursive(dir);
	return -ENOENT;
}

struct kho_in {
	struct dentry *dir;
	phys_addr_t fdt_phys;
	phys_addr_t scratch_phys;
	struct list_head fdt_list;
};

static struct kho_in kho_in = {
	.fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
};

static const void *kho_get_fdt(void)
{
	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
}

/**
 * is_kho_boot - check if current kernel was booted via KHO-enabled
 * kexec
 *
 * This function checks if the current kernel was loaded through a kexec
 * operation with KHO enabled, by verifying that a valid KHO FDT
 * was passed.
 *
 * Note: This function returns reliable results only after
 * kho_populate() has been called during early boot. Before that,
 * it may return false even if KHO data is present.
 *
 * Return: true if booted via KHO-enabled kexec, false otherwise
 */
bool is_kho_boot(void)
{
	return !!kho_get_fdt();
}
EXPORT_SYMBOL_GPL(is_kho_boot);

/**
 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
 * @name: the name of the sub FDT passed to kho_add_subtree().
 * @phys: if found, the physical address of the sub FDT is stored in @phys.
 *
 * Retrieve a preserved sub FDT named @name and store its physical
 * address in @phys.
 *
 * Return: 0 on success, error code on failure
 */
int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
{
	const void *fdt = kho_get_fdt();
	const u64 *val;
	int offset, len;

	if (!fdt)
		return -ENOENT;

	if (!phys)
		return -EINVAL;

	offset = fdt_subnode_offset(fdt, 0, name);
	if (offset < 0)
		return -ENOENT;

	val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
	if (!val || len != sizeof(*val))
		return -EINVAL;

	*phys = (phys_addr_t)*val;

	return 0;
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);

/* Handling for debugfs/kho/in */

static __init int kho_in_debugfs_init(const void *fdt)
{
	struct dentry *sub_fdt_dir;
	int err, child;

	kho_in.dir = debugfs_create_dir("in", debugfs_root);
	if (IS_ERR(kho_in.dir))
		return PTR_ERR(kho_in.dir);

	sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
	if (IS_ERR(sub_fdt_dir)) {
		err = PTR_ERR(sub_fdt_dir);
		goto err_rmdir;
	}

	err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
	if (err)
		goto err_rmdir;

	fdt_for_each_subnode(child, fdt, 0) {
		int len = 0;
		const char *name = fdt_get_name(fdt, child, NULL);
		const u64 *fdt_phys;

		fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
		if (!fdt_phys)
			continue;
		if (len != sizeof(*fdt_phys)) {
			pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
				name, len);
			continue;
		}
		err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
					  phys_to_virt(*fdt_phys));
		if (err) {
			pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
				err);
			continue;
		}
	}

	return 0;

err_rmdir:
	debugfs_remove_recursive(kho_in.dir);
	return err;
}

static __init int kho_init(void)
{
	int err = 0;
	const void *fdt = kho_get_fdt();

	if (!kho_enable)
		return 0;

	kho_out.ser.fdt = alloc_page(GFP_KERNEL);
	if (!kho_out.ser.fdt) {
		err = -ENOMEM;
		goto err_free_scratch;
	}

	debugfs_root = debugfs_create_dir("kho", NULL);
	if (IS_ERR(debugfs_root)) {
		err = -ENOENT;
		goto err_free_fdt;
	}

	err = kho_out_debugfs_init();
	if (err)
		goto err_free_fdt;

	if (fdt) {
		err = kho_in_debugfs_init(fdt);
		/*
		 * Failure to create /sys/kernel/debug/kho/in does not prevent
		 * reviving state from KHO and setting up KHO for the next
		 * kexec.
		 */
		if (err)
			pr_err("failed exposing handover FDT in debugfs: %d\n",
			       err);

		return 0;
	}

	for (int i = 0; i < kho_scratch_cnt; i++) {
		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
		unsigned long pfn;

		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}

	return 0;

err_free_fdt:
	put_page(kho_out.ser.fdt);
	kho_out.ser.fdt = NULL;
err_free_scratch:
	for (int i = 0; i < kho_scratch_cnt; i++) {
		void *start = __va(kho_scratch[i].addr);
		void *end = start + kho_scratch[i].size;

		free_reserved_area(start, end, -1, "");
	}
	kho_enable = false;
	return err;
}
late_initcall(kho_init);

static void __init kho_release_scratch(void)
{
	phys_addr_t start, end;
	u64 i;

	memmap_init_kho_scratch_pages();

	/*
	 * Mark scratch mem as CMA before we return it. That way we
	 * ensure that no kernel allocations happen on it. That means
	 * we can reuse it as scratch memory again later.
	 */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
		ulong end_pfn = pageblock_align(PFN_UP(end));
		ulong pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
			init_pageblock_migratetype(pfn_to_page(pfn),
						   MIGRATE_CMA, false);
	}
}

void __init kho_memory_init(void)
{
	struct folio *folio;

	if (kho_in.scratch_phys) {
		kho_scratch = phys_to_virt(kho_in.scratch_phys);
		kho_release_scratch();

		kho_mem_deserialize(kho_get_fdt());
		folio = kho_restore_folio(kho_in.fdt_phys);
		if (!folio)
			pr_warn("failed to restore folio for KHO fdt\n");
	} else {
		kho_reserve_scratch();
	}
}

void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
			 phys_addr_t scratch_phys, u64 scratch_len)
{
	void *fdt = NULL;
	struct kho_scratch *scratch = NULL;
	int err = 0;
	unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);

	/* Validate the input FDT */
	fdt = early_memremap(fdt_phys, fdt_len);
	if (!fdt) {
		pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
		err = -EFAULT;
		goto out;
	}
	err = fdt_check_header(fdt);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
			fdt_phys, err);
		err = -EINVAL;
		goto out;
	}
	err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
			fdt_phys, KHO_FDT_COMPATIBLE, err);
		err = -EINVAL;
		goto out;
	}

	scratch = early_memremap(scratch_phys, scratch_len);
	if (!scratch) {
		pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
			scratch_phys, scratch_len);
		err = -EFAULT;
		goto out;
	}

	/*
	 * The previous kernel passed us safe, contiguous blocks of memory to
	 * use for early boot purposes, so that we can resize the memblock
	 * array as needed.
1244 */ 1245 for (int i = 0; i < scratch_cnt; i++) { 1246 struct kho_scratch *area = &scratch[i]; 1247 u64 size = area->size; 1248 1249 memblock_add(area->addr, size); 1250 err = memblock_mark_kho_scratch(area->addr, size); 1251 if (WARN_ON(err)) { 1252 pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d", 1253 &area->addr, &size, err); 1254 goto out; 1255 } 1256 pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); 1257 } 1258 1259 memblock_reserve(scratch_phys, scratch_len); 1260 1261 /* 1262 * Now that we have a viable region of scratch memory, let's tell 1263 * the memblocks allocator to only use that for any allocations. 1264 * That way we ensure that nothing scribbles over in use data while 1265 * we initialize the page tables which we will need to ingest all 1266 * memory reservations from the previous kernel. 1267 */ 1268 memblock_set_kho_scratch_only(); 1269 1270 kho_in.fdt_phys = fdt_phys; 1271 kho_in.scratch_phys = scratch_phys; 1272 kho_scratch_cnt = scratch_cnt; 1273 pr_info("found kexec handover data. Will skip init for some devices\n"); 1274 1275 out: 1276 if (fdt) 1277 early_memunmap(fdt, fdt_len); 1278 if (scratch) 1279 early_memunmap(scratch, scratch_len); 1280 if (err) 1281 pr_warn("disabling KHO revival: %d\n", err); 1282 } 1283 1284 /* Helper functions for kexec_file_load */ 1285 1286 int kho_fill_kimage(struct kimage *image) 1287 { 1288 ssize_t scratch_size; 1289 int err = 0; 1290 struct kexec_buf scratch; 1291 1292 if (!kho_out.finalized) 1293 return 0; 1294 1295 image->kho.fdt = page_to_phys(kho_out.ser.fdt); 1296 1297 scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; 1298 scratch = (struct kexec_buf){ 1299 .image = image, 1300 .buffer = kho_scratch, 1301 .bufsz = scratch_size, 1302 .mem = KEXEC_BUF_MEM_UNKNOWN, 1303 .memsz = scratch_size, 1304 .buf_align = SZ_64K, /* Makes it easier to map */ 1305 .buf_max = ULONG_MAX, 1306 .top_down = true, 1307 }; 1308 err = kexec_add_buffer(&scratch); 1309 if (err) 1310 return err; 1311 image->kho.scratch = &image->segment[image->nr_segments - 1]; 1312 1313 return 0; 1314 } 1315 1316 static int kho_walk_scratch(struct kexec_buf *kbuf, 1317 int (*func)(struct resource *, void *)) 1318 { 1319 int ret = 0; 1320 int i; 1321 1322 for (i = 0; i < kho_scratch_cnt; i++) { 1323 struct resource res = { 1324 .start = kho_scratch[i].addr, 1325 .end = kho_scratch[i].addr + kho_scratch[i].size - 1, 1326 }; 1327 1328 /* Try to fit the kimage into our KHO scratch region */ 1329 ret = func(&res, kbuf); 1330 if (ret) 1331 break; 1332 } 1333 1334 return ret; 1335 } 1336 1337 int kho_locate_mem_hole(struct kexec_buf *kbuf, 1338 int (*func)(struct resource *, void *)) 1339 { 1340 int ret; 1341 1342 if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH) 1343 return 1; 1344 1345 ret = kho_walk_scratch(kbuf, func); 1346 1347 return ret == 1 ? 0 : -EADDRNOTAVAIL; 1348 } 1349