// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec_handover.c - kexec handover metadata processing
 * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
 * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
 * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
 */

#define pr_fmt(fmt) "KHO: " fmt

#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/debugfs.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/page-isolation.h>
#include <linux/vmalloc.h>

#include <asm/early_ioremap.h>

/*
 * KHO is tightly coupled with mm init and needs access to some of mm
 * internal APIs.
 */
#include "../mm/internal.h"
#include "kexec_internal.h"

#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
#define PROP_SUB_FDT "fdt"

#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */

/*
 * KHO uses page->private, which is an unsigned long, to store page metadata.
 * Use it to store both the magic and the order.
 */
union kho_page_info {
	unsigned long page_private;
	struct {
		unsigned int order;
		unsigned int magic;
	};
};

static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));

static bool kho_enable __ro_after_init;

bool kho_is_enabled(void)
{
	return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);

static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);

/*
 * Keep track of memory that is to be preserved across KHO.
 *
 * The serializing side uses two levels of xarrays to manage chunks of per-order
 * 512 byte bitmaps. For instance, if PAGE_SIZE = 4096, the entire 1G order of a
 * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
 * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
 * 512K of bitmap memory will be needed for order 0.
 *
 * This approach is fully incremental: as the serialization progresses, folios
 * can continue to be aggregated to the tracker. The final step, immediately
 * prior to kexec, serializes the xarray information into a linked list for the
 * successor kernel to parse.
 */
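/*
 * A worked example of the bitmap math above (illustrative only, assuming
 * PAGE_SIZE == 4096 so PRESERVE_BITS == 512 * 8 == 4096 bits per bitmap):
 *
 *	order 0:  one bit spans 4 KiB,  one bitmap spans 4096 * 4 KiB = 16 MiB
 *	order 18: one bit spans 1 GiB,  one bitmap spans 4096 * 1 GiB = 4 TiB
 *
 * A preserved range at physical address phys and order o is located via
 *
 *	pfn_high = (phys >> PAGE_SHIFT) >> o;
 *	bitmap   = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
 *	bit      = pfn_high % PRESERVE_BITS;
 *
 * which is the lookup performed by __kho_preserve_order() and
 * __kho_unpreserve() below.
 */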
#define PRESERVE_BITS (512 * 8)

struct kho_mem_phys_bits {
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit in a
	 * bitmap represents one page of this tracker's order.
	 */
	struct xarray phys_bits;
};

struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};

struct khoser_mem_chunk;

struct kho_serialization {
	struct page *fdt;
	struct list_head fdt_list;
	struct dentry *sub_fdt_dir;
	struct kho_mem_track track;
	/* First chunk of serialized preserved memory map */
	struct khoser_mem_chunk *preserved_mem_map;
};

struct kho_out {
	struct blocking_notifier_head chain_head;

	struct dentry *dir;

	struct mutex lock; /* protects KHO FDT finalization */

	struct kho_serialization ser;
	bool finalized;
};

static struct kho_out kho_out = {
	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.ser = {
		.fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
		.track = {
			.orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
		},
	},
	.finalized = false,
};

static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
{
	void *elm, *res;

	elm = xa_load(xa, index);
	if (elm)
		return elm;

	elm = kzalloc(sz, GFP_KERNEL);
	if (!elm)
		return ERR_PTR(-ENOMEM);

	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
	if (xa_is_err(res))
		res = ERR_PTR(xa_err(res));

	if (res) {
		kfree(elm);
		return res;
	}

	return elm;
}

static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
			     unsigned long end_pfn)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		const unsigned long pfn_high = pfn >> order;

		physxa = xa_load(&track->orders, order);
		if (!physxa)
			continue;

		bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
		if (!bits)
			continue;

		clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);

		pfn += 1 << order;
	}
}

static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
				unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa, *new_physxa;
	const unsigned long pfn_high = pfn >> order;

	might_sleep();

	if (kho_out.finalized)
		return -EBUSY;

	physxa = xa_load(&track->orders, order);
	if (!physxa) {
		int err;

		new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
		if (!new_physxa)
			return -ENOMEM;

		xa_init(&new_physxa->phys_bits);
		physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
				    GFP_KERNEL);

		err = xa_err(physxa);
		if (err || physxa) {
			xa_destroy(&new_physxa->phys_bits);
			kfree(new_physxa);

			if (err)
				return err;
		} else {
			physxa = new_physxa;
		}
	}

	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
				sizeof(*bits));
	if (IS_ERR(bits))
		return PTR_ERR(bits);

	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);

	return 0;
}
static struct page *kho_restore_page(phys_addr_t phys)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	union kho_page_info info;
	unsigned int nr_pages;

	if (!page)
		return NULL;

	info.page_private = page->private;
	/*
	 * deserialize_bitmap() only sets the magic on the head page. This magic
	 * check also implicitly makes sure phys is order-aligned since for
	 * non-order-aligned phys addresses, magic will never be set.
	 */
	if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER))
		return NULL;
	nr_pages = (1 << info.order);

	/* Clear private to make sure later restores on this page error out. */
	page->private = 0;
	/* Head page gets refcount of 1. */
	set_page_count(page, 1);

	/* For higher order folios, tail pages get a page count of zero. */
	for (unsigned int i = 1; i < nr_pages; i++)
		set_page_count(page + i, 0);

	if (info.order > 0)
		prep_compound_page(page, info.order);

	adjust_managed_page_count(page, nr_pages);
	return page;
}

/**
 * kho_restore_folio - recreates the folio from the preserved memory.
 * @phys: physical address of the folio.
 *
 * Return: pointer to the struct folio on success, NULL on failure.
 */
struct folio *kho_restore_folio(phys_addr_t phys)
{
	struct page *page = kho_restore_page(phys);

	return page ? page_folio(page) : NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_folio);

/**
 * kho_restore_pages - restore a contiguous range of order 0 pages.
 * @phys: physical address of the first page.
 * @nr_pages: number of pages.
 *
 * Restore a contiguous range of order 0 pages that was preserved with
 * kho_preserve_pages().
 *
 * Return: pointer to the first struct page on success, NULL on failure.
 */
struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
{
	const unsigned long start_pfn = PHYS_PFN(phys);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		struct page *page = kho_restore_page(PFN_PHYS(pfn));

		if (!page)
			return NULL;
		split_page(page, order);
		pfn += 1 << order;
	}

	return pfn_to_page(start_pfn);
}
EXPORT_SYMBOL_GPL(kho_restore_pages);

/* Serialize and deserialize struct kho_mem_phys across kexec
 *
 * Record all the bitmaps in a linked list of pages for the next kernel to
 * process. Each chunk holds bitmaps of the same order and each block of bitmaps
 * starts at a given physical address. This allows the bitmaps to be sparse. The
 * xarray is used to store them in a tree while building up the data structure,
 * but the KHO successor kernel only needs to process them once in order.
 *
 * All of this memory is normal kmalloc() memory and is not marked for
 * preservation. The successor kernel will remain isolated to the scratch space
 * until it completes processing this list. Once processed, all the memory
 * storing these ranges will be marked as free.
 */

struct khoser_mem_bitmap_ptr {
	phys_addr_t phys_start;
	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
};

struct khoser_mem_chunk_hdr {
	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
	unsigned int order;
	unsigned int num_elms;
};

#define KHOSER_BITMAP_SIZE \
	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
	 sizeof(struct khoser_mem_bitmap_ptr))

struct khoser_mem_chunk {
	struct khoser_mem_chunk_hdr hdr;
	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
};

static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
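/*
 * Back-of-the-envelope numbers for the chunk layout above (illustrative,
 * assuming 4 KiB pages and 8-byte phys_addr_t/KHOSER pointers): the header
 * takes 16 bytes and each khoser_mem_bitmap_ptr another 16 bytes, so
 * KHOSER_BITMAP_SIZE works out to (4096 - 16) / 16 = 255 bitmap pointers per
 * chunk. At order 0 each bitmap spans 16 MiB, so a single chunk page can
 * describe roughly 4 GiB worth of order 0 preservations before a new chunk
 * has to be chained via hdr.next.
 */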
static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
					  unsigned long order)
{
	struct khoser_mem_chunk *chunk;

	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!chunk)
		return NULL;
	chunk->hdr.order = order;
	if (cur_chunk)
		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
	return chunk;
}

static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
{
	struct khoser_mem_chunk *chunk = first_chunk;

	while (chunk) {
		struct khoser_mem_chunk *tmp = chunk;

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		kfree(tmp);
	}
}

static int kho_mem_serialize(struct kho_serialization *ser)
{
	struct khoser_mem_chunk *first_chunk = NULL;
	struct khoser_mem_chunk *chunk = NULL;
	struct kho_mem_phys *physxa;
	unsigned long order;

	xa_for_each(&ser->track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		chunk = new_chunk(chunk, order);
		if (!chunk)
			goto err_free;

		if (!first_chunk)
			first_chunk = chunk;

		xa_for_each(&physxa->phys_bits, phys, bits) {
			struct khoser_mem_bitmap_ptr *elm;

			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
				chunk = new_chunk(chunk, order);
				if (!chunk)
					goto err_free;
			}

			elm = &chunk->bitmaps[chunk->hdr.num_elms];
			chunk->hdr.num_elms++;
			elm->phys_start = (phys * PRESERVE_BITS)
					  << (order + PAGE_SHIFT);
			KHOSER_STORE_PTR(elm->bitmap, bits);
		}
	}

	ser->preserved_mem_map = first_chunk;

	return 0;

err_free:
	kho_mem_ser_free(first_chunk);
	return -ENOMEM;
}

static void __init deserialize_bitmap(unsigned int order,
				      struct khoser_mem_bitmap_ptr *elm)
{
	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
	unsigned long bit;

	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
		int sz = 1 << (order + PAGE_SHIFT);
		phys_addr_t phys =
			elm->phys_start + (bit << (order + PAGE_SHIFT));
		struct page *page = phys_to_page(phys);
		union kho_page_info info;

		memblock_reserve(phys, sz);
		memblock_reserved_mark_noinit(phys, sz);
		info.magic = KHO_PAGE_MAGIC;
		info.order = order;
		page->private = info.page_private;
	}
}
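/*
 * Example of the address reconstruction above (illustrative, again assuming
 * PAGE_SIZE == 4096): at order 0, a chunk element built from bitmap index 1
 * has
 *
 *	phys_start = (1 * PRESERVE_BITS) << (0 + PAGE_SHIFT) = 16 MiB
 *
 * and bit 3 set in that bitmap marks the page at
 *
 *	phys = 16 MiB + (3 << PAGE_SHIFT) = 16 MiB + 12 KiB
 *
 * i.e. the inverse of the pfn_high / PRESERVE_BITS split used on the
 * serializing side.
 */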
static void __init kho_mem_deserialize(const void *fdt)
{
	struct khoser_mem_chunk *chunk;
	const phys_addr_t *mem;
	int len;

	mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);

	if (!mem || len != sizeof(*mem)) {
		pr_err("failed to get preserved memory bitmaps\n");
		return;
	}

	chunk = *mem ? phys_to_virt(*mem) : NULL;
	while (chunk) {
		unsigned int i;

		for (i = 0; i != chunk->hdr.num_elms; i++)
			deserialize_bitmap(chunk->hdr.order,
					   &chunk->bitmaps[i]);
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
	}
}

/*
 * With KHO enabled, memory can become fragmented because KHO regions may
 * be anywhere in physical address space. The scratch regions give us safe
 * zones that will never contain KHO allocations. This is where we can later
 * safely load our new kexec images into and then use the scratch area for
 * early allocations that happen before the page allocator is initialized.
 */
static struct kho_scratch *kho_scratch;
static unsigned int kho_scratch_cnt;

/*
 * The scratch areas are scaled by default as a percent of memory allocated
 * from memblock. A user can override the scale with the command line
 * parameter:
 *
 * kho_scratch=N%
 *
 * It is also possible to explicitly define sizes for the lowmem, global and
 * per-node scratch areas:
 *
 * kho_scratch=l[KMG],n[KMG],m[KMG]
 *
 * The explicit size definition takes precedence over the scale definition.
 */
static unsigned int scratch_scale __initdata = 200;
static phys_addr_t scratch_size_global __initdata;
static phys_addr_t scratch_size_pernode __initdata;
static phys_addr_t scratch_size_lowmem __initdata;
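/*
 * Example command lines (illustrative, the sizes are arbitrary):
 *
 *	kho_scratch=30%			scale all scratch areas to 30% of the
 *					memory reserved from memblock
 *	kho_scratch=512M,2G,256M	512 MiB lowmem, 2 GiB global and
 *					256 MiB per-node scratch areas
 *
 * The second form sets scratch_scale to 0, i.e. explicit sizes disable
 * scaling, matching kho_parse_scratch_size() below.
 */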
static int __init kho_parse_scratch_size(char *p)
{
	size_t len;
	unsigned long sizes[3];
	size_t total_size = 0;
	int i;

	if (!p)
		return -EINVAL;

	len = strlen(p);
	if (!len)
		return -EINVAL;

	/* parse nn% */
	if (p[len - 1] == '%') {
		/* unsigned int max is 4,294,967,295, 10 chars */
		char s_scale[11] = {};
		int ret = 0;

		if (len > ARRAY_SIZE(s_scale))
			return -EINVAL;

		memcpy(s_scale, p, len - 1);
		ret = kstrtouint(s_scale, 10, &scratch_scale);
		if (!ret)
			pr_notice("scratch scale is %d%%\n", scratch_scale);
		return ret;
	}

	/* parse ll[KMG],mm[KMG],nn[KMG] */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		char *endp = p;

		if (i > 0) {
			if (*p != ',')
				return -EINVAL;
			p += 1;
		}

		sizes[i] = memparse(p, &endp);
		if (endp == p)
			return -EINVAL;
		p = endp;
		total_size += sizes[i];
	}

	if (!total_size)
		return -EINVAL;

	/* The string should be fully consumed by now. */
	if (*p)
		return -EINVAL;

	scratch_size_lowmem = sizes[0];
	scratch_size_global = sizes[1];
	scratch_size_pernode = sizes[2];
	scratch_scale = 0;

	pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lluMiB\n",
		  (u64)(scratch_size_lowmem >> 20),
		  (u64)(scratch_size_global >> 20),
		  (u64)(scratch_size_pernode >> 20));

	return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);

static void __init scratch_size_update(void)
{
	phys_addr_t size;

	if (!scratch_scale)
		return;

	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100;
	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100 - scratch_size_lowmem;
	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

static phys_addr_t __init scratch_size_node(int nid)
{
	phys_addr_t size;

	if (scratch_scale) {
		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
						   nid);
		size = size * scratch_scale / 100;
	} else {
		size = scratch_size_pernode;
	}

	return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

/**
 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
 *
 * With KHO we can preserve arbitrary pages in the system. To ensure we still
 * have a large contiguous region of memory when we search the physical address
 * space for target memory, let's make sure we always have a large CMA region
 * active. This CMA region will only be used for movable pages which are not a
 * problem for us during KHO because we can just move them somewhere else.
 */
static void __init kho_reserve_scratch(void)
{
	phys_addr_t addr, size;
	int nid, i = 0;

	if (!kho_enable)
		return;

	scratch_size_update();

	/* FIXME: deal with node hot-plug/remove */
	kho_scratch_cnt = num_online_nodes() + 2;
	size = kho_scratch_cnt * sizeof(*kho_scratch);
	kho_scratch = memblock_alloc(size, PAGE_SIZE);
	if (!kho_scratch)
		goto err_disable_kho;

	/*
	 * reserve scratch area in low memory for lowmem allocations in the
	 * next kernel
	 */
	size = scratch_size_lowmem;
	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
					 ARCH_LOW_ADDRESS_LIMIT);
	if (!addr)
		goto err_free_scratch_desc;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/* reserve large contiguous area for allocations without nid */
	size = scratch_size_global;
	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
	if (!addr)
		goto err_free_scratch_areas;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	for_each_online_node(nid) {
		size = scratch_size_node(nid);
		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
						0, MEMBLOCK_ALLOC_ACCESSIBLE,
						nid, true);
		if (!addr)
			goto err_free_scratch_areas;

		kho_scratch[i].addr = addr;
		kho_scratch[i].size = size;
		i++;
	}

	return;

err_free_scratch_areas:
	for (i--; i >= 0; i--)
		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
	pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
	kho_enable = false;
}

struct fdt_debugfs {
	struct list_head list;
	struct debugfs_blob_wrapper wrapper;
	struct dentry *file;
};

static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
			       const char *name, const void *fdt)
{
	struct fdt_debugfs *f;
	struct dentry *file;

	f = kmalloc(sizeof(*f), GFP_KERNEL);
	if (!f)
		return -ENOMEM;

	f->wrapper.data = (void *)fdt;
	f->wrapper.size = fdt_totalsize(fdt);

	file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
	if (IS_ERR(file)) {
		kfree(f);
		return PTR_ERR(file);
	}

	f->file = file;
	list_add(&f->list, list);

	return 0;
}

/**
 * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
 * @ser: serialization control object passed by KHO notifiers.
 * @name: name of the sub tree.
 * @fdt: the sub tree blob.
 *
 * Creates a new child node named @name in KHO root FDT and records
 * the physical address of @fdt. The pages of @fdt must also be preserved
 * by KHO for the new kernel to retrieve it after kexec.
 *
 * A debugfs blob entry is also created at
 * ``/sys/kernel/debug/kho/out/sub_fdts/@name``.
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
{
	int err = 0;
	u64 phys = (u64)virt_to_phys(fdt);
	void *root = page_to_virt(ser->fdt);

	err |= fdt_begin_node(root, name);
	err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
	err |= fdt_end_node(root);

	if (err)
		return err;

	return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
}
EXPORT_SYMBOL_GPL(kho_add_subtree);
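/*
 * Illustrative sketch of how a KHO user wires this up (not part of this
 * file; the "myko" names and the sub-FDT construction are hypothetical).
 * A subsystem registers a notifier and, on KEXEC_KHO_FINALIZE, hands its
 * own sub-FDT to kho_add_subtree():
 *
 *	static int myko_kho_notifier(struct notifier_block *nb,
 *				     unsigned long action, void *data)
 *	{
 *		struct kho_serialization *ser = data;
 *
 *		if (action != KEXEC_KHO_FINALIZE)
 *			return NOTIFY_DONE;
 *
 *		// myko_fdt points to preserved pages holding a flattened DT
 *		return notifier_from_errno(kho_add_subtree(ser, "myko",
 *							   myko_fdt));
 *	}
 *
 *	static struct notifier_block myko_kho_nb = {
 *		.notifier_call = myko_kho_notifier,
 *	};
 *
 * and at init time calls register_kho_notifier(&myko_kho_nb). The pages
 * backing myko_fdt must themselves be preserved (e.g. with
 * kho_preserve_folio()) so the successor kernel can read them back.
 */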
int register_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(register_kho_notifier);

int unregister_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(unregister_kho_notifier);

/**
 * kho_preserve_folio - preserve a folio across kexec.
 * @folio: folio to preserve.
 *
 * Instructs KHO to preserve the whole folio across kexec. The order
 * will be preserved as well.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.ser.track;

	return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);

/**
 * kho_preserve_pages - preserve contiguous pages across kexec
 * @page: first page in the list.
 * @nr_pages: number of pages.
 *
 * Preserve a contiguous list of order 0 pages. Must be restored using
 * kho_restore_pages() to ensure the pages are restored properly as order 0.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_pages(struct page *page, unsigned int nr_pages)
{
	struct kho_mem_track *track = &kho_out.ser.track;
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;
	unsigned long failed_pfn = 0;
	int err = 0;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		err = __kho_preserve_order(track, pfn, order);
		if (err) {
			failed_pfn = pfn;
			break;
		}

		pfn += 1 << order;
	}

	if (err)
		__kho_unpreserve(track, start_pfn, failed_pfn);

	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_pages);
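/*
 * Illustrative usage (hypothetical "myko" driver state, not part of this
 * file): to carry an order-2 folio across kexec, preserve it before
 * finalization and publish its physical address where the successor kernel
 * can find it, typically in a sub-FDT added via kho_add_subtree():
 *
 *	struct folio *folio = folio_alloc(GFP_KERNEL, 2);
 *	int err;
 *
 *	if (!folio)
 *		return -ENOMEM;
 *	err = kho_preserve_folio(folio);
 *	if (err)
 *		return err;
 *	myko_state_phys = virt_to_phys(folio_address(folio));
 *
 * After kexec, kho_restore_folio(myko_state_phys) hands the same order-2
 * folio back to the new kernel. Contiguous order 0 ranges work the same way
 * through kho_preserve_pages()/kho_restore_pages().
 */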
struct kho_vmalloc_hdr {
	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
};

#define KHO_VMALLOC_SIZE \
	((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
	 sizeof(phys_addr_t))

struct kho_vmalloc_chunk {
	struct kho_vmalloc_hdr hdr;
	phys_addr_t phys[KHO_VMALLOC_SIZE];
};

static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);

/* vmalloc flags KHO supports */
#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP)

/* KHO internal flags for vmalloc preservations */
#define KHO_VMALLOC_ALLOC	0x0001
#define KHO_VMALLOC_HUGE_VMAP	0x0002

static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
{
	unsigned short kho_flags = 0;

	if (vm_flags & VM_ALLOC)
		kho_flags |= KHO_VMALLOC_ALLOC;
	if (vm_flags & VM_ALLOW_HUGE_VMAP)
		kho_flags |= KHO_VMALLOC_HUGE_VMAP;

	return kho_flags;
}

static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
{
	unsigned int vm_flags = 0;

	if (kho_flags & KHO_VMALLOC_ALLOC)
		vm_flags |= VM_ALLOC;
	if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
		vm_flags |= VM_ALLOW_HUGE_VMAP;

	return vm_flags;
}

static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
{
	struct kho_vmalloc_chunk *chunk;
	int err;

	chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
	if (!chunk)
		return NULL;

	err = kho_preserve_pages(virt_to_page(chunk), 1);
	if (err)
		goto err_free;
	if (cur)
		KHOSER_STORE_PTR(cur->hdr.next, chunk);
	return chunk;

err_free:
	free_page((unsigned long)chunk);
	return NULL;
}

static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk)
{
	struct kho_mem_track *track = &kho_out.ser.track;
	unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));

	__kho_unpreserve(track, pfn, pfn + 1);

	for (int i = 0; chunk->phys[i]; i++) {
		pfn = PHYS_PFN(chunk->phys[i]);
		__kho_unpreserve(track, pfn, pfn + 1);
	}
}

static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
{
	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first);

	while (chunk) {
		struct kho_vmalloc_chunk *tmp = chunk;

		kho_vmalloc_unpreserve_chunk(chunk);

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		free_page((unsigned long)tmp);
	}
}

/**
 * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
 * @ptr: pointer to the area in vmalloc address space
 * @preservation: placeholder for preservation metadata
 *
 * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
 * physical pages mapped at @ptr will be preserved and on successful return
 * @preservation will hold the physical address of a structure that describes
 * the preservation.
 *
 * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
 * restored on the same node.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk;
	struct vm_struct *vm = find_vm_area(ptr);
	unsigned int order, flags, nr_contig_pages;
	unsigned int idx = 0;
	int err;

	if (!vm)
		return -EINVAL;

	if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return -EOPNOTSUPP;

	flags = vmalloc_flags_to_kho(vm->flags);
	order = get_vm_area_page_order(vm);

	chunk = new_vmalloc_chunk(NULL);
	if (!chunk)
		return -ENOMEM;
	KHOSER_STORE_PTR(preservation->first, chunk);

	nr_contig_pages = (1 << order);
	for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
		phys_addr_t phys = page_to_phys(vm->pages[i]);

		err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
		if (err)
			goto err_free;

		chunk->phys[idx++] = phys;
		if (idx == ARRAY_SIZE(chunk->phys)) {
			chunk = new_vmalloc_chunk(chunk);
			if (!chunk) {
				err = -ENOMEM;
				goto err_free;
			}
			idx = 0;
		}
	}

	preservation->total_pages = vm->nr_pages;
	preservation->flags = flags;
	preservation->order = order;

	return 0;

err_free:
	kho_vmalloc_free_chunks(preservation);
	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);

/**
 * kho_restore_vmalloc - recreates and populates an area in vmalloc address
 * space from the preserved memory.
 * @preservation: preservation metadata.
 *
 * Recreates an area in vmalloc address space and populates it with memory that
 * was preserved using kho_preserve_vmalloc().
 *
 * Return: pointer to the area in the vmalloc address space, NULL on failure.
 */
void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
	unsigned int align, order, shift, vm_flags;
	unsigned long total_pages, contig_pages;
	unsigned long addr, size;
	struct vm_struct *area;
	struct page **pages;
	unsigned int idx = 0;
	int err;

	vm_flags = kho_flags_to_vmalloc(preservation->flags);
	if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return NULL;

	total_pages = preservation->total_pages;
	pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;
	order = preservation->order;
	contig_pages = (1 << order);
	shift = PAGE_SHIFT + order;
	align = 1 << shift;

	while (chunk) {
		struct page *page;

		for (int i = 0; chunk->phys[i]; i++) {
			phys_addr_t phys = chunk->phys[i];

			if (idx + contig_pages > total_pages)
				goto err_free_pages_array;

			page = kho_restore_pages(phys, contig_pages);
			if (!page)
				goto err_free_pages_array;

			for (int j = 0; j < contig_pages; j++)
				pages[idx++] = page + j;

			phys += contig_pages * PAGE_SIZE;
		}

		page = kho_restore_pages(virt_to_phys(chunk), 1);
		if (!page)
			goto err_free_pages_array;
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		__free_page(page);
	}

	if (idx != total_pages)
		goto err_free_pages_array;

	area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
				  vm_flags, VMALLOC_START, VMALLOC_END,
				  NUMA_NO_NODE, GFP_KERNEL,
				  __builtin_return_address(0));
	if (!area)
		goto err_free_pages_array;

	addr = (unsigned long)area->addr;
	size = get_vm_area_size(area);
	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
	if (err)
		goto err_free_vm_area;

	area->nr_pages = total_pages;
	area->pages = pages;

	return area->addr;

err_free_vm_area:
	free_vm_area(area);
err_free_pages_array:
	kvfree(pages);
	return NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
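/*
 * Illustrative round trip (hypothetical names, not part of this file):
 *
 *	// outgoing kernel
 *	struct kho_vmalloc myko_pres;
 *	void *buf = vmalloc(SZ_1M);
 *
 *	err = kho_preserve_vmalloc(buf, &myko_pres);
 *
 * The struct kho_vmalloc descriptor is plain data; it is typically copied
 * into preserved memory or a sub-FDT so the successor kernel can read it
 * back and then call
 *
 *	// incoming kernel
 *	void *buf = kho_restore_vmalloc(&myko_pres);
 *
 * which recreates a vmalloc area backed by the preserved pages.
 */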
/* Handling for debug/kho/out */

static struct dentry *debugfs_root;

static int kho_out_update_debugfs_fdt(void)
{
	int err = 0;
	struct fdt_debugfs *ff, *tmp;

	if (kho_out.finalized) {
		err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
					  "fdt", page_to_virt(kho_out.ser.fdt));
	} else {
		list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
			debugfs_remove(ff->file);
			list_del(&ff->list);
			kfree(ff);
		}
	}

	return err;
}

static int kho_abort(void)
{
	int err;
	unsigned long order;
	struct kho_mem_phys *physxa;

	xa_for_each(&kho_out.ser.track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		xa_for_each(&physxa->phys_bits, phys, bits)
			kfree(bits);

		xa_destroy(&physxa->phys_bits);
		kfree(physxa);
	}
	xa_destroy(&kho_out.ser.track.orders);

	if (kho_out.ser.preserved_mem_map) {
		kho_mem_ser_free(kho_out.ser.preserved_mem_map);
		kho_out.ser.preserved_mem_map = NULL;
	}

	err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
					   NULL);
	err = notifier_to_errno(err);

	if (err)
		pr_err("Failed to abort KHO finalization: %d\n", err);

	return err;
}

static int kho_finalize(void)
{
	int err = 0;
	u64 *preserved_mem_map;
	void *fdt = page_to_virt(kho_out.ser.fdt);

	err |= fdt_create(fdt, PAGE_SIZE);
	err |= fdt_finish_reservemap(fdt);
	err |= fdt_begin_node(fdt, "");
	err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
	/*
	 * Reserve the preserved-memory-map property in the root FDT, so
	 * that all property definitions will precede subnodes created by
	 * KHO callers.
	 */
	err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
					sizeof(*preserved_mem_map),
					(void **)&preserved_mem_map);
	if (err)
		goto abort;

	err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
	if (err)
		goto abort;

	err = blocking_notifier_call_chain(&kho_out.chain_head,
					   KEXEC_KHO_FINALIZE, &kho_out.ser);
	err = notifier_to_errno(err);
	if (err)
		goto abort;

	err = kho_mem_serialize(&kho_out.ser);
	if (err)
		goto abort;

	*preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);

	err |= fdt_end_node(fdt);
	err |= fdt_finish(fdt);

abort:
	if (err) {
		pr_err("Failed to convert KHO state tree: %d\n", err);
		kho_abort();
	}

	return err;
}

static int kho_out_finalize_get(void *data, u64 *val)
{
	mutex_lock(&kho_out.lock);
	*val = kho_out.finalized;
	mutex_unlock(&kho_out.lock);

	return 0;
}

static int kho_out_finalize_set(void *data, u64 _val)
{
	int ret = 0;
	bool val = !!_val;

	mutex_lock(&kho_out.lock);

	if (val == kho_out.finalized) {
		if (kho_out.finalized)
			ret = -EEXIST;
		else
			ret = -ENOENT;
		goto unlock;
	}

	if (val)
		ret = kho_finalize();
	else
		ret = kho_abort();

	if (ret)
		goto unlock;

	kho_out.finalized = val;
	ret = kho_out_update_debugfs_fdt();

unlock:
	mutex_unlock(&kho_out.lock);
	return ret;
}

DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
			 kho_out_finalize_set, "%llu\n");

static int scratch_phys_show(struct seq_file *m, void *v)
{
	for (int i = 0; i < kho_scratch_cnt; i++)
		seq_printf(m, "0x%llx\n", kho_scratch[i].addr);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_phys);

static int scratch_len_show(struct seq_file *m, void *v)
{
	for (int i = 0; i < kho_scratch_cnt; i++)
		seq_printf(m, "0x%llx\n", kho_scratch[i].size);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_len);

static __init int kho_out_debugfs_init(void)
{
	struct dentry *dir, *f, *sub_fdt_dir;

	dir = debugfs_create_dir("out", debugfs_root);
	if (IS_ERR(dir))
		return -ENOMEM;

	sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
	if (IS_ERR(sub_fdt_dir))
		goto err_rmdir;

	f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
				&scratch_phys_fops);
	if (IS_ERR(f))
		goto err_rmdir;

	f = debugfs_create_file("scratch_len", 0400, dir, NULL,
				&scratch_len_fops);
	if (IS_ERR(f))
		goto err_rmdir;

	f = debugfs_create_file("finalize", 0600, dir, NULL,
				&fops_kho_out_finalize);
	if (IS_ERR(f))
		goto err_rmdir;

	kho_out.dir = dir;
	kho_out.ser.sub_fdt_dir = sub_fdt_dir;
	return 0;

err_rmdir:
	debugfs_remove_recursive(dir);
	return -ENOENT;
}
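/*
 * Illustrative workflow for the debugfs files created above (assuming
 * debugfs is mounted at /sys/kernel/debug):
 *
 *	echo 1 > /sys/kernel/debug/kho/out/finalize	# serialize KHO state
 *	# load the next kernel, e.g. via kexec_file_load
 *	echo 0 > /sys/kernel/debug/kho/out/finalize	# or abort again
 *
 * While finalized, the root FDT is exposed as /sys/kernel/debug/kho/out/fdt
 * and new preservations are rejected with -EBUSY.
 */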
struct kho_in {
	struct dentry *dir;
	phys_addr_t fdt_phys;
	phys_addr_t scratch_phys;
	struct list_head fdt_list;
};

static struct kho_in kho_in = {
	.fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
};

static const void *kho_get_fdt(void)
{
	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
}

/**
 * is_kho_boot - check if current kernel was booted via KHO-enabled
 * kexec
 *
 * This function checks if the current kernel was loaded through a kexec
 * operation with KHO enabled, by verifying that a valid KHO FDT
 * was passed.
 *
 * Note: This function returns reliable results only after
 * kho_populate() has been called during early boot. Before that, it may
 * return false even if KHO data is present.
 *
 * Return: true if booted via KHO-enabled kexec, false otherwise
 */
bool is_kho_boot(void)
{
	return !!kho_get_fdt();
}
EXPORT_SYMBOL_GPL(is_kho_boot);

/**
 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
 * @name: the name of the sub FDT passed to kho_add_subtree().
 * @phys: if found, the physical address of the sub FDT is stored in @phys.
 *
 * Retrieve a preserved sub FDT named @name and store its physical
 * address in @phys.
 *
 * Return: 0 on success, error code on failure
 */
int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
{
	const void *fdt = kho_get_fdt();
	const u64 *val;
	int offset, len;

	if (!fdt)
		return -ENOENT;

	if (!phys)
		return -EINVAL;

	offset = fdt_subnode_offset(fdt, 0, name);
	if (offset < 0)
		return -ENOENT;

	val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
	if (!val || len != sizeof(*val))
		return -EINVAL;

	*phys = (phys_addr_t)*val;

	return 0;
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
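/*
 * Illustrative successor-side lookup (hypothetical "myko" node, not part of
 * this file), the counterpart of kho_add_subtree() in the previous kernel:
 *
 *	phys_addr_t fdt_phys;
 *	void *fdt;
 *
 *	if (kho_retrieve_subtree("myko", &fdt_phys))
 *		return -ENOENT;
 *	fdt = phys_to_virt(fdt_phys);
 *	// parse the sub-FDT, then kho_restore_folio()/kho_restore_pages()
 *	// the memory it describes
 */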
/* Handling for debugfs/kho/in */

static __init int kho_in_debugfs_init(const void *fdt)
{
	struct dentry *sub_fdt_dir;
	int err, child;

	kho_in.dir = debugfs_create_dir("in", debugfs_root);
	if (IS_ERR(kho_in.dir))
		return PTR_ERR(kho_in.dir);

	sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
	if (IS_ERR(sub_fdt_dir)) {
		err = PTR_ERR(sub_fdt_dir);
		goto err_rmdir;
	}

	err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
	if (err)
		goto err_rmdir;

	fdt_for_each_subnode(child, fdt, 0) {
		int len = 0;
		const char *name = fdt_get_name(fdt, child, NULL);
		const u64 *fdt_phys;

		fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
		if (!fdt_phys)
			continue;
		if (len != sizeof(*fdt_phys)) {
			pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
				name, len);
			continue;
		}
		err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
					  phys_to_virt(*fdt_phys));
		if (err) {
			pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
				err);
			continue;
		}
	}

	return 0;

err_rmdir:
	debugfs_remove_recursive(kho_in.dir);
	return err;
}

static __init int kho_init(void)
{
	int err = 0;
	const void *fdt = kho_get_fdt();

	if (!kho_enable)
		return 0;

	kho_out.ser.fdt = alloc_page(GFP_KERNEL);
	if (!kho_out.ser.fdt) {
		err = -ENOMEM;
		goto err_free_scratch;
	}

	debugfs_root = debugfs_create_dir("kho", NULL);
	if (IS_ERR(debugfs_root)) {
		err = -ENOENT;
		goto err_free_fdt;
	}

	err = kho_out_debugfs_init();
	if (err)
		goto err_free_fdt;

	if (fdt) {
		err = kho_in_debugfs_init(fdt);
		/*
		 * Failure to create /sys/kernel/debug/kho/in does not prevent
		 * reviving state from KHO and setting up KHO for the next
		 * kexec.
		 */
		if (err)
			pr_err("failed exposing handover FDT in debugfs: %d\n",
			       err);

		return 0;
	}

	for (int i = 0; i < kho_scratch_cnt; i++) {
		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
		unsigned long pfn;

		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}

	return 0;

err_free_fdt:
	put_page(kho_out.ser.fdt);
	kho_out.ser.fdt = NULL;
err_free_scratch:
	for (int i = 0; i < kho_scratch_cnt; i++) {
		void *start = __va(kho_scratch[i].addr);
		void *end = start + kho_scratch[i].size;

		free_reserved_area(start, end, -1, "");
	}
	kho_enable = false;
	return err;
}
late_initcall(kho_init);

static void __init kho_release_scratch(void)
{
	phys_addr_t start, end;
	u64 i;

	memmap_init_kho_scratch_pages();

	/*
	 * Mark scratch mem as CMA before we return it. That way we
	 * ensure that no kernel allocations happen on it. That means
	 * we can reuse it as scratch memory again later.
	 */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
		ulong end_pfn = pageblock_align(PFN_UP(end));
		ulong pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
			init_pageblock_migratetype(pfn_to_page(pfn),
						   MIGRATE_CMA, false);
	}
}

void __init kho_memory_init(void)
{
	struct folio *folio;

	if (kho_in.scratch_phys) {
		kho_scratch = phys_to_virt(kho_in.scratch_phys);
		kho_release_scratch();

		kho_mem_deserialize(kho_get_fdt());
		folio = kho_restore_folio(kho_in.fdt_phys);
		if (!folio)
			pr_warn("failed to restore folio for KHO fdt\n");
	} else {
		kho_reserve_scratch();
	}
}
void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
			 phys_addr_t scratch_phys, u64 scratch_len)
{
	void *fdt = NULL;
	struct kho_scratch *scratch = NULL;
	int err = 0;
	unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);

	/* Validate the input FDT */
	fdt = early_memremap(fdt_phys, fdt_len);
	if (!fdt) {
		pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
		err = -EFAULT;
		goto out;
	}
	err = fdt_check_header(fdt);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
			fdt_phys, err);
		err = -EINVAL;
		goto out;
	}
	err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
			fdt_phys, KHO_FDT_COMPATIBLE, err);
		err = -EINVAL;
		goto out;
	}

	scratch = early_memremap(scratch_phys, scratch_len);
	if (!scratch) {
		pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
			scratch_phys, scratch_len);
		err = -EFAULT;
		goto out;
	}

	/*
	 * The previous kernel passed us safe contiguous blocks of memory to
	 * use for early boot purposes so that we can resize the memblock
	 * array as needed.
	 */
	for (int i = 0; i < scratch_cnt; i++) {
		struct kho_scratch *area = &scratch[i];
		u64 size = area->size;

		memblock_add(area->addr, size);
		err = memblock_mark_kho_scratch(area->addr, size);
		if (WARN_ON(err)) {
			pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d",
				&area->addr, &size, err);
			goto out;
		}
		pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
	}

	memblock_reserve(scratch_phys, scratch_len);

	/*
	 * Now that we have a viable region of scratch memory, let's tell
	 * the memblock allocator to only use that for any allocations.
	 * That way we ensure that nothing scribbles over in-use data while
	 * we initialize the page tables, which we will need to ingest all
	 * memory reservations from the previous kernel.
	 */
	memblock_set_kho_scratch_only();

	kho_in.fdt_phys = fdt_phys;
	kho_in.scratch_phys = scratch_phys;
	kho_scratch_cnt = scratch_cnt;
	pr_info("found kexec handover data. Will skip init for some devices\n");

out:
	if (fdt)
		early_memunmap(fdt, fdt_len);
	if (scratch)
		early_memunmap(scratch, scratch_len);
	if (err)
		pr_warn("disabling KHO revival: %d\n", err);
}

/* Helper functions for kexec_file_load */

int kho_fill_kimage(struct kimage *image)
{
	ssize_t scratch_size;
	int err = 0;
	struct kexec_buf scratch;

	if (!kho_out.finalized)
		return 0;

	image->kho.fdt = page_to_phys(kho_out.ser.fdt);

	scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
	scratch = (struct kexec_buf){
		.image = image,
		.buffer = kho_scratch,
		.bufsz = scratch_size,
		.mem = KEXEC_BUF_MEM_UNKNOWN,
		.memsz = scratch_size,
		.buf_align = SZ_64K, /* Makes it easier to map */
		.buf_max = ULONG_MAX,
		.top_down = true,
	};
	err = kexec_add_buffer(&scratch);
	if (err)
		return err;
	image->kho.scratch = &image->segment[image->nr_segments - 1];

	return 0;
}

static int kho_walk_scratch(struct kexec_buf *kbuf,
			    int (*func)(struct resource *, void *))
{
	int ret = 0;
	int i;

	for (i = 0; i < kho_scratch_cnt; i++) {
		struct resource res = {
			.start = kho_scratch[i].addr,
			.end = kho_scratch[i].addr + kho_scratch[i].size - 1,
		};

		/* Try to fit the kimage into our KHO scratch region */
		ret = func(&res, kbuf);
		if (ret)
			break;
	}

	return ret;
}

int kho_locate_mem_hole(struct kexec_buf *kbuf,
			int (*func)(struct resource *, void *))
{
	int ret;

	if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
		return 1;

	ret = kho_walk_scratch(kbuf, func);

	return ret == 1 ? 0 : -EADDRNOTAVAIL;
}