// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec_handover.c - kexec handover metadata processing
 * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
 * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
 * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
 * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
 */

#define pr_fmt(fmt) "KHO: " fmt

#include <linux/cleanup.h>
#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/page-isolation.h>
#include <linux/unaligned.h>
#include <linux/vmalloc.h>

#include <asm/early_ioremap.h>

/*
 * KHO is tightly coupled with mm init and needs access to some of mm
 * internal APIs.
 */
#include "../../mm/internal.h"
#include "../kexec_internal.h"
#include "kexec_handover_internal.h"

#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
#define PROP_SUB_FDT "fdt"

#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */

/*
 * KHO uses page->private, which is an unsigned long, to store page metadata.
 * Use it to store both the magic and the order.
 */
union kho_page_info {
	unsigned long page_private;
	struct {
		unsigned int order;
		unsigned int magic;
	};
};

static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));

static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT);

bool kho_is_enabled(void)
{
	return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);

static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);

/*
 * Keep track of memory that is to be preserved across KHO.
 *
 * The serializing side uses two levels of xarrays to manage chunks of per-order
 * PAGE_SIZE byte bitmaps. For instance, if PAGE_SIZE = 4096, the entire 1G order
 * of an 8TB system would fit inside a single 4096 byte bitmap. For order 0
 * allocations each bitmap will cover 128M of address space. Thus, for 16G of
 * memory at most 512K of bitmap memory will be needed for order 0.
 *
 * This approach is fully incremental: as the serialization progresses, folios
 * can continue to be aggregated to the tracker. The final step, immediately
 * prior to kexec, serializes the xarray information into a linked list for the
 * successor kernel to parse.
 */

#define PRESERVE_BITS (PAGE_SIZE * 8)

struct kho_mem_phys_bits {
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE);

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
	 * to order.
	 */
	struct xarray phys_bits;
};

struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};
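/*
 * Worked example of the tracking scheme above (illustrative numbers, assuming
 * 4K pages): an order-9 folio at physical address 0x240000000 has
 * pfn = 0x240000, so pfn_high = pfn >> 9 = 0x1200. Its bitmap lives at index
 * pfn_high / PRESERVE_BITS = 0 in kho_mem_phys.phys_bits, and the folio is
 * recorded by setting bit pfn_high % PRESERVE_BITS = 0x1200 in that bitmap.
 * This is the arithmetic used by __kho_preserve_order() and
 * __kho_unpreserve_order() below.
 */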

struct khoser_mem_chunk;

struct kho_out {
	void *fdt;
	bool finalized;
	struct mutex lock; /* protects KHO FDT finalization */

	struct kho_mem_track track;
	struct kho_debugfs dbg;
};

static struct kho_out kho_out = {
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.track = {
		.orders = XARRAY_INIT(kho_out.track.orders, 0),
	},
	.finalized = false,
};

static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
{
	void *res = xa_load(xa, index);

	if (res)
		return res;

	void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);

	if (!elm)
		return ERR_PTR(-ENOMEM);

	if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
		return ERR_PTR(-EINVAL);

	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
	if (xa_is_err(res))
		return ERR_PTR(xa_err(res));
	else if (res)
		return res;

	return no_free_ptr(elm);
}

static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
				   unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;
	const unsigned long pfn_high = pfn >> order;

	physxa = xa_load(&track->orders, order);
	if (WARN_ON_ONCE(!physxa))
		return;

	bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
	if (WARN_ON_ONCE(!bits))
		return;

	clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
}

static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
			     unsigned long end_pfn)
{
	unsigned int order;

	while (pfn < end_pfn) {
		order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		__kho_unpreserve_order(track, pfn, order);

		pfn += 1 << order;
	}
}

static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
				unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa, *new_physxa;
	const unsigned long pfn_high = pfn >> order;

	might_sleep();
	physxa = xa_load(&track->orders, order);
	if (!physxa) {
		int err;

		new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
		if (!new_physxa)
			return -ENOMEM;

		xa_init(&new_physxa->phys_bits);
		physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
				    GFP_KERNEL);

		err = xa_err(physxa);
		if (err || physxa) {
			xa_destroy(&new_physxa->phys_bits);
			kfree(new_physxa);

			if (err)
				return err;
		} else {
			physxa = new_physxa;
		}
	}

	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
	if (IS_ERR(bits))
		return PTR_ERR(bits);

	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);

	return 0;
}

static struct page *kho_restore_page(phys_addr_t phys)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	union kho_page_info info;
	unsigned int nr_pages;

	if (!page)
		return NULL;

	info.page_private = page->private;
	/*
	 * deserialize_bitmap() only sets the magic on the head page. This magic
	 * check also implicitly makes sure phys is order-aligned since for
	 * non-order-aligned phys addresses, magic will never be set.
	 */
	if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER))
		return NULL;
	nr_pages = (1 << info.order);

	/* Clear private to make sure later restores on this page error out. */
	page->private = 0;
	/* Head page gets refcount of 1. */
	set_page_count(page, 1);

	/* For higher order folios, tail pages get a page count of zero. */
	for (unsigned int i = 1; i < nr_pages; i++)
		set_page_count(page + i, 0);

	if (info.order > 0)
		prep_compound_page(page, info.order);

	adjust_managed_page_count(page, nr_pages);
	return page;
}

/**
 * kho_restore_folio - recreates the folio from the preserved memory.
 * @phys: physical address of the folio.
 *
 * Return: pointer to the struct folio on success, NULL on failure.
 */
struct folio *kho_restore_folio(phys_addr_t phys)
{
	struct page *page = kho_restore_page(phys);

	return page ? page_folio(page) : NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_folio);

/**
 * kho_restore_pages - restore a list of contiguous order 0 pages.
 * @phys: physical address of the first page.
 * @nr_pages: number of pages.
 *
 * Restore a contiguous list of order 0 pages that was preserved with
 * kho_preserve_pages().
 *
 * Return: pointer to the first struct page on success, NULL on failure.
 */
struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
{
	const unsigned long start_pfn = PHYS_PFN(phys);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		struct page *page = kho_restore_page(PFN_PHYS(pfn));

		if (!page)
			return NULL;
		split_page(page, order);
		pfn += 1 << order;
	}

	return pfn_to_page(start_pfn);
}
EXPORT_SYMBOL_GPL(kho_restore_pages);

/* Serialize and deserialize struct kho_mem_phys across kexec
 *
 * Record all the bitmaps in a linked list of pages for the next kernel to
 * process. Each chunk holds bitmaps of the same order and each block of bitmaps
 * starts at a given physical address. This allows the bitmaps to be sparse. The
 * xarray is used to store them in a tree while building up the data structure,
 * but the KHO successor kernel only needs to process them once in order.
 *
 * All of this memory is normal kmalloc() memory and is not marked for
 * preservation. The successor kernel will remain isolated to the scratch space
 * until it completes processing this list. Once processed, all the memory
 * storing these ranges will be marked as free.
 */

struct khoser_mem_bitmap_ptr {
	phys_addr_t phys_start;
	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
};

struct khoser_mem_chunk_hdr {
	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
	unsigned int order;
	unsigned int num_elms;
};

#define KHOSER_BITMAP_SIZE \
	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
	 sizeof(struct khoser_mem_bitmap_ptr))

struct khoser_mem_chunk {
	struct khoser_mem_chunk_hdr hdr;
	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
};

static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
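/*
 * Worked example (illustrative, assumes 4K pages): for an order-0 bitmap
 * stored at xarray index 1, kho_mem_serialize() below records
 * elm->phys_start = (1 * PRESERVE_BITS) << (0 + PAGE_SHIFT) = 0x8000000,
 * i.e. the bitmap describes the 128M of physical address space starting at
 * 128M. Bit N in that bitmap then marks the order-0 page at
 * 0x8000000 + (N << PAGE_SHIFT), which is exactly how deserialize_bitmap()
 * reconstructs the address in the successor kernel.
 */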

static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
					  unsigned long order)
{
	struct khoser_mem_chunk *chunk __free(free_page) = NULL;

	chunk = (void *)get_zeroed_page(GFP_KERNEL);
	if (!chunk)
		return ERR_PTR(-ENOMEM);

	if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE)))
		return ERR_PTR(-EINVAL);

	chunk->hdr.order = order;
	if (cur_chunk)
		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
	return no_free_ptr(chunk);
}

static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
{
	struct khoser_mem_chunk *chunk = first_chunk;

	while (chunk) {
		struct khoser_mem_chunk *tmp = chunk;

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		free_page((unsigned long)tmp);
	}
}

/*
 * Update memory map property, if old one is found discard it via
 * kho_mem_ser_free().
 */
static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
{
	void *ptr;
	u64 phys;

	ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL);

	/* Check and discard previous memory map */
	phys = get_unaligned((u64 *)ptr);
	if (phys)
		kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys));

	/* Update with the new value */
	phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0;
	put_unaligned(phys, (u64 *)ptr);
}

static int kho_mem_serialize(struct kho_out *kho_out)
{
	struct khoser_mem_chunk *first_chunk = NULL;
	struct khoser_mem_chunk *chunk = NULL;
	struct kho_mem_phys *physxa;
	unsigned long order;
	int err = -ENOMEM;

	xa_for_each(&kho_out->track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		chunk = new_chunk(chunk, order);
		if (IS_ERR(chunk)) {
			err = PTR_ERR(chunk);
			goto err_free;
		}

		if (!first_chunk)
			first_chunk = chunk;

		xa_for_each(&physxa->phys_bits, phys, bits) {
			struct khoser_mem_bitmap_ptr *elm;

			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
				chunk = new_chunk(chunk, order);
				if (IS_ERR(chunk)) {
					err = PTR_ERR(chunk);
					goto err_free;
				}
			}

			elm = &chunk->bitmaps[chunk->hdr.num_elms];
			chunk->hdr.num_elms++;
			elm->phys_start = (phys * PRESERVE_BITS)
					  << (order + PAGE_SHIFT);
			KHOSER_STORE_PTR(elm->bitmap, bits);
		}
	}

	kho_update_memory_map(first_chunk);

	return 0;

err_free:
	kho_mem_ser_free(first_chunk);
	return err;
}

static void __init deserialize_bitmap(unsigned int order,
				      struct khoser_mem_bitmap_ptr *elm)
{
	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
	unsigned long bit;

	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
		int sz = 1 << (order + PAGE_SHIFT);
		phys_addr_t phys =
			elm->phys_start + (bit << (order + PAGE_SHIFT));
		struct page *page = phys_to_page(phys);
		union kho_page_info info;

		memblock_reserve(phys, sz);
		memblock_reserved_mark_noinit(phys, sz);
		info.magic = KHO_PAGE_MAGIC;
		info.order = order;
		page->private = info.page_private;
	}
}

/* Return true if memory was deserialized */
static bool __init kho_mem_deserialize(const void *fdt)
{
	struct khoser_mem_chunk *chunk;
	const void *mem_ptr;
	u64 mem;
	int len;

	mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
	if (!mem_ptr || len != sizeof(u64)) {
		pr_err("failed to get preserved memory bitmaps\n");
		return false;
	}

	mem = get_unaligned((const u64 *)mem_ptr);
	chunk = mem ? phys_to_virt(mem) : NULL;

	/* No preserved physical pages were passed, no deserialization */
	if (!chunk)
		return false;

	while (chunk) {
		unsigned int i;

		for (i = 0; i != chunk->hdr.num_elms; i++)
			deserialize_bitmap(chunk->hdr.order,
					   &chunk->bitmaps[i]);
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
	}

	return true;
}

/*
 * With KHO enabled, memory can become fragmented because KHO regions may
 * be anywhere in physical address space. The scratch regions give us safe
 * zones that we will never see KHO allocations from. This is where we
 * can later safely load our new kexec images and then use the scratch
 * area for early allocations that happen before the page allocator is
 * initialized.
 */
struct kho_scratch *kho_scratch;
unsigned int kho_scratch_cnt;

/*
 * The scratch areas are scaled by default as percent of memory allocated from
 * memblock. A user can override the scale with command line parameter:
 *
 * kho_scratch=N%
 *
 * It is also possible to explicitly define sizes for the lowmem, global and
 * per-node scratch areas:
 *
 * kho_scratch=l[KMG],n[KMG],m[KMG]
 *
 * The explicit size definition takes precedence over the scale definition.
 */
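/*
 * Example command lines accepted by kho_parse_scratch_size() below
 * (illustrative values):
 *
 *	kho_scratch=250%	scale all scratch areas to 250% of the memory
 *				allocated from memblock (the built-in default
 *				scale is 200%)
 *	kho_scratch=256M,1G,512M
 *				256M lowmem, 1G global and 512M per-node
 *				scratch areas; an explicit size list disables
 *				the percentage scaling
 */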
static unsigned int scratch_scale __initdata = 200;
static phys_addr_t scratch_size_global __initdata;
static phys_addr_t scratch_size_pernode __initdata;
static phys_addr_t scratch_size_lowmem __initdata;

static int __init kho_parse_scratch_size(char *p)
{
	size_t len;
	unsigned long sizes[3];
	size_t total_size = 0;
	int i;

	if (!p)
		return -EINVAL;

	len = strlen(p);
	if (!len)
		return -EINVAL;

	/* parse nn% */
	if (p[len - 1] == '%') {
		/* unsigned int max is 4,294,967,295, 10 chars */
		char s_scale[11] = {};
		int ret = 0;

		if (len > ARRAY_SIZE(s_scale))
			return -EINVAL;

		memcpy(s_scale, p, len - 1);
		ret = kstrtouint(s_scale, 10, &scratch_scale);
		if (!ret)
			pr_notice("scratch scale is %d%%\n", scratch_scale);
		return ret;
	}

	/* parse ll[KMG],mm[KMG],nn[KMG] */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		char *endp = p;

		if (i > 0) {
			if (*p != ',')
				return -EINVAL;
			p += 1;
		}

		sizes[i] = memparse(p, &endp);
		if (endp == p)
			return -EINVAL;
		p = endp;
		total_size += sizes[i];
	}

	if (!total_size)
		return -EINVAL;

	/* The string should be fully consumed by now. */
	if (*p)
		return -EINVAL;

	scratch_size_lowmem = sizes[0];
	scratch_size_global = sizes[1];
	scratch_size_pernode = sizes[2];
	scratch_scale = 0;

	pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n",
		  (u64)(scratch_size_lowmem >> 20),
		  (u64)(scratch_size_global >> 20),
		  (u64)(scratch_size_pernode >> 20));

	return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);

static void __init scratch_size_update(void)
{
	phys_addr_t size;

	if (!scratch_scale)
		return;

	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100;
	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100 - scratch_size_lowmem;
	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

static phys_addr_t __init scratch_size_node(int nid)
{
	phys_addr_t size;

	if (scratch_scale) {
		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
						   nid);
		size = size * scratch_scale / 100;
	} else {
		size = scratch_size_pernode;
	}

	return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

/**
 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
 *
 * With KHO we can preserve arbitrary pages in the system. To ensure we still
 * have a large contiguous region of memory when we search the physical address
 * space for target memory, let's make sure we always have a large CMA region
 * active. This CMA region will only be used for movable pages which are not a
 * problem for us during KHO because we can just move them somewhere else.
 */
static void __init kho_reserve_scratch(void)
{
	phys_addr_t addr, size;
	int nid, i = 0;

	if (!kho_enable)
		return;

	scratch_size_update();

	/* FIXME: deal with node hot-plug/remove */
	kho_scratch_cnt = num_online_nodes() + 2;
	size = kho_scratch_cnt * sizeof(*kho_scratch);
	kho_scratch = memblock_alloc(size, PAGE_SIZE);
	if (!kho_scratch)
		goto err_disable_kho;

	/*
	 * reserve scratch area in low memory for lowmem allocations in the
	 * next kernel
	 */
	size = scratch_size_lowmem;
	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
					 ARCH_LOW_ADDRESS_LIMIT);
	if (!addr)
		goto err_free_scratch_desc;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/* reserve large contiguous area for allocations without nid */
	size = scratch_size_global;
	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
	if (!addr)
		goto err_free_scratch_areas;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	for_each_online_node(nid) {
		size = scratch_size_node(nid);
		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
						0, MEMBLOCK_ALLOC_ACCESSIBLE,
						nid, true);
		if (!addr)
			goto err_free_scratch_areas;

		kho_scratch[i].addr = addr;
		kho_scratch[i].size = size;
		i++;
	}

	return;

err_free_scratch_areas:
	for (i--; i >= 0; i--)
		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
	pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
	kho_enable = false;
}

/**
 * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
 * @name: name of the sub tree.
 * @fdt: the sub tree blob.
 *
 * Creates a new child node named @name in KHO root FDT and records
 * the physical address of @fdt. The pages of @fdt must also be preserved
 * by KHO for the new kernel to retrieve it after kexec.
 *
 * A debugfs blob entry is also created at
 * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when the kernel is configured
 * with CONFIG_KEXEC_HANDOVER_DEBUGFS.
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(const char *name, void *fdt)
{
	phys_addr_t phys = virt_to_phys(fdt);
	void *root_fdt = kho_out.fdt;
	int err = -ENOMEM;
	int off, fdt_err;

	guard(mutex)(&kho_out.lock);

	fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
	if (fdt_err < 0)
		return err;

	off = fdt_add_subnode(root_fdt, 0, name);
	if (off < 0) {
		if (off == -FDT_ERR_EXISTS)
			err = -EEXIST;
		goto out_pack;
	}

	err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys));
	if (err < 0)
		goto out_pack;

	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));

out_pack:
	fdt_pack(root_fdt);

	return err;
}
EXPORT_SYMBOL_GPL(kho_add_subtree);

void kho_remove_subtree(void *fdt)
{
	phys_addr_t target_phys = virt_to_phys(fdt);
	void *root_fdt = kho_out.fdt;
	int off;
	int err;

	guard(mutex)(&kho_out.lock);

	err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
	if (err < 0)
		return;

	for (off = fdt_first_subnode(root_fdt, 0); off >= 0;
	     off = fdt_next_subnode(root_fdt, off)) {
		const u64 *val;
		int len;

		val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len);
		if (!val || len != sizeof(phys_addr_t))
			continue;

		if ((phys_addr_t)*val == target_phys) {
			fdt_del_node(root_fdt, off);
			kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
			break;
		}
	}

	fdt_pack(root_fdt);
}
EXPORT_SYMBOL_GPL(kho_remove_subtree);

/**
 * kho_preserve_folio - preserve a folio across kexec.
 * @folio: folio to preserve.
 *
 * Instructs KHO to preserve the whole folio across kexec. The order
 * will be preserved as well.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.track;

	if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
		return -EINVAL;

	return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
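/*
 * Usage sketch (illustrative, error handling trimmed; how @phys reaches the
 * next kernel is up to the caller, typically via a sub FDT registered with
 * kho_add_subtree()):
 *
 *	// Outgoing kernel:
 *	struct folio *folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 2);
 *	phys_addr_t phys = PFN_PHYS(folio_pfn(folio));
 *	int err = kho_preserve_folio(folio);
 *
 *	// Successor kernel: the folio comes back with its order (2) intact.
 *	struct folio *folio = kho_restore_folio(phys);
 */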

/**
 * kho_unpreserve_folio - unpreserve a folio.
 * @folio: folio to unpreserve.
 *
 * Instructs KHO to unpreserve a folio that was preserved by
 * kho_preserve_folio() before. The provided @folio (pfn and order)
 * must exactly match a previously preserved folio.
 */
void kho_unpreserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.track;

	__kho_unpreserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_folio);

/**
 * kho_preserve_pages - preserve contiguous pages across kexec
 * @page: first page in the list.
 * @nr_pages: number of pages.
 *
 * Preserve a contiguous list of order 0 pages. Must be restored using
 * kho_restore_pages() to ensure the pages are restored properly as order 0.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_pages(struct page *page, unsigned int nr_pages)
{
	struct kho_mem_track *track = &kho_out.track;
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;
	unsigned long failed_pfn = 0;
	int err = 0;

	if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT,
					nr_pages << PAGE_SHIFT))) {
		return -EINVAL;
	}

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		err = __kho_preserve_order(track, pfn, order);
		if (err) {
			failed_pfn = pfn;
			break;
		}

		pfn += 1 << order;
	}

	if (err)
		__kho_unpreserve(track, start_pfn, failed_pfn);

	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_pages);

/**
 * kho_unpreserve_pages - unpreserve contiguous pages.
 * @page: first page in the list.
 * @nr_pages: number of pages.
 *
 * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page.
 * This must be called with the same @page and @nr_pages as the corresponding
 * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
 * preserved blocks is not supported.
 */
void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
{
	struct kho_mem_track *track = &kho_out.track;
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;

	__kho_unpreserve(track, start_pfn, end_pfn);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
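/*
 * Usage sketch (illustrative, error handling trimmed): the same first page
 * and page count must be used for preservation and restoration, and the
 * range always comes back as order 0 pages.
 *
 *	// Outgoing kernel, for a physically contiguous buffer @buf:
 *	int err = kho_preserve_pages(virt_to_page(buf), nr_pages);
 *
 *	// Successor kernel, with @phys and @nr_pages recovered by the caller:
 *	struct page *first = kho_restore_pages(phys, nr_pages);
 *	void *buf = page_to_virt(first);
 */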

struct kho_vmalloc_hdr {
	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
};

#define KHO_VMALLOC_SIZE \
	((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
	 sizeof(phys_addr_t))

struct kho_vmalloc_chunk {
	struct kho_vmalloc_hdr hdr;
	phys_addr_t phys[KHO_VMALLOC_SIZE];
};

static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);

/* vmalloc flags KHO supports */
#define KHO_VMALLOC_SUPPORTED_FLAGS	(VM_ALLOC | VM_ALLOW_HUGE_VMAP)

/* KHO internal flags for vmalloc preservations */
#define KHO_VMALLOC_ALLOC	0x0001
#define KHO_VMALLOC_HUGE_VMAP	0x0002

static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
{
	unsigned short kho_flags = 0;

	if (vm_flags & VM_ALLOC)
		kho_flags |= KHO_VMALLOC_ALLOC;
	if (vm_flags & VM_ALLOW_HUGE_VMAP)
		kho_flags |= KHO_VMALLOC_HUGE_VMAP;

	return kho_flags;
}

static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
{
	unsigned int vm_flags = 0;

	if (kho_flags & KHO_VMALLOC_ALLOC)
		vm_flags |= VM_ALLOC;
	if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
		vm_flags |= VM_ALLOW_HUGE_VMAP;

	return vm_flags;
}

static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
{
	struct kho_vmalloc_chunk *chunk;
	int err;

	chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
	if (!chunk)
		return NULL;

	err = kho_preserve_pages(virt_to_page(chunk), 1);
	if (err)
		goto err_free;
	if (cur)
		KHOSER_STORE_PTR(cur->hdr.next, chunk);
	return chunk;

err_free:
	free_page((unsigned long)chunk);
	return NULL;
}

static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
					 unsigned short order)
{
	struct kho_mem_track *track = &kho_out.track;
	unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));

	__kho_unpreserve(track, pfn, pfn + 1);

	for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
		pfn = PHYS_PFN(chunk->phys[i]);
		__kho_unpreserve(track, pfn, pfn + (1 << order));
	}
}

/**
 * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
 * @ptr: pointer to the area in vmalloc address space
 * @preservation: placeholder for preservation metadata
 *
 * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
 * physical pages mapped at @ptr will be preserved and on successful return
 * @preservation will hold the physical address of a structure that describes
 * the preservation.
 *
 * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
 * restored on the same node.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk;
	struct vm_struct *vm = find_vm_area(ptr);
	unsigned int order, flags, nr_contig_pages;
	unsigned int idx = 0;
	int err;

	if (!vm)
		return -EINVAL;

	if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return -EOPNOTSUPP;

	flags = vmalloc_flags_to_kho(vm->flags);
	order = get_vm_area_page_order(vm);

	chunk = new_vmalloc_chunk(NULL);
	if (!chunk)
		return -ENOMEM;
	KHOSER_STORE_PTR(preservation->first, chunk);

	nr_contig_pages = (1 << order);
	for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
		phys_addr_t phys = page_to_phys(vm->pages[i]);

		err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
		if (err)
			goto err_free;

		chunk->phys[idx++] = phys;
		if (idx == ARRAY_SIZE(chunk->phys)) {
			chunk = new_vmalloc_chunk(chunk);
			if (!chunk)
				goto err_free;
			idx = 0;
		}
	}

	preservation->total_pages = vm->nr_pages;
	preservation->flags = flags;
	preservation->order = order;

	return 0;

err_free:
	kho_unpreserve_vmalloc(preservation);
	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);

/**
 * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc()
 * @preservation: preservation metadata returned by kho_preserve_vmalloc()
 *
 * Instructs KHO to unpreserve the area in vmalloc address space that was
 * previously preserved with kho_preserve_vmalloc().
 */
void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);

	while (chunk) {
		struct kho_vmalloc_chunk *tmp = chunk;

		kho_vmalloc_unpreserve_chunk(chunk, preservation->order);

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		free_page((unsigned long)tmp);
	}
}
EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);

/**
 * kho_restore_vmalloc - recreates and populates an area in vmalloc address
 * space from the preserved memory.
 * @preservation: preservation metadata.
 *
 * Recreates an area in vmalloc address space and populates it with memory that
 * was preserved using kho_preserve_vmalloc().
 *
 * Return: pointer to the area in the vmalloc address space, NULL on failure.
 */
void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
	unsigned int align, order, shift, vm_flags;
	unsigned long total_pages, contig_pages;
	unsigned long addr, size;
	struct vm_struct *area;
	struct page **pages;
	unsigned int idx = 0;
	int err;

	vm_flags = kho_flags_to_vmalloc(preservation->flags);
	if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return NULL;

	total_pages = preservation->total_pages;
	pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;
	order = preservation->order;
	contig_pages = (1 << order);
	shift = PAGE_SHIFT + order;
	align = 1 << shift;

	while (chunk) {
		struct page *page;

		for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
			phys_addr_t phys = chunk->phys[i];

			if (idx + contig_pages > total_pages)
				goto err_free_pages_array;

			page = kho_restore_pages(phys, contig_pages);
			if (!page)
				goto err_free_pages_array;

			for (int j = 0; j < contig_pages; j++)
				pages[idx++] = page;

			phys += contig_pages * PAGE_SIZE;
		}

		page = kho_restore_pages(virt_to_phys(chunk), 1);
		if (!page)
			goto err_free_pages_array;
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		__free_page(page);
	}

	if (idx != total_pages)
		goto err_free_pages_array;

	area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
				  vm_flags, VMALLOC_START, VMALLOC_END,
				  NUMA_NO_NODE, GFP_KERNEL,
				  __builtin_return_address(0));
	if (!area)
		goto err_free_pages_array;

	addr = (unsigned long)area->addr;
	size = get_vm_area_size(area);
	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
	if (err)
		goto err_free_vm_area;

	area->nr_pages = total_pages;
	area->pages = pages;

	return area->addr;

err_free_vm_area:
	free_vm_area(area);
err_free_pages_array:
	kvfree(pages);
	return NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
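/*
 * Usage sketch (illustrative, error handling trimmed). The small struct
 * kho_vmalloc descriptor itself still has to be handed over by the caller,
 * e.g. inside a preserved structure or a sub FDT:
 *
 *	// Outgoing kernel:
 *	struct kho_vmalloc pres;
 *	void *buf = vmalloc(SZ_1M);
 *	int err = kho_preserve_vmalloc(buf, &pres);
 *
 *	// Successor kernel, after recovering @pres:
 *	void *buf = kho_restore_vmalloc(&pres);
 *	// ... use @buf, then release it like a regular vmalloc() area ...
 *	vfree(buf);
 */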

/**
 * kho_alloc_preserve - Allocate, zero, and preserve memory.
 * @size: The number of bytes to allocate.
 *
 * Allocates a physically contiguous block of zeroed pages that is large
 * enough to hold @size bytes. The allocated memory is then registered with
 * KHO for preservation across a kexec.
 *
 * Note: The actual allocated size will be rounded up to the nearest
 * power-of-two page boundary.
 *
 * Return: A virtual pointer to the allocated and preserved memory on success,
 * or an ERR_PTR() encoded error on failure.
 */
void *kho_alloc_preserve(size_t size)
{
	struct folio *folio;
	int order, ret;

	if (!size)
		return ERR_PTR(-EINVAL);

	order = get_order(size);
	if (order > MAX_PAGE_ORDER)
		return ERR_PTR(-E2BIG);

	folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
	if (!folio)
		return ERR_PTR(-ENOMEM);

	ret = kho_preserve_folio(folio);
	if (ret) {
		folio_put(folio);
		return ERR_PTR(ret);
	}

	return folio_address(folio);
}
EXPORT_SYMBOL_GPL(kho_alloc_preserve);

/**
 * kho_unpreserve_free - Unpreserve and free memory.
 * @mem: Pointer to the memory allocated by kho_alloc_preserve().
 *
 * Unregisters the memory from KHO preservation and frees the underlying
 * pages back to the system. This function should be called to clean up
 * memory allocated with kho_alloc_preserve().
 */
void kho_unpreserve_free(void *mem)
{
	struct folio *folio;

	if (!mem)
		return;

	folio = virt_to_folio(mem);
	kho_unpreserve_folio(folio);
	folio_put(folio);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_free);

/**
 * kho_restore_free - Restore and free memory after kexec.
 * @mem: Pointer to the memory (in the new kernel's address space)
 * that was allocated by the old kernel.
 *
 * This function is intended to be called in the new kernel (post-kexec)
 * to take ownership of and free a memory region that was preserved by the
 * old kernel using kho_alloc_preserve().
 *
 * It first restores the pages from KHO (using their physical address)
 * and then frees the pages back to the new kernel's page allocator.
 */
void kho_restore_free(void *mem)
{
	struct folio *folio;

	if (!mem)
		return;

	folio = kho_restore_folio(__pa(mem));
	if (!WARN_ON(!folio))
		folio_put(folio);
}
EXPORT_SYMBOL_GPL(kho_restore_free);
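/*
 * Lifecycle sketch (illustrative, error handling trimmed):
 * kho_alloc_preserve() pairs with kho_unpreserve_free() if the handover is
 * abandoned in the same kernel, and with kho_restore_free() in the successor
 * kernel once the preserved data has been consumed:
 *
 *	// Outgoing kernel:
 *	void *state = kho_alloc_preserve(SZ_4K);
 *	// ... fill @state and publish __pa(state), e.g. in a sub FDT ...
 *
 *	// Successor kernel, after reading the physical address back:
 *	void *state = phys_to_virt(phys);
 *	// ... consume the contents ...
 *	kho_restore_free(state);
 */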

int kho_finalize(void)
{
	int ret;

	if (!kho_enable)
		return -EOPNOTSUPP;

	guard(mutex)(&kho_out.lock);
	ret = kho_mem_serialize(&kho_out);
	if (ret)
		return ret;

	kho_out.finalized = true;

	return 0;
}

bool kho_finalized(void)
{
	guard(mutex)(&kho_out.lock);
	return kho_out.finalized;
}

struct kho_in {
	phys_addr_t fdt_phys;
	phys_addr_t scratch_phys;
	struct kho_debugfs dbg;
};

static struct kho_in kho_in = {
};

static const void *kho_get_fdt(void)
{
	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
}

/**
 * is_kho_boot - check if current kernel was booted via KHO-enabled
 * kexec
 *
 * This function checks if the current kernel was loaded through a kexec
 * operation with KHO enabled, by verifying that a valid KHO FDT
 * was passed.
 *
 * Note: This function returns reliable results only after
 * kho_populate() has been called during early boot. Before that,
 * it may return false even if KHO data is present.
 *
 * Return: true if booted via KHO-enabled kexec, false otherwise
 */
bool is_kho_boot(void)
{
	return !!kho_get_fdt();
}
EXPORT_SYMBOL_GPL(is_kho_boot);

/**
 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
 * @name: the name of the sub FDT passed to kho_add_subtree().
 * @phys: if found, the physical address of the sub FDT is stored in @phys.
 *
 * Retrieve a preserved sub FDT named @name and store its physical
 * address in @phys.
 *
 * Return: 0 on success, error code on failure
 */
int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
{
	const void *fdt = kho_get_fdt();
	const u64 *val;
	int offset, len;

	if (!fdt)
		return -ENOENT;

	if (!phys)
		return -EINVAL;

	offset = fdt_subnode_offset(fdt, 0, name);
	if (offset < 0)
		return -ENOENT;

	val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
	if (!val || len != sizeof(*val))
		return -EINVAL;

	*phys = (phys_addr_t)*val;

	return 0;
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
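/*
 * Round-trip sketch for the sub FDT interface (illustrative; the node name
 * "example" and the parse_example_fdt() helper are made up, and error
 * handling is trimmed):
 *
 *	// Outgoing kernel: build a small FDT describing preserved state and
 *	// register it under a well-known name.
 *	void *fdt = kho_alloc_preserve(PAGE_SIZE);
 *	// ... fdt_create()/fdt_property()/fdt_finish() calls ...
 *	int err = kho_add_subtree("example", fdt);
 *
 *	// Successor kernel: look the blob up by the same name.
 *	phys_addr_t phys;
 *	if (!kho_retrieve_subtree("example", &phys))
 *		parse_example_fdt(phys_to_virt(phys));
 */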

static __init int kho_out_fdt_setup(void)
{
	void *root = kho_out.fdt;
	u64 empty_mem_map = 0;
	int err;

	err = fdt_create(root, PAGE_SIZE);
	err |= fdt_finish_reservemap(root);
	err |= fdt_begin_node(root, "");
	err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
	err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map,
			    sizeof(empty_mem_map));
	err |= fdt_end_node(root);
	err |= fdt_finish(root);

	return err;
}

static __init int kho_init(void)
{
	const void *fdt = kho_get_fdt();
	int err = 0;

	if (!kho_enable)
		return 0;

	kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
	if (IS_ERR(kho_out.fdt)) {
		err = PTR_ERR(kho_out.fdt);
		goto err_free_scratch;
	}

	err = kho_debugfs_init();
	if (err)
		goto err_free_fdt;

	err = kho_out_debugfs_init(&kho_out.dbg);
	if (err)
		goto err_free_fdt;

	err = kho_out_fdt_setup();
	if (err)
		goto err_free_fdt;

	if (fdt) {
		kho_in_debugfs_init(&kho_in.dbg, fdt);
		return 0;
	}

	for (int i = 0; i < kho_scratch_cnt; i++) {
		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
		unsigned long pfn;

		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}

	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
					 kho_out.fdt, true));

	return 0;

err_free_fdt:
	kho_unpreserve_free(kho_out.fdt);
err_free_scratch:
	kho_out.fdt = NULL;
	for (int i = 0; i < kho_scratch_cnt; i++) {
		void *start = __va(kho_scratch[i].addr);
		void *end = start + kho_scratch[i].size;

		free_reserved_area(start, end, -1, "");
	}
	kho_enable = false;
	return err;
}
fs_initcall(kho_init);

static void __init kho_release_scratch(void)
{
	phys_addr_t start, end;
	u64 i;

	memmap_init_kho_scratch_pages();

	/*
	 * Mark scratch mem as CMA before we return it. That way we
	 * ensure that no kernel allocations happen on it. That means
	 * we can reuse it as scratch memory again later.
	 */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
		ulong end_pfn = pageblock_align(PFN_UP(end));
		ulong pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
			init_pageblock_migratetype(pfn_to_page(pfn),
						   MIGRATE_CMA, false);
	}
}

void __init kho_memory_init(void)
{
	if (kho_in.scratch_phys) {
		kho_scratch = phys_to_virt(kho_in.scratch_phys);
		kho_release_scratch();

		if (!kho_mem_deserialize(kho_get_fdt()))
			kho_in.fdt_phys = 0;
	} else {
		kho_reserve_scratch();
	}
}

void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
			 phys_addr_t scratch_phys, u64 scratch_len)
{
	void *fdt = NULL;
	struct kho_scratch *scratch = NULL;
	int err = 0;
	unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);

	/* Validate the input FDT */
	fdt = early_memremap(fdt_phys, fdt_len);
	if (!fdt) {
		pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
		err = -EFAULT;
		goto out;
	}
	err = fdt_check_header(fdt);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
			fdt_phys, err);
		err = -EINVAL;
		goto out;
	}
	err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
			fdt_phys, KHO_FDT_COMPATIBLE, err);
		err = -EINVAL;
		goto out;
	}

	scratch = early_memremap(scratch_phys, scratch_len);
	if (!scratch) {
		pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
			scratch_phys, scratch_len);
		err = -EFAULT;
		goto out;
	}

	/*
	 * The previous kernel passed us safe contiguous blocks of memory to
	 * use for early boot purposes, so that we can resize the memblock
	 * array as needed.
	 */
	for (int i = 0; i < scratch_cnt; i++) {
		struct kho_scratch *area = &scratch[i];
		u64 size = area->size;

		memblock_add(area->addr, size);
		err = memblock_mark_kho_scratch(area->addr, size);
		if (WARN_ON(err)) {
			pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe",
				&area->addr, &size, ERR_PTR(err));
			goto out;
		}
		pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
	}

	memblock_reserve(scratch_phys, scratch_len);

	/*
	 * Now that we have a viable region of scratch memory, let's tell
	 * the memblock allocator to only use that for any allocations.
	 * That way we ensure that nothing scribbles over in-use data while
	 * we initialize the page tables, which we will need to ingest all
	 * memory reservations from the previous kernel.
	 */
	memblock_set_kho_scratch_only();

	kho_in.fdt_phys = fdt_phys;
	kho_in.scratch_phys = scratch_phys;
	kho_scratch_cnt = scratch_cnt;
	pr_info("found kexec handover data.\n");

out:
	if (fdt)
		early_memunmap(fdt, fdt_len);
	if (scratch)
		early_memunmap(scratch, scratch_len);
	if (err)
		pr_warn("disabling KHO revival: %d\n", err);
}

/* Helper functions for kexec_file_load */

int kho_fill_kimage(struct kimage *image)
{
	ssize_t scratch_size;
	int err = 0;
	struct kexec_buf scratch;

	if (!kho_enable)
		return 0;

	image->kho.fdt = virt_to_phys(kho_out.fdt);

	scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
	scratch = (struct kexec_buf){
		.image = image,
		.buffer = kho_scratch,
		.bufsz = scratch_size,
		.mem = KEXEC_BUF_MEM_UNKNOWN,
		.memsz = scratch_size,
		.buf_align = SZ_64K, /* Makes it easier to map */
		.buf_max = ULONG_MAX,
		.top_down = true,
	};
	err = kexec_add_buffer(&scratch);
	if (err)
		return err;
	image->kho.scratch = &image->segment[image->nr_segments - 1];

	return 0;
}

static int kho_walk_scratch(struct kexec_buf *kbuf,
			    int (*func)(struct resource *, void *))
{
	int ret = 0;
	int i;

	for (i = 0; i < kho_scratch_cnt; i++) {
		struct resource res = {
			.start = kho_scratch[i].addr,
			.end = kho_scratch[i].addr + kho_scratch[i].size - 1,
		};

		/* Try to fit the kimage into our KHO scratch region */
		ret = func(&res, kbuf);
		if (ret)
			break;
	}

	return ret;
}

int kho_locate_mem_hole(struct kexec_buf *kbuf,
			int (*func)(struct resource *, void *))
{
	int ret;

	if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
		return 1;

	ret = kho_walk_scratch(kbuf, func);

	return ret == 1 ? 0 : -EADDRNOTAVAIL;
}