// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec_handover.c - kexec handover metadata processing
 * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
 * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
 * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
 * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
 */

#define pr_fmt(fmt) "KHO: " fmt

#include <linux/cleanup.h>
#include <linux/cma.h>
#include <linux/kmemleak.h>
#include <linux/count_zeros.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/kho/abi/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/page-isolation.h>
#include <linux/unaligned.h>
#include <linux/vmalloc.h>

#include <asm/early_ioremap.h>

/*
 * KHO is tightly coupled with mm init and needs access to some of mm
 * internal APIs.
 */
#include "../../mm/internal.h"
#include "../kexec_internal.h"
#include "kexec_handover_internal.h"

/* The magic token for preserved pages */
#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */

/*
 * KHO uses page->private, which is an unsigned long, to store page metadata.
 * Use it to store both the magic and the order.
 */
union kho_page_info {
	unsigned long page_private;
	struct {
		unsigned int order;	/* folio order of the preserved page */
		unsigned int magic;	/* KHO_PAGE_MAGIC on preserved head pages */
	};
};

static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));

/* Whether KHO is active; boot default comes from Kconfig, overridable via "kho=" */
static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT);

bool kho_is_enabled(void)
{
	return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);

/* Early command-line parser for "kho=<bool>" */
static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);

/*
 * Keep track of memory that is to be preserved across KHO.
 *
 * The serializing side uses two levels of xarrays to manage chunks of per-order
 * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order
 * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0
 * allocations each bitmap will cover 128M of address space. Thus, for 16G of
 * memory at most 512K of bitmap memory will be needed for order 0.
 *
 * This approach is fully incremental, as the serialization progresses folios
 * can continue be aggregated to the tracker. The final step, immediately prior
 * to kexec would serialize the xarray information into a linked list for the
 * successor kernel to parse.
 */

/* Number of preservation bits held by one PAGE_SIZE bitmap page */
#define PRESERVE_BITS (PAGE_SIZE * 8)

struct kho_mem_phys_bits {
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE);

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
	 * to order.
	 */
	struct xarray phys_bits;
};

struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};

struct khoser_mem_chunk;

/* Outgoing (serializing) KHO state for the current kernel */
struct kho_out {
	void *fdt;		/* root KHO FDT handed to the next kernel */
	bool finalized;		/* set once the memory map has been serialized */
	struct mutex lock; /* protects KHO FDT finalization */

	struct kho_mem_track track;
	struct kho_debugfs dbg;
};

static struct kho_out kho_out = {
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.track = {
		.orders = XARRAY_INIT(kho_out.track.orders, 0),
	},
	.finalized = false,
};

/*
 * Return the element at @index, allocating and inserting a zeroed page when
 * the slot is empty. A concurrent insertion is resolved via xa_cmpxchg(): the
 * loser's page is released by the __free(free_page) cleanup and the winner's
 * element is returned instead. Returns ERR_PTR() on allocation or xarray
 * failure, or if the freshly allocated page overlaps the KHO scratch area
 * (such a page could not survive kexec).
 */
static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
{
	void *res = xa_load(xa, index);

	if (res)
		return res;

	void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);

	if (!elm)
		return ERR_PTR(-ENOMEM);

	if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
		return ERR_PTR(-EINVAL);

	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
	if (xa_is_err(res))
		return ERR_PTR(xa_err(res));
	else if (res)
		return res;

	return no_free_ptr(elm);
}

/*
 * Clear the preservation bit for one 1 << @order sized block starting at @pfn.
 * Both the per-order tree and the bitmap page must already exist, i.e. the
 * block must have been preserved before; otherwise this WARNs and bails out.
 */
static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
				   unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;
	const unsigned long pfn_high = pfn >> order;

	physxa = xa_load(&track->orders, order);
	if (WARN_ON_ONCE(!physxa))
		return;

	bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
	if (WARN_ON_ONCE(!bits))
		return;

	clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
}

/*
 * Clear preservation bits for the pfn range [pfn, end_pfn), decomposing the
 * range into maximal power-of-two blocks (largest order allowed by both the
 * alignment of pfn and the remaining length) — the same decomposition used by
 * kho_preserve_pages(), so unpreserve mirrors preserve exactly.
 */
static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
			     unsigned long end_pfn)
{
	unsigned int order;

	while (pfn < end_pfn) {
		order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		__kho_unpreserve_order(track, pfn, order);

		pfn += 1 << order;
	}
}
/*
 * Set the preservation bit for one 1 << @order sized block starting at @pfn.
 * The per-order xarray and the bitmap page are allocated on demand; a race
 * on creating the per-order tree is resolved with xa_cmpxchg() and the
 * loser's allocation is freed.
 */
static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
				unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa, *new_physxa;
	const unsigned long pfn_high = pfn >> order;

	might_sleep();
	physxa = xa_load(&track->orders, order);
	if (!physxa) {
		int err;

		new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
		if (!new_physxa)
			return -ENOMEM;

		xa_init(&new_physxa->phys_bits);
		physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
				    GFP_KERNEL);

		err = xa_err(physxa);
		if (err || physxa) {
			/* Lost the race (or failed): drop our copy */
			xa_destroy(&new_physxa->phys_bits);
			kfree(new_physxa);

			if (err)
				return err;
		} else {
			physxa = new_physxa;
		}
	}

	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
	if (IS_ERR(bits))
		return PTR_ERR(bits);

	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);

	return 0;
}

/*
 * Reconstruct one preserved block at @phys in the new kernel. Validates the
 * magic/order that deserialize_bitmap() stashed in page->private, resets the
 * refcounts, optionally rebuilds the compound page, and adds the pages back
 * to the managed page count. Returns NULL if @phys was not preserved (or was
 * already restored, since page->private is cleared on restore).
 */
static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	unsigned int nr_pages, ref_cnt;
	union kho_page_info info;

	if (!page)
		return NULL;

	info.page_private = page->private;
	/*
	 * deserialize_bitmap() only sets the magic on the head page. This magic
	 * check also implicitly makes sure phys is order-aligned since for
	 * non-order-aligned phys addresses, magic will never be set.
	 */
	if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER))
		return NULL;
	nr_pages = (1 << info.order);

	/* Clear private to make sure later restores on this page error out. */
	page->private = 0;
	/* Head page gets refcount of 1. */
	set_page_count(page, 1);

	/*
	 * For higher order folios, tail pages get a page count of zero.
	 * For physically contiguous order-0 pages every page gets a page
	 * count of 1
	 */
	ref_cnt = is_folio ? 0 : 1;
	for (unsigned int i = 1; i < nr_pages; i++)
		set_page_count(page + i, ref_cnt);

	if (is_folio && info.order)
		prep_compound_page(page, info.order);

	adjust_managed_page_count(page, nr_pages);
	return page;
}

/**
 * kho_restore_folio - recreates the folio from the preserved memory.
 * @phys: physical address of the folio.
 *
 * Return: pointer to the struct folio on success, NULL on failure.
 */
struct folio *kho_restore_folio(phys_addr_t phys)
{
	struct page *page = kho_restore_page(phys, true);

	return page ? page_folio(page) : NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_folio);

/**
 * kho_restore_pages - restore list of contiguous order 0 pages.
 * @phys: physical address of the first page.
 * @nr_pages: number of pages.
 *
 * Restore a contiguous list of order 0 pages that was preserved with
 * kho_preserve_pages().
 *
 * Return: pointer to the first struct page on success, NULL on failure.
 */
struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
{
	const unsigned long start_pfn = PHYS_PFN(phys);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;

	while (pfn < end_pfn) {
		/* Same power-of-two decomposition as kho_preserve_pages() */
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		struct page *page = kho_restore_page(PFN_PHYS(pfn), false);

		if (!page)
			return NULL;
		pfn += 1 << order;
	}

	return pfn_to_page(start_pfn);
}
EXPORT_SYMBOL_GPL(kho_restore_pages);

/* Serialize and deserialize struct kho_mem_phys across kexec
 *
 * Record all the bitmaps in a linked list of pages for the next kernel to
 * process. Each chunk holds bitmaps of the same order and each block of bitmaps
 * starts at a given physical address. This allows the bitmaps to be sparse. The
 * xarray is used to store them in a tree while building up the data structure,
 * but the KHO successor kernel only needs to process them once in order.
 *
 * All of this memory is normal kmalloc() memory and is not marked for
 * preservation. The successor kernel will remain isolated to the scratch space
 * until it completes processing this list. Once processed all the memory
 * storing these ranges will be marked as free.
 */

struct khoser_mem_bitmap_ptr {
	phys_addr_t phys_start;
	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
};

struct khoser_mem_chunk_hdr {
	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
	unsigned int order;
	unsigned int num_elms;
};

#define KHOSER_BITMAP_SIZE                                     \
	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
	 sizeof(struct khoser_mem_bitmap_ptr))

struct khoser_mem_chunk {
	struct khoser_mem_chunk_hdr hdr;
	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
};

static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);

/*
 * Allocate a zeroed serialization chunk for bitmaps of @order and link it
 * after @cur_chunk (if any). Rejects chunks that land in the scratch area,
 * since the successor kernel must be able to read them.
 */
static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
					  unsigned long order)
{
	struct khoser_mem_chunk *chunk __free(free_page) = NULL;

	chunk = (void *)get_zeroed_page(GFP_KERNEL);
	if (!chunk)
		return ERR_PTR(-ENOMEM);

	if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE)))
		return ERR_PTR(-EINVAL);

	chunk->hdr.order = order;
	if (cur_chunk)
		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
	return no_free_ptr(chunk);
}

/* Free an entire linked list of serialization chunks starting at @first_chunk. */
static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
{
	struct khoser_mem_chunk *chunk = first_chunk;

	while (chunk) {
		struct khoser_mem_chunk *tmp = chunk;

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		free_page((unsigned long)tmp);
	}
}

/*
 * Update memory map property, if old one is found discard it via
 * kho_mem_ser_free().
 */
static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
{
	void *ptr;
	u64 phys;

	/*
	 * NOTE(review): assumes KHO_FDT_MEMORY_MAP_PROP_NAME always exists in
	 * the root FDT (presumably created during KHO init); ptr is not
	 * NULL-checked before dereference — confirm against FDT setup code.
	 */
	ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL);

	/* Check and discard previous memory map */
	phys = get_unaligned((u64 *)ptr);
	if (phys)
		kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys));

	/* Update with the new value */
	phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0;
	put_unaligned(phys, (u64 *)ptr);
}

/*
 * Walk the order->bitmap xarray trees and flatten them into the linked list
 * of khoser_mem_chunk pages, then publish the list head in the root FDT.
 * On failure all chunks built so far are freed and an errno is returned.
 */
static int kho_mem_serialize(struct kho_out *kho_out)
{
	struct khoser_mem_chunk *first_chunk = NULL;
	struct khoser_mem_chunk *chunk = NULL;
	struct kho_mem_phys *physxa;
	unsigned long order;
	int err = -ENOMEM;

	xa_for_each(&kho_out->track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		/* Each order starts a fresh chunk */
		chunk = new_chunk(chunk, order);
		if (IS_ERR(chunk)) {
			err = PTR_ERR(chunk);
			goto err_free;
		}

		if (!first_chunk)
			first_chunk = chunk;

		xa_for_each(&physxa->phys_bits, phys, bits) {
			struct khoser_mem_bitmap_ptr *elm;

			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
				chunk = new_chunk(chunk, order);
				if (IS_ERR(chunk)) {
					err = PTR_ERR(chunk);
					goto err_free;
				}
			}

			elm = &chunk->bitmaps[chunk->hdr.num_elms];
			chunk->hdr.num_elms++;
			/* Reverse of the pfn_high / PRESERVE_BITS indexing */
			elm->phys_start = (phys * PRESERVE_BITS)
					  << (order + PAGE_SHIFT);
			KHOSER_STORE_PTR(elm->bitmap, bits);
		}
	}

	kho_update_memory_map(first_chunk);

	return 0;

err_free:
	kho_mem_ser_free(first_chunk);
	return err;
}

/*
 * For each set bit in @elm's bitmap, reserve the corresponding block in
 * memblock and stash KHO_PAGE_MAGIC plus @order into the head page's
 * page->private so kho_restore_page() can validate it later.
 */
static void __init deserialize_bitmap(unsigned int order,
				      struct khoser_mem_bitmap_ptr *elm)
{
	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
	unsigned long bit;

	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
		int sz = 1 << (order + PAGE_SHIFT);
		phys_addr_t phys =
			elm->phys_start + (bit << (order + PAGE_SHIFT));
		struct page *page = phys_to_page(phys);
		union kho_page_info info;

		memblock_reserve(phys, sz);
		memblock_reserved_mark_noinit(phys, sz);
		info.magic = KHO_PAGE_MAGIC;
		info.order = order;
		page->private = info.page_private;
	}
}

/* Returns physical address of the preserved memory map from FDT */
static phys_addr_t __init kho_get_mem_map_phys(const void *fdt)
{
	const void *mem_ptr;
	int len;

	mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
	if (!mem_ptr || len != sizeof(u64)) {
		pr_err("failed to get preserved memory bitmaps\n");
		return 0;
	}

	return get_unaligned((const u64 *)mem_ptr);
}

/* Walk the chunk list handed over by the old kernel and process every bitmap. */
static void __init kho_mem_deserialize(struct khoser_mem_chunk *chunk)
{
	while (chunk) {
		unsigned int i;

		for (i = 0; i != chunk->hdr.num_elms; i++)
			deserialize_bitmap(chunk->hdr.order,
					   &chunk->bitmaps[i]);
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
	}
}

/*
 * With KHO enabled, memory can become fragmented because KHO regions may
 * be anywhere in physical address space. The scratch regions give us a
 * safe zones that we will never see KHO allocations from. This is where we
 * can later safely load our new kexec images into and then use the scratch
 * area for early allocations that happen before page allocator is
 * initialized.
 */
struct kho_scratch *kho_scratch;
unsigned int kho_scratch_cnt;

/*
 * The scratch areas are scaled by default as percent of memory allocated from
 * memblock. A user can override the scale with command line parameter:
 *
 * kho_scratch=N%
 *
 * It is also possible to explicitly define size for a lowmem, a global and
 * per-node scratch areas:
 *
 * kho_scratch=l[KMG],n[KMG],m[KMG]
 *
 * The explicit size definition takes precedence over scale definition.
 */
static unsigned int scratch_scale __initdata = 200;
static phys_addr_t scratch_size_global __initdata;
static phys_addr_t scratch_size_pernode __initdata;
static phys_addr_t scratch_size_lowmem __initdata;

/*
 * Parse "kho_scratch=" — either a percentage ("nn%") or an explicit triple
 * of lowmem,global,pernode sizes. Explicit sizes clear scratch_scale so the
 * scale path is skipped.
 */
static int __init kho_parse_scratch_size(char *p)
{
	size_t len;
	unsigned long sizes[3];
	size_t total_size = 0;
	int i;

	if (!p)
		return -EINVAL;

	len = strlen(p);
	if (!len)
		return -EINVAL;

	/* parse nn% */
	if (p[len - 1] == '%') {
		/* unsigned int max is 4,294,967,295, 10 chars */
		char s_scale[11] = {};
		int ret = 0;

		if (len > ARRAY_SIZE(s_scale))
			return -EINVAL;

		memcpy(s_scale, p, len - 1);
		ret = kstrtouint(s_scale, 10, &scratch_scale);
		if (!ret)
			pr_notice("scratch scale is %d%%\n", scratch_scale);
		return ret;
	}

	/* parse ll[KMG],mm[KMG],nn[KMG] */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		char *endp = p;

		if (i > 0) {
			if (*p != ',')
				return -EINVAL;
			p += 1;
		}

		sizes[i] = memparse(p, &endp);
		if (endp == p)
			return -EINVAL;
		p = endp;
		total_size += sizes[i];
	}

	if (!total_size)
		return -EINVAL;

	/* The string should be fully consumed by now. */
	if (*p)
		return -EINVAL;

	scratch_size_lowmem = sizes[0];
	scratch_size_global = sizes[1];
	scratch_size_pernode = sizes[2];
	scratch_scale = 0;

	pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n",
		  (u64)(scratch_size_lowmem >> 20),
		  (u64)(scratch_size_global >> 20),
		  (u64)(scratch_size_pernode >> 20));

	return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);

/*
 * Recompute the lowmem and global scratch sizes from the scale percentage.
 * No-op when explicit sizes were given on the command line (scale == 0).
 */
static void __init scratch_size_update(void)
{
	phys_addr_t size;

	if (!scratch_scale)
		return;

	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100;
	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
					   NUMA_NO_NODE);
	/* Global share is the remainder after the lowmem portion */
	size = size * scratch_scale / 100 - scratch_size_lowmem;
	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

/* Scratch size for node @nid: scaled from its reserved memory, or the fixed per-node size. */
static phys_addr_t __init scratch_size_node(int nid)
{
	phys_addr_t size;

	if (scratch_scale) {
		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
						   nid);
		size = size * scratch_scale / 100;
	} else {
		size = scratch_size_pernode;
	}

	return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

/**
 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
 *
 * With KHO we can preserve arbitrary pages in the system. To ensure we still
 * have a large contiguous region of memory when we search the physical address
 * space for target memory, let's make sure we always have a large CMA region
 * active. This CMA region will only be used for movable pages which are not a
 * problem for us during KHO because we can just move them somewhere else.
 */
static void __init kho_reserve_scratch(void)
{
	phys_addr_t addr, size;
	int nid, i = 0;

	if (!kho_enable)
		return;

	scratch_size_update();

	/* FIXME: deal with node hot-plug/remove */
	kho_scratch_cnt = num_online_nodes() + 2;
	size = kho_scratch_cnt * sizeof(*kho_scratch);
	kho_scratch = memblock_alloc(size, PAGE_SIZE);
	if (!kho_scratch)
		goto err_disable_kho;

	/*
	 * reserve scratch area in low memory for lowmem allocations in the
	 * next kernel
	 */
	size = scratch_size_lowmem;
	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
					 ARCH_LOW_ADDRESS_LIMIT);
	if (!addr)
		goto err_free_scratch_desc;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/* reserve large contiguous area for allocations without nid */
	size = scratch_size_global;
	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
	if (!addr)
		goto err_free_scratch_areas;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	for_each_online_node(nid) {
		size = scratch_size_node(nid);
		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
						0, MEMBLOCK_ALLOC_ACCESSIBLE,
						nid, true);
		if (!addr)
			goto err_free_scratch_areas;

		kho_scratch[i].addr = addr;
		kho_scratch[i].size = size;
		i++;
	}

	return;

err_free_scratch_areas:
	for (i--; i >= 0; i--)
		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
	pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
	kho_enable = false;
}

/**
 * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
 * @name: name of the sub tree.
 * @fdt: the sub tree blob.
 *
 * Creates a new child node named @name in KHO root FDT and records
 * the physical address of @fdt. The pages of @fdt must also be preserved
 * by KHO for the new kernel to retrieve it after kexec.
 *
 * A debugfs blob entry is also created at
 * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with
 * CONFIG_KEXEC_HANDOVER_DEBUGFS
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(const char *name, void *fdt)
{
	phys_addr_t phys = virt_to_phys(fdt);
	void *root_fdt = kho_out.fdt;
	/* Default errno for the fdt_* failure paths below */
	int err = -ENOMEM;
	int off, fdt_err;

	guard(mutex)(&kho_out.lock);

	/* Reopen the packed root FDT with room (PAGE_SIZE) to grow */
	fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
	if (fdt_err < 0)
		return err;

	off = fdt_add_subnode(root_fdt, 0, name);
	if (off < 0) {
		if (off == -FDT_ERR_EXISTS)
			err = -EEXIST;
		goto out_pack;
	}

	err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME,
			  &phys, sizeof(phys));
	if (err < 0)
		goto out_pack;

	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));

out_pack:
	fdt_pack(root_fdt);

	return err;
}
EXPORT_SYMBOL_GPL(kho_add_subtree);

/**
 * kho_remove_subtree - remove a sub FDT previously recorded with kho_add_subtree().
 * @fdt: the sub tree blob passed to kho_add_subtree().
 *
 * Scans the KHO root FDT for the child node whose recorded physical address
 * matches @fdt, deletes that node, and removes the corresponding debugfs
 * entry. Silently does nothing if no matching node is found.
 */
void kho_remove_subtree(void *fdt)
{
	phys_addr_t target_phys = virt_to_phys(fdt);
	void *root_fdt = kho_out.fdt;
	int off;
	int err;

	guard(mutex)(&kho_out.lock);

	err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
	if (err < 0)
		return;

	for (off = fdt_first_subnode(root_fdt, 0); off >= 0;
	     off = fdt_next_subnode(root_fdt, off)) {
		const u64 *val;
		int len;

		val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len);
		if (!val || len != sizeof(phys_addr_t))
			continue;

		if ((phys_addr_t)*val == target_phys) {
			fdt_del_node(root_fdt, off);
			kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
			break;
		}
	}

	fdt_pack(root_fdt);
}
EXPORT_SYMBOL_GPL(kho_remove_subtree);

/**
 * kho_preserve_folio - preserve a folio across kexec.
 * @folio: folio to preserve.
 *
 * Instructs KHO to preserve the whole folio across kexec. The order
 * will be preserved as well.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.track;

	/* Scratch memory cannot be preserved — it is consumed by the next kernel */
	if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
		return -EINVAL;

	return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);

/**
 * kho_unpreserve_folio - unpreserve a folio.
 * @folio: folio to unpreserve.
 *
 * Instructs KHO to unpreserve a folio that was preserved by
 * kho_preserve_folio() before. The provided @folio (pfn and order)
 * must exactly match a previously preserved folio.
 */
void kho_unpreserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.track;

	__kho_unpreserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_folio);

/**
 * kho_preserve_pages - preserve contiguous pages across kexec
 * @page: first page in the list.
 * @nr_pages: number of pages.
 *
 * Preserve a contiguous list of order 0 pages. Must be restored using
 * kho_restore_pages() to ensure the pages are restored properly as order 0.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_pages(struct page *page, unsigned int nr_pages)
{
	struct kho_mem_track *track = &kho_out.track;
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;
	unsigned long failed_pfn = 0;
	int err = 0;

	if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT,
					nr_pages << PAGE_SHIFT))) {
		return -EINVAL;
	}

	while (pfn < end_pfn) {
		/* Largest order allowed by pfn alignment and remaining length */
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		err = __kho_preserve_order(track, pfn, order);
		if (err) {
			failed_pfn = pfn;
			break;
		}

		pfn += 1 << order;
	}

	/* Roll back the blocks preserved before the failure */
	if (err)
		__kho_unpreserve(track, start_pfn, failed_pfn);

	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_pages);

/**
 * kho_unpreserve_pages - unpreserve contiguous pages.
 * @page: first page in the list.
 * @nr_pages: number of pages.
 *
 * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page.
 * This must be called with the same @page and @nr_pages as the corresponding
 * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
 * preserved blocks is not supported.
 */
void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
{
	struct kho_mem_track *track = &kho_out.track;
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;

	__kho_unpreserve(track, start_pfn, end_pfn);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_pages);

/* vmalloc flags KHO supports */
#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP)

/* KHO internal flags for vmalloc preservations */
#define KHO_VMALLOC_ALLOC	0x0001
#define KHO_VMALLOC_HUGE_VMAP	0x0002

/* Translate VM_* flags into the KHO on-wire flag encoding. */
static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
{
	unsigned short kho_flags = 0;

	if (vm_flags & VM_ALLOC)
		kho_flags |= KHO_VMALLOC_ALLOC;
	if (vm_flags & VM_ALLOW_HUGE_VMAP)
		kho_flags |= KHO_VMALLOC_HUGE_VMAP;

	return kho_flags;
}

/* Inverse of vmalloc_flags_to_kho(): decode on-wire flags back to VM_* flags. */
static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
{
	unsigned int vm_flags = 0;

	if (kho_flags & KHO_VMALLOC_ALLOC)
		vm_flags |= VM_ALLOC;
	if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
		vm_flags |= VM_ALLOW_HUGE_VMAP;

	return vm_flags;
}

/*
 * Allocate a zeroed chunk page for a vmalloc preservation chain, preserve the
 * chunk itself across kexec, and link it after @cur (if any). Returns NULL on
 * allocation or preservation failure.
 */
static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
{
	struct kho_vmalloc_chunk *chunk;
	int err;

	chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
	if (!chunk)
		return NULL;

	err = kho_preserve_pages(virt_to_page(chunk), 1);
	if (err)
		goto err_free;
	if (cur)
		KHOSER_STORE_PTR(cur->hdr.next, chunk);
	return chunk;

err_free:
	free_page((unsigned long)chunk);
	return NULL;
}

/*
 * Unpreserve the chunk page itself and every 1 << @order block recorded in
 * it. The phys array is zero-terminated (a zeroed chunk page), so the loop
 * stops at the first empty slot.
 */
static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
					 unsigned short order)
{
	struct kho_mem_track *track = &kho_out.track;
	unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));

	__kho_unpreserve(track, pfn, pfn + 1);

	for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
		pfn = PHYS_PFN(chunk->phys[i]);
		__kho_unpreserve(track, pfn, pfn + (1 << order));
	}
}

/**
 * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
 * @ptr: pointer to the area in vmalloc address space
 * @preservation: placeholder for preservation metadata
 *
 * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
 * physical pages mapped at @ptr will be preserved and on successful return
 * @preservation will hold the physical address of a structure that describes
 * the preservation.
 *
 * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
 * restored on the same node
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk;
	struct vm_struct *vm = find_vm_area(ptr);
	unsigned int order, flags, nr_contig_pages;
	unsigned int idx = 0;
	int err;

	if (!vm)
		return -EINVAL;

	if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return -EOPNOTSUPP;

	flags = vmalloc_flags_to_kho(vm->flags);
	order = get_vm_area_page_order(vm);

	chunk = new_vmalloc_chunk(NULL);
	if (!chunk)
		return -ENOMEM;
	KHOSER_STORE_PTR(preservation->first, chunk);

	/* Pages come in physically contiguous groups of 1 << order */
	nr_contig_pages = (1 << order);
	for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
		phys_addr_t phys = page_to_phys(vm->pages[i]);

		err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
		if (err)
			goto err_free;

		chunk->phys[idx++] = phys;
		if (idx == ARRAY_SIZE(chunk->phys)) {
			chunk = new_vmalloc_chunk(chunk);
			if (!chunk)
				goto err_free;
			idx = 0;
		}
	}

	preservation->total_pages = vm->nr_pages;
	preservation->flags = flags;
	preservation->order = order;

	return 0;

err_free:
	/* Walks the chain recorded so far and unpreserves everything */
	kho_unpreserve_vmalloc(preservation);
	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);

/**
 * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc()
 * @preservation: preservation metadata returned by kho_preserve_vmalloc()
 *
 * Instructs KHO to unpreserve the area in vmalloc address space that was
 * previously preserved with kho_preserve_vmalloc().
 */
void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);

	while (chunk) {
		struct kho_vmalloc_chunk *tmp = chunk;

		kho_vmalloc_unpreserve_chunk(chunk, preservation->order);

		/* Load the next pointer before freeing the chunk page */
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		free_page((unsigned long)tmp);
	}
}
EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);

/**
 * kho_restore_vmalloc - recreates and populates an area in vmalloc address
 * space from the preserved memory.
 * @preservation: preservation metadata.
 *
 * Recreates an area in vmalloc address space and populates it with memory that
 * was preserved using kho_preserve_vmalloc().
 *
 * Return: pointer to the area in the vmalloc address space, NULL on failure.
 */
void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
	unsigned int align, order, shift, vm_flags;
	unsigned long total_pages, contig_pages;
	unsigned long addr, size;
	struct vm_struct *area;
	struct page **pages;
	unsigned int idx = 0;
	int err;

	vm_flags = kho_flags_to_vmalloc(preservation->flags);
	if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return NULL;

	total_pages = preservation->total_pages;
	pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;
	order = preservation->order;
	contig_pages = (1 << order);
	shift = PAGE_SHIFT + order;
	align = 1 << shift;

	while (chunk) {
		struct page *page;

		for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
			phys_addr_t phys = chunk->phys[i];

			/* Guard against a corrupted/oversized chunk list */
			if (idx + contig_pages > total_pages)
				goto err_free_pages_array;

			page = kho_restore_pages(phys, contig_pages);
			if (!page)
				goto err_free_pages_array;

			for (int j = 0; j < contig_pages; j++)
				pages[idx++] = page + j;

			/*
			 * NOTE(review): this increment is dead — phys is
			 * loop-local and reassigned from chunk->phys[i] on the
			 * next iteration. Harmless, but looks like leftover
			 * code; confirm intent.
			 */
			phys += contig_pages * PAGE_SIZE;
		}

		/* Restore the chunk page itself, then free it — metadata only */
		page = kho_restore_pages(virt_to_phys(chunk), 1);
		if (!page)
			goto err_free_pages_array;
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		__free_page(page);
	}

	/* The chain must account for exactly the advertised page count */
	if (idx != total_pages)
		goto err_free_pages_array;

	area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
				  vm_flags, VMALLOC_START, VMALLOC_END,
				  NUMA_NO_NODE, GFP_KERNEL,
				  __builtin_return_address(0));
	if (!area)
		goto err_free_pages_array;

	addr = (unsigned long)area->addr;
	size = get_vm_area_size(area);
	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
	if (err)
		goto err_free_vm_area;

	area->nr_pages = total_pages;
	area->pages = pages;

	return area->addr;

err_free_vm_area:
	free_vm_area(area);
err_free_pages_array:
	kvfree(pages);
	return NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_vmalloc);

/**
 * kho_alloc_preserve - Allocate, zero, and preserve memory.
 * @size: The number of bytes to allocate.
 *
 * Allocates a physically contiguous block of zeroed pages that is large
 * enough to hold @size bytes. The allocated memory is then registered with
 * KHO for preservation across a kexec.
 *
 * Note: The actual allocated size will be rounded up to the nearest
 * power-of-two page boundary.
 *
 * @return A virtual pointer to the allocated and preserved memory on success,
 * or an ERR_PTR() encoded error on failure.
 */
void *kho_alloc_preserve(size_t size)
{
	struct folio *folio;
	int order, ret;

	if (!size)
		return ERR_PTR(-EINVAL);

	order = get_order(size);
	if (order > MAX_PAGE_ORDER)
		return ERR_PTR(-E2BIG);

	folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
	if (!folio)
		return ERR_PTR(-ENOMEM);

	ret = kho_preserve_folio(folio);
	if (ret) {
		folio_put(folio);
		return ERR_PTR(ret);
	}

	return folio_address(folio);
}
EXPORT_SYMBOL_GPL(kho_alloc_preserve);

/**
 * kho_unpreserve_free - Unpreserve and free memory.
 * @mem: Pointer to the memory allocated by kho_alloc_preserve().
 *
 * Unregisters the memory from KHO preservation and frees the underlying
 * pages back to the system. This function should be called to clean up
 * memory allocated with kho_alloc_preserve().
1168 */ 1169 void kho_unpreserve_free(void *mem) 1170 { 1171 struct folio *folio; 1172 1173 if (!mem) 1174 return; 1175 1176 folio = virt_to_folio(mem); 1177 kho_unpreserve_folio(folio); 1178 folio_put(folio); 1179 } 1180 EXPORT_SYMBOL_GPL(kho_unpreserve_free); 1181 1182 /** 1183 * kho_restore_free - Restore and free memory after kexec. 1184 * @mem: Pointer to the memory (in the new kernel's address space) 1185 * that was allocated by the old kernel. 1186 * 1187 * This function is intended to be called in the new kernel (post-kexec) 1188 * to take ownership of and free a memory region that was preserved by the 1189 * old kernel using kho_alloc_preserve(). 1190 * 1191 * It first restores the pages from KHO (using their physical address) 1192 * and then frees the pages back to the new kernel's page allocator. 1193 */ 1194 void kho_restore_free(void *mem) 1195 { 1196 struct folio *folio; 1197 1198 if (!mem) 1199 return; 1200 1201 folio = kho_restore_folio(__pa(mem)); 1202 if (!WARN_ON(!folio)) 1203 folio_put(folio); 1204 } 1205 EXPORT_SYMBOL_GPL(kho_restore_free); 1206 1207 int kho_finalize(void) 1208 { 1209 int ret; 1210 1211 if (!kho_enable) 1212 return -EOPNOTSUPP; 1213 1214 guard(mutex)(&kho_out.lock); 1215 ret = kho_mem_serialize(&kho_out); 1216 if (ret) 1217 return ret; 1218 1219 kho_out.finalized = true; 1220 1221 return 0; 1222 } 1223 1224 bool kho_finalized(void) 1225 { 1226 guard(mutex)(&kho_out.lock); 1227 return kho_out.finalized; 1228 } 1229 1230 struct kho_in { 1231 phys_addr_t fdt_phys; 1232 phys_addr_t scratch_phys; 1233 phys_addr_t mem_map_phys; 1234 struct kho_debugfs dbg; 1235 }; 1236 1237 static struct kho_in kho_in = { 1238 }; 1239 1240 static const void *kho_get_fdt(void) 1241 { 1242 return kho_in.fdt_phys ? 
phys_to_virt(kho_in.fdt_phys) : NULL; 1243 } 1244 1245 /** 1246 * is_kho_boot - check if current kernel was booted via KHO-enabled 1247 * kexec 1248 * 1249 * This function checks if the current kernel was loaded through a kexec 1250 * operation with KHO enabled, by verifying that a valid KHO FDT 1251 * was passed. 1252 * 1253 * Note: This function returns reliable results only after 1254 * kho_populate() has been called during early boot. Before that, 1255 * it may return false even if KHO data is present. 1256 * 1257 * Return: true if booted via KHO-enabled kexec, false otherwise 1258 */ 1259 bool is_kho_boot(void) 1260 { 1261 return !!kho_get_fdt(); 1262 } 1263 EXPORT_SYMBOL_GPL(is_kho_boot); 1264 1265 /** 1266 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. 1267 * @name: the name of the sub FDT passed to kho_add_subtree(). 1268 * @phys: if found, the physical address of the sub FDT is stored in @phys. 1269 * 1270 * Retrieve a preserved sub FDT named @name and store its physical 1271 * address in @phys. 
1272 * 1273 * Return: 0 on success, error code on failure 1274 */ 1275 int kho_retrieve_subtree(const char *name, phys_addr_t *phys) 1276 { 1277 const void *fdt = kho_get_fdt(); 1278 const u64 *val; 1279 int offset, len; 1280 1281 if (!fdt) 1282 return -ENOENT; 1283 1284 if (!phys) 1285 return -EINVAL; 1286 1287 offset = fdt_subnode_offset(fdt, 0, name); 1288 if (offset < 0) 1289 return -ENOENT; 1290 1291 val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len); 1292 if (!val || len != sizeof(*val)) 1293 return -EINVAL; 1294 1295 *phys = (phys_addr_t)*val; 1296 1297 return 0; 1298 } 1299 EXPORT_SYMBOL_GPL(kho_retrieve_subtree); 1300 1301 static __init int kho_out_fdt_setup(void) 1302 { 1303 void *root = kho_out.fdt; 1304 u64 empty_mem_map = 0; 1305 int err; 1306 1307 err = fdt_create(root, PAGE_SIZE); 1308 err |= fdt_finish_reservemap(root); 1309 err |= fdt_begin_node(root, ""); 1310 err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); 1311 err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map, 1312 sizeof(empty_mem_map)); 1313 err |= fdt_end_node(root); 1314 err |= fdt_finish(root); 1315 1316 return err; 1317 } 1318 1319 static __init int kho_init(void) 1320 { 1321 const void *fdt = kho_get_fdt(); 1322 int err = 0; 1323 1324 if (!kho_enable) 1325 return 0; 1326 1327 kho_out.fdt = kho_alloc_preserve(PAGE_SIZE); 1328 if (IS_ERR(kho_out.fdt)) { 1329 err = PTR_ERR(kho_out.fdt); 1330 goto err_free_scratch; 1331 } 1332 1333 err = kho_debugfs_init(); 1334 if (err) 1335 goto err_free_fdt; 1336 1337 err = kho_out_debugfs_init(&kho_out.dbg); 1338 if (err) 1339 goto err_free_fdt; 1340 1341 err = kho_out_fdt_setup(); 1342 if (err) 1343 goto err_free_fdt; 1344 1345 if (fdt) { 1346 kho_in_debugfs_init(&kho_in.dbg, fdt); 1347 return 0; 1348 } 1349 1350 for (int i = 0; i < kho_scratch_cnt; i++) { 1351 unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr); 1352 unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; 1353 unsigned long pfn; 
1354 1355 /* 1356 * When debug_pagealloc is enabled, __free_pages() clears the 1357 * corresponding PRESENT bit in the kernel page table. 1358 * Subsequent kmemleak scans of these pages cause the 1359 * non-PRESENT page faults. 1360 * Mark scratch areas with kmemleak_ignore_phys() to exclude 1361 * them from kmemleak scanning. 1362 */ 1363 kmemleak_ignore_phys(kho_scratch[i].addr); 1364 for (pfn = base_pfn; pfn < base_pfn + count; 1365 pfn += pageblock_nr_pages) 1366 init_cma_reserved_pageblock(pfn_to_page(pfn)); 1367 } 1368 1369 WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", 1370 kho_out.fdt, true)); 1371 1372 return 0; 1373 1374 err_free_fdt: 1375 kho_unpreserve_free(kho_out.fdt); 1376 err_free_scratch: 1377 kho_out.fdt = NULL; 1378 for (int i = 0; i < kho_scratch_cnt; i++) { 1379 void *start = __va(kho_scratch[i].addr); 1380 void *end = start + kho_scratch[i].size; 1381 1382 free_reserved_area(start, end, -1, ""); 1383 } 1384 kho_enable = false; 1385 return err; 1386 } 1387 fs_initcall(kho_init); 1388 1389 static void __init kho_release_scratch(void) 1390 { 1391 phys_addr_t start, end; 1392 u64 i; 1393 1394 memmap_init_kho_scratch_pages(); 1395 1396 /* 1397 * Mark scratch mem as CMA before we return it. That way we 1398 * ensure that no kernel allocations happen on it. That means 1399 * we can reuse it as scratch memory again later. 
1400 */ 1401 __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, 1402 MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) { 1403 ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start)); 1404 ulong end_pfn = pageblock_align(PFN_UP(end)); 1405 ulong pfn; 1406 1407 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) 1408 init_pageblock_migratetype(pfn_to_page(pfn), 1409 MIGRATE_CMA, false); 1410 } 1411 } 1412 1413 void __init kho_memory_init(void) 1414 { 1415 if (kho_in.mem_map_phys) { 1416 kho_scratch = phys_to_virt(kho_in.scratch_phys); 1417 kho_release_scratch(); 1418 kho_mem_deserialize(phys_to_virt(kho_in.mem_map_phys)); 1419 } else { 1420 kho_reserve_scratch(); 1421 } 1422 } 1423 1424 void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, 1425 phys_addr_t scratch_phys, u64 scratch_len) 1426 { 1427 struct kho_scratch *scratch = NULL; 1428 phys_addr_t mem_map_phys; 1429 void *fdt = NULL; 1430 int err = 0; 1431 unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); 1432 1433 /* Validate the input FDT */ 1434 fdt = early_memremap(fdt_phys, fdt_len); 1435 if (!fdt) { 1436 pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys); 1437 err = -EFAULT; 1438 goto out; 1439 } 1440 err = fdt_check_header(fdt); 1441 if (err) { 1442 pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n", 1443 fdt_phys, err); 1444 err = -EINVAL; 1445 goto out; 1446 } 1447 err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE); 1448 if (err) { 1449 pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n", 1450 fdt_phys, KHO_FDT_COMPATIBLE, err); 1451 err = -EINVAL; 1452 goto out; 1453 } 1454 1455 mem_map_phys = kho_get_mem_map_phys(fdt); 1456 if (!mem_map_phys) { 1457 err = -ENOENT; 1458 goto out; 1459 } 1460 1461 scratch = early_memremap(scratch_phys, scratch_len); 1462 if (!scratch) { 1463 pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n", 1464 scratch_phys, scratch_len); 1465 err = -EFAULT; 1466 goto out; 1467 } 1468 1469 
/* 1470 * We pass a safe contiguous blocks of memory to use for early boot 1471 * purporses from the previous kernel so that we can resize the 1472 * memblock array as needed. 1473 */ 1474 for (int i = 0; i < scratch_cnt; i++) { 1475 struct kho_scratch *area = &scratch[i]; 1476 u64 size = area->size; 1477 1478 memblock_add(area->addr, size); 1479 err = memblock_mark_kho_scratch(area->addr, size); 1480 if (WARN_ON(err)) { 1481 pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe", 1482 &area->addr, &size, ERR_PTR(err)); 1483 goto out; 1484 } 1485 pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); 1486 } 1487 1488 memblock_reserve(scratch_phys, scratch_len); 1489 1490 /* 1491 * Now that we have a viable region of scratch memory, let's tell 1492 * the memblocks allocator to only use that for any allocations. 1493 * That way we ensure that nothing scribbles over in use data while 1494 * we initialize the page tables which we will need to ingest all 1495 * memory reservations from the previous kernel. 
1496 */ 1497 memblock_set_kho_scratch_only(); 1498 1499 kho_in.fdt_phys = fdt_phys; 1500 kho_in.scratch_phys = scratch_phys; 1501 kho_in.mem_map_phys = mem_map_phys; 1502 kho_scratch_cnt = scratch_cnt; 1503 pr_info("found kexec handover data.\n"); 1504 1505 out: 1506 if (fdt) 1507 early_memunmap(fdt, fdt_len); 1508 if (scratch) 1509 early_memunmap(scratch, scratch_len); 1510 if (err) 1511 pr_warn("disabling KHO revival: %d\n", err); 1512 } 1513 1514 /* Helper functions for kexec_file_load */ 1515 1516 int kho_fill_kimage(struct kimage *image) 1517 { 1518 ssize_t scratch_size; 1519 int err = 0; 1520 struct kexec_buf scratch; 1521 1522 if (!kho_enable) 1523 return 0; 1524 1525 image->kho.fdt = virt_to_phys(kho_out.fdt); 1526 1527 scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; 1528 scratch = (struct kexec_buf){ 1529 .image = image, 1530 .buffer = kho_scratch, 1531 .bufsz = scratch_size, 1532 .mem = KEXEC_BUF_MEM_UNKNOWN, 1533 .memsz = scratch_size, 1534 .buf_align = SZ_64K, /* Makes it easier to map */ 1535 .buf_max = ULONG_MAX, 1536 .top_down = true, 1537 }; 1538 err = kexec_add_buffer(&scratch); 1539 if (err) 1540 return err; 1541 image->kho.scratch = &image->segment[image->nr_segments - 1]; 1542 1543 return 0; 1544 } 1545 1546 static int kho_walk_scratch(struct kexec_buf *kbuf, 1547 int (*func)(struct resource *, void *)) 1548 { 1549 int ret = 0; 1550 int i; 1551 1552 for (i = 0; i < kho_scratch_cnt; i++) { 1553 struct resource res = { 1554 .start = kho_scratch[i].addr, 1555 .end = kho_scratch[i].addr + kho_scratch[i].size - 1, 1556 }; 1557 1558 /* Try to fit the kimage into our KHO scratch region */ 1559 ret = func(&res, kbuf); 1560 if (ret) 1561 break; 1562 } 1563 1564 return ret; 1565 } 1566 1567 int kho_locate_mem_hole(struct kexec_buf *kbuf, 1568 int (*func)(struct resource *, void *)) 1569 { 1570 int ret; 1571 1572 if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH) 1573 return 1; 1574 1575 ret = kho_walk_scratch(kbuf, func); 1576 1577 return 
ret == 1 ? 0 : -EADDRNOTAVAIL; 1578 } 1579