1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * Copyright (c) 2025, Google LLC. 5 * Pasha Tatashin <pasha.tatashin@soleen.com> 6 * 7 * Copyright (C) 2025 Amazon.com Inc. or its affiliates. 8 * Pratyush Yadav <ptyadav@amazon.de> 9 */ 10 11 /** 12 * DOC: Memfd Preservation via LUO 13 * 14 * Overview 15 * ======== 16 * 17 * Memory file descriptors (memfd) can be preserved over a kexec using the Live 18 * Update Orchestrator (LUO) file preservation. This allows userspace to 19 * transfer its memory contents to the next kernel after a kexec. 20 * 21 * The preservation is not intended to be transparent. Only select properties of 22 * the file are preserved. All others are reset to default. The preserved 23 * properties are described below. 24 * 25 * .. note:: 26 * The LUO API is not stabilized yet, so the preserved properties of a memfd 27 * are also not stable and are subject to backwards incompatible changes. 28 * 29 * .. note:: 30 * Currently a memfd backed by Hugetlb is not supported. Memfds created 31 * with ``MFD_HUGETLB`` will be rejected. 32 * 33 * Preserved Properties 34 * ==================== 35 * 36 * The following properties of the memfd are preserved across kexec: 37 * 38 * File Contents 39 * All data stored in the file is preserved. 40 * 41 * File Size 42 * The size of the file is preserved. Holes in the file are filled by 43 * allocating pages for them during preservation. 44 * 45 * File Position 46 * The current file position is preserved, allowing applications to continue 47 * reading/writing from their last position. 48 * 49 * File Status Flags 50 * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property 51 * is maintained. 52 * 53 * Seals 54 * File seals set on the memfd are preserved and re-applied on restore. 55 * Only seals known to this LUO version (see ``MEMFD_LUO_ALL_SEALS``) may 56 * be present; preservation fails with ``-EOPNOTSUPP`` otherwise. 57 * 58 * Non-Preserved Properties 59 * ======================== 60 * 61 * All properties which are not preserved must be assumed to be reset to 62 * default. This section describes some of those properties which may be more of 63 * note. 64 * 65 * ``FD_CLOEXEC`` flag 66 * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the 67 * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set 68 * again after restore via ``fcntl()``. 69 */ 70 71 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 72 73 #include <linux/bits.h> 74 #include <linux/err.h> 75 #include <linux/file.h> 76 #include <linux/io.h> 77 #include <linux/kexec_handover.h> 78 #include <linux/kho/abi/memfd.h> 79 #include <linux/liveupdate.h> 80 #include <linux/shmem_fs.h> 81 #include <linux/vmalloc.h> 82 #include <linux/memfd.h> 83 #include <uapi/linux/memfd.h> 84 85 #include "internal.h" 86 87 static int memfd_luo_preserve_folios(struct file *file, 88 struct kho_vmalloc *kho_vmalloc, 89 struct memfd_luo_folio_ser **out_folios_ser, 90 u64 *nr_foliosp) 91 { 92 struct inode *inode = file_inode(file); 93 struct memfd_luo_folio_ser *folios_ser; 94 unsigned int max_folios; 95 long i, size, nr_pinned; 96 struct folio **folios; 97 int err = -EINVAL; 98 pgoff_t offset; 99 u64 nr_folios; 100 101 size = i_size_read(inode); 102 /* 103 * If the file has zero size, then the folios and nr_folios properties 104 * are not set. 105 */ 106 if (!size) { 107 *nr_foliosp = 0; 108 *out_folios_ser = NULL; 109 return 0; 110 } 111 112 /* 113 * Guess the number of folios based on inode size. Real number might end 114 * up being smaller if there are higher order folios. 115 */ 116 max_folios = PAGE_ALIGN(size) / PAGE_SIZE; 117 folios = kvmalloc_objs(*folios, max_folios); 118 if (!folios) 119 return -ENOMEM; 120 121 /* 122 * Pin the folios so they don't move around behind our back. This also 123 * ensures none of the folios are in CMA -- which ensures they don't 124 * fall in KHO scratch memory. It also moves swapped out folios back to 125 * memory. 126 * 127 * A side effect of doing this is that it allocates a folio for all 128 * indices in the file. This might waste memory on sparse memfds. If 129 * that is really a problem in the future, we can have a 130 * memfd_pin_folios() variant that does not allocate a page on empty 131 * slots. 132 */ 133 nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios, 134 &offset); 135 if (nr_pinned < 0) { 136 err = nr_pinned; 137 pr_err("failed to pin folios: %d\n", err); 138 goto err_free_folios; 139 } 140 nr_folios = nr_pinned; 141 142 folios_ser = vcalloc(nr_folios, sizeof(*folios_ser)); 143 if (!folios_ser) { 144 err = -ENOMEM; 145 goto err_unpin; 146 } 147 148 for (i = 0; i < nr_folios; i++) { 149 struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 150 struct folio *folio = folios[i]; 151 152 err = kho_preserve_folio(folio); 153 if (err) 154 goto err_unpreserve; 155 156 folio_lock(folio); 157 158 /* 159 * A dirty folio is one which has been written to. A clean folio 160 * is its opposite. Since a clean folio does not carry user 161 * data, it can be freed by page reclaim under memory pressure. 162 * 163 * Saving the dirty flag at prepare() time doesn't work since it 164 * can change later. Saving it at freeze() also won't work 165 * because the dirty bit is normally synced at unmap and there 166 * might still be a mapping of the file at freeze(). 167 * 168 * To see why this is a problem, say a folio is clean at 169 * preserve, but gets dirtied later. The pfolio flags will mark 170 * it as clean. After retrieve, the next kernel might try to 171 * reclaim this folio under memory pressure, losing user data. 172 * 173 * Unconditionally mark it dirty to avoid this problem. This 174 * comes at the cost of making clean folios un-reclaimable after 175 * live update. 176 */ 177 folio_mark_dirty(folio); 178 179 /* 180 * If the folio is not uptodate, it was fallocated but never 181 * used. Saving this flag at prepare() doesn't work since it 182 * might change later when someone uses the folio. 183 * 184 * Since we have taken the performance penalty of allocating, 185 * zeroing, and pinning all the folios in the holes, take a bit 186 * more and zero all non-uptodate folios too. 187 * 188 * NOTE: For someone looking to improve preserve performance, 189 * this is a good place to look. 190 */ 191 if (!folio_test_uptodate(folio)) { 192 folio_zero_range(folio, 0, folio_size(folio)); 193 flush_dcache_folio(folio); 194 folio_mark_uptodate(folio); 195 } 196 197 folio_unlock(folio); 198 199 pfolio->pfn = folio_pfn(folio); 200 pfolio->flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE; 201 pfolio->index = folio->index; 202 } 203 204 err = kho_preserve_vmalloc(folios_ser, kho_vmalloc); 205 if (err) 206 goto err_unpreserve; 207 208 kvfree(folios); 209 *nr_foliosp = nr_folios; 210 *out_folios_ser = folios_ser; 211 212 /* 213 * Note: folios_ser is purposely not freed here. It is preserved 214 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer 215 * that is passed via private_data. 216 */ 217 return 0; 218 219 err_unpreserve: 220 for (i = i - 1; i >= 0; i--) 221 kho_unpreserve_folio(folios[i]); 222 vfree(folios_ser); 223 err_unpin: 224 unpin_folios(folios, nr_folios); 225 err_free_folios: 226 kvfree(folios); 227 228 return err; 229 } 230 231 static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc, 232 struct memfd_luo_folio_ser *folios_ser, 233 u64 nr_folios) 234 { 235 long i; 236 237 if (!nr_folios) 238 return; 239 240 kho_unpreserve_vmalloc(kho_vmalloc); 241 242 for (i = 0; i < nr_folios; i++) { 243 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 244 struct folio *folio; 245 246 if (!pfolio->pfn) 247 continue; 248 249 folio = pfn_folio(pfolio->pfn); 250 251 kho_unpreserve_folio(folio); 252 unpin_folio(folio); 253 } 254 255 vfree(folios_ser); 256 } 257 258 static int memfd_luo_preserve(struct liveupdate_file_op_args *args) 259 { 260 struct inode *inode = file_inode(args->file); 261 struct memfd_luo_folio_ser *folios_ser; 262 struct memfd_luo_ser *ser; 263 u64 nr_folios, inode_size; 264 int err = 0, seals; 265 266 inode_lock(inode); 267 shmem_freeze(inode, true); 268 269 /* Allocate the main serialization structure in preserved memory */ 270 ser = kho_alloc_preserve(sizeof(*ser)); 271 if (IS_ERR(ser)) { 272 err = PTR_ERR(ser); 273 goto err_unlock; 274 } 275 276 seals = memfd_get_seals(args->file); 277 if (seals < 0) { 278 err = seals; 279 goto err_free_ser; 280 } 281 282 /* Make sure the file only has the seals supported by this version. */ 283 if (seals & ~MEMFD_LUO_ALL_SEALS) { 284 err = -EOPNOTSUPP; 285 goto err_free_ser; 286 } 287 288 ser->pos = args->file->f_pos; 289 inode_size = i_size_read(inode); 290 291 /* 292 * memfd_pin_folios() caps at UINT_MAX folios; refuse larger 293 * files to avoid silently preserving only a prefix. 294 */ 295 if (DIV_ROUND_UP_ULL(inode_size, PAGE_SIZE) > UINT_MAX) { 296 err = -EFBIG; 297 goto err_free_ser; 298 } 299 300 ser->size = inode_size; 301 ser->seals = seals; 302 303 err = memfd_luo_preserve_folios(args->file, &ser->folios, 304 &folios_ser, &nr_folios); 305 if (err) 306 goto err_free_ser; 307 308 ser->nr_folios = nr_folios; 309 inode_unlock(inode); 310 311 args->private_data = folios_ser; 312 args->serialized_data = virt_to_phys(ser); 313 314 return 0; 315 316 err_free_ser: 317 kho_unpreserve_free(ser); 318 err_unlock: 319 shmem_freeze(inode, false); 320 inode_unlock(inode); 321 return err; 322 } 323 324 static int memfd_luo_freeze(struct liveupdate_file_op_args *args) 325 { 326 struct memfd_luo_ser *ser; 327 328 if (WARN_ON_ONCE(!args->serialized_data)) 329 return -EINVAL; 330 331 ser = phys_to_virt(args->serialized_data); 332 333 /* 334 * The pos might have changed since prepare. Everything else stays the 335 * same. 336 */ 337 ser->pos = args->file->f_pos; 338 339 return 0; 340 } 341 342 static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args) 343 { 344 struct inode *inode = file_inode(args->file); 345 struct memfd_luo_ser *ser; 346 347 if (WARN_ON_ONCE(!args->serialized_data)) 348 return; 349 350 inode_lock(inode); 351 shmem_freeze(inode, false); 352 353 ser = phys_to_virt(args->serialized_data); 354 355 memfd_luo_unpreserve_folios(&ser->folios, args->private_data, 356 ser->nr_folios); 357 358 kho_unpreserve_free(ser); 359 inode_unlock(inode); 360 } 361 362 static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser, 363 u64 nr_folios) 364 { 365 u64 i; 366 367 for (i = 0; i < nr_folios; i++) { 368 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 369 struct folio *folio; 370 phys_addr_t phys; 371 372 if (!pfolio->pfn) 373 continue; 374 375 phys = PFN_PHYS(pfolio->pfn); 376 folio = kho_restore_folio(phys); 377 if (!folio) { 378 pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n", 379 phys); 380 continue; 381 } 382 383 folio_put(folio); 384 } 385 } 386 387 static void memfd_luo_finish(struct liveupdate_file_op_args *args) 388 { 389 struct memfd_luo_folio_ser *folios_ser; 390 struct memfd_luo_ser *ser; 391 392 /* 393 * If retrieve was successful, nothing to do. If it failed, retrieve() 394 * already cleaned up everything it could. So nothing to do there 395 * either. Only need to clean up when retrieve was not called. 396 */ 397 if (args->retrieve_status) 398 return; 399 400 ser = phys_to_virt(args->serialized_data); 401 if (!ser) 402 return; 403 404 if (ser->nr_folios) { 405 folios_ser = kho_restore_vmalloc(&ser->folios); 406 if (!folios_ser) 407 goto out; 408 409 memfd_luo_discard_folios(folios_ser, ser->nr_folios); 410 vfree(folios_ser); 411 } 412 413 out: 414 kho_restore_free(ser); 415 } 416 417 static int memfd_luo_retrieve_folios(struct file *file, 418 struct memfd_luo_folio_ser *folios_ser, 419 u64 nr_folios) 420 { 421 struct inode *inode = file_inode(file); 422 struct address_space *mapping = inode->i_mapping; 423 struct folio *folio; 424 long npages, nr_added_pages = 0; 425 int err = -EIO; 426 long i; 427 428 for (i = 0; i < nr_folios; i++) { 429 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 430 phys_addr_t phys; 431 u64 index; 432 int flags; 433 434 if (!pfolio->pfn) 435 continue; 436 437 phys = PFN_PHYS(pfolio->pfn); 438 folio = kho_restore_folio(phys); 439 if (!folio) { 440 pr_err("Unable to restore folio at physical address: %llx\n", 441 phys); 442 err = -EIO; 443 goto put_folios; 444 } 445 index = pfolio->index; 446 flags = pfolio->flags; 447 448 /* Set up the folio for insertion. */ 449 __folio_set_locked(folio); 450 __folio_set_swapbacked(folio); 451 452 err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping)); 453 if (err) { 454 pr_err("shmem: failed to charge folio index %ld: %d\n", 455 i, err); 456 goto unlock_folio; 457 } 458 459 err = shmem_add_to_page_cache(folio, mapping, index, NULL, 460 mapping_gfp_mask(mapping)); 461 if (err) { 462 pr_err("shmem: failed to add to page cache folio index %ld: %d\n", 463 i, err); 464 goto unlock_folio; 465 } 466 467 if (flags & MEMFD_LUO_FOLIO_UPTODATE) 468 folio_mark_uptodate(folio); 469 if (flags & MEMFD_LUO_FOLIO_DIRTY) 470 folio_mark_dirty(folio); 471 472 npages = folio_nr_pages(folio); 473 err = shmem_inode_acct_blocks(inode, npages); 474 if (err) { 475 pr_err("shmem: failed to account folio index %ld(%ld pages): %d\n", 476 i, npages, err); 477 goto remove_from_cache; 478 } 479 480 nr_added_pages += npages; 481 folio_add_lru(folio); 482 folio_unlock(folio); 483 folio_put(folio); 484 } 485 486 shmem_recalc_inode(inode, nr_added_pages, 0); 487 488 return 0; 489 490 remove_from_cache: 491 filemap_remove_folio(folio); 492 unlock_folio: 493 folio_unlock(folio); 494 folio_put(folio); 495 put_folios: 496 /* 497 * Note: don't free the folios already added to the file. They will be 498 * freed when the file is freed. Free the ones not added yet here. 499 */ 500 for (long j = i + 1; j < nr_folios; j++) { 501 const struct memfd_luo_folio_ser *pfolio = &folios_ser[j]; 502 phys_addr_t phys; 503 504 if (!pfolio->pfn) 505 continue; 506 507 phys = PFN_PHYS(pfolio->pfn); 508 folio = kho_restore_folio(phys); 509 if (folio) 510 folio_put(folio); 511 } 512 513 shmem_recalc_inode(inode, nr_added_pages, 0); 514 515 return err; 516 } 517 518 static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) 519 { 520 struct memfd_luo_folio_ser *folios_ser; 521 struct memfd_luo_ser *ser; 522 struct file *file; 523 int err; 524 525 ser = phys_to_virt(args->serialized_data); 526 if (!ser) 527 return -EINVAL; 528 529 /* Make sure the file only has seals supported by this version. */ 530 if (ser->seals & ~MEMFD_LUO_ALL_SEALS) { 531 err = -EOPNOTSUPP; 532 goto free_ser; 533 } 534 535 /* 536 * The seals are preserved. Allow sealing here so they can be added 537 * later. 538 */ 539 file = memfd_alloc_file("", MFD_ALLOW_SEALING); 540 if (IS_ERR(file)) { 541 pr_err("failed to setup file: %pe\n", file); 542 err = PTR_ERR(file); 543 goto free_ser; 544 } 545 546 err = memfd_add_seals(file, ser->seals); 547 if (err) { 548 pr_err("failed to add seals: %pe\n", ERR_PTR(err)); 549 goto put_file; 550 } 551 552 vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); 553 i_size_write(file_inode(file), ser->size); 554 555 if (ser->nr_folios) { 556 folios_ser = kho_restore_vmalloc(&ser->folios); 557 if (!folios_ser) { 558 err = -EINVAL; 559 goto put_file; 560 } 561 562 err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios); 563 vfree(folios_ser); 564 if (err) 565 goto put_file; 566 } 567 568 args->file = file; 569 kho_restore_free(ser); 570 571 return 0; 572 573 put_file: 574 fput(file); 575 free_ser: 576 kho_restore_free(ser); 577 return err; 578 } 579 580 static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler, 581 struct file *file) 582 { 583 struct inode *inode = file_inode(file); 584 585 return shmem_file(file) && !inode->i_nlink; 586 } 587 588 static unsigned long memfd_luo_get_id(struct file *file) 589 { 590 return (unsigned long)file_inode(file); 591 } 592 593 static const struct liveupdate_file_ops memfd_luo_file_ops = { 594 .freeze = memfd_luo_freeze, 595 .finish = memfd_luo_finish, 596 .retrieve = memfd_luo_retrieve, 597 .preserve = memfd_luo_preserve, 598 .unpreserve = memfd_luo_unpreserve, 599 .can_preserve = memfd_luo_can_preserve, 600 .get_id = memfd_luo_get_id, 601 .owner = THIS_MODULE, 602 }; 603 604 static struct liveupdate_file_handler memfd_luo_handler = { 605 .ops = &memfd_luo_file_ops, 606 .compatible = MEMFD_LUO_FH_COMPATIBLE, 607 }; 608 609 static int __init memfd_luo_init(void) 610 { 611 int err = liveupdate_register_file_handler(&memfd_luo_handler); 612 613 if (err && err != -EOPNOTSUPP) { 614 pr_err("Could not register luo filesystem handler: %pe\n", 615 ERR_PTR(err)); 616 617 return err; 618 } 619 620 return 0; 621 } 622 late_initcall(memfd_luo_init); 623