// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright (c) 2025, Google LLC.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 *
 * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
 * Pratyush Yadav <ptyadav@amazon.de>
 */

/**
 * DOC: Memfd Preservation via LUO
 *
 * Overview
 * ========
 *
 * Memory file descriptors (memfd) can be preserved over a kexec using the Live
 * Update Orchestrator (LUO) file preservation. This allows userspace to
 * transfer its memory contents to the next kernel after a kexec.
 *
 * The preservation is not intended to be transparent. Only select properties of
 * the file are preserved. All others are reset to default. The preserved
 * properties are described below.
 *
 * .. note::
 *   The LUO API is not stabilized yet, so the preserved properties of a memfd
 *   are also not stable and are subject to backwards incompatible changes.
 *
 * .. note::
 *   Currently a memfd backed by Hugetlb is not supported. Memfds created
 *   with ``MFD_HUGETLB`` will be rejected.
 *
 * Preserved Properties
 * ====================
 *
 * The following properties of the memfd are preserved across kexec:
 *
 * File Contents
 *   All data stored in the file is preserved.
 *
 * File Size
 *   The size of the file is preserved. Holes in the file are filled by
 *   allocating pages for them during preservation.
 *
 * File Position
 *   The current file position is preserved, allowing applications to continue
 *   reading/writing from their last position.
 *
 * File Status Flags
 *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
 *   is maintained.
 *
 * Non-Preserved Properties
 * ========================
 *
 * All properties which are not preserved must be assumed to be reset to
 * default. This section describes some of those properties which may be more of
 * note.
 *
 * ``FD_CLOEXEC`` flag
 *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
 *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
 *   again after restore via ``fcntl()``.
 *
 * Seals
 *   File seals are not preserved. The file is unsealed on restore and if
 *   needed, must be sealed again via ``fcntl()``.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bits.h>
#include <linux/err.h>
#include <linux/file.h>
#include <linux/io.h>
#include <linux/kexec_handover.h>
#include <linux/kho/abi/memfd.h>
#include <linux/liveupdate.h>
#include <linux/shmem_fs.h>
#include <linux/vmalloc.h>
#include <linux/memfd.h>
#include <uapi/linux/memfd.h>

#include "internal.h"

/*
 * Pin every folio of @file, preserve each one via KHO, and build the
 * per-folio serialization array, itself preserved through @kho_vmalloc.
 *
 * On success, *out_folios_ser points to the vmalloc'd array (intentionally
 * not freed here -- it is KHO-preserved memory; the caller stashes the vmap
 * pointer in private_data for the unpreserve path) and *nr_foliosp holds the
 * folio count. A zero-size file succeeds with *nr_foliosp = 0 and a NULL
 * array.
 *
 * Returns 0 on success or a negative errno. On failure, all folios that were
 * preserved are unpreserved again and everything is unpinned and freed.
 */
static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *folios_ser;
	unsigned int max_folios;
	long i, size, nr_pinned;
	struct folio **folios;
	int err = -EINVAL;
	pgoff_t offset;
	u64 nr_folios;

	size = i_size_read(inode);
	/*
	 * If the file has zero size, then the folios and nr_folios properties
	 * are not set.
	 */
	if (!size) {
		*nr_foliosp = 0;
		*out_folios_ser = NULL;
		return 0;
	}

	/*
	 * Guess the number of folios based on inode size. Real number might end
	 * up being smaller if there are higher order folios.
	 */
	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
	folios = kvmalloc_objs(*folios, max_folios);
	if (!folios)
		return -ENOMEM;

	/*
	 * Pin the folios so they don't move around behind our back. This also
	 * ensures none of the folios are in CMA -- which ensures they don't
	 * fall in KHO scratch memory. It also moves swapped out folios back to
	 * memory.
	 *
	 * A side effect of doing this is that it allocates a folio for all
	 * indices in the file. This might waste memory on sparse memfds. If
	 * that is really a problem in the future, we can have a
	 * memfd_pin_folios() variant that does not allocate a page on empty
	 * slots.
	 */
	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
				     &offset);
	if (nr_pinned < 0) {
		err = nr_pinned;
		pr_err("failed to pin folios: %d\n", err);
		goto err_free_folios;
	}
	nr_folios = nr_pinned;

	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
	if (!folios_ser) {
		err = -ENOMEM;
		goto err_unpin;
	}

	for (i = 0; i < nr_folios; i++) {
		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio = folios[i];

		err = kho_preserve_folio(folio);
		if (err)
			goto err_unpreserve;

		folio_lock(folio);

		/*
		 * A dirty folio is one which has been written to. A clean folio
		 * is its opposite. Since a clean folio does not carry user
		 * data, it can be freed by page reclaim under memory pressure.
		 *
		 * Saving the dirty flag at prepare() time doesn't work since it
		 * can change later. Saving it at freeze() also won't work
		 * because the dirty bit is normally synced at unmap and there
		 * might still be a mapping of the file at freeze().
		 *
		 * To see why this is a problem, say a folio is clean at
		 * preserve, but gets dirtied later. The pfolio flags will mark
		 * it as clean. After retrieve, the next kernel might try to
		 * reclaim this folio under memory pressure, losing user data.
		 *
		 * Unconditionally mark it dirty to avoid this problem. This
		 * comes at the cost of making clean folios un-reclaimable after
		 * live update.
		 */
		folio_mark_dirty(folio);

		/*
		 * If the folio is not uptodate, it was fallocated but never
		 * used. Saving this flag at prepare() doesn't work since it
		 * might change later when someone uses the folio.
		 *
		 * Since we have taken the performance penalty of allocating,
		 * zeroing, and pinning all the folios in the holes, take a bit
		 * more and zero all non-uptodate folios too.
		 *
		 * NOTE: For someone looking to improve preserve performance,
		 * this is a good place to look.
		 */
		if (!folio_test_uptodate(folio)) {
			folio_zero_range(folio, 0, folio_size(folio));
			flush_dcache_folio(folio);
			folio_mark_uptodate(folio);
		}

		folio_unlock(folio);

		pfolio->pfn = folio_pfn(folio);
		/* Always DIRTY | UPTODATE -- see the two comments above. */
		pfolio->flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE;
		pfolio->index = folio->index;
	}

	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
	if (err)
		goto err_unpreserve;

	kvfree(folios);
	*nr_foliosp = nr_folios;
	*out_folios_ser = folios_ser;

	/*
	 * Note: folios_ser is purposely not freed here. It is preserved
	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
	 * that is passed via private_data.
	 */
	return 0;

err_unpreserve:
	/* Folio at index i was not (or failed to be) preserved; rewind past it. */
	for (i = i - 1; i >= 0; i--)
		kho_unpreserve_folio(folios[i]);
	vfree(folios_ser);
err_unpin:
	unpin_folios(folios, nr_folios);
err_free_folios:
	kvfree(folios);

	return err;
}

/*
 * Undo memfd_luo_preserve_folios(): drop the KHO preservation of the
 * serialization array and of each folio, unpin the folios, and free the
 * array. Entries with a zero PFN are skipped. No-op for a zero-size file
 * (nr_folios == 0).
 */
static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
					struct memfd_luo_folio_ser *folios_ser,
					u64 nr_folios)
{
	long i;

	if (!nr_folios)
		return;

	kho_unpreserve_vmalloc(kho_vmalloc);

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio;

		if (!pfolio->pfn)
			continue;

		folio = pfn_folio(pfolio->pfn);

		kho_unpreserve_folio(folio);
		unpin_folio(folio);
	}

	vfree(folios_ser);
}

/*
 * LUO .preserve callback: serialize args->file into KHO-preserved memory.
 *
 * Under the inode lock, freezes the shmem inode (undone by
 * memfd_luo_unpreserve() if the live update is aborted -- on success the
 * inode is left frozen on purpose), records position, size and seals in the
 * main serialization struct, and preserves the file contents. The physical
 * address of the struct is handed back via args->serialized_data, and the
 * vmap pointer of the folio array via args->private_data.
 *
 * Files carrying seals outside MEMFD_LUO_ALL_SEALS are rejected with
 * -EOPNOTSUPP so an older serialization format never carries state this
 * version cannot express.
 */
static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
{
	struct inode *inode = file_inode(args->file);
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;
	u64 nr_folios;
	int err = 0, seals;

	inode_lock(inode);
	shmem_freeze(inode, true);

	/* Allocate the main serialization structure in preserved memory */
	ser = kho_alloc_preserve(sizeof(*ser));
	if (IS_ERR(ser)) {
		err = PTR_ERR(ser);
		goto err_unlock;
	}

	seals = memfd_get_seals(args->file);
	if (seals < 0) {
		err = seals;
		goto err_free_ser;
	}

	/* Make sure the file only has the seals supported by this version. */
	if (seals & ~MEMFD_LUO_ALL_SEALS) {
		err = -EOPNOTSUPP;
		goto err_free_ser;
	}

	ser->pos = args->file->f_pos;
	ser->size = i_size_read(inode);
	ser->seals = seals;

	err = memfd_luo_preserve_folios(args->file, &ser->folios,
					&folios_ser, &nr_folios);
	if (err)
		goto err_free_ser;

	ser->nr_folios = nr_folios;
	inode_unlock(inode);

	args->private_data = folios_ser;
	args->serialized_data = virt_to_phys(ser);

	return 0;

err_free_ser:
	kho_unpreserve_free(ser);
err_unlock:
	shmem_freeze(inode, false);
	inode_unlock(inode);
	return err;
}

/*
 * LUO .freeze callback: re-record the file position, which may have moved
 * since preserve(). Everything else in the serialized state stays as
 * captured at preserve() time.
 */
static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_ser *ser;

	if (WARN_ON_ONCE(!args->serialized_data))
		return -EINVAL;

	ser = phys_to_virt(args->serialized_data);

	/*
	 * The pos might have changed since prepare. Everything else stays the
	 * same.
	 */
	ser->pos = args->file->f_pos;

	return 0;
}

/*
 * LUO .unpreserve callback: abort path in the old kernel. Unfreezes the
 * shmem inode (re-enabling what preserve() froze), undoes the folio
 * preservation using the vmap pointer stashed in args->private_data, and
 * frees the main serialization structure.
 */
static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
{
	struct inode *inode = file_inode(args->file);
	struct memfd_luo_ser *ser;

	if (WARN_ON_ONCE(!args->serialized_data))
		return;

	inode_lock(inode);
	shmem_freeze(inode, false);

	ser = phys_to_virt(args->serialized_data);

	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
				    ser->nr_folios);

	kho_unpreserve_free(ser);
	inode_unlock(inode);
}

/*
 * Restore each preserved folio from KHO only to immediately drop the
 * reference, releasing the memory back to the allocator. Used by
 * memfd_luo_finish() when the new kernel never retrieved the memfd.
 * Zero-PFN entries and folios that fail to restore are skipped.
 */
static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	u64 i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio;
		phys_addr_t phys;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
					    phys);
			continue;
		}

		folio_put(folio);
	}
}

/*
 * LUO .finish callback, run in the new kernel. Only cleans up when
 * retrieve() was never called (see the comment below): discards all
 * preserved folios and frees the serialization structure so the preserved
 * memory is not leaked.
 */
static void memfd_luo_finish(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;

	/*
	 * If retrieve was successful, nothing to do. If it failed, retrieve()
	 * already cleaned up everything it could. So nothing to do there
	 * either. Only need to clean up when retrieve was not called.
	 */
	if (args->retrieve_status)
		return;

	/*
	 * NOTE(review): phys_to_virt() does not normally return NULL for a
	 * nonzero physical address; this guard effectively only catches a
	 * zero serialized_data -- confirm intent.
	 */
	ser = phys_to_virt(args->serialized_data);
	if (!ser)
		return;

	if (ser->nr_folios) {
		folios_ser = kho_restore_vmalloc(&ser->folios);
		if (!folios_ser)
			goto out;

		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
		vfree(folios_ser);
	}

out:
	kho_restore_free(ser);
}

/*
 * Insert the preserved folios back into @file's page cache at their
 * original indices, restoring their uptodate/dirty state, memcg charge, and
 * shmem block accounting.
 *
 * On failure, the folio that failed mid-insertion is unwound according to
 * how far it got (uncharged folios are just unlocked and put; folios
 * already in the page cache are removed first), the folios not yet
 * processed are restored and released, and the accounting done so far is
 * flushed. Folios that made it into the page cache are deliberately kept:
 * they are freed with the file. Returns 0 or a negative errno.
 */
static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	long npages, nr_added_pages = 0;
	int err = -EIO;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		u64 index;
		int flags;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_err("Unable to restore folio at physical address: %llx\n",
			       phys);
			goto put_folios;
		}
		index = pfolio->index;
		flags = pfolio->flags;

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		npages = folio_nr_pages(folio);
		err = shmem_inode_acct_blocks(inode, npages);
		if (err) {
			pr_err("shmem: failed to account folio index %ld(%ld pages): %d\n",
			       i, npages, err);
			goto remove_from_cache;
		}

		nr_added_pages += npages;
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	shmem_recalc_inode(inode, nr_added_pages, 0);

	return 0;

remove_from_cache:
	filemap_remove_folio(folio);
unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
		phys_addr_t phys;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (folio)
			folio_put(folio);
	}

	shmem_recalc_inode(inode, nr_added_pages, 0);

	return err;
}

/*
 * LUO .retrieve callback: reconstruct a memfd in the new kernel from the
 * serialized state. Creates a fresh memfd (with sealing allowed),
 * re-applies the preserved seals, file position and size, and repopulates
 * its page cache from the preserved folios. On success the new file is
 * returned via args->file and the serialization structure is freed; on
 * failure the file is dropped (taking any already-inserted folios with it).
 */
static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;
	struct file *file;
	int err;

	ser = phys_to_virt(args->serialized_data);
	if (!ser)
		return -EINVAL;

	/* Make sure the file only has seals supported by this version. */
	if (ser->seals & ~MEMFD_LUO_ALL_SEALS) {
		err = -EOPNOTSUPP;
		goto free_ser;
	}

	/*
	 * The seals are preserved. Allow sealing here so they can be added
	 * later.
	 */
	file = memfd_alloc_file("", MFD_ALLOW_SEALING);
	if (IS_ERR(file)) {
		pr_err("failed to setup file: %pe\n", file);
		err = PTR_ERR(file);
		goto free_ser;
	}

	err = memfd_add_seals(file, ser->seals);
	if (err) {
		pr_err("failed to add seals: %pe\n", ERR_PTR(err));
		goto put_file;
	}

	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
	i_size_write(file_inode(file), ser->size);

	if (ser->nr_folios) {
		folios_ser = kho_restore_vmalloc(&ser->folios);
		if (!folios_ser) {
			err = -EINVAL;
			goto put_file;
		}

		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
		vfree(folios_ser);
		if (err)
			goto put_file;
	}

	args->file = file;
	kho_restore_free(ser);

	return 0;

put_file:
	fput(file);
free_ser:
	kho_restore_free(ser);
	return err;
}

/*
 * LUO .can_preserve callback: only unlinked, shmem-backed files are
 * supported. This also rejects ``MFD_HUGETLB`` memfds, as noted in the DOC
 * section above.
 */
static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
				   struct file *file)
{
	struct inode *inode = file_inode(file);

	return shmem_file(file) && !inode->i_nlink;
}

/* Identify the file by its inode pointer, unique for the file's lifetime. */
static unsigned long memfd_luo_get_id(struct file *file)
{
	return (unsigned long)file_inode(file);
}

static const struct liveupdate_file_ops memfd_luo_file_ops = {
	.freeze = memfd_luo_freeze,
	.finish = memfd_luo_finish,
	.retrieve = memfd_luo_retrieve,
	.preserve = memfd_luo_preserve,
	.unpreserve = memfd_luo_unpreserve,
	.can_preserve = memfd_luo_can_preserve,
	.get_id = memfd_luo_get_id,
	.owner = THIS_MODULE,
};

static struct liveupdate_file_handler memfd_luo_handler = {
	.ops = &memfd_luo_file_ops,
	.compatible = MEMFD_LUO_FH_COMPATIBLE,
};

/*
 * Register the memfd handler with LUO at boot. -EOPNOTSUPP is tolerated
 * (presumably LUO is not available on this kernel; the handler simply is
 * not registered -- confirm against liveupdate_register_file_handler()).
 */
static int __init memfd_luo_init(void)
{
	int err = liveupdate_register_file_handler(&memfd_luo_handler);

	if (err && err != -EOPNOTSUPP) {
		pr_err("Could not register luo filesystem handler: %pe\n",
		       ERR_PTR(err));

		return err;
	}

	return 0;
}
late_initcall(memfd_luo_init);