1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * Copyright (c) 2025, Google LLC. 5 * Pasha Tatashin <pasha.tatashin@soleen.com> 6 * 7 * Copyright (C) 2025 Amazon.com Inc. or its affiliates. 8 * Pratyush Yadav <ptyadav@amazon.de> 9 */ 10 11 /** 12 * DOC: Memfd Preservation via LUO 13 * 14 * Overview 15 * ======== 16 * 17 * Memory file descriptors (memfd) can be preserved over a kexec using the Live 18 * Update Orchestrator (LUO) file preservation. This allows userspace to 19 * transfer its memory contents to the next kernel after a kexec. 20 * 21 * The preservation is not intended to be transparent. Only select properties of 22 * the file are preserved. All others are reset to default. The preserved 23 * properties are described below. 24 * 25 * .. note:: 26 * The LUO API is not stabilized yet, so the preserved properties of a memfd 27 * are also not stable and are subject to backwards incompatible changes. 28 * 29 * .. note:: 30 * Currently a memfd backed by Hugetlb is not supported. Memfds created 31 * with ``MFD_HUGETLB`` will be rejected. 32 * 33 * Preserved Properties 34 * ==================== 35 * 36 * The following properties of the memfd are preserved across kexec: 37 * 38 * File Contents 39 * All data stored in the file is preserved. 40 * 41 * File Size 42 * The size of the file is preserved. Holes in the file are filled by 43 * allocating pages for them during preservation. 44 * 45 * File Position 46 * The current file position is preserved, allowing applications to continue 47 * reading/writing from their last position. 48 * 49 * File Status Flags 50 * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property 51 * is maintained. 52 * 53 * Non-Preserved Properties 54 * ======================== 55 * 56 * All properties which are not preserved must be assumed to be reset to 57 * default. This section describes some of those properties which may be more of 58 * note. 59 * 60 * ``FD_CLOEXEC`` flag 61 * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the 62 * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set 63 * again after restore via ``fcntl()``. 64 * 65 * Seals 66 * File seals are not preserved. The file is unsealed on restore and if 67 * needed, must be sealed again via ``fcntl()``. 68 */ 69 70 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 71 72 #include <linux/bits.h> 73 #include <linux/err.h> 74 #include <linux/file.h> 75 #include <linux/io.h> 76 #include <linux/kexec_handover.h> 77 #include <linux/kho/abi/memfd.h> 78 #include <linux/liveupdate.h> 79 #include <linux/shmem_fs.h> 80 #include <linux/vmalloc.h> 81 #include <linux/memfd.h> 82 #include "internal.h" 83 84 static int memfd_luo_preserve_folios(struct file *file, 85 struct kho_vmalloc *kho_vmalloc, 86 struct memfd_luo_folio_ser **out_folios_ser, 87 u64 *nr_foliosp) 88 { 89 struct inode *inode = file_inode(file); 90 struct memfd_luo_folio_ser *folios_ser; 91 unsigned int max_folios; 92 long i, size, nr_pinned; 93 struct folio **folios; 94 int err = -EINVAL; 95 pgoff_t offset; 96 u64 nr_folios; 97 98 size = i_size_read(inode); 99 /* 100 * If the file has zero size, then the folios and nr_folios properties 101 * are not set. 102 */ 103 if (!size) { 104 *nr_foliosp = 0; 105 *out_folios_ser = NULL; 106 memset(kho_vmalloc, 0, sizeof(*kho_vmalloc)); 107 return 0; 108 } 109 110 /* 111 * Guess the number of folios based on inode size. Real number might end 112 * up being smaller if there are higher order folios. 113 */ 114 max_folios = PAGE_ALIGN(size) / PAGE_SIZE; 115 folios = kvmalloc_objs(*folios, max_folios); 116 if (!folios) 117 return -ENOMEM; 118 119 /* 120 * Pin the folios so they don't move around behind our back. This also 121 * ensures none of the folios are in CMA -- which ensures they don't 122 * fall in KHO scratch memory. It also moves swapped out folios back to 123 * memory. 124 * 125 * A side effect of doing this is that it allocates a folio for all 126 * indices in the file. This might waste memory on sparse memfds. If 127 * that is really a problem in the future, we can have a 128 * memfd_pin_folios() variant that does not allocate a page on empty 129 * slots. 130 */ 131 nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios, 132 &offset); 133 if (nr_pinned < 0) { 134 err = nr_pinned; 135 pr_err("failed to pin folios: %d\n", err); 136 goto err_free_folios; 137 } 138 nr_folios = nr_pinned; 139 140 folios_ser = vcalloc(nr_folios, sizeof(*folios_ser)); 141 if (!folios_ser) { 142 err = -ENOMEM; 143 goto err_unpin; 144 } 145 146 for (i = 0; i < nr_folios; i++) { 147 struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 148 struct folio *folio = folios[i]; 149 150 err = kho_preserve_folio(folio); 151 if (err) 152 goto err_unpreserve; 153 154 folio_lock(folio); 155 156 /* 157 * A dirty folio is one which has been written to. A clean folio 158 * is its opposite. Since a clean folio does not carry user 159 * data, it can be freed by page reclaim under memory pressure. 160 * 161 * Saving the dirty flag at prepare() time doesn't work since it 162 * can change later. Saving it at freeze() also won't work 163 * because the dirty bit is normally synced at unmap and there 164 * might still be a mapping of the file at freeze(). 165 * 166 * To see why this is a problem, say a folio is clean at 167 * preserve, but gets dirtied later. The pfolio flags will mark 168 * it as clean. After retrieve, the next kernel might try to 169 * reclaim this folio under memory pressure, losing user data. 170 * 171 * Unconditionally mark it dirty to avoid this problem. This 172 * comes at the cost of making clean folios un-reclaimable after 173 * live update. 174 */ 175 folio_mark_dirty(folio); 176 177 /* 178 * If the folio is not uptodate, it was fallocated but never 179 * used. Saving this flag at prepare() doesn't work since it 180 * might change later when someone uses the folio. 181 * 182 * Since we have taken the performance penalty of allocating, 183 * zeroing, and pinning all the folios in the holes, take a bit 184 * more and zero all non-uptodate folios too. 185 * 186 * NOTE: For someone looking to improve preserve performance, 187 * this is a good place to look. 188 */ 189 if (!folio_test_uptodate(folio)) { 190 folio_zero_range(folio, 0, folio_size(folio)); 191 flush_dcache_folio(folio); 192 folio_mark_uptodate(folio); 193 } 194 195 folio_unlock(folio); 196 197 pfolio->pfn = folio_pfn(folio); 198 pfolio->flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE; 199 pfolio->index = folio->index; 200 } 201 202 err = kho_preserve_vmalloc(folios_ser, kho_vmalloc); 203 if (err) 204 goto err_unpreserve; 205 206 kvfree(folios); 207 *nr_foliosp = nr_folios; 208 *out_folios_ser = folios_ser; 209 210 /* 211 * Note: folios_ser is purposely not freed here. It is preserved 212 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer 213 * that is passed via private_data. 214 */ 215 return 0; 216 217 err_unpreserve: 218 for (i = i - 1; i >= 0; i--) 219 kho_unpreserve_folio(folios[i]); 220 vfree(folios_ser); 221 err_unpin: 222 unpin_folios(folios, nr_folios); 223 err_free_folios: 224 kvfree(folios); 225 226 return err; 227 } 228 229 static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc, 230 struct memfd_luo_folio_ser *folios_ser, 231 u64 nr_folios) 232 { 233 long i; 234 235 if (!nr_folios) 236 return; 237 238 kho_unpreserve_vmalloc(kho_vmalloc); 239 240 for (i = 0; i < nr_folios; i++) { 241 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 242 struct folio *folio; 243 244 if (!pfolio->pfn) 245 continue; 246 247 folio = pfn_folio(pfolio->pfn); 248 249 kho_unpreserve_folio(folio); 250 unpin_folio(folio); 251 } 252 253 vfree(folios_ser); 254 } 255 256 static int memfd_luo_preserve(struct liveupdate_file_op_args *args) 257 { 258 struct inode *inode = file_inode(args->file); 259 struct memfd_luo_folio_ser *folios_ser; 260 struct memfd_luo_ser *ser; 261 u64 nr_folios; 262 int err = 0; 263 264 inode_lock(inode); 265 shmem_freeze(inode, true); 266 267 /* Allocate the main serialization structure in preserved memory */ 268 ser = kho_alloc_preserve(sizeof(*ser)); 269 if (IS_ERR(ser)) { 270 err = PTR_ERR(ser); 271 goto err_unlock; 272 } 273 274 ser->pos = args->file->f_pos; 275 ser->size = i_size_read(inode); 276 277 err = memfd_luo_preserve_folios(args->file, &ser->folios, 278 &folios_ser, &nr_folios); 279 if (err) 280 goto err_free_ser; 281 282 ser->nr_folios = nr_folios; 283 inode_unlock(inode); 284 285 args->private_data = folios_ser; 286 args->serialized_data = virt_to_phys(ser); 287 288 return 0; 289 290 err_free_ser: 291 kho_unpreserve_free(ser); 292 err_unlock: 293 shmem_freeze(inode, false); 294 inode_unlock(inode); 295 return err; 296 } 297 298 static int memfd_luo_freeze(struct liveupdate_file_op_args *args) 299 { 300 struct memfd_luo_ser *ser; 301 302 if (WARN_ON_ONCE(!args->serialized_data)) 303 return -EINVAL; 304 305 ser = phys_to_virt(args->serialized_data); 306 307 /* 308 * The pos might have changed since prepare. Everything else stays the 309 * same. 310 */ 311 ser->pos = args->file->f_pos; 312 313 return 0; 314 } 315 316 static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args) 317 { 318 struct inode *inode = file_inode(args->file); 319 struct memfd_luo_ser *ser; 320 321 if (WARN_ON_ONCE(!args->serialized_data)) 322 return; 323 324 inode_lock(inode); 325 shmem_freeze(inode, false); 326 327 ser = phys_to_virt(args->serialized_data); 328 329 memfd_luo_unpreserve_folios(&ser->folios, args->private_data, 330 ser->nr_folios); 331 332 kho_unpreserve_free(ser); 333 inode_unlock(inode); 334 } 335 336 static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser, 337 u64 nr_folios) 338 { 339 u64 i; 340 341 for (i = 0; i < nr_folios; i++) { 342 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 343 struct folio *folio; 344 phys_addr_t phys; 345 346 if (!pfolio->pfn) 347 continue; 348 349 phys = PFN_PHYS(pfolio->pfn); 350 folio = kho_restore_folio(phys); 351 if (!folio) { 352 pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n", 353 phys); 354 continue; 355 } 356 357 folio_put(folio); 358 } 359 } 360 361 static void memfd_luo_finish(struct liveupdate_file_op_args *args) 362 { 363 struct memfd_luo_folio_ser *folios_ser; 364 struct memfd_luo_ser *ser; 365 366 /* 367 * If retrieve was successful, nothing to do. If it failed, retrieve() 368 * already cleaned up everything it could. So nothing to do there 369 * either. Only need to clean up when retrieve was not called. 370 */ 371 if (args->retrieve_status) 372 return; 373 374 ser = phys_to_virt(args->serialized_data); 375 if (!ser) 376 return; 377 378 if (ser->nr_folios) { 379 folios_ser = kho_restore_vmalloc(&ser->folios); 380 if (!folios_ser) 381 goto out; 382 383 memfd_luo_discard_folios(folios_ser, ser->nr_folios); 384 vfree(folios_ser); 385 } 386 387 out: 388 kho_restore_free(ser); 389 } 390 391 static int memfd_luo_retrieve_folios(struct file *file, 392 struct memfd_luo_folio_ser *folios_ser, 393 u64 nr_folios) 394 { 395 struct inode *inode = file_inode(file); 396 struct address_space *mapping = inode->i_mapping; 397 struct folio *folio; 398 int err = -EIO; 399 long i; 400 401 for (i = 0; i < nr_folios; i++) { 402 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 403 phys_addr_t phys; 404 u64 index; 405 int flags; 406 407 if (!pfolio->pfn) 408 continue; 409 410 phys = PFN_PHYS(pfolio->pfn); 411 folio = kho_restore_folio(phys); 412 if (!folio) { 413 pr_err("Unable to restore folio at physical address: %llx\n", 414 phys); 415 goto put_folios; 416 } 417 index = pfolio->index; 418 flags = pfolio->flags; 419 420 /* Set up the folio for insertion. */ 421 __folio_set_locked(folio); 422 __folio_set_swapbacked(folio); 423 424 err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping)); 425 if (err) { 426 pr_err("shmem: failed to charge folio index %ld: %d\n", 427 i, err); 428 goto unlock_folio; 429 } 430 431 err = shmem_add_to_page_cache(folio, mapping, index, NULL, 432 mapping_gfp_mask(mapping)); 433 if (err) { 434 pr_err("shmem: failed to add to page cache folio index %ld: %d\n", 435 i, err); 436 goto unlock_folio; 437 } 438 439 if (flags & MEMFD_LUO_FOLIO_UPTODATE) 440 folio_mark_uptodate(folio); 441 if (flags & MEMFD_LUO_FOLIO_DIRTY) 442 folio_mark_dirty(folio); 443 444 err = shmem_inode_acct_blocks(inode, 1); 445 if (err) { 446 pr_err("shmem: failed to account folio index %ld: %d\n", 447 i, err); 448 goto unlock_folio; 449 } 450 451 shmem_recalc_inode(inode, 1, 0); 452 folio_add_lru(folio); 453 folio_unlock(folio); 454 folio_put(folio); 455 } 456 457 return 0; 458 459 unlock_folio: 460 folio_unlock(folio); 461 folio_put(folio); 462 put_folios: 463 /* 464 * Note: don't free the folios already added to the file. They will be 465 * freed when the file is freed. Free the ones not added yet here. 466 */ 467 for (long j = i + 1; j < nr_folios; j++) { 468 const struct memfd_luo_folio_ser *pfolio = &folios_ser[j]; 469 470 folio = kho_restore_folio(pfolio->pfn); 471 if (folio) 472 folio_put(folio); 473 } 474 475 return err; 476 } 477 478 static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) 479 { 480 struct memfd_luo_folio_ser *folios_ser; 481 struct memfd_luo_ser *ser; 482 struct file *file; 483 int err; 484 485 ser = phys_to_virt(args->serialized_data); 486 if (!ser) 487 return -EINVAL; 488 489 file = memfd_alloc_file("", 0); 490 if (IS_ERR(file)) { 491 pr_err("failed to setup file: %pe\n", file); 492 err = PTR_ERR(file); 493 goto free_ser; 494 } 495 496 vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); 497 file->f_inode->i_size = ser->size; 498 499 if (ser->nr_folios) { 500 folios_ser = kho_restore_vmalloc(&ser->folios); 501 if (!folios_ser) { 502 err = -EINVAL; 503 goto put_file; 504 } 505 506 err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios); 507 vfree(folios_ser); 508 if (err) 509 goto put_file; 510 } 511 512 args->file = file; 513 kho_restore_free(ser); 514 515 return 0; 516 517 put_file: 518 fput(file); 519 free_ser: 520 kho_restore_free(ser); 521 return err; 522 } 523 524 static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler, 525 struct file *file) 526 { 527 struct inode *inode = file_inode(file); 528 529 return shmem_file(file) && !inode->i_nlink; 530 } 531 532 static const struct liveupdate_file_ops memfd_luo_file_ops = { 533 .freeze = memfd_luo_freeze, 534 .finish = memfd_luo_finish, 535 .retrieve = memfd_luo_retrieve, 536 .preserve = memfd_luo_preserve, 537 .unpreserve = memfd_luo_unpreserve, 538 .can_preserve = memfd_luo_can_preserve, 539 .owner = THIS_MODULE, 540 }; 541 542 static struct liveupdate_file_handler memfd_luo_handler = { 543 .ops = &memfd_luo_file_ops, 544 .compatible = MEMFD_LUO_FH_COMPATIBLE, 545 }; 546 547 static int __init memfd_luo_init(void) 548 { 549 int err = liveupdate_register_file_handler(&memfd_luo_handler); 550 551 if (err && err != -EOPNOTSUPP) { 552 pr_err("Could not register luo filesystem handler: %pe\n", 553 ERR_PTR(err)); 554 555 return err; 556 } 557 558 return 0; 559 } 560 late_initcall(memfd_luo_init); 561