1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * Copyright (c) 2025, Google LLC. 5 * Pasha Tatashin <pasha.tatashin@soleen.com> 6 * 7 * Copyright (C) 2025 Amazon.com Inc. or its affiliates. 8 * Pratyush Yadav <ptyadav@amazon.de> 9 */ 10 11 /** 12 * DOC: Memfd Preservation via LUO 13 * 14 * Overview 15 * ======== 16 * 17 * Memory file descriptors (memfd) can be preserved over a kexec using the Live 18 * Update Orchestrator (LUO) file preservation. This allows userspace to 19 * transfer its memory contents to the next kernel after a kexec. 20 * 21 * The preservation is not intended to be transparent. Only select properties of 22 * the file are preserved. All others are reset to default. The preserved 23 * properties are described below. 24 * 25 * .. note:: 26 * The LUO API is not stabilized yet, so the preserved properties of a memfd 27 * are also not stable and are subject to backwards incompatible changes. 28 * 29 * .. note:: 30 * Currently a memfd backed by Hugetlb is not supported. Memfds created 31 * with ``MFD_HUGETLB`` will be rejected. 32 * 33 * Preserved Properties 34 * ==================== 35 * 36 * The following properties of the memfd are preserved across kexec: 37 * 38 * File Contents 39 * All data stored in the file is preserved. 40 * 41 * File Size 42 * The size of the file is preserved. Holes in the file are filled by 43 * allocating pages for them during preservation. 44 * 45 * File Position 46 * The current file position is preserved, allowing applications to continue 47 * reading/writing from their last position. 48 * 49 * File Status Flags 50 * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property 51 * is maintained. 52 * 53 * Non-Preserved Properties 54 * ======================== 55 * 56 * All properties which are not preserved must be assumed to be reset to 57 * default. This section describes some of those properties which may be more of 58 * note. 59 * 60 * ``FD_CLOEXEC`` flag 61 * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the 62 * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set 63 * again after restore via ``fcntl()``. 64 * 65 * Seals 66 * File seals are not preserved. The file is unsealed on restore and if 67 * needed, must be sealed again via ``fcntl()``. 68 */ 69 70 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 71 72 #include <linux/bits.h> 73 #include <linux/err.h> 74 #include <linux/file.h> 75 #include <linux/io.h> 76 #include <linux/kexec_handover.h> 77 #include <linux/kho/abi/memfd.h> 78 #include <linux/liveupdate.h> 79 #include <linux/shmem_fs.h> 80 #include <linux/vmalloc.h> 81 #include <linux/memfd.h> 82 #include "internal.h" 83 84 static int memfd_luo_preserve_folios(struct file *file, 85 struct kho_vmalloc *kho_vmalloc, 86 struct memfd_luo_folio_ser **out_folios_ser, 87 u64 *nr_foliosp) 88 { 89 struct inode *inode = file_inode(file); 90 struct memfd_luo_folio_ser *folios_ser; 91 unsigned int max_folios; 92 long i, size, nr_pinned; 93 struct folio **folios; 94 int err = -EINVAL; 95 pgoff_t offset; 96 u64 nr_folios; 97 98 size = i_size_read(inode); 99 /* 100 * If the file has zero size, then the folios and nr_folios properties 101 * are not set. 102 */ 103 if (!size) { 104 *nr_foliosp = 0; 105 *out_folios_ser = NULL; 106 memset(kho_vmalloc, 0, sizeof(*kho_vmalloc)); 107 return 0; 108 } 109 110 /* 111 * Guess the number of folios based on inode size. Real number might end 112 * up being smaller if there are higher order folios. 113 */ 114 max_folios = PAGE_ALIGN(size) / PAGE_SIZE; 115 folios = kvmalloc_objs(*folios, max_folios); 116 if (!folios) 117 return -ENOMEM; 118 119 /* 120 * Pin the folios so they don't move around behind our back. This also 121 * ensures none of the folios are in CMA -- which ensures they don't 122 * fall in KHO scratch memory. It also moves swapped out folios back to 123 * memory. 124 * 125 * A side effect of doing this is that it allocates a folio for all 126 * indices in the file. This might waste memory on sparse memfds. If 127 * that is really a problem in the future, we can have a 128 * memfd_pin_folios() variant that does not allocate a page on empty 129 * slots. 130 */ 131 nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios, 132 &offset); 133 if (nr_pinned < 0) { 134 err = nr_pinned; 135 pr_err("failed to pin folios: %d\n", err); 136 goto err_free_folios; 137 } 138 nr_folios = nr_pinned; 139 140 folios_ser = vcalloc(nr_folios, sizeof(*folios_ser)); 141 if (!folios_ser) { 142 err = -ENOMEM; 143 goto err_unpin; 144 } 145 146 for (i = 0; i < nr_folios; i++) { 147 struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 148 struct folio *folio = folios[i]; 149 unsigned int flags = 0; 150 151 err = kho_preserve_folio(folio); 152 if (err) 153 goto err_unpreserve; 154 155 if (folio_test_dirty(folio)) 156 flags |= MEMFD_LUO_FOLIO_DIRTY; 157 if (folio_test_uptodate(folio)) 158 flags |= MEMFD_LUO_FOLIO_UPTODATE; 159 160 pfolio->pfn = folio_pfn(folio); 161 pfolio->flags = flags; 162 pfolio->index = folio->index; 163 } 164 165 err = kho_preserve_vmalloc(folios_ser, kho_vmalloc); 166 if (err) 167 goto err_unpreserve; 168 169 kvfree(folios); 170 *nr_foliosp = nr_folios; 171 *out_folios_ser = folios_ser; 172 173 /* 174 * Note: folios_ser is purposely not freed here. It is preserved 175 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer 176 * that is passed via private_data. 177 */ 178 return 0; 179 180 err_unpreserve: 181 for (i = i - 1; i >= 0; i--) 182 kho_unpreserve_folio(folios[i]); 183 vfree(folios_ser); 184 err_unpin: 185 unpin_folios(folios, nr_folios); 186 err_free_folios: 187 kvfree(folios); 188 189 return err; 190 } 191 192 static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc, 193 struct memfd_luo_folio_ser *folios_ser, 194 u64 nr_folios) 195 { 196 long i; 197 198 if (!nr_folios) 199 return; 200 201 kho_unpreserve_vmalloc(kho_vmalloc); 202 203 for (i = 0; i < nr_folios; i++) { 204 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 205 struct folio *folio; 206 207 if (!pfolio->pfn) 208 continue; 209 210 folio = pfn_folio(pfolio->pfn); 211 212 kho_unpreserve_folio(folio); 213 unpin_folio(folio); 214 } 215 216 vfree(folios_ser); 217 } 218 219 static int memfd_luo_preserve(struct liveupdate_file_op_args *args) 220 { 221 struct inode *inode = file_inode(args->file); 222 struct memfd_luo_folio_ser *folios_ser; 223 struct memfd_luo_ser *ser; 224 u64 nr_folios; 225 int err = 0; 226 227 inode_lock(inode); 228 shmem_freeze(inode, true); 229 230 /* Allocate the main serialization structure in preserved memory */ 231 ser = kho_alloc_preserve(sizeof(*ser)); 232 if (IS_ERR(ser)) { 233 err = PTR_ERR(ser); 234 goto err_unlock; 235 } 236 237 ser->pos = args->file->f_pos; 238 ser->size = i_size_read(inode); 239 240 err = memfd_luo_preserve_folios(args->file, &ser->folios, 241 &folios_ser, &nr_folios); 242 if (err) 243 goto err_free_ser; 244 245 ser->nr_folios = nr_folios; 246 inode_unlock(inode); 247 248 args->private_data = folios_ser; 249 args->serialized_data = virt_to_phys(ser); 250 251 return 0; 252 253 err_free_ser: 254 kho_unpreserve_free(ser); 255 err_unlock: 256 shmem_freeze(inode, false); 257 inode_unlock(inode); 258 return err; 259 } 260 261 static int memfd_luo_freeze(struct liveupdate_file_op_args *args) 262 { 263 struct memfd_luo_ser *ser; 264 265 if (WARN_ON_ONCE(!args->serialized_data)) 266 return -EINVAL; 267 268 ser = phys_to_virt(args->serialized_data); 269 270 /* 271 * The pos might have changed since prepare. Everything else stays the 272 * same. 273 */ 274 ser->pos = args->file->f_pos; 275 276 return 0; 277 } 278 279 static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args) 280 { 281 struct inode *inode = file_inode(args->file); 282 struct memfd_luo_ser *ser; 283 284 if (WARN_ON_ONCE(!args->serialized_data)) 285 return; 286 287 inode_lock(inode); 288 shmem_freeze(inode, false); 289 290 ser = phys_to_virt(args->serialized_data); 291 292 memfd_luo_unpreserve_folios(&ser->folios, args->private_data, 293 ser->nr_folios); 294 295 kho_unpreserve_free(ser); 296 inode_unlock(inode); 297 } 298 299 static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser, 300 u64 nr_folios) 301 { 302 u64 i; 303 304 for (i = 0; i < nr_folios; i++) { 305 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 306 struct folio *folio; 307 phys_addr_t phys; 308 309 if (!pfolio->pfn) 310 continue; 311 312 phys = PFN_PHYS(pfolio->pfn); 313 folio = kho_restore_folio(phys); 314 if (!folio) { 315 pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n", 316 phys); 317 continue; 318 } 319 320 folio_put(folio); 321 } 322 } 323 324 static void memfd_luo_finish(struct liveupdate_file_op_args *args) 325 { 326 struct memfd_luo_folio_ser *folios_ser; 327 struct memfd_luo_ser *ser; 328 329 /* 330 * If retrieve was successful, nothing to do. If it failed, retrieve() 331 * already cleaned up everything it could. So nothing to do there 332 * either. Only need to clean up when retrieve was not called. 333 */ 334 if (args->retrieve_status) 335 return; 336 337 ser = phys_to_virt(args->serialized_data); 338 if (!ser) 339 return; 340 341 if (ser->nr_folios) { 342 folios_ser = kho_restore_vmalloc(&ser->folios); 343 if (!folios_ser) 344 goto out; 345 346 memfd_luo_discard_folios(folios_ser, ser->nr_folios); 347 vfree(folios_ser); 348 } 349 350 out: 351 kho_restore_free(ser); 352 } 353 354 static int memfd_luo_retrieve_folios(struct file *file, 355 struct memfd_luo_folio_ser *folios_ser, 356 u64 nr_folios) 357 { 358 struct inode *inode = file_inode(file); 359 struct address_space *mapping = inode->i_mapping; 360 struct folio *folio; 361 int err = -EIO; 362 long i; 363 364 for (i = 0; i < nr_folios; i++) { 365 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 366 phys_addr_t phys; 367 u64 index; 368 int flags; 369 370 if (!pfolio->pfn) 371 continue; 372 373 phys = PFN_PHYS(pfolio->pfn); 374 folio = kho_restore_folio(phys); 375 if (!folio) { 376 pr_err("Unable to restore folio at physical address: %llx\n", 377 phys); 378 goto put_folios; 379 } 380 index = pfolio->index; 381 flags = pfolio->flags; 382 383 /* Set up the folio for insertion. */ 384 __folio_set_locked(folio); 385 __folio_set_swapbacked(folio); 386 387 err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping)); 388 if (err) { 389 pr_err("shmem: failed to charge folio index %ld: %d\n", 390 i, err); 391 goto unlock_folio; 392 } 393 394 err = shmem_add_to_page_cache(folio, mapping, index, NULL, 395 mapping_gfp_mask(mapping)); 396 if (err) { 397 pr_err("shmem: failed to add to page cache folio index %ld: %d\n", 398 i, err); 399 goto unlock_folio; 400 } 401 402 if (flags & MEMFD_LUO_FOLIO_UPTODATE) 403 folio_mark_uptodate(folio); 404 if (flags & MEMFD_LUO_FOLIO_DIRTY) 405 folio_mark_dirty(folio); 406 407 err = shmem_inode_acct_blocks(inode, 1); 408 if (err) { 409 pr_err("shmem: failed to account folio index %ld: %d\n", 410 i, err); 411 goto unlock_folio; 412 } 413 414 shmem_recalc_inode(inode, 1, 0); 415 folio_add_lru(folio); 416 folio_unlock(folio); 417 folio_put(folio); 418 } 419 420 return 0; 421 422 unlock_folio: 423 folio_unlock(folio); 424 folio_put(folio); 425 put_folios: 426 /* 427 * Note: don't free the folios already added to the file. They will be 428 * freed when the file is freed. Free the ones not added yet here. 429 */ 430 for (long j = i + 1; j < nr_folios; j++) { 431 const struct memfd_luo_folio_ser *pfolio = &folios_ser[j]; 432 433 folio = kho_restore_folio(pfolio->pfn); 434 if (folio) 435 folio_put(folio); 436 } 437 438 return err; 439 } 440 441 static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) 442 { 443 struct memfd_luo_folio_ser *folios_ser; 444 struct memfd_luo_ser *ser; 445 struct file *file; 446 int err; 447 448 ser = phys_to_virt(args->serialized_data); 449 if (!ser) 450 return -EINVAL; 451 452 file = memfd_alloc_file("", 0); 453 if (IS_ERR(file)) { 454 pr_err("failed to setup file: %pe\n", file); 455 err = PTR_ERR(file); 456 goto free_ser; 457 } 458 459 vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); 460 file->f_inode->i_size = ser->size; 461 462 if (ser->nr_folios) { 463 folios_ser = kho_restore_vmalloc(&ser->folios); 464 if (!folios_ser) { 465 err = -EINVAL; 466 goto put_file; 467 } 468 469 err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios); 470 vfree(folios_ser); 471 if (err) 472 goto put_file; 473 } 474 475 args->file = file; 476 kho_restore_free(ser); 477 478 return 0; 479 480 put_file: 481 fput(file); 482 free_ser: 483 kho_restore_free(ser); 484 return err; 485 } 486 487 static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler, 488 struct file *file) 489 { 490 struct inode *inode = file_inode(file); 491 492 return shmem_file(file) && !inode->i_nlink; 493 } 494 495 static const struct liveupdate_file_ops memfd_luo_file_ops = { 496 .freeze = memfd_luo_freeze, 497 .finish = memfd_luo_finish, 498 .retrieve = memfd_luo_retrieve, 499 .preserve = memfd_luo_preserve, 500 .unpreserve = memfd_luo_unpreserve, 501 .can_preserve = memfd_luo_can_preserve, 502 .owner = THIS_MODULE, 503 }; 504 505 static struct liveupdate_file_handler memfd_luo_handler = { 506 .ops = &memfd_luo_file_ops, 507 .compatible = MEMFD_LUO_FH_COMPATIBLE, 508 }; 509 510 static int __init memfd_luo_init(void) 511 { 512 int err = liveupdate_register_file_handler(&memfd_luo_handler); 513 514 if (err && err != -EOPNOTSUPP) { 515 pr_err("Could not register luo filesystem handler: %pe\n", 516 ERR_PTR(err)); 517 518 return err; 519 } 520 521 return 0; 522 } 523 late_initcall(memfd_luo_init); 524