1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * Copyright (c) 2025, Google LLC.
5 * Pasha Tatashin <pasha.tatashin@soleen.com>
6 *
7 * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
8 * Pratyush Yadav <ptyadav@amazon.de>
9 */
10
11 /**
12 * DOC: Memfd Preservation via LUO
13 *
14 * Overview
15 * ========
16 *
17 * Memory file descriptors (memfd) can be preserved over a kexec using the Live
18 * Update Orchestrator (LUO) file preservation. This allows userspace to
19 * transfer its memory contents to the next kernel after a kexec.
20 *
21 * The preservation is not intended to be transparent. Only select properties of
22 * the file are preserved. All others are reset to default. The preserved
23 * properties are described below.
24 *
25 * .. note::
26 * The LUO API is not stabilized yet, so the preserved properties of a memfd
27 * are also not stable and are subject to backwards incompatible changes.
28 *
29 * .. note::
30 * Currently a memfd backed by Hugetlb is not supported. Memfds created
31 * with ``MFD_HUGETLB`` will be rejected.
32 *
33 * Preserved Properties
34 * ====================
35 *
36 * The following properties of the memfd are preserved across kexec:
37 *
38 * File Contents
39 * All data stored in the file is preserved.
40 *
41 * File Size
42 * The size of the file is preserved. Holes in the file are filled by
43 * allocating pages for them during preservation.
44 *
45 * File Position
46 * The current file position is preserved, allowing applications to continue
47 * reading/writing from their last position.
48 *
49 * File Status Flags
50 * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
51 * is maintained.
52 *
53 * Seals
54 * File seals set on the memfd are preserved and re-applied on restore.
55 * Only seals known to this LUO version (see ``MEMFD_LUO_ALL_SEALS``) may
56 * be present; preservation fails with ``-EOPNOTSUPP`` otherwise.
57 *
58 * Non-Preserved Properties
59 * ========================
60 *
 * All properties which are not preserved must be assumed to be reset to
 * default. This section describes the non-preserved properties that are
 * particularly noteworthy.
64 *
65 * ``FD_CLOEXEC`` flag
66 * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
67 * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
68 * again after restore via ``fcntl()``.
69 */
70
71 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
72
73 #include <linux/bits.h>
74 #include <linux/err.h>
75 #include <linux/file.h>
76 #include <linux/io.h>
77 #include <linux/kexec_handover.h>
78 #include <linux/kho/abi/memfd.h>
79 #include <linux/liveupdate.h>
80 #include <linux/shmem_fs.h>
81 #include <linux/vmalloc.h>
82 #include <linux/memfd.h>
83 #include <uapi/linux/memfd.h>
84
85 #include "internal.h"
86
/*
 * Pin all folios of @file, preserve each of them with KHO and describe them
 * in a serialization array that is itself preserved through @kho_vmalloc.
 *
 * On success returns 0 and hands the array to the caller via @out_folios_ser
 * and its length via @nr_foliosp (NULL and 0 for an empty file). The folios
 * stay pinned and the array stays allocated; both are released later by
 * memfd_luo_unpreserve_folios(). On failure a negative errno is returned and
 * every step taken so far is rolled back.
 */
static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *ser_array;
	struct folio **pinned;
	unsigned int capacity;
	long idx, bytes, nr_pinned;
	pgoff_t first_offset;
	u64 count;
	int ret = -EINVAL;

	bytes = i_size_read(inode);

	/* An empty file has no folios to describe. */
	if (!bytes) {
		*nr_foliosp = 0;
		*out_folios_ser = NULL;
		return 0;
	}

	/*
	 * One slot per page is an upper bound: large folios cover several
	 * pages, so fewer entries may end up being used.
	 */
	capacity = PAGE_ALIGN(bytes) / PAGE_SIZE;
	pinned = kvmalloc_objs(*pinned, capacity);
	if (!pinned)
		return -ENOMEM;

	/*
	 * Pinning keeps the folios from being moved while they are preserved.
	 * It also keeps them out of CMA (and hence out of KHO scratch memory)
	 * and brings swapped out folios back into memory.
	 *
	 * The downside is that every index of the file gets a folio allocated,
	 * which can waste memory for sparse memfds. Should that turn out to
	 * matter, a memfd_pin_folios() variant that skips empty slots would be
	 * the way to address it.
	 */
	nr_pinned = memfd_pin_folios(file, 0, bytes - 1, pinned, capacity,
				     &first_offset);
	if (nr_pinned < 0) {
		ret = nr_pinned;
		pr_err("failed to pin folios: %d\n", ret);
		goto err_free_folios;
	}
	count = nr_pinned;

	ser_array = vcalloc(count, sizeof(*ser_array));
	if (!ser_array) {
		ret = -ENOMEM;
		goto err_unpin;
	}

	for (idx = 0; idx < count; idx++) {
		struct folio *folio = pinned[idx];
		struct memfd_luo_folio_ser *entry = &ser_array[idx];

		ret = kho_preserve_folio(folio);
		if (ret)
			goto err_unpreserve;

		folio_lock(folio);

		/*
		 * The dirty state cannot be sampled reliably: it may still
		 * change after preserve, and even at freeze() a mapping of the
		 * file may exist whose dirty bits have not been synced yet. A
		 * folio recorded as clean but dirtied afterwards could be
		 * reclaimed by the next kernel, losing user data.
		 *
		 * So always dirty the folio. The price is that folios which
		 * really are clean become un-reclaimable after live update.
		 */
		folio_mark_dirty(folio);

		/*
		 * A folio that is not uptodate was fallocated but never used.
		 * That state can change later too, so it cannot be recorded
		 * here either. Holes have already been allocated, zeroed and
		 * pinned, so zero these folios as well and treat everything as
		 * uptodate.
		 *
		 * NOTE: a good candidate when optimizing preserve performance.
		 */
		if (!folio_test_uptodate(folio)) {
			folio_zero_range(folio, 0, folio_size(folio));
			flush_dcache_folio(folio);
			folio_mark_uptodate(folio);
		}

		folio_unlock(folio);

		entry->index = folio->index;
		entry->flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE;
		entry->pfn = folio_pfn(folio);
	}

	/* idx == count here, so a failure below unwinds every folio. */
	ret = kho_preserve_vmalloc(ser_array, kho_vmalloc);
	if (ret)
		goto err_unpreserve;

	kvfree(pinned);

	/*
	 * ser_array is deliberately kept: it is KHO-preserved memory. The
	 * unpreserve path gets the pointer back through private_data.
	 */
	*out_folios_ser = ser_array;
	*nr_foliosp = count;

	return 0;

err_unpreserve:
	/* Undo only the folios that were successfully preserved. */
	while (idx-- > 0)
		kho_unpreserve_folio(pinned[idx]);
	vfree(ser_array);
err_unpin:
	unpin_folios(pinned, count);
err_free_folios:
	kvfree(pinned);

	return ret;
}
230
/*
 * Undo memfd_luo_preserve_folios(): drop the KHO preservation of the
 * serialization array and of every folio it describes, unpin those folios
 * and free the array. Does nothing when @nr_folios is zero.
 */
static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
					struct memfd_luo_folio_ser *folios_ser,
					u64 nr_folios)
{
	long idx;

	if (nr_folios == 0)
		return;

	kho_unpreserve_vmalloc(kho_vmalloc);

	for (idx = 0; idx < nr_folios; idx++) {
		struct folio *folio;

		/* Entries without a pfn describe no folio. */
		if (folios_ser[idx].pfn) {
			folio = pfn_folio(folios_ser[idx].pfn);

			kho_unpreserve_folio(folio);
			unpin_folio(folio);
		}
	}

	vfree(folios_ser);
}
257
memfd_luo_preserve(struct liveupdate_file_op_args * args)258 static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
259 {
260 struct inode *inode = file_inode(args->file);
261 struct memfd_luo_folio_ser *folios_ser;
262 struct memfd_luo_ser *ser;
263 u64 nr_folios, inode_size;
264 int err = 0, seals;
265
266 inode_lock(inode);
267 shmem_freeze(inode, true);
268
269 /* Allocate the main serialization structure in preserved memory */
270 ser = kho_alloc_preserve(sizeof(*ser));
271 if (IS_ERR(ser)) {
272 err = PTR_ERR(ser);
273 goto err_unlock;
274 }
275
276 seals = memfd_get_seals(args->file);
277 if (seals < 0) {
278 err = seals;
279 goto err_free_ser;
280 }
281
282 /* Make sure the file only has the seals supported by this version. */
283 if (seals & ~MEMFD_LUO_ALL_SEALS) {
284 err = -EOPNOTSUPP;
285 goto err_free_ser;
286 }
287
288 ser->pos = args->file->f_pos;
289 inode_size = i_size_read(inode);
290
291 /*
292 * memfd_pin_folios() caps at UINT_MAX folios; refuse larger
293 * files to avoid silently preserving only a prefix.
294 */
295 if (DIV_ROUND_UP_ULL(inode_size, PAGE_SIZE) > UINT_MAX) {
296 err = -EFBIG;
297 goto err_free_ser;
298 }
299
300 ser->size = inode_size;
301 ser->seals = seals;
302
303 err = memfd_luo_preserve_folios(args->file, &ser->folios,
304 &folios_ser, &nr_folios);
305 if (err)
306 goto err_free_ser;
307
308 ser->nr_folios = nr_folios;
309 inode_unlock(inode);
310
311 args->private_data = folios_ser;
312 args->serialized_data = virt_to_phys(ser);
313
314 return 0;
315
316 err_free_ser:
317 kho_unpreserve_free(ser);
318 err_unlock:
319 shmem_freeze(inode, false);
320 inode_unlock(inode);
321 return err;
322 }
323
memfd_luo_freeze(struct liveupdate_file_op_args * args)324 static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
325 {
326 struct memfd_luo_ser *ser;
327
328 if (WARN_ON_ONCE(!args->serialized_data))
329 return -EINVAL;
330
331 ser = phys_to_virt(args->serialized_data);
332
333 /*
334 * The pos might have changed since prepare. Everything else stays the
335 * same.
336 */
337 ser->pos = args->file->f_pos;
338
339 return 0;
340 }
341
memfd_luo_unpreserve(struct liveupdate_file_op_args * args)342 static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
343 {
344 struct inode *inode = file_inode(args->file);
345 struct memfd_luo_ser *ser;
346
347 if (WARN_ON_ONCE(!args->serialized_data))
348 return;
349
350 inode_lock(inode);
351 shmem_freeze(inode, false);
352
353 ser = phys_to_virt(args->serialized_data);
354
355 memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
356 ser->nr_folios);
357
358 kho_unpreserve_free(ser);
359 inode_unlock(inode);
360 }
361
/*
 * Restore every folio described in @folios_ser and immediately drop the
 * reference obtained from the restore, for preserved folios that will not be
 * attached to any file.
 *
 * Entries without a pfn are skipped. A folio that cannot be restored is
 * reported (rate limited) and skipped; the remaining entries are still
 * processed.
 */
static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	u64 i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio;
		phys_addr_t phys;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			/*
			 * phys_addr_t changes width with the build
			 * configuration, so print it with %pa (passed by
			 * reference) instead of a fixed-width specifier.
			 */
			pr_warn_ratelimited("Unable to restore folio at physical address: %pa\n",
					    &phys);
			continue;
		}

		folio_put(folio);
	}
}
386
memfd_luo_finish(struct liveupdate_file_op_args * args)387 static void memfd_luo_finish(struct liveupdate_file_op_args *args)
388 {
389 struct memfd_luo_folio_ser *folios_ser;
390 struct memfd_luo_ser *ser;
391
392 /*
393 * If retrieve was successful, nothing to do. If it failed, retrieve()
394 * already cleaned up everything it could. So nothing to do there
395 * either. Only need to clean up when retrieve was not called.
396 */
397 if (args->retrieve_status)
398 return;
399
400 ser = phys_to_virt(args->serialized_data);
401 if (!ser)
402 return;
403
404 if (ser->nr_folios) {
405 folios_ser = kho_restore_vmalloc(&ser->folios);
406 if (!folios_ser)
407 goto out;
408
409 memfd_luo_discard_folios(folios_ser, ser->nr_folios);
410 vfree(folios_ser);
411 }
412
413 out:
414 kho_restore_free(ser);
415 }
416
/*
 * Restore the folios described by @folios_ser and insert them into the page
 * cache of @file at their recorded indices.
 *
 * Each folio is charged to the memory cgroup, added to the page cache, given
 * the uptodate/dirty state recorded in its flags and accounted against the
 * inode. Entries without a pfn are skipped.
 *
 * Returns 0 on success. On failure a negative errno is returned; folios
 * already inserted stay in the file (they go away with the file), while the
 * failing folio and all not yet processed ones are released here.
 */
static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	long npages, nr_added_pages = 0;
	int err = -EIO;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		u64 index;
		int flags;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			/*
			 * phys_addr_t changes width with the build
			 * configuration, so print it with %pa (passed by
			 * reference) instead of a fixed-width specifier.
			 */
			pr_err("Unable to restore folio at physical address: %pa\n",
			       &phys);
			err = -EIO;
			goto put_folios;
		}
		index = pfolio->index;
		flags = pfolio->flags;

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		npages = folio_nr_pages(folio);
		err = shmem_inode_acct_blocks(inode, npages);
		if (err) {
			pr_err("shmem: failed to account folio index %ld(%ld pages): %d\n",
			       i, npages, err);
			/* The folio is already in the page cache; take it out. */
			goto remove_from_cache;
		}

		nr_added_pages += npages;
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	shmem_recalc_inode(inode, nr_added_pages, 0);

	return 0;

remove_from_cache:
	filemap_remove_folio(folio);
unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
		phys_addr_t phys;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (folio)
			folio_put(folio);
	}

	/* Account the pages that did make it into the file. */
	shmem_recalc_inode(inode, nr_added_pages, 0);

	return err;
}
517
memfd_luo_retrieve(struct liveupdate_file_op_args * args)518 static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
519 {
520 struct memfd_luo_folio_ser *folios_ser;
521 struct memfd_luo_ser *ser;
522 struct file *file;
523 int err;
524
525 ser = phys_to_virt(args->serialized_data);
526 if (!ser)
527 return -EINVAL;
528
529 /* Make sure the file only has seals supported by this version. */
530 if (ser->seals & ~MEMFD_LUO_ALL_SEALS) {
531 err = -EOPNOTSUPP;
532 goto free_ser;
533 }
534
535 /*
536 * The seals are preserved. Allow sealing here so they can be added
537 * later.
538 */
539 file = memfd_alloc_file("", MFD_ALLOW_SEALING);
540 if (IS_ERR(file)) {
541 pr_err("failed to setup file: %pe\n", file);
542 err = PTR_ERR(file);
543 goto free_ser;
544 }
545
546 err = memfd_add_seals(file, ser->seals);
547 if (err) {
548 pr_err("failed to add seals: %pe\n", ERR_PTR(err));
549 goto put_file;
550 }
551
552 vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
553 i_size_write(file_inode(file), ser->size);
554
555 if (ser->nr_folios) {
556 folios_ser = kho_restore_vmalloc(&ser->folios);
557 if (!folios_ser) {
558 err = -EINVAL;
559 goto put_file;
560 }
561
562 err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
563 vfree(folios_ser);
564 if (err)
565 goto put_file;
566 }
567
568 args->file = file;
569 kho_restore_free(ser);
570
571 return 0;
572
573 put_file:
574 fput(file);
575 free_ser:
576 kho_restore_free(ser);
577 return err;
578 }
579
memfd_luo_can_preserve(struct liveupdate_file_handler * handler,struct file * file)580 static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
581 struct file *file)
582 {
583 struct inode *inode = file_inode(file);
584
585 return shmem_file(file) && !inode->i_nlink;
586 }
587
/* LUO .get_id callback: the address of the file's inode serves as its id. */
static unsigned long memfd_luo_get_id(struct file *file)
{
	struct inode *inode = file_inode(file);

	return (unsigned long)inode;
}
592
593 static const struct liveupdate_file_ops memfd_luo_file_ops = {
594 .freeze = memfd_luo_freeze,
595 .finish = memfd_luo_finish,
596 .retrieve = memfd_luo_retrieve,
597 .preserve = memfd_luo_preserve,
598 .unpreserve = memfd_luo_unpreserve,
599 .can_preserve = memfd_luo_can_preserve,
600 .get_id = memfd_luo_get_id,
601 .owner = THIS_MODULE,
602 };
603
604 static struct liveupdate_file_handler memfd_luo_handler = {
605 .ops = &memfd_luo_file_ops,
606 .compatible = MEMFD_LUO_FH_COMPATIBLE,
607 };
608
memfd_luo_init(void)609 static int __init memfd_luo_init(void)
610 {
611 int err = liveupdate_register_file_handler(&memfd_luo_handler);
612
613 if (err && err != -EOPNOTSUPP) {
614 pr_err("Could not register luo filesystem handler: %pe\n",
615 ERR_PTR(err));
616
617 return err;
618 }
619
620 return 0;
621 }
622 late_initcall(memfd_luo_init);
623