1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * Copyright (c) 2025, Google LLC.
5 * Pasha Tatashin <pasha.tatashin@soleen.com>
6 *
7 * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
8 * Pratyush Yadav <ptyadav@amazon.de>
9 */
10
11 /**
12 * DOC: Memfd Preservation via LUO
13 *
14 * Overview
15 * ========
16 *
17 * Memory file descriptors (memfd) can be preserved over a kexec using the Live
18 * Update Orchestrator (LUO) file preservation. This allows userspace to
19 * transfer its memory contents to the next kernel after a kexec.
20 *
21 * The preservation is not intended to be transparent. Only select properties of
22 * the file are preserved. All others are reset to default. The preserved
23 * properties are described below.
24 *
25 * .. note::
26 * The LUO API is not stabilized yet, so the preserved properties of a memfd
27 * are also not stable and are subject to backwards incompatible changes.
28 *
29 * .. note::
30 * Currently a memfd backed by Hugetlb is not supported. Memfds created
31 * with ``MFD_HUGETLB`` will be rejected.
32 *
33 * Preserved Properties
34 * ====================
35 *
36 * The following properties of the memfd are preserved across kexec:
37 *
38 * File Contents
39 * All data stored in the file is preserved.
40 *
41 * File Size
42 * The size of the file is preserved. Holes in the file are filled by
43 * allocating pages for them during preservation.
44 *
45 * File Position
46 * The current file position is preserved, allowing applications to continue
47 * reading/writing from their last position.
48 *
49 * File Status Flags
50 * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
51 * is maintained.
52 *
53 * Non-Preserved Properties
54 * ========================
55 *
56 * All properties which are not preserved must be assumed to be reset to
57 * default. This section describes some of those properties which may be more of
58 * note.
59 *
60 * ``FD_CLOEXEC`` flag
61 * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
62 * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
63 * again after restore via ``fcntl()``.
64 *
65 * Seals
66 * File seals are not preserved. The file is unsealed on restore and if
67 * needed, must be sealed again via ``fcntl()``.
68 */
69
70 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
71
72 #include <linux/bits.h>
73 #include <linux/err.h>
74 #include <linux/file.h>
75 #include <linux/io.h>
76 #include <linux/kexec_handover.h>
77 #include <linux/kho/abi/memfd.h>
78 #include <linux/liveupdate.h>
79 #include <linux/shmem_fs.h>
80 #include <linux/vmalloc.h>
81 #include <linux/memfd.h>
82 #include "internal.h"
83
/**
 * memfd_luo_preserve_folios - pin and KHO-preserve all folios of a memfd.
 * @file: the memfd whose contents are being preserved.
 * @kho_vmalloc: filled by kho_preserve_vmalloc() for the serialized folio
 *               array; zeroed when the file is empty.
 * @out_folios_ser: set to the vmalloc'ed array of per-folio descriptors, or
 *                  to NULL when the file is empty.
 * @nr_foliosp: set to the number of entries in *@out_folios_ser.
 *
 * On success the folios remain pinned and preserved, and *@out_folios_ser is
 * intentionally not freed; memfd_luo_unpreserve_folios() releases all of it.
 * On failure the pins, the per-folio preservations and the allocations made
 * here are released again before returning.
 *
 * Return: 0 on success, a negative errno otherwise.
 */
static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *folios_ser;
	unsigned long nr_pages;
	unsigned int max_folios;
	long i, size, nr_pinned;
	struct folio **folios;
	pgoff_t offset;
	u64 nr_folios;
	int err;

	size = i_size_read(inode);
	/*
	 * If the file has zero size, then the folios and nr_folios properties
	 * are not set.
	 */
	if (!size) {
		*nr_foliosp = 0;
		*out_folios_ser = NULL;
		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
		return 0;
	}

	/*
	 * Guess the number of folios based on inode size. Real number might end
	 * up being smaller if there are higher order folios.
	 *
	 * memfd_pin_folios() is handed the limit as an unsigned int below.
	 * Refuse files whose page count does not fit instead of silently
	 * truncating the count and preserving only part of the file.
	 */
	nr_pages = PAGE_ALIGN(size) / PAGE_SIZE;
	if (nr_pages > UINT_MAX)
		return -EFBIG;
	max_folios = nr_pages;

	folios = kvmalloc_objs(*folios, max_folios);
	if (!folios)
		return -ENOMEM;

	/*
	 * Pin the folios so they don't move around behind our back. This also
	 * ensures none of the folios are in CMA -- which ensures they don't
	 * fall in KHO scratch memory. It also moves swapped out folios back to
	 * memory.
	 *
	 * A side effect of doing this is that it allocates a folio for all
	 * indices in the file. This might waste memory on sparse memfds. If
	 * that is really a problem in the future, we can have a
	 * memfd_pin_folios() variant that does not allocate a page on empty
	 * slots.
	 */
	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
				     &offset);
	if (nr_pinned < 0) {
		err = nr_pinned;
		pr_err("failed to pin folios: %d\n", err);
		goto err_free_folios;
	}
	nr_folios = nr_pinned;

	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
	if (!folios_ser) {
		err = -ENOMEM;
		goto err_unpin;
	}

	for (i = 0; i < nr_folios; i++) {
		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio = folios[i];

		err = kho_preserve_folio(folio);
		if (err)
			goto err_unpreserve;

		folio_lock(folio);

		/*
		 * A dirty folio is one which has been written to. A clean folio
		 * is its opposite. Since a clean folio does not carry user
		 * data, it can be freed by page reclaim under memory pressure.
		 *
		 * Saving the dirty flag at preserve() time doesn't work since
		 * it can change later. Saving it at freeze() also won't work
		 * because the dirty bit is normally synced at unmap and there
		 * might still be a mapping of the file at freeze().
		 *
		 * To see why this is a problem, say a folio is clean at
		 * preserve, but gets dirtied later. The pfolio flags will mark
		 * it as clean. After retrieve, the next kernel might try to
		 * reclaim this folio under memory pressure, losing user data.
		 *
		 * Unconditionally mark it dirty to avoid this problem. This
		 * comes at the cost of making clean folios un-reclaimable after
		 * live update.
		 */
		folio_mark_dirty(folio);

		/*
		 * If the folio is not uptodate, it was fallocated but never
		 * used. Saving this flag at preserve() doesn't work since it
		 * might change later when someone uses the folio.
		 *
		 * Since we have taken the performance penalty of allocating,
		 * zeroing, and pinning all the folios in the holes, take a bit
		 * more and zero all non-uptodate folios too.
		 *
		 * NOTE: For someone looking to improve preserve performance,
		 * this is a good place to look.
		 */
		if (!folio_test_uptodate(folio)) {
			folio_zero_range(folio, 0, folio_size(folio));
			flush_dcache_folio(folio);
			folio_mark_uptodate(folio);
		}

		folio_unlock(folio);

		/* Both flags are recorded unconditionally, see above. */
		pfolio->pfn = folio_pfn(folio);
		pfolio->flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE;
		pfolio->index = folio->index;
	}

	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
	if (err)
		goto err_unpreserve;

	/* The temporary pointer array is no longer needed. */
	kvfree(folios);
	*nr_foliosp = nr_folios;
	*out_folios_ser = folios_ser;

	/*
	 * Note: folios_ser is purposely not freed here. It is preserved
	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
	 * that is passed via private_data.
	 */
	return 0;

err_unpreserve:
	/*
	 * i is the index of the folio that failed to be preserved, or
	 * nr_folios when the vmalloc preservation failed; undo every folio
	 * preserved before it.
	 */
	for (i = i - 1; i >= 0; i--)
		kho_unpreserve_folio(folios[i]);
	vfree(folios_ser);
err_unpin:
	unpin_folios(folios, nr_folios);
err_free_folios:
	kvfree(folios);

	return err;
}
228
/**
 * memfd_luo_unpreserve_folios - undo memfd_luo_preserve_folios().
 * @kho_vmalloc: KHO descriptor of the serialized folio array.
 * @folios_ser: the serialized folio array itself.
 * @nr_folios: number of entries in @folios_ser.
 *
 * Drops the KHO preservation of the array, then un-preserves and unpins every
 * folio that has a non-zero pfn recorded, and finally frees the array.
 * Does nothing at all when @nr_folios is zero.
 */
static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
					struct memfd_luo_folio_ser *folios_ser,
					u64 nr_folios)
{
	const struct memfd_luo_folio_ser *cur, *end;

	if (nr_folios == 0)
		return;

	kho_unpreserve_vmalloc(kho_vmalloc);

	end = folios_ser + nr_folios;
	for (cur = folios_ser; cur < end; cur++) {
		struct folio *folio;

		/* Entries without a pfn describe no folio; skip them. */
		if (cur->pfn) {
			folio = pfn_folio(cur->pfn);

			kho_unpreserve_folio(folio);
			unpin_folio(folio);
		}
	}

	vfree(folios_ser);
}
255
memfd_luo_preserve(struct liveupdate_file_op_args * args)256 static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
257 {
258 struct inode *inode = file_inode(args->file);
259 struct memfd_luo_folio_ser *folios_ser;
260 struct memfd_luo_ser *ser;
261 u64 nr_folios;
262 int err = 0;
263
264 inode_lock(inode);
265 shmem_freeze(inode, true);
266
267 /* Allocate the main serialization structure in preserved memory */
268 ser = kho_alloc_preserve(sizeof(*ser));
269 if (IS_ERR(ser)) {
270 err = PTR_ERR(ser);
271 goto err_unlock;
272 }
273
274 ser->pos = args->file->f_pos;
275 ser->size = i_size_read(inode);
276
277 err = memfd_luo_preserve_folios(args->file, &ser->folios,
278 &folios_ser, &nr_folios);
279 if (err)
280 goto err_free_ser;
281
282 ser->nr_folios = nr_folios;
283 inode_unlock(inode);
284
285 args->private_data = folios_ser;
286 args->serialized_data = virt_to_phys(ser);
287
288 return 0;
289
290 err_free_ser:
291 kho_unpreserve_free(ser);
292 err_unlock:
293 shmem_freeze(inode, false);
294 inode_unlock(inode);
295 return err;
296 }
297
memfd_luo_freeze(struct liveupdate_file_op_args * args)298 static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
299 {
300 struct memfd_luo_ser *ser;
301
302 if (WARN_ON_ONCE(!args->serialized_data))
303 return -EINVAL;
304
305 ser = phys_to_virt(args->serialized_data);
306
307 /*
308 * The pos might have changed since prepare. Everything else stays the
309 * same.
310 */
311 ser->pos = args->file->f_pos;
312
313 return 0;
314 }
315
memfd_luo_unpreserve(struct liveupdate_file_op_args * args)316 static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
317 {
318 struct inode *inode = file_inode(args->file);
319 struct memfd_luo_ser *ser;
320
321 if (WARN_ON_ONCE(!args->serialized_data))
322 return;
323
324 inode_lock(inode);
325 shmem_freeze(inode, false);
326
327 ser = phys_to_virt(args->serialized_data);
328
329 memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
330 ser->nr_folios);
331
332 kho_unpreserve_free(ser);
333 inode_unlock(inode);
334 }
335
/**
 * memfd_luo_discard_folios - restore preserved folios only to drop them.
 * @folios_ser: serialized folio array received from the previous kernel.
 * @nr_folios: number of entries in @folios_ser.
 *
 * Each entry with a non-zero pfn is restored through KHO and its reference is
 * put immediately. Entries that cannot be restored are reported with a
 * rate-limited warning and skipped.
 */
static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	u64 idx = 0;

	while (idx < nr_folios) {
		const struct memfd_luo_folio_ser *entry = &folios_ser[idx++];
		struct folio *folio;
		phys_addr_t phys;

		if (!entry->pfn)
			continue;

		phys = PFN_PHYS(entry->pfn);
		folio = kho_restore_folio(phys);
		if (folio) {
			folio_put(folio);
			continue;
		}

		pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
				    phys);
	}
}
360
memfd_luo_finish(struct liveupdate_file_op_args * args)361 static void memfd_luo_finish(struct liveupdate_file_op_args *args)
362 {
363 struct memfd_luo_folio_ser *folios_ser;
364 struct memfd_luo_ser *ser;
365
366 /*
367 * If retrieve was successful, nothing to do. If it failed, retrieve()
368 * already cleaned up everything it could. So nothing to do there
369 * either. Only need to clean up when retrieve was not called.
370 */
371 if (args->retrieve_status)
372 return;
373
374 ser = phys_to_virt(args->serialized_data);
375 if (!ser)
376 return;
377
378 if (ser->nr_folios) {
379 folios_ser = kho_restore_vmalloc(&ser->folios);
380 if (!folios_ser)
381 goto out;
382
383 memfd_luo_discard_folios(folios_ser, ser->nr_folios);
384 vfree(folios_ser);
385 }
386
387 out:
388 kho_restore_free(ser);
389 }
390
/**
 * memfd_luo_retrieve_folios - re-insert preserved folios into a new memfd.
 * @file: freshly created memfd that receives the folios.
 * @folios_ser: serialized folio array received from the previous kernel.
 * @nr_folios: number of entries in @folios_ser.
 *
 * Every entry with a non-zero pfn is restored through KHO, charged to the
 * memory cgroup, added to the page cache of @file at its recorded index,
 * flagged uptodate/dirty as recorded, and accounted to the shmem inode.
 *
 * On failure, folios already added to the file are left there (they go away
 * with the file); the folio being processed and all folios not yet processed
 * are released here.
 *
 * Return: 0 on success, a negative errno otherwise.
 */
static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	int err = -EIO;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		long nr_pages;
		u64 index;
		int flags;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_err("Unable to restore folio at physical address: %llx\n",
			       phys);
			/*
			 * err holds 0 from the previous successful iteration;
			 * set it again so the failure is actually reported.
			 */
			err = -EIO;
			goto put_folios;
		}
		index = pfolio->index;
		flags = pfolio->flags;
		/* A restored folio may be of higher order; account all of it. */
		nr_pages = folio_nr_pages(folio);

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		err = shmem_inode_acct_blocks(inode, nr_pages);
		if (err) {
			pr_err("shmem: failed to account folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		shmem_recalc_inode(inode, nr_pages, 0);
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	return 0;

unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];

		if (!pfolio->pfn)
			continue;

		/* kho_restore_folio() takes a physical address, not a pfn. */
		folio = kho_restore_folio(PFN_PHYS(pfolio->pfn));
		if (folio)
			folio_put(folio);
	}

	return err;
}
477
memfd_luo_retrieve(struct liveupdate_file_op_args * args)478 static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
479 {
480 struct memfd_luo_folio_ser *folios_ser;
481 struct memfd_luo_ser *ser;
482 struct file *file;
483 int err;
484
485 ser = phys_to_virt(args->serialized_data);
486 if (!ser)
487 return -EINVAL;
488
489 file = memfd_alloc_file("", 0);
490 if (IS_ERR(file)) {
491 pr_err("failed to setup file: %pe\n", file);
492 err = PTR_ERR(file);
493 goto free_ser;
494 }
495
496 vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
497 file->f_inode->i_size = ser->size;
498
499 if (ser->nr_folios) {
500 folios_ser = kho_restore_vmalloc(&ser->folios);
501 if (!folios_ser) {
502 err = -EINVAL;
503 goto put_file;
504 }
505
506 err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
507 vfree(folios_ser);
508 if (err)
509 goto put_file;
510 }
511
512 args->file = file;
513 kho_restore_free(ser);
514
515 return 0;
516
517 put_file:
518 fput(file);
519 free_ser:
520 kho_restore_free(ser);
521 return err;
522 }
523
memfd_luo_can_preserve(struct liveupdate_file_handler * handler,struct file * file)524 static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
525 struct file *file)
526 {
527 struct inode *inode = file_inode(file);
528
529 return shmem_file(file) && !inode->i_nlink;
530 }
531
532 static const struct liveupdate_file_ops memfd_luo_file_ops = {
533 .freeze = memfd_luo_freeze,
534 .finish = memfd_luo_finish,
535 .retrieve = memfd_luo_retrieve,
536 .preserve = memfd_luo_preserve,
537 .unpreserve = memfd_luo_unpreserve,
538 .can_preserve = memfd_luo_can_preserve,
539 .owner = THIS_MODULE,
540 };
541
542 static struct liveupdate_file_handler memfd_luo_handler = {
543 .ops = &memfd_luo_file_ops,
544 .compatible = MEMFD_LUO_FH_COMPATIBLE,
545 };
546
memfd_luo_init(void)547 static int __init memfd_luo_init(void)
548 {
549 int err = liveupdate_register_file_handler(&memfd_luo_handler);
550
551 if (err && err != -EOPNOTSUPP) {
552 pr_err("Could not register luo filesystem handler: %pe\n",
553 ERR_PTR(err));
554
555 return err;
556 }
557
558 return 0;
559 }
560 late_initcall(memfd_luo_init);
561