// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright (c) 2025, Google LLC.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 *
 * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
 * Pratyush Yadav <ptyadav@amazon.de>
 */

/**
 * DOC: Memfd Preservation via LUO
 *
 * Overview
 * ========
 *
 * Memory file descriptors (memfd) can be preserved over a kexec using the Live
 * Update Orchestrator (LUO) file preservation. This allows userspace to
 * transfer its memory contents to the next kernel after a kexec.
 *
 * The preservation is not intended to be transparent. Only select properties of
 * the file are preserved. All others are reset to default. The preserved
 * properties are described below.
 *
 * .. note::
 *    The LUO API is not stabilized yet, so the preserved properties of a memfd
 *    are also not stable and are subject to backwards incompatible changes.
 *
 * .. note::
 *    Currently, memfds backed by hugetlb are not supported. Memfds created
 *    with ``MFD_HUGETLB`` will be rejected.
 *
 * Preserved Properties
 * ====================
 *
 * The following properties of the memfd are preserved across kexec:
 *
 * File Contents
 *   All data stored in the file is preserved.
 *
 * File Size
 *   The size of the file is preserved. Holes in the file are filled by
 *   allocating pages for them during preservation.
 *
 * File Position
 *   The current file position is preserved, allowing applications to continue
 *   reading/writing from their last position.
 *
 * File Status Flags
 *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
 *   is maintained.
 *
 * Non-Preserved Properties
 * ========================
 *
 * All properties which are not preserved must be assumed to be reset to their
 * defaults. This section describes some of the properties that are of
 * particular note.
 *
 * ``FD_CLOEXEC`` flag
 *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
 *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
 *   again after restore via ``fcntl()``.
 *
 * Seals
 *   File seals are not preserved. The file is unsealed on restore and, if
 *   needed, must be sealed again via ``fcntl()``, as shown in the sketch
 *   below.
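 *
 * As a minimal userspace sketch (assuming ``fd`` is the already-retrieved
 * memfd file descriptor and the seal set is only an example), both properties
 * can be re-applied with standard ``fcntl()`` calls::
 *
 *	fcntl(fd, F_SETFD, FD_CLOEXEC);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK);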
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bits.h>
#include <linux/err.h>
#include <linux/file.h>
#include <linux/io.h>
#include <linux/kexec_handover.h>
#include <linux/kho/abi/memfd.h>
#include <linux/liveupdate.h>
#include <linux/shmem_fs.h>
#include <linux/vmalloc.h>
#include "internal.h"

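/*
 * Pin all folios backing the file, record each folio's PFN, index and
 * dirty/uptodate state in a vmalloc'ed array of struct memfd_luo_folio_ser,
 * and preserve both the folios and the array via KHO. For an empty file
 * nothing is preserved and *nr_foliosp is set to 0.
 */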
static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *folios_ser;
	unsigned int max_folios;
	long i, size, nr_pinned;
	struct folio **folios;
	int err = -EINVAL;
	pgoff_t offset;
	u64 nr_folios;

	size = i_size_read(inode);
	/*
	 * If the file has zero size, then the folios and nr_folios properties
	 * are not set.
	 */
	if (!size) {
		*nr_foliosp = 0;
		*out_folios_ser = NULL;
		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
		return 0;
	}

	/*
	 * Guess the number of folios based on inode size. Real number might end
	 * up being smaller if there are higher order folios.
	 */
	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
	if (!folios)
		return -ENOMEM;

	/*
	 * Pin the folios so they don't move around behind our back. This also
	 * ensures none of the folios are in CMA -- which ensures they don't
	 * fall in KHO scratch memory. It also moves swapped out folios back to
	 * memory.
	 *
	 * A side effect of doing this is that it allocates a folio for all
	 * indices in the file. This might waste memory on sparse memfds. If
	 * that is really a problem in the future, we can have a
	 * memfd_pin_folios() variant that does not allocate a page on empty
	 * slots.
	 */
	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
				     &offset);
	if (nr_pinned < 0) {
		err = nr_pinned;
		pr_err("failed to pin folios: %d\n", err);
		goto err_free_folios;
	}
	nr_folios = nr_pinned;

	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
	if (!folios_ser) {
		err = -ENOMEM;
		goto err_unpin;
	}

	for (i = 0; i < nr_folios; i++) {
		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio = folios[i];
		unsigned int flags = 0;

		err = kho_preserve_folio(folio);
		if (err)
			goto err_unpreserve;

		if (folio_test_dirty(folio))
			flags |= MEMFD_LUO_FOLIO_DIRTY;
		if (folio_test_uptodate(folio))
			flags |= MEMFD_LUO_FOLIO_UPTODATE;

		pfolio->pfn = folio_pfn(folio);
		pfolio->flags = flags;
		pfolio->index = folio->index;
	}

	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
	if (err)
		goto err_unpreserve;

	kvfree(folios);
	*nr_foliosp = nr_folios;
	*out_folios_ser = folios_ser;

	/*
	 * Note: folios_ser is purposely not freed here. It is preserved
	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
	 * that is passed via private_data.
	 */
	return 0;

err_unpreserve:
	for (i = i - 1; i >= 0; i--)
		kho_unpreserve_folio(folios[i]);
	vfree(folios_ser);
err_unpin:
	unpin_folios(folios, nr_folios);
err_free_folios:
	kvfree(folios);

	return err;
}

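/*
 * Undo memfd_luo_preserve_folios(): unpreserve the serialized array and every
 * folio recorded in it, and drop the pins taken at preserve time.
 */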
static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
					struct memfd_luo_folio_ser *folios_ser,
					u64 nr_folios)
{
	long i;

	if (!nr_folios)
		return;

	kho_unpreserve_vmalloc(kho_vmalloc);

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio;

		if (!pfolio->pfn)
			continue;

		folio = pfn_folio(pfolio->pfn);

		kho_unpreserve_folio(folio);
		unpin_folio(folio);
	}

	vfree(folios_ser);
}

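/*
 * LUO .preserve callback: freeze the shmem inode, allocate the top-level
 * struct memfd_luo_ser in KHO-preserved memory, record the file size and
 * position, and preserve the backing folios. The physical address of the
 * serialization structure is handed back via args->serialized_data.
 */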
static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
{
	struct inode *inode = file_inode(args->file);
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;
	u64 nr_folios;
	int err = 0;

	inode_lock(inode);
	shmem_freeze(inode, true);

	/* Allocate the main serialization structure in preserved memory */
	ser = kho_alloc_preserve(sizeof(*ser));
	if (IS_ERR(ser)) {
		err = PTR_ERR(ser);
		goto err_unlock;
	}

	ser->pos = args->file->f_pos;
	ser->size = i_size_read(inode);

	err = memfd_luo_preserve_folios(args->file, &ser->folios,
					&folios_ser, &nr_folios);
	if (err)
		goto err_free_ser;

	ser->nr_folios = nr_folios;
	inode_unlock(inode);

	args->private_data = folios_ser;
	args->serialized_data = virt_to_phys(ser);

	return 0;

err_free_ser:
	kho_unpreserve_free(ser);
err_unlock:
	shmem_freeze(inode, false);
	inode_unlock(inode);
	return err;
}

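/*
 * LUO .freeze callback: the file position may have changed since .preserve,
 * so refresh it in the serialization structure right before kexec.
 */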
static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_ser *ser;

	if (WARN_ON_ONCE(!args->serialized_data))
		return -EINVAL;

	ser = phys_to_virt(args->serialized_data);

	/*
	 * The pos might have changed since prepare. Everything else stays the
	 * same.
	 */
	ser->pos = args->file->f_pos;

	return 0;
}

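/*
 * LUO .unpreserve callback: unfreeze the inode, unpreserve the folios and
 * free the serialization structure, undoing memfd_luo_preserve().
 */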
static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
{
	struct inode *inode = file_inode(args->file);
	struct memfd_luo_ser *ser;

	if (WARN_ON_ONCE(!args->serialized_data))
		return;

	inode_lock(inode);
	shmem_freeze(inode, false);

	ser = phys_to_virt(args->serialized_data);

	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
				    ser->nr_folios);

	kho_unpreserve_free(ser);
	inode_unlock(inode);
}

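/*
 * Restore each preserved folio from KHO only to drop the reference on it,
 * releasing the memory of a memfd that was preserved but never retrieved.
 */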
static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	u64 i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio;
		phys_addr_t phys;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
					    phys);
			continue;
		}

		folio_put(folio);
	}
}

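/*
 * LUO .finish callback: if the file was never retrieved in the new kernel,
 * discard the preserved folios and free the serialization data so the
 * preserved memory is not leaked.
 */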
static void memfd_luo_finish(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;

	if (args->retrieved)
		return;

	ser = phys_to_virt(args->serialized_data);
	if (!ser)
		return;

	if (ser->nr_folios) {
		folios_ser = kho_restore_vmalloc(&ser->folios);
		if (!folios_ser)
			goto out;

		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
		vfree(folios_ser);
	}

out:
	kho_restore_free(ser);
}

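/*
 * Restore each preserved folio from KHO and insert it into the page cache of
 * the new shmem file at its original index, re-applying the uptodate/dirty
 * state and charging the memcg and shmem block accounting.
 */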
static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	int err = -EIO;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		u64 index;
		int flags;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_err("Unable to restore folio at physical address: %llx\n",
			       phys);
			goto put_folios;
		}
		index = pfolio->index;
		flags = pfolio->flags;

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		err = shmem_inode_acct_blocks(inode, 1);
		if (err) {
			pr_err("shmem: failed to account folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		shmem_recalc_inode(inode, 1, 0);
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	return 0;

unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];

		folio = kho_restore_folio(PFN_PHYS(pfolio->pfn));
		if (folio)
			folio_put(folio);
	}

	return err;
}

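/*
 * LUO .retrieve callback: recreate the memfd in the new kernel. Set up a new
 * shmem file, apply the saved size and file position, and re-insert the
 * preserved folios into it.
 */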
static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;
	struct file *file;
	int err;

	ser = phys_to_virt(args->serialized_data);
	if (!ser)
		return -EINVAL;

	file = shmem_file_setup("", 0, VM_NORESERVE);
	if (IS_ERR(file)) {
		pr_err("failed to setup file: %pe\n", file);
		return PTR_ERR(file);
	}

	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
	file->f_inode->i_size = ser->size;

	if (ser->nr_folios) {
		folios_ser = kho_restore_vmalloc(&ser->folios);
		if (!folios_ser) {
			err = -EINVAL;
			goto put_file;
		}

		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
		vfree(folios_ser);
		if (err)
			goto put_file;
	}

	args->file = file;
	kho_restore_free(ser);

	return 0;

put_file:
	fput(file);

	return err;
}

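/*
 * Only unlinked shmem-backed files, i.e. memfds, can be preserved. Hugetlb
 * memfds are not shmem files, so they are rejected here.
 */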
static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
				   struct file *file)
{
	struct inode *inode = file_inode(file);

	return shmem_file(file) && !inode->i_nlink;
}

static const struct liveupdate_file_ops memfd_luo_file_ops = {
	.freeze = memfd_luo_freeze,
	.finish = memfd_luo_finish,
	.retrieve = memfd_luo_retrieve,
	.preserve = memfd_luo_preserve,
	.unpreserve = memfd_luo_unpreserve,
	.can_preserve = memfd_luo_can_preserve,
	.owner = THIS_MODULE,
};

static struct liveupdate_file_handler memfd_luo_handler = {
	.ops = &memfd_luo_file_ops,
	.compatible = MEMFD_LUO_FH_COMPATIBLE,
};

static int __init memfd_luo_init(void)
{
	int err = liveupdate_register_file_handler(&memfd_luo_handler);

	if (err && err != -EOPNOTSUPP) {
		pr_err("Could not register luo filesystem handler: %pe\n",
		       ERR_PTR(err));

		return err;
	}

	return 0;
}
late_initcall(memfd_luo_init);