xref: /linux/mm/memfd_luo.c (revision 5ea5880764cbb164afb17a62e76ca75dc371409d)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright (c) 2025, Google LLC.
5  * Pasha Tatashin <pasha.tatashin@soleen.com>
6  *
7  * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
8  * Pratyush Yadav <ptyadav@amazon.de>
9  */
10 
11 /**
12  * DOC: Memfd Preservation via LUO
13  *
14  * Overview
15  * ========
16  *
17  * Memory file descriptors (memfd) can be preserved over a kexec using the Live
18  * Update Orchestrator (LUO) file preservation. This allows userspace to
19  * transfer its memory contents to the next kernel after a kexec.
20  *
21  * The preservation is not intended to be transparent. Only select properties of
22  * the file are preserved. All others are reset to default. The preserved
23  * properties are described below.
24  *
25  * .. note::
26  *    The LUO API is not stabilized yet, so the preserved properties of a memfd
27  *    are also not stable and are subject to backwards incompatible changes.
28  *
29  * .. note::
30  *    Currently a memfd backed by Hugetlb is not supported. Memfds created
31  *    with ``MFD_HUGETLB`` will be rejected.
32  *
33  * Preserved Properties
34  * ====================
35  *
36  * The following properties of the memfd are preserved across kexec:
37  *
38  * File Contents
39  *   All data stored in the file is preserved.
40  *
41  * File Size
42  *   The size of the file is preserved. Holes in the file are filled by
43  *   allocating pages for them during preservation.
44  *
45  * File Position
46  *   The current file position is preserved, allowing applications to continue
47  *   reading/writing from their last position.
48  *
49  * File Status Flags
50  *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
51  *   is maintained.
52  *
53  * Non-Preserved Properties
54  * ========================
55  *
 * All properties which are not preserved must be assumed to be reset to
 * their defaults. This section highlights some of the non-preserved
 * properties that are particularly worth noting.
59  *
60  * ``FD_CLOEXEC`` flag
61  *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
62  *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
63  *   again after restore via ``fcntl()``.
64  *
65  * Seals
66  *   File seals are not preserved. The file is unsealed on restore and if
67  *   needed, must be sealed again via ``fcntl()``.
68  */
69 
70 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
71 
72 #include <linux/bits.h>
73 #include <linux/err.h>
74 #include <linux/file.h>
75 #include <linux/io.h>
76 #include <linux/kexec_handover.h>
77 #include <linux/kho/abi/memfd.h>
78 #include <linux/liveupdate.h>
79 #include <linux/shmem_fs.h>
80 #include <linux/vmalloc.h>
81 #include <linux/memfd.h>
82 #include <uapi/linux/memfd.h>
83 
84 #include "internal.h"
85 
/**
 * memfd_luo_preserve_folios - pin and KHO-preserve all folios of a memfd
 * @file: memfd whose contents are being preserved
 * @kho_vmalloc: KHO descriptor that will track the serialized folio array
 * @out_folios_ser: on success, set to the vmalloc'ed serialization array, or
 *                  NULL for a zero-size file; the array itself is preserved
 *                  via KHO and must not be freed on the success path
 * @nr_foliosp: on success, set to the number of entries in *@out_folios_ser
 *
 * Pins every folio of @file (allocating folios for holes as a side effect of
 * memfd_pin_folios()), marks each one dirty and uptodate, and registers it
 * with KHO so the contents survive kexec.
 *
 * Context: called by memfd_luo_preserve() with the inode locked and frozen;
 *          takes only per-folio locks itself.
 * Return: 0 on success, negative errno on failure. On failure, all folios
 *         pinned or preserved so far are released again.
 */
static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *folios_ser;
	unsigned int max_folios;
	long i, size, nr_pinned;
	struct folio **folios;
	int err = -EINVAL;
	pgoff_t offset;
	u64 nr_folios;

	size = i_size_read(inode);
	/*
	 * If the file has zero size, then the folios and nr_folios properties
	 * are not set.
	 */
	if (!size) {
		*nr_foliosp = 0;
		*out_folios_ser = NULL;
		return 0;
	}

	/*
	 * Guess the number of folios based on inode size. Real number might end
	 * up being smaller if there are higher order folios.
	 */
	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
	folios = kvmalloc_objs(*folios, max_folios);
	if (!folios)
		return -ENOMEM;

	/*
	 * Pin the folios so they don't move around behind our back. This also
	 * ensures none of the folios are in CMA -- which ensures they don't
	 * fall in KHO scratch memory. It also moves swapped out folios back to
	 * memory.
	 *
	 * A side effect of doing this is that it allocates a folio for all
	 * indices in the file. This might waste memory on sparse memfds. If
	 * that is really a problem in the future, we can have a
	 * memfd_pin_folios() variant that does not allocate a page on empty
	 * slots.
	 */
	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
				     &offset);
	if (nr_pinned < 0) {
		err = nr_pinned;
		pr_err("failed to pin folios: %d\n", err);
		goto err_free_folios;
	}
	nr_folios = nr_pinned;

	/* One serialized record per pinned folio, in plain vmalloc memory. */
	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
	if (!folios_ser) {
		err = -ENOMEM;
		goto err_unpin;
	}

	for (i = 0; i < nr_folios; i++) {
		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio = folios[i];

		err = kho_preserve_folio(folio);
		if (err)
			goto err_unpreserve;

		folio_lock(folio);

		/*
		 * A dirty folio is one which has been written to. A clean folio
		 * is its opposite. Since a clean folio does not carry user
		 * data, it can be freed by page reclaim under memory pressure.
		 *
		 * Saving the dirty flag at prepare() time doesn't work since it
		 * can change later. Saving it at freeze() also won't work
		 * because the dirty bit is normally synced at unmap and there
		 * might still be a mapping of the file at freeze().
		 *
		 * To see why this is a problem, say a folio is clean at
		 * preserve, but gets dirtied later. The pfolio flags will mark
		 * it as clean. After retrieve, the next kernel might try to
		 * reclaim this folio under memory pressure, losing user data.
		 *
		 * Unconditionally mark it dirty to avoid this problem. This
		 * comes at the cost of making clean folios un-reclaimable after
		 * live update.
		 */
		folio_mark_dirty(folio);

		/*
		 * If the folio is not uptodate, it was fallocated but never
		 * used. Saving this flag at prepare() doesn't work since it
		 * might change later when someone uses the folio.
		 *
		 * Since we have taken the performance penalty of allocating,
		 * zeroing, and pinning all the folios in the holes, take a bit
		 * more and zero all non-uptodate folios too.
		 *
		 * NOTE: For someone looking to improve preserve performance,
		 * this is a good place to look.
		 */
		if (!folio_test_uptodate(folio)) {
			folio_zero_range(folio, 0, folio_size(folio));
			flush_dcache_folio(folio);
			folio_mark_uptodate(folio);
		}

		folio_unlock(folio);

		pfolio->pfn = folio_pfn(folio);
		pfolio->flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE;
		pfolio->index = folio->index;
	}

	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
	if (err)
		goto err_unpreserve;

	kvfree(folios);
	*nr_foliosp = nr_folios;
	*out_folios_ser = folios_ser;

	/*
	 * Note: folios_ser is purposely not freed here. It is preserved
	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
	 * that is passed via private_data.
	 */
	return 0;

err_unpreserve:
	/*
	 * When we arrive here from a kho_preserve_folio() failure, folio i
	 * itself was never preserved; in all cases unwind only entries
	 * [0, i).
	 */
	for (i = i - 1; i >= 0; i--)
		kho_unpreserve_folio(folios[i]);
	vfree(folios_ser);
err_unpin:
	unpin_folios(folios, nr_folios);
err_free_folios:
	kvfree(folios);

	return err;
}
229 
230 static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
231 					struct memfd_luo_folio_ser *folios_ser,
232 					u64 nr_folios)
233 {
234 	long i;
235 
236 	if (!nr_folios)
237 		return;
238 
239 	kho_unpreserve_vmalloc(kho_vmalloc);
240 
241 	for (i = 0; i < nr_folios; i++) {
242 		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
243 		struct folio *folio;
244 
245 		if (!pfolio->pfn)
246 			continue;
247 
248 		folio = pfn_folio(pfolio->pfn);
249 
250 		kho_unpreserve_folio(folio);
251 		unpin_folio(folio);
252 	}
253 
254 	vfree(folios_ser);
255 }
256 
/**
 * memfd_luo_preserve - LUO preserve callback for a memfd
 * @args: operation arguments from the LUO core
 *
 * Serializes the preservable properties of the memfd (file position, size,
 * seals, contents) into KHO-preserved memory. On success the physical
 * address of the serialization structure is stored in
 * @args->serialized_data, the vmap pointer of the folio array in
 * @args->private_data, and the inode is left frozen (undone later by
 * memfd_luo_unpreserve()).
 *
 * Return: 0 on success, negative errno on failure; on failure the inode is
 *         unfrozen again and nothing remains preserved.
 */
static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
{
	struct inode *inode = file_inode(args->file);
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;
	u64 nr_folios;
	int err = 0, seals;

	/* Lock and freeze the inode so the state being serialized is stable. */
	inode_lock(inode);
	shmem_freeze(inode, true);

	/* Allocate the main serialization structure in preserved memory */
	ser = kho_alloc_preserve(sizeof(*ser));
	if (IS_ERR(ser)) {
		err = PTR_ERR(ser);
		goto err_unlock;
	}

	seals = memfd_get_seals(args->file);
	if (seals < 0) {
		err = seals;
		goto err_free_ser;
	}

	/* Make sure the file only has the seals supported by this version. */
	if (seals & ~MEMFD_LUO_ALL_SEALS) {
		err = -EOPNOTSUPP;
		goto err_free_ser;
	}

	ser->pos = args->file->f_pos;
	ser->size = i_size_read(inode);
	ser->seals = seals;

	err = memfd_luo_preserve_folios(args->file, &ser->folios,
					&folios_ser, &nr_folios);
	if (err)
		goto err_free_ser;

	ser->nr_folios = nr_folios;
	/* Success: unlock, but deliberately leave the inode frozen. */
	inode_unlock(inode);

	args->private_data = folios_ser;
	args->serialized_data = virt_to_phys(ser);

	return 0;

err_free_ser:
	kho_unpreserve_free(ser);
err_unlock:
	shmem_freeze(inode, false);
	inode_unlock(inode);
	return err;
}
311 
312 static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
313 {
314 	struct memfd_luo_ser *ser;
315 
316 	if (WARN_ON_ONCE(!args->serialized_data))
317 		return -EINVAL;
318 
319 	ser = phys_to_virt(args->serialized_data);
320 
321 	/*
322 	 * The pos might have changed since prepare. Everything else stays the
323 	 * same.
324 	 */
325 	ser->pos = args->file->f_pos;
326 
327 	return 0;
328 }
329 
330 static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
331 {
332 	struct inode *inode = file_inode(args->file);
333 	struct memfd_luo_ser *ser;
334 
335 	if (WARN_ON_ONCE(!args->serialized_data))
336 		return;
337 
338 	inode_lock(inode);
339 	shmem_freeze(inode, false);
340 
341 	ser = phys_to_virt(args->serialized_data);
342 
343 	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
344 				    ser->nr_folios);
345 
346 	kho_unpreserve_free(ser);
347 	inode_unlock(inode);
348 }
349 
350 static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
351 				     u64 nr_folios)
352 {
353 	u64 i;
354 
355 	for (i = 0; i < nr_folios; i++) {
356 		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
357 		struct folio *folio;
358 		phys_addr_t phys;
359 
360 		if (!pfolio->pfn)
361 			continue;
362 
363 		phys = PFN_PHYS(pfolio->pfn);
364 		folio = kho_restore_folio(phys);
365 		if (!folio) {
366 			pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
367 					    phys);
368 			continue;
369 		}
370 
371 		folio_put(folio);
372 	}
373 }
374 
375 static void memfd_luo_finish(struct liveupdate_file_op_args *args)
376 {
377 	struct memfd_luo_folio_ser *folios_ser;
378 	struct memfd_luo_ser *ser;
379 
380 	/*
381 	 * If retrieve was successful, nothing to do. If it failed, retrieve()
382 	 * already cleaned up everything it could. So nothing to do there
383 	 * either. Only need to clean up when retrieve was not called.
384 	 */
385 	if (args->retrieve_status)
386 		return;
387 
388 	ser = phys_to_virt(args->serialized_data);
389 	if (!ser)
390 		return;
391 
392 	if (ser->nr_folios) {
393 		folios_ser = kho_restore_vmalloc(&ser->folios);
394 		if (!folios_ser)
395 			goto out;
396 
397 		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
398 		vfree(folios_ser);
399 	}
400 
401 out:
402 	kho_restore_free(ser);
403 }
404 
/**
 * memfd_luo_retrieve_folios - insert preserved folios into a fresh memfd
 * @file: newly created memfd that receives the folios
 * @folios_ser: serialized folio array restored from KHO
 * @nr_folios: number of entries in @folios_ser
 *
 * Restores each preserved folio from KHO and adds it to @file's page cache
 * at its original index, re-applying the serialized dirty/uptodate flags,
 * charging the memory to the current cgroup and accounting the blocks to
 * the inode.
 *
 * Return: 0 on success, negative errno on failure. On failure, folios
 *         already added to the file are left in place (they are freed with
 *         the file); folios not yet processed are restored and released.
 */
static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	long npages, nr_added_pages = 0;
	int err = -EIO;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		u64 index;
		int flags;

		/* Entries without a PFN carry no folio; skip them. */
		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_err("Unable to restore folio at physical address: %llx\n",
			       phys);
			goto put_folios;
		}
		index = pfolio->index;
		flags = pfolio->flags;

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		/* Re-apply the flags captured at preserve time. */
		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		npages = folio_nr_pages(folio);
		err = shmem_inode_acct_blocks(inode, npages);
		if (err) {
			pr_err("shmem: failed to account folio index %ld(%ld pages): %d\n",
			       i, npages, err);
			goto remove_from_cache;
		}

		nr_added_pages += npages;
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	shmem_recalc_inode(inode, nr_added_pages, 0);

	return 0;

remove_from_cache:
	/* The folio is still locked here, as filemap_remove_folio() requires. */
	filemap_remove_folio(folio);
unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
		phys_addr_t phys;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (folio)
			folio_put(folio);
	}

	/* Account whatever was successfully added before the failure. */
	shmem_recalc_inode(inode, nr_added_pages, 0);

	return err;
}
504 
505 static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
506 {
507 	struct memfd_luo_folio_ser *folios_ser;
508 	struct memfd_luo_ser *ser;
509 	struct file *file;
510 	int err;
511 
512 	ser = phys_to_virt(args->serialized_data);
513 	if (!ser)
514 		return -EINVAL;
515 
516 	/* Make sure the file only has seals supported by this version. */
517 	if (ser->seals & ~MEMFD_LUO_ALL_SEALS) {
518 		err = -EOPNOTSUPP;
519 		goto free_ser;
520 	}
521 
522 	/*
523 	 * The seals are preserved. Allow sealing here so they can be added
524 	 * later.
525 	 */
526 	file = memfd_alloc_file("", MFD_ALLOW_SEALING);
527 	if (IS_ERR(file)) {
528 		pr_err("failed to setup file: %pe\n", file);
529 		err = PTR_ERR(file);
530 		goto free_ser;
531 	}
532 
533 	err = memfd_add_seals(file, ser->seals);
534 	if (err) {
535 		pr_err("failed to add seals: %pe\n", ERR_PTR(err));
536 		goto put_file;
537 	}
538 
539 	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
540 	i_size_write(file_inode(file), ser->size);
541 
542 	if (ser->nr_folios) {
543 		folios_ser = kho_restore_vmalloc(&ser->folios);
544 		if (!folios_ser) {
545 			err = -EINVAL;
546 			goto put_file;
547 		}
548 
549 		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
550 		vfree(folios_ser);
551 		if (err)
552 			goto put_file;
553 	}
554 
555 	args->file = file;
556 	kho_restore_free(ser);
557 
558 	return 0;
559 
560 put_file:
561 	fput(file);
562 free_ser:
563 	kho_restore_free(ser);
564 	return err;
565 }
566 
567 static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
568 				   struct file *file)
569 {
570 	struct inode *inode = file_inode(file);
571 
572 	return shmem_file(file) && !inode->i_nlink;
573 }
574 
/* Use the inode pointer, unique for the file's lifetime, as the LUO ID. */
static unsigned long memfd_luo_get_id(struct file *file)
{
	struct inode *inode = file_inode(file);

	return (unsigned long)inode;
}
579 
/*
 * Callbacks hooking memfd serialization into the LUO file-preservation
 * framework.
 */
static const struct liveupdate_file_ops memfd_luo_file_ops = {
	.freeze = memfd_luo_freeze,
	.finish = memfd_luo_finish,
	.retrieve = memfd_luo_retrieve,
	.preserve = memfd_luo_preserve,
	.unpreserve = memfd_luo_unpreserve,
	.can_preserve = memfd_luo_can_preserve,
	.get_id = memfd_luo_get_id,
	.owner = THIS_MODULE,
};
590 
/*
 * Handler registered with the LUO core; .compatible identifies the
 * serialization ABI so the next kernel picks the matching handler.
 */
static struct liveupdate_file_handler memfd_luo_handler = {
	.ops = &memfd_luo_file_ops,
	.compatible = MEMFD_LUO_FH_COMPATIBLE,
};
595 
596 static int __init memfd_luo_init(void)
597 {
598 	int err = liveupdate_register_file_handler(&memfd_luo_handler);
599 
600 	if (err && err != -EOPNOTSUPP) {
601 		pr_err("Could not register luo filesystem handler: %pe\n",
602 		       ERR_PTR(err));
603 
604 		return err;
605 	}
606 
607 	return 0;
608 }
609 late_initcall(memfd_luo_init);
610