xref: /linux/mm/memfd_luo.c (revision 1fd1dc41724319406b0aff221a352a400b0ddfc5)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright (c) 2025, Google LLC.
5  * Pasha Tatashin <pasha.tatashin@soleen.com>
6  *
7  * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
8  * Pratyush Yadav <ptyadav@amazon.de>
9  */
10 
11 /**
12  * DOC: Memfd Preservation via LUO
13  *
14  * Overview
15  * ========
16  *
17  * Memory file descriptors (memfd) can be preserved over a kexec using the Live
18  * Update Orchestrator (LUO) file preservation. This allows userspace to
19  * transfer its memory contents to the next kernel after a kexec.
20  *
21  * The preservation is not intended to be transparent. Only select properties of
22  * the file are preserved. All others are reset to default. The preserved
23  * properties are described below.
24  *
25  * .. note::
26  *    The LUO API is not stabilized yet, so the preserved properties of a memfd
27  *    are also not stable and are subject to backwards incompatible changes.
28  *
29  * .. note::
30  *    Currently a memfd backed by Hugetlb is not supported. Memfds created
31  *    with ``MFD_HUGETLB`` will be rejected.
32  *
33  * Preserved Properties
34  * ====================
35  *
36  * The following properties of the memfd are preserved across kexec:
37  *
38  * File Contents
39  *   All data stored in the file is preserved.
40  *
41  * File Size
42  *   The size of the file is preserved. Holes in the file are filled by
43  *   allocating pages for them during preservation.
44  *
45  * File Position
46  *   The current file position is preserved, allowing applications to continue
47  *   reading/writing from their last position.
48  *
49  * File Status Flags
50  *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
51  *   is maintained.
52  *
53  * Non-Preserved Properties
54  * ========================
55  *
56  * All properties which are not preserved must be assumed to be reset to
57  * default. This section describes some of those properties which may be more of
58  * note.
59  *
60  * ``FD_CLOEXEC`` flag
61  *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
62  *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
63  *   again after restore via ``fcntl()``.
64  *
65  * Seals
66  *   File seals are not preserved. The file is unsealed on restore and if
67  *   needed, must be sealed again via ``fcntl()``.
68  */
69 
70 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
71 
72 #include <linux/bits.h>
73 #include <linux/err.h>
74 #include <linux/file.h>
75 #include <linux/io.h>
76 #include <linux/kexec_handover.h>
77 #include <linux/kho/abi/memfd.h>
78 #include <linux/liveupdate.h>
79 #include <linux/shmem_fs.h>
80 #include <linux/vmalloc.h>
81 #include <linux/memfd.h>
82 #include "internal.h"
83 
84 static int memfd_luo_preserve_folios(struct file *file,
85 				     struct kho_vmalloc *kho_vmalloc,
86 				     struct memfd_luo_folio_ser **out_folios_ser,
87 				     u64 *nr_foliosp)
88 {
89 	struct inode *inode = file_inode(file);
90 	struct memfd_luo_folio_ser *folios_ser;
91 	unsigned int max_folios;
92 	long i, size, nr_pinned;
93 	struct folio **folios;
94 	int err = -EINVAL;
95 	pgoff_t offset;
96 	u64 nr_folios;
97 
98 	size = i_size_read(inode);
99 	/*
100 	 * If the file has zero size, then the folios and nr_folios properties
101 	 * are not set.
102 	 */
103 	if (!size) {
104 		*nr_foliosp = 0;
105 		*out_folios_ser = NULL;
106 		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
107 		return 0;
108 	}
109 
110 	/*
111 	 * Guess the number of folios based on inode size. Real number might end
112 	 * up being smaller if there are higher order folios.
113 	 */
114 	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
115 	folios = kvmalloc_objs(*folios, max_folios);
116 	if (!folios)
117 		return -ENOMEM;
118 
119 	/*
120 	 * Pin the folios so they don't move around behind our back. This also
121 	 * ensures none of the folios are in CMA -- which ensures they don't
122 	 * fall in KHO scratch memory. It also moves swapped out folios back to
123 	 * memory.
124 	 *
125 	 * A side effect of doing this is that it allocates a folio for all
126 	 * indices in the file. This might waste memory on sparse memfds. If
127 	 * that is really a problem in the future, we can have a
128 	 * memfd_pin_folios() variant that does not allocate a page on empty
129 	 * slots.
130 	 */
131 	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
132 				     &offset);
133 	if (nr_pinned < 0) {
134 		err = nr_pinned;
135 		pr_err("failed to pin folios: %d\n", err);
136 		goto err_free_folios;
137 	}
138 	nr_folios = nr_pinned;
139 
140 	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
141 	if (!folios_ser) {
142 		err = -ENOMEM;
143 		goto err_unpin;
144 	}
145 
146 	for (i = 0; i < nr_folios; i++) {
147 		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
148 		struct folio *folio = folios[i];
149 		unsigned int flags = 0;
150 
151 		err = kho_preserve_folio(folio);
152 		if (err)
153 			goto err_unpreserve;
154 
155 		if (folio_test_dirty(folio))
156 			flags |= MEMFD_LUO_FOLIO_DIRTY;
157 		if (folio_test_uptodate(folio))
158 			flags |= MEMFD_LUO_FOLIO_UPTODATE;
159 
160 		pfolio->pfn = folio_pfn(folio);
161 		pfolio->flags = flags;
162 		pfolio->index = folio->index;
163 	}
164 
165 	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
166 	if (err)
167 		goto err_unpreserve;
168 
169 	kvfree(folios);
170 	*nr_foliosp = nr_folios;
171 	*out_folios_ser = folios_ser;
172 
173 	/*
174 	 * Note: folios_ser is purposely not freed here. It is preserved
175 	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
176 	 * that is passed via private_data.
177 	 */
178 	return 0;
179 
180 err_unpreserve:
181 	for (i = i - 1; i >= 0; i--)
182 		kho_unpreserve_folio(folios[i]);
183 	vfree(folios_ser);
184 err_unpin:
185 	unpin_folios(folios, nr_folios);
186 err_free_folios:
187 	kvfree(folios);
188 
189 	return err;
190 }
191 
192 static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
193 					struct memfd_luo_folio_ser *folios_ser,
194 					u64 nr_folios)
195 {
196 	long i;
197 
198 	if (!nr_folios)
199 		return;
200 
201 	kho_unpreserve_vmalloc(kho_vmalloc);
202 
203 	for (i = 0; i < nr_folios; i++) {
204 		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
205 		struct folio *folio;
206 
207 		if (!pfolio->pfn)
208 			continue;
209 
210 		folio = pfn_folio(pfolio->pfn);
211 
212 		kho_unpreserve_folio(folio);
213 		unpin_folio(folio);
214 	}
215 
216 	vfree(folios_ser);
217 }
218 
219 static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
220 {
221 	struct inode *inode = file_inode(args->file);
222 	struct memfd_luo_folio_ser *folios_ser;
223 	struct memfd_luo_ser *ser;
224 	u64 nr_folios;
225 	int err = 0;
226 
227 	inode_lock(inode);
228 	shmem_freeze(inode, true);
229 
230 	/* Allocate the main serialization structure in preserved memory */
231 	ser = kho_alloc_preserve(sizeof(*ser));
232 	if (IS_ERR(ser)) {
233 		err = PTR_ERR(ser);
234 		goto err_unlock;
235 	}
236 
237 	ser->pos = args->file->f_pos;
238 	ser->size = i_size_read(inode);
239 
240 	err = memfd_luo_preserve_folios(args->file, &ser->folios,
241 					&folios_ser, &nr_folios);
242 	if (err)
243 		goto err_free_ser;
244 
245 	ser->nr_folios = nr_folios;
246 	inode_unlock(inode);
247 
248 	args->private_data = folios_ser;
249 	args->serialized_data = virt_to_phys(ser);
250 
251 	return 0;
252 
253 err_free_ser:
254 	kho_unpreserve_free(ser);
255 err_unlock:
256 	shmem_freeze(inode, false);
257 	inode_unlock(inode);
258 	return err;
259 }
260 
261 static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
262 {
263 	struct memfd_luo_ser *ser;
264 
265 	if (WARN_ON_ONCE(!args->serialized_data))
266 		return -EINVAL;
267 
268 	ser = phys_to_virt(args->serialized_data);
269 
270 	/*
271 	 * The pos might have changed since prepare. Everything else stays the
272 	 * same.
273 	 */
274 	ser->pos = args->file->f_pos;
275 
276 	return 0;
277 }
278 
279 static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
280 {
281 	struct inode *inode = file_inode(args->file);
282 	struct memfd_luo_ser *ser;
283 
284 	if (WARN_ON_ONCE(!args->serialized_data))
285 		return;
286 
287 	inode_lock(inode);
288 	shmem_freeze(inode, false);
289 
290 	ser = phys_to_virt(args->serialized_data);
291 
292 	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
293 				    ser->nr_folios);
294 
295 	kho_unpreserve_free(ser);
296 	inode_unlock(inode);
297 }
298 
299 static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
300 				     u64 nr_folios)
301 {
302 	u64 i;
303 
304 	for (i = 0; i < nr_folios; i++) {
305 		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
306 		struct folio *folio;
307 		phys_addr_t phys;
308 
309 		if (!pfolio->pfn)
310 			continue;
311 
312 		phys = PFN_PHYS(pfolio->pfn);
313 		folio = kho_restore_folio(phys);
314 		if (!folio) {
315 			pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
316 					    phys);
317 			continue;
318 		}
319 
320 		folio_put(folio);
321 	}
322 }
323 
324 static void memfd_luo_finish(struct liveupdate_file_op_args *args)
325 {
326 	struct memfd_luo_folio_ser *folios_ser;
327 	struct memfd_luo_ser *ser;
328 
329 	/*
330 	 * If retrieve was successful, nothing to do. If it failed, retrieve()
331 	 * already cleaned up everything it could. So nothing to do there
332 	 * either. Only need to clean up when retrieve was not called.
333 	 */
334 	if (args->retrieve_status)
335 		return;
336 
337 	ser = phys_to_virt(args->serialized_data);
338 	if (!ser)
339 		return;
340 
341 	if (ser->nr_folios) {
342 		folios_ser = kho_restore_vmalloc(&ser->folios);
343 		if (!folios_ser)
344 			goto out;
345 
346 		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
347 		vfree(folios_ser);
348 	}
349 
350 out:
351 	kho_restore_free(ser);
352 }
353 
354 static int memfd_luo_retrieve_folios(struct file *file,
355 				     struct memfd_luo_folio_ser *folios_ser,
356 				     u64 nr_folios)
357 {
358 	struct inode *inode = file_inode(file);
359 	struct address_space *mapping = inode->i_mapping;
360 	struct folio *folio;
361 	int err = -EIO;
362 	long i;
363 
364 	for (i = 0; i < nr_folios; i++) {
365 		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
366 		phys_addr_t phys;
367 		u64 index;
368 		int flags;
369 
370 		if (!pfolio->pfn)
371 			continue;
372 
373 		phys = PFN_PHYS(pfolio->pfn);
374 		folio = kho_restore_folio(phys);
375 		if (!folio) {
376 			pr_err("Unable to restore folio at physical address: %llx\n",
377 			       phys);
378 			goto put_folios;
379 		}
380 		index = pfolio->index;
381 		flags = pfolio->flags;
382 
383 		/* Set up the folio for insertion. */
384 		__folio_set_locked(folio);
385 		__folio_set_swapbacked(folio);
386 
387 		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
388 		if (err) {
389 			pr_err("shmem: failed to charge folio index %ld: %d\n",
390 			       i, err);
391 			goto unlock_folio;
392 		}
393 
394 		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
395 					      mapping_gfp_mask(mapping));
396 		if (err) {
397 			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
398 			       i, err);
399 			goto unlock_folio;
400 		}
401 
402 		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
403 			folio_mark_uptodate(folio);
404 		if (flags & MEMFD_LUO_FOLIO_DIRTY)
405 			folio_mark_dirty(folio);
406 
407 		err = shmem_inode_acct_blocks(inode, 1);
408 		if (err) {
409 			pr_err("shmem: failed to account folio index %ld: %d\n",
410 			       i, err);
411 			goto unlock_folio;
412 		}
413 
414 		shmem_recalc_inode(inode, 1, 0);
415 		folio_add_lru(folio);
416 		folio_unlock(folio);
417 		folio_put(folio);
418 	}
419 
420 	return 0;
421 
422 unlock_folio:
423 	folio_unlock(folio);
424 	folio_put(folio);
425 put_folios:
426 	/*
427 	 * Note: don't free the folios already added to the file. They will be
428 	 * freed when the file is freed. Free the ones not added yet here.
429 	 */
430 	for (long j = i + 1; j < nr_folios; j++) {
431 		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
432 
433 		folio = kho_restore_folio(pfolio->pfn);
434 		if (folio)
435 			folio_put(folio);
436 	}
437 
438 	return err;
439 }
440 
441 static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
442 {
443 	struct memfd_luo_folio_ser *folios_ser;
444 	struct memfd_luo_ser *ser;
445 	struct file *file;
446 	int err;
447 
448 	ser = phys_to_virt(args->serialized_data);
449 	if (!ser)
450 		return -EINVAL;
451 
452 	file = memfd_alloc_file("", 0);
453 	if (IS_ERR(file)) {
454 		pr_err("failed to setup file: %pe\n", file);
455 		err = PTR_ERR(file);
456 		goto free_ser;
457 	}
458 
459 	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
460 	file->f_inode->i_size = ser->size;
461 
462 	if (ser->nr_folios) {
463 		folios_ser = kho_restore_vmalloc(&ser->folios);
464 		if (!folios_ser) {
465 			err = -EINVAL;
466 			goto put_file;
467 		}
468 
469 		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
470 		vfree(folios_ser);
471 		if (err)
472 			goto put_file;
473 	}
474 
475 	args->file = file;
476 	kho_restore_free(ser);
477 
478 	return 0;
479 
480 put_file:
481 	fput(file);
482 free_ser:
483 	kho_restore_free(ser);
484 	return err;
485 }
486 
487 static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
488 				   struct file *file)
489 {
490 	struct inode *inode = file_inode(file);
491 
492 	return shmem_file(file) && !inode->i_nlink;
493 }
494 
495 static const struct liveupdate_file_ops memfd_luo_file_ops = {
496 	.freeze = memfd_luo_freeze,
497 	.finish = memfd_luo_finish,
498 	.retrieve = memfd_luo_retrieve,
499 	.preserve = memfd_luo_preserve,
500 	.unpreserve = memfd_luo_unpreserve,
501 	.can_preserve = memfd_luo_can_preserve,
502 	.owner = THIS_MODULE,
503 };
504 
505 static struct liveupdate_file_handler memfd_luo_handler = {
506 	.ops = &memfd_luo_file_ops,
507 	.compatible = MEMFD_LUO_FH_COMPATIBLE,
508 };
509 
510 static int __init memfd_luo_init(void)
511 {
512 	int err = liveupdate_register_file_handler(&memfd_luo_handler);
513 
514 	if (err && err != -EOPNOTSUPP) {
515 		pr_err("Could not register luo filesystem handler: %pe\n",
516 		       ERR_PTR(err));
517 
518 		return err;
519 	}
520 
521 	return 0;
522 }
523 late_initcall(memfd_luo_init);
524