xref: /linux/mm/memfd_luo.c (revision 509d3f45847627f4c5cdce004c3ec79262b5239c)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright (c) 2025, Google LLC.
5  * Pasha Tatashin <pasha.tatashin@soleen.com>
6  *
7  * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
8  * Pratyush Yadav <ptyadav@amazon.de>
9  */
10 
11 /**
12  * DOC: Memfd Preservation via LUO
13  *
14  * Overview
15  * ========
16  *
17  * Memory file descriptors (memfd) can be preserved over a kexec using the Live
18  * Update Orchestrator (LUO) file preservation. This allows userspace to
19  * transfer its memory contents to the next kernel after a kexec.
20  *
21  * The preservation is not intended to be transparent. Only select properties of
22  * the file are preserved. All others are reset to default. The preserved
23  * properties are described below.
24  *
25  * .. note::
26  *    The LUO API is not stabilized yet, so the preserved properties of a memfd
27  *    are also not stable and are subject to backwards incompatible changes.
28  *
29  * .. note::
30  *    Currently a memfd backed by Hugetlb is not supported. Memfds created
31  *    with ``MFD_HUGETLB`` will be rejected.
32  *
33  * Preserved Properties
34  * ====================
35  *
36  * The following properties of the memfd are preserved across kexec:
37  *
38  * File Contents
39  *   All data stored in the file is preserved.
40  *
41  * File Size
42  *   The size of the file is preserved. Holes in the file are filled by
43  *   allocating pages for them during preservation.
44  *
45  * File Position
46  *   The current file position is preserved, allowing applications to continue
47  *   reading/writing from their last position.
48  *
49  * File Status Flags
50  *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
51  *   is maintained.
52  *
53  * Non-Preserved Properties
54  * ========================
55  *
56  * All properties which are not preserved must be assumed to be reset to
57  * default. This section describes some of those properties which may be more of
58  * note.
59  *
60  * ``FD_CLOEXEC`` flag
61  *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
62  *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
63  *   again after restore via ``fcntl()``.
64  *
65  * Seals
66  *   File seals are not preserved. The file is unsealed on restore and if
67  *   needed, must be sealed again via ``fcntl()``.
68  */
69 
70 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
71 
72 #include <linux/bits.h>
73 #include <linux/err.h>
74 #include <linux/file.h>
75 #include <linux/io.h>
76 #include <linux/kexec_handover.h>
77 #include <linux/kho/abi/memfd.h>
78 #include <linux/liveupdate.h>
79 #include <linux/shmem_fs.h>
80 #include <linux/vmalloc.h>
81 #include "internal.h"
82 
/*
 * Pin all folios backing @file, mark each of them as preserved with KHO and
 * describe them in a newly allocated array of struct memfd_luo_folio_ser.
 *
 * @file:           the memfd whose contents are being preserved.
 * @kho_vmalloc:    filled in by kho_preserve_vmalloc() to describe the
 *                  serialization array; zeroed when the file is empty.
 * @out_folios_ser: on success, the serialization array (NULL when the file
 *                  is empty).
 * @nr_foliosp:     on success, the number of entries in the array.
 *
 * Returns 0 on success or a negative errno. On failure every folio that was
 * preserved or pinned here is unpreserved/unpinned again and nothing is
 * handed back through the output parameters.
 */
static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *folios_ser;
	unsigned int max_folios;
	long i, size, nr_pinned;
	struct folio **folios;
	int err = -EINVAL;
	pgoff_t offset;
	u64 nr_folios;
	u64 nr_pages;

	size = i_size_read(inode);
	/*
	 * If the file has zero size, then the folios and nr_folios properties
	 * are not set.
	 */
	if (!size) {
		*nr_foliosp = 0;
		*out_folios_ser = NULL;
		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
		return 0;
	}

	/*
	 * Guess the number of folios based on inode size. Real number might end
	 * up being smaller if there are higher order folios.
	 *
	 * memfd_pin_folios() is called with an unsigned int folio count below.
	 * Refuse files whose page count does not fit instead of letting the
	 * value truncate, which would pin and serialize only part of the file
	 * while still recording the full size.
	 */
	nr_pages = PAGE_ALIGN(size) / PAGE_SIZE;
	if (nr_pages > UINT_MAX)
		return -EFBIG;
	max_folios = nr_pages;

	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
	if (!folios)
		return -ENOMEM;

	/*
	 * Pin the folios so they don't move around behind our back. This also
	 * ensures none of the folios are in CMA -- which ensures they don't
	 * fall in KHO scratch memory. It also moves swapped out folios back to
	 * memory.
	 *
	 * A side effect of doing this is that it allocates a folio for all
	 * indices in the file. This might waste memory on sparse memfds. If
	 * that is really a problem in the future, we can have a
	 * memfd_pin_folios() variant that does not allocate a page on empty
	 * slots.
	 */
	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
				     &offset);
	if (nr_pinned < 0) {
		err = nr_pinned;
		pr_err("failed to pin folios: %d\n", err);
		goto err_free_folios;
	}
	nr_folios = nr_pinned;

	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
	if (!folios_ser) {
		err = -ENOMEM;
		goto err_unpin;
	}

	for (i = 0; i < nr_folios; i++) {
		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio = folios[i];
		unsigned int flags = 0;

		err = kho_preserve_folio(folio);
		if (err)
			goto err_unpreserve;

		/* Record the folio state that the retrieve side re-applies. */
		if (folio_test_dirty(folio))
			flags |= MEMFD_LUO_FOLIO_DIRTY;
		if (folio_test_uptodate(folio))
			flags |= MEMFD_LUO_FOLIO_UPTODATE;

		pfolio->pfn = folio_pfn(folio);
		pfolio->flags = flags;
		pfolio->index = folio->index;
	}

	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
	if (err)
		goto err_unpreserve;

	kvfree(folios);
	*nr_foliosp = nr_folios;
	*out_folios_ser = folios_ser;

	/*
	 * Note: folios_ser is purposely not freed here. It is preserved
	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
	 * that is passed via private_data.
	 */
	return 0;

err_unpreserve:
	/*
	 * i is the index of the folio that failed (or nr_folios when
	 * kho_preserve_vmalloc() failed), so everything below it was
	 * preserved and must be undone.
	 */
	for (i = i - 1; i >= 0; i--)
		kho_unpreserve_folio(folios[i]);
	vfree(folios_ser);
err_unpin:
	unpin_folios(folios, nr_folios);
err_free_folios:
	kvfree(folios);

	return err;
}
190 
/*
 * Undo memfd_luo_preserve_folios(): drop the KHO preservation of the
 * serialization array and of every folio it describes, unpin those folios
 * and free the array.
 *
 * @kho_vmalloc: descriptor of the preserved serialization array.
 * @folios_ser:  the serialization array itself.
 * @nr_folios:   number of entries in @folios_ser; nothing is done when zero.
 */
static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
					struct memfd_luo_folio_ser *folios_ser,
					u64 nr_folios)
{
	const struct memfd_luo_folio_ser *entry;
	const struct memfd_luo_folio_ser *end;

	if (nr_folios == 0)
		return;

	kho_unpreserve_vmalloc(kho_vmalloc);

	end = folios_ser + nr_folios;
	for (entry = folios_ser; entry != end; entry++) {
		struct folio *folio;

		/* Entries with a zero PFN describe no folio. */
		if (entry->pfn) {
			folio = pfn_folio(entry->pfn);

			kho_unpreserve_folio(folio);
			unpin_folio(folio);
		}
	}

	vfree(folios_ser);
}
217 
memfd_luo_preserve(struct liveupdate_file_op_args * args)218 static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
219 {
220 	struct inode *inode = file_inode(args->file);
221 	struct memfd_luo_folio_ser *folios_ser;
222 	struct memfd_luo_ser *ser;
223 	u64 nr_folios;
224 	int err = 0;
225 
226 	inode_lock(inode);
227 	shmem_freeze(inode, true);
228 
229 	/* Allocate the main serialization structure in preserved memory */
230 	ser = kho_alloc_preserve(sizeof(*ser));
231 	if (IS_ERR(ser)) {
232 		err = PTR_ERR(ser);
233 		goto err_unlock;
234 	}
235 
236 	ser->pos = args->file->f_pos;
237 	ser->size = i_size_read(inode);
238 
239 	err = memfd_luo_preserve_folios(args->file, &ser->folios,
240 					&folios_ser, &nr_folios);
241 	if (err)
242 		goto err_free_ser;
243 
244 	ser->nr_folios = nr_folios;
245 	inode_unlock(inode);
246 
247 	args->private_data = folios_ser;
248 	args->serialized_data = virt_to_phys(ser);
249 
250 	return 0;
251 
252 err_free_ser:
253 	kho_unpreserve_free(ser);
254 err_unlock:
255 	shmem_freeze(inode, false);
256 	inode_unlock(inode);
257 	return err;
258 }
259 
memfd_luo_freeze(struct liveupdate_file_op_args * args)260 static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
261 {
262 	struct memfd_luo_ser *ser;
263 
264 	if (WARN_ON_ONCE(!args->serialized_data))
265 		return -EINVAL;
266 
267 	ser = phys_to_virt(args->serialized_data);
268 
269 	/*
270 	 * The pos might have changed since prepare. Everything else stays the
271 	 * same.
272 	 */
273 	ser->pos = args->file->f_pos;
274 
275 	return 0;
276 }
277 
memfd_luo_unpreserve(struct liveupdate_file_op_args * args)278 static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
279 {
280 	struct inode *inode = file_inode(args->file);
281 	struct memfd_luo_ser *ser;
282 
283 	if (WARN_ON_ONCE(!args->serialized_data))
284 		return;
285 
286 	inode_lock(inode);
287 	shmem_freeze(inode, false);
288 
289 	ser = phys_to_virt(args->serialized_data);
290 
291 	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
292 				    ser->nr_folios);
293 
294 	kho_unpreserve_free(ser);
295 	inode_unlock(inode);
296 }
297 
/*
 * Restore every folio described by @folios_ser from KHO and immediately drop
 * the reference, for preserved folios that will not be attached to any file.
 *
 * @folios_ser: serialization array describing the preserved folios.
 * @nr_folios:  number of entries in @folios_ser.
 *
 * Entries whose folio cannot be restored are reported (rate limited) and
 * skipped.
 */
static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	const struct memfd_luo_folio_ser *entry = folios_ser;
	u64 remaining;

	for (remaining = nr_folios; remaining; remaining--, entry++) {
		struct folio *folio;
		phys_addr_t phys;

		/* Entries with a zero PFN describe no folio. */
		if (!entry->pfn)
			continue;

		phys = PFN_PHYS(entry->pfn);
		folio = kho_restore_folio(phys);
		if (folio) {
			folio_put(folio);
			continue;
		}

		pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
				    phys);
	}
}
322 
memfd_luo_finish(struct liveupdate_file_op_args * args)323 static void memfd_luo_finish(struct liveupdate_file_op_args *args)
324 {
325 	struct memfd_luo_folio_ser *folios_ser;
326 	struct memfd_luo_ser *ser;
327 
328 	if (args->retrieved)
329 		return;
330 
331 	ser = phys_to_virt(args->serialized_data);
332 	if (!ser)
333 		return;
334 
335 	if (ser->nr_folios) {
336 		folios_ser = kho_restore_vmalloc(&ser->folios);
337 		if (!folios_ser)
338 			goto out;
339 
340 		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
341 		vfree(folios_ser);
342 	}
343 
344 out:
345 	kho_restore_free(ser);
346 }
347 
/*
 * Restore the preserved folios described by @folios_ser and insert them into
 * the page cache of @file.
 *
 * @file:       freshly created shmem file that receives the folios.
 * @folios_ser: serialization array describing the preserved folios.
 * @nr_folios:  number of entries in @folios_ser.
 *
 * Returns 0 on success or a negative errno. On failure, folios that were
 * already added to the file stay there (they go away with the file); the
 * entries that were not processed yet are restored and released here.
 */
static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	int err;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		long nr_pages;
		u64 index;
		int flags;

		/* Entries with a zero PFN describe no folio. */
		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_err("Unable to restore folio at physical address: %llx\n",
			       phys);
			/*
			 * Set the error explicitly: err still holds 0 from
			 * the previous, successful iteration at this point.
			 */
			err = -EIO;
			goto put_folios;
		}
		index = pfolio->index;
		flags = pfolio->flags;

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		/*
		 * Account every page of the folio, not just one: preserved
		 * folios may be of higher order.
		 */
		nr_pages = folio_nr_pages(folio);
		err = shmem_inode_acct_blocks(inode, nr_pages);
		if (err) {
			pr_err("shmem: failed to account folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		shmem_recalc_inode(inode, nr_pages, 0);
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	return 0;

unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];

		if (!pfolio->pfn)
			continue;

		/* kho_restore_folio() takes a physical address, not a PFN. */
		folio = kho_restore_folio(PFN_PHYS(pfolio->pfn));
		if (folio)
			folio_put(folio);
	}

	return err;
}
434 
memfd_luo_retrieve(struct liveupdate_file_op_args * args)435 static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
436 {
437 	struct memfd_luo_folio_ser *folios_ser;
438 	struct memfd_luo_ser *ser;
439 	struct file *file;
440 	int err;
441 
442 	ser = phys_to_virt(args->serialized_data);
443 	if (!ser)
444 		return -EINVAL;
445 
446 	file = shmem_file_setup("", 0, VM_NORESERVE);
447 
448 	if (IS_ERR(file)) {
449 		pr_err("failed to setup file: %pe\n", file);
450 		return PTR_ERR(file);
451 	}
452 
453 	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
454 	file->f_inode->i_size = ser->size;
455 
456 	if (ser->nr_folios) {
457 		folios_ser = kho_restore_vmalloc(&ser->folios);
458 		if (!folios_ser) {
459 			err = -EINVAL;
460 			goto put_file;
461 		}
462 
463 		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
464 		vfree(folios_ser);
465 		if (err)
466 			goto put_file;
467 	}
468 
469 	args->file = file;
470 	kho_restore_free(ser);
471 
472 	return 0;
473 
474 put_file:
475 	fput(file);
476 
477 	return err;
478 }
479 
memfd_luo_can_preserve(struct liveupdate_file_handler * handler,struct file * file)480 static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
481 				   struct file *file)
482 {
483 	struct inode *inode = file_inode(file);
484 
485 	return shmem_file(file) && !inode->i_nlink;
486 }
487 
488 static const struct liveupdate_file_ops memfd_luo_file_ops = {
489 	.freeze = memfd_luo_freeze,
490 	.finish = memfd_luo_finish,
491 	.retrieve = memfd_luo_retrieve,
492 	.preserve = memfd_luo_preserve,
493 	.unpreserve = memfd_luo_unpreserve,
494 	.can_preserve = memfd_luo_can_preserve,
495 	.owner = THIS_MODULE,
496 };
497 
498 static struct liveupdate_file_handler memfd_luo_handler = {
499 	.ops = &memfd_luo_file_ops,
500 	.compatible = MEMFD_LUO_FH_COMPATIBLE,
501 };
502 
memfd_luo_init(void)503 static int __init memfd_luo_init(void)
504 {
505 	int err = liveupdate_register_file_handler(&memfd_luo_handler);
506 
507 	if (err && err != -EOPNOTSUPP) {
508 		pr_err("Could not register luo filesystem handler: %pe\n",
509 		       ERR_PTR(err));
510 
511 		return err;
512 	}
513 
514 	return 0;
515 }
516 late_initcall(memfd_luo_init);
517