xref: /linux/mm/memfd_luo.c (revision 509d3f45847627f4c5cdce004c3ec79262b5239c)
1*b3749f17SPratyush Yadav // SPDX-License-Identifier: GPL-2.0
2*b3749f17SPratyush Yadav 
3*b3749f17SPratyush Yadav /*
4*b3749f17SPratyush Yadav  * Copyright (c) 2025, Google LLC.
5*b3749f17SPratyush Yadav  * Pasha Tatashin <pasha.tatashin@soleen.com>
6*b3749f17SPratyush Yadav  *
7*b3749f17SPratyush Yadav  * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
8*b3749f17SPratyush Yadav  * Pratyush Yadav <ptyadav@amazon.de>
9*b3749f17SPratyush Yadav  */
10*b3749f17SPratyush Yadav 
11*b3749f17SPratyush Yadav /**
12*b3749f17SPratyush Yadav  * DOC: Memfd Preservation via LUO
13*b3749f17SPratyush Yadav  *
14*b3749f17SPratyush Yadav  * Overview
15*b3749f17SPratyush Yadav  * ========
16*b3749f17SPratyush Yadav  *
17*b3749f17SPratyush Yadav  * Memory file descriptors (memfd) can be preserved over a kexec using the Live
18*b3749f17SPratyush Yadav  * Update Orchestrator (LUO) file preservation. This allows userspace to
19*b3749f17SPratyush Yadav  * transfer its memory contents to the next kernel after a kexec.
20*b3749f17SPratyush Yadav  *
21*b3749f17SPratyush Yadav  * The preservation is not intended to be transparent. Only select properties of
22*b3749f17SPratyush Yadav  * the file are preserved. All others are reset to default. The preserved
23*b3749f17SPratyush Yadav  * properties are described below.
24*b3749f17SPratyush Yadav  *
25*b3749f17SPratyush Yadav  * .. note::
26*b3749f17SPratyush Yadav  *    The LUO API is not stabilized yet, so the preserved properties of a memfd
27*b3749f17SPratyush Yadav  *    are also not stable and are subject to backwards incompatible changes.
28*b3749f17SPratyush Yadav  *
29*b3749f17SPratyush Yadav  * .. note::
30*b3749f17SPratyush Yadav  *    Currently a memfd backed by Hugetlb is not supported. Memfds created
31*b3749f17SPratyush Yadav  *    with ``MFD_HUGETLB`` will be rejected.
32*b3749f17SPratyush Yadav  *
33*b3749f17SPratyush Yadav  * Preserved Properties
34*b3749f17SPratyush Yadav  * ====================
35*b3749f17SPratyush Yadav  *
36*b3749f17SPratyush Yadav  * The following properties of the memfd are preserved across kexec:
37*b3749f17SPratyush Yadav  *
38*b3749f17SPratyush Yadav  * File Contents
39*b3749f17SPratyush Yadav  *   All data stored in the file is preserved.
40*b3749f17SPratyush Yadav  *
41*b3749f17SPratyush Yadav  * File Size
42*b3749f17SPratyush Yadav  *   The size of the file is preserved. Holes in the file are filled by
43*b3749f17SPratyush Yadav  *   allocating pages for them during preservation.
44*b3749f17SPratyush Yadav  *
45*b3749f17SPratyush Yadav  * File Position
46*b3749f17SPratyush Yadav  *   The current file position is preserved, allowing applications to continue
47*b3749f17SPratyush Yadav  *   reading/writing from their last position.
48*b3749f17SPratyush Yadav  *
49*b3749f17SPratyush Yadav  * File Status Flags
50*b3749f17SPratyush Yadav  *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
51*b3749f17SPratyush Yadav  *   is maintained.
52*b3749f17SPratyush Yadav  *
53*b3749f17SPratyush Yadav  * Non-Preserved Properties
54*b3749f17SPratyush Yadav  * ========================
55*b3749f17SPratyush Yadav  *
56*b3749f17SPratyush Yadav  * All properties which are not preserved must be assumed to be reset to
57*b3749f17SPratyush Yadav  * default. This section describes some of those properties which may be more of
58*b3749f17SPratyush Yadav  * note.
59*b3749f17SPratyush Yadav  *
60*b3749f17SPratyush Yadav  * ``FD_CLOEXEC`` flag
61*b3749f17SPratyush Yadav  *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
62*b3749f17SPratyush Yadav  *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
63*b3749f17SPratyush Yadav  *   again after restore via ``fcntl()``.
64*b3749f17SPratyush Yadav  *
65*b3749f17SPratyush Yadav  * Seals
66*b3749f17SPratyush Yadav  *   File seals are not preserved. The file is unsealed on restore and if
67*b3749f17SPratyush Yadav  *   needed, must be sealed again via ``fcntl()``.
68*b3749f17SPratyush Yadav  */
69*b3749f17SPratyush Yadav 
70*b3749f17SPratyush Yadav #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
71*b3749f17SPratyush Yadav 
72*b3749f17SPratyush Yadav #include <linux/bits.h>
73*b3749f17SPratyush Yadav #include <linux/err.h>
74*b3749f17SPratyush Yadav #include <linux/file.h>
75*b3749f17SPratyush Yadav #include <linux/io.h>
76*b3749f17SPratyush Yadav #include <linux/kexec_handover.h>
77*b3749f17SPratyush Yadav #include <linux/kho/abi/memfd.h>
78*b3749f17SPratyush Yadav #include <linux/liveupdate.h>
79*b3749f17SPratyush Yadav #include <linux/shmem_fs.h>
80*b3749f17SPratyush Yadav #include <linux/vmalloc.h>
81*b3749f17SPratyush Yadav #include "internal.h"
82*b3749f17SPratyush Yadav 
/*
 * Pin all folios backing @file, register each of them with KHO and describe
 * them in a vmalloc'ed array of struct memfd_luo_folio_ser that is itself
 * preserved through @kho_vmalloc.
 *
 * On success the array is handed back via @out_folios_ser and the number of
 * entries via @nr_foliosp; the array is intentionally left allocated because
 * it is preserved memory. For an empty file no array is allocated: the count
 * is 0, the array pointer is NULL and @kho_vmalloc is zeroed.
 *
 * Returns 0 on success or a negative errno. On failure everything done here
 * (preservation, pinning, allocations) is undone.
 */
static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *entries;
	struct folio **pinned;
	unsigned int capacity;
	long idx, bytes, nr_pinned;
	pgoff_t first_offset;
	int ret = -EINVAL;
	u64 count;

	bytes = i_size_read(inode);

	/* An empty file has nothing to describe; report "no folios". */
	if (bytes == 0) {
		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
		*out_folios_ser = NULL;
		*nr_foliosp = 0;
		return 0;
	}

	/*
	 * One slot per page is an upper bound: higher order folios make the
	 * real number of folios smaller.
	 */
	capacity = PAGE_ALIGN(bytes) / PAGE_SIZE;
	pinned = kvmalloc_array(capacity, sizeof(*pinned), GFP_KERNEL);
	if (!pinned)
		return -ENOMEM;

	/*
	 * Pinning keeps the folios from moving, keeps them out of CMA (and
	 * therefore out of KHO scratch memory) and brings swapped out folios
	 * back in.
	 *
	 * It also allocates a folio for every index of the file, which can
	 * waste memory for sparse memfds. Should that become a problem, a
	 * memfd_pin_folios() variant that leaves empty slots alone would be
	 * needed.
	 */
	nr_pinned = memfd_pin_folios(file, 0, bytes - 1, pinned, capacity,
				     &first_offset);
	if (nr_pinned < 0) {
		ret = nr_pinned;
		pr_err("failed to pin folios: %d\n", ret);
		goto free_pinned;
	}
	count = nr_pinned;

	entries = vcalloc(count, sizeof(*entries));
	if (!entries) {
		ret = -ENOMEM;
		goto unpin;
	}

	for (idx = 0; idx < count; idx++) {
		struct folio *folio = pinned[idx];
		struct memfd_luo_folio_ser *entry = &entries[idx];
		unsigned int state = 0;

		ret = kho_preserve_folio(folio);
		if (ret)
			goto unpreserve;

		/* Snapshot the folio state bits that are carried over. */
		if (folio_test_dirty(folio))
			state |= MEMFD_LUO_FOLIO_DIRTY;
		if (folio_test_uptodate(folio))
			state |= MEMFD_LUO_FOLIO_UPTODATE;

		entry->index = folio->index;
		entry->flags = state;
		entry->pfn = folio_pfn(folio);
	}

	ret = kho_preserve_vmalloc(entries, kho_vmalloc);
	if (ret)
		goto unpreserve;

	/*
	 * Only the temporary pointer array goes away. The serialized array
	 * stays allocated: it is KHO-preserved memory, and the unpreserve
	 * path gets it back through private_data.
	 */
	kvfree(pinned);
	*out_folios_ser = entries;
	*nr_foliosp = count;
	return 0;

unpreserve:
	/* idx is the number of folios successfully preserved so far. */
	while (idx-- > 0)
		kho_unpreserve_folio(pinned[idx]);
	vfree(entries);
unpin:
	unpin_folios(pinned, count);
free_pinned:
	kvfree(pinned);
	return ret;
}
190*b3749f17SPratyush Yadav 
/*
 * Undo memfd_luo_preserve_folios(): drop the KHO preservation of the
 * serialized array, then unpreserve and unpin every folio it describes and
 * finally free the array. Does nothing when @nr_folios is 0.
 */
static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
					struct memfd_luo_folio_ser *folios_ser,
					u64 nr_folios)
{
	const struct memfd_luo_folio_ser *entry, *end;

	if (nr_folios == 0)
		return;

	kho_unpreserve_vmalloc(kho_vmalloc);

	end = folios_ser + nr_folios;
	for (entry = folios_ser; entry < end; entry++) {
		struct folio *folio;

		/* Entries with a zero pfn describe no folio. */
		if (entry->pfn) {
			folio = pfn_folio(entry->pfn);
			kho_unpreserve_folio(folio);
			unpin_folio(folio);
		}
	}

	vfree(folios_ser);
}
217*b3749f17SPratyush Yadav 
memfd_luo_preserve(struct liveupdate_file_op_args * args)218*b3749f17SPratyush Yadav static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
219*b3749f17SPratyush Yadav {
220*b3749f17SPratyush Yadav 	struct inode *inode = file_inode(args->file);
221*b3749f17SPratyush Yadav 	struct memfd_luo_folio_ser *folios_ser;
222*b3749f17SPratyush Yadav 	struct memfd_luo_ser *ser;
223*b3749f17SPratyush Yadav 	u64 nr_folios;
224*b3749f17SPratyush Yadav 	int err = 0;
225*b3749f17SPratyush Yadav 
226*b3749f17SPratyush Yadav 	inode_lock(inode);
227*b3749f17SPratyush Yadav 	shmem_freeze(inode, true);
228*b3749f17SPratyush Yadav 
229*b3749f17SPratyush Yadav 	/* Allocate the main serialization structure in preserved memory */
230*b3749f17SPratyush Yadav 	ser = kho_alloc_preserve(sizeof(*ser));
231*b3749f17SPratyush Yadav 	if (IS_ERR(ser)) {
232*b3749f17SPratyush Yadav 		err = PTR_ERR(ser);
233*b3749f17SPratyush Yadav 		goto err_unlock;
234*b3749f17SPratyush Yadav 	}
235*b3749f17SPratyush Yadav 
236*b3749f17SPratyush Yadav 	ser->pos = args->file->f_pos;
237*b3749f17SPratyush Yadav 	ser->size = i_size_read(inode);
238*b3749f17SPratyush Yadav 
239*b3749f17SPratyush Yadav 	err = memfd_luo_preserve_folios(args->file, &ser->folios,
240*b3749f17SPratyush Yadav 					&folios_ser, &nr_folios);
241*b3749f17SPratyush Yadav 	if (err)
242*b3749f17SPratyush Yadav 		goto err_free_ser;
243*b3749f17SPratyush Yadav 
244*b3749f17SPratyush Yadav 	ser->nr_folios = nr_folios;
245*b3749f17SPratyush Yadav 	inode_unlock(inode);
246*b3749f17SPratyush Yadav 
247*b3749f17SPratyush Yadav 	args->private_data = folios_ser;
248*b3749f17SPratyush Yadav 	args->serialized_data = virt_to_phys(ser);
249*b3749f17SPratyush Yadav 
250*b3749f17SPratyush Yadav 	return 0;
251*b3749f17SPratyush Yadav 
252*b3749f17SPratyush Yadav err_free_ser:
253*b3749f17SPratyush Yadav 	kho_unpreserve_free(ser);
254*b3749f17SPratyush Yadav err_unlock:
255*b3749f17SPratyush Yadav 	shmem_freeze(inode, false);
256*b3749f17SPratyush Yadav 	inode_unlock(inode);
257*b3749f17SPratyush Yadav 	return err;
258*b3749f17SPratyush Yadav }
259*b3749f17SPratyush Yadav 
memfd_luo_freeze(struct liveupdate_file_op_args * args)260*b3749f17SPratyush Yadav static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
261*b3749f17SPratyush Yadav {
262*b3749f17SPratyush Yadav 	struct memfd_luo_ser *ser;
263*b3749f17SPratyush Yadav 
264*b3749f17SPratyush Yadav 	if (WARN_ON_ONCE(!args->serialized_data))
265*b3749f17SPratyush Yadav 		return -EINVAL;
266*b3749f17SPratyush Yadav 
267*b3749f17SPratyush Yadav 	ser = phys_to_virt(args->serialized_data);
268*b3749f17SPratyush Yadav 
269*b3749f17SPratyush Yadav 	/*
270*b3749f17SPratyush Yadav 	 * The pos might have changed since prepare. Everything else stays the
271*b3749f17SPratyush Yadav 	 * same.
272*b3749f17SPratyush Yadav 	 */
273*b3749f17SPratyush Yadav 	ser->pos = args->file->f_pos;
274*b3749f17SPratyush Yadav 
275*b3749f17SPratyush Yadav 	return 0;
276*b3749f17SPratyush Yadav }
277*b3749f17SPratyush Yadav 
memfd_luo_unpreserve(struct liveupdate_file_op_args * args)278*b3749f17SPratyush Yadav static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
279*b3749f17SPratyush Yadav {
280*b3749f17SPratyush Yadav 	struct inode *inode = file_inode(args->file);
281*b3749f17SPratyush Yadav 	struct memfd_luo_ser *ser;
282*b3749f17SPratyush Yadav 
283*b3749f17SPratyush Yadav 	if (WARN_ON_ONCE(!args->serialized_data))
284*b3749f17SPratyush Yadav 		return;
285*b3749f17SPratyush Yadav 
286*b3749f17SPratyush Yadav 	inode_lock(inode);
287*b3749f17SPratyush Yadav 	shmem_freeze(inode, false);
288*b3749f17SPratyush Yadav 
289*b3749f17SPratyush Yadav 	ser = phys_to_virt(args->serialized_data);
290*b3749f17SPratyush Yadav 
291*b3749f17SPratyush Yadav 	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
292*b3749f17SPratyush Yadav 				    ser->nr_folios);
293*b3749f17SPratyush Yadav 
294*b3749f17SPratyush Yadav 	kho_unpreserve_free(ser);
295*b3749f17SPratyush Yadav 	inode_unlock(inode);
296*b3749f17SPratyush Yadav }
297*b3749f17SPratyush Yadav 
/*
 * Restore every folio described in @folios_ser and immediately drop the
 * reference obtained from the restore. Used when a preserved memfd is not
 * retrieved. Entries with a zero pfn are skipped; a folio that cannot be
 * restored is reported (rate limited) and otherwise ignored.
 */
static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	u64 n;

	for (n = 0; n < nr_folios; n++) {
		const struct memfd_luo_folio_ser *entry = &folios_ser[n];
		struct folio *folio;
		phys_addr_t phys;

		if (!entry->pfn)
			continue;

		phys = PFN_PHYS(entry->pfn);
		folio = kho_restore_folio(phys);
		if (folio) {
			folio_put(folio);
			continue;
		}

		pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
				    phys);
	}
}
322*b3749f17SPratyush Yadav 
memfd_luo_finish(struct liveupdate_file_op_args * args)323*b3749f17SPratyush Yadav static void memfd_luo_finish(struct liveupdate_file_op_args *args)
324*b3749f17SPratyush Yadav {
325*b3749f17SPratyush Yadav 	struct memfd_luo_folio_ser *folios_ser;
326*b3749f17SPratyush Yadav 	struct memfd_luo_ser *ser;
327*b3749f17SPratyush Yadav 
328*b3749f17SPratyush Yadav 	if (args->retrieved)
329*b3749f17SPratyush Yadav 		return;
330*b3749f17SPratyush Yadav 
331*b3749f17SPratyush Yadav 	ser = phys_to_virt(args->serialized_data);
332*b3749f17SPratyush Yadav 	if (!ser)
333*b3749f17SPratyush Yadav 		return;
334*b3749f17SPratyush Yadav 
335*b3749f17SPratyush Yadav 	if (ser->nr_folios) {
336*b3749f17SPratyush Yadav 		folios_ser = kho_restore_vmalloc(&ser->folios);
337*b3749f17SPratyush Yadav 		if (!folios_ser)
338*b3749f17SPratyush Yadav 			goto out;
339*b3749f17SPratyush Yadav 
340*b3749f17SPratyush Yadav 		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
341*b3749f17SPratyush Yadav 		vfree(folios_ser);
342*b3749f17SPratyush Yadav 	}
343*b3749f17SPratyush Yadav 
344*b3749f17SPratyush Yadav out:
345*b3749f17SPratyush Yadav 	kho_restore_free(ser);
346*b3749f17SPratyush Yadav }
347*b3749f17SPratyush Yadav 
/*
 * Restore the folios described by @folios_ser and insert them into the page
 * cache of @file.
 *
 * For each entry with a non-zero pfn the folio is restored from KHO, charged
 * to the memory cgroup, added to the shmem page cache at its recorded index,
 * given back its uptodate/dirty state, accounted to the inode and put on the
 * LRU.
 *
 * Returns 0 on success or a negative errno. On failure, folios already
 * inserted into the file stay there (they are released together with the
 * file); the folios of all remaining entries are restored and dropped.
 */
static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	int err = -EIO;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		long nr_pages;
		u64 index;
		int flags;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_err("Unable to restore folio at physical address: %llx\n",
			       phys);
			/*
			 * err still holds 0 from the previous iteration once
			 * any folio has been processed; set it explicitly so
			 * this failure is not reported as success.
			 */
			err = -EIO;
			goto put_folios;
		}
		index = pfolio->index;
		flags = pfolio->flags;

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		/*
		 * Account every page of the folio, not just one: preserved
		 * folios may be higher order.
		 */
		nr_pages = folio_nr_pages(folio);
		err = shmem_inode_acct_blocks(inode, nr_pages);
		if (err) {
			pr_err("shmem: failed to account folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		shmem_recalc_inode(inode, nr_pages, 0);
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	return 0;

unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];

		/* Same rule as the main loop: a zero pfn describes no folio. */
		if (!pfolio->pfn)
			continue;

		/* kho_restore_folio() takes a physical address, not a pfn. */
		folio = kho_restore_folio(PFN_PHYS(pfolio->pfn));
		if (folio)
			folio_put(folio);
	}

	return err;
}
434*b3749f17SPratyush Yadav 
memfd_luo_retrieve(struct liveupdate_file_op_args * args)435*b3749f17SPratyush Yadav static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
436*b3749f17SPratyush Yadav {
437*b3749f17SPratyush Yadav 	struct memfd_luo_folio_ser *folios_ser;
438*b3749f17SPratyush Yadav 	struct memfd_luo_ser *ser;
439*b3749f17SPratyush Yadav 	struct file *file;
440*b3749f17SPratyush Yadav 	int err;
441*b3749f17SPratyush Yadav 
442*b3749f17SPratyush Yadav 	ser = phys_to_virt(args->serialized_data);
443*b3749f17SPratyush Yadav 	if (!ser)
444*b3749f17SPratyush Yadav 		return -EINVAL;
445*b3749f17SPratyush Yadav 
446*b3749f17SPratyush Yadav 	file = shmem_file_setup("", 0, VM_NORESERVE);
447*b3749f17SPratyush Yadav 
448*b3749f17SPratyush Yadav 	if (IS_ERR(file)) {
449*b3749f17SPratyush Yadav 		pr_err("failed to setup file: %pe\n", file);
450*b3749f17SPratyush Yadav 		return PTR_ERR(file);
451*b3749f17SPratyush Yadav 	}
452*b3749f17SPratyush Yadav 
453*b3749f17SPratyush Yadav 	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
454*b3749f17SPratyush Yadav 	file->f_inode->i_size = ser->size;
455*b3749f17SPratyush Yadav 
456*b3749f17SPratyush Yadav 	if (ser->nr_folios) {
457*b3749f17SPratyush Yadav 		folios_ser = kho_restore_vmalloc(&ser->folios);
458*b3749f17SPratyush Yadav 		if (!folios_ser) {
459*b3749f17SPratyush Yadav 			err = -EINVAL;
460*b3749f17SPratyush Yadav 			goto put_file;
461*b3749f17SPratyush Yadav 		}
462*b3749f17SPratyush Yadav 
463*b3749f17SPratyush Yadav 		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
464*b3749f17SPratyush Yadav 		vfree(folios_ser);
465*b3749f17SPratyush Yadav 		if (err)
466*b3749f17SPratyush Yadav 			goto put_file;
467*b3749f17SPratyush Yadav 	}
468*b3749f17SPratyush Yadav 
469*b3749f17SPratyush Yadav 	args->file = file;
470*b3749f17SPratyush Yadav 	kho_restore_free(ser);
471*b3749f17SPratyush Yadav 
472*b3749f17SPratyush Yadav 	return 0;
473*b3749f17SPratyush Yadav 
474*b3749f17SPratyush Yadav put_file:
475*b3749f17SPratyush Yadav 	fput(file);
476*b3749f17SPratyush Yadav 
477*b3749f17SPratyush Yadav 	return err;
478*b3749f17SPratyush Yadav }
479*b3749f17SPratyush Yadav 
memfd_luo_can_preserve(struct liveupdate_file_handler * handler,struct file * file)480*b3749f17SPratyush Yadav static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
481*b3749f17SPratyush Yadav 				   struct file *file)
482*b3749f17SPratyush Yadav {
483*b3749f17SPratyush Yadav 	struct inode *inode = file_inode(file);
484*b3749f17SPratyush Yadav 
485*b3749f17SPratyush Yadav 	return shmem_file(file) && !inode->i_nlink;
486*b3749f17SPratyush Yadav }
487*b3749f17SPratyush Yadav 
488*b3749f17SPratyush Yadav static const struct liveupdate_file_ops memfd_luo_file_ops = {
489*b3749f17SPratyush Yadav 	.freeze = memfd_luo_freeze,
490*b3749f17SPratyush Yadav 	.finish = memfd_luo_finish,
491*b3749f17SPratyush Yadav 	.retrieve = memfd_luo_retrieve,
492*b3749f17SPratyush Yadav 	.preserve = memfd_luo_preserve,
493*b3749f17SPratyush Yadav 	.unpreserve = memfd_luo_unpreserve,
494*b3749f17SPratyush Yadav 	.can_preserve = memfd_luo_can_preserve,
495*b3749f17SPratyush Yadav 	.owner = THIS_MODULE,
496*b3749f17SPratyush Yadav };
497*b3749f17SPratyush Yadav 
498*b3749f17SPratyush Yadav static struct liveupdate_file_handler memfd_luo_handler = {
499*b3749f17SPratyush Yadav 	.ops = &memfd_luo_file_ops,
500*b3749f17SPratyush Yadav 	.compatible = MEMFD_LUO_FH_COMPATIBLE,
501*b3749f17SPratyush Yadav };
502*b3749f17SPratyush Yadav 
memfd_luo_init(void)503*b3749f17SPratyush Yadav static int __init memfd_luo_init(void)
504*b3749f17SPratyush Yadav {
505*b3749f17SPratyush Yadav 	int err = liveupdate_register_file_handler(&memfd_luo_handler);
506*b3749f17SPratyush Yadav 
507*b3749f17SPratyush Yadav 	if (err && err != -EOPNOTSUPP) {
508*b3749f17SPratyush Yadav 		pr_err("Could not register luo filesystem handler: %pe\n",
509*b3749f17SPratyush Yadav 		       ERR_PTR(err));
510*b3749f17SPratyush Yadav 
511*b3749f17SPratyush Yadav 		return err;
512*b3749f17SPratyush Yadav 	}
513*b3749f17SPratyush Yadav 
514*b3749f17SPratyush Yadav 	return 0;
515*b3749f17SPratyush Yadav }
516*b3749f17SPratyush Yadav late_initcall(memfd_luo_init);
517