1*b3749f17SPratyush Yadav // SPDX-License-Identifier: GPL-2.0
2*b3749f17SPratyush Yadav
3*b3749f17SPratyush Yadav /*
4*b3749f17SPratyush Yadav * Copyright (c) 2025, Google LLC.
5*b3749f17SPratyush Yadav * Pasha Tatashin <pasha.tatashin@soleen.com>
6*b3749f17SPratyush Yadav *
7*b3749f17SPratyush Yadav * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
8*b3749f17SPratyush Yadav * Pratyush Yadav <ptyadav@amazon.de>
9*b3749f17SPratyush Yadav */
10*b3749f17SPratyush Yadav
11*b3749f17SPratyush Yadav /**
12*b3749f17SPratyush Yadav * DOC: Memfd Preservation via LUO
13*b3749f17SPratyush Yadav *
14*b3749f17SPratyush Yadav * Overview
15*b3749f17SPratyush Yadav * ========
16*b3749f17SPratyush Yadav *
17*b3749f17SPratyush Yadav * Memory file descriptors (memfd) can be preserved over a kexec using the Live
18*b3749f17SPratyush Yadav * Update Orchestrator (LUO) file preservation. This allows userspace to
19*b3749f17SPratyush Yadav * transfer its memory contents to the next kernel after a kexec.
20*b3749f17SPratyush Yadav *
21*b3749f17SPratyush Yadav * The preservation is not intended to be transparent. Only select properties of
22*b3749f17SPratyush Yadav * the file are preserved. All others are reset to default. The preserved
23*b3749f17SPratyush Yadav * properties are described below.
24*b3749f17SPratyush Yadav *
25*b3749f17SPratyush Yadav * .. note::
26*b3749f17SPratyush Yadav * The LUO API is not stabilized yet, so the preserved properties of a memfd
27*b3749f17SPratyush Yadav * are also not stable and are subject to backwards incompatible changes.
28*b3749f17SPratyush Yadav *
29*b3749f17SPratyush Yadav * .. note::
30*b3749f17SPratyush Yadav * Currently a memfd backed by Hugetlb is not supported. Memfds created
31*b3749f17SPratyush Yadav * with ``MFD_HUGETLB`` will be rejected.
32*b3749f17SPratyush Yadav *
33*b3749f17SPratyush Yadav * Preserved Properties
34*b3749f17SPratyush Yadav * ====================
35*b3749f17SPratyush Yadav *
36*b3749f17SPratyush Yadav * The following properties of the memfd are preserved across kexec:
37*b3749f17SPratyush Yadav *
38*b3749f17SPratyush Yadav * File Contents
39*b3749f17SPratyush Yadav * All data stored in the file is preserved.
40*b3749f17SPratyush Yadav *
41*b3749f17SPratyush Yadav * File Size
42*b3749f17SPratyush Yadav * The size of the file is preserved. Holes in the file are filled by
43*b3749f17SPratyush Yadav * allocating pages for them during preservation.
44*b3749f17SPratyush Yadav *
45*b3749f17SPratyush Yadav * File Position
46*b3749f17SPratyush Yadav * The current file position is preserved, allowing applications to continue
47*b3749f17SPratyush Yadav * reading/writing from their last position.
48*b3749f17SPratyush Yadav *
49*b3749f17SPratyush Yadav * File Status Flags
50*b3749f17SPratyush Yadav * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
51*b3749f17SPratyush Yadav * is maintained.
52*b3749f17SPratyush Yadav *
53*b3749f17SPratyush Yadav * Non-Preserved Properties
54*b3749f17SPratyush Yadav * ========================
55*b3749f17SPratyush Yadav *
56*b3749f17SPratyush Yadav * All properties which are not preserved must be assumed to be reset to
57*b3749f17SPratyush Yadav * default. This section describes some of those properties which may be more of
58*b3749f17SPratyush Yadav * note.
59*b3749f17SPratyush Yadav *
60*b3749f17SPratyush Yadav * ``FD_CLOEXEC`` flag
61*b3749f17SPratyush Yadav * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
62*b3749f17SPratyush Yadav * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
63*b3749f17SPratyush Yadav * again after restore via ``fcntl()``.
64*b3749f17SPratyush Yadav *
65*b3749f17SPratyush Yadav * Seals
66*b3749f17SPratyush Yadav * File seals are not preserved. The file is unsealed on restore and if
67*b3749f17SPratyush Yadav * needed, must be sealed again via ``fcntl()``.
68*b3749f17SPratyush Yadav */
69*b3749f17SPratyush Yadav
70*b3749f17SPratyush Yadav #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
71*b3749f17SPratyush Yadav
#include <linux/bits.h>
#include <linux/err.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/kexec_handover.h>
#include <linux/kho/abi/memfd.h>
#include <linux/limits.h>
#include <linux/liveupdate.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>
#include <linux/vmalloc.h>
#include "internal.h"
82*b3749f17SPratyush Yadav
/**
 * memfd_luo_preserve_folios - pin and preserve every folio backing a memfd.
 * @file:           the memfd whose contents are preserved.
 * @kho_vmalloc:    filled with the KHO descriptor of the serialized folio
 *                  array (zeroed for an empty file).
 * @out_folios_ser: set to the serialized folio array, or NULL for an empty
 *                  file.
 * @nr_foliosp:     set to the number of entries in @out_folios_ser.
 *
 * Pins all folios in the range [0, i_size), preserves each one via KHO and
 * records its pfn, index and flags in a vmalloc'ed array that is itself
 * preserved. On success the folios stay pinned and the array stays allocated;
 * memfd_luo_unpreserve_folios() undoes both.
 *
 * memfd_luo_preserve() calls this with the inode locked and shmem frozen.
 *
 * Return: 0 on success, -EFBIG if the file has more pages than can be pinned
 * in one call, or another negative errno on failure.
 */
static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *folios_ser;
	unsigned long nr_pages;
	unsigned int max_folios;
	long i, size, nr_pinned;
	struct folio **folios;
	int err = -EINVAL;
	pgoff_t offset;
	u64 nr_folios;

	size = i_size_read(inode);
	/*
	 * If the file has zero size, then the folios and nr_folios properties
	 * are not set.
	 */
	if (!size) {
		*nr_foliosp = 0;
		*out_folios_ser = NULL;
		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
		return 0;
	}

	/*
	 * Guess the number of folios based on inode size. Real number might end
	 * up being smaller if there are higher order folios.
	 *
	 * memfd_pin_folios() is handed an unsigned int count below. Refuse
	 * files whose page count does not fit rather than silently truncating
	 * it and preserving only part of the file.
	 */
	nr_pages = PAGE_ALIGN(size) / PAGE_SIZE;
	if (nr_pages > UINT_MAX)
		return -EFBIG;
	max_folios = nr_pages;

	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
	if (!folios)
		return -ENOMEM;

	/*
	 * Pin the folios so they don't move around behind our back. This also
	 * ensures none of the folios are in CMA -- which ensures they don't
	 * fall in KHO scratch memory. It also moves swapped out folios back to
	 * memory.
	 *
	 * A side effect of doing this is that it allocates a folio for all
	 * indices in the file. This might waste memory on sparse memfds. If
	 * that is really a problem in the future, we can have a
	 * memfd_pin_folios() variant that does not allocate a page on empty
	 * slots.
	 */
	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
				     &offset);
	if (nr_pinned < 0) {
		err = nr_pinned;
		pr_err("failed to pin folios: %d\n", err);
		goto err_free_folios;
	}
	nr_folios = nr_pinned;

	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
	if (!folios_ser) {
		err = -ENOMEM;
		goto err_unpin;
	}

	for (i = 0; i < nr_folios; i++) {
		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio = folios[i];

		err = kho_preserve_folio(folio);
		if (err)
			goto err_unpreserve;

		/*
		 * The file remains usable after this point (freeze() re-reads
		 * f_pos for that reason), so a folio that is clean now may be
		 * written to later. A dirty bit sampled here would then be
		 * stale and describe real user data as clean.
		 * NOTE(review): a clean folio is presumably reclaimable in the
		 * next kernel, which would lose that data -- confirm.
		 *
		 * Unconditionally dirty the folio and record it as dirty. The
		 * cost is that clean folios are no longer treated as clean
		 * after the live update.
		 */
		folio_mark_dirty(folio);

		/*
		 * The same staleness applies to the uptodate bit: a folio that
		 * is not uptodate now can be initialised and filled later.
		 * NOTE(review): the next kernel would presumably zero a folio
		 * restored as "not uptodate" on first use -- confirm.
		 *
		 * Zero such folios here and record every folio as uptodate.
		 * The folio lock is taken and the bit re-checked so that a
		 * concurrent initialisation of the same folio is not
		 * overwritten.
		 */
		if (!folio_test_uptodate(folio)) {
			folio_lock(folio);
			if (!folio_test_uptodate(folio)) {
				folio_zero_range(folio, 0, folio_size(folio));
				flush_dcache_folio(folio);
				folio_mark_uptodate(folio);
			}
			folio_unlock(folio);
		}

		pfolio->pfn = folio_pfn(folio);
		pfolio->flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE;
		pfolio->index = folio->index;
	}

	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
	if (err)
		goto err_unpreserve;

	kvfree(folios);
	*nr_foliosp = nr_folios;
	*out_folios_ser = folios_ser;

	/*
	 * Note: folios_ser is purposely not freed here. It is preserved
	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
	 * that is passed via private_data.
	 */
	return 0;

err_unpreserve:
	/*
	 * i is the index of the folio that failed to preserve, or nr_folios if
	 * preserving the array itself failed; everything below i is preserved.
	 */
	for (i = i - 1; i >= 0; i--)
		kho_unpreserve_folio(folios[i]);
	vfree(folios_ser);
err_unpin:
	unpin_folios(folios, nr_folios);
err_free_folios:
	kvfree(folios);

	return err;
}
190*b3749f17SPratyush Yadav
/*
 * Undo memfd_luo_preserve_folios(): drop the KHO preservation of the
 * serialized array, then unpreserve and unpin every folio it describes, and
 * finally free the array. Does nothing when @nr_folios is zero.
 */
static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
					struct memfd_luo_folio_ser *folios_ser,
					u64 nr_folios)
{
	const struct memfd_luo_folio_ser *entry;
	const struct memfd_luo_folio_ser *end;

	if (nr_folios == 0)
		return;

	kho_unpreserve_vmalloc(kho_vmalloc);

	end = folios_ser + nr_folios;
	for (entry = folios_ser; entry != end; entry++) {
		struct folio *folio;

		/* Slots with a zero pfn describe no folio. */
		if (entry->pfn) {
			folio = pfn_folio(entry->pfn);

			kho_unpreserve_folio(folio);
			unpin_folio(folio);
		}
	}

	vfree(folios_ser);
}
217*b3749f17SPratyush Yadav
memfd_luo_preserve(struct liveupdate_file_op_args * args)218*b3749f17SPratyush Yadav static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
219*b3749f17SPratyush Yadav {
220*b3749f17SPratyush Yadav struct inode *inode = file_inode(args->file);
221*b3749f17SPratyush Yadav struct memfd_luo_folio_ser *folios_ser;
222*b3749f17SPratyush Yadav struct memfd_luo_ser *ser;
223*b3749f17SPratyush Yadav u64 nr_folios;
224*b3749f17SPratyush Yadav int err = 0;
225*b3749f17SPratyush Yadav
226*b3749f17SPratyush Yadav inode_lock(inode);
227*b3749f17SPratyush Yadav shmem_freeze(inode, true);
228*b3749f17SPratyush Yadav
229*b3749f17SPratyush Yadav /* Allocate the main serialization structure in preserved memory */
230*b3749f17SPratyush Yadav ser = kho_alloc_preserve(sizeof(*ser));
231*b3749f17SPratyush Yadav if (IS_ERR(ser)) {
232*b3749f17SPratyush Yadav err = PTR_ERR(ser);
233*b3749f17SPratyush Yadav goto err_unlock;
234*b3749f17SPratyush Yadav }
235*b3749f17SPratyush Yadav
236*b3749f17SPratyush Yadav ser->pos = args->file->f_pos;
237*b3749f17SPratyush Yadav ser->size = i_size_read(inode);
238*b3749f17SPratyush Yadav
239*b3749f17SPratyush Yadav err = memfd_luo_preserve_folios(args->file, &ser->folios,
240*b3749f17SPratyush Yadav &folios_ser, &nr_folios);
241*b3749f17SPratyush Yadav if (err)
242*b3749f17SPratyush Yadav goto err_free_ser;
243*b3749f17SPratyush Yadav
244*b3749f17SPratyush Yadav ser->nr_folios = nr_folios;
245*b3749f17SPratyush Yadav inode_unlock(inode);
246*b3749f17SPratyush Yadav
247*b3749f17SPratyush Yadav args->private_data = folios_ser;
248*b3749f17SPratyush Yadav args->serialized_data = virt_to_phys(ser);
249*b3749f17SPratyush Yadav
250*b3749f17SPratyush Yadav return 0;
251*b3749f17SPratyush Yadav
252*b3749f17SPratyush Yadav err_free_ser:
253*b3749f17SPratyush Yadav kho_unpreserve_free(ser);
254*b3749f17SPratyush Yadav err_unlock:
255*b3749f17SPratyush Yadav shmem_freeze(inode, false);
256*b3749f17SPratyush Yadav inode_unlock(inode);
257*b3749f17SPratyush Yadav return err;
258*b3749f17SPratyush Yadav }
259*b3749f17SPratyush Yadav
memfd_luo_freeze(struct liveupdate_file_op_args * args)260*b3749f17SPratyush Yadav static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
261*b3749f17SPratyush Yadav {
262*b3749f17SPratyush Yadav struct memfd_luo_ser *ser;
263*b3749f17SPratyush Yadav
264*b3749f17SPratyush Yadav if (WARN_ON_ONCE(!args->serialized_data))
265*b3749f17SPratyush Yadav return -EINVAL;
266*b3749f17SPratyush Yadav
267*b3749f17SPratyush Yadav ser = phys_to_virt(args->serialized_data);
268*b3749f17SPratyush Yadav
269*b3749f17SPratyush Yadav /*
270*b3749f17SPratyush Yadav * The pos might have changed since prepare. Everything else stays the
271*b3749f17SPratyush Yadav * same.
272*b3749f17SPratyush Yadav */
273*b3749f17SPratyush Yadav ser->pos = args->file->f_pos;
274*b3749f17SPratyush Yadav
275*b3749f17SPratyush Yadav return 0;
276*b3749f17SPratyush Yadav }
277*b3749f17SPratyush Yadav
memfd_luo_unpreserve(struct liveupdate_file_op_args * args)278*b3749f17SPratyush Yadav static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
279*b3749f17SPratyush Yadav {
280*b3749f17SPratyush Yadav struct inode *inode = file_inode(args->file);
281*b3749f17SPratyush Yadav struct memfd_luo_ser *ser;
282*b3749f17SPratyush Yadav
283*b3749f17SPratyush Yadav if (WARN_ON_ONCE(!args->serialized_data))
284*b3749f17SPratyush Yadav return;
285*b3749f17SPratyush Yadav
286*b3749f17SPratyush Yadav inode_lock(inode);
287*b3749f17SPratyush Yadav shmem_freeze(inode, false);
288*b3749f17SPratyush Yadav
289*b3749f17SPratyush Yadav ser = phys_to_virt(args->serialized_data);
290*b3749f17SPratyush Yadav
291*b3749f17SPratyush Yadav memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
292*b3749f17SPratyush Yadav ser->nr_folios);
293*b3749f17SPratyush Yadav
294*b3749f17SPratyush Yadav kho_unpreserve_free(ser);
295*b3749f17SPratyush Yadav inode_unlock(inode);
296*b3749f17SPratyush Yadav }
297*b3749f17SPratyush Yadav
/*
 * Restore every folio described by @folios_ser from KHO and immediately drop
 * the reference, for a memfd that is not going to be retrieved. Folios that
 * cannot be restored are reported (rate limited) and skipped.
 */
static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	const struct memfd_luo_folio_ser *entry = folios_ser;
	u64 remaining;

	for (remaining = nr_folios; remaining; remaining--, entry++) {
		struct folio *folio;
		phys_addr_t phys;

		/* Slots with a zero pfn describe no folio. */
		if (!entry->pfn)
			continue;

		phys = PFN_PHYS(entry->pfn);
		folio = kho_restore_folio(phys);
		if (folio) {
			folio_put(folio);
			continue;
		}

		pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
				    phys);
	}
}
322*b3749f17SPratyush Yadav
memfd_luo_finish(struct liveupdate_file_op_args * args)323*b3749f17SPratyush Yadav static void memfd_luo_finish(struct liveupdate_file_op_args *args)
324*b3749f17SPratyush Yadav {
325*b3749f17SPratyush Yadav struct memfd_luo_folio_ser *folios_ser;
326*b3749f17SPratyush Yadav struct memfd_luo_ser *ser;
327*b3749f17SPratyush Yadav
328*b3749f17SPratyush Yadav if (args->retrieved)
329*b3749f17SPratyush Yadav return;
330*b3749f17SPratyush Yadav
331*b3749f17SPratyush Yadav ser = phys_to_virt(args->serialized_data);
332*b3749f17SPratyush Yadav if (!ser)
333*b3749f17SPratyush Yadav return;
334*b3749f17SPratyush Yadav
335*b3749f17SPratyush Yadav if (ser->nr_folios) {
336*b3749f17SPratyush Yadav folios_ser = kho_restore_vmalloc(&ser->folios);
337*b3749f17SPratyush Yadav if (!folios_ser)
338*b3749f17SPratyush Yadav goto out;
339*b3749f17SPratyush Yadav
340*b3749f17SPratyush Yadav memfd_luo_discard_folios(folios_ser, ser->nr_folios);
341*b3749f17SPratyush Yadav vfree(folios_ser);
342*b3749f17SPratyush Yadav }
343*b3749f17SPratyush Yadav
344*b3749f17SPratyush Yadav out:
345*b3749f17SPratyush Yadav kho_restore_free(ser);
346*b3749f17SPratyush Yadav }
347*b3749f17SPratyush Yadav
/**
 * memfd_luo_retrieve_folios - re-insert preserved folios into a new memfd.
 * @file:       freshly created shmem file receiving the folios.
 * @folios_ser: serialized folio array restored from KHO.
 * @nr_folios:  number of entries in @folios_ser.
 *
 * For every non-empty entry, restores the folio from KHO, charges it, adds it
 * to the page cache of @file at the recorded index, re-applies the recorded
 * uptodate/dirty flags and accounts it to the inode.
 *
 * On failure, folios already added to the file are left for the file's
 * release to free; the failing folio and all folios not yet processed are
 * restored and released here.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	int err = -EIO;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		long nr_pages;
		u64 index;
		int flags;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_err("Unable to restore folio at physical address: %llx\n",
			       phys);
			/*
			 * err holds 0 from the previous successful iteration;
			 * set it explicitly so the failure is not reported as
			 * success.
			 */
			err = -EIO;
			goto put_folios;
		}
		index = pfolio->index;
		flags = pfolio->flags;
		nr_pages = folio_nr_pages(folio);

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		/*
		 * Account the full size of the folio: a higher order folio
		 * covers more than one page.
		 */
		err = shmem_inode_acct_blocks(inode, nr_pages);
		if (err) {
			pr_err("shmem: failed to account folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		shmem_recalc_inode(inode, nr_pages, 0);
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	return 0;

unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];

		/* Empty slots describe no folio, same as in the loop above. */
		if (!pfolio->pfn)
			continue;

		/* kho_restore_folio() takes a physical address, not a pfn. */
		folio = kho_restore_folio(PFN_PHYS(pfolio->pfn));
		if (folio)
			folio_put(folio);
	}

	return err;
}
434*b3749f17SPratyush Yadav
memfd_luo_retrieve(struct liveupdate_file_op_args * args)435*b3749f17SPratyush Yadav static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
436*b3749f17SPratyush Yadav {
437*b3749f17SPratyush Yadav struct memfd_luo_folio_ser *folios_ser;
438*b3749f17SPratyush Yadav struct memfd_luo_ser *ser;
439*b3749f17SPratyush Yadav struct file *file;
440*b3749f17SPratyush Yadav int err;
441*b3749f17SPratyush Yadav
442*b3749f17SPratyush Yadav ser = phys_to_virt(args->serialized_data);
443*b3749f17SPratyush Yadav if (!ser)
444*b3749f17SPratyush Yadav return -EINVAL;
445*b3749f17SPratyush Yadav
446*b3749f17SPratyush Yadav file = shmem_file_setup("", 0, VM_NORESERVE);
447*b3749f17SPratyush Yadav
448*b3749f17SPratyush Yadav if (IS_ERR(file)) {
449*b3749f17SPratyush Yadav pr_err("failed to setup file: %pe\n", file);
450*b3749f17SPratyush Yadav return PTR_ERR(file);
451*b3749f17SPratyush Yadav }
452*b3749f17SPratyush Yadav
453*b3749f17SPratyush Yadav vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
454*b3749f17SPratyush Yadav file->f_inode->i_size = ser->size;
455*b3749f17SPratyush Yadav
456*b3749f17SPratyush Yadav if (ser->nr_folios) {
457*b3749f17SPratyush Yadav folios_ser = kho_restore_vmalloc(&ser->folios);
458*b3749f17SPratyush Yadav if (!folios_ser) {
459*b3749f17SPratyush Yadav err = -EINVAL;
460*b3749f17SPratyush Yadav goto put_file;
461*b3749f17SPratyush Yadav }
462*b3749f17SPratyush Yadav
463*b3749f17SPratyush Yadav err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
464*b3749f17SPratyush Yadav vfree(folios_ser);
465*b3749f17SPratyush Yadav if (err)
466*b3749f17SPratyush Yadav goto put_file;
467*b3749f17SPratyush Yadav }
468*b3749f17SPratyush Yadav
469*b3749f17SPratyush Yadav args->file = file;
470*b3749f17SPratyush Yadav kho_restore_free(ser);
471*b3749f17SPratyush Yadav
472*b3749f17SPratyush Yadav return 0;
473*b3749f17SPratyush Yadav
474*b3749f17SPratyush Yadav put_file:
475*b3749f17SPratyush Yadav fput(file);
476*b3749f17SPratyush Yadav
477*b3749f17SPratyush Yadav return err;
478*b3749f17SPratyush Yadav }
479*b3749f17SPratyush Yadav
memfd_luo_can_preserve(struct liveupdate_file_handler * handler,struct file * file)480*b3749f17SPratyush Yadav static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
481*b3749f17SPratyush Yadav struct file *file)
482*b3749f17SPratyush Yadav {
483*b3749f17SPratyush Yadav struct inode *inode = file_inode(file);
484*b3749f17SPratyush Yadav
485*b3749f17SPratyush Yadav return shmem_file(file) && !inode->i_nlink;
486*b3749f17SPratyush Yadav }
487*b3749f17SPratyush Yadav
488*b3749f17SPratyush Yadav static const struct liveupdate_file_ops memfd_luo_file_ops = {
489*b3749f17SPratyush Yadav .freeze = memfd_luo_freeze,
490*b3749f17SPratyush Yadav .finish = memfd_luo_finish,
491*b3749f17SPratyush Yadav .retrieve = memfd_luo_retrieve,
492*b3749f17SPratyush Yadav .preserve = memfd_luo_preserve,
493*b3749f17SPratyush Yadav .unpreserve = memfd_luo_unpreserve,
494*b3749f17SPratyush Yadav .can_preserve = memfd_luo_can_preserve,
495*b3749f17SPratyush Yadav .owner = THIS_MODULE,
496*b3749f17SPratyush Yadav };
497*b3749f17SPratyush Yadav
498*b3749f17SPratyush Yadav static struct liveupdate_file_handler memfd_luo_handler = {
499*b3749f17SPratyush Yadav .ops = &memfd_luo_file_ops,
500*b3749f17SPratyush Yadav .compatible = MEMFD_LUO_FH_COMPATIBLE,
501*b3749f17SPratyush Yadav };
502*b3749f17SPratyush Yadav
memfd_luo_init(void)503*b3749f17SPratyush Yadav static int __init memfd_luo_init(void)
504*b3749f17SPratyush Yadav {
505*b3749f17SPratyush Yadav int err = liveupdate_register_file_handler(&memfd_luo_handler);
506*b3749f17SPratyush Yadav
507*b3749f17SPratyush Yadav if (err && err != -EOPNOTSUPP) {
508*b3749f17SPratyush Yadav pr_err("Could not register luo filesystem handler: %pe\n",
509*b3749f17SPratyush Yadav ERR_PTR(err));
510*b3749f17SPratyush Yadav
511*b3749f17SPratyush Yadav return err;
512*b3749f17SPratyush Yadav }
513*b3749f17SPratyush Yadav
514*b3749f17SPratyush Yadav return 0;
515*b3749f17SPratyush Yadav }
516*b3749f17SPratyush Yadav late_initcall(memfd_luo_init);
517