xref: /linux/fs/xfs/scrub/xfile.c (revision 3d0fe49454652117522f60bfbefb978ba0e5300b)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Online checking sometimes needs to be able to stage a large amount of data
 * in memory.  This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times.  In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements.  Therefore, the xfile mechanism uses an unlinked shmem file to
 * store our staging data.  This file is not installed in the file descriptor
 * table so that user programs cannot access the data, which means that the
 * xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken.  Reads
 * and writes are satisfied directly from the page cache.
 *
 * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
 * of a hole cause a page to be mapped into the file.  If you are going to
 * create a sparse xfile, please be careful about reading from uninitialized
 * parts of the file.  These pages are !Uptodate and will eventually be
 * reclaimed if not written, but in the short term this boosts memory
 * consumption.
 */

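/*
 * To make the flow above concrete, here is a minimal usage sketch: stage a
 * value in an unlinked shmem file, read it back, and tear everything down.
 * The helper below is purely illustrative and is not part of the xfile API;
 * its name, the staging size, the record value, and the offsets are all
 * made up for the example.
 */
static int __maybe_unused
xfile_usage_sketch(void)
{
	struct xfile	*xf;
	u64		val = 0xABCD;
	u64		readback = 0;
	ssize_t		copied;
	int		error;

	/* Back a (sparse) 1MB staging area with an unlinked shmem file. */
	error = xfile_create("usage sketch", 1U << 20, &xf);
	if (error)
		return error;

	/* Stage one value; a short write is treated as an error here. */
	copied = xfile_pwrite(xf, &val, sizeof(val), 512);
	if (copied != sizeof(val)) {
		error = copied < 0 ? copied : -ENOMEM;
		goto out;
	}

	/* Read it back straight from the page cache. */
	copied = xfile_pread(xf, &readback, sizeof(readback), 512);
	if (copied != sizeof(readback))
		error = copied < 0 ? copied : -ENOMEM;
out:
	/* Release the shmem file and every page staged in it. */
	xfile_destroy(xf);
	return error;
}
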
/*
 * xfiles must not be exposed to userspace and require upper layers to
 * coordinate access to the one handle returned by the constructor, so
 * establish a separate lock class for xfiles to avoid confusing lockdep.
 */
static struct lock_class_key xfile_i_mutex_key;

/*
 * Create an xfile of the given size.  The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	loff_t			isize,
	struct xfile		**xfilep)
{
	struct inode		*inode;
	struct xfile		*xf;
	int			error = -ENOMEM;

	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
	if (!xf)
		return -ENOMEM;

	xf->file = shmem_file_setup(description, isize, 0);
	if (!xf->file)
		goto out_xfile;
	if (IS_ERR(xf->file)) {
		error = PTR_ERR(xf->file);
		goto out_xfile;
	}

	/*
	 * We want a large sparse file that we can pread, pwrite, and seek.
	 * xfile users are responsible for keeping the xfile hidden away from
	 * all other callers, so we skip timestamp updates and security checks.
	 * Make the inode only accessible by root, just in case the xfile ever
	 * escapes.
	 */
	xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
			    FMODE_LSEEK;
	xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
	inode = file_inode(xf->file);
	inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
	inode->i_mode &= ~0177;
	inode->i_uid = GLOBAL_ROOT_UID;
	inode->i_gid = GLOBAL_ROOT_GID;

	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);

	trace_xfile_create(xf);

	*xfilep = xf;
	return 0;
out_xfile:
	kfree(xf);
	return error;
}

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct inode		*inode = file_inode(xf->file);

	trace_xfile_destroy(xf);

	lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
	fput(xf->file);
	kfree(xf);
}

/*
 * Read a memory object directly from the xfile's page cache.  Unlike regular
 * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
 * high an offset, instead of truncating the read.  Otherwise, we return
 * bytes read or an error code, like regular pread.
 */
ssize_t
xfile_pread(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	struct page		*page = NULL;
	ssize_t			read = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pread(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*p, *kaddr;
		unsigned int	len;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * In-kernel reads of a shmem file cause it to allocate a page
		 * if the mapping shows a hole.  Therefore, if we hit ENOMEM
		 * we can continue by zeroing the caller's buffer.
		 */
		page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
				__GFP_NOWARN);
		if (IS_ERR(page)) {
			error = PTR_ERR(page);
			if (error != -ENOMEM)
				break;

			memset(buf, 0, len);
			goto advance;
		}

		if (PageUptodate(page)) {
			/*
			 * xfile pages must never be mapped into userspace, so
			 * we skip the dcache flush.
			 */
			kaddr = kmap_local_page(page);
			p = kaddr + offset_in_page(pos);
			memcpy(buf, p, len);
			kunmap_local(kaddr);
		} else {
			memset(buf, 0, len);
		}
		put_page(page);

advance:
		count -= len;
		pos += len;
		buf += len;
		read += len;
	}
	memalloc_nofs_restore(pflags);

	if (read > 0)
		return read;
	return error;
}

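/*
 * Callers that stage fixed-size records usually want "read all of it or
 * fail", not a short read.  A minimal sketch of such a wrapper is below; the
 * name and the choice of -ENOMEM for a short transfer are illustrative only.
 */
static inline int __maybe_unused
xfile_load_obj_sketch(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	ssize_t			ret = xfile_pread(xf, buf, count, pos);

	if (ret < 0)
		return ret;
	/* A short read means part of the object was never staged. */
	if (ret != count)
		return -ENOMEM;
	return 0;
}
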
/*
 * Write a memory object directly to the xfile's page cache.  Unlike regular
 * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
 * high an offset, instead of truncating the write.  Otherwise, we return
 * bytes written or an error code, like regular pwrite.
 */
ssize_t
xfile_pwrite(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	ssize_t			written = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pwrite(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*fsdata = NULL;
		void		*p, *kaddr;
		unsigned int	len;
		int		ret;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * We call write_begin directly here to avoid all the freezer
		 * protection lock-taking that happens in the normal path.
		 * shmem doesn't support fs freeze, but lockdep doesn't know
		 * that and will trip over it.
		 */
		error = aops->write_begin(NULL, mapping, pos, len, &page,
				&fsdata);
		if (error)
			break;

		/*
		 * xfile pages must never be mapped into userspace, so we skip
		 * the dcache flush.  If the page is not uptodate, zero it
		 * before writing data.
		 */
		kaddr = kmap_local_page(page);
		if (!PageUptodate(page)) {
			memset(kaddr, 0, PAGE_SIZE);
			SetPageUptodate(page);
		}
		p = kaddr + offset_in_page(pos);
		memcpy(p, buf, len);
		kunmap_local(kaddr);

		ret = aops->write_end(NULL, mapping, pos, len, len, page,
				fsdata);
		if (ret < 0) {
			error = ret;
			break;
		}

		written += ret;
		if (ret != len)
			break;

		count -= ret;
		pos += ret;
		buf += ret;
	}
	memalloc_nofs_restore(pflags);

	if (written > 0)
		return written;
	return error;
}

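/*
 * The same pattern applies on the write side: a short write while staging a
 * record generally means the xfile could not be grown, so treat it as an
 * out-of-memory condition.  Again, this is a hypothetical sketch rather than
 * a canonical helper.
 */
static inline int __maybe_unused
xfile_store_obj_sketch(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	ssize_t			ret = xfile_pwrite(xf, buf, count, pos);

	if (ret < 0)
		return ret;
	/* Partial writes leave the object incomplete; report failure. */
	if (ret != count)
		return -ENOMEM;
	return 0;
}
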
/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
	struct xfile		*xf,
	loff_t			pos)
{
	loff_t			ret;

	ret = vfs_llseek(xf->file, pos, SEEK_DATA);
	trace_xfile_seek_data(xf, pos, ret);
	return ret;
}

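/*
 * xfile_seek_data can be used to walk only the written regions of a sparse
 * xfile.  A sketch of such a scan follows; it assumes the usual llseek
 * convention that SEEK_DATA returns a negative errno (-ENXIO) once no more
 * data exists, and the PAGE_SIZE stride is an arbitrary choice here.
 */
static void __maybe_unused
xfile_scan_sketch(
	struct xfile		*xf)
{
	loff_t			pos = 0;

	while ((pos = xfile_seek_data(xf, pos)) >= 0) {
		/* ...examine the written data starting at pos... */

		/* Arbitrary stride; real callers advance by record size. */
		pos += PAGE_SIZE;
	}
}
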
/* Query stat information for an xfile. */
int
xfile_stat(
	struct xfile		*xf,
	struct xfile_stat	*statbuf)
{
	struct kstat		ks;
	int			error;

	error = vfs_getattr_nosec(&xf->file->f_path, &ks,
			STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
	if (error)
		return error;

	statbuf->size = ks.size;
	statbuf->bytes = ks.blocks << SECTOR_SHIFT;
	return 0;
}

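/*
 * One common reason to stat an xfile is to report how much memory the
 * staging data actually consumes, e.g. for tracing or diagnostics.  A small
 * sketch, with a made-up name:
 */
static unsigned long long __maybe_unused
xfile_bytes_sketch(
	struct xfile		*xf)
{
	struct xfile_stat	xs;

	/* On error, report zero bytes rather than a stale value. */
	if (xfile_stat(xf, &xs))
		return 0;

	/* ->bytes is already in byte units; see the conversion above. */
	return xs.bytes;
}
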
/*
 * Grab the (locked) page for a memory object.  The object cannot span a page
 * boundary.  Returns 0 (and a locked page) if successful, -ENOTBLK if we
 * cannot grab the page, or the usual negative errno.
 */
int
xfile_get_page(
	struct xfile		*xf,
	loff_t			pos,
	unsigned int		len,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	void			*fsdata = NULL;
	loff_t			key = round_down(pos, PAGE_SIZE);
	unsigned int		pflags;
	int			error;

	if (inode->i_sb->s_maxbytes - pos < len)
		return -ENOMEM;
	if (len > PAGE_SIZE - offset_in_page(pos))
		return -ENOTBLK;

	trace_xfile_get_page(xf, pos, len);

	pflags = memalloc_nofs_save();

	/*
	 * We call write_begin directly here to avoid all the freezer
	 * protection lock-taking that happens in the normal path.  shmem
	 * doesn't support fs freeze, but lockdep doesn't know that and will
	 * trip over it.
	 */
	error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
			&fsdata);
	if (error)
		goto out_pflags;

	/* We got the page, so make sure we push out EOF. */
	if (i_size_read(inode) < pos + len)
		i_size_write(inode, pos + len);

	/*
	 * If the page isn't up to date, fill it with zeroes before we hand it
	 * to the caller and make sure the backing store will hold on to them.
	 */
	if (!PageUptodate(page)) {
		void	*kaddr;

		kaddr = kmap_local_page(page);
		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
		SetPageUptodate(page);
	}

	/*
	 * Mark each page dirty so that the contents are written to some
	 * backing store when we drop this buffer, and take an extra reference
	 * to prevent the xfile page from being swapped or removed from the
	 * page cache by reclaim if the caller unlocks the page.
	 */
	set_page_dirty(page);
	get_page(page);

	xfpage->page = page;
	xfpage->fsdata = fsdata;
	xfpage->pos = key;
out_pflags:
	memalloc_nofs_restore(pflags);
	return error;
}

/*
 * Release the (locked) page for a memory object.  Returns 0 or a negative
 * errno.
 */
int
xfile_put_page(
	struct xfile		*xf,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	unsigned int		pflags;
	int			ret;

	trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);

	/* Give back the reference that we took in xfile_get_page. */
	put_page(xfpage->page);

	pflags = memalloc_nofs_save();
	ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
			xfpage->page, xfpage->fsdata);
	memalloc_nofs_restore(pflags);
	memset(xfpage, 0, sizeof(struct xfile_page));

	if (ret < 0)
		return ret;
	if (ret != PAGE_SIZE)
		return -EIO;
	return 0;
}

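/*
 * Direct page access is useful when a caller wants to modify an object in
 * place without bouncing it through a separate buffer.  The sketch below
 * pairs xfile_get_page with xfile_put_page to update a few bytes; the object
 * must not straddle a page boundary, and the helper name and parameters are
 * illustrative only.
 */
static int __maybe_unused
xfile_update_sketch(
	struct xfile		*xf,
	loff_t			pos,
	const void		*data,
	unsigned int		len)
{
	struct xfile_page	xfpage;
	void			*kaddr;
	int			error;

	/* Grab and lock the (zero-filled if new) page backing this object. */
	error = xfile_get_page(xf, pos, len, &xfpage);
	if (error)
		return error;

	/* No userspace mappings exist, so a plain memcpy is enough. */
	kaddr = kmap_local_page(xfpage.page);
	memcpy(kaddr + offset_in_page(pos), data, len);
	kunmap_local(kaddr);

	/* Unlock the page and let write_end finish the bookkeeping. */
	return xfile_put_page(xf, &xfpage);
}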