// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Online checking sometimes needs to be able to stage a large amount of data
 * in memory. This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times. In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements. Therefore, the xfile mechanism uses an unlinked shmem file
 * to store our staging data. This file is not installed in the file
 * descriptor table so that user programs cannot access the data, which means
 * that the xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken. Reads
 * and writes are satisfied directly from the page cache.
 *
 * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
 * of a hole cause a page to be mapped into the file. If you are going to
 * create a sparse xfile, please be careful about reading from uninitialized
 * parts of the file. These pages are !Uptodate and will eventually be
 * reclaimed if not written, but in the short term this boosts memory
 * consumption.
 */
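/*
 * For illustration only: a sketch of how a caller might stage fixed-size
 * records in an xfile. The record type, the index variable 'nr', and the
 * error handling are hypothetical; real users such as xfarray layer their
 * own indexing on top of these primitives.
 *
 *	struct xfile	*xf;
 *	struct xchk_rec	rec;
 *	ssize_t		ret;
 *	int		error;
 *
 *	error = xfile_create("record staging", 0, &xf);
 *	if (error)
 *		return error;
 *
 *	// store record nr at a byte offset derived from its index
 *	ret = xfile_pwrite(xf, &rec, sizeof(rec), nr * sizeof(rec));
 *	if (ret != sizeof(rec))
 *		goto out_destroy;
 *
 *	// later, read the record back from the same offset
 *	ret = xfile_pread(xf, &rec, sizeof(rec), nr * sizeof(rec));
 *	if (ret != sizeof(rec))
 *		goto out_destroy;
 *	...
 * out_destroy:
 *	xfile_destroy(xf);
 */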
/*
 * xfiles must not be exposed to userspace and require upper layers to
 * coordinate access to the one handle returned by the constructor, so
 * establish a separate lock class for xfiles to avoid confusing lockdep.
 */
static struct lock_class_key xfile_i_mutex_key;

/*
 * Create an xfile of the given size. The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	loff_t			isize,
	struct xfile		**xfilep)
{
	struct inode		*inode;
	struct xfile		*xf;
	int			error = -ENOMEM;

	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
	if (!xf)
		return -ENOMEM;

	xf->file = shmem_file_setup(description, isize, 0);
	if (!xf->file)
		goto out_xfile;
	if (IS_ERR(xf->file)) {
		error = PTR_ERR(xf->file);
		goto out_xfile;
	}

	/*
	 * We want a large sparse file that we can pread, pwrite, and seek.
	 * xfile users are responsible for keeping the xfile hidden away from
	 * all other callers, so we skip timestamp updates and security checks.
	 * Make the inode only accessible by root, just in case the xfile ever
	 * escapes.
	 */
	xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
			    FMODE_LSEEK;
	xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
	inode = file_inode(xf->file);
	inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
	inode->i_mode &= ~0177;
	inode->i_uid = GLOBAL_ROOT_UID;
	inode->i_gid = GLOBAL_ROOT_GID;

	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);

	trace_xfile_create(xf);

	*xfilep = xf;
	return 0;
out_xfile:
	kfree(xf);
	return error;
}

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct inode		*inode = file_inode(xf->file);

	trace_xfile_destroy(xf);

	lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
	fput(xf->file);
	kfree(xf);
}

/*
 * Read a memory object directly from the xfile's page cache. Unlike regular
 * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
 * high an offset, instead of truncating the read. Otherwise, we return
 * bytes read or an error code, like regular pread.
 */
ssize_t
xfile_pread(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	struct page		*page = NULL;
	ssize_t			read = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pread(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*p, *kaddr;
		unsigned int	len;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * In-kernel reads of a shmem file cause it to allocate a page
		 * if the mapping shows a hole. Therefore, if we hit ENOMEM
		 * we can continue by zeroing the caller's buffer.
		 */
		page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
				__GFP_NOWARN);
		if (IS_ERR(page)) {
			error = PTR_ERR(page);
			if (error != -ENOMEM)
				break;

			memset(buf, 0, len);
			goto advance;
		}

		if (PageUptodate(page)) {
			/*
			 * xfile pages must never be mapped into userspace, so
			 * we skip the dcache flush.
			 */
			kaddr = kmap_local_page(page);
			p = kaddr + offset_in_page(pos);
			memcpy(buf, p, len);
			kunmap_local(kaddr);
		} else {
			memset(buf, 0, len);
		}
		put_page(page);

advance:
		count -= len;
		pos += len;
		buf += len;
		read += len;
	}
	memalloc_nofs_restore(pflags);

	if (read > 0)
		return read;
	return error;
}
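/*
 * Note that xfile_pread and xfile_pwrite return the number of bytes
 * transferred, so short transfers are possible. For illustration only, a
 * caller wanting all-or-nothing semantics for a whole memory object might
 * wrap the read side like this (helper name and errno choice are
 * hypothetical):
 *
 *	static inline int
 *	xchk_xfile_load_obj(struct xfile *xf, void *buf, size_t count,
 *			loff_t pos)
 *	{
 *		ssize_t	ret = xfile_pread(xf, buf, count, pos);
 *
 *		if (ret < 0)
 *			return ret;
 *		if (ret != count)
 *			return -ENODATA;
 *		return 0;
 *	}
 */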
/*
 * Write a memory object directly to the xfile's page cache. Unlike regular
 * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
 * high an offset, instead of truncating the write. Otherwise, we return
 * bytes written or an error code, like regular pwrite.
 */
ssize_t
xfile_pwrite(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	ssize_t			written = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pwrite(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*fsdata = NULL;
		void		*p, *kaddr;
		unsigned int	len;
		int		ret;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * We call write_begin directly here to avoid all the freezer
		 * protection lock-taking that happens in the normal path.
		 * shmem doesn't support fs freeze, but lockdep doesn't know
		 * that and will trip over that.
		 */
		error = aops->write_begin(NULL, mapping, pos, len, &page,
				&fsdata);
		if (error)
			break;

		/*
		 * xfile pages must never be mapped into userspace, so we skip
		 * the dcache flush. If the page is not uptodate, zero it
		 * before writing data.
		 */
		kaddr = kmap_local_page(page);
		if (!PageUptodate(page)) {
			memset(kaddr, 0, PAGE_SIZE);
			SetPageUptodate(page);
		}
		p = kaddr + offset_in_page(pos);
		memcpy(p, buf, len);
		kunmap_local(kaddr);

		ret = aops->write_end(NULL, mapping, pos, len, len, page,
				fsdata);
		if (ret < 0) {
			error = ret;
			break;
		}

		written += ret;
		if (ret != len)
			break;

		count -= ret;
		pos += ret;
		buf += ret;
	}
	memalloc_nofs_restore(pflags);

	if (written > 0)
		return written;
	return error;
}

/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
	struct xfile		*xf,
	loff_t			pos)
{
	loff_t			ret;

	ret = vfs_llseek(xf->file, pos, SEEK_DATA);
	trace_xfile_seek_data(xf, pos, ret);
	return ret;
}
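/*
 * For illustration only: a sparse xfile can be walked by alternating
 * SEEK_DATA lookups with whatever processing the caller needs, skipping over
 * the holes. vfs_llseek returns -ENXIO once no more data exists past the
 * given offset, which ends the loop; the PAGE_SIZE stride is a hypothetical
 * choice that matches shmem's page-granular allocation.
 *
 *	loff_t	pos = 0;
 *
 *	while ((pos = xfile_seek_data(xf, pos)) >= 0) {
 *		// read or process the written region starting at pos
 *		pos += PAGE_SIZE;
 *	}
 */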
/* Query stat information for an xfile. */
int
xfile_stat(
	struct xfile		*xf,
	struct xfile_stat	*statbuf)
{
	struct kstat		ks;
	int			error;

	error = vfs_getattr_nosec(&xf->file->f_path, &ks,
			STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
	if (error)
		return error;

	statbuf->size = ks.size;
	statbuf->bytes = ks.blocks << SECTOR_SHIFT;
	return 0;
}

/*
 * Grab the (locked) page for a memory object. The object cannot span a page
 * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we
 * cannot grab the page, or the usual negative errno.
 */
int
xfile_get_page(
	struct xfile		*xf,
	loff_t			pos,
	unsigned int		len,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	void			*fsdata = NULL;
	loff_t			key = round_down(pos, PAGE_SIZE);
	unsigned int		pflags;
	int			error;

	if (inode->i_sb->s_maxbytes - pos < len)
		return -ENOMEM;
	if (len > PAGE_SIZE - offset_in_page(pos))
		return -ENOTBLK;

	trace_xfile_get_page(xf, pos, len);

	pflags = memalloc_nofs_save();

	/*
	 * We call write_begin directly here to avoid all the freezer
	 * protection lock-taking that happens in the normal path. shmem
	 * doesn't support fs freeze, but lockdep doesn't know that and will
	 * trip over that.
	 */
	error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
			&fsdata);
	if (error)
		goto out_pflags;

	/* We got the page, so make sure we push out EOF. */
	if (i_size_read(inode) < pos + len)
		i_size_write(inode, pos + len);

	/*
	 * If the page isn't up to date, fill it with zeroes before we hand it
	 * to the caller and make sure the backing store will hold on to them.
	 */
	if (!PageUptodate(page)) {
		void	*kaddr;

		kaddr = kmap_local_page(page);
		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
		SetPageUptodate(page);
	}

	/*
	 * Mark each page dirty so that the contents are written to some
	 * backing store when we drop this buffer, and take an extra reference
	 * to prevent the xfile page from being swapped or removed from the
	 * page cache by reclaim if the caller unlocks the page.
	 */
	set_page_dirty(page);
	get_page(page);

	xfpage->page = page;
	xfpage->fsdata = fsdata;
	xfpage->pos = key;
out_pflags:
	memalloc_nofs_restore(pflags);
	return error;
}

/*
 * Release the (locked) page for a memory object. Returns 0 or a negative
 * errno.
 */
int
xfile_put_page(
	struct xfile		*xf,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	unsigned int		pflags;
	int			ret;

	trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);

	/* Give back the reference that we took in xfile_get_page. */
	put_page(xfpage->page);

	pflags = memalloc_nofs_save();
	ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
			xfpage->page, xfpage->fsdata);
	memalloc_nofs_restore(pflags);
	memset(xfpage, 0, sizeof(struct xfile_page));

	if (ret < 0)
		return ret;
	if (ret != PAGE_SIZE)
		return -EIO;
	return 0;
}
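/*
 * For illustration only: a sketch of the xfile_get_page/xfile_put_page
 * pairing. The caller receives a locked, uptodate, dirty page and may update
 * the object in place through a local kernel mapping before releasing it.
 * struct xchk_rec and 'pos' are hypothetical; the object must not cross a
 * page boundary or xfile_get_page returns -ENOTBLK.
 *
 *	struct xfile_page	xfpage;
 *	void			*kaddr;
 *	int			error;
 *
 *	error = xfile_get_page(xf, pos, sizeof(struct xchk_rec), &xfpage);
 *	if (error)
 *		return error;
 *
 *	kaddr = kmap_local_page(xfpage.page);
 *	// update the record at kaddr + offset_in_page(pos)
 *	kunmap_local(kaddr);
 *
 *	error = xfile_put_page(xf, &xfpage);
 */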