// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Online checking sometimes needs to be able to stage a large amount of data
 * in memory.  This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times.  In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements.  Therefore, the xfile mechanism uses an unlinked shmem file
 * to store our staging data.  This file is not installed in the file
 * descriptor table so that user programs cannot access the data, which means
 * that the xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken.  Reads
 * and writes are satisfied directly from the page cache.
 *
 * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
 * of a hole cause a page to be mapped into the file.  If you are going to
 * create a sparse xfile, please be careful about reading from uninitialized
 * parts of the file.  These pages are !Uptodate and will eventually be
 * reclaimed if not written, but in the short term this boosts memory
 * consumption.
 */
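
/*
 * Example usage (an illustrative sketch, not part of the scrub code): a
 * hypothetical caller stages one fixed-size record in an xfile and reads it
 * back.  The type "xrec" and the function "stage_one_record" are invented
 * for this example; error handling is abbreviated and any locking is assumed
 * to be provided by the caller.
 *
 *	struct xrec { u64 key; u64 value; };
 *
 *	static int stage_one_record(void)
 *	{
 *		struct xrec		rec = { .key = 1, .value = 42 };
 *		struct xrec		out = { };
 *		struct xfile		*xf;
 *		int			error;
 *
 *		error = xfile_create("example staging file", 0, &xf);
 *		if (error)
 *			return error;
 *
 *		if (xfile_pwrite(xf, &rec, sizeof(rec), 0) != sizeof(rec)) {
 *			error = -EIO;
 *			goto out;
 *		}
 *
 *		if (xfile_pread(xf, &out, sizeof(out), 0) != sizeof(out))
 *			error = -EIO;
 *	out:
 *		xfile_destroy(xf);
 *		return error;
 *	}
 */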

/*
 * xfiles must not be exposed to userspace and require upper layers to
 * coordinate access to the one handle returned by the constructor, so
 * establish a separate lock class for xfiles to avoid confusing lockdep.
 */
static struct lock_class_key xfile_i_mutex_key;

/*
 * Create an xfile of the given size.  The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	loff_t			isize,
	struct xfile		**xfilep)
{
	struct inode		*inode;
	struct xfile		*xf;
	int			error = -ENOMEM;

	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
	if (!xf)
		return -ENOMEM;

	xf->file = shmem_file_setup(description, isize, 0);
	if (!xf->file)
		goto out_xfile;
	if (IS_ERR(xf->file)) {
		error = PTR_ERR(xf->file);
		goto out_xfile;
	}

	/*
	 * We want a large sparse file that we can pread, pwrite, and seek.
	 * xfile users are responsible for keeping the xfile hidden away from
	 * all other callers, so we skip timestamp updates and security checks.
	 * Make the inode only accessible by root, just in case the xfile ever
	 * escapes.
	 */
	xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
			    FMODE_LSEEK;
	xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
	inode = file_inode(xf->file);
	inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
	inode->i_mode &= ~0177;
	inode->i_uid = GLOBAL_ROOT_UID;
	inode->i_gid = GLOBAL_ROOT_GID;

	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);

	trace_xfile_create(xf);

	*xfilep = xf;
	return 0;
out_xfile:
	kfree(xf);
	return error;
}

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct inode		*inode = file_inode(xf->file);

	trace_xfile_destroy(xf);

	lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
	fput(xf->file);
	kfree(xf);
}

/*
 * Read a memory object directly from the xfile's page cache.  Unlike regular
 * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
 * high an offset, instead of truncating the read.  Otherwise, we return
 * bytes read or an error code, like regular pread.
 */
ssize_t
xfile_pread(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	struct page		*page = NULL;
	ssize_t			read = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pread(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*p, *kaddr;
		unsigned int	len;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * In-kernel reads of a shmem file cause it to allocate a page
		 * if the mapping shows a hole.  Therefore, if we hit ENOMEM
		 * we can continue by zeroing the caller's buffer.
		 */
		page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
				__GFP_NOWARN);
		if (IS_ERR(page)) {
			error = PTR_ERR(page);
			if (error != -ENOMEM)
				break;

			memset(buf, 0, len);
			goto advance;
		}

		if (PageUptodate(page)) {
			/*
			 * xfile pages must never be mapped into userspace, so
			 * we skip the dcache flush.
			 */
			kaddr = kmap_local_page(page);
			p = kaddr + offset_in_page(pos);
			memcpy(buf, p, len);
			kunmap_local(kaddr);
		} else {
			memset(buf, 0, len);
		}
		put_page(page);

advance:
		count -= len;
		pos += len;
		buf += len;
		read += len;
	}
	memalloc_nofs_restore(pflags);

	if (read > 0)
		return read;
	return error;
}
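
/*
 * Example (an illustrative sketch, not part of this file): xfile_pread does
 * not retry short transfers, so a hypothetical helper that must load an
 * entire object would compare the return value against the requested length
 * and turn anything else into an error.  The helper name and the choice of
 * -ENODATA are invented for this example.
 *
 *	static int xfile_load_whole_obj(struct xfile *xf, void *buf,
 *			size_t count, loff_t pos)
 *	{
 *		ssize_t		ret = xfile_pread(xf, buf, count, pos);
 *
 *		if (ret < 0)
 *			return ret;
 *		if (ret != count)
 *			return -ENODATA;
 *		return 0;
 *	}
 */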

/*
 * Write a memory object directly to the xfile's page cache.  Unlike regular
 * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
 * high an offset, instead of truncating the write.  Otherwise, we return
 * bytes written or an error code, like regular pwrite.
 */
ssize_t
xfile_pwrite(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	ssize_t			written = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pwrite(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*fsdata = NULL;
		void		*p, *kaddr;
		unsigned int	len;
		int		ret;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * We call write_begin directly here to avoid all the freezer
		 * protection lock-taking that happens in the normal path.
		 * shmem doesn't support fs freeze, but lockdep doesn't know
		 * that and will trip over it.
		 */
		error = aops->write_begin(NULL, mapping, pos, len, &page,
				&fsdata);
		if (error)
			break;

		/*
		 * xfile pages must never be mapped into userspace, so we skip
		 * the dcache flush.  If the page is not uptodate, zero it
		 * before writing data.
		 */
		kaddr = kmap_local_page(page);
		if (!PageUptodate(page)) {
			memset(kaddr, 0, PAGE_SIZE);
			SetPageUptodate(page);
		}
		p = kaddr + offset_in_page(pos);
		memcpy(p, buf, len);
		kunmap_local(kaddr);

		ret = aops->write_end(NULL, mapping, pos, len, len, page,
				fsdata);
		if (ret < 0) {
			error = ret;
			break;
		}

		written += ret;
		if (ret != len)
			break;

		count -= ret;
		pos += ret;
		buf += ret;
	}
	memalloc_nofs_restore(pflags);

	if (written > 0)
		return written;
	return error;
}

/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
	struct xfile		*xf,
	loff_t			pos)
{
	loff_t			ret;

	ret = vfs_llseek(xf->file, pos, SEEK_DATA);
	trace_xfile_seek_data(xf, pos, ret);
	return ret;
}

/* Query stat information for an xfile. */
int
xfile_stat(
	struct xfile		*xf,
	struct xfile_stat	*statbuf)
{
	struct kstat		ks;
	int			error;

	error = vfs_getattr_nosec(&xf->file->f_path, &ks,
			STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
	if (error)
		return error;

	statbuf->size = ks.size;
	statbuf->bytes = ks.blocks << SECTOR_SHIFT;
	return 0;
}
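
/*
 * Example (an illustrative sketch, not part of this file): a hypothetical
 * walker that visits each data page of a sparse xfile by combining
 * xfile_seek_data with a per-page callback.  As with regular lseek, SEEK_DATA
 * is expected to return -ENXIO when there is no more data at or beyond the
 * given offset, which terminates the loop.  The names in this sketch are
 * invented for the example.
 *
 *	static int walk_xfile_data_pages(struct xfile *xf,
 *			int (*fn)(struct xfile *xf, loff_t pos))
 *	{
 *		loff_t		pos = 0;
 *		int		error;
 *
 *		for (;;) {
 *			pos = xfile_seek_data(xf, pos);
 *			if (pos == -ENXIO)
 *				return 0;
 *			if (pos < 0)
 *				return pos;
 *
 *			error = fn(xf, pos);
 *			if (error)
 *				return error;
 *
 *			pos = round_up(pos + 1, PAGE_SIZE);
 *		}
 *	}
 */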

/*
 * Grab the (locked) page for a memory object.  The object cannot span a page
 * boundary.  Returns 0 (and a locked page) if successful, -ENOTBLK if we
 * cannot grab the page, or the usual negative errno.
 */
int
xfile_get_page(
	struct xfile		*xf,
	loff_t			pos,
	unsigned int		len,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	void			*fsdata = NULL;
	loff_t			key = round_down(pos, PAGE_SIZE);
	unsigned int		pflags;
	int			error;

	if (inode->i_sb->s_maxbytes - pos < len)
		return -ENOMEM;
	if (len > PAGE_SIZE - offset_in_page(pos))
		return -ENOTBLK;

	trace_xfile_get_page(xf, pos, len);

	pflags = memalloc_nofs_save();

	/*
	 * We call write_begin directly here to avoid all the freezer
	 * protection lock-taking that happens in the normal path.  shmem
	 * doesn't support fs freeze, but lockdep doesn't know that and will
	 * trip over it.
	 */
	error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
			&fsdata);
	if (error)
		goto out_pflags;

	/* We got the page, so make sure we push out EOF. */
	if (i_size_read(inode) < pos + len)
		i_size_write(inode, pos + len);

	/*
	 * If the page isn't up to date, fill it with zeroes before we hand it
	 * to the caller and make sure the backing store will hold on to them.
	 */
	if (!PageUptodate(page)) {
		void	*kaddr;

		kaddr = kmap_local_page(page);
		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
		SetPageUptodate(page);
	}

	/*
	 * Mark each page dirty so that the contents are written to some
	 * backing store when we drop this buffer, and take an extra reference
	 * to prevent the xfile page from being swapped or removed from the
	 * page cache by reclaim if the caller unlocks the page.
	 */
	set_page_dirty(page);
	get_page(page);

	xfpage->page = page;
	xfpage->fsdata = fsdata;
	xfpage->pos = key;
out_pflags:
	memalloc_nofs_restore(pflags);
	return error;
}

/*
 * Release the (locked) page for a memory object.  Returns 0 or a negative
 * errno.
 */
int
xfile_put_page(
	struct xfile		*xf,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	unsigned int		pflags;
	int			ret;

	trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);

	/* Give back the reference that we took in xfile_get_page. */
	put_page(xfpage->page);

	pflags = memalloc_nofs_save();
	ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
			xfpage->page, xfpage->fsdata);
	memalloc_nofs_restore(pflags);
	memset(xfpage, 0, sizeof(struct xfile_page));

	if (ret < 0)
		return ret;
	if (ret != PAGE_SIZE)
		return -EIO;
	return 0;
}
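
/*
 * Example (an illustrative sketch, not part of this file): direct access to
 * a small object via xfile_get_page and xfile_put_page.  The object must fit
 * within a single page, the page comes back locked, and every successful
 * xfile_get_page must be paired with xfile_put_page.  The function name and
 * the u64-counter layout are invented for this example.
 *
 *	static int bump_xfile_counter(struct xfile *xf, loff_t pos)
 *	{
 *		struct xfile_page	xfpage;
 *		void			*kaddr;
 *		u64			*counter;
 *		int			error;
 *
 *		error = xfile_get_page(xf, pos, sizeof(u64), &xfpage);
 *		if (error)
 *			return error;
 *
 *		kaddr = kmap_local_page(xfpage.page);
 *		counter = kaddr + offset_in_page(pos);
 *		(*counter)++;
 *		kunmap_local(kaddr);
 *
 *		return xfile_put_page(xf, &xfpage);
 *	}
 */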