/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2024 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/filio.h>
#include <sys/dnlc.h>

#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_log.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>
#include <sys/sysinfo.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/model.h>
#include <sys/policy.h>

#include "fs/fs_subr.h"

/*
 * ufs_fioio is the ufs equivalent of NFS_CNVT and is tailored to
 * metamucil's needs.  It may change at any time.
 */
/* ARGSUSED */
int
ufs_fioio(
	struct vnode *vp,		/* any file on the fs */
	struct fioio *fiou,		/* fioio struct in userland */
	int flag,			/* flag from VOP_IOCTL() */
	struct cred *cr)		/* credentials from ufs_ioctl */
{
	int error = 0;
	struct vnode *vpio = NULL;	/* vnode for inode open */
	struct inode *ipio = NULL;	/* inode for inode open */
	struct file *fpio = NULL;	/* file for inode open */
	struct inode *ip;		/* inode for file system */
	struct fs *fs;			/* fs for file system */
	STRUCT_DECL(fioio, fio);	/* copy of user's fioio struct */

	/*
	 * must be privileged
	 */
	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
		return (EPERM);

	STRUCT_INIT(fio, flag & DATAMODEL_MASK);

	/*
	 * get user's copy of fioio struct
	 */
	if (copyin(fiou, STRUCT_BUF(fio), STRUCT_SIZE(fio)))
		return (EFAULT);

	ip = VTOI(vp);
	fs = ip->i_fs;

	/*
	 * check the inode number against the fs's inode number bounds
	 */
	if (STRUCT_FGET(fio, fio_ino) < UFSROOTINO)
		return (ESRCH);
	if (STRUCT_FGET(fio, fio_ino) >= fs->fs_ncg * fs->fs_ipg)
		return (ESRCH);

	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);

	/*
	 * get the inode
	 */
	error = ufs_iget(ip->i_vfs, STRUCT_FGET(fio, fio_ino), &ipio, cr);

	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);

	if (error)
		return (error);

	/*
	 * check the generation number
	 */
	rw_enter(&ipio->i_contents, RW_READER);
	if (ipio->i_gen != STRUCT_FGET(fio, fio_gen)) {
		error = ESTALE;
		rw_exit(&ipio->i_contents);
		goto errout;
	}

	/*
	 * check if the inode is free
	 */
	if (ipio->i_mode == 0) {
		error = ENOENT;
		rw_exit(&ipio->i_contents);
		goto errout;
	}
	rw_exit(&ipio->i_contents);

	/*
	 * Adapted from copen: get a file struct
	 * Large Files: We open this file descriptor with FOFFMAX flag
	 * set so that it will be like a large file open.
	 */
	if (falloc(NULL, (FREAD|FOFFMAX), &fpio, STRUCT_FADDR(fio, fio_fd)))
		goto errout;

	/*
	 * Adapted from vn_open: check access and then open the file
	 */
	vpio = ITOV(ipio);
	if (error = VOP_ACCESS(vpio, VREAD, 0, cr, NULL))
		goto errout;

	if (error = VOP_OPEN(&vpio, FREAD, cr, NULL))
		goto errout;

	/*
	 * Adapted from copen: initialize the file struct
	 */
	fpio->f_vnode = vpio;

	/*
	 * return the fd
	 */
	if (copyout(STRUCT_BUF(fio), fiou, STRUCT_SIZE(fio))) {
		error = EFAULT;
		goto errout;
	}
	setf(STRUCT_FGET(fio, fio_fd), fpio);
	mutex_exit(&fpio->f_tlock);
	return (0);
errout:
	/*
	 * free the file struct and fd
	 */
	if (fpio) {
		setf(STRUCT_FGET(fio, fio_fd), NULL);
		unfalloc(fpio);
	}

	/*
	 * release the hold on the inode
	 */
	if (ipio)
		VN_RELE(ITOV(ipio));
	return (error);
}

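/*
 * Typical userland usage of the above (illustrative sketch only; assumes
 * the _FIOIO ioctl command and the fioio structure are visible to
 * applications through <sys/filio.h> and <sys/fs/ufs_filio.h>): given an
 * inode number and generation pair, the ioctl hands back a read-only,
 * large-file-aware file descriptor for that inode.
 *
 *	struct fioio fio = { 0 };
 *
 *	fio.fio_ino = ino;
 *	fio.fio_gen = gen;
 *	if (ioctl(fd_on_same_fs, _FIOIO, &fio) == 0)
 *		inode_fd = fio.fio_fd;
 */
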
/*
 * ufs_fiosatime
 *	set access time w/o altering change time.  This ioctl is tailored
 *	to metamucil's needs and may change at any time.
 */
int
ufs_fiosatime(
	struct vnode *vp,		/* file's vnode */
	struct timeval *tvu,		/* struct timeval in userland */
	int flag,			/* flag from VOP_IOCTL() */
	struct cred *cr)		/* credentials from ufs_ioctl */
{
	struct inode *ip;		/* inode for vp */
	struct timeval32 tv;		/* copy of user's timeval */
	int now = 0;

	/*
	 * must have sufficient privileges
	 */
	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
		return (EPERM);

	/*
	 * get user's copy of timeval struct and check values
	 * if input is NULL, will set time to now
	 */
	if (tvu == NULL) {
		now = 1;
	} else {
		if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
			if (copyin(tvu, &tv, sizeof (tv)))
				return (EFAULT);
		} else {
			struct timeval tv64;

			if (copyin(tvu, &tv64, sizeof (tv64)))
				return (EFAULT);
			if (TIMEVAL_OVERFLOW(&tv64))
				return (EOVERFLOW);
			TIMEVAL_TO_TIMEVAL32(&tv, &tv64);
		}

		if (tv.tv_usec < 0 || tv.tv_usec >= 1000000)
			return (EINVAL);
	}

	/*
	 * update access time
	 */
	ip = VTOI(vp);
	rw_enter(&ip->i_contents, RW_WRITER);
	ITIMES_NOLOCK(ip);
	if (now) {
		mutex_enter(&ufs_iuniqtime_lock);
		ip->i_atime = iuniqtime;
		mutex_exit(&ufs_iuniqtime_lock);
	} else {
		ip->i_atime = tv;
	}
	ip->i_flag |= IMODACC;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * ufs_fiogdio
 *	Get delayed-io state.  This ioctl is tailored
 *	to metamucil's needs and may change at any time.
 */
/* ARGSUSED */
int
ufs_fiogdio(
	struct vnode *vp,		/* file's vnode */
	uint_t *diop,			/* dio state returned here */
	int flag,			/* flag from ufs_ioctl */
	struct cred *cr)		/* credentials from ufs_ioctl */
{
	struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;

	/*
	 * forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	if (suword32(diop, ufsvfsp->vfs_dio))
		return (EFAULT);
	return (0);
}

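/*
 * Typical userland usage of the delayed-I/O state ioctls (illustrative
 * sketch only; assumes the _FIOGDIO and _FIOSDIO commands from
 * <sys/filio.h>; the set side is handled by ufs_fiosdio() below):
 *
 *	uint_t dio;
 *
 *	if (ioctl(fd, _FIOGDIO, &dio) == 0 && dio == 0) {
 *		dio = 1;
 *		(void) ioctl(fd, _FIOSDIO, &dio);
 *	}
 */
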
/*
 * ufs_fiosdio
 *	Set delayed-io state.  This ioctl is tailored
 *	to metamucil's needs and may change at any time.
 */
int
ufs_fiosdio(
	struct vnode *vp,		/* file's vnode */
	uint_t *diop,			/* dio flag */
	int flag,			/* flag from ufs_ioctl */
	struct cred *cr)		/* credentials from ufs_ioctl */
{
	uint_t dio;			/* copy of user's dio */
	struct inode *ip;		/* inode for vp */
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	struct ulockfs *ulp;
	int error = 0;

#ifdef lint
	flag = flag;
#endif

	/* check input conditions */
	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
		return (EPERM);

	if (copyin(diop, &dio, sizeof (dio)))
		return (EFAULT);

	if (dio > 1)
		return (EINVAL);

	/* file system has been forcibly unmounted */
	if (VTOI(vp)->i_ufsvfs == NULL)
		return (EIO);

	ip = VTOI(vp);
	ufsvfsp = ip->i_ufsvfs;
	ulp = &ufsvfsp->vfs_ulockfs;

	/* logging file system; dio ignored */
	if (TRANS_ISTRANS(ufsvfsp))
		return (error);

	/* hold the mutex to prevent race with a lockfs request */
	vfs_lock_wait(vp->v_vfsp);
	mutex_enter(&ulp->ul_lock);
	atomic_inc_ulong(&ufs_quiesce_pend);

	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto out;
	}

	if (ULOCKFS_IS_ELOCK(ulp)) {
		error = EBUSY;
		goto out;
	}
	/* wait for outstanding accesses to finish */
	if (error = ufs_quiesce(ulp))
		goto out;

	/* flush w/invalidate */
	if (error = ufs_flush(vp->v_vfsp))
		goto out;

	/*
	 * update dio
	 */
	mutex_enter(&ufsvfsp->vfs_lock);
	ufsvfsp->vfs_dio = dio;

	/*
	 * enable/disable clean flag processing
	 */
	fs = ip->i_fs;
	if (fs->fs_ronly == 0 &&
	    fs->fs_clean != FSBAD &&
	    fs->fs_clean != FSLOG) {
		if (dio)
			fs->fs_clean = FSSUSPEND;
		else
			fs->fs_clean = FSACTIVE;
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
	} else
		mutex_exit(&ufsvfsp->vfs_lock);
out:
	/*
	 * we need this broadcast because of the ufs_quiesce call above
	 */
	atomic_dec_ulong(&ufs_quiesce_pend);
	cv_broadcast(&ulp->ul_cv);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vp->v_vfsp);
	return (error);
}

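/*
 * Typical userland trigger for the single-filesystem flush below
 * (illustrative sketch only; assumes the _FIOFFS command from
 * <sys/filio.h>, reached through ufs_ioctl()):
 *
 *	(void) ioctl(fd, _FIOFFS, 0);
 */
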
/*
 * ufs_fioffs - common function for VFS and ioctl entry points to flush a single
 * file system.
 */
int
ufs_fioffs(vfs_t *vfsp, cred_t *cr)
{
	int error;
	struct ufsvfs *ufsvfsp;
	struct ulockfs *ulp;

	/* file system has been forcibly unmounted */
	if ((vfsp->vfs_flag & VFS_UNMOUNTED) != 0)
		return (EIO);

	ufsvfsp = vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * suspend the delete thread
	 *	this must be done outside the lockfs locking protocol
	 */
	vfs_lock_wait(vfsp);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	/* hold the mutex to prevent race with a lockfs request */
	mutex_enter(&ulp->ul_lock);
	atomic_inc_ulong(&ufs_quiesce_pend);

	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto out;
	}
	if (ULOCKFS_IS_ELOCK(ulp)) {
		error = EBUSY;
		goto out;
	}
	/* wait for outstanding accesses to finish */
	if (error = ufs_quiesce(ulp))
		goto out;

	/*
	 * If logging, and the logmap was marked as not rollable,
	 * make it rollable now, and start the trans_roll thread and
	 * the reclaim thread.  The log at this point is safe to write to.
	 */
	if (ufsvfsp->vfs_log) {
		ml_unit_t *ul = ufsvfsp->vfs_log;
		struct fs *fsp = ufsvfsp->vfs_fs;
		int err;

		if (ul->un_flags & LDL_NOROLL) {
			ul->un_flags &= ~LDL_NOROLL;
			logmap_start_roll(ul);
			if (!fsp->fs_ronly && (fsp->fs_reclaim &
			    (FS_RECLAIM|FS_RECLAIMING))) {
				fsp->fs_reclaim &= ~FS_RECLAIM;
				fsp->fs_reclaim |= FS_RECLAIMING;
				ufs_thread_start(&ufsvfsp->vfs_reclaim,
				    ufs_thread_reclaim, vfsp);
				if (!fsp->fs_ronly) {
					TRANS_SBWRITE(ufsvfsp,
					    TOP_SBUPDATE_UPDATE);
					if (err =
					    geterror(ufsvfsp->vfs_bufp)) {
						refstr_t *mntpt;
						mntpt = vfs_getmntpoint(vfsp);
						cmn_err(CE_NOTE,
						    "Filesystem Flush "
						    "Failed to update "
						    "Reclaim Status for "
						    " %s, Write failed to "
						    "update superblock, "
						    "error %d",
						    refstr_value(mntpt),
						    err);
						refstr_rele(mntpt);
					}
				}
			}
		}
	}

	/* synchronously flush dirty data and metadata */
	error = ufs_flush(vfsp);

out:
	atomic_dec_ulong(&ufs_quiesce_pend);
	cv_broadcast(&ulp->ul_cv);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * allow the delete thread to continue
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	return (error);
}

/*
 * ufs_fioisbusy
 *	Get number of references on this vnode.
 *	Contract-private interface for Legato's NetWorker product.
 */
/* ARGSUSED */
int
ufs_fioisbusy(struct vnode *vp, int *isbusy, struct cred *cr)
{
	int is_it_busy;

	/*
	 * The caller holds one reference, there may be one in the dnlc
	 * so we need to flush it.
	 */
	if (vp->v_count > 1)
		dnlc_purge_vp(vp);
	/*
	 * Since we've just flushed the dnlc and we hold a reference
	 * to this vnode, then anything but 1 means busy (this had
	 * BETTER not be zero!). Also, it's possible for someone to
	 * have this file mmap'ed with no additional reference count.
	 */
	ASSERT(vp->v_count > 0);
	if ((vp->v_count == 1) && (VTOI(vp)->i_mapcnt == 0))
		is_it_busy = 0;
	else
		is_it_busy = 1;

	if (suword32(isbusy, is_it_busy))
		return (EFAULT);
	return (0);
}

/* ARGSUSED */
int
ufs_fiodirectio(struct vnode *vp, int cmd, struct cred *cr)
{
	int error = 0;
	struct inode *ip = VTOI(vp);

	/*
	 * Acquire reader lock and set/reset direct mode
	 */
	rw_enter(&ip->i_contents, RW_READER);
	mutex_enter(&ip->i_tlock);
	if (cmd == DIRECTIO_ON)
		ip->i_flag |= IDIRECTIO;	/* enable direct mode */
	else if (cmd == DIRECTIO_OFF)
		ip->i_flag &= ~IDIRECTIO;	/* disable direct mode */
	else
		error = EINVAL;
	mutex_exit(&ip->i_tlock);
	rw_exit(&ip->i_contents);
	return (error);
}

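/*
 * Typical userland path to the above (illustrative sketch only): applications
 * normally toggle direct I/O through directio(3C), which is expected to issue
 * the corresponding _FIODIRECTIO ioctl with DIRECTIO_ON or DIRECTIO_OFF:
 *
 *	if (directio(fd, DIRECTIO_ON) < 0)
 *		...	(fall back to buffered I/O)
 */
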
/*
 * ufs_fiotune
 *	Allow some tunables to be set on a mounted fs
 */
int
ufs_fiotune(struct vnode *vp, struct fiotune *uftp, struct cred *cr)
{
	struct fiotune ftp;
	struct fs *fs;
	struct ufsvfs *ufsvfsp;

	/*
	 * must have sufficient privileges
	 */
	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
		return (EPERM);

	/*
	 * get user's copy
	 */
	if (copyin(uftp, &ftp, sizeof (ftp)))
		return (EFAULT);

	/*
	 * some minimal sanity checks
	 */
	if ((ftp.maxcontig <= 0) ||
	    (ftp.rotdelay != 0) ||
	    (ftp.maxbpg <= 0) ||
	    (ftp.minfree < 0) ||
	    (ftp.minfree > 99) ||
	    ((ftp.optim != FS_OPTTIME) && (ftp.optim != FS_OPTSPACE)))
		return (EINVAL);

	/*
	 * update superblock but don't write it!  If it gets out, fine.
	 */
	fs = VTOI(vp)->i_fs;

	fs->fs_maxcontig = ftp.maxcontig;
	fs->fs_rotdelay = ftp.rotdelay;
	fs->fs_maxbpg = ftp.maxbpg;
	fs->fs_minfree = ftp.minfree;
	fs->fs_optim = ftp.optim;

	/*
	 * Adjust cluster based on the new maxcontig. The cluster size
	 * can be any positive value. The check for this is done above.
	 */
	ufsvfsp = VTOI(vp)->i_ufsvfs;
	ufsvfsp->vfs_ioclustsz = fs->fs_bsize * fs->fs_maxcontig;

	/*
	 * Adjust minfrags from minfree
	 */
	ufsvfsp->vfs_minfrags = (int)((int64_t)fs->fs_dsize *
	    fs->fs_minfree / 100);

	/*
	 * Write the superblock
	 */
	if (fs->fs_ronly == 0) {
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE,
		    TOP_SBWRITE_SIZE);
		TRANS_SBWRITE(ufsvfsp, TOP_SBUPDATE_UPDATE);
		TRANS_END_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE, TOP_SBWRITE_SIZE);
	}

	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
int
ufs_fio_holey(vnode_t *vp, int cmd, offset_t *off)
{
	inode_t *ip = VTOI(vp);
	u_offset_t noff = (u_offset_t)*off; /* new offset */
	u_offset_t isz;
	int error;
	boolean_t hole;

	rw_enter(&ip->i_contents, RW_READER);
	isz = ip->i_size;
	if (noff >= isz) {
		rw_exit(&ip->i_contents);
		return (ENXIO);
	}

	/*
	 * Check for the usual case where a file has no holes.
	 * If so we can optimise to set the end of the file as the first
	 * (virtual) hole. This avoids bmap_find() searching through
	 * every block in the file for a (non-existent) hole.
	 */
	if (!bmap_has_holes(ip)) {
		rw_exit(&ip->i_contents);
		if (cmd == _FIO_SEEK_HOLE) {
			*off = isz;
			return (0);
		}
		/* *off must already point to valid data (non hole) */
		return (0);
	}

	/*
	 * Calling bmap_read() one block at a time on a 1TB file takes forever,
	 * so we use a special function to search for holes or blocks.
	 */
	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;
	error = bmap_find(ip, hole, &noff);
	rw_exit(&ip->i_contents);

	/* end of file? */
	if (error == ENXIO) {
		/*
		 * Handle the virtual hole at the end of file.
		 */
		if (cmd == _FIO_SEEK_HOLE) {
			*off = isz;
			return (0);
		}
		return (ENXIO);
	}
	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

int
ufs_mark_compressed(struct vnode *vp)
{
	struct inode *ip = VTOI(vp);
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;

	if (vp->v_type != VREG)
		return (EINVAL);

	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_cflags |= ICOMPRESS;
	TRANS_INODE(ufsvfsp, ip);
	ip->i_flag |= (ICHG|ISEQ);
	ip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(ip, I_ASYNC);
	rw_exit(&ip->i_contents);

	return (0);
}
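
/*
 * Typical userland view of ufs_fio_holey() above (illustrative sketch only):
 * the _FIO_SEEK_HOLE and _FIO_SEEK_DATA commands are expected to back the
 * SEEK_HOLE and SEEK_DATA whence values of lseek(2), e.g.
 *
 *	off_t hole = lseek(fd, (off_t)0, SEEK_HOLE);
 *
 * An offset at or beyond end-of-file yields ENXIO, and a file without holes
 * reports a single virtual hole at its end.
 */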