1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/t_lock.h> 31 #include <sys/param.h> 32 #include <sys/time.h> 33 #include <sys/systm.h> 34 #include <sys/sysmacros.h> 35 #include <sys/resource.h> 36 #include <sys/signal.h> 37 #include <sys/cred.h> 38 #include <sys/user.h> 39 #include <sys/buf.h> 40 #include <sys/vfs.h> 41 #include <sys/vnode.h> 42 #include <sys/proc.h> 43 #include <sys/disp.h> 44 #include <sys/file.h> 45 #include <sys/fcntl.h> 46 #include <sys/flock.h> 47 #include <sys/atomic.h> 48 #include <sys/kmem.h> 49 #include <sys/uio.h> 50 #include <sys/conf.h> 51 #include <sys/mman.h> 52 #include <sys/pathname.h> 53 #include <sys/debug.h> 54 #include <sys/vmmeter.h> 55 #include <sys/vmsystm.h> 56 #include <sys/cmn_err.h> 57 #include <sys/vtrace.h> 58 #include <sys/filio.h> 59 #include <sys/dnlc.h> 60 61 #include <sys/fs/ufs_filio.h> 62 #include <sys/fs/ufs_lockfs.h> 63 #include <sys/fs/ufs_fs.h> 64 #include <sys/fs/ufs_inode.h> 65 #include <sys/fs/ufs_fsdir.h> 66 #include <sys/fs/ufs_quota.h> 67 #include <sys/fs/ufs_trans.h> 68 #include <sys/fs/ufs_log.h> 69 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */ 70 #include <sys/errno.h> 71 #include <sys/sysinfo.h> 72 73 #include <vm/hat.h> 74 #include <vm/page.h> 75 #include <vm/pvn.h> 76 #include <vm/as.h> 77 #include <vm/seg.h> 78 #include <vm/seg_map.h> 79 #include <vm/seg_vn.h> 80 #include <vm/rm.h> 81 #include <sys/swap.h> 82 #include <sys/model.h> 83 #include <sys/policy.h> 84 85 #include "fs/fs_subr.h" 86 87 /* 88 * ufs_fioio is the ufs equivalent of NFS_CNVT and is tailored to 89 * metamucil's needs. It may change at any time. 90 */ 91 /* ARGSUSED */ 92 int 93 ufs_fioio( 94 struct vnode *vp, /* any file on the fs */ 95 struct fioio *fiou, /* fioio struct in userland */ 96 int flag, /* flag from VOP_IOCTL() */ 97 struct cred *cr) /* credentials from ufs_ioctl */ 98 { 99 int error = 0; 100 struct vnode *vpio = NULL; /* vnode for inode open */ 101 struct inode *ipio = NULL; /* inode for inode open */ 102 struct file *fpio = NULL; /* file for inode open */ 103 struct inode *ip; /* inode for file system */ 104 struct fs *fs; /* fs for file system */ 105 STRUCT_DECL(fioio, fio); /* copy of user's fioio struct */ 106 107 /* 108 * must be privileged 109 */ 110 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0) 111 return (EPERM); 112 113 STRUCT_INIT(fio, flag & DATAMODEL_MASK); 114 115 /* 116 * get user's copy of fioio struct 117 */ 118 if (copyin(fiou, STRUCT_BUF(fio), STRUCT_SIZE(fio))) 119 return (EFAULT); 120 121 ip = VTOI(vp); 122 fs = ip->i_fs; 123 124 /* 125 * check the inode number against the fs's inode number bounds 126 */ 127 if (STRUCT_FGET(fio, fio_ino) < UFSROOTINO) 128 return (ESRCH); 129 if (STRUCT_FGET(fio, fio_ino) >= fs->fs_ncg * fs->fs_ipg) 130 return (ESRCH); 131 132 rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER); 133 134 /* 135 * get the inode 136 */ 137 error = ufs_iget(ip->i_vfs, STRUCT_FGET(fio, fio_ino), &ipio, cr); 138 139 rw_exit(&ip->i_ufsvfs->vfs_dqrwlock); 140 141 if (error) 142 return (error); 143 144 /* 145 * check the generation number 146 */ 147 rw_enter(&ipio->i_contents, RW_READER); 148 if (ipio->i_gen != STRUCT_FGET(fio, fio_gen)) { 149 error = ESTALE; 150 rw_exit(&ipio->i_contents); 151 goto errout; 152 } 153 154 /* 155 * check if the inode is free 156 */ 157 if (ipio->i_mode == 0) { 158 error = ENOENT; 159 rw_exit(&ipio->i_contents); 160 goto errout; 161 } 162 rw_exit(&ipio->i_contents); 163 164 /* 165 * Adapted from copen: get a file struct 166 * Large Files: We open this file descriptor with FOFFMAX flag 167 * set so that it will be like a large file open. 168 */ 169 if (falloc(NULL, (FREAD|FOFFMAX), &fpio, STRUCT_FADDR(fio, fio_fd))) 170 goto errout; 171 172 /* 173 * Adapted from vn_open: check access and then open the file 174 */ 175 vpio = ITOV(ipio); 176 if (error = VOP_ACCESS(vpio, VREAD, 0, cr)) 177 goto errout; 178 179 if (error = VOP_OPEN(&vpio, FREAD, cr)) 180 goto errout; 181 182 /* 183 * Adapted from copen: initialize the file struct 184 */ 185 fpio->f_vnode = vpio; 186 187 /* 188 * return the fd 189 */ 190 if (copyout(STRUCT_BUF(fio), fiou, STRUCT_SIZE(fio))) { 191 error = EFAULT; 192 goto errout; 193 } 194 setf(STRUCT_FGET(fio, fio_fd), fpio); 195 mutex_exit(&fpio->f_tlock); 196 return (0); 197 errout: 198 /* 199 * free the file struct and fd 200 */ 201 if (fpio) { 202 setf(STRUCT_FGET(fio, fio_fd), NULL); 203 unfalloc(fpio); 204 } 205 206 /* 207 * release the hold on the inode 208 */ 209 if (ipio) 210 VN_RELE(ITOV(ipio)); 211 return (error); 212 } 213 214 /* 215 * ufs_fiosatime 216 * set access time w/o altering change time. This ioctl is tailored 217 * to metamucil's needs and may change at any time. 218 */ 219 int 220 ufs_fiosatime( 221 struct vnode *vp, /* file's vnode */ 222 struct timeval *tvu, /* struct timeval in userland */ 223 int flag, /* flag from VOP_IOCTL() */ 224 struct cred *cr) /* credentials from ufs_ioctl */ 225 { 226 struct inode *ip; /* inode for vp */ 227 struct timeval32 tv; /* copy of user's timeval */ 228 int now = 0; 229 230 /* 231 * must have sufficient privileges 232 */ 233 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0) 234 return (EPERM); 235 236 /* 237 * get user's copy of timeval struct and check values 238 * if input is NULL, will set time to now 239 */ 240 if (tvu == NULL) { 241 now = 1; 242 } else { 243 if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) { 244 if (copyin(tvu, &tv, sizeof (tv))) 245 return (EFAULT); 246 } else { 247 struct timeval tv64; 248 249 if (copyin(tvu, &tv64, sizeof (tv64))) 250 return (EFAULT); 251 if (TIMEVAL_OVERFLOW(&tv64)) 252 return (EOVERFLOW); 253 TIMEVAL_TO_TIMEVAL32(&tv, &tv64); 254 } 255 256 if (tv.tv_usec < 0 || tv.tv_usec >= 1000000) 257 return (EINVAL); 258 } 259 260 /* 261 * update access time 262 */ 263 ip = VTOI(vp); 264 rw_enter(&ip->i_contents, RW_WRITER); 265 ITIMES_NOLOCK(ip); 266 if (now) { 267 mutex_enter(&ufs_iuniqtime_lock); 268 ip->i_atime = iuniqtime; 269 mutex_exit(&ufs_iuniqtime_lock); 270 } else { 271 ip->i_atime = tv; 272 } 273 ip->i_flag |= IMODACC; 274 rw_exit(&ip->i_contents); 275 276 return (0); 277 } 278 279 /* 280 * ufs_fiogdio 281 * Get delayed-io state. This ioctl is tailored 282 * to metamucil's needs and may change at any time. 283 */ 284 /* ARGSUSED */ 285 int 286 ufs_fiogdio( 287 struct vnode *vp, /* file's vnode */ 288 uint_t *diop, /* dio state returned here */ 289 int flag, /* flag from ufs_ioctl */ 290 struct cred *cr) /* credentials from ufs_ioctl */ 291 { 292 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 293 294 /* 295 * forcibly unmounted 296 */ 297 if (ufsvfsp == NULL) 298 return (EIO); 299 300 if (suword32(diop, ufsvfsp->vfs_dio)) 301 return (EFAULT); 302 return (0); 303 } 304 305 /* 306 * ufs_fiosdio 307 * Set delayed-io state. This ioctl is tailored 308 * to metamucil's needs and may change at any time. 309 */ 310 int 311 ufs_fiosdio( 312 struct vnode *vp, /* file's vnode */ 313 uint_t *diop, /* dio flag */ 314 int flag, /* flag from ufs_ioctl */ 315 struct cred *cr) /* credentials from ufs_ioctl */ 316 { 317 uint_t dio; /* copy of user's dio */ 318 struct inode *ip; /* inode for vp */ 319 struct ufsvfs *ufsvfsp; 320 struct fs *fs; 321 struct ulockfs *ulp; 322 int error = 0; 323 324 #ifdef lint 325 flag = flag; 326 #endif 327 328 /* check input conditions */ 329 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0) 330 return (EPERM); 331 332 if (copyin(diop, &dio, sizeof (dio))) 333 return (EFAULT); 334 335 if (dio > 1) 336 return (EINVAL); 337 338 /* file system has been forcibly unmounted */ 339 if (VTOI(vp)->i_ufsvfs == NULL) 340 return (EIO); 341 342 ip = VTOI(vp); 343 ufsvfsp = ip->i_ufsvfs; 344 ulp = &ufsvfsp->vfs_ulockfs; 345 346 /* logging file system; dio ignored */ 347 if (TRANS_ISTRANS(ufsvfsp)) 348 return (error); 349 350 /* hold the mutex to prevent race with a lockfs request */ 351 vfs_lock_wait(vp->v_vfsp); 352 mutex_enter(&ulp->ul_lock); 353 atomic_add_long(&ufs_quiesce_pend, 1); 354 355 if (ULOCKFS_IS_HLOCK(ulp)) { 356 error = EIO; 357 goto out; 358 } 359 360 if (ULOCKFS_IS_ELOCK(ulp)) { 361 error = EBUSY; 362 goto out; 363 } 364 /* wait for outstanding accesses to finish */ 365 if (error = ufs_quiesce(ulp)) 366 goto out; 367 368 /* flush w/invalidate */ 369 if (error = ufs_flush(vp->v_vfsp)) 370 goto out; 371 372 /* 373 * update dio 374 */ 375 mutex_enter(&ufsvfsp->vfs_lock); 376 ufsvfsp->vfs_dio = dio; 377 378 /* 379 * enable/disable clean flag processing 380 */ 381 fs = ip->i_fs; 382 if (fs->fs_ronly == 0 && 383 fs->fs_clean != FSBAD && 384 fs->fs_clean != FSLOG) { 385 if (dio) 386 fs->fs_clean = FSSUSPEND; 387 else 388 fs->fs_clean = FSACTIVE; 389 ufs_sbwrite(ufsvfsp); 390 mutex_exit(&ufsvfsp->vfs_lock); 391 } else 392 mutex_exit(&ufsvfsp->vfs_lock); 393 out: 394 /* 395 * we need this broadcast because of the ufs_quiesce call above 396 */ 397 atomic_add_long(&ufs_quiesce_pend, -1); 398 cv_broadcast(&ulp->ul_cv); 399 mutex_exit(&ulp->ul_lock); 400 vfs_unlock(vp->v_vfsp); 401 return (error); 402 } 403 404 /* 405 * ufs_fioffs - ioctl handler for flushing file system 406 */ 407 /* ARGSUSED */ 408 int 409 ufs_fioffs( 410 struct vnode *vp, 411 char *vap, /* must be NULL - reserved */ 412 struct cred *cr) /* credentials from ufs_ioctl */ 413 { 414 int error; 415 struct ufsvfs *ufsvfsp; 416 struct ulockfs *ulp; 417 418 /* file system has been forcibly unmounted */ 419 ufsvfsp = VTOI(vp)->i_ufsvfs; 420 if (ufsvfsp == NULL) 421 return (EIO); 422 423 ulp = &ufsvfsp->vfs_ulockfs; 424 425 /* 426 * suspend the delete thread 427 * this must be done outside the lockfs locking protocol 428 */ 429 ufs_thread_suspend(&ufsvfsp->vfs_delete); 430 431 vfs_lock_wait(vp->v_vfsp); 432 /* hold the mutex to prevent race with a lockfs request */ 433 mutex_enter(&ulp->ul_lock); 434 atomic_add_long(&ufs_quiesce_pend, 1); 435 436 if (ULOCKFS_IS_HLOCK(ulp)) { 437 error = EIO; 438 goto out; 439 } 440 if (ULOCKFS_IS_ELOCK(ulp)) { 441 error = EBUSY; 442 goto out; 443 } 444 /* wait for outstanding accesses to finish */ 445 if (error = ufs_quiesce(ulp)) 446 goto out; 447 448 /* 449 * If logging, and the logmap was marked as not rollable, 450 * make it rollable now, and start the trans_roll thread and 451 * the reclaim thread. The log at this point is safe to write to. 452 */ 453 if (ufsvfsp->vfs_log) { 454 ml_unit_t *ul = ufsvfsp->vfs_log; 455 struct fs *fsp = ufsvfsp->vfs_fs; 456 int err; 457 458 if (ul->un_flags & LDL_NOROLL) { 459 ul->un_flags &= ~LDL_NOROLL; 460 logmap_start_roll(ul); 461 if (!fsp->fs_ronly && (fsp->fs_reclaim & 462 (FS_RECLAIM|FS_RECLAIMING))) { 463 fsp->fs_reclaim &= ~FS_RECLAIM; 464 fsp->fs_reclaim |= FS_RECLAIMING; 465 ufs_thread_start(&ufsvfsp->vfs_reclaim, 466 ufs_thread_reclaim, 467 vp->v_vfsp); 468 if (!fsp->fs_ronly) { 469 TRANS_SBWRITE(ufsvfsp, 470 TOP_SBUPDATE_UPDATE); 471 if (err = 472 geterror(ufsvfsp->vfs_bufp)) { 473 refstr_t *mntpt; 474 mntpt = vfs_getmntpoint( 475 vp->v_vfsp); 476 cmn_err(CE_NOTE, 477 "Filesystem Flush " 478 "Failed to update " 479 "Reclaim Status for " 480 " %s, Write failed to " 481 "update superblock, " 482 "error %d", 483 refstr_value(mntpt), 484 err); 485 refstr_rele(mntpt); 486 } 487 } 488 } 489 } 490 } 491 492 /* synchronously flush dirty data and metadata */ 493 error = ufs_flush(vp->v_vfsp); 494 495 out: 496 atomic_add_long(&ufs_quiesce_pend, -1); 497 cv_broadcast(&ulp->ul_cv); 498 mutex_exit(&ulp->ul_lock); 499 vfs_unlock(vp->v_vfsp); 500 501 /* 502 * allow the delete thread to continue 503 */ 504 ufs_thread_continue(&ufsvfsp->vfs_delete); 505 return (error); 506 } 507 508 /* 509 * ufs_fioisbusy 510 * Get number of references on this vnode. 511 * Contract-private interface for Legato's NetWorker product. 512 */ 513 /* ARGSUSED */ 514 int 515 ufs_fioisbusy(struct vnode *vp, int *isbusy, struct cred *cr) 516 { 517 int is_it_busy; 518 519 /* 520 * The caller holds one reference, there may be one in the dnlc 521 * so we need to flush it. 522 */ 523 if (vp->v_count > 1) 524 dnlc_purge_vp(vp); 525 /* 526 * Since we've just flushed the dnlc and we hold a reference 527 * to this vnode, then anything but 1 means busy (this had 528 * BETTER not be zero!). Also, it's possible for someone to 529 * have this file mmap'ed with no additional reference count. 530 */ 531 ASSERT(vp->v_count > 0); 532 if ((vp->v_count == 1) && (VTOI(vp)->i_mapcnt == 0)) 533 is_it_busy = 0; 534 else 535 is_it_busy = 1; 536 537 if (suword32(isbusy, is_it_busy)) 538 return (EFAULT); 539 return (0); 540 } 541 542 /* ARGSUSED */ 543 int 544 ufs_fiodirectio(struct vnode *vp, int cmd, struct cred *cr) 545 { 546 int error = 0; 547 struct inode *ip = VTOI(vp); 548 549 /* 550 * Acquire reader lock and set/reset direct mode 551 */ 552 rw_enter(&ip->i_contents, RW_READER); 553 mutex_enter(&ip->i_tlock); 554 if (cmd == DIRECTIO_ON) 555 ip->i_flag |= IDIRECTIO; /* enable direct mode */ 556 else if (cmd == DIRECTIO_OFF) 557 ip->i_flag &= ~IDIRECTIO; /* disable direct mode */ 558 else 559 error = EINVAL; 560 mutex_exit(&ip->i_tlock); 561 rw_exit(&ip->i_contents); 562 return (error); 563 } 564 565 /* 566 * ufs_fiotune 567 * Allow some tunables to be set on a mounted fs 568 */ 569 int 570 ufs_fiotune(struct vnode *vp, struct fiotune *uftp, struct cred *cr) 571 { 572 struct fiotune ftp; 573 struct fs *fs; 574 struct ufsvfs *ufsvfsp; 575 576 /* 577 * must have sufficient privileges 578 */ 579 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0) 580 return (EPERM); 581 582 /* 583 * get user's copy 584 */ 585 if (copyin(uftp, &ftp, sizeof (ftp))) 586 return (EFAULT); 587 588 /* 589 * some minimal sanity checks 590 */ 591 if ((ftp.maxcontig <= 0) || 592 (ftp.rotdelay != 0) || 593 (ftp.maxbpg <= 0) || 594 (ftp.minfree < 0) || 595 (ftp.minfree > 99) || 596 ((ftp.optim != FS_OPTTIME) && (ftp.optim != FS_OPTSPACE))) 597 return (EINVAL); 598 599 /* 600 * update superblock but don't write it! If it gets out, fine. 601 */ 602 fs = VTOI(vp)->i_fs; 603 604 fs->fs_maxcontig = ftp.maxcontig; 605 fs->fs_rotdelay = ftp.rotdelay; 606 fs->fs_maxbpg = ftp.maxbpg; 607 fs->fs_minfree = ftp.minfree; 608 fs->fs_optim = ftp.optim; 609 610 /* 611 * Adjust cluster based on the new maxcontig. The cluster size 612 * can be any positive value. The check for this is done above. 613 */ 614 ufsvfsp = VTOI(vp)->i_ufsvfs; 615 ufsvfsp->vfs_ioclustsz = fs->fs_bsize * fs->fs_maxcontig; 616 617 /* 618 * Adjust minfrags from minfree 619 */ 620 ufsvfsp->vfs_minfrags = (int)((int64_t)fs->fs_dsize * 621 fs->fs_minfree / 100); 622 623 /* 624 * Write the superblock 625 */ 626 if (fs->fs_ronly == 0) { 627 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE, 628 TOP_SBWRITE_SIZE); 629 TRANS_SBWRITE(ufsvfsp, TOP_SBUPDATE_UPDATE); 630 TRANS_END_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE, TOP_SBWRITE_SIZE); 631 } 632 633 return (0); 634 } 635 636 /* 637 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 638 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 639 */ 640 int 641 ufs_fio_holey(vnode_t *vp, int cmd, offset_t *off) 642 { 643 inode_t *ip = VTOI(vp); 644 u_offset_t noff = (u_offset_t)*off; /* new offset */ 645 u_offset_t isz; 646 int error; 647 boolean_t hole; 648 649 rw_enter(&ip->i_contents, RW_READER); 650 isz = ip->i_size; 651 if (noff >= isz) { 652 rw_exit(&ip->i_contents); 653 return (ENXIO); 654 } 655 656 /* 657 * Check for the usual case where a file has no holes. 658 * If so we can optimise to set the end of the file as the first 659 * (virtual) hole. This avoids bmap_find() searching through 660 * every block in the file for a (non-existent) hole. 661 */ 662 if (!bmap_has_holes(ip)) { 663 rw_exit(&ip->i_contents); 664 if (cmd == _FIO_SEEK_HOLE) { 665 *off = isz; 666 return (0); 667 } 668 /* *off must already point to valid data (non hole) */ 669 return (0); 670 } 671 672 /* 673 * Calling bmap_read() one block at a time on a 1TB file takes forever, 674 * so we use a special function to search for holes or blocks. 675 */ 676 if (cmd == _FIO_SEEK_HOLE) 677 hole = B_TRUE; 678 else 679 hole = B_FALSE; 680 error = bmap_find(ip, hole, &noff); 681 rw_exit(&ip->i_contents); 682 683 /* end of file? */ 684 if (error == ENXIO) { 685 /* 686 * Handle the virtual hole at the end of file. 687 */ 688 if (cmd == _FIO_SEEK_HOLE) { 689 *off = isz; 690 return (0); 691 } 692 return (ENXIO); 693 } 694 if (noff < *off) 695 return (error); 696 *off = noff; 697 return (error); 698 } 699