1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/t_lock.h> 30 #include <sys/param.h> 31 #include <sys/time.h> 32 #include <sys/systm.h> 33 #include <sys/sysmacros.h> 34 #include <sys/resource.h> 35 #include <sys/signal.h> 36 #include <sys/cred.h> 37 #include <sys/user.h> 38 #include <sys/buf.h> 39 #include <sys/vfs.h> 40 #include <sys/vnode.h> 41 #include <sys/proc.h> 42 #include <sys/disp.h> 43 #include <sys/file.h> 44 #include <sys/fcntl.h> 45 #include <sys/flock.h> 46 #include <sys/atomic.h> 47 #include <sys/kmem.h> 48 #include <sys/uio.h> 49 #include <sys/conf.h> 50 #include <sys/mman.h> 51 #include <sys/pathname.h> 52 #include <sys/debug.h> 53 #include <sys/vmmeter.h> 54 #include <sys/vmsystm.h> 55 #include <sys/cmn_err.h> 56 #include <sys/filio.h> 57 #include <sys/dnlc.h> 58 59 #include <sys/fs/ufs_filio.h> 60 #include <sys/fs/ufs_lockfs.h> 61 #include <sys/fs/ufs_fs.h> 62 #include <sys/fs/ufs_inode.h> 63 #include <sys/fs/ufs_fsdir.h> 64 #include <sys/fs/ufs_quota.h> 65 #include <sys/fs/ufs_trans.h> 66 #include <sys/fs/ufs_log.h> 67 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */ 68 #include <sys/errno.h> 69 #include <sys/sysinfo.h> 70 71 #include <vm/hat.h> 72 #include <vm/page.h> 73 #include <vm/pvn.h> 74 #include <vm/as.h> 75 #include <vm/seg.h> 76 #include <vm/seg_map.h> 77 #include <vm/seg_vn.h> 78 #include <vm/rm.h> 79 #include <sys/swap.h> 80 #include <sys/model.h> 81 #include <sys/policy.h> 82 83 #include "fs/fs_subr.h" 84 85 /* 86 * ufs_fioio is the ufs equivalent of NFS_CNVT and is tailored to 87 * metamucil's needs. It may change at any time. 88 */ 89 /* ARGSUSED */ 90 int 91 ufs_fioio( 92 struct vnode *vp, /* any file on the fs */ 93 struct fioio *fiou, /* fioio struct in userland */ 94 int flag, /* flag from VOP_IOCTL() */ 95 struct cred *cr) /* credentials from ufs_ioctl */ 96 { 97 int error = 0; 98 struct vnode *vpio = NULL; /* vnode for inode open */ 99 struct inode *ipio = NULL; /* inode for inode open */ 100 struct file *fpio = NULL; /* file for inode open */ 101 struct inode *ip; /* inode for file system */ 102 struct fs *fs; /* fs for file system */ 103 STRUCT_DECL(fioio, fio); /* copy of user's fioio struct */ 104 105 /* 106 * must be privileged 107 */ 108 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0) 109 return (EPERM); 110 111 STRUCT_INIT(fio, flag & DATAMODEL_MASK); 112 113 /* 114 * get user's copy of fioio struct 115 */ 116 if (copyin(fiou, STRUCT_BUF(fio), STRUCT_SIZE(fio))) 117 return (EFAULT); 118 119 ip = VTOI(vp); 120 fs = ip->i_fs; 121 122 /* 123 * check the inode number against the fs's inode number bounds 124 */ 125 if (STRUCT_FGET(fio, fio_ino) < UFSROOTINO) 126 return (ESRCH); 127 if (STRUCT_FGET(fio, fio_ino) >= fs->fs_ncg * fs->fs_ipg) 128 return (ESRCH); 129 130 rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER); 131 132 /* 133 * get the inode 134 */ 135 error = ufs_iget(ip->i_vfs, STRUCT_FGET(fio, fio_ino), &ipio, cr); 136 137 rw_exit(&ip->i_ufsvfs->vfs_dqrwlock); 138 139 if (error) 140 return (error); 141 142 /* 143 * check the generation number 144 */ 145 rw_enter(&ipio->i_contents, RW_READER); 146 if (ipio->i_gen != STRUCT_FGET(fio, fio_gen)) { 147 error = ESTALE; 148 rw_exit(&ipio->i_contents); 149 goto errout; 150 } 151 152 /* 153 * check if the inode is free 154 */ 155 if (ipio->i_mode == 0) { 156 error = ENOENT; 157 rw_exit(&ipio->i_contents); 158 goto errout; 159 } 160 rw_exit(&ipio->i_contents); 161 162 /* 163 * Adapted from copen: get a file struct 164 * Large Files: We open this file descriptor with FOFFMAX flag 165 * set so that it will be like a large file open. 166 */ 167 if (falloc(NULL, (FREAD|FOFFMAX), &fpio, STRUCT_FADDR(fio, fio_fd))) 168 goto errout; 169 170 /* 171 * Adapted from vn_open: check access and then open the file 172 */ 173 vpio = ITOV(ipio); 174 if (error = VOP_ACCESS(vpio, VREAD, 0, cr, NULL)) 175 goto errout; 176 177 if (error = VOP_OPEN(&vpio, FREAD, cr, NULL)) 178 goto errout; 179 180 /* 181 * Adapted from copen: initialize the file struct 182 */ 183 fpio->f_vnode = vpio; 184 185 /* 186 * return the fd 187 */ 188 if (copyout(STRUCT_BUF(fio), fiou, STRUCT_SIZE(fio))) { 189 error = EFAULT; 190 goto errout; 191 } 192 setf(STRUCT_FGET(fio, fio_fd), fpio); 193 mutex_exit(&fpio->f_tlock); 194 return (0); 195 errout: 196 /* 197 * free the file struct and fd 198 */ 199 if (fpio) { 200 setf(STRUCT_FGET(fio, fio_fd), NULL); 201 unfalloc(fpio); 202 } 203 204 /* 205 * release the hold on the inode 206 */ 207 if (ipio) 208 VN_RELE(ITOV(ipio)); 209 return (error); 210 } 211 212 /* 213 * ufs_fiosatime 214 * set access time w/o altering change time. This ioctl is tailored 215 * to metamucil's needs and may change at any time. 216 */ 217 int 218 ufs_fiosatime( 219 struct vnode *vp, /* file's vnode */ 220 struct timeval *tvu, /* struct timeval in userland */ 221 int flag, /* flag from VOP_IOCTL() */ 222 struct cred *cr) /* credentials from ufs_ioctl */ 223 { 224 struct inode *ip; /* inode for vp */ 225 struct timeval32 tv; /* copy of user's timeval */ 226 int now = 0; 227 228 /* 229 * must have sufficient privileges 230 */ 231 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0) 232 return (EPERM); 233 234 /* 235 * get user's copy of timeval struct and check values 236 * if input is NULL, will set time to now 237 */ 238 if (tvu == NULL) { 239 now = 1; 240 } else { 241 if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) { 242 if (copyin(tvu, &tv, sizeof (tv))) 243 return (EFAULT); 244 } else { 245 struct timeval tv64; 246 247 if (copyin(tvu, &tv64, sizeof (tv64))) 248 return (EFAULT); 249 if (TIMEVAL_OVERFLOW(&tv64)) 250 return (EOVERFLOW); 251 TIMEVAL_TO_TIMEVAL32(&tv, &tv64); 252 } 253 254 if (tv.tv_usec < 0 || tv.tv_usec >= 1000000) 255 return (EINVAL); 256 } 257 258 /* 259 * update access time 260 */ 261 ip = VTOI(vp); 262 rw_enter(&ip->i_contents, RW_WRITER); 263 ITIMES_NOLOCK(ip); 264 if (now) { 265 mutex_enter(&ufs_iuniqtime_lock); 266 ip->i_atime = iuniqtime; 267 mutex_exit(&ufs_iuniqtime_lock); 268 } else { 269 ip->i_atime = tv; 270 } 271 ip->i_flag |= IMODACC; 272 rw_exit(&ip->i_contents); 273 274 return (0); 275 } 276 277 /* 278 * ufs_fiogdio 279 * Get delayed-io state. This ioctl is tailored 280 * to metamucil's needs and may change at any time. 281 */ 282 /* ARGSUSED */ 283 int 284 ufs_fiogdio( 285 struct vnode *vp, /* file's vnode */ 286 uint_t *diop, /* dio state returned here */ 287 int flag, /* flag from ufs_ioctl */ 288 struct cred *cr) /* credentials from ufs_ioctl */ 289 { 290 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 291 292 /* 293 * forcibly unmounted 294 */ 295 if (ufsvfsp == NULL) 296 return (EIO); 297 298 if (suword32(diop, ufsvfsp->vfs_dio)) 299 return (EFAULT); 300 return (0); 301 } 302 303 /* 304 * ufs_fiosdio 305 * Set delayed-io state. This ioctl is tailored 306 * to metamucil's needs and may change at any time. 307 */ 308 int 309 ufs_fiosdio( 310 struct vnode *vp, /* file's vnode */ 311 uint_t *diop, /* dio flag */ 312 int flag, /* flag from ufs_ioctl */ 313 struct cred *cr) /* credentials from ufs_ioctl */ 314 { 315 uint_t dio; /* copy of user's dio */ 316 struct inode *ip; /* inode for vp */ 317 struct ufsvfs *ufsvfsp; 318 struct fs *fs; 319 struct ulockfs *ulp; 320 int error = 0; 321 322 #ifdef lint 323 flag = flag; 324 #endif 325 326 /* check input conditions */ 327 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0) 328 return (EPERM); 329 330 if (copyin(diop, &dio, sizeof (dio))) 331 return (EFAULT); 332 333 if (dio > 1) 334 return (EINVAL); 335 336 /* file system has been forcibly unmounted */ 337 if (VTOI(vp)->i_ufsvfs == NULL) 338 return (EIO); 339 340 ip = VTOI(vp); 341 ufsvfsp = ip->i_ufsvfs; 342 ulp = &ufsvfsp->vfs_ulockfs; 343 344 /* logging file system; dio ignored */ 345 if (TRANS_ISTRANS(ufsvfsp)) 346 return (error); 347 348 /* hold the mutex to prevent race with a lockfs request */ 349 vfs_lock_wait(vp->v_vfsp); 350 mutex_enter(&ulp->ul_lock); 351 atomic_add_long(&ufs_quiesce_pend, 1); 352 353 if (ULOCKFS_IS_HLOCK(ulp)) { 354 error = EIO; 355 goto out; 356 } 357 358 if (ULOCKFS_IS_ELOCK(ulp)) { 359 error = EBUSY; 360 goto out; 361 } 362 /* wait for outstanding accesses to finish */ 363 if (error = ufs_quiesce(ulp)) 364 goto out; 365 366 /* flush w/invalidate */ 367 if (error = ufs_flush(vp->v_vfsp)) 368 goto out; 369 370 /* 371 * update dio 372 */ 373 mutex_enter(&ufsvfsp->vfs_lock); 374 ufsvfsp->vfs_dio = dio; 375 376 /* 377 * enable/disable clean flag processing 378 */ 379 fs = ip->i_fs; 380 if (fs->fs_ronly == 0 && 381 fs->fs_clean != FSBAD && 382 fs->fs_clean != FSLOG) { 383 if (dio) 384 fs->fs_clean = FSSUSPEND; 385 else 386 fs->fs_clean = FSACTIVE; 387 ufs_sbwrite(ufsvfsp); 388 mutex_exit(&ufsvfsp->vfs_lock); 389 } else 390 mutex_exit(&ufsvfsp->vfs_lock); 391 out: 392 /* 393 * we need this broadcast because of the ufs_quiesce call above 394 */ 395 atomic_add_long(&ufs_quiesce_pend, -1); 396 cv_broadcast(&ulp->ul_cv); 397 mutex_exit(&ulp->ul_lock); 398 vfs_unlock(vp->v_vfsp); 399 return (error); 400 } 401 402 /* 403 * ufs_fioffs - ioctl handler for flushing file system 404 */ 405 /* ARGSUSED */ 406 int 407 ufs_fioffs( 408 struct vnode *vp, 409 char *vap, /* must be NULL - reserved */ 410 struct cred *cr) /* credentials from ufs_ioctl */ 411 { 412 int error; 413 struct ufsvfs *ufsvfsp; 414 struct ulockfs *ulp; 415 416 /* file system has been forcibly unmounted */ 417 ufsvfsp = VTOI(vp)->i_ufsvfs; 418 if (ufsvfsp == NULL) 419 return (EIO); 420 421 ulp = &ufsvfsp->vfs_ulockfs; 422 423 /* 424 * suspend the delete thread 425 * this must be done outside the lockfs locking protocol 426 */ 427 ufs_thread_suspend(&ufsvfsp->vfs_delete); 428 429 vfs_lock_wait(vp->v_vfsp); 430 /* hold the mutex to prevent race with a lockfs request */ 431 mutex_enter(&ulp->ul_lock); 432 atomic_add_long(&ufs_quiesce_pend, 1); 433 434 if (ULOCKFS_IS_HLOCK(ulp)) { 435 error = EIO; 436 goto out; 437 } 438 if (ULOCKFS_IS_ELOCK(ulp)) { 439 error = EBUSY; 440 goto out; 441 } 442 /* wait for outstanding accesses to finish */ 443 if (error = ufs_quiesce(ulp)) 444 goto out; 445 446 /* 447 * If logging, and the logmap was marked as not rollable, 448 * make it rollable now, and start the trans_roll thread and 449 * the reclaim thread. The log at this point is safe to write to. 450 */ 451 if (ufsvfsp->vfs_log) { 452 ml_unit_t *ul = ufsvfsp->vfs_log; 453 struct fs *fsp = ufsvfsp->vfs_fs; 454 int err; 455 456 if (ul->un_flags & LDL_NOROLL) { 457 ul->un_flags &= ~LDL_NOROLL; 458 logmap_start_roll(ul); 459 if (!fsp->fs_ronly && (fsp->fs_reclaim & 460 (FS_RECLAIM|FS_RECLAIMING))) { 461 fsp->fs_reclaim &= ~FS_RECLAIM; 462 fsp->fs_reclaim |= FS_RECLAIMING; 463 ufs_thread_start(&ufsvfsp->vfs_reclaim, 464 ufs_thread_reclaim, vp->v_vfsp); 465 if (!fsp->fs_ronly) { 466 TRANS_SBWRITE(ufsvfsp, 467 TOP_SBUPDATE_UPDATE); 468 if (err = 469 geterror(ufsvfsp->vfs_bufp)) { 470 refstr_t *mntpt; 471 mntpt = vfs_getmntpoint( 472 vp->v_vfsp); 473 cmn_err(CE_NOTE, 474 "Filesystem Flush " 475 "Failed to update " 476 "Reclaim Status for " 477 " %s, Write failed to " 478 "update superblock, " 479 "error %d", 480 refstr_value(mntpt), 481 err); 482 refstr_rele(mntpt); 483 } 484 } 485 } 486 } 487 } 488 489 /* synchronously flush dirty data and metadata */ 490 error = ufs_flush(vp->v_vfsp); 491 492 out: 493 atomic_add_long(&ufs_quiesce_pend, -1); 494 cv_broadcast(&ulp->ul_cv); 495 mutex_exit(&ulp->ul_lock); 496 vfs_unlock(vp->v_vfsp); 497 498 /* 499 * allow the delete thread to continue 500 */ 501 ufs_thread_continue(&ufsvfsp->vfs_delete); 502 return (error); 503 } 504 505 /* 506 * ufs_fioisbusy 507 * Get number of references on this vnode. 508 * Contract-private interface for Legato's NetWorker product. 509 */ 510 /* ARGSUSED */ 511 int 512 ufs_fioisbusy(struct vnode *vp, int *isbusy, struct cred *cr) 513 { 514 int is_it_busy; 515 516 /* 517 * The caller holds one reference, there may be one in the dnlc 518 * so we need to flush it. 519 */ 520 if (vp->v_count > 1) 521 dnlc_purge_vp(vp); 522 /* 523 * Since we've just flushed the dnlc and we hold a reference 524 * to this vnode, then anything but 1 means busy (this had 525 * BETTER not be zero!). Also, it's possible for someone to 526 * have this file mmap'ed with no additional reference count. 527 */ 528 ASSERT(vp->v_count > 0); 529 if ((vp->v_count == 1) && (VTOI(vp)->i_mapcnt == 0)) 530 is_it_busy = 0; 531 else 532 is_it_busy = 1; 533 534 if (suword32(isbusy, is_it_busy)) 535 return (EFAULT); 536 return (0); 537 } 538 539 /* ARGSUSED */ 540 int 541 ufs_fiodirectio(struct vnode *vp, int cmd, struct cred *cr) 542 { 543 int error = 0; 544 struct inode *ip = VTOI(vp); 545 546 /* 547 * Acquire reader lock and set/reset direct mode 548 */ 549 rw_enter(&ip->i_contents, RW_READER); 550 mutex_enter(&ip->i_tlock); 551 if (cmd == DIRECTIO_ON) 552 ip->i_flag |= IDIRECTIO; /* enable direct mode */ 553 else if (cmd == DIRECTIO_OFF) 554 ip->i_flag &= ~IDIRECTIO; /* disable direct mode */ 555 else 556 error = EINVAL; 557 mutex_exit(&ip->i_tlock); 558 rw_exit(&ip->i_contents); 559 return (error); 560 } 561 562 /* 563 * ufs_fiotune 564 * Allow some tunables to be set on a mounted fs 565 */ 566 int 567 ufs_fiotune(struct vnode *vp, struct fiotune *uftp, struct cred *cr) 568 { 569 struct fiotune ftp; 570 struct fs *fs; 571 struct ufsvfs *ufsvfsp; 572 573 /* 574 * must have sufficient privileges 575 */ 576 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0) 577 return (EPERM); 578 579 /* 580 * get user's copy 581 */ 582 if (copyin(uftp, &ftp, sizeof (ftp))) 583 return (EFAULT); 584 585 /* 586 * some minimal sanity checks 587 */ 588 if ((ftp.maxcontig <= 0) || 589 (ftp.rotdelay != 0) || 590 (ftp.maxbpg <= 0) || 591 (ftp.minfree < 0) || 592 (ftp.minfree > 99) || 593 ((ftp.optim != FS_OPTTIME) && (ftp.optim != FS_OPTSPACE))) 594 return (EINVAL); 595 596 /* 597 * update superblock but don't write it! If it gets out, fine. 598 */ 599 fs = VTOI(vp)->i_fs; 600 601 fs->fs_maxcontig = ftp.maxcontig; 602 fs->fs_rotdelay = ftp.rotdelay; 603 fs->fs_maxbpg = ftp.maxbpg; 604 fs->fs_minfree = ftp.minfree; 605 fs->fs_optim = ftp.optim; 606 607 /* 608 * Adjust cluster based on the new maxcontig. The cluster size 609 * can be any positive value. The check for this is done above. 610 */ 611 ufsvfsp = VTOI(vp)->i_ufsvfs; 612 ufsvfsp->vfs_ioclustsz = fs->fs_bsize * fs->fs_maxcontig; 613 614 /* 615 * Adjust minfrags from minfree 616 */ 617 ufsvfsp->vfs_minfrags = (int)((int64_t)fs->fs_dsize * 618 fs->fs_minfree / 100); 619 620 /* 621 * Write the superblock 622 */ 623 if (fs->fs_ronly == 0) { 624 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE, 625 TOP_SBWRITE_SIZE); 626 TRANS_SBWRITE(ufsvfsp, TOP_SBUPDATE_UPDATE); 627 TRANS_END_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE, TOP_SBWRITE_SIZE); 628 } 629 630 return (0); 631 } 632 633 /* 634 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 635 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 636 */ 637 int 638 ufs_fio_holey(vnode_t *vp, int cmd, offset_t *off) 639 { 640 inode_t *ip = VTOI(vp); 641 u_offset_t noff = (u_offset_t)*off; /* new offset */ 642 u_offset_t isz; 643 int error; 644 boolean_t hole; 645 646 rw_enter(&ip->i_contents, RW_READER); 647 isz = ip->i_size; 648 if (noff >= isz) { 649 rw_exit(&ip->i_contents); 650 return (ENXIO); 651 } 652 653 /* 654 * Check for the usual case where a file has no holes. 655 * If so we can optimise to set the end of the file as the first 656 * (virtual) hole. This avoids bmap_find() searching through 657 * every block in the file for a (non-existent) hole. 658 */ 659 if (!bmap_has_holes(ip)) { 660 rw_exit(&ip->i_contents); 661 if (cmd == _FIO_SEEK_HOLE) { 662 *off = isz; 663 return (0); 664 } 665 /* *off must already point to valid data (non hole) */ 666 return (0); 667 } 668 669 /* 670 * Calling bmap_read() one block at a time on a 1TB file takes forever, 671 * so we use a special function to search for holes or blocks. 672 */ 673 if (cmd == _FIO_SEEK_HOLE) 674 hole = B_TRUE; 675 else 676 hole = B_FALSE; 677 error = bmap_find(ip, hole, &noff); 678 rw_exit(&ip->i_contents); 679 680 /* end of file? */ 681 if (error == ENXIO) { 682 /* 683 * Handle the virtual hole at the end of file. 684 */ 685 if (cmd == _FIO_SEEK_HOLE) { 686 *off = isz; 687 return (0); 688 } 689 return (ENXIO); 690 } 691 if (noff < *off) 692 return (error); 693 *off = noff; 694 return (error); 695 } 696