/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/sx.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>

static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

struct fileops vnops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

int
vn_open(ndp, flagp, cmode)
	register struct nameidata *ndp;
	int *flagp, cmode;
{
	struct thread *td = ndp->ni_cnd.cn_thread;

	return (vn_open_cred(ndp, flagp, cmode, td->td_ucred));
}

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(ndp, flagp, cmode, cred)
	register struct nameidata *ndp;
	int *flagp, cmode;
	struct ucred *cred;
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int mode, fmode, error;
#ifdef LOOKUP_SHARED
	int exclusive;		/* The current intended lock state */

	exclusive = 0;
#endif

restart:
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		if (ndp->ni_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
#ifdef MAC
			error = mac_check_vnode_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0) {
#endif
				VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
				    &ndp->ni_cnd, vap);
#ifdef MAC
			}
#endif
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
			ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
#ifdef LOOKUP_SHARED
			exclusive = 1;
#endif
		} else {
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
#ifdef LOOKUP_SHARED
		ndp->ni_cnd.cn_flags =
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
		    LOCKSHARED | LOCKLEAF;
#else
		ndp->ni_cnd.cn_flags =
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
#endif
		if ((error = namei(ndp)) != 0)
			return (error);
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	mode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		mode |= VWRITE;
	}
	if (fmode & FREAD)
		mode |= VREAD;
	if (fmode & O_APPEND)
		mode |= VAPPEND;
#ifdef MAC
	error = mac_check_vnode_open(cred, vp, mode);
	if (error)
		goto bad;
#endif
	if ((fmode & O_CREAT) == 0) {
		if (mode & VWRITE) {
			error = vn_writechk(vp);
			if (error)
				goto bad;
		}
		if (mode) {
			error = VOP_ACCESS(vp, mode, cred, td);
			if (error)
				goto bad;
		}
	}
	if ((error = VOP_GETATTR(vp, vap, cred, td)) == 0) {
		vp->v_cachedfs = vap->va_fsid;
		vp->v_cachedid = vap->va_fileid;
	}
	if ((error = VOP_OPEN(vp, fmode, cred, td)) != 0)
		goto bad;
	/*
	 * Make sure that a VM object is created for VMIO support.
	 */
	if (vn_canvmio(vp) == TRUE) {
#ifdef LOOKUP_SHARED
		int flock;

		if (!exclusive && VOP_GETVOBJECT(vp, NULL) != 0)
			VOP_LOCK(vp, LK_UPGRADE, td);
		/*
		 * In cases where the object is marked as dead object_create
		 * will unlock and relock exclusive.  It is safe to call in
		 * here with a shared lock because we only examine fields that
		 * the shared lock guarantees will be stable.  In the UPGRADE
		 * case it is not likely that anyone has used this vnode yet
		 * so there will be no contention.  The logic after this call
		 * restores the requested locking state.
		 */
#endif
		if ((error = vfs_object_create(vp, td, cred)) != 0) {
			VOP_UNLOCK(vp, 0, td);
			VOP_CLOSE(vp, fmode, cred, td);
			NDFREE(ndp, NDF_ONLY_PNBUF);
			vrele(vp);
			*flagp = fmode;
			return (error);
		}
#ifdef LOOKUP_SHARED
		flock = VOP_ISLOCKED(vp, td);
		if (!exclusive && flock == LK_EXCLUSIVE)
			VOP_LOCK(vp, LK_DOWNGRADE, td);
#endif
	}

	if (fmode & FWRITE)
		vp->v_writecount++;
	*flagp = fmode;
	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}
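/*
 * Example usage (illustrative sketch, not taken from this file): a typical
 * in-kernel caller fills in a nameidata with NDINIT(), calls vn_open(), and
 * frees the pathname buffer itself, since vn_open() deliberately does not
 * free it on success.  The path, flags and error handling below are
 * assumptions made for illustration only:
 *
 *	struct nameidata nd;
 *	int flags, error;
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/a/path", td);
 *	flags = FREAD;
 *	if ((error = vn_open(&nd, &flags, 0)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	VOP_UNLOCK(nd.ni_vp, 0, td);
 *	... use nd.ni_vp ...
 *	error = vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
 */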
/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
	register struct vnode *vp;
{

	ASSERT_VOP_LOCKED(vp, "vn_writechk");
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_vflag & VV_TEXT)
		return (ETXTBSY);

	return (0);
}

/*
 * Vnode close call
 */
int
vn_close(vp, flags, file_cred, td)
	register struct vnode *vp;
	int flags;
	struct ucred *file_cred;
	struct thread *td;
{
	int error;

	if (flags & FWRITE)
		vp->v_writecount--;
	error = VOP_CLOSE(vp, flags, file_cred, td);
	/*
	 * XXX - In certain instances VOP_CLOSE has to do the vrele
	 * itself.  If the vrele has been done, it will return EAGAIN
	 * to indicate that the vrele should not be done again.  When
	 * this happens, we just return success.  The correct thing to
	 * do would be to have all VOP_CLOSE instances do the vrele.
	 */
	if (error == EAGAIN)
		return (0);
	vrele(vp);
	return (error);
}

/*
 * Sequential heuristic - detect sequential operation
 */
static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{

	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * XXX we assume that the filesystem block size is
		 * the default.  Not true, but still gives us a pretty
		 * good indicator of how sequential the read operations
		 * are.
		 */
		fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
		if (fp->f_seqcount > IO_SEQMAX)
			fp->f_seqcount = IO_SEQMAX;
		return (fp->f_seqcount << IO_SEQSHIFT);
	}

	/*
	 * Not sequential, quick draw-down of seqcount
	 */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
    aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	caddr_t base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	int *aresid;
	struct thread *td;
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		mp = NULL;
		if (rw == UIO_WRITE) {
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				return (error);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		} else {
			/*
			 * XXX This should be LK_SHARED but I don't trust VFS
			 * enough to leave it like that until it has been
			 * reviewed further.
			 */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		}

	}
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_check_vnode_read(active_cred, file_cred,
			    vp);
		else
			error = mac_check_vnode_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred)
			cred = file_cred;
		else
			cred = active_cred;
		if (rw == UIO_READ)
			error = VOP_READ(vp, &auio, ioflg, cred);
		else
			error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_WRITE)
			vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, td);
	}
	return (error);
}
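/*
 * Example usage (illustrative sketch, not taken from this file): vn_rdwr()
 * is the usual way for kernel code to read or write a file without building
 * a uio by hand.  The buffer and length below are assumptions for the
 * example; passing NOCRED as file_cred makes vn_rdwr() fall back to
 * active_cred, and resid reports how much of the request was not done:
 *
 *	int resid, error;
 *
 *	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, sizeof(buf), (off_t)0,
 *	    UIO_SYSSPACE, 0, td->td_ucred, NOCRED, &resid, td);
 *	if (error == 0 && resid != 0)
 *		... short read ...
 */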
/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	caddr_t base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	int *aresid;
	struct thread *td;
{
	int error = 0;

	do {
		int chunk = (len > MAXBSIZE) ? MAXBSIZE : len;

		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, aresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base += chunk;
		uio_yield();
	} while (len);
	if (aresid)
		*aresid += len;
	return (error);
}

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	int error, ioflag;

	mtx_lock(&Giant);
	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
	/*
	 * According to McKusick the vn lock is protecting f_offset here.
	 * Once this field has its own lock we can acquire it shared.
	 */
	if ((flags & FOF_OFFSET) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_NOPAUSE | LK_RETRY, td);
		uio->uio_offset = fp->f_offset;
	} else
		vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);

	ioflag |= sequential_heuristic(uio, fp);

#ifdef MAC
	error = mac_check_vnode_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, td);
	mtx_unlock(&Giant);
	return (error);
}
/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag;

	mtx_lock(&Giant);
	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
		mtx_unlock(&Giant);
		return (error);
	}
	VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
	error = mac_check_vnode_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
	mtx_unlock(&Giant);
	return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
	struct file *fp;
	struct stat *sb;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
	VOP_UNLOCK(vp, 0, td);

	return (error);
}

/*
 * Stat a vnode; implementation for the stat syscall
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
	struct vnode *vp;
	register struct stat *sb;
	struct ucred *active_cred;
	struct ucred *file_cred;
	struct thread *td;
{
	struct vattr vattr;
	register struct vattr *vap;
	int error;
	u_short mode;

#ifdef MAC
	error = mac_check_vnode_stat(active_cred, file_cred, vp);
	if (error)
		return (error);
#endif

	vap = &vattr;
	error = VOP_GETATTR(vp, vap, active_cred, td);
	if (error)
		return (error);

	vp->v_cachedfs = vap->va_fsid;
	vp->v_cachedid = vap->va_fileid;

	/*
	 * Zero the spare stat fields
	 */
	bzero(sb, sizeof *sb);

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		/* This is a cosmetic change, symlinks do not have a mode. */
		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
			sb->st_mode &= ~ACCESSPERMS;	/* 0000 */
		else
			sb->st_mode |= ACCESSPERMS;	/* 0777 */
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	if (vap->va_size > OFF_MAX)
		return (EOVERFLOW);
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;
	sb->st_birthtimespec = vap->va_birthtime;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 * object.  In some filesystem types, this may vary from file
	 * to file".
	 * Default to PAGE_SIZE after much discussion.
	 */

	if (vap->va_type == VREG) {
		sb->st_blksize = vap->va_blocksize;
	} else if (vn_isdisk(vp, NULL)) {
		sb->st_blksize = vp->v_rdev->si_bsize_best;
		if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
			sb->st_blksize = vp->v_rdev->si_bsize_phys;
		if (sb->st_blksize < BLKDEV_IOSIZE)
			sb->st_blksize = BLKDEV_IOSIZE;
	} else {
		sb->st_blksize = PAGE_SIZE;
	}

	sb->st_flags = vap->va_flags;
	if (suser(td))
		sb->st_gen = 0;
	else
		sb->st_gen = vap->va_gen;

#if (S_BLKSIZE == 512)
	/* Optimize this case */
	sb->st_blocks = vap->va_bytes >> 9;
#else
	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
#endif
	return (0);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
	struct file *fp;
	u_long com;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	struct vnode *vpold;
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			error = VOP_GETATTR(vp, &vattr, active_cred, td);
			VOP_UNLOCK(vp, 0, td);
			if (error)
				return (error);
			*(int *)data = vattr.va_size - fp->f_offset;
			return (0);
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return (0);			/* XXX */
		/* FALLTHROUGH */

	default:
#if 0
		return (ENOTTY);
#endif
	case VFIFO:
	case VCHR:
	case VBLK:
		if (com == FIODTYPE) {
			if (vp->v_type != VCHR && vp->v_type != VBLK)
				return (ENOTTY);
			*(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
			return (0);
		}
		error = VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td);
		if (error == ENOIOCTL) {
#ifdef DIAGNOSTIC
			Debugger("ENOIOCTL leaked through");
#endif
			error = ENOTTY;
		}
		if (error == 0 && com == TIOCSCTTY) {

			/* Do nothing if reassigning same control tty */
			sx_slock(&proctree_lock);
			if (td->td_proc->p_session->s_ttyvp == vp) {
				sx_sunlock(&proctree_lock);
				return (0);
			}

			vpold = td->td_proc->p_session->s_ttyvp;
			VREF(vp);
			SESS_LOCK(td->td_proc->p_session);
			td->td_proc->p_session->s_ttyvp = vp;
			SESS_UNLOCK(td->td_proc->p_session);

			sx_sunlock(&proctree_lock);

			/* Get rid of reference to old control tty */
			if (vpold)
				vrele(vpold);
		}
		return (error);
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp;
#ifdef MAC
	int error;
#endif

	vp = fp->f_vnode;
#ifdef MAC
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
	VOP_UNLOCK(vp, 0, td);
	if (error)
		return (error);
#endif

	return (VOP_POLL(vp, events, fp->f_cred, td));
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
#ifndef DEBUG_LOCKS
vn_lock(vp, flags, td)
#else
debug_vn_lock(vp, flags, td, filename, line)
#endif
	struct vnode *vp;
	int flags;
	struct thread *td;
#ifdef DEBUG_LOCKS
	const char *filename;
	int line;
#endif
{
	int error;

	do {
		if ((flags & LK_INTERLOCK) == 0)
			VI_LOCK(vp);
		if ((vp->v_iflag & VI_XLOCK) && vp->v_vxproc != curthread) {
			vp->v_iflag |= VI_XWANT;
			msleep(vp, VI_MTX(vp), PINOD, "vn_lock", 0);
			error = ENOENT;
			if ((flags & LK_RETRY) == 0) {
				VI_UNLOCK(vp);
				return (error);
			}
		}
#ifdef DEBUG_LOCKS
		vp->filename = filename;
		vp->line = line;
#endif
		/*
		 * lockmgr drops interlock before it will return for
		 * any reason.  So force the code above to relock it.
		 */
		error = VOP_LOCK(vp, flags | LK_NOPAUSE | LK_INTERLOCK, td);
		flags &= ~LK_INTERLOCK;
	} while (flags & LK_RETRY && error != 0);
	return (error);
}

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
	struct file *fp;
	struct thread *td;
{

	fp->f_ops = &badfileops;
	return (vn_close(fp->f_vnode, fp->f_flag, fp->f_cred, td));
}

/*
 * Prepare to start a filesystem write operation.  If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed.  If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	/*
	 * If a vnode is provided, get and return the mount point to
	 * which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);
	/*
	 * Check on status of suspension.
	 */
	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		if (flags & V_NOWAIT)
			return (EWOULDBLOCK);
		error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
		    "suspfs", 0);
		if (error)
			return (error);
	}
	if (flags & V_XSLEEP)
		return (0);
	mp->mnt_writeopcount++;
	return (0);
}
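/*
 * Example usage (illustrative sketch, not taken from this file): code that
 * is about to issue a write-side VOP brackets it with vn_start_write() and
 * vn_finished_write() so that filesystem suspension can drain writers.  The
 * particular VOP and its arguments below are assumptions for illustration:
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 *	error = VOP_SETATTR(vp, &vattr, cred, td);
 *	VOP_UNLOCK(vp, 0, td);
 *	vn_finished_write(mp);
 */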
/*
 * Secondary suspension.  Used by operations such as vop_inactive
 * routines that are needed by the higher level functions.  These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero).  At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_write_suspend_wait(vp, mp, flags)
	struct vnode *vp;
	struct mount *mp;
	int flags;
{
	int error;

	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
		return (0);
	if (flags & V_NOWAIT)
		return (EWOULDBLOCK);
	/*
	 * Wait for the suspension to finish.
	 */
	return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
	    "suspfs", 0));
}

/*
 * Filesystem write operation has completed.  If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;
{

	if (mp == NULL)
		return;
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
	struct mount *mp;
{
	struct thread *td = curthread;
	int error;

	if (mp->mnt_kern_flag & MNTK_SUSPEND)
		return (0);
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	if (mp->mnt_writeopcount > 0)
		(void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
	if ((error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) != 0) {
		vfs_write_resume(mp);
		return (error);
	}
	mp->mnt_kern_flag |= MNTK_SUSPENDED;
	return (0);
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
	struct mount *mp;
{

	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0)
		return;
	mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
	wakeup(&mp->mnt_writeopcount);
	wakeup(&mp->mnt_flag);
}

/*
 * Implement kqueues for files by translating them to a vnode operation.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{

	return (VOP_KQFILTER(fp->f_vnode, kn));
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * These calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
	struct uio auio;
	struct iovec iov;
	int error;

	iov.iov_len = *buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);

	/* authorize attribute retrieval as kernel */
	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
	    td);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp, 0, td);

	if (error == 0) {
		*buflen = *buflen - auio.uio_resid;
	}

	return (error);
}

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
	struct uio auio;
	struct iovec iov;
	struct mount *mp;
	int error;

	iov.iov_len = buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	}

	/* authorize attribute setting as kernel */
	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, td);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
	struct mount *mp;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	}

	/* authorize attribute removal as kernel */
	error = VOP_RMEXTATTR(vp, attrnamespace, attrname, NULL, td);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, td);
	}

	return (error);
}
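/*
 * Example usage (illustrative sketch, not taken from this file): a typical
 * caller of the extended attribute wrappers above.  The namespace, attribute
 * name and buffer are assumptions made for the example; on success buflen is
 * updated to the number of bytes actually read:
 *
 *	char buf[64];
 *	int buflen, error;
 *
 *	buflen = sizeof(buf);
 *	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
 *	    "an.attribute", &buflen, buf, td);
 */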