/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/sx.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/unistd.h>

static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

struct fileops vnops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

int
vn_open(ndp, flagp, cmode, fdidx)
	struct nameidata *ndp;
	int *flagp, cmode, fdidx;
{
	struct thread *td = ndp->ni_cnd.cn_thread;

	return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fdidx));
}
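
/*
 * Example (illustrative sketch, not part of this file): a typical
 * in-kernel consumer initializes a nameidata with NDINIT, opens the
 * vnode with vn_open(), and later balances the open with vn_close().
 * The path, flags, and cleanup shown here are hypothetical; vn_open()
 * returns with the vnode locked, and vn_open_cred() below rewrites
 * cn_flags itself, so the NDINIT flags matter little.
 *
 *	struct nameidata nd;
 *	int flags, error;
 *
 *	flags = FREAD;
 *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/etc/motd", td);
 *	error = vn_open(&nd, &flags, 0, -1);
 *	if (error)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	VOP_UNLOCK(nd.ni_vp, 0, td);
 *	... use nd.ni_vp ...
 *	error = vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
 */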

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(ndp, flagp, cmode, cred, fdidx)
	struct nameidata *ndp;
	int *flagp, cmode;
	struct ucred *cred;
	int fdidx;
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int mode, fmode, error;
	int vfslocked;

restart:
	vfslocked = 0;
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | MPSAFE;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
		ndp->ni_cnd.cn_flags &= ~MPSAFE;
		if (ndp->ni_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				VFS_UNLOCK_GIANT(vfslocked);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
#ifdef MAC
			error = mac_check_vnode_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0) {
#endif
				VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
				    &ndp->ni_cnd, vap);
#ifdef MAC
			}
#endif
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				VFS_UNLOCK_GIANT(vfslocked);
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = ISOPEN |
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
		    LOCKSHARED | LOCKLEAF | MPSAFE;
		if ((error = namei(ndp)) != 0)
			return (error);
		ndp->ni_cnd.cn_flags &= ~MPSAFE;
		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	mode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		mode |= VWRITE;
	}
	if (fmode & FREAD)
		mode |= VREAD;
	if (fmode & O_APPEND)
		mode |= VAPPEND;
#ifdef MAC
	error = mac_check_vnode_open(cred, vp, mode);
	if (error)
		goto bad;
#endif
	if ((fmode & O_CREAT) == 0) {
		if (mode & VWRITE) {
			error = vn_writechk(vp);
			if (error)
				goto bad;
		}
		if (mode) {
			error = VOP_ACCESS(vp, mode, cred, td);
			if (error)
				goto bad;
		}
	}
	if ((error = VOP_OPEN(vp, fmode, cred, td, fdidx)) != 0)
		goto bad;

	if (fmode & FWRITE)
		vp->v_writecount++;
	*flagp = fmode;
	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
	if (fdidx == -1)
		VFS_UNLOCK_GIANT(vfslocked);
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
	register struct vnode *vp;
{

	ASSERT_VOP_LOCKED(vp, "vn_writechk");
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_vflag & VV_TEXT)
		return (ETXTBSY);

	return (0);
}

/*
 * Vnode close call
 */
int
vn_close(vp, flags, file_cred, td)
	register struct vnode *vp;
	int flags;
	struct ucred *file_cred;
	struct thread *td;
{
	struct mount *mp;
	int error;

	VFS_ASSERT_GIANT(vp->v_mount);

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (flags & FWRITE)
		vp->v_writecount--;
	error = VOP_CLOSE(vp, flags, file_cred, td);
	vput(vp);
	vn_finished_write(mp);
	return (error);
}

/*
 * Sequential heuristic - detect sequential operation
 */
static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{

	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * XXX we assume that the filesystem block size is
		 * the default.  Not true, but still gives us a pretty
		 * good indicator of how sequential the read operations
		 * are.
		 */
		fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
		if (fp->f_seqcount > IO_SEQMAX)
			fp->f_seqcount = IO_SEQMAX;
		return (fp->f_seqcount << IO_SEQSHIFT);
	}

	/*
	 * Not sequential, quick draw-down of seqcount
	 */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}
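
/*
 * Worked example (illustrative; assumes the common BKVASIZE of 16KB):
 * a read of 64KB that continues at f_nextoff adds
 * (65536 + 16383) / 16384 = 4 to f_seqcount, which is clamped at
 * IO_SEQMAX.  The return value carries the clamped count in the
 * high bits of ioflag (shifted by IO_SEQSHIFT), so filesystems can
 * scale read-ahead to how sequential the access pattern looks.
 */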

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
    aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	caddr_t base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	int *aresid;
	struct thread *td;
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	int error;

	VFS_ASSERT_GIANT(vp->v_mount);

	if ((ioflg & IO_NODELOCKED) == 0) {
		mp = NULL;
		if (rw == UIO_WRITE) {
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				return (error);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		} else {
			/*
			 * XXX This should be LK_SHARED but I don't trust VFS
			 * enough to leave it like that until it has been
			 * reviewed further.
			 */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		}
	}
	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_check_vnode_read(active_cred, file_cred,
			    vp);
		else
			error = mac_check_vnode_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred)
			cred = file_cred;
		else
			cred = active_cred;
		if (rw == UIO_READ)
			error = VOP_READ(vp, &auio, ioflg, cred);
		else
			error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_WRITE)
			vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, td);
	}
	return (error);
}
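
/*
 * Example (illustrative sketch, not part of this file): reading the
 * first bytes of a file into a kernel buffer with vn_rdwr().  The
 * buffer name and size are hypothetical.  Passing NOCRED as file_cred
 * makes the VOP_READ use active_cred; because an aresid pointer is
 * supplied, a short read is reported through resid rather than as EIO.
 *
 *	char hdr[64];
 *	int resid, error;
 *
 *	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof(hdr),
 *	    (off_t)0, UIO_SYSSPACE, 0, td->td_ucred, NOCRED, &resid, td);
 *	if (error == 0 && resid != 0)
 *		error = EIO;		* short read *
 */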

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * call bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	caddr_t base;
	size_t len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	size_t *aresid;
	struct thread *td;
{
	int error = 0;
	int iaresid;

	VFS_ASSERT_GIANT(vp->v_mount);

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base += chunk;
		uio_yield();
	} while (len);
	if (aresid)
		*aresid = len + iaresid;
	return (error);
}
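
/*
 * Worked example of the chunking above (assuming MAXBSIZE is 65536):
 * a 200000-byte transfer starting at offset 1000 is issued as a first
 * chunk of 65536 - 1000 = 64536 bytes, which aligns the next offset
 * to 65536, then two full 65536-byte chunks, and a final chunk of
 * 4392 bytes (64536 + 65536 + 65536 + 4392 = 200000).
 */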

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	int error, ioflag;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
	/*
	 * According to McKusick the vn lock is protecting f_offset here.
	 * Once this field has its own lock we can acquire the vnode lock
	 * shared.
	 */
	if ((flags & FOF_OFFSET) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		uio->uio_offset = fp->f_offset;
	} else
		vn_lock(vp, LK_SHARED | LK_RETRY, td);

	ioflag |= sequential_heuristic(uio, fp);

#ifdef MAC
	error = mac_check_vnode_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto unlock;
	VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
	error = mac_check_vnode_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
unlock:
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
	struct file *fp;
	struct stat *sb;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
	VOP_UNLOCK(vp, 0, td);
	VFS_UNLOCK_GIANT(vfslocked);

	return (error);
}

/*
 * Stat a vnode; implementation for the stat syscall
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
	struct vnode *vp;
	register struct stat *sb;
	struct ucred *active_cred;
	struct ucred *file_cred;
	struct thread *td;
{
	struct vattr vattr;
	register struct vattr *vap;
	int error;
	u_short mode;

#ifdef MAC
	error = mac_check_vnode_stat(active_cred, file_cred, vp);
	if (error)
		return (error);
#endif

	vap = &vattr;
	error = VOP_GETATTR(vp, vap, active_cred, td);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	bzero(sb, sizeof *sb);

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		/* This is a cosmetic change, symlinks do not have a mode. */
		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
			mode &= ~ACCESSPERMS;	/* 0000 */
		else
			mode |= ACCESSPERMS;	/* 0777 */
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	if (vap->va_size > OFF_MAX)
		return (EOVERFLOW);
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;
	sb->st_birthtimespec = vap->va_birthtime;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 * object.  In some filesystem types, this may vary from file
	 * to file"
	 * Default to PAGE_SIZE after much discussion.
	 * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct.
	 */

	sb->st_blksize = PAGE_SIZE;

	sb->st_flags = vap->va_flags;
	if (suser(td))
		sb->st_gen = 0;
	else
		sb->st_gen = vap->va_gen;

#if (S_BLKSIZE == 512)
	/* Optimize this case */
	sb->st_blocks = vap->va_bytes >> 9;
#else
	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
#endif
	return (0);
}
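
/*
 * Example (illustrative sketch): vn_stat() expects a locked vnode, as
 * vn_statfile() above shows; a direct caller can then test the
 * translated type bits with the standard macros.
 *
 *	struct stat sb;
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 *	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
 *	VOP_UNLOCK(vp, 0, td);
 *	if (error == 0 && S_ISDIR(sb.st_mode))
 *		... vp names a directory ...
 */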

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
	struct file *fp;
	u_long com;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = ENOTTY;
	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			error = VOP_GETATTR(vp, &vattr, active_cred, td);
			VOP_UNLOCK(vp, 0, td);
			if (!error)
				*(int *)data = vattr.va_size - fp->f_offset;
		} else if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			error = 0;
		else
			error = VOP_IOCTL(vp, com, data, fp->f_flag,
			    active_cred, td);
		break;

	default:
		break;
	}
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp;
	int error;

	mtx_lock(&Giant);

	vp = fp->f_vnode;
#ifdef MAC
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
	VOP_UNLOCK(vp, 0, td);
	if (!error)
#endif
		error = VOP_POLL(vp, events, fp->f_cred, td);
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
vn_lock(vp, flags, td)
	struct vnode *vp;
	int flags;
	struct thread *td;
{
	int error;

	do {
		if ((flags & LK_INTERLOCK) == 0)
			VI_LOCK(vp);
		if ((flags & LK_NOWAIT || (flags & LK_TYPE_MASK) == 0) &&
		    vp->v_iflag & VI_DOOMED) {
			VI_UNLOCK(vp);
			return (ENOENT);
		}
		/*
		 * Just polling to check validity.
		 */
		if ((flags & LK_TYPE_MASK) == 0) {
			VI_UNLOCK(vp);
			return (0);
		}
		/*
		 * lockmgr drops interlock before it will return for
		 * any reason.  So force the code above to relock it.
		 */
		error = VOP_LOCK(vp, flags | LK_INTERLOCK, td);
		flags &= ~LK_INTERLOCK;
		KASSERT((flags & LK_RETRY) == 0 || error == 0,
		    ("LK_RETRY set with incompatible flags %d\n", flags));
		/*
		 * Callers specify LK_RETRY if they wish to get dead vnodes.
		 * If RETRY is not set, we return ENOENT instead.
		 */
		if (error == 0 && vp->v_iflag & VI_DOOMED &&
		    (flags & LK_RETRY) == 0) {
			VOP_UNLOCK(vp, 0, td);
			error = ENOENT;
			break;
		}
	} while (flags & LK_RETRY && error != 0);
	return (error);
}
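
/*
 * Example (illustrative sketch): without LK_RETRY a caller gets ENOENT
 * back if the vnode was reclaimed while it waited, and can simply give
 * up; code that must lock even a doomed vnode passes LK_RETRY and
 * accepts whatever it gets.
 *
 *	error = vn_lock(vp, LK_EXCLUSIVE, td);
 *	if (error == 0) {
 *		... operate on a still-valid vp ...
 *		VOP_UNLOCK(vp, 0, td);
 *	}
 */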

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct vnode *vp;
	struct flock lf;
	int vfslocked;
	int error;

	vp = fp->f_vnode;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
	}

	fp->f_ops = &badfileops;

	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * Preparing to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	error = 0;
	/*
	 * If a vnode is provided, get and return the mount point to
	 * which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);
	MNT_ILOCK(mp);
	/*
	 * Check on status of suspension.
	 */
	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		if (flags & V_NOWAIT) {
			error = EWOULDBLOCK;
			goto unlock;
		}
		error = msleep(&mp->mnt_flag, MNT_MTX(mp),
		    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
		if (error)
			goto unlock;
	}
	if (flags & V_XSLEEP)
		goto unlock;
	mp->mnt_writeopcount++;
unlock:
	MNT_IUNLOCK(mp);
	return (error);
}

/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_write_suspend_wait(vp, mp, flags)
	struct vnode *vp;
	struct mount *mp;
	int flags;
{
	int error;

	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if (mp == NULL)
		return (0);
	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) {
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_IUNLOCK(mp);
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.
	 */
	return (msleep(&mp->mnt_flag, MNT_MTX(mp),
	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0));
}

/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
	MNT_IUNLOCK(mp);
}
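
/*
 * Example (illustrative sketch): the canonical bracket around a
 * write-path operation, so that the operation both blocks while the
 * filesystem is suspended and is counted in mnt_writeopcount while it
 * runs.  The VOP and its arguments are hypothetical.
 *
 *	struct mount *mp;
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 *	error = VOP_SETATTR(vp, &vattr, cred, td);
 *	VOP_UNLOCK(vp, 0, td);
 *	vn_finished_write(mp);
 */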

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
	struct mount *mp;
{
	struct thread *td = curthread;
	int error;

	error = 0;
	MNT_ILOCK(mp);
	if (mp->mnt_kern_flag & MNTK_SUSPEND)
		goto unlock;
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	if (mp->mnt_writeopcount > 0)
		(void) msleep(&mp->mnt_writeopcount,
		    MNT_MTX(mp), (PUSER - 1) | PDROP, "suspwt", 0);
	else
		MNT_IUNLOCK(mp);
	if ((error = VFS_SYNC(mp, MNT_WAIT, td)) != 0) {
		vfs_write_resume(mp);
		return (error);
	}
	MNT_ILOCK(mp);
	mp->mnt_kern_flag |= MNTK_SUSPENDED;
unlock:
	MNT_IUNLOCK(mp);
	return (error);
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
	struct mount *mp;
{

	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
		wakeup(&mp->mnt_writeopcount);
		wakeup(&mp->mnt_flag);
	}
	MNT_IUNLOCK(mp);
}
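
/*
 * Example (illustrative sketch): a snapshot-style consumer quiesces
 * the filesystem, does its work while writes are held off, and then
 * resumes them.  Error handling is elided.
 *
 *	if (vfs_write_suspend(mp) == 0) {
 *		... filesystem is idle and synced; do the work ...
 *		vfs_write_resume(mp);
 *	}
 */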

/*
 * Implement kqueues for files by translating them to vnode operations.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	int error;

	mtx_lock(&Giant);
	error = VOP_KQFILTER(fp->f_vnode, kn);
	mtx_unlock(&Giant);

	return (error);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
	struct uio auio;
	struct iovec iov;
	int error;

	iov.iov_len = *buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute retrieval as kernel */
	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
	    td);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp, 0, td);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return (error);
}

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
	struct uio auio;
	struct iovec iov;
	struct mount *mp;
	int error;

	iov.iov_len = buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute setting as kernel */
	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, td);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
	struct mount *mp;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute removal as kernel */
	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, td);
	}

	return (error);
}
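
/*
 * Example (illustrative sketch): round-tripping a small attribute
 * through the wrappers above with ioflg 0, letting them handle the
 * locking and write accounting.  The attribute name and buffer are
 * hypothetical.
 *
 *	char buf[16] = "value";
 *	int buflen, error;
 *
 *	error = vn_extattr_set(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "example", sizeof(buf), buf, td);
 *	if (error == 0) {
 *		buflen = sizeof(buf);
 *		error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *		    "example", &buflen, buf, td);
 *	}
 */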