/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/conf.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>

static int vn_closefile __P((struct file *fp, struct proc *p));
static int vn_ioctl __P((struct file *fp, u_long com, caddr_t data,
	    struct proc *p));
static int vn_read __P((struct file *fp, struct uio *uio,
	    struct ucred *cred, int flags, struct proc *p));
static int vn_poll __P((struct file *fp, int events, struct ucred *cred,
	    struct proc *p));
static int vn_statfile __P((struct file *fp, struct stat *sb, struct proc *p));
static int vn_write __P((struct file *fp, struct uio *uio,
	    struct ucred *cred, int flags, struct proc *p));

struct fileops vnops =
	{ vn_read, vn_write, vn_ioctl, vn_poll, vn_statfile, vn_closefile };

static int filt_nullattach(struct knote *kn);
static int filt_vnattach(struct knote *kn);
static void filt_vndetach(struct knote *kn);
static int filt_vnode(struct knote *kn, long hint);
static int filt_vnread(struct knote *kn, long hint);

struct filterops vn_filtops =
	{ 1, filt_vnattach, filt_vndetach, filt_vnode };

/*
 * XXX
 * filt_vnread is ufs-specific, so the attach routine should really
 * switch out to different filterops based on the vn filetype
 */
struct filterops vn_rwfiltops[] = {
	{ 1, filt_vnattach, filt_vndetach, filt_vnread },
	{ 1, filt_nullattach, NULL, NULL },
};
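
/*
 * Note on the filterops tables above: the leading 1 in each initializer
 * fills in the first member of struct filterops (f_isfd), marking these
 * as filters that attach through a file descriptor.
 */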

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case;
 * the NDINIT is done by the caller, who also cleans up.
 */
int
vn_open(ndp, flagp, cmode)
	register struct nameidata *ndp;
	int *flagp, cmode;
{
	struct vnode *vp;
	struct mount *mp;
	struct proc *p = ndp->ni_cnd.cn_proc;
	struct ucred *cred = p->p_ucred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int mode, fmode, error;

restart:
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		if (ndp->ni_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				/*
				 * The filesystem is being suspended; drop
				 * what we hold, wait for the suspension to
				 * end, and redo the lookup from scratch.
				 */
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
			VOP_LEASE(ndp->ni_dvp, p, cred, LEASE_WRITE);
			error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
			    &ndp->ni_cnd, vap);
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
			ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			/* The file already exists. */
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags =
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
		if ((error = namei(ndp)) != 0)
			return (error);
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if ((fmode & O_CREAT) == 0) {
		mode = 0;
		if (fmode & (FWRITE | O_TRUNC)) {
			if (vp->v_type == VDIR) {
				error = EISDIR;
				goto bad;
			}
			error = vn_writechk(vp);
			if (error)
				goto bad;
			mode |= VWRITE;
		}
		if (fmode & FREAD)
			mode |= VREAD;
		if (mode) {
			error = VOP_ACCESS(vp, mode, cred, p);
			if (error)
				goto bad;
		}
	}
	if ((error = VOP_OPEN(vp, fmode, cred, p)) != 0)
		goto bad;
	/*
	 * Make sure that a VM object is created for VMIO support.
	 */
	if (vn_canvmio(vp) == TRUE) {
		if ((error = vfs_object_create(vp, p, cred)) != 0)
			goto bad;
	}

	if (fmode & FWRITE)
		vp->v_writecount++;
	*flagp = fmode;
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	*flagp = fmode;
	return (error);
}
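
/*
 * Illustrative sketch (not part of this file): a typical caller pairs
 * NDINIT() with vn_open() and, on success, is left holding a locked,
 * opened vnode plus the unfreed nameidata noted above.  Error handling
 * is elided, and "path" and "p" are assumed to come from the caller.
 *
 *	struct nameidata nd;
 *	int flags = FREAD;
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
 *	if ((error = vn_open(&nd, &flags, 0)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	... use nd.ni_vp ...
 *	VOP_UNLOCK(nd.ni_vp, 0, p);
 */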

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
	register struct vnode *vp;
{

	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_flag & VTEXT)
		return (ETXTBSY);
	return (0);
}

/*
 * Vnode close call.  The vnode is expected to be unlocked on entry;
 * the caller's reference is released here via vrele().
 */
int
vn_close(vp, flags, cred, p)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
{
	int error;

	if (flags & FWRITE)
		vp->v_writecount--;
	error = VOP_CLOSE(vp, flags, cred, p);
	vrele(vp);
	return (error);
}

static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{

	/*
	 * Sequential heuristic - detect sequential operation
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * XXX we assume that the filesystem block size is
		 * the default.  Not true, but still gives us a pretty
		 * good indicator of how sequential the read operations
		 * are.
		 */
		fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
		if (fp->f_seqcount >= 127)
			fp->f_seqcount = 127;
		/*
		 * The scaled count is handed to the filesystem in the
		 * upper bits of ioflag as a read-ahead hint.
		 */
		return (fp->f_seqcount << 16);
	}

	/*
	 * Not sequential, quick draw-down of seqcount
	 */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
	enum uio_rw rw;
	struct vnode *vp;
	caddr_t base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *cred;
	int *aresid;
	struct proc *p;
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		mp = NULL;
		if (rw == UIO_WRITE &&
		    vp->v_type != VCHR &&
		    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
	}
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_procp = p;
	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, p);
	}
	return (error);
}
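
/*
 * Illustrative sketch (not part of this file): read the first 512 bytes
 * of an unlocked vnode "vp" into a kernel buffer; vn_rdwr() does the
 * locking and uio packaging shown above.  "vp" and "p" are assumed to
 * be held by the caller.
 *
 *	char buf[512];
 *	int resid;
 *
 *	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, sizeof(buf),
 *	    (off_t)0, UIO_SYSSPACE, 0, p->p_ucred, &resid, p);
 */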

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	struct proc *p;
	int flags;
{
	struct vnode *vp;
	int error, ioflag;

	KASSERT(uio->uio_procp == p, ("uio_procp %p is not p %p",
	    uio->uio_procp, p));
	vp = (struct vnode *)fp->f_data;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	VOP_LEASE(vp, p, cred, LEASE_READ);
	vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;

	ioflag |= sequential_heuristic(uio, fp);

	error = VOP_READ(vp, uio, ioflag, cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, p);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	struct proc *p;
	int flags;
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag;

	KASSERT(uio->uio_procp == p, ("uio_procp %p is not p %p",
	    uio->uio_procp, p));
	vp = (struct vnode *)fp->f_data;
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	VOP_LEASE(vp, p, cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
	error = VOP_WRITE(vp, uio, ioflag, cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, p);
	vn_finished_write(mp);
	return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, p)
	struct file *fp;
	struct stat *sb;
	struct proc *p;
{
	struct vnode *vp = (struct vnode *)fp->f_data;

	return vn_stat(vp, sb, p);
}

int
vn_stat(vp, sb, p)
	struct vnode *vp;
	register struct stat *sb;
	struct proc *p;
{
	struct vattr vattr;
	register struct vattr *vap;
	int error;
	u_short mode;

	vap = &vattr;
	error = VOP_GETATTR(vp, vap, p->p_ucred, p);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	sb->st_lspare = 0;
	sb->st_qspare[0] = 0;
	sb->st_qspare[1] = 0;

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		/*
		 * This is a cosmetic change, symlinks do not have a mode.
		 * (Adjust "mode" here, not sb->st_mode, or the change is
		 * lost when st_mode is assigned below.)
		 */
		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
			mode &= ~ACCESSPERMS;	/* 0000 */
		else
			mode |= ACCESSPERMS;	/* 0777 */
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 * object.  In some filesystem types, this may vary from file
	 * to file".
	 * Default to zero to catch bogus uses of this field.
	 */

	if (vap->va_type == VREG) {
		sb->st_blksize = vap->va_blocksize;
	} else if (vn_isdisk(vp, NULL)) {
		sb->st_blksize = vp->v_rdev->si_bsize_best;
		if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
			sb->st_blksize = vp->v_rdev->si_bsize_phys;
		if (sb->st_blksize < BLKDEV_IOSIZE)
			sb->st_blksize = BLKDEV_IOSIZE;
	} else {
		sb->st_blksize = 0;
	}

	sb->st_flags = vap->va_flags;
	if (suser_xxx(p->p_ucred, 0, 0))
		sb->st_gen = 0;
	else
		sb->st_gen = vap->va_gen;

#if (S_BLKSIZE == 512)
	/* Optimize this case */
	sb->st_blocks = vap->va_bytes >> 9;
#else
	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
#endif
	return (0);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, p)
	struct file *fp;
	u_long com;
	caddr_t data;
	struct proc *p;
{
	register struct vnode *vp = ((struct vnode *)fp->f_data);
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
			if (error)
				return (error);
			*(int *)data = vattr.va_size - fp->f_offset;
			return (0);
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return (0);			/* XXX */
		/* fall into ... */

	default:
#if 0
		return (ENOTTY);
#endif
	case VFIFO:
	case VCHR:
	case VBLK:
		if (com == FIODTYPE) {
			if (vp->v_type != VCHR && vp->v_type != VBLK)
				return (ENOTTY);
			*(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
			return (0);
		}
		error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p);
		if (error == 0 && com == TIOCSCTTY) {

			/* Do nothing if reassigning same control tty */
			if (p->p_session->s_ttyvp == vp)
				return (0);

			/* Get rid of reference to old control tty */
			if (p->p_session->s_ttyvp)
				vrele(p->p_session->s_ttyvp);

			p->p_session->s_ttyvp = vp;
			VREF(vp);
		}
		return (error);
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, cred, p)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct proc *p;
{

	return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, p));
}

/*
 * Check that the vnode is still valid, and if so
 * acquire the requested lock.
 */
int
#ifndef	DEBUG_LOCKS
vn_lock(vp, flags, p)
#else
debug_vn_lock(vp, flags, p, filename, line)
#endif
	struct vnode *vp;
	int flags;
	struct proc *p;
#ifdef	DEBUG_LOCKS
	const char *filename;
	int line;
#endif
{
	int error;

	do {
		if ((flags & LK_INTERLOCK) == 0)
			mtx_enter(&vp->v_interlock, MTX_DEF);
		if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curproc) {
			vp->v_flag |= VXWANT;
			mtx_exit(&vp->v_interlock, MTX_DEF);
			tsleep((caddr_t)vp, PINOD, "vn_lock", 0);
			error = ENOENT;
		} else {
			if (vp->v_vxproc != NULL)
				printf("VXLOCK interlock avoided in vn_lock\n");
#ifdef	DEBUG_LOCKS
			vp->filename = filename;
			vp->line = line;
#endif
			error = VOP_LOCK(vp,
			    flags | LK_NOPAUSE | LK_INTERLOCK, p);
			if (error == 0)
				return (error);
		}
		flags &= ~LK_INTERLOCK;
	} while (flags & LK_RETRY);
	return (error);
}
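
/*
 * Illustrative sketch (not part of this file): the lock/unlock pairing
 * used throughout this file, here mirroring vn_read() above.  With
 * LK_RETRY the lock attempt is retried on a doomed (VXLOCK'ed) vnode;
 * without it, vn_lock() returns ENOENT instead.
 *
 *	vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
 *	error = VOP_READ(vp, uio, ioflag, cred);
 *	VOP_UNLOCK(vp, 0, p);
 */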

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, p)
	struct file *fp;
	struct proc *p;
{

	fp->f_ops = &badfileops;
	return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
	    fp->f_cred, p));
}

/*
 * Prepare to start a filesystem write operation.  If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed.  If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	/*
	 * If a vnode is provided, get and return the mount point to
	 * which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);
	/*
	 * Check on status of suspension.
	 */
	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		if (flags & V_NOWAIT)
			return (EWOULDBLOCK);
		error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
		    "suspfs", 0);
		if (error)
			return (error);
	}
	if (flags & V_XSLEEP)
		return (0);
	mp->mnt_writeopcount++;
	return (0);
}

/*
 * Secondary suspension.  Used by operations such as vop_inactive
 * routines that are needed by the higher level functions.  These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero).  At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_write_suspend_wait(vp, mp, flags)
	struct vnode *vp;
	struct mount *mp;
	int flags;
{
	int error;

	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
		return (0);
	if (flags & V_NOWAIT)
		return (EWOULDBLOCK);
	/*
	 * Wait for the suspension to finish.
	 */
	return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
	    "suspfs", 0));
}

/*
 * Filesystem write operation has completed.  If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;
{

	if (mp == NULL)
		return;
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
}
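
/*
 * Illustrative sketch (not part of this file): the write-suspension
 * protocol implemented by vn_start_write()/vn_finished_write() above,
 * as used by vn_rdwr() and vn_write().  "VOP_SOMETHING" stands for any
 * write-side vnode operation; error handling is elided.
 *
 *	struct mount *mp;
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 *	error = VOP_SOMETHING(vp, ...);
 *	VOP_UNLOCK(vp, 0, p);
 *	vn_finished_write(mp);
 */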

/*
 * Request a filesystem to suspend write operations.
 */
void
vfs_write_suspend(mp)
	struct mount *mp;
{
	struct proc *p = curproc;

	if (mp->mnt_kern_flag & MNTK_SUSPEND)
		return;
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	if (mp->mnt_writeopcount > 0)
		(void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
	VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p);
	mp->mnt_kern_flag |= MNTK_SUSPENDED;
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
	struct mount *mp;
{

	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0)
		return;
	mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
	wakeup(&mp->mnt_writeopcount);
	wakeup(&mp->mnt_flag);
}

static int
filt_vnattach(struct knote *kn)
{
	struct vnode *vp;

	if (kn->kn_fp->f_type != DTYPE_VNODE &&
	    kn->kn_fp->f_type != DTYPE_FIFO)
		return (EBADF);

	vp = (struct vnode *)kn->kn_fp->f_data;

	/*
	 * XXX
	 * this is a hack simply to cause the filter attach to fail
	 * for non-ufs filesystems, until the support for them is done.
	 */
	if (vp->v_tag != VT_UFS)
		return (EOPNOTSUPP);

	simple_lock(&vp->v_pollinfo.vpi_lock);
	SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
	simple_unlock(&vp->v_pollinfo.vpi_lock);

	return (0);
}

static void
filt_vndetach(struct knote *kn)
{
	struct vnode *vp = (struct vnode *)kn->kn_fp->f_data;

	simple_lock(&vp->v_pollinfo.vpi_lock);
	SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
	    kn, knote, kn_selnext);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

static int
filt_vnode(struct knote *kn, long hint)
{

	if (kn->kn_sfflags & hint)
		kn->kn_fflags |= hint;
	return (kn->kn_fflags != 0);
}

static int
filt_nullattach(struct knote *kn)
{

	return (ENXIO);
}

/*ARGSUSED*/
static int
filt_vnread(struct knote *kn, long hint)
{
	struct vnode *vp = (struct vnode *)kn->kn_fp->f_data;
	struct inode *ip = VTOI(vp);

	kn->kn_data = ip->i_size - kn->kn_fp->f_offset;
	return (kn->kn_data != 0);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * These calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, const char *attrname, int *buflen,
    char *buf, struct proc *p)
{
	struct uio auio;
	struct iovec iov;
	int error;

	iov.iov_len = *buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_procp = p;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);

	/* authorize attribute retrieval as kernel */
	error = VOP_GETEXTATTR(vp, attrname, &auio, NULL, p);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp, 0, p);

	if (error == 0) {
		*buflen = *buflen - auio.uio_resid;
	}

	return (error);
}

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, const char *attrname, int buflen,
    char *buf, struct proc *p)
{
	struct uio auio;
	struct iovec iov;
	struct mount *mp;
	int error;

	iov.iov_len = buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_procp = p;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
	}

	/* authorize attribute setting as kernel */
	error = VOP_SETEXTATTR(vp, attrname, &auio, NULL, p);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, p);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, const char *attrname, struct proc *p)
{
	struct mount *mp;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
	}

	/* authorize attribute removal as kernel; a NULL uio deletes it */
	error = VOP_SETEXTATTR(vp, attrname, NULL, NULL, p);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, p);
	}

	return (error);
}
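
/*
 * Illustrative sketch (not part of this file): round-trip a value
 * through the wrappers above.  "MYATTR" is a hypothetical attribute
 * name; "vp" is an unlocked vnode held by the caller, and error
 * handling is elided.
 *
 *	char val[16] = "hello";
 *	int len = sizeof(val);
 *
 *	error = vn_extattr_set(vp, 0, "MYATTR", 5, val, p);
 *	if (error == 0)
 *		error = vn_extattr_get(vp, 0, "MYATTR", &len, val, p);
 */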