/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/resourcevar.h>
#include <sys/sx.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/unistd.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_rdwr_t	vn_io_fault;
static fo_truncate_t	vn_truncate;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

struct fileops vnops = {
    .fo_read = vn_io_fault,
    .fo_write = vn_io_fault,
    .fo_truncate = vn_truncate,
    .fo_ioctl = vn_ioctl,
    .fo_poll = vn_poll,
    .fo_kqfilter = vn_kqfilter,
    .fo_stat = vn_statfile,
    .fo_close = vn_closefile,
    .fo_chmod = vn_chmod,
    .fo_chown = vn_chown,
    .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

int
vn_open(ndp, flagp, cmode, fp)
    struct nameidata *ndp;
    int *flagp, cmode;
    struct file *fp;
{
    struct thread *td = ndp->ni_cnd.cn_thread;

    return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
}

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
    struct ucred *cred, struct file *fp)
{
    struct vnode *vp;
    struct mount *mp;
    struct thread *td = ndp->ni_cnd.cn_thread;
    struct vattr vat;
    struct vattr *vap = &vat;
    int fmode, error;
    accmode_t accmode;
    int vfslocked, mpsafe;

    mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
restart:
    vfslocked = 0;
    fmode = *flagp;
    if (fmode & O_CREAT) {
        ndp->ni_cnd.cn_nameiop = CREATE;
        ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
            MPSAFE;
        if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
            ndp->ni_cnd.cn_flags |= FOLLOW;
        if (!(vn_open_flags & VN_OPEN_NOAUDIT))
            ndp->ni_cnd.cn_flags |= AUDITVNODE1;
        bwillwrite();
        if ((error = namei(ndp)) != 0)
            return (error);
        vfslocked = NDHASGIANT(ndp);
        if (!mpsafe)
            ndp->ni_cnd.cn_flags &= ~MPSAFE;
        if (ndp->ni_vp == NULL) {
            VATTR_NULL(vap);
            vap->va_type = VREG;
            vap->va_mode = cmode;
            if (fmode & O_EXCL)
                vap->va_vaflags |= VA_EXCLUSIVE;
            if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
                NDFREE(ndp, NDF_ONLY_PNBUF);
                vput(ndp->ni_dvp);
                VFS_UNLOCK_GIANT(vfslocked);
                if ((error = vn_start_write(NULL, &mp,
                    V_XSLEEP | PCATCH)) != 0)
                    return (error);
                goto restart;
            }
#ifdef MAC
            error = mac_vnode_check_create(cred, ndp->ni_dvp,
                &ndp->ni_cnd, vap);
            if (error == 0)
#endif
                error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
                    &ndp->ni_cnd, vap);
            vput(ndp->ni_dvp);
            vn_finished_write(mp);
            if (error) {
                VFS_UNLOCK_GIANT(vfslocked);
                NDFREE(ndp, NDF_ONLY_PNBUF);
                return (error);
            }
            fmode &= ~O_TRUNC;
            vp = ndp->ni_vp;
        } else {
            if (ndp->ni_dvp == ndp->ni_vp)
                vrele(ndp->ni_dvp);
            else
                vput(ndp->ni_dvp);
            ndp->ni_dvp = NULL;
            vp = ndp->ni_vp;
            if (fmode & O_EXCL) {
                error = EEXIST;
                goto bad;
            }
            fmode &= ~O_CREAT;
        }
    } else {
        ndp->ni_cnd.cn_nameiop = LOOKUP;
        ndp->ni_cnd.cn_flags = ISOPEN |
            ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
            LOCKLEAF | MPSAFE;
        if (!(fmode & FWRITE))
            ndp->ni_cnd.cn_flags |= LOCKSHARED;
        if (!(vn_open_flags & VN_OPEN_NOAUDIT))
            ndp->ni_cnd.cn_flags |= AUDITVNODE1;
        if ((error = namei(ndp)) != 0)
            return (error);
        if (!mpsafe)
            ndp->ni_cnd.cn_flags &= ~MPSAFE;
        vfslocked = NDHASGIANT(ndp);
        vp = ndp->ni_vp;
    }
    if (vp->v_type == VLNK) {
        error = EMLINK;
        goto bad;
    }
    if (vp->v_type == VSOCK) {
        error = EOPNOTSUPP;
        goto bad;
    }
    if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
        error = ENOTDIR;
        goto bad;
    }
    accmode = 0;
    if (fmode & (FWRITE | O_TRUNC)) {
        if (vp->v_type == VDIR) {
            error = EISDIR;
            goto bad;
        }
        accmode |= VWRITE;
    }
    if (fmode & FREAD)
        accmode |= VREAD;
    if (fmode & FEXEC)
        accmode |= VEXEC;
    if ((fmode & O_APPEND) && (fmode & FWRITE))
        accmode |= VAPPEND;
#ifdef MAC
    error = mac_vnode_check_open(cred, vp, accmode);
    if (error)
        goto bad;
#endif
    if ((fmode & O_CREAT) == 0) {
        if (accmode & VWRITE) {
            error = vn_writechk(vp);
            if (error)
                goto bad;
        }
        if (accmode) {
            error = VOP_ACCESS(vp, accmode, cred, td);
            if (error)
                goto bad;
        }
    }
    if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
        goto bad;

    if (fmode & FWRITE) {
        vp->v_writecount++;
        CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
            __func__, vp, vp->v_writecount);
    }
    *flagp = fmode;
    ASSERT_VOP_LOCKED(vp, "vn_open_cred");
    if (!mpsafe)
        VFS_UNLOCK_GIANT(vfslocked);
    return (0);
bad:
    NDFREE(ndp, NDF_ONLY_PNBUF);
    vput(vp);
    VFS_UNLOCK_GIANT(vfslocked);
    *flagp = fmode;
    ndp->ni_vp = NULL;
    return (error);
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
    register struct vnode *vp;
{

    ASSERT_VOP_LOCKED(vp, "vn_writechk");
    /*
     * If there's shared text associated with
     * the vnode, try to free it up once.  If
     * we fail, we can't allow writing.
     */
    if (vp->v_vflag & VV_TEXT)
        return (ETXTBSY);

    return (0);
}

/*
 * Vnode close call
 */
int
vn_close(vp, flags, file_cred, td)
    register struct vnode *vp;
    int flags;
    struct ucred *file_cred;
    struct thread *td;
{
    struct mount *mp;
    int error, lock_flags;

    if (!(flags & FWRITE) && vp->v_mount != NULL &&
        vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
        lock_flags = LK_SHARED;
    else
        lock_flags = LK_EXCLUSIVE;

    VFS_ASSERT_GIANT(vp->v_mount);

    vn_start_write(vp, &mp, V_WAIT);
    vn_lock(vp, lock_flags | LK_RETRY);
    if (flags & FWRITE) {
        VNASSERT(vp->v_writecount > 0, vp,
            ("vn_close: negative writecount"));
        vp->v_writecount--;
        CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
            __func__, vp, vp->v_writecount);
    }
    error = VOP_CLOSE(vp, flags, file_cred, td);
    vput(vp);
    vn_finished_write(mp);
    return (error);
}

/*
 * Heuristic to detect sequential operation.
 */
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{

    if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
        return (fp->f_seqcount << IO_SEQSHIFT);

    /*
     * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
     * that the first I/O is normally considered to be slightly
     * sequential.  Seeking to offset 0 doesn't change sequentiality
     * unless previous seeks have reduced f_seqcount to 0, in which
     * case offset 0 is not special.
     */
    if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
        uio->uio_offset == fp->f_nextoff) {
        /*
         * f_seqcount is in units of fixed-size blocks so that it
         * depends mainly on the amount of sequential I/O and not
         * much on the number of sequential I/O's.  The fixed size
         * of 16384 is hard-coded here since it is (not quite) just
         * a magic size that works well here.  This size is more
         * closely related to the best I/O size for real disks than
         * to any block size used by software.
         */
        fp->f_seqcount += howmany(uio->uio_resid, 16384);
        if (fp->f_seqcount > IO_SEQMAX)
            fp->f_seqcount = IO_SEQMAX;
        return (fp->f_seqcount << IO_SEQSHIFT);
    }

    /* Not sequential.  Quickly draw-down sequentiality. */
    if (fp->f_seqcount > 1)
        fp->f_seqcount = 1;
    else
        fp->f_seqcount = 0;
    return (0);
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
    struct ucred *file_cred, ssize_t *aresid, struct thread *td)
{
    struct uio auio;
    struct iovec aiov;
    struct mount *mp;
    struct ucred *cred;
    void *rl_cookie;
    int error, lock_flags;

    VFS_ASSERT_GIANT(vp->v_mount);

    auio.uio_iov = &aiov;
    auio.uio_iovcnt = 1;
    aiov.iov_base = base;
    aiov.iov_len = len;
    auio.uio_resid = len;
    auio.uio_offset = offset;
    auio.uio_segflg = segflg;
    auio.uio_rw = rw;
    auio.uio_td = td;
    error = 0;

    if ((ioflg & IO_NODELOCKED) == 0) {
        if (rw == UIO_READ) {
            rl_cookie = vn_rangelock_rlock(vp, offset,
                offset + len);
        } else {
            rl_cookie = vn_rangelock_wlock(vp, offset,
                offset + len);
        }
        mp = NULL;
        if (rw == UIO_WRITE) {
            if (vp->v_type != VCHR &&
                (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
                != 0)
                goto out;
            if (MNT_SHARED_WRITES(mp) ||
                ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
                lock_flags = LK_SHARED;
            else
                lock_flags = LK_EXCLUSIVE;
        } else
            lock_flags = LK_SHARED;
        vn_lock(vp, lock_flags | LK_RETRY);
    } else
        rl_cookie = NULL;

    ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
#ifdef MAC
    if ((ioflg & IO_NOMACCHECK) == 0) {
        if (rw == UIO_READ)
            error = mac_vnode_check_read(active_cred, file_cred,
                vp);
        else
            error = mac_vnode_check_write(active_cred, file_cred,
                vp);
    }
#endif
    if (error == 0) {
        if (file_cred != NULL)
            cred = file_cred;
        else
            cred = active_cred;
        if (rw == UIO_READ)
            error = VOP_READ(vp, &auio, ioflg, cred);
        else
            error = VOP_WRITE(vp, &auio, ioflg, cred);
    }
    if (aresid)
        *aresid = auio.uio_resid;
    else
        if (auio.uio_resid && error == 0)
            error = EIO;
    if ((ioflg & IO_NODELOCKED) == 0) {
        VOP_UNLOCK(vp, 0);
        if (mp != NULL)
            vn_finished_write(mp);
    }
out:
    if (rl_cookie != NULL)
        vn_rangelock_unlock(vp, rl_cookie);
    return (error);
}

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
    enum uio_rw rw;
    struct vnode *vp;
    void *base;
    size_t len;
    off_t offset;
    enum uio_seg segflg;
    int ioflg;
    struct ucred *active_cred;
    struct ucred *file_cred;
    size_t *aresid;
    struct thread *td;
{
    int error = 0;
    ssize_t iaresid;

    VFS_ASSERT_GIANT(vp->v_mount);

    do {
        int chunk;

        /*
         * Force `offset' to be a multiple of MAXBSIZE except possibly
         * for the first chunk, so that filesystems only need to
         * write full blocks except possibly for the first and last
         * chunks.
         */
        chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

        if (chunk > len)
            chunk = len;
        if (rw != UIO_READ && vp->v_type == VREG)
            bwillwrite();
        iaresid = 0;
        error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
            ioflg, active_cred, file_cred, &iaresid, td);
        len -= chunk;	/* aresid calc already includes length */
        if (error)
            break;
        offset += chunk;
        base = (char *)base + chunk;
        kern_yield(PRI_USER);
    } while (len);
    if (aresid)
        *aresid = len + iaresid;
    return (error);
}

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
    struct file *fp;
    struct uio *uio;
    struct ucred *active_cred;
    int flags;
    struct thread *td;
{
    struct vnode *vp;
    int error, ioflag;
    struct mtx *mtxp;
    int advice, vfslocked;
    off_t offset;

    KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
        uio->uio_td, td));
    mtxp = NULL;
    vp = fp->f_vnode;
    ioflag = 0;
    if (fp->f_flag & FNONBLOCK)
        ioflag |= IO_NDELAY;
    if (fp->f_flag & O_DIRECT)
        ioflag |= IO_DIRECT;
    advice = POSIX_FADV_NORMAL;
    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
    /*
     * According to McKusick the vn lock was protecting f_offset here.
     * It is now protected by the FOFFSET_LOCKED flag.
     */
    if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
        mtxp = mtx_pool_find(mtxpool_sleep, fp);
        mtx_lock(mtxp);
        if ((flags & FOF_OFFSET) == 0) {
            while (fp->f_vnread_flags & FOFFSET_LOCKED) {
                fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
                msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
                    "vnread offlock", 0);
            }
            fp->f_vnread_flags |= FOFFSET_LOCKED;
            uio->uio_offset = fp->f_offset;
        }
        if (fp->f_advice != NULL &&
            uio->uio_offset >= fp->f_advice->fa_start &&
            uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
            advice = fp->f_advice->fa_advice;
        mtx_unlock(mtxp);
    }
    vn_lock(vp, LK_SHARED | LK_RETRY);

    switch (advice) {
    case POSIX_FADV_NORMAL:
    case POSIX_FADV_SEQUENTIAL:
    case POSIX_FADV_NOREUSE:
        ioflag |= sequential_heuristic(uio, fp);
        break;
    case POSIX_FADV_RANDOM:
        /* Disable read-ahead for random I/O. */
        break;
    }
    offset = uio->uio_offset;

#ifdef MAC
    error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
    if (error == 0)
#endif
        error = VOP_READ(vp, uio, ioflag, fp->f_cred);
    if ((flags & FOF_OFFSET) == 0) {
        fp->f_offset = uio->uio_offset;
        mtx_lock(mtxp);
        if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
            wakeup(&fp->f_vnread_flags);
        fp->f_vnread_flags = 0;
        mtx_unlock(mtxp);
    }
    fp->f_nextoff = uio->uio_offset;
    VOP_UNLOCK(vp, 0);
    if (error == 0 && advice == POSIX_FADV_NOREUSE &&
        offset != uio->uio_offset)
        error = VOP_ADVISE(vp, offset, uio->uio_offset - 1,
            POSIX_FADV_DONTNEED);
    VFS_UNLOCK_GIANT(vfslocked);
    return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
    struct file *fp;
    struct uio *uio;
    struct ucred *active_cred;
    int flags;
    struct thread *td;
{
    struct vnode *vp;
    struct mount *mp;
    int error, ioflag, lock_flags;
    struct mtx *mtxp;
    int advice, vfslocked;

    KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
        uio->uio_td, td));
    vp = fp->f_vnode;
    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
    if (vp->v_type == VREG)
        bwillwrite();
    ioflag = IO_UNIT;
    if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
        ioflag |= IO_APPEND;
    if (fp->f_flag & FNONBLOCK)
        ioflag |= IO_NDELAY;
    if (fp->f_flag & O_DIRECT)
        ioflag |= IO_DIRECT;
    if ((fp->f_flag & O_FSYNC) ||
        (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
        ioflag |= IO_SYNC;
    mp = NULL;
    if (vp->v_type != VCHR &&
        (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
        goto unlock;

    if ((MNT_SHARED_WRITES(mp) ||
        ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
        (flags & FOF_OFFSET) != 0) {
        lock_flags = LK_SHARED;
    } else {
        lock_flags = LK_EXCLUSIVE;
    }

    vn_lock(vp, lock_flags | LK_RETRY);
    if ((flags & FOF_OFFSET) == 0)
        uio->uio_offset = fp->f_offset;
    advice = POSIX_FADV_NORMAL;
    if (fp->f_advice != NULL) {
        mtxp = mtx_pool_find(mtxpool_sleep, fp);
        mtx_lock(mtxp);
        if (fp->f_advice != NULL &&
            uio->uio_offset >= fp->f_advice->fa_start &&
            uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
            advice = fp->f_advice->fa_advice;
        mtx_unlock(mtxp);
    }
    switch (advice) {
    case POSIX_FADV_NORMAL:
    case POSIX_FADV_SEQUENTIAL:
        ioflag |= sequential_heuristic(uio, fp);
        break;
    case POSIX_FADV_RANDOM:
        /* XXX: Is this correct? */
        break;
    case POSIX_FADV_NOREUSE:
        /*
         * Request the underlying FS to discard the buffers
         * and pages after the I/O is complete.
         */
        ioflag |= IO_DIRECT;
        break;
    }

#ifdef MAC
    error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
    if (error == 0)
#endif
        error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
    if ((flags & FOF_OFFSET) == 0)
        fp->f_offset = uio->uio_offset;
    fp->f_nextoff = uio->uio_offset;
    VOP_UNLOCK(vp, 0);
    if (vp->v_type != VCHR)
        vn_finished_write(mp);
unlock:
    VFS_UNLOCK_GIANT(vfslocked);
    return (error);
}

static const int io_hold_cnt = 16;

/*
 * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
 * prevent the following deadlock:
 *
 * Assume that thread A reads from vnode vp1 into userspace
 * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
 * currently not resident, then the system ends up with the call chain
 * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
 * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
 * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
 * If, at the same time, thread B reads from vnode vp2 into buffer buf2
 * backed by the pages of vnode vp1, and some page in buf2 is not
 * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
 *
 * To prevent the lock order reversal and deadlock, vn_io_fault() does
 * not allow page faults to happen during VOP_READ() or VOP_WRITE().
 * Instead, it first tries to do the whole range i/o with pagefaults
 * disabled. If all pages in the i/o buffer are resident and mapped,
 * VOP will succeed (ignoring the genuine filesystem errors).
 * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
 * i/o in chunks, with all pages in the chunk prefaulted and held
 * using vm_fault_quick_hold_pages().
 *
 * Filesystems using this deadlock avoidance scheme should use the
 * array of the held pages from uio, saved in the curthread->td_ma,
 * instead of doing uiomove().  A helper function
 * vn_io_fault_uiomove() converts uiomove request into
 * uiomove_fromphys() over td_ma array.
 *
 * Since vnode locks do not cover the whole i/o anymore, rangelocks
 * make the current i/o request atomic with respect to other i/os and
 * truncations.
 */
static int
vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
    vm_page_t ma[io_hold_cnt + 2];
    struct uio *uio_clone, short_uio;
    struct iovec short_iovec[1];
    fo_rdwr_t *doio;
    struct vnode *vp;
    void *rl_cookie;
    struct mount *mp;
    vm_page_t *prev_td_ma;
    int cnt, error, save, saveheld, prev_td_ma_cnt;
    vm_offset_t addr, end;
    vm_prot_t prot;
    size_t len, resid;
    ssize_t adv;

    if (uio->uio_rw == UIO_READ)
        doio = vn_read;
    else
        doio = vn_write;
    vp = fp->f_vnode;
    if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
        ((mp = vp->v_mount) != NULL &&
        (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0))
        return (doio(fp, uio, active_cred, flags, td));

    /*
     * The UFS follows IO_UNIT directive and replays back both
     * uio_offset and uio_resid if an error is encountered during the
     * operation.  But, since the iovec may be already advanced,
     * uio is still in an inconsistent state.
     *
     * Cache a copy of the original uio, which is advanced to the redo
     * point using UIO_NOCOPY below.
     */
    uio_clone = cloneuio(uio);
    resid = uio->uio_resid;

    short_uio.uio_segflg = UIO_USERSPACE;
    short_uio.uio_rw = uio->uio_rw;
    short_uio.uio_td = uio->uio_td;

    if (uio->uio_rw == UIO_READ) {
        prot = VM_PROT_WRITE;
        rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
            uio->uio_offset + uio->uio_resid);
    } else {
        prot = VM_PROT_READ;
        if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0)
            /* For appenders, punt and lock the whole range. */
            rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
        else
            rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
                uio->uio_offset + uio->uio_resid);
    }

    save = vm_fault_disable_pagefaults();
    error = doio(fp, uio, active_cred, flags, td);
    if (error != EFAULT)
        goto out;

    uio_clone->uio_segflg = UIO_NOCOPY;
    uiomove(NULL, resid - uio->uio_resid, uio_clone);
    uio_clone->uio_segflg = uio->uio_segflg;

    saveheld = curthread_pflags_set(TDP_UIOHELD);
    prev_td_ma = td->td_ma;
    prev_td_ma_cnt = td->td_ma_cnt;

    while (uio_clone->uio_resid != 0) {
        len = uio_clone->uio_iov->iov_len;
        if (len == 0) {
            KASSERT(uio_clone->uio_iovcnt >= 1,
                ("iovcnt underflow"));
            uio_clone->uio_iov++;
            uio_clone->uio_iovcnt--;
            continue;
        }

        addr = (vm_offset_t)uio_clone->uio_iov->iov_base;
        end = round_page(addr + len);
        cnt = howmany(end - trunc_page(addr), PAGE_SIZE);
        /*
         * A perfectly misaligned address and length could cause
         * both the start and the end of the chunk to use partial
         * page.  +2 accounts for such a situation.
         */
        if (cnt > io_hold_cnt + 2) {
            len = io_hold_cnt * PAGE_SIZE;
            KASSERT(howmany(round_page(addr + len) -
                trunc_page(addr), PAGE_SIZE) <= io_hold_cnt + 2,
                ("cnt overflow"));
        }
        cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
            addr, len, prot, ma, io_hold_cnt + 2);
        if (cnt == -1) {
            error = EFAULT;
            break;
        }
        short_uio.uio_iov = &short_iovec[0];
        short_iovec[0].iov_base = (void *)addr;
        short_uio.uio_iovcnt = 1;
        short_uio.uio_resid = short_iovec[0].iov_len = len;
        short_uio.uio_offset = uio_clone->uio_offset;
        td->td_ma = ma;
        td->td_ma_cnt = cnt;

        error = doio(fp, &short_uio, active_cred, flags, td);
        vm_page_unhold_pages(ma, cnt);
        adv = len - short_uio.uio_resid;

        uio_clone->uio_iov->iov_base =
            (char *)uio_clone->uio_iov->iov_base + adv;
        uio_clone->uio_iov->iov_len -= adv;
        uio_clone->uio_resid -= adv;
        uio_clone->uio_offset += adv;

        uio->uio_resid -= adv;
        uio->uio_offset += adv;

        if (error != 0 || adv == 0)
            break;
    }
    td->td_ma = prev_td_ma;
    td->td_ma_cnt = prev_td_ma_cnt;
    curthread_pflags_restore(saveheld);
out:
    vm_fault_enable_pagefaults(save);
    vn_rangelock_unlock(vp, rl_cookie);
    free(uio_clone, M_IOV);
    return (error);
}

/*
 * Helper function to perform the requested uiomove operation using
 * the held pages for io->uio_iov[0].iov_base buffer instead of
 * copyin/copyout.  Access to the pages with uiomove_fromphys()
 * instead of iov_base prevents page faults that could occur due to
 * pmap_collect() invalidating the mapping created by
 * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
 * object cleanup revoking the write access from page mappings.
 *
 * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
 * instead of plain uiomove().
 */
int
vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
{
    struct uio transp_uio;
    struct iovec transp_iov[1];
    struct thread *td;
    size_t adv;
    int error, pgadv;

    td = curthread;
    if ((td->td_pflags & TDP_UIOHELD) == 0 ||
        uio->uio_segflg != UIO_USERSPACE)
        return (uiomove(data, xfersize, uio));

    KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
    transp_iov[0].iov_base = data;
    transp_uio.uio_iov = &transp_iov[0];
    transp_uio.uio_iovcnt = 1;
    if (xfersize > uio->uio_resid)
        xfersize = uio->uio_resid;
    transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
    transp_uio.uio_offset = 0;
    transp_uio.uio_segflg = UIO_SYSSPACE;
    /*
     * Since transp_iov points to data, and td_ma page array
     * corresponds to original uio->uio_iov, we need to invert the
     * direction of the i/o operation as passed to
     * uiomove_fromphys().
     */
    switch (uio->uio_rw) {
    case UIO_WRITE:
        transp_uio.uio_rw = UIO_READ;
        break;
    case UIO_READ:
        transp_uio.uio_rw = UIO_WRITE;
        break;
    }
    transp_uio.uio_td = uio->uio_td;
    error = uiomove_fromphys(td->td_ma,
        ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
        xfersize, &transp_uio);
    adv = xfersize - transp_uio.uio_resid;
    pgadv =
        (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
        (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
    td->td_ma += pgadv;
    KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
        pgadv));
    td->td_ma_cnt -= pgadv;
    uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
    uio->uio_iov->iov_len -= adv;
    uio->uio_resid -= adv;
    uio->uio_offset += adv;
    return (error);
}

/*
 * File table truncate routine.
 */
static int
vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
    struct vattr vattr;
    struct mount *mp;
    struct vnode *vp;
    void *rl_cookie;
    int vfslocked;
    int error;

    vp = fp->f_vnode;

    /*
     * Lock the whole range for truncation.  Otherwise split i/o
     * might happen partly before and partly after the truncation.
     */
    rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
    error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
    if (error)
        goto out1;
    vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
    if (vp->v_type == VDIR) {
        error = EISDIR;
        goto out;
    }
#ifdef MAC
    error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
    if (error)
        goto out;
#endif
    error = vn_writechk(vp);
    if (error == 0) {
        VATTR_NULL(&vattr);
        vattr.va_size = length;
        error = VOP_SETATTR(vp, &vattr, fp->f_cred);
    }
out:
    VOP_UNLOCK(vp, 0);
    vn_finished_write(mp);
out1:
    VFS_UNLOCK_GIANT(vfslocked);
    vn_rangelock_unlock(vp, rl_cookie);
    return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
    struct file *fp;
    struct stat *sb;
    struct ucred *active_cred;
    struct thread *td;
{
    struct vnode *vp = fp->f_vnode;
    int vfslocked;
    int error;

    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
    vn_lock(vp, LK_SHARED | LK_RETRY);
    error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
    VOP_UNLOCK(vp, 0);
    VFS_UNLOCK_GIANT(vfslocked);

    return (error);
}

/*
 * Stat a vnode; implementation for the stat syscall
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
    struct vnode *vp;
    register struct stat *sb;
    struct ucred *active_cred;
    struct ucred *file_cred;
    struct thread *td;
{
    struct vattr vattr;
    register struct vattr *vap;
    int error;
    u_short mode;

#ifdef MAC
    error = mac_vnode_check_stat(active_cred, file_cred, vp);
    if (error)
        return (error);
#endif

    vap = &vattr;

    /*
     * Initialize defaults for new and unusual fields, so that file
     * systems which don't support these fields don't need to know
     * about them.
     */
    vap->va_birthtime.tv_sec = -1;
    vap->va_birthtime.tv_nsec = 0;
    vap->va_fsid = VNOVAL;
    vap->va_rdev = NODEV;

    error = VOP_GETATTR(vp, vap, active_cred);
    if (error)
        return (error);

    /*
     * Zero the spare stat fields
     */
    bzero(sb, sizeof *sb);

    /*
     * Copy from vattr table
     */
    if (vap->va_fsid != VNOVAL)
        sb->st_dev = vap->va_fsid;
    else
        sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
    sb->st_ino = vap->va_fileid;
    mode = vap->va_mode;
    switch (vap->va_type) {
    case VREG:
        mode |= S_IFREG;
        break;
    case VDIR:
        mode |= S_IFDIR;
        break;
    case VBLK:
        mode |= S_IFBLK;
        break;
    case VCHR:
        mode |= S_IFCHR;
        break;
    case VLNK:
        mode |= S_IFLNK;
        break;
    case VSOCK:
        mode |= S_IFSOCK;
        break;
    case VFIFO:
        mode |= S_IFIFO;
        break;
    default:
        return (EBADF);
    };
    sb->st_mode = mode;
    sb->st_nlink = vap->va_nlink;
    sb->st_uid = vap->va_uid;
    sb->st_gid = vap->va_gid;
    sb->st_rdev = vap->va_rdev;
    if (vap->va_size > OFF_MAX)
        return (EOVERFLOW);
    sb->st_size = vap->va_size;
    sb->st_atim = vap->va_atime;
    sb->st_mtim = vap->va_mtime;
    sb->st_ctim = vap->va_ctime;
    sb->st_birthtim = vap->va_birthtime;

    /*
     * According to www.opengroup.org, the meaning of st_blksize is
     * "a filesystem-specific preferred I/O block size for this
     * object.  In some filesystem types, this may vary from file
     * to file"
     * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
     */

    sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);

    sb->st_flags = vap->va_flags;
    if (priv_check(td, PRIV_VFS_GENERATION))
        sb->st_gen = 0;
    else
        sb->st_gen = vap->va_gen;

    sb->st_blocks = vap->va_bytes / S_BLKSIZE;
    return (0);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
    struct file *fp;
    u_long com;
    void *data;
    struct ucred *active_cred;
    struct thread *td;
{
    struct vnode *vp = fp->f_vnode;
    struct vattr vattr;
    int vfslocked;
    int error;

    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
    error = ENOTTY;
    switch (vp->v_type) {
    case VREG:
    case VDIR:
        if (com == FIONREAD) {
            vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
            error = VOP_GETATTR(vp, &vattr, active_cred);
            VOP_UNLOCK(vp, 0);
            if (!error)
                *(int *)data = vattr.va_size - fp->f_offset;
        }
        if (com == FIONBIO || com == FIOASYNC)	/* XXX */
            error = 0;
        else
            error = VOP_IOCTL(vp, com, data, fp->f_flag,
                active_cred, td);
        break;

    default:
        break;
    }
    VFS_UNLOCK_GIANT(vfslocked);
    return (error);
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
    struct file *fp;
    int events;
    struct ucred *active_cred;
    struct thread *td;
{
    struct vnode *vp;
    int vfslocked;
    int error;

    vp = fp->f_vnode;
    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef MAC
    vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
    error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
    VOP_UNLOCK(vp, 0);
    if (!error)
#endif

    error = VOP_POLL(vp, events, fp->f_cred, td);
    VFS_UNLOCK_GIANT(vfslocked);
    return (error);
}

/*
 * Acquire the requested lock and then check for validity.  LK_RETRY
 * permits vn_lock to return doomed vnodes.
 */
int
_vn_lock(struct vnode *vp, int flags, char *file, int line)
{
    int error;

    VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
        ("vn_lock called with no locktype."));
    do {
#ifdef DEBUG_VFS_LOCKS
        KASSERT(vp->v_holdcnt != 0,
            ("vn_lock %p: zero hold count", vp));
#endif
        error = VOP_LOCK1(vp, flags, file, line);
        flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
        KASSERT((flags & LK_RETRY) == 0 || error == 0,
            ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
            flags, error));
        /*
         * Callers specify LK_RETRY if they wish to get dead vnodes.
         * If RETRY is not set, we return ENOENT instead.
         */
        if (error == 0 && vp->v_iflag & VI_DOOMED &&
            (flags & LK_RETRY) == 0) {
            VOP_UNLOCK(vp, 0);
            error = ENOENT;
            break;
        }
    } while (flags & LK_RETRY && error != 0);
    return (error);
}

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
    struct file *fp;
    struct thread *td;
{
    struct vnode *vp;
    struct flock lf;
    int vfslocked;
    int error;

    vp = fp->f_vnode;

    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
    if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
        lf.l_whence = SEEK_SET;
        lf.l_start = 0;
        lf.l_len = 0;
        lf.l_type = F_UNLCK;
        (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
    }

    fp->f_ops = &badfileops;

    error = vn_close(vp, fp->f_flag, fp->f_cred, td);
    VFS_UNLOCK_GIANT(vfslocked);
    return (error);
}

/*
 * Preparing to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed.  If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
    struct vnode *vp;
    struct mount **mpp;
    int flags;
{
    struct mount *mp;
    int error;

    error = 0;
    /*
     * If a vnode is provided, get and return the mount point to
     * which it will write.
     */
    if (vp != NULL) {
        if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
            *mpp = NULL;
            if (error != EOPNOTSUPP)
                return (error);
            return (0);
        }
    }
    if ((mp = *mpp) == NULL)
        return (0);

    /*
     * VOP_GETWRITEMOUNT() returns with the mp refcount held through
     * a vfs_ref().
     * As long as a vnode is not provided we need to acquire a
     * refcount for the provided mountpoint too, in order to
     * emulate a vfs_ref().
     */
    MNT_ILOCK(mp);
    if (vp == NULL)
        MNT_REF(mp);

    /*
     * Check on status of suspension.
     */
    if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
        mp->mnt_susp_owner != curthread) {
        while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
            if (flags & V_NOWAIT) {
                error = EWOULDBLOCK;
                goto unlock;
            }
            error = msleep(&mp->mnt_flag, MNT_MTX(mp),
                (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
            if (error)
                goto unlock;
        }
    }
    if (flags & V_XSLEEP)
        goto unlock;
    mp->mnt_writeopcount++;
unlock:
    if (error != 0 || (flags & V_XSLEEP) != 0)
        MNT_REL(mp);
    MNT_IUNLOCK(mp);
    return (error);
}

/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_start_secondary_write(vp, mpp, flags)
    struct vnode *vp;
    struct mount **mpp;
    int flags;
{
    struct mount *mp;
    int error;

retry:
    if (vp != NULL) {
        if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
            *mpp = NULL;
            if (error != EOPNOTSUPP)
                return (error);
            return (0);
        }
    }
    /*
     * If we are not suspended or have not yet reached suspended
     * mode, then let the operation proceed.
     */
    if ((mp = *mpp) == NULL)
        return (0);

    /*
     * VOP_GETWRITEMOUNT() returns with the mp refcount held through
     * a vfs_ref().
     * As long as a vnode is not provided we need to acquire a
     * refcount for the provided mountpoint too, in order to
     * emulate a vfs_ref().
     */
    MNT_ILOCK(mp);
    if (vp == NULL)
        MNT_REF(mp);
    if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
        mp->mnt_secondary_writes++;
        mp->mnt_secondary_accwrites++;
        MNT_IUNLOCK(mp);
        return (0);
    }
    if (flags & V_NOWAIT) {
        MNT_REL(mp);
        MNT_IUNLOCK(mp);
        return (EWOULDBLOCK);
    }
    /*
     * Wait for the suspension to finish.
     */
    error = msleep(&mp->mnt_flag, MNT_MTX(mp),
        (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
    vfs_rel(mp);
    if (error == 0)
        goto retry;
    return (error);
}

/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
    struct mount *mp;
{
    if (mp == NULL)
        return;
    MNT_ILOCK(mp);
    MNT_REL(mp);
    mp->mnt_writeopcount--;
    if (mp->mnt_writeopcount < 0)
        panic("vn_finished_write: neg cnt");
    if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
        mp->mnt_writeopcount <= 0)
        wakeup(&mp->mnt_writeopcount);
    MNT_IUNLOCK(mp);
}

/*
 * Filesystem secondary write operation has completed. If we are
 * suspending and this operation is the last one, notify the suspender
 * that the suspension is now in effect.
 */
void
vn_finished_secondary_write(mp)
    struct mount *mp;
{
    if (mp == NULL)
        return;
    MNT_ILOCK(mp);
    MNT_REL(mp);
    mp->mnt_secondary_writes--;
    if (mp->mnt_secondary_writes < 0)
        panic("vn_finished_secondary_write: neg cnt");
    if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
        mp->mnt_secondary_writes <= 0)
        wakeup(&mp->mnt_secondary_writes);
    MNT_IUNLOCK(mp);
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
    struct mount *mp;
{
    int error;

    MNT_ILOCK(mp);
    if (mp->mnt_susp_owner == curthread) {
        MNT_IUNLOCK(mp);
        return (EALREADY);
    }
    while (mp->mnt_kern_flag & MNTK_SUSPEND)
        msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
    mp->mnt_kern_flag |= MNTK_SUSPEND;
    mp->mnt_susp_owner = curthread;
    if (mp->mnt_writeopcount > 0)
        (void) msleep(&mp->mnt_writeopcount,
            MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
    else
        MNT_IUNLOCK(mp);
    if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
        vfs_write_resume(mp);
    return (error);
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
    struct mount *mp;
{

    MNT_ILOCK(mp);
    if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
        KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
        mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
            MNTK_SUSPENDED);
        mp->mnt_susp_owner = NULL;
        wakeup(&mp->mnt_writeopcount);
        wakeup(&mp->mnt_flag);
        curthread->td_pflags &= ~TDP_IGNSUSP;
        MNT_IUNLOCK(mp);
        VFS_SUSP_CLEAN(mp);
    } else
        MNT_IUNLOCK(mp);
}

/*
 * Implement kqueues for files by translating it to vnode operation.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
    int vfslocked;
    int error;

    vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
    error = VOP_KQFILTER(fp->f_vnode, kn);
    VFS_UNLOCK_GIANT(vfslocked);

    return error;
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
    struct uio auio;
    struct iovec iov;
    int error;

    iov.iov_len = *buflen;
    iov.iov_base = buf;

    auio.uio_iov = &iov;
    auio.uio_iovcnt = 1;
    auio.uio_rw = UIO_READ;
    auio.uio_segflg = UIO_SYSSPACE;
    auio.uio_td = td;
    auio.uio_offset = 0;
    auio.uio_resid = *buflen;

    if ((ioflg & IO_NODELOCKED) == 0)
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

    ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

    /* authorize attribute retrieval as kernel */
    error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
        td);

    if ((ioflg & IO_NODELOCKED) == 0)
        VOP_UNLOCK(vp, 0);

    if (error == 0) {
        *buflen = *buflen - auio.uio_resid;
    }

    return (error);
}

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
    struct uio auio;
    struct iovec iov;
    struct mount *mp;
    int error;

    iov.iov_len = buflen;
    iov.iov_base = buf;

    auio.uio_iov = &iov;
    auio.uio_iovcnt = 1;
    auio.uio_rw = UIO_WRITE;
    auio.uio_segflg = UIO_SYSSPACE;
    auio.uio_td = td;
    auio.uio_offset = 0;
    auio.uio_resid = buflen;

    if ((ioflg & IO_NODELOCKED) == 0) {
        if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
            return (error);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
    }

    ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

    /* authorize attribute setting as kernel */
    error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

    if ((ioflg & IO_NODELOCKED) == 0) {
        vn_finished_write(mp);
        VOP_UNLOCK(vp, 0);
    }

    return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
    struct mount *mp;
    int error;

    if ((ioflg & IO_NODELOCKED) == 0) {
        if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
            return (error);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
    }

    ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

    /* authorize attribute removal as kernel */
    error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
    if (error == EOPNOTSUPP)
        error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
            NULL, td);

    if ((ioflg & IO_NODELOCKED) == 0) {
        vn_finished_write(mp);
        VOP_UNLOCK(vp, 0);
    }

    return (error);
}

int
vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
{
    struct mount *mp;
    int ltype, error;

    mp = vp->v_mount;
    ltype = VOP_ISLOCKED(vp);
    KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
        ("vn_vget_ino: vp not locked"));
    error = vfs_busy(mp, MBF_NOWAIT);
    if (error != 0) {
        vfs_ref(mp);
        VOP_UNLOCK(vp, 0);
        error = vfs_busy(mp, 0);
        vn_lock(vp, ltype | LK_RETRY);
        vfs_rel(mp);
        if (error != 0)
            return (ENOENT);
        if (vp->v_iflag & VI_DOOMED) {
            vfs_unbusy(mp);
            return (ENOENT);
        }
    }
    VOP_UNLOCK(vp, 0);
    error = VFS_VGET(mp, ino, lkflags, rvp);
    vfs_unbusy(mp);
    vn_lock(vp, ltype | LK_RETRY);
    if (vp->v_iflag & VI_DOOMED) {
        if (error == 0)
            vput(*rvp);
        error = ENOENT;
    }
    return (error);
}

int
vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
    const struct thread *td)
{

    if (vp->v_type != VREG || td == NULL)
        return (0);
    PROC_LOCK(td->td_proc);
    if ((uoff_t)uio->uio_offset + uio->uio_resid >
        lim_cur(td->td_proc, RLIMIT_FSIZE)) {
        kern_psignal(td->td_proc, SIGXFSZ);
        PROC_UNLOCK(td->td_proc);
        return (EFBIG);
    }
    PROC_UNLOCK(td->td_proc);
    return (0);
}

int
vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
    struct vnode *vp;
    int error, vfslocked;

    vp = fp->f_vnode;
    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef AUDIT
    vn_lock(vp, LK_SHARED | LK_RETRY);
    AUDIT_ARG_VNODE1(vp);
    VOP_UNLOCK(vp, 0);
#endif
    error = setfmode(td, active_cred, vp, mode);
    VFS_UNLOCK_GIANT(vfslocked);
    return (error);
}

int
vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
    struct vnode *vp;
    int error, vfslocked;

    vp = fp->f_vnode;
    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef AUDIT
    vn_lock(vp, LK_SHARED | LK_RETRY);
    AUDIT_ARG_VNODE1(vp);
    VOP_UNLOCK(vp, 0);
#endif
    error = setfown(td, active_cred, vp, uid, gid);
    VFS_UNLOCK_GIANT(vfslocked);
    return (error);
}

void
vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
{
    vm_object_t object;

    if ((object = vp->v_object) == NULL)
        return;
    VM_OBJECT_LOCK(object);
    vm_object_page_remove(object, start, end, 0);
    VM_OBJECT_UNLOCK(object);
}

int
vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
{
    struct vattr va;
    daddr_t bn, bnp;
    uint64_t bsize;
    off_t noff;
    int error;

    KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
        ("Wrong command %lu", cmd));

    if (vn_lock(vp, LK_SHARED) != 0)
        return (EBADF);
    if (vp->v_type != VREG) {
        error = ENOTTY;
        goto unlock;
    }
    error = VOP_GETATTR(vp, &va, cred);
    if (error != 0)
        goto unlock;
    noff = *off;
    if (noff >= va.va_size) {
        error = ENXIO;
        goto unlock;
    }
    bsize = vp->v_mount->mnt_stat.f_iosize;
    for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
        error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
        if (error == EOPNOTSUPP) {
            error = ENOTTY;
            goto unlock;
        }
        if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
            (bnp != -1 && cmd == FIOSEEKDATA)) {
            noff = bn * bsize;
            if (noff < *off)
                noff = *off;
            goto unlock;
        }
    }
    if (noff > va.va_size)
        noff = va.va_size;
    /* noff == va.va_size. There is an implicit hole at the end of file. */
    if (cmd == FIOSEEKDATA)
        error = ENXIO;
unlock:
    VOP_UNLOCK(vp, 0);
    if (error == 0)
        *off = noff;
    return (error);
}