/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/resourcevar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/unistd.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_rdwr_t	vn_io_fault;
static fo_truncate_t	vn_truncate;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

struct fileops vnops = {
	.fo_read = vn_io_fault,
	.fo_write = vn_io_fault,
	.fo_truncate = vn_truncate,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_chmod = vn_chmod,
	.fo_chown = vn_chown,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

int
vn_open(ndp, flagp, cmode, fp)
	struct nameidata *ndp;
	int *flagp, cmode;
	struct file *fp;
{
	struct thread *td = ndp->ni_cnd.cn_thread;

	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
}
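
/*
 * Example (editorial sketch, not part of the original file): a typical
 * in-kernel open/close sequence built on vn_open() and vn_close().  The
 * path and variable names are illustrative, and Giant handling via
 * NDHASGIANT()/VFS_UNLOCK_GIANT() as well as most error handling are
 * omitted for brevity.
 *
 *	struct nameidata nd;
 *	int flags, error;
 *
 *	flags = FREAD;
 *	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, "/a/path", td);
 *	error = vn_open(&nd, &flags, 0, NULL);
 *	if (error != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	VOP_UNLOCK(nd.ni_vp, 0);	// vn_open returns the vnode locked
 *	...				// use nd.ni_vp
 *	error = vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
 */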

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
    struct ucred *cred, struct file *fp)
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode, error;
	accmode_t accmode;
	int vfslocked, mpsafe;

	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
restart:
	vfslocked = 0;
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
		    MPSAFE;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		vfslocked = NDHASGIANT(ndp);
		if (!mpsafe)
			ndp->ni_cnd.cn_flags &= ~MPSAFE;
		if (ndp->ni_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				VFS_UNLOCK_GIANT(vfslocked);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
#ifdef MAC
			error = mac_vnode_check_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0)
#endif
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
				    &ndp->ni_cnd, vap);
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				VFS_UNLOCK_GIANT(vfslocked);
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = ISOPEN |
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
		    LOCKLEAF | MPSAFE;
		if (!(fmode & FWRITE))
			ndp->ni_cnd.cn_flags |= LOCKSHARED;
		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
		if ((error = namei(ndp)) != 0)
			return (error);
		if (!mpsafe)
			ndp->ni_cnd.cn_flags &= ~MPSAFE;
		vfslocked = NDHASGIANT(ndp);
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
		error = ENOTDIR;
		goto bad;
	}
	accmode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		accmode |= VWRITE;
	}
	if (fmode & FREAD)
		accmode |= VREAD;
	if (fmode & FEXEC)
		accmode |= VEXEC;
	if ((fmode & O_APPEND) && (fmode & FWRITE))
		accmode |= VAPPEND;
#ifdef MAC
	error = mac_vnode_check_open(cred, vp, accmode);
	if (error)
		goto bad;
#endif
	if ((fmode & O_CREAT) == 0) {
		if (accmode & VWRITE) {
			error = vn_writechk(vp);
			if (error)
				goto bad;
		}
		if (accmode) {
			error = VOP_ACCESS(vp, accmode, cred, td);
			if (error)
				goto bad;
		}
	}
	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
		goto bad;

	if (fmode & FWRITE) {
		vp->v_writecount++;
		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
		    __func__, vp, vp->v_writecount);
	}
	*flagp = fmode;
	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
	if (!mpsafe)
		VFS_UNLOCK_GIANT(vfslocked);
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
	register struct vnode *vp;
{

	ASSERT_VOP_LOCKED(vp, "vn_writechk");
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_vflag & VV_TEXT)
		return (ETXTBSY);

	return (0);
}

/*
 * Vnode close call
 */
int
vn_close(vp, flags, file_cred, td)
	register struct vnode *vp;
	int flags;
	struct ucred *file_cred;
	struct thread *td;
{
	struct mount *mp;
	int error, lock_flags;

	if (!(flags & FWRITE) && vp->v_mount != NULL &&
	    vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;

	VFS_ASSERT_GIANT(vp->v_mount);

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, lock_flags | LK_RETRY);
	if (flags & FWRITE) {
		VNASSERT(vp->v_writecount > 0, vp,
		    ("vn_close: negative writecount"));
		vp->v_writecount--;
		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
		    __func__, vp, vp->v_writecount);
	}
	error = VOP_CLOSE(vp, flags, file_cred, td);
	vput(vp);
	vn_finished_write(mp);
	return (error);
}

/*
 * Heuristic to detect sequential operation.
 */
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{

	if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
		return (fp->f_seqcount << IO_SEQSHIFT);

	/*
	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
	 * that the first I/O is normally considered to be slightly
	 * sequential.  Seeking to offset 0 doesn't change sequentiality
	 * unless previous seeks have reduced f_seqcount to 0, in which
	 * case offset 0 is not special.
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * f_seqcount is in units of fixed-size blocks so that it
		 * depends mainly on the amount of sequential I/O and not
		 * much on the number of sequential I/O's.  The fixed size
		 * of 16384 is hard-coded here since it is (not quite) just
		 * a magic size that works well here.  This size is more
		 * closely related to the best I/O size for real disks than
		 * to any block size used by software.
		 */
		fp->f_seqcount += howmany(uio->uio_resid, 16384);
		if (fp->f_seqcount > IO_SEQMAX)
			fp->f_seqcount = IO_SEQMAX;
		return (fp->f_seqcount << IO_SEQSHIFT);
	}

	/* Not sequential.  Quickly draw-down sequentiality. */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}
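
/*
 * Worked example for the heuristic above (editorial note, not from the
 * original source; assumes the stock IO_SEQMAX/IO_SEQSHIFT definitions):
 * a process open()s a file (f_seqcount = 1) and issues back-to-back 64KB
 * reads.  The first read starts at offset 0 with f_seqcount > 0, so it
 * counts as sequential and f_seqcount += howmany(65536, 16384) = 4,
 * giving 5; the returned read-ahead hint is 5 << IO_SEQSHIFT.  Each
 * further contiguous read (uio_offset == f_nextoff) adds 4 more, until
 * the counter saturates at IO_SEQMAX.  A single non-contiguous access
 * drops f_seqcount to 1, and a second one drops it to 0, turning the
 * read-ahead hint back off.
 */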

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
    struct ucred *file_cred, ssize_t *aresid, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	void *rl_cookie;
	int error, lock_flags;

	VFS_ASSERT_GIANT(vp->v_mount);

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_READ) {
			rl_cookie = vn_rangelock_rlock(vp, offset,
			    offset + len);
		} else {
			rl_cookie = vn_rangelock_wlock(vp, offset,
			    offset + len);
		}
		mp = NULL;
		if (rw == UIO_WRITE) {
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				goto out;
			if (MNT_SHARED_WRITES(mp) ||
			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
				lock_flags = LK_SHARED;
			else
				lock_flags = LK_EXCLUSIVE;
		} else
			lock_flags = LK_SHARED;
		vn_lock(vp, lock_flags | LK_RETRY);
	} else
		rl_cookie = NULL;

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_vnode_check_read(active_cred, file_cred,
			    vp);
		else
			error = mac_vnode_check_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred != NULL)
			cred = file_cred;
		else
			cred = active_cred;
		if (rw == UIO_READ)
			error = VOP_READ(vp, &auio, ioflg, cred);
		else
			error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp, 0);
		if (mp != NULL)
			vn_finished_write(mp);
	}
out:
	if (rl_cookie != NULL)
		vn_rangelock_unlock(vp, rl_cookie);
	return (error);
}
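
/*
 * Example (editorial sketch, not part of the original file): reading the
 * first 512 bytes of an already-referenced, unlocked vnode into a kernel
 * buffer with vn_rdwr().  Variable names are illustrative.
 *
 *	char buf[512];
 *	ssize_t resid;
 *	int error;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
 *	    UIO_SYSSPACE, 0, td->td_ucred, NOCRED, &resid, td);
 *	if (error == 0 && resid != 0)
 *		;	// short read: the file is smaller than 512 bytes
 */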

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	size_t len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	size_t *aresid;
	struct thread *td;
{
	int error = 0;
	ssize_t iaresid;

	VFS_ASSERT_GIANT(vp->v_mount);

	do {
		int chunk;

		/*
		 * Force `offset' to be a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base = (char *)base + chunk;
		kern_yield(PRI_USER);
	} while (len);
	if (aresid)
		*aresid = len + iaresid;
	return (error);
}
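
/*
 * Worked example for the chunking above (editorial note, not from the
 * original source): with MAXBSIZE = 65536, a 200000-byte transfer starting
 * at offset 100000 is issued as a chunk of 31072 bytes (65536 -
 * 100000 % 65536, bringing the offset up to the 131072 boundary), then
 * chunks of 65536, 65536, and finally 37856 bytes, so every chunk after
 * the first starts on a MAXBSIZE boundary.
 */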

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	int flags;
	struct thread *td;
{
	struct vnode *vp;
	int error, ioflag;
	struct mtx *mtxp;
	int advice, vfslocked;
	off_t offset;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	mtxp = NULL;
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	advice = POSIX_FADV_NORMAL;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	/*
	 * According to McKusick the vn lock was protecting f_offset here.
	 * It is now protected by the FOFFSET_LOCKED flag.
	 */
	if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
		mtxp = mtx_pool_find(mtxpool_sleep, fp);
		mtx_lock(mtxp);
		if ((flags & FOF_OFFSET) == 0) {
			while (fp->f_vnread_flags & FOFFSET_LOCKED) {
				fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
				msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
				    "vnread offlock", 0);
			}
			fp->f_vnread_flags |= FOFFSET_LOCKED;
			uio->uio_offset = fp->f_offset;
		}
		if (fp->f_advice != NULL &&
		    uio->uio_offset >= fp->f_advice->fa_start &&
		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
			advice = fp->f_advice->fa_advice;
		mtx_unlock(mtxp);
	}
	vn_lock(vp, LK_SHARED | LK_RETRY);

	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_NOREUSE:
		ioflag |= sequential_heuristic(uio, fp);
		break;
	case POSIX_FADV_RANDOM:
		/* Disable read-ahead for random I/O. */
		break;
	}
	offset = uio->uio_offset;

#ifdef MAC
	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0) {
		fp->f_offset = uio->uio_offset;
		mtx_lock(mtxp);
		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
			wakeup(&fp->f_vnread_flags);
		fp->f_vnread_flags = 0;
		mtx_unlock(mtxp);
	}
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	    offset != uio->uio_offset)
		error = VOP_ADVISE(vp, offset, uio->uio_offset - 1,
		    POSIX_FADV_DONTNEED);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	int flags;
	struct thread *td;
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag, lock_flags;
	struct mtx *mtxp;
	int advice, vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto unlock;

	if ((MNT_SHARED_WRITES(mp) ||
	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
	    (flags & FOF_OFFSET) != 0) {
		lock_flags = LK_SHARED;
	} else {
		lock_flags = LK_EXCLUSIVE;
	}

	vn_lock(vp, lock_flags | LK_RETRY);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	advice = POSIX_FADV_NORMAL;
	if (fp->f_advice != NULL) {
		mtxp = mtx_pool_find(mtxpool_sleep, fp);
		mtx_lock(mtxp);
		if (fp->f_advice != NULL &&
		    uio->uio_offset >= fp->f_advice->fa_start &&
		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
			advice = fp->f_advice->fa_advice;
		mtx_unlock(mtxp);
	}
	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_SEQUENTIAL:
		ioflag |= sequential_heuristic(uio, fp);
		break;
	case POSIX_FADV_RANDOM:
		/* XXX: Is this correct? */
		break;
	case POSIX_FADV_NOREUSE:
		/*
		 * Request the underlying FS to discard the buffers
		 * and pages after the I/O is complete.
		 */
		ioflag |= IO_DIRECT;
		break;
	}

#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (vp->v_type != VCHR)
		vn_finished_write(mp);
unlock:
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

static const int io_hold_cnt = 16;
static int vn_io_fault_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
static unsigned long vn_io_faults_cnt;
SYSCTL_LONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");

/*
 * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
 * prevent the following deadlock:
 *
 * Assume that thread A reads from the vnode vp1 into userspace
 * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
 * currently not resident, then the system ends up with the call chain
 * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
 * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
 * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
 * If, at the same time, thread B reads from vnode vp2 into buffer buf2
 * backed by the pages of vnode vp1, and some page in buf2 is not
 * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
 *
 * To prevent the lock order reversal and deadlock, vn_io_fault() does
 * not allow page faults to happen during VOP_READ() or VOP_WRITE().
 * Instead, it first tries to do the whole range i/o with pagefaults
 * disabled.  If all pages in the i/o buffer are resident and mapped,
 * the VOP will succeed (ignoring genuine filesystem errors).
 * Otherwise, we get back EFAULT, and vn_io_fault() falls back to doing
 * i/o in chunks, with all pages in the chunk prefaulted and held
 * using vm_fault_quick_hold_pages().
 *
 * Filesystems using this deadlock avoidance scheme should use the
 * array of the held pages from uio, saved in the curthread->td_ma,
 * instead of doing uiomove().  A helper function
 * vn_io_fault_uiomove() converts uiomove request into
 * uiomove_fromphys() over td_ma array.
 *
 * Since vnode locks do not cover the whole i/o anymore, rangelocks
 * make the current i/o request atomic with respect to other i/os and
 * truncations.
 */
static int
vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	vm_page_t ma[io_hold_cnt + 2];
	struct uio *uio_clone, short_uio;
	struct iovec short_iovec[1];
	fo_rdwr_t *doio;
	struct vnode *vp;
	void *rl_cookie;
	struct mount *mp;
	vm_page_t *prev_td_ma;
	int cnt, error, save, saveheld, prev_td_ma_cnt;
	vm_offset_t addr, end;
	vm_prot_t prot;
	size_t len, resid;
	ssize_t adv;

	if (uio->uio_rw == UIO_READ)
		doio = vn_read;
	else
		doio = vn_write;
	vp = fp->f_vnode;
	if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
	    ((mp = vp->v_mount) != NULL &&
	    (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
	    !vn_io_fault_enable)
		return (doio(fp, uio, active_cred, flags, td));

	/*
	 * The UFS follows IO_UNIT directive and replays back both
	 * uio_offset and uio_resid if an error is encountered during the
	 * operation.  But, since the iovec may be already advanced,
	 * uio is still in an inconsistent state.
	 *
	 * Cache a copy of the original uio, which is advanced to the redo
	 * point using UIO_NOCOPY below.
	 */
	uio_clone = cloneuio(uio);
	resid = uio->uio_resid;

	short_uio.uio_segflg = UIO_USERSPACE;
	short_uio.uio_rw = uio->uio_rw;
	short_uio.uio_td = uio->uio_td;

	if (uio->uio_rw == UIO_READ) {
		prot = VM_PROT_WRITE;
		rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
		    uio->uio_offset + uio->uio_resid);
	} else {
		prot = VM_PROT_READ;
		if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0)
			/* For appenders, punt and lock the whole range. */
			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
		else
			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
			    uio->uio_offset + uio->uio_resid);
	}

	save = vm_fault_disable_pagefaults();
	error = doio(fp, uio, active_cred, flags, td);
	if (error != EFAULT)
		goto out;

	atomic_add_long(&vn_io_faults_cnt, 1);
	uio_clone->uio_segflg = UIO_NOCOPY;
	uiomove(NULL, resid - uio->uio_resid, uio_clone);
	uio_clone->uio_segflg = uio->uio_segflg;

	saveheld = curthread_pflags_set(TDP_UIOHELD);
	prev_td_ma = td->td_ma;
	prev_td_ma_cnt = td->td_ma_cnt;

	while (uio_clone->uio_resid != 0) {
		len = uio_clone->uio_iov->iov_len;
		if (len == 0) {
			KASSERT(uio_clone->uio_iovcnt >= 1,
			    ("iovcnt underflow"));
			uio_clone->uio_iov++;
			uio_clone->uio_iovcnt--;
			continue;
		}

		addr = (vm_offset_t)uio_clone->uio_iov->iov_base;
		end = round_page(addr + len);
		cnt = howmany(end - trunc_page(addr), PAGE_SIZE);
		/*
		 * A perfectly misaligned address and length could cause
		 * both the start and the end of the chunk to use partial
		 * page.  +2 accounts for such a situation.
		 */
		if (cnt > io_hold_cnt + 2) {
			len = io_hold_cnt * PAGE_SIZE;
			KASSERT(howmany(round_page(addr + len) -
			    trunc_page(addr), PAGE_SIZE) <= io_hold_cnt + 2,
			    ("cnt overflow"));
		}
		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
		    addr, len, prot, ma, io_hold_cnt + 2);
		if (cnt == -1) {
			error = EFAULT;
			break;
		}
		short_uio.uio_iov = &short_iovec[0];
		short_iovec[0].iov_base = (void *)addr;
		short_uio.uio_iovcnt = 1;
		short_uio.uio_resid = short_iovec[0].iov_len = len;
		short_uio.uio_offset = uio_clone->uio_offset;
		td->td_ma = ma;
		td->td_ma_cnt = cnt;

		error = doio(fp, &short_uio, active_cred, flags, td);
		vm_page_unhold_pages(ma, cnt);
		adv = len - short_uio.uio_resid;

		uio_clone->uio_iov->iov_base =
		    (char *)uio_clone->uio_iov->iov_base + adv;
		uio_clone->uio_iov->iov_len -= adv;
		uio_clone->uio_resid -= adv;
		uio_clone->uio_offset += adv;

		uio->uio_resid -= adv;
		uio->uio_offset += adv;

		if (error != 0 || adv == 0)
			break;
	}
	td->td_ma = prev_td_ma;
	td->td_ma_cnt = prev_td_ma_cnt;
	curthread_pflags_restore(saveheld);
out:
	vm_fault_enable_pagefaults(save);
	vn_rangelock_unlock(vp, rl_cookie);
	free(uio_clone, M_IOV);
	return (error);
}

/*
 * Helper function to perform the requested uiomove operation using
 * the held pages for io->uio_iov[0].iov_base buffer instead of
 * copyin/copyout.  Access to the pages with uiomove_fromphys()
 * instead of iov_base prevents page faults that could occur due to
 * pmap_collect() invalidating the mapping created by
 * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
 * object cleanup revoking the write access from page mappings.
 *
 * Filesystems that specify MNTK_NO_IOPF shall use vn_io_fault_uiomove()
 * instead of plain uiomove().
 */
int
vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
{
	struct uio transp_uio;
	struct iovec transp_iov[1];
	struct thread *td;
	size_t adv;
	int error, pgadv;

	td = curthread;
	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
	    uio->uio_segflg != UIO_USERSPACE)
		return (uiomove(data, xfersize, uio));

	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
	transp_iov[0].iov_base = data;
	transp_uio.uio_iov = &transp_iov[0];
	transp_uio.uio_iovcnt = 1;
	if (xfersize > uio->uio_resid)
		xfersize = uio->uio_resid;
	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
	transp_uio.uio_offset = 0;
	transp_uio.uio_segflg = UIO_SYSSPACE;
	/*
	 * Since transp_iov points to data, and td_ma page array
	 * corresponds to original uio->uio_iov, we need to invert the
	 * direction of the i/o operation as passed to
	 * uiomove_fromphys().
	 */
	switch (uio->uio_rw) {
	case UIO_WRITE:
		transp_uio.uio_rw = UIO_READ;
		break;
	case UIO_READ:
		transp_uio.uio_rw = UIO_WRITE;
		break;
	}
	transp_uio.uio_td = uio->uio_td;
	error = uiomove_fromphys(td->td_ma,
	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
	    xfersize, &transp_uio);
	adv = xfersize - transp_uio.uio_resid;
	pgadv =
	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
	td->td_ma += pgadv;
	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
	    pgadv));
	td->td_ma_cnt -= pgadv;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
	uio->uio_iov->iov_len -= adv;
	uio->uio_resid -= adv;
	uio->uio_offset += adv;
	return (error);
}
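
/*
 * Example (editorial sketch, not part of the original file): how a
 * filesystem that sets MNTK_NO_IOPF on its mount consumes data in its
 * VOP_READ loop.  It copies out of a locked buffer with
 * vn_io_fault_uiomove() instead of uiomove(), so that no page fault can
 * occur while the vnode lock is held; names such as bp, blkoffset and
 * xfersize are illustrative.
 *
 *	// at mount time:
 *	MNT_ILOCK(mp);
 *	mp->mnt_kern_flag |= MNTK_NO_IOPF;
 *	MNT_IUNLOCK(mp);
 *
 *	// in VOP_READ, with the buffer bp already read in:
 *	error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
 *	    (int)xfersize, uio);
 */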

/*
 * File table truncate routine.
 */
static int
vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct vattr vattr;
	struct mount *mp;
	struct vnode *vp;
	void *rl_cookie;
	int vfslocked;
	int error;

	vp = fp->f_vnode;

	/*
	 * Lock the whole range for truncation.  Otherwise split i/o
	 * might happen partly before and partly after the truncation.
	 */
	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
	if (error)
		goto out1;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR) {
		error = EISDIR;
		goto out;
	}
#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error)
		goto out;
#endif
	error = vn_writechk(vp);
	if (error == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
out:
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
out1:
	VFS_UNLOCK_GIANT(vfslocked);
	vn_rangelock_unlock(vp, rl_cookie);
	return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
	struct file *fp;
	struct stat *sb;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
	VOP_UNLOCK(vp, 0);
	VFS_UNLOCK_GIANT(vfslocked);

	return (error);
}

/*
 * Stat a vnode; implementation for the stat syscall
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
	struct vnode *vp;
	register struct stat *sb;
	struct ucred *active_cred;
	struct ucred *file_cred;
	struct thread *td;
{
	struct vattr vattr;
	register struct vattr *vap;
	int error;
	u_short mode;

#ifdef MAC
	error = mac_vnode_check_stat(active_cred, file_cred, vp);
	if (error)
		return (error);
#endif

	vap = &vattr;

	/*
	 * Initialize defaults for new and unusual fields, so that file
	 * systems which don't support these fields don't need to know
	 * about them.
	 */
	vap->va_birthtime.tv_sec = -1;
	vap->va_birthtime.tv_nsec = 0;
	vap->va_fsid = VNOVAL;
	vap->va_rdev = NODEV;

	error = VOP_GETATTR(vp, vap, active_cred);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	bzero(sb, sizeof *sb);

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	if (vap->va_size > OFF_MAX)
		return (EOVERFLOW);
	sb->st_size = vap->va_size;
	sb->st_atim = vap->va_atime;
	sb->st_mtim = vap->va_mtime;
	sb->st_ctim = vap->va_ctime;
	sb->st_birthtim = vap->va_birthtime;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 * object.  In some filesystem types, this may vary from file
	 * to file"
	 * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
	 */

	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);

	sb->st_flags = vap->va_flags;
	if (priv_check(td, PRIV_VFS_GENERATION))
		sb->st_gen = 0;
	else
		sb->st_gen = vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	return (0);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
	struct file *fp;
	u_long com;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = ENOTTY;
	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, active_cred);
			VOP_UNLOCK(vp, 0);
			if (!error)
				*(int *)data = vattr.va_size - fp->f_offset;
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			error = 0;
		else
			error = VOP_IOCTL(vp, com, data, fp->f_flag,
			    active_cred, td);
		break;

	default:
		break;
	}
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp;
	int vfslocked;
	int error;

	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef MAC
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
	VOP_UNLOCK(vp, 0);
	if (!error)
#endif

	error = VOP_POLL(vp, events, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * Acquire the requested lock and then check for validity.  LK_RETRY
 * permits vn_lock to return doomed vnodes.
 */
int
_vn_lock(struct vnode *vp, int flags, char *file, int line)
{
	int error;

	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
	    ("vn_lock called with no locktype."));
	do {
#ifdef DEBUG_VFS_LOCKS
		KASSERT(vp->v_holdcnt != 0,
		    ("vn_lock %p: zero hold count", vp));
#endif
		error = VOP_LOCK1(vp, flags, file, line);
		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
		KASSERT((flags & LK_RETRY) == 0 || error == 0,
		    ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
		    flags, error));
		/*
		 * Callers specify LK_RETRY if they wish to get dead vnodes.
		 * If RETRY is not set, we return ENOENT instead.
		 */
		if (error == 0 && vp->v_iflag & VI_DOOMED &&
		    (flags & LK_RETRY) == 0) {
			VOP_UNLOCK(vp, 0);
			error = ENOENT;
			break;
		}
	} while (flags & LK_RETRY && error != 0);
	return (error);
}
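
/*
 * Example (editorial sketch, not part of the original file): the two
 * common locking idioms built on _vn_lock() above.  With LK_RETRY the
 * call cannot fail but may return a doomed vnode; without it the caller
 * must be prepared for an error, typically ENOENT when the vnode has
 * been reclaimed.
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	// always succeeds
 *	...
 *	if (vn_lock(vp, LK_SHARED) != 0)	// e.g. vnode was doomed
 *		return (ENOENT);
 */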

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct vnode *vp;
	struct flock lf;
	int vfslocked;
	int error;

	vp = fp->f_vnode;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
	}

	fp->f_ops = &badfileops;

	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * Preparing to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	error = 0;
	/*
	 * If a vnode is provided, get and return the mount point to
	 * which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * As long as a vnode is not provided we need to acquire a
	 * refcount for the provided mountpoint too, in order to
	 * emulate a vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);

	/*
	 * Check on status of suspension.
	 */
	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
	    mp->mnt_susp_owner != curthread) {
		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
			if (flags & V_NOWAIT) {
				error = EWOULDBLOCK;
				goto unlock;
			}
			error = msleep(&mp->mnt_flag, MNT_MTX(mp),
			    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
			if (error)
				goto unlock;
		}
	}
	if (flags & V_XSLEEP)
		goto unlock;
	mp->mnt_writeopcount++;
unlock:
	if (error != 0 || (flags & V_XSLEEP) != 0)
		MNT_REL(mp);
	MNT_IUNLOCK(mp);
	return (error);
}
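
/*
 * Example (editorial sketch, not part of the original file): the usual
 * way a write-type operation brackets its work with the suspension
 * counters maintained above, so that a snapshot or unmount can suspend
 * writers cleanly.  This mirrors the pattern used by vn_rdwr() and
 * vn_write() in this file.
 *
 *	struct mount *mp;
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	...			// modify the file
 *	VOP_UNLOCK(vp, 0);
 *	vn_finished_write(mp);
 */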

/*
 * Secondary suspension.  Used by operations such as vop_inactive
 * routines that are needed by the higher level functions.  These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero).  At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_start_secondary_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

retry:
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * As long as a vnode is not provided we need to acquire a
	 * refcount for the provided mountpoint too, in order to
	 * emulate a vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);
	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
		mp->mnt_secondary_writes++;
		mp->mnt_secondary_accwrites++;
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.
	 */
	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
	vfs_rel(mp);
	if (error == 0)
		goto retry;
	return (error);
}

/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	MNT_REL(mp);
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
	MNT_IUNLOCK(mp);
}

/*
 * Filesystem secondary write operation has completed. If we are
 * suspending and this operation is the last one, notify the suspender
 * that the suspension is now in effect.
 */
void
vn_finished_secondary_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	MNT_REL(mp);
	mp->mnt_secondary_writes--;
	if (mp->mnt_secondary_writes < 0)
		panic("vn_finished_secondary_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_secondary_writes <= 0)
		wakeup(&mp->mnt_secondary_writes);
	MNT_IUNLOCK(mp);
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
	struct mount *mp;
{
	int error;

	MNT_ILOCK(mp);
	if (mp->mnt_susp_owner == curthread) {
		MNT_IUNLOCK(mp);
		return (EALREADY);
	}
	while (mp->mnt_kern_flag & MNTK_SUSPEND)
		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	mp->mnt_susp_owner = curthread;
	if (mp->mnt_writeopcount > 0)
		(void) msleep(&mp->mnt_writeopcount,
		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
	else
		MNT_IUNLOCK(mp);
	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
		vfs_write_resume(mp);
	return (error);
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
	struct mount *mp;
{

	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
		    MNTK_SUSPENDED);
		mp->mnt_susp_owner = NULL;
		wakeup(&mp->mnt_writeopcount);
		wakeup(&mp->mnt_flag);
		curthread->td_pflags &= ~TDP_IGNSUSP;
		MNT_IUNLOCK(mp);
		VFS_SUSP_CLEAN(mp);
	} else
		MNT_IUNLOCK(mp);
}

/*
 * Implement kqueues for files by translating them to vnode operations.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
	error = VOP_KQFILTER(fp->f_vnode, kn);
	VFS_UNLOCK_GIANT(vfslocked);

	return (error);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
	struct uio auio;
	struct iovec iov;
	int error;

	iov.iov_len = *buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute retrieval as kernel */
	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
	    td);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp, 0);

	if (error == 0) {
		*buflen = *buflen - auio.uio_resid;
	}

	return (error);
}
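
/*
 * Example (editorial sketch, not part of the original file): fetching a
 * system-namespace extended attribute into a stack buffer with
 * vn_extattr_get(), with the vnode already locked by the caller (hence
 * IO_NODELOCKED).  The attribute name is illustrative.
 *
 *	char buf[64];
 *	int buflen, error;
 *
 *	buflen = sizeof(buf);
 *	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
 *	    "some.attribute", &buflen, buf, td);
 *	// on success, buflen has been updated to the number of bytes read
 */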

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
	struct uio auio;
	struct iovec iov;
	struct mount *mp;
	int error;

	iov.iov_len = buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute setting as kernel */
	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
	struct mount *mp;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute removal as kernel */
	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}

	return (error);
}

int
vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
{
	struct mount *mp;
	int ltype, error;

	mp = vp->v_mount;
	ltype = VOP_ISLOCKED(vp);
	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
	    ("vn_vget_ino: vp not locked"));
	error = vfs_busy(mp, MBF_NOWAIT);
	if (error != 0) {
		vfs_ref(mp);
		VOP_UNLOCK(vp, 0);
		error = vfs_busy(mp, 0);
		vn_lock(vp, ltype | LK_RETRY);
		vfs_rel(mp);
		if (error != 0)
			return (ENOENT);
		if (vp->v_iflag & VI_DOOMED) {
			vfs_unbusy(mp);
			return (ENOENT);
		}
	}
	VOP_UNLOCK(vp, 0);
	error = VFS_VGET(mp, ino, lkflags, rvp);
	vfs_unbusy(mp);
	vn_lock(vp, ltype | LK_RETRY);
	if (vp->v_iflag & VI_DOOMED) {
		if (error == 0)
			vput(*rvp);
		error = ENOENT;
	}
	return (error);
}

int
vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
    const struct thread *td)
{

	if (vp->v_type != VREG || td == NULL)
		return (0);
	PROC_LOCK(td->td_proc);
	if ((uoff_t)uio->uio_offset + uio->uio_resid >
	    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
		kern_psignal(td->td_proc, SIGXFSZ);
		PROC_UNLOCK(td->td_proc);
		return (EFBIG);
	}
	PROC_UNLOCK(td->td_proc);
	return (0);
}

int
vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	struct vnode *vp;
	int error, vfslocked;

	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef AUDIT
	vn_lock(vp, LK_SHARED | LK_RETRY);
	AUDIT_ARG_VNODE1(vp);
	VOP_UNLOCK(vp, 0);
#endif
	error = setfmode(td, active_cred, vp, mode);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

int
vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	struct vnode *vp;
	int error, vfslocked;

	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef AUDIT
	vn_lock(vp, LK_SHARED | LK_RETRY);
	AUDIT_ARG_VNODE1(vp);
	VOP_UNLOCK(vp, 0);
#endif
	error = setfown(td, active_cred, vp, uid, gid);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

void
vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
{
	vm_object_t object;

	if ((object = vp->v_object) == NULL)
		return;
	VM_OBJECT_LOCK(object);
	vm_object_page_remove(object, start, end, 0);
	VM_OBJECT_UNLOCK(object);
}

int
vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
{
	struct vattr va;
	daddr_t bn, bnp;
	uint64_t bsize;
	off_t noff;
	int error;

	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
	    ("Wrong command %lu", cmd));

	if (vn_lock(vp, LK_SHARED) != 0)
		return (EBADF);
	if (vp->v_type != VREG) {
		error = ENOTTY;
		goto unlock;
	}
	error = VOP_GETATTR(vp, &va, cred);
	if (error != 0)
		goto unlock;
	noff = *off;
	if (noff >= va.va_size) {
		error = ENXIO;
		goto unlock;
	}
	bsize = vp->v_mount->mnt_stat.f_iosize;
	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
		if (error == EOPNOTSUPP) {
			error = ENOTTY;
			goto unlock;
		}
		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
		    (bnp != -1 && cmd == FIOSEEKDATA)) {
			noff = bn * bsize;
			if (noff < *off)
				noff = *off;
			goto unlock;
		}
	}
	if (noff > va.va_size)
		noff = va.va_size;
	/* noff == va.va_size. There is an implicit hole at the end of file. */
	if (cmd == FIOSEEKDATA)
		error = ENXIO;
unlock:
	VOP_UNLOCK(vp, 0);
	if (error == 0)
		*off = noff;
	return (error);
}
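
/*
 * Worked example for vn_bmap_seekhole() (editorial note, not from the
 * original source): on a filesystem with f_iosize = 32768, consider a
 * 100000-byte file whose first two blocks are allocated and whose last
 * two blocks are a trailing hole.  FIOSEEKHOLE with *off = 0 probes
 * blocks 0 and 1 (VOP_BMAP returns a real block number), hits bnp == -1
 * at block 2 and reports the hole at offset 65536.  FIOSEEKDATA with
 * *off = 70000 starts at block 2, finds nothing allocated before the end
 * of the file and fails with ENXIO, consistent with the implicit hole at
 * end of file.
 */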