/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2013, 2014 The FreeBSD Foundation
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/disk.h>
#include <sys/dirent.h>
#include <sys/fail.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/prng.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sleepqueue.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

static fo_rdwr_t vn_read;
static fo_rdwr_t vn_write;
static fo_rdwr_t vn_io_fault;
static fo_truncate_t vn_truncate;
static fo_ioctl_t vn_ioctl;
static fo_poll_t vn_poll;
static fo_kqfilter_t vn_kqfilter;
static fo_close_t vn_closefile;
static fo_mmap_t vn_mmap;
static fo_fallocate_t vn_fallocate;
static fo_fspacectl_t vn_fspacectl;

const struct fileops vnops = {
	.fo_read = vn_io_fault,
	.fo_write = vn_io_fault,
	.fo_truncate = vn_truncate,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_chmod = vn_chmod,
	.fo_chown = vn_chown,
	.fo_sendfile = vn_sendfile,
	.fo_seek = vn_seek,
	.fo_fill_kinfo = vn_fill_kinfo,
	.fo_mmap = vn_mmap,
	.fo_fallocate = vn_fallocate,
	.fo_fspacectl = vn_fspacectl,
	.fo_cmp = vn_cmp,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

const u_int io_hold_cnt = 16;
static int vn_io_fault_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
static int vn_io_fault_prefault = 0;
SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
    &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
static int vn_io_pgcache_read_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
    &vn_io_pgcache_read_enable, 0,
    "Enable copying from page cache for reads, avoiding fs");
static u_long vn_io_faults_cnt;
SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");

static int vfs_allow_read_dir = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
    &vfs_allow_read_dir, 0,
    "Enable read(2) of directory by root for filesystems that support it");

/*
 * Returns true if vn_io_fault mode of handling the i/o request should
 * be used.
 */
static bool
do_vn_io_fault(struct vnode *vp, struct uio *uio)
{
	struct mount *mp;

	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
	    (mp = vp->v_mount) != NULL &&
	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
}

/*
 * Structure used to pass arguments to vn_io_fault1(), to do either
 * file- or vnode-based I/O calls.
 */
struct vn_io_fault_args {
	enum {
		VN_IO_FAULT_FOP,
		VN_IO_FAULT_VOP
	} kind;
	struct ucred *cred;
	int flags;
	union {
		struct fop_args_tag {
			struct file *fp;
			fo_rdwr_t *doio;
		} fop_args;
		struct vop_args_tag {
			struct vnode *vp;
		} vop_args;
	} args;
};

static int vn_io_fault1(struct vnode *vp, struct uio *uio,
    struct vn_io_fault_args *args, struct thread *td);

int
vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
{
	struct thread *td = curthread;

	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
}

static uint64_t
open2nameif(int fmode, u_int vn_open_flags)
{
	uint64_t res;

	res = ISOPEN | LOCKLEAF;
	if ((fmode & O_RESOLVE_BENEATH) != 0)
		res |= RBENEATH;
	if ((fmode & O_EMPTY_PATH) != 0)
		res |= EMPTYPATH;
	if ((fmode & FREAD) != 0)
		res |= OPENREAD;
	if ((fmode & FWRITE) != 0)
		res |= OPENWRITE;
	if ((fmode & O_NAMEDATTR) != 0)
		res |= OPENNAMED | CREATENAMED;
	if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0)
		res |= AUDITVNODE1;
	if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0)
		res |= NOCAPCHECK;
	if ((vn_open_flags & VN_OPEN_WANTIOCTLCAPS) != 0)
		res |= WANTIOCTLCAPS;
	return (res);
}

/*
 * For the O_NAMEDATTR case, check for a valid use of it.
 */
static int
vfs_check_namedattr(struct vnode *vp)
{
	int error;
	short irflag;

	error = 0;
	irflag = vn_irflag_read(vp);
	if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0 ||
	    ((irflag & VIRF_NAMEDATTR) != 0 && vp->v_type != VREG))
		error = EINVAL;
	else if ((irflag & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) == 0)
		error = ENOATTR;
	return (error);
}

/*
 * Common code for vnode open operations via a name lookup.
 * Lookup the vnode and invoke VOP_CREATE if needed.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
    struct ucred *cred, struct file *fp)
{
	struct vnode *vp;
	struct mount *mp;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode, error;
	bool first_open;

restart:
	first_open = false;
	fmode = *flagp;
	if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
	    O_EXCL | O_DIRECTORY) ||
	    (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH))
		return (EINVAL);
	else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
		/*
		 * Set NOCACHE to avoid flushing the cache when
		 * rolling in many files at once.
		 *
		 * Set NC_KEEPPOSENTRY to keep positive entries if they already
		 * exist despite NOCACHE.
		 */
		ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		if ((vn_open_flags & VN_OPEN_INVFS) == 0)
			bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		if (ndp->ni_vp == NULL) {
			if ((fmode & O_NAMEDATTR) != 0 &&
			    (ndp->ni_dvp->v_mount->mnt_flag & MNT_NAMEDATTR) ==
			    0) {
				error = EINVAL;
				vp = ndp->ni_dvp;
				ndp->ni_dvp = NULL;
				goto bad;
			}
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE_PNBUF(ndp);
				vput(ndp->ni_dvp);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | V_PCATCH)) != 0)
					return (error);
				NDREINIT(ndp);
				goto restart;
			}
			if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
				ndp->ni_cnd.cn_flags |= MAKEENTRY;
#ifdef MAC
			error = mac_vnode_check_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0)
#endif
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
				    &ndp->ni_cnd, vap);
			vp = ndp->ni_vp;
			if (error == 0 && (fmode & O_EXCL) != 0 &&
			    (fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
				VI_LOCK(vp);
				vp->v_iflag |= VI_FOPENING;
				VI_UNLOCK(vp);
				first_open = true;
			}
			VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL,
			    false);
			vn_finished_write(mp);
			if (error) {
				NDFREE_PNBUF(ndp);
				if (error == ERELOOKUP) {
					NDREINIT(ndp);
					goto restart;
				}
				return (error);
			}
			fmode &= ~O_TRUNC;
		} else {
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			if ((fmode & O_NAMEDATTR) != 0) {
				error = vfs_check_namedattr(vp);
				if (error != 0)
					goto bad;
			} else if (vp->v_type == VDIR) {
				error = EISDIR;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
		ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW :
		    FOLLOW;
		if ((fmode & FWRITE) == 0)
			ndp->ni_cnd.cn_flags |= LOCKSHARED;
		if ((error = namei(ndp)) != 0)
			return (error);
		vp = ndp->ni_vp;
		if ((fmode & O_NAMEDATTR) != 0) {
			error = vfs_check_namedattr(vp);
			if (error != 0)
				goto bad;
		}
	}
	error = vn_open_vnode(vp, fmode, cred, curthread, fp);
	if (first_open) {
		VI_LOCK(vp);
		vp->v_iflag &= ~VI_FOPENING;
		wakeup(vp);
		VI_UNLOCK(vp);
	}
	if (error)
		goto bad;
	*flagp = fmode;
	return (0);
bad:
	NDFREE_PNBUF(ndp);
	vput(vp);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}

static int
vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
{
	struct flock lf;
	int error, lock_flags, type;

	ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
	if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
		return (0);
	KASSERT(fp != NULL, ("open with flock requires fp"));
	if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
		return (EOPNOTSUPP);

	lock_flags = VOP_ISLOCKED(vp);
	VOP_UNLOCK(vp);

	lf.l_whence = SEEK_SET;
	lf.l_start = 0;
	lf.l_len = 0;
	lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
	type = F_FLOCK;
	if ((fmode & FNONBLOCK) == 0)
		type |= F_WAIT;
	if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
		type |= F_FIRSTOPEN;
	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
	if (error == 0)
		fp->f_flag |= FHASLOCK;

	vn_lock(vp, lock_flags | LK_RETRY);
	return (error);
}

/*
 * Common code for vnode open operations once a vnode is located.
 * Check permissions, and call the VOP_OPEN routine.
 */
int
vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
    struct thread *td, struct file *fp)
{
	accmode_t accmode;
	int error;

	KASSERT((fmode & O_PATH) == 0 || (fmode & O_ACCMODE) == 0,
	    ("%s: O_PATH and O_ACCMODE are mutually exclusive", __func__));

	if (vp->v_type == VLNK) {
		if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0)
			return (EMLINK);
	}
	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
		return (ENOTDIR);

	accmode = 0;
	if ((fmode & O_PATH) == 0) {
		if (vp->v_type == VSOCK)
			return (EOPNOTSUPP);
		if ((fmode & (FWRITE | O_TRUNC)) != 0) {
			if (vp->v_type == VDIR)
				return (EISDIR);
			accmode |= VWRITE;
		}
		if ((fmode & FREAD) != 0)
			accmode |= VREAD;
		if ((fmode & O_APPEND) && (fmode & FWRITE))
			accmode |= VAPPEND;
#ifdef MAC
		if ((fmode & O_CREAT) != 0)
			accmode |= VCREAT;
#endif
	}
	if ((fmode & FEXEC) != 0)
		accmode |= VEXEC;
#ifdef MAC
	if ((fmode & O_VERIFY) != 0)
		accmode |= VVERIFY;
	error = mac_vnode_check_open(cred, vp, accmode);
	if (error != 0)
		return (error);

	accmode &= ~(VCREAT | VVERIFY);
#endif
	if ((fmode & O_CREAT) == 0 && accmode != 0) {
		error = VOP_ACCESS(vp, accmode, cred, td);
		if (error != 0)
			return (error);
	}
	if ((fmode & O_PATH) != 0) {
		if (vp->v_type != VFIFO && vp->v_type != VSOCK &&
		    VOP_ACCESS(vp, VREAD, cred, td) == 0)
			fp->f_flag |= FKQALLOWED;
		return (0);
	}

	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
		vn_lock(vp, LK_UPGRADE | LK_RETRY);
	error = VOP_OPEN(vp, fmode, cred, td, fp);
	if (error != 0)
		return (error);

	error = vn_open_vnode_advlock(vp, fmode, fp);
	if (error == 0 && (fmode & FWRITE) != 0) {
		error = VOP_ADD_WRITECOUNT(vp, 1);
		if (error == 0) {
			CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
			    __func__, vp, vp->v_writecount);
		}
	}

	/*
	 * Error from advlock or VOP_ADD_WRITECOUNT() still requires
	 * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
	 */
	if (error != 0) {
		if (fp != NULL) {
			/*
			 * Arrange the call by having fdrop() to use
			 * vn_closefile().  This is to satisfy
			 * filesystems like devfs or tmpfs, which
			 * override fo_close().
			 */
			fp->f_flag |= FOPENFAILED;
			fp->f_vnode = vp;
			if (fp->f_ops == &badfileops) {
				fp->f_type = DTYPE_VNODE;
				fp->f_ops = &vnops;
			}
			vref(vp);
		} else {
			/*
			 * If there is no fp, due to kernel-mode open,
			 * we can call VOP_CLOSE() now.
			 */
			if ((vp->v_type == VFIFO ||
			    !MNT_EXTENDED_SHARED(vp->v_mount)) &&
			    VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
				vn_lock(vp, LK_UPGRADE | LK_RETRY);
			(void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC),
			    cred, td);
		}
	}

	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
	return (error);

}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 * It is racy.
 */
int
vn_writechk(struct vnode *vp)
{

	ASSERT_VOP_LOCKED(vp, "vn_writechk");
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (VOP_IS_TEXT(vp))
		return (ETXTBSY);

	return (0);
}

/*
 * Vnode close call
 */
static int
vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
    struct thread *td, bool keep_ref)
{
	struct mount *mp;
	int error, lock_flags;

	lock_flags = vp->v_type != VFIFO && MNT_EXTENDED_SHARED(vp->v_mount) ?
	    LK_SHARED : LK_EXCLUSIVE;

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, lock_flags | LK_RETRY);
	AUDIT_ARG_VNODE1(vp);
	if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
		    __func__, vp, vp->v_writecount);
	}
	error = VOP_CLOSE(vp, flags, file_cred, td);
	if (keep_ref)
		VOP_UNLOCK(vp);
	else
		vput(vp);
	vn_finished_write(mp);
	return (error);
}

int
vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
    struct thread *td)
{

	return (vn_close1(vp, flags, file_cred, td, false));
}

/*
 * Heuristic to detect sequential operation.
 */
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{
	enum uio_rw rw;

	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);

	rw = uio->uio_rw;
	if (fp->f_flag & FRDAHEAD)
		return (fp->f_seqcount[rw] << IO_SEQSHIFT);

	/*
	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
	 * that the first I/O is normally considered to be slightly
	 * sequential.  Seeking to offset 0 doesn't change sequentiality
	 * unless previous seeks have reduced f_seqcount to 0, in which
	 * case offset 0 is not special.
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
	    uio->uio_offset == fp->f_nextoff[rw]) {
		/*
		 * f_seqcount is in units of fixed-size blocks so that it
		 * depends mainly on the amount of sequential I/O and not
		 * much on the number of sequential I/O's.  The fixed size
		 * of 16384 is hard-coded here since it is (not quite) just
		 * a magic size that works well here.  This size is more
		 * closely related to the best I/O size for real disks than
		 * to any block size used by software.
		 */
		if (uio->uio_resid >= IO_SEQMAX * 16384)
			fp->f_seqcount[rw] = IO_SEQMAX;
		else {
			fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
			if (fp->f_seqcount[rw] > IO_SEQMAX)
				fp->f_seqcount[rw] = IO_SEQMAX;
		}
		return (fp->f_seqcount[rw] << IO_SEQSHIFT);
	}

	/* Not sequential.  Quickly draw-down sequentiality. */
	if (fp->f_seqcount[rw] > 1)
		fp->f_seqcount[rw] = 1;
	else
		fp->f_seqcount[rw] = 0;
	return (0);
}
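
/*
 * Worked example (illustrative, not from the original source): a fully
 * sequential 64KB request adds howmany(65536, 16384) == 4 to f_seqcount,
 * and the caller receives f_seqcount << IO_SEQSHIFT, which is merged into
 * the upper ioflag bits as a read-ahead hint, as vn_read() below does:
 *
 *	ioflag |= sequential_heuristic(uio, fp);
 *	error = VOP_READ(vp, uio, ioflag, fp->f_cred);
 */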

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
    struct ucred *file_cred, ssize_t *aresid, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	void *rl_cookie;
	struct vn_io_fault_args args;
	int error, lock_flags;

	if (offset < 0 && vp->v_type != VCHR)
		return (EINVAL);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((ioflg & IO_RANGELOCKED) == 0) {
			if (rw == UIO_READ) {
				rl_cookie = vn_rangelock_rlock(vp, offset,
				    offset + len);
			} else if ((ioflg & IO_APPEND) != 0) {
				rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
			} else {
				rl_cookie = vn_rangelock_wlock(vp, offset,
				    offset + len);
			}
		} else
			rl_cookie = NULL;
		mp = NULL;
		if (rw == UIO_WRITE) {
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH))
			    != 0)
				goto out;
			lock_flags = vn_lktype_write(mp, vp);
		} else
			lock_flags = LK_SHARED;
		vn_lock(vp, lock_flags | LK_RETRY);
	} else
		rl_cookie = NULL;

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_vnode_check_read(active_cred, file_cred,
			    vp);
		else
			error = mac_vnode_check_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred != NULL)
			cred = file_cred;
		else
			cred = active_cred;
		if (do_vn_io_fault(vp, &auio)) {
			args.kind = VN_IO_FAULT_VOP;
			args.cred = cred;
			args.flags = ioflg;
			args.args.vop_args.vp = vp;
			error = vn_io_fault1(vp, &auio, &args, td);
		} else if (rw == UIO_READ) {
			error = VOP_READ(vp, &auio, ioflg, cred);
		} else /* if (rw == UIO_WRITE) */ {
			error = VOP_WRITE(vp, &auio, ioflg, cred);
		}
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
		if (mp != NULL)
			vn_finished_write(mp);
	}
out:
	if (rl_cookie != NULL)
		vn_rangelock_unlock(vp, rl_cookie);
	return (error);
}
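
/*
 * Minimal usage sketch (illustrative, not part of the original code):
 * write a kernel buffer to a referenced, unlocked vnode; "buf" and
 * "buflen" are assumed to exist in the caller.
 *
 *	ssize_t resid;
 *	int error;
 *
 *	error = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
 *	    IO_UNIT, td->td_ucred, NOCRED, &resid, td);
 *	if (error == 0 && resid != 0)
 *		error = EIO;		\* short write *\
 */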

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
    off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
    struct ucred *file_cred, size_t *aresid, struct thread *td)
{
	int error = 0;
	ssize_t iaresid;

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base = (char *)base + chunk;
		kern_yield(PRI_USER);
	} while (len);
	if (aresid)
		*aresid = len + iaresid;
	return (error);
}

#if OFF_MAX <= LONG_MAX
off_t
foffset_lock(struct file *fp, int flags)
{
	volatile short *flagsp;
	off_t res;
	short state;

	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));

	if ((flags & FOF_NOLOCK) != 0)
		return (atomic_load_long(&fp->f_offset));

	/*
	 * According to McKusick the vn lock was protecting f_offset here.
	 * It is now protected by the FOFFSET_LOCKED flag.
	 */
	flagsp = &fp->f_vnread_flags;
	if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
		return (atomic_load_long(&fp->f_offset));

	sleepq_lock(&fp->f_vnread_flags);
	state = atomic_load_16(flagsp);
	for (;;) {
		if ((state & FOFFSET_LOCKED) == 0) {
			if (!atomic_fcmpset_acq_16(flagsp, &state,
			    FOFFSET_LOCKED))
				continue;
			break;
		}
		if ((state & FOFFSET_LOCK_WAITING) == 0) {
			if (!atomic_fcmpset_acq_16(flagsp, &state,
			    state | FOFFSET_LOCK_WAITING))
				continue;
		}
		DROP_GIANT();
		sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
		sleepq_wait(&fp->f_vnread_flags, PUSER - 1);
		PICKUP_GIANT();
		sleepq_lock(&fp->f_vnread_flags);
		state = atomic_load_16(flagsp);
	}
	res = atomic_load_long(&fp->f_offset);
	sleepq_release(&fp->f_vnread_flags);
	return (res);
}

void
foffset_unlock(struct file *fp, off_t val, int flags)
{
	volatile short *flagsp;
	short state;

	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));

	if ((flags & FOF_NOUPDATE) == 0)
		atomic_store_long(&fp->f_offset, val);
	if ((flags & FOF_NEXTOFF_R) != 0)
		fp->f_nextoff[UIO_READ] = val;
	if ((flags & FOF_NEXTOFF_W) != 0)
		fp->f_nextoff[UIO_WRITE] = val;

	if ((flags & FOF_NOLOCK) != 0)
		return;

	flagsp = &fp->f_vnread_flags;
	state = atomic_load_16(flagsp);
	if ((state & FOFFSET_LOCK_WAITING) == 0 &&
	    atomic_cmpset_rel_16(flagsp, state, 0))
		return;

	sleepq_lock(&fp->f_vnread_flags);
	MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
	MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
	fp->f_vnread_flags = 0;
	sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
	sleepq_release(&fp->f_vnread_flags);
}

static off_t
foffset_read(struct file *fp)
{

	return (atomic_load_long(&fp->f_offset));
}
#else
off_t
foffset_lock(struct file *fp, int flags)
{
	struct mtx *mtxp;
	off_t res;

	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));

	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	if ((flags & FOF_NOLOCK) == 0) {
		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
			msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
			    "vofflock", 0);
		}
		fp->f_vnread_flags |= FOFFSET_LOCKED;
	}
	res = fp->f_offset;
	mtx_unlock(mtxp);
	return (res);
}

void
foffset_unlock(struct file *fp, off_t val, int flags)
{
	struct mtx *mtxp;

	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));

	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	if ((flags & FOF_NOUPDATE) == 0)
		fp->f_offset = val;
	if ((flags & FOF_NEXTOFF_R) != 0)
		fp->f_nextoff[UIO_READ] = val;
	if ((flags & FOF_NEXTOFF_W) != 0)
		fp->f_nextoff[UIO_WRITE] = val;
	if ((flags & FOF_NOLOCK) == 0) {
		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
		    ("Lost FOFFSET_LOCKED"));
		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
			wakeup(&fp->f_vnread_flags);
		fp->f_vnread_flags = 0;
	}
	mtx_unlock(mtxp);
}

static off_t
foffset_read(struct file *fp)
{

	return (foffset_lock(fp, FOF_NOLOCK));
}
#endif

void
foffset_lock_pair(struct file *fp1, off_t *off1p, struct file *fp2, off_t *off2p,
    int flags)
{
	KASSERT(fp1 != fp2, ("foffset_lock_pair: fp1 == fp2"));

	/* Lock in a consistent order to avoid deadlock. */
	if ((uintptr_t)fp1 > (uintptr_t)fp2) {
		struct file *tmpfp;
		off_t *tmpoffp;

		tmpfp = fp1, fp1 = fp2, fp2 = tmpfp;
		tmpoffp = off1p, off1p = off2p, off2p = tmpoffp;
	}
	if (fp1 != NULL)
		*off1p = foffset_lock(fp1, flags);
	if (fp2 != NULL)
		*off2p = foffset_lock(fp2, flags);
}

void
foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
{

	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = foffset_lock(fp, flags);
}

void
foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
{

	if ((flags & FOF_OFFSET) == 0)
		foffset_unlock(fp, uio->uio_offset, flags);
}

static int
get_advice(struct file *fp, struct uio *uio)
{
	struct mtx *mtxp;
	int ret;

	ret = POSIX_FADV_NORMAL;
	if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
		return (ret);

	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	if (fp->f_advice != NULL &&
	    uio->uio_offset >= fp->f_advice->fa_start &&
	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
		ret = fp->f_advice->fa_advice;
	mtx_unlock(mtxp);
	return (ret);
}

static int
get_write_ioflag(struct file *fp)
{
	int ioflag;
	struct mount *mp;
	struct vnode *vp;

	ioflag = 0;
	vp = fp->f_vnode;
	mp = atomic_load_ptr(&vp->v_mount);

	if ((fp->f_flag & O_DIRECT) != 0)
		ioflag |= IO_DIRECT;

	if ((fp->f_flag & O_FSYNC) != 0 ||
	    (mp != NULL && (mp->mnt_flag & MNT_SYNCHRONOUS) != 0))
		ioflag |= IO_SYNC;

	/*
	 * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that
	 * VOP_WRITE() or VOP_DEALLOCATE() implementations that don't
	 * understand IO_DATASYNC fall back to full O_SYNC behavior.
	 */
	if ((fp->f_flag & O_DSYNC) != 0)
		ioflag |= IO_SYNC | IO_DATASYNC;

	return (ioflag);
}

int
vn_read_from_obj(struct vnode *vp, struct uio *uio)
{
	vm_object_t obj;
	vm_page_t ma[io_hold_cnt + 2];
	off_t off, vsz;
	ssize_t resid;
	int error, i, j;

	MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
	obj = atomic_load_ptr(&vp->v_object);
	if (obj == NULL)
		return (EJUSTRETURN);

	/*
	 * Depends on type stability of vm_objects.
	 */
	vm_object_pip_add(obj, 1);
	if ((obj->flags & OBJ_DEAD) != 0) {
		/*
		 * Note that object might be already reused from the
		 * vnode, and the OBJ_DEAD flag cleared.  This is fine,
		 * we recheck for DOOMED vnode state after all pages
		 * are busied, and retract then.
		 *
		 * But we check for OBJ_DEAD to ensure that we do not
		 * busy pages while vm_object_terminate_pages()
		 * processes the queue.
		 */
		error = EJUSTRETURN;
		goto out_pip;
	}

	resid = uio->uio_resid;
	off = uio->uio_offset;
	for (i = 0; resid > 0; i++) {
		MPASS(i < io_hold_cnt + 2);
		ma[i] = vm_page_grab_unlocked(obj, atop(off),
		    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
		    VM_ALLOC_NOWAIT);
		if (ma[i] == NULL)
			break;

		/*
		 * Skip invalid pages.  Valid mask can be partial only
		 * at EOF, and we clip later.
		 */
		if (vm_page_none_valid(ma[i])) {
			vm_page_sunbusy(ma[i]);
			break;
		}

		resid -= PAGE_SIZE;
		off += PAGE_SIZE;
	}
	if (i == 0) {
		error = EJUSTRETURN;
		goto out_pip;
	}

	/*
	 * Check VIRF_DOOMED after we busied our pages.  Since
	 * vgonel() terminates the vnode's vm_object, it cannot
	 * process past pages busied by us.
	 */
	if (VN_IS_DOOMED(vp)) {
		error = EJUSTRETURN;
		goto out;
	}

	resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
	if (resid > uio->uio_resid)
		resid = uio->uio_resid;

	/*
	 * Unlocked read of vnp_size is safe because truncation cannot
	 * pass busied page.  But we load vnp_size into a local
	 * variable so that possible concurrent extension does not
	 * break calculation.
	 */
#if defined(__powerpc__) && !defined(__powerpc64__)
	vsz = obj->un_pager.vnp.vnp_size;
#else
	vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
#endif
	if (uio->uio_offset >= vsz) {
		error = EJUSTRETURN;
		goto out;
	}
	if (uio->uio_offset + resid > vsz)
		resid = vsz - uio->uio_offset;

	error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);

out:
	for (j = 0; j < i; j++) {
		if (error == 0)
			vm_page_reference(ma[j]);
		vm_page_sunbusy(ma[j]);
	}
out_pip:
	vm_object_pip_wakeup(obj);
	if (error != 0)
		return (error);
	return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
}
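
/*
 * Illustrative sketch (not part of this file): a filesystem that sets
 * VIRF_PGREAD on its vnodes could implement its VOP_READ_PGCACHE method
 * by delegating short, resident reads to vn_read_from_obj() and returning
 * EJUSTRETURN so the caller falls back to the regular VOP_READ path.  The
 * "xxfs_" name and the exact argument-structure fields are assumptions;
 * real filesystems add their own checks here.
 *
 *	static int
 *	xxfs_read_pgcache(struct vop_read_pgcache_args *ap)
 *	{
 *		struct uio *uio = ap->a_uio;
 *
 *		if (uio->uio_resid > ptoa(io_hold_cnt + 2) ||
 *		    (ap->a_ioflag & IO_DIRECT) != 0)
 *			return (EJUSTRETURN);
 *		return (vn_read_from_obj(ap->a_vp, uio));
 *	}
 */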

/*
 * File table vnode read routine.
 */
static int
vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
    struct thread *td)
{
	struct vnode *vp;
	off_t orig_offset;
	int error, ioflag;
	int advice;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;

	/*
	 * Try to read from page cache.  VIRF_DOOMED check is racy but
	 * allows us to avoid unneeded work outright.
	 */
	if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
	    (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
		error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
		if (error == 0) {
			fp->f_nextoff[UIO_READ] = uio->uio_offset;
			return (0);
		}
		if (error != EJUSTRETURN)
			return (error);
	}

	advice = get_advice(fp, uio);
	vn_lock(vp, LK_SHARED | LK_RETRY);

	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_NOREUSE:
		ioflag |= sequential_heuristic(uio, fp);
		break;
	case POSIX_FADV_RANDOM:
		/* Disable read-ahead for random I/O. */
		break;
	}
	orig_offset = uio->uio_offset;

#ifdef MAC
	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	fp->f_nextoff[UIO_READ] = uio->uio_offset;
	VOP_UNLOCK(vp);
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	    orig_offset != uio->uio_offset)
		/*
		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
		 * for the backing file after a POSIX_FADV_NOREUSE
		 * read(2).
		 */
		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
		    POSIX_FADV_DONTNEED);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
    struct thread *td)
{
	struct vnode *vp;
	struct mount *mp;
	off_t orig_offset;
	int error, ioflag;
	int advice;
	bool need_finished_write;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
	vp = fp->f_vnode;
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND) != 0)
		ioflag |= IO_APPEND;
	if ((fp->f_flag & FNONBLOCK) != 0)
		ioflag |= IO_NDELAY;
	ioflag |= get_write_ioflag(fp);

	mp = NULL;
	need_finished_write = false;
	if (vp->v_type != VCHR) {
		error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
		if (error != 0)
			goto unlock;
		need_finished_write = true;
	}

	advice = get_advice(fp, uio);

	vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_NOREUSE:
		ioflag |= sequential_heuristic(uio, fp);
		break;
	case POSIX_FADV_RANDOM:
		/* XXX: Is this correct? */
		break;
	}
	orig_offset = uio->uio_offset;

#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
	VOP_UNLOCK(vp);
	if (need_finished_write)
		vn_finished_write(mp);
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	    orig_offset != uio->uio_offset)
		/*
		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
		 * for the backing file after a POSIX_FADV_NOREUSE
		 * write(2).
		 */
		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
		    POSIX_FADV_DONTNEED);
unlock:
	return (error);
}

/*
 * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
 * prevent the following deadlock:
 *
 * Assume that thread A reads from the vnode vp1 into userspace
 * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
 * currently not resident, then the system ends up with the call chain
 *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
 *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
 * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
 * If, at the same time, thread B reads from vnode vp2 into buffer buf2
 * backed by the pages of vnode vp1, and some page in buf2 is not
 * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
 *
 * To prevent the lock order reversal and deadlock, vn_io_fault() does
 * not allow page faults to happen during VOP_READ() or VOP_WRITE().
 * Instead, it first tries to do the whole range i/o with pagefaults
 * disabled.  If all pages in the i/o buffer are resident and mapped,
 * VOP will succeed (ignoring the genuine filesystem errors).
 * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
 * i/o in chunks, with all pages in the chunk prefaulted and held
 * using vm_fault_quick_hold_pages().
 *
 * Filesystems using this deadlock avoidance scheme should use the
 * array of the held pages from uio, saved in the curthread->td_ma,
 * instead of doing uiomove().  A helper function
 * vn_io_fault_uiomove() converts uiomove request into
 * uiomove_fromphys() over td_ma array.
 *
 * Since vnode locks do not cover the whole i/o anymore, rangelocks
 * make the current i/o request atomic with respect to other i/os and
 * truncations.
 */
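
/*
 * Illustrative sketch (not part of this file): a filesystem that sets
 * MNTK_NO_IOPF would replace plain uiomove() in the data-copy step of its
 * VOP_READ/VOP_WRITE with vn_io_fault_uiomove(), which transparently
 * falls back to uiomove() when the pages are not held.  The buffer names
 * below are hypothetical:
 *
 *	\* inside a hypothetical xxfs VOP_READ data-copy loop *\
 *	error = vn_io_fault_uiomove((char *)buf + page_offset,
 *	    (int)copy_len, uio);
 *
 * Everything else about the VOP stays the same; only the copy step must
 * avoid touching the user address space directly.
 */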

/*
 * Decode vn_io_fault_args and perform the corresponding i/o.
 */
static int
vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
    struct thread *td)
{
	int error, save;

	error = 0;
	save = vm_fault_disable_pagefaults();
	switch (args->kind) {
	case VN_IO_FAULT_FOP:
		error = (args->args.fop_args.doio)(args->args.fop_args.fp,
		    uio, args->cred, args->flags, td);
		break;
	case VN_IO_FAULT_VOP:
		switch (uio->uio_rw) {
		case UIO_READ:
			error = VOP_READ(args->args.vop_args.vp, uio,
			    args->flags, args->cred);
			break;
		case UIO_WRITE:
			error = VOP_WRITE(args->args.vop_args.vp, uio,
			    args->flags, args->cred);
			break;
		}
		break;
	default:
		panic("vn_io_fault_doio: unknown kind of io %d %d",
		    args->kind, uio->uio_rw);
	}
	vm_fault_enable_pagefaults(save);
	return (error);
}

static int
vn_io_fault_touch(char *base, const struct uio *uio)
{
	int r;

	r = fubyte(base);
	if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
		return (EFAULT);
	return (0);
}

static int
vn_io_fault_prefault_user(const struct uio *uio)
{
	char *base;
	const struct iovec *iov;
	size_t len;
	ssize_t resid;
	int error, i;

	KASSERT(uio->uio_segflg == UIO_USERSPACE,
	    ("vn_io_fault_prefault userspace"));

	error = i = 0;
	iov = uio->uio_iov;
	resid = uio->uio_resid;
	base = iov->iov_base;
	len = iov->iov_len;
	while (resid > 0) {
		error = vn_io_fault_touch(base, uio);
		if (error != 0)
			break;
		if (len < PAGE_SIZE) {
			if (len != 0) {
				error = vn_io_fault_touch(base + len - 1, uio);
				if (error != 0)
					break;
				resid -= len;
			}
			if (++i >= uio->uio_iovcnt)
				break;
			iov = uio->uio_iov + i;
			base = iov->iov_base;
			len = iov->iov_len;
		} else {
			len -= PAGE_SIZE;
			base += PAGE_SIZE;
			resid -= PAGE_SIZE;
		}
	}
	return (error);
}

/*
 * Common code for vn_io_fault(), agnostic to the kind of i/o request.
 * Uses vn_io_fault_doio() to make the call to an actual i/o function.
 * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
 * into args and call vn_io_fault1() to handle faults during the user
 * mode buffer accesses.
 */
static int
vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
    struct thread *td)
{
	vm_page_t ma[io_hold_cnt + 2];
	struct uio *uio_clone, short_uio;
	struct iovec short_iovec[1];
	vm_page_t *prev_td_ma;
	vm_prot_t prot;
	vm_offset_t addr, end;
	size_t len, resid;
	ssize_t adv;
	int error, cnt, saveheld, prev_td_ma_cnt;

	if (vn_io_fault_prefault) {
		error = vn_io_fault_prefault_user(uio);
		if (error != 0)
			return (error); /* Or ignore ? */
	}

	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;

	/*
	 * UFS follows the IO_UNIT directive and replays back both
	 * uio_offset and uio_resid if an error is encountered during the
	 * operation.  But, since the iovec may be already advanced,
	 * uio is still in an inconsistent state.
	 *
	 * Cache a copy of the original uio, which is advanced to the redo
	 * point using UIO_NOCOPY below.
	 */
	uio_clone = cloneuio(uio);
	resid = uio->uio_resid;

	short_uio.uio_segflg = UIO_USERSPACE;
	short_uio.uio_rw = uio->uio_rw;
	short_uio.uio_td = uio->uio_td;

	error = vn_io_fault_doio(args, uio, td);
	if (error != EFAULT)
		goto out;

	atomic_add_long(&vn_io_faults_cnt, 1);
	uio_clone->uio_segflg = UIO_NOCOPY;
	uiomove(NULL, resid - uio->uio_resid, uio_clone);
	uio_clone->uio_segflg = uio->uio_segflg;

	saveheld = curthread_pflags_set(TDP_UIOHELD);
	prev_td_ma = td->td_ma;
	prev_td_ma_cnt = td->td_ma_cnt;

	while (uio_clone->uio_resid != 0) {
		len = uio_clone->uio_iov->iov_len;
		if (len == 0) {
			KASSERT(uio_clone->uio_iovcnt >= 1,
			    ("iovcnt underflow"));
			uio_clone->uio_iov++;
			uio_clone->uio_iovcnt--;
			continue;
		}
		if (len > ptoa(io_hold_cnt))
			len = ptoa(io_hold_cnt);
		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
		end = round_page(addr + len);
		if (end < addr) {
			error = EFAULT;
			break;
		}
		/*
		 * A perfectly misaligned address and length could cause
		 * both the start and the end of the chunk to use partial
		 * page.  +2 accounts for such a situation.
		 */
		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
		    addr, len, prot, ma, io_hold_cnt + 2);
		if (cnt == -1) {
			error = EFAULT;
			break;
		}
		short_uio.uio_iov = &short_iovec[0];
		short_iovec[0].iov_base = (void *)addr;
		short_uio.uio_iovcnt = 1;
		short_uio.uio_resid = short_iovec[0].iov_len = len;
		short_uio.uio_offset = uio_clone->uio_offset;
		td->td_ma = ma;
		td->td_ma_cnt = cnt;

		error = vn_io_fault_doio(args, &short_uio, td);
		vm_page_unhold_pages(ma, cnt);
		adv = len - short_uio.uio_resid;

		uio_clone->uio_iov->iov_base =
		    (char *)uio_clone->uio_iov->iov_base + adv;
		uio_clone->uio_iov->iov_len -= adv;
		uio_clone->uio_resid -= adv;
		uio_clone->uio_offset += adv;

		uio->uio_resid -= adv;
		uio->uio_offset += adv;

		if (error != 0 || adv == 0)
			break;
	}
	td->td_ma = prev_td_ma;
	td->td_ma_cnt = prev_td_ma_cnt;
	curthread_pflags_restore(saveheld);
out:
	freeuio(uio_clone);
	return (error);
}

static int
vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	fo_rdwr_t *doio;
	struct vnode *vp;
	void *rl_cookie;
	struct vn_io_fault_args args;
	int error;
	bool do_io_fault, do_rangelock;

	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
	vp = fp->f_vnode;

	/*
	 * The ability to read(2) on a directory has historically been
	 * allowed for all users, but this can and has been the source of
	 * at least one security issue in the past.  As such, it is now
	 * hidden away behind a sysctl for those that actually need it,
	 * and restricted to root when it is turned on, to make it
	 * relatively safe to leave enabled for longer sessions.
	 */
	if (vp->v_type == VDIR) {
		KASSERT(uio->uio_rw == UIO_READ,
		    ("illegal write attempted on a directory"));
		if (!vfs_allow_read_dir)
			return (EISDIR);
		if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
			return (EISDIR);
	}

	do_io_fault = do_vn_io_fault(vp, uio);
	do_rangelock = do_io_fault || (vn_irflag_read(vp) & VIRF_PGREAD) != 0;
	foffset_lock_uio(fp, uio, flags);
	if (do_rangelock) {
		if (uio->uio_rw == UIO_READ) {
			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
			    uio->uio_offset + uio->uio_resid);
		} else if ((fp->f_flag & O_APPEND) != 0 ||
		    (flags & FOF_OFFSET) == 0) {
			/* For appenders, punt and lock the whole range. */
			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
		} else {
			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
			    uio->uio_offset + uio->uio_resid);
		}
	}
	if (do_io_fault) {
		args.kind = VN_IO_FAULT_FOP;
		args.args.fop_args.fp = fp;
		args.args.fop_args.doio = doio;
		args.cred = active_cred;
		args.flags = flags | FOF_OFFSET;
		error = vn_io_fault1(vp, uio, &args, td);
	} else {
		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
	}
	if (do_rangelock)
		vn_rangelock_unlock(vp, rl_cookie);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

/*
 * Helper function to perform the requested uiomove operation using
 * the held pages for io->uio_iov[0].iov_base buffer instead of
 * copyin/copyout.  Access to the pages with uiomove_fromphys()
 * instead of iov_base prevents page faults that could occur due to
 * pmap_collect() invalidating the mapping created by
 * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
 * object cleanup revoking the write access from page mappings.
 *
 * Filesystems that specify MNTK_NO_IOPF shall use vn_io_fault_uiomove()
 * instead of plain uiomove().
 */
int
vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
{
	struct uio transp_uio;
	struct iovec transp_iov[1];
	struct thread *td;
	size_t adv;
	int error, pgadv;

	td = curthread;
	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
	    uio->uio_segflg != UIO_USERSPACE)
		return (uiomove(data, xfersize, uio));

	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
	transp_iov[0].iov_base = data;
	transp_uio.uio_iov = &transp_iov[0];
	transp_uio.uio_iovcnt = 1;
	if (xfersize > uio->uio_resid)
		xfersize = uio->uio_resid;
	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
	transp_uio.uio_offset = 0;
	transp_uio.uio_segflg = UIO_SYSSPACE;
	/*
	 * Since transp_iov points to data, and td_ma page array
	 * corresponds to original uio->uio_iov, we need to invert the
	 * direction of the i/o operation as passed to
	 * uiomove_fromphys().
	 */
	switch (uio->uio_rw) {
	case UIO_WRITE:
		transp_uio.uio_rw = UIO_READ;
		break;
	case UIO_READ:
		transp_uio.uio_rw = UIO_WRITE;
		break;
	}
	transp_uio.uio_td = uio->uio_td;
	error = uiomove_fromphys(td->td_ma,
	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
	    xfersize, &transp_uio);
	adv = xfersize - transp_uio.uio_resid;
	pgadv =
	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
	td->td_ma += pgadv;
	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
	    pgadv));
	td->td_ma_cnt -= pgadv;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
	uio->uio_iov->iov_len -= adv;
	uio->uio_resid -= adv;
	uio->uio_offset += adv;
	return (error);
}

int
vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
    struct uio *uio)
{
	struct thread *td;
	vm_offset_t iov_base;
	int cnt, pgadv;

	td = curthread;
	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
	    uio->uio_segflg != UIO_USERSPACE)
		return (uiomove_fromphys(ma, offset, xfersize, uio));

	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
	cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
	iov_base = (vm_offset_t)uio->uio_iov->iov_base;
	switch (uio->uio_rw) {
	case UIO_WRITE:
		pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
		    offset, cnt);
		break;
	case UIO_READ:
		pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
		    cnt);
		break;
	}
	pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
	td->td_ma += pgadv;
	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
	    pgadv));
	td->td_ma_cnt -= pgadv;
	uio->uio_iov->iov_base = (char *)(iov_base + cnt);
	uio->uio_iov->iov_len -= cnt;
	uio->uio_resid -= cnt;
	uio->uio_offset += cnt;
	return (0);
}

/*
 * File table truncate routine.
 */
static int
vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct mount *mp;
	struct vnode *vp;
	void *rl_cookie;
	int error;

	vp = fp->f_vnode;

retry:
	/*
	 * Lock the whole range for truncation.  Otherwise split i/o
	 * might happen partly before and partly after the truncation.
	 */
	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
	error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
	if (error)
		goto out1;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	AUDIT_ARG_VNODE1(vp);
	if (vp->v_type == VDIR) {
		error = EISDIR;
		goto out;
	}
#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error)
		goto out;
#endif
	error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
	    fp->f_cred);
out:
	VOP_UNLOCK(vp);
	vn_finished_write(mp);
out1:
	vn_rangelock_unlock(vp, rl_cookie);
	if (error == ERELOOKUP)
		goto retry;
	return (error);
}

/*
 * Truncate a file that is already locked.
 */
int
vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
    struct ucred *cred)
{
	struct vattr vattr;
	int error;

	error = VOP_ADD_WRITECOUNT(vp, 1);
	if (error == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		if (sync)
			vattr.va_vaflags |= VA_SYNC;
		error = VOP_SETATTR(vp, &vattr, cred);
		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
	}
	return (error);
}

/*
 * File table vnode stat routine.
 */
int
vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_STAT(vp, sb, active_cred, fp->f_cred);
	VOP_UNLOCK(vp);

	return (error);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{
	struct vnode *vp;
	struct fiobmap2_arg *bmarg;
	off_t size;
	int error;

	vp = fp->f_vnode;
	switch (vp->v_type) {
	case VDIR:
	case VREG:
		switch (com) {
		case FIONREAD:
			error = vn_getsize(vp, &size, active_cred);
			if (error == 0)
				*(int *)data = size - fp->f_offset;
			return (error);
		case FIOBMAP2:
			bmarg = (struct fiobmap2_arg *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
#ifdef MAC
			error = mac_vnode_check_read(active_cred, fp->f_cred,
			    vp);
			if (error == 0)
#endif
				error = VOP_BMAP(vp, bmarg->bn, NULL,
				    &bmarg->bn, &bmarg->runp, &bmarg->runb);
			VOP_UNLOCK(vp);
			return (error);
		case FIONBIO:
		case FIOASYNC:
			return (0);
		default:
			return (VOP_IOCTL(vp, com, data, fp->f_flag,
			    active_cred, td));
		}
		break;
	case VCHR:
		return (VOP_IOCTL(vp, com, data, fp->f_flag,
		    active_cred, td));
	default:
		return (ENOTTY);
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct vnode *vp;
	int error;

	vp = fp->f_vnode;
#if defined(MAC) || defined(AUDIT)
	if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		AUDIT_ARG_VNODE1(vp);
		error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
		VOP_UNLOCK(vp);
		if (error != 0)
			return (error);
	}
#endif
	error = VOP_POLL(vp, events, fp->f_cred, td);
	return (error);
}

/*
 * Acquire the requested lock and then check for validity.  LK_RETRY
 * permits vn_lock to return doomed vnodes.
 */
static int __noinline
_vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
    int error)
{

	KASSERT((flags & LK_RETRY) == 0 || error == 0,
	    ("vn_lock: error %d incompatible with flags %#x", error, flags));

	if (error == 0)
		VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));

	if ((flags & LK_RETRY) == 0) {
		if (error == 0) {
			VOP_UNLOCK(vp);
			error = ENOENT;
		}
		return (error);
	}

	/*
	 * LK_RETRY case.
	 *
	 * Nothing to do if we got the lock.
	 */
	if (error == 0)
		return (0);

	/*
	 * Interlock was dropped by the call in _vn_lock.
	 */
	flags &= ~LK_INTERLOCK;
	do {
		error = VOP_LOCK1(vp, flags, file, line);
	} while (error != 0);
	return (0);
}

int
_vn_lock(struct vnode *vp, int flags, const char *file, int line)
{
	int error;

	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
	    ("vn_lock: no locktype (%d passed)", flags));
	VNPASS(vp->v_holdcnt > 0, vp);
	error = VOP_LOCK1(vp, flags, file, line);
	if (__predict_false(error != 0 || VN_IS_DOOMED(vp)))
		return (_vn_lock_fallback(vp, flags, file, line, error));
	return (0);
}
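
/*
 * Usage note (illustrative, not part of the original file): without
 * LK_RETRY the caller must be prepared for ENOENT when the vnode was
 * reclaimed while waiting for the lock, while LK_RETRY always returns
 * with the lock held and leaves the doomed-vnode check to the caller:
 *
 *	if (vn_lock(vp, LK_EXCLUSIVE) == 0) {
 *		\* vp is locked and was not doomed at lock time *\
 *		VOP_UNLOCK(vp);
 *	}
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);
 *	if (VN_IS_DOOMED(vp)) {
 *		\* forcibly unmounted or reclaimed; bail out *\
 *	}
 */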
2003 * As long as a vnode is not provided we need to acquire a 2004 * refcount for the provided mountpoint too, in order to 2005 * emulate a vfs_ref(). 2006 */ 2007 if (vp == NULL) 2008 vfs_ref(mp); 2009 2010 error = vn_start_write_refed(mp, flags, false); 2011 if (error != 0 && (flags & V_NOWAIT) == 0) 2012 *mpp = NULL; 2013 return (error); 2014 } 2015 2016 /* 2017 * Secondary suspension. Used by operations such as vop_inactive 2018 * routines that are needed by the higher level functions. These 2019 * are allowed to proceed until all the higher level functions have 2020 * completed (indicated by mnt_writeopcount dropping to zero). At that 2021 * time, these operations are halted until the suspension is over. 2022 */ 2023 int 2024 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) 2025 { 2026 struct mount *mp; 2027 int error, mflags; 2028 2029 KASSERT((flags & (~V_VALID_FLAGS | V_XSLEEP)) == 0, 2030 ("%s: invalid flags passed %d\n", __func__, flags)); 2031 2032 retry: 2033 if (vp != NULL) { 2034 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { 2035 *mpp = NULL; 2036 if (error != EOPNOTSUPP) 2037 return (error); 2038 return (0); 2039 } 2040 } 2041 /* 2042 * If we are not suspended or have not yet reached suspended 2043 * mode, then let the operation proceed. 2044 */ 2045 if ((mp = *mpp) == NULL) 2046 return (0); 2047 2048 /* 2049 * VOP_GETWRITEMOUNT() returns with the mp refcount held through 2050 * a vfs_ref(). 2051 * As long as a vnode is not provided we need to acquire a 2052 * refcount for the provided mountpoint too, in order to 2053 * emulate a vfs_ref(). 2054 */ 2055 MNT_ILOCK(mp); 2056 if (vp == NULL) 2057 MNT_REF(mp); 2058 if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { 2059 mp->mnt_secondary_writes++; 2060 mp->mnt_secondary_accwrites++; 2061 MNT_IUNLOCK(mp); 2062 return (0); 2063 } 2064 if ((flags & V_NOWAIT) != 0) { 2065 MNT_REL(mp); 2066 MNT_IUNLOCK(mp); 2067 *mpp = NULL; 2068 return (EWOULDBLOCK); 2069 } 2070 /* 2071 * Wait for the suspension to finish. 2072 */ 2073 mflags = 0; 2074 if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) { 2075 if ((flags & V_PCATCH) != 0) 2076 mflags |= PCATCH; 2077 } 2078 mflags |= (PUSER - 1) | PDROP; 2079 error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); 2080 vfs_rel(mp); 2081 if (error == 0) 2082 goto retry; 2083 *mpp = NULL; 2084 return (error); 2085 } 2086 2087 /* 2088 * Filesystem write operation has completed. If we are suspending and this 2089 * operation is the last one, notify the suspender that the suspension is 2090 * now in effect. 2091 */ 2092 void 2093 vn_finished_write(struct mount *mp) 2094 { 2095 struct mount_pcpu *mpcpu; 2096 int c; 2097 2098 if (mp == NULL) 2099 return; 2100 2101 if (vfs_op_thread_enter(mp, mpcpu)) { 2102 vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1); 2103 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 2104 vfs_op_thread_exit(mp, mpcpu); 2105 return; 2106 } 2107 2108 MNT_ILOCK(mp); 2109 vfs_assert_mount_counters(mp); 2110 MNT_REL(mp); 2111 c = --mp->mnt_writeopcount; 2112 if (mp->mnt_vfs_ops == 0) { 2113 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); 2114 MNT_IUNLOCK(mp); 2115 return; 2116 } 2117 if (c < 0) 2118 vfs_dump_mount_counters(mp); 2119 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0) 2120 wakeup(&mp->mnt_writeopcount); 2121 MNT_IUNLOCK(mp); 2122 } 2123 2124 /* 2125 * Filesystem secondary write operation has completed. If we are 2126 * suspending and this operation is the last one, notify the suspender 2127 * that the suspension is now in effect. 
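 *
 * For illustration, a secondary writer (e.g. a VOP_INACTIVE()
 * implementation) might bracket its work roughly as follows, where
 * do_inactive_work() is a hypothetical placeholder for the
 * filesystem-specific part:
 *
 *	struct mount *mp;
 *	int error;
 *
 *	error = vn_start_secondary_write(vp, &mp, V_WAIT);
 *	if (error != 0)
 *		return (error);
 *	do_inactive_work(vp);
 *	vn_finished_secondary_write(mp);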
2128 */ 2129 void 2130 vn_finished_secondary_write(struct mount *mp) 2131 { 2132 if (mp == NULL) 2133 return; 2134 MNT_ILOCK(mp); 2135 MNT_REL(mp); 2136 mp->mnt_secondary_writes--; 2137 if (mp->mnt_secondary_writes < 0) 2138 panic("vn_finished_secondary_write: neg cnt"); 2139 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && 2140 mp->mnt_secondary_writes <= 0) 2141 wakeup(&mp->mnt_secondary_writes); 2142 MNT_IUNLOCK(mp); 2143 } 2144 2145 /* 2146 * Request a filesystem to suspend write operations. 2147 */ 2148 int 2149 vfs_write_suspend(struct mount *mp, int flags) 2150 { 2151 int error; 2152 2153 vfs_op_enter(mp); 2154 2155 MNT_ILOCK(mp); 2156 vfs_assert_mount_counters(mp); 2157 if (mp->mnt_susp_owner == curthread) { 2158 vfs_op_exit_locked(mp); 2159 MNT_IUNLOCK(mp); 2160 return (EALREADY); 2161 } 2162 while (mp->mnt_kern_flag & MNTK_SUSPEND) 2163 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); 2164 2165 /* 2166 * Unmount holds a write reference on the mount point. If we 2167 * own busy reference and drain for writers, we deadlock with 2168 * the reference draining in the unmount path. Callers of 2169 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if 2170 * vfs_busy() reference is owned and caller is not in the 2171 * unmount context. 2172 */ 2173 if ((flags & VS_SKIP_UNMOUNT) != 0 && 2174 (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { 2175 vfs_op_exit_locked(mp); 2176 MNT_IUNLOCK(mp); 2177 return (EBUSY); 2178 } 2179 2180 mp->mnt_kern_flag |= MNTK_SUSPEND; 2181 mp->mnt_susp_owner = curthread; 2182 if (mp->mnt_writeopcount > 0) 2183 (void) msleep(&mp->mnt_writeopcount, 2184 MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); 2185 else 2186 MNT_IUNLOCK(mp); 2187 if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) { 2188 vfs_write_resume(mp, 0); 2189 /* vfs_write_resume does vfs_op_exit() for us */ 2190 } 2191 return (error); 2192 } 2193 2194 /* 2195 * Request a filesystem to resume write operations. 2196 */ 2197 void 2198 vfs_write_resume(struct mount *mp, int flags) 2199 { 2200 2201 MNT_ILOCK(mp); 2202 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { 2203 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); 2204 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | 2205 MNTK_SUSPENDED); 2206 mp->mnt_susp_owner = NULL; 2207 wakeup(&mp->mnt_writeopcount); 2208 wakeup(&mp->mnt_flag); 2209 curthread->td_pflags &= ~TDP_IGNSUSP; 2210 if ((flags & VR_START_WRITE) != 0) { 2211 MNT_REF(mp); 2212 mp->mnt_writeopcount++; 2213 } 2214 MNT_IUNLOCK(mp); 2215 if ((flags & VR_NO_SUSPCLR) == 0) 2216 VFS_SUSP_CLEAN(mp); 2217 vfs_op_exit(mp); 2218 } else if ((flags & VR_START_WRITE) != 0) { 2219 MNT_REF(mp); 2220 vn_start_write_refed(mp, 0, true); 2221 } else { 2222 MNT_IUNLOCK(mp); 2223 } 2224 } 2225 2226 /* 2227 * Helper loop around vfs_write_suspend() for filesystem unmount VFS 2228 * methods. 2229 */ 2230 int 2231 vfs_write_suspend_umnt(struct mount *mp) 2232 { 2233 int error; 2234 2235 KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, 2236 ("vfs_write_suspend_umnt: recursed")); 2237 2238 /* dounmount() already called vn_start_write(). 
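	 * The loop below drops that write count before each suspension
	 * attempt and re-acquires it whenever the attempt fails or has
	 * to be retried.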
*/ 2239 for (;;) { 2240 vn_finished_write(mp); 2241 error = vfs_write_suspend(mp, 0); 2242 if (error != 0) { 2243 vn_start_write(NULL, &mp, V_WAIT); 2244 return (error); 2245 } 2246 MNT_ILOCK(mp); 2247 if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) 2248 break; 2249 MNT_IUNLOCK(mp); 2250 vn_start_write(NULL, &mp, V_WAIT); 2251 } 2252 mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); 2253 wakeup(&mp->mnt_flag); 2254 MNT_IUNLOCK(mp); 2255 curthread->td_pflags |= TDP_IGNSUSP; 2256 return (0); 2257 } 2258 2259 /* 2260 * Implement kqueues for files by translating it to vnode operation. 2261 */ 2262 static int 2263 vn_kqfilter(struct file *fp, struct knote *kn) 2264 { 2265 2266 return (VOP_KQFILTER(fp->f_vnode, kn)); 2267 } 2268 2269 int 2270 vn_kqfilter_opath(struct file *fp, struct knote *kn) 2271 { 2272 if ((fp->f_flag & FKQALLOWED) == 0) 2273 return (EBADF); 2274 return (vn_kqfilter(fp, kn)); 2275 } 2276 2277 /* 2278 * Simplified in-kernel wrapper calls for extended attribute access. 2279 * Both calls pass in a NULL credential, authorizing as "kernel" access. 2280 * Set IO_NODELOCKED in ioflg if the vnode is already locked. 2281 */ 2282 int 2283 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, 2284 const char *attrname, int *buflen, char *buf, struct thread *td) 2285 { 2286 struct uio auio; 2287 struct iovec iov; 2288 int error; 2289 2290 iov.iov_len = *buflen; 2291 iov.iov_base = buf; 2292 2293 auio.uio_iov = &iov; 2294 auio.uio_iovcnt = 1; 2295 auio.uio_rw = UIO_READ; 2296 auio.uio_segflg = UIO_SYSSPACE; 2297 auio.uio_td = td; 2298 auio.uio_offset = 0; 2299 auio.uio_resid = *buflen; 2300 2301 if ((ioflg & IO_NODELOCKED) == 0) 2302 vn_lock(vp, LK_SHARED | LK_RETRY); 2303 2304 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); 2305 2306 /* authorize attribute retrieval as kernel */ 2307 error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, 2308 td); 2309 2310 if ((ioflg & IO_NODELOCKED) == 0) 2311 VOP_UNLOCK(vp); 2312 2313 if (error == 0) { 2314 *buflen = *buflen - auio.uio_resid; 2315 } 2316 2317 return (error); 2318 } 2319 2320 /* 2321 * XXX failure mode if partially written? 
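 *
 * For illustration, an in-kernel consumer might use the wrappers
 * roughly as follows; the attribute name is hypothetical and the
 * vnode is assumed to be unlocked, so IO_NODELOCKED is not passed:
 *
 *	char buf[16] = "value";
 *	int buflen = sizeof(buf);
 *
 *	error = vn_extattr_set(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "example", buflen, buf, td);
 *	if (error == 0)
 *		error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *		    "example", &buflen, buf, td);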
2322 */ 2323 int 2324 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, 2325 const char *attrname, int buflen, char *buf, struct thread *td) 2326 { 2327 struct uio auio; 2328 struct iovec iov; 2329 struct mount *mp; 2330 int error; 2331 2332 iov.iov_len = buflen; 2333 iov.iov_base = buf; 2334 2335 auio.uio_iov = &iov; 2336 auio.uio_iovcnt = 1; 2337 auio.uio_rw = UIO_WRITE; 2338 auio.uio_segflg = UIO_SYSSPACE; 2339 auio.uio_td = td; 2340 auio.uio_offset = 0; 2341 auio.uio_resid = buflen; 2342 2343 if ((ioflg & IO_NODELOCKED) == 0) { 2344 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) 2345 return (error); 2346 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2347 } 2348 2349 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); 2350 2351 /* authorize attribute setting as kernel */ 2352 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); 2353 2354 if ((ioflg & IO_NODELOCKED) == 0) { 2355 vn_finished_write(mp); 2356 VOP_UNLOCK(vp); 2357 } 2358 2359 return (error); 2360 } 2361 2362 int 2363 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, 2364 const char *attrname, struct thread *td) 2365 { 2366 struct mount *mp; 2367 int error; 2368 2369 if ((ioflg & IO_NODELOCKED) == 0) { 2370 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) 2371 return (error); 2372 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2373 } 2374 2375 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); 2376 2377 /* authorize attribute removal as kernel */ 2378 error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); 2379 if (error == EOPNOTSUPP) 2380 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, 2381 NULL, td); 2382 2383 if ((ioflg & IO_NODELOCKED) == 0) { 2384 vn_finished_write(mp); 2385 VOP_UNLOCK(vp); 2386 } 2387 2388 return (error); 2389 } 2390 2391 static int 2392 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, 2393 struct vnode **rvp) 2394 { 2395 2396 return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); 2397 } 2398 2399 int 2400 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) 2401 { 2402 2403 return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, 2404 lkflags, rvp)); 2405 } 2406 2407 int 2408 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, 2409 int lkflags, struct vnode **rvp) 2410 { 2411 struct mount *mp; 2412 int ltype, error; 2413 2414 ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get"); 2415 mp = vp->v_mount; 2416 ltype = VOP_ISLOCKED(vp); 2417 KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, 2418 ("vn_vget_ino: vp not locked")); 2419 error = vfs_busy(mp, MBF_NOWAIT); 2420 if (error != 0) { 2421 vfs_ref(mp); 2422 VOP_UNLOCK(vp); 2423 error = vfs_busy(mp, 0); 2424 vn_lock(vp, ltype | LK_RETRY); 2425 vfs_rel(mp); 2426 if (error != 0) 2427 return (ENOENT); 2428 if (VN_IS_DOOMED(vp)) { 2429 vfs_unbusy(mp); 2430 return (ENOENT); 2431 } 2432 } 2433 VOP_UNLOCK(vp); 2434 error = alloc(mp, alloc_arg, lkflags, rvp); 2435 vfs_unbusy(mp); 2436 if (error != 0 || *rvp != vp) 2437 vn_lock(vp, ltype | LK_RETRY); 2438 if (VN_IS_DOOMED(vp)) { 2439 if (error == 0) { 2440 if (*rvp == vp) 2441 vunref(vp); 2442 else 2443 vput(*rvp); 2444 } 2445 error = ENOENT; 2446 } 2447 return (error); 2448 } 2449 2450 static void 2451 vn_send_sigxfsz(struct proc *p) 2452 { 2453 PROC_LOCK(p); 2454 kern_psignal(p, SIGXFSZ); 2455 PROC_UNLOCK(p); 2456 } 2457 2458 int 2459 vn_rlimit_trunc(u_quad_t size, struct thread *td) 2460 { 2461 if (size <= lim_cur(td, RLIMIT_FSIZE)) 2462 return (0); 2463 vn_send_sigxfsz(td->td_proc); 2464 return 
(EFBIG); 2465 } 2466 2467 static int 2468 vn_rlimit_fsizex1(const struct vnode *vp, struct uio *uio, off_t maxfsz, 2469 bool adj, struct thread *td) 2470 { 2471 off_t lim; 2472 bool ktr_write; 2473 2474 if (vp->v_type != VREG) 2475 return (0); 2476 2477 /* 2478 * Handle file system maximum file size. 2479 */ 2480 if (maxfsz != 0 && uio->uio_offset + uio->uio_resid > maxfsz) { 2481 if (!adj || uio->uio_offset >= maxfsz) 2482 return (EFBIG); 2483 uio->uio_resid = maxfsz - uio->uio_offset; 2484 } 2485 2486 /* 2487 * This is a kernel write (e.g. vnode_pager) or an accounting 2488 * write; ignore the limit. 2489 */ 2490 if (td == NULL || (td->td_pflags2 & TDP2_ACCT) != 0) 2491 return (0); 2492 2493 /* 2494 * Calculate file size limit. 2495 */ 2496 ktr_write = (td->td_pflags & TDP_INKTRACE) != 0; 2497 lim = __predict_false(ktr_write) ? td->td_ktr_io_lim : 2498 lim_cur(td, RLIMIT_FSIZE); 2499 2500 /* 2501 * Is the limit reached? 2502 */ 2503 if (__predict_true((uoff_t)uio->uio_offset + uio->uio_resid <= lim)) 2504 return (0); 2505 2506 /* 2507 * Prepared filesystems can handle writes truncated to the 2508 * file size limit. 2509 */ 2510 if (adj && (uoff_t)uio->uio_offset < lim) { 2511 uio->uio_resid = lim - (uoff_t)uio->uio_offset; 2512 return (0); 2513 } 2514 2515 if (!ktr_write || ktr_filesize_limit_signal) 2516 vn_send_sigxfsz(td->td_proc); 2517 return (EFBIG); 2518 } 2519 2520 /* 2521 * Helper for VOP_WRITE() implementations: the common code to 2522 * handle the maximum supported file size on the filesystem and 2523 * RLIMIT_FSIZE, except for special writes from the accounting subsystem 2524 * and ktrace. 2525 * 2526 * For maximum file size (maxfsz argument): 2527 * - return EFBIG if uio_offset is beyond it 2528 * - otherwise, clamp uio_resid if write would extend file beyond maxfsz. 2529 * 2530 * For RLIMIT_FSIZE: 2531 * - return EFBIG and send SIGXFSZ if uio_offset is beyond the limit 2532 * - otherwise, clamp uio_resid if write would extend file beyond limit. 2533 * 2534 * If clamping occurred, the adjustment for uio_resid is stored in 2535 * *resid_adj, to be re-applied by vn_rlimit_fsizex_res() on return 2536 * from the VOP.
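 *
 * For illustration, a VOP_WRITE() implementation might use the pair
 * roughly as follows, where max_file_size stands for the filesystem's
 * own limit and error handling is abbreviated:
 *
 *	ssize_t resid_adj;
 *
 *	error = vn_rlimit_fsizex(vp, uio, max_file_size, &resid_adj,
 *	    uio->uio_td);
 *	if (error != 0)
 *		return (error);
 *	... perform the write with the possibly clamped uio ...
 *	vn_rlimit_fsizex_res(uio, resid_adj);
 *	return (error);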
2537 */ 2538 int 2539 vn_rlimit_fsizex(const struct vnode *vp, struct uio *uio, off_t maxfsz, 2540 ssize_t *resid_adj, struct thread *td) 2541 { 2542 ssize_t resid_orig; 2543 int error; 2544 bool adj; 2545 2546 resid_orig = uio->uio_resid; 2547 adj = resid_adj != NULL; 2548 error = vn_rlimit_fsizex1(vp, uio, maxfsz, adj, td); 2549 if (adj) 2550 *resid_adj = resid_orig - uio->uio_resid; 2551 return (error); 2552 } 2553 2554 void 2555 vn_rlimit_fsizex_res(struct uio *uio, ssize_t resid_adj) 2556 { 2557 uio->uio_resid += resid_adj; 2558 } 2559 2560 int 2561 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, 2562 struct thread *td) 2563 { 2564 return (vn_rlimit_fsizex(vp, __DECONST(struct uio *, uio), 0, NULL, 2565 td)); 2566 } 2567 2568 int 2569 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 2570 struct thread *td) 2571 { 2572 struct vnode *vp; 2573 2574 vp = fp->f_vnode; 2575 #ifdef AUDIT 2576 vn_lock(vp, LK_SHARED | LK_RETRY); 2577 AUDIT_ARG_VNODE1(vp); 2578 VOP_UNLOCK(vp); 2579 #endif 2580 return (setfmode(td, active_cred, vp, mode)); 2581 } 2582 2583 int 2584 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 2585 struct thread *td) 2586 { 2587 struct vnode *vp; 2588 2589 vp = fp->f_vnode; 2590 #ifdef AUDIT 2591 vn_lock(vp, LK_SHARED | LK_RETRY); 2592 AUDIT_ARG_VNODE1(vp); 2593 VOP_UNLOCK(vp); 2594 #endif 2595 return (setfown(td, active_cred, vp, uid, gid)); 2596 } 2597 2598 /* 2599 * Remove pages in the range ["start", "end") from the vnode's VM object. If 2600 * "end" is 0, then the range extends to the end of the object. 2601 */ 2602 void 2603 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) 2604 { 2605 vm_object_t object; 2606 2607 if ((object = vp->v_object) == NULL) 2608 return; 2609 VM_OBJECT_WLOCK(object); 2610 vm_object_page_remove(object, start, end, 0); 2611 VM_OBJECT_WUNLOCK(object); 2612 } 2613 2614 /* 2615 * Like vn_pages_remove(), but skips invalid pages, which by definition are not 2616 * mapped into any process' address space. Filesystems may use this in 2617 * preference to vn_pages_remove() to avoid blocking on pages busied in 2618 * preparation for a VOP_GETPAGES. 2619 */ 2620 void 2621 vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) 2622 { 2623 vm_object_t object; 2624 2625 if ((object = vp->v_object) == NULL) 2626 return; 2627 VM_OBJECT_WLOCK(object); 2628 vm_object_page_remove(object, start, end, OBJPR_VALIDONLY); 2629 VM_OBJECT_WUNLOCK(object); 2630 } 2631 2632 int 2633 vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off, 2634 struct ucred *cred) 2635 { 2636 off_t size; 2637 daddr_t bn, bnp; 2638 uint64_t bsize; 2639 off_t noff; 2640 int error; 2641 2642 KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, 2643 ("%s: Wrong command %lu", __func__, cmd)); 2644 ASSERT_VOP_ELOCKED(vp, "vn_bmap_seekhole_locked"); 2645 2646 if (vp->v_type != VREG) { 2647 error = ENOTTY; 2648 goto out; 2649 } 2650 error = vn_getsize_locked(vp, &size, cred); 2651 if (error != 0) 2652 goto out; 2653 noff = *off; 2654 if (noff < 0 || noff >= size) { 2655 error = ENXIO; 2656 goto out; 2657 } 2658 2659 /* See the comment in ufs_bmap_seekdata(). 
*/ 2660 vnode_pager_clean_sync(vp); 2661 2662 bsize = vp->v_mount->mnt_stat.f_iosize; 2663 for (bn = noff / bsize; noff < size; bn++, noff += bsize - 2664 noff % bsize) { 2665 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); 2666 if (error == EOPNOTSUPP) { 2667 error = ENOTTY; 2668 goto out; 2669 } 2670 if ((bnp == -1 && cmd == FIOSEEKHOLE) || 2671 (bnp != -1 && cmd == FIOSEEKDATA)) { 2672 noff = bn * bsize; 2673 if (noff < *off) 2674 noff = *off; 2675 goto out; 2676 } 2677 } 2678 if (noff > size) 2679 noff = size; 2680 /* noff == size. There is an implicit hole at the end of file. */ 2681 if (cmd == FIOSEEKDATA) 2682 error = ENXIO; 2683 out: 2684 if (error == 0) 2685 *off = noff; 2686 return (error); 2687 } 2688 2689 int 2690 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) 2691 { 2692 int error; 2693 2694 KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, 2695 ("%s: Wrong command %lu", __func__, cmd)); 2696 2697 if (vn_lock(vp, LK_EXCLUSIVE) != 0) 2698 return (EBADF); 2699 error = vn_bmap_seekhole_locked(vp, cmd, off, cred); 2700 VOP_UNLOCK(vp); 2701 return (error); 2702 } 2703 2704 int 2705 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) 2706 { 2707 struct ucred *cred; 2708 struct vnode *vp; 2709 off_t foffset, fsize, size; 2710 int error, noneg; 2711 2712 cred = td->td_ucred; 2713 vp = fp->f_vnode; 2714 noneg = (vp->v_type != VCHR); 2715 /* 2716 * Try to dodge locking for the common case of querying the offset. 2717 */ 2718 if (whence == L_INCR && offset == 0) { 2719 foffset = foffset_read(fp); 2720 if (__predict_false(foffset < 0 && noneg)) { 2721 return (EOVERFLOW); 2722 } 2723 td->td_uretoff.tdu_off = foffset; 2724 return (0); 2725 } 2726 foffset = foffset_lock(fp, 0); 2727 error = 0; 2728 switch (whence) { 2729 case L_INCR: 2730 if (noneg && 2731 (foffset < 0 || 2732 (offset > 0 && foffset > OFF_MAX - offset))) { 2733 error = EOVERFLOW; 2734 break; 2735 } 2736 offset += foffset; 2737 break; 2738 case L_XTND: 2739 error = vn_getsize(vp, &fsize, cred); 2740 if (error != 0) 2741 break; 2742 2743 /* 2744 * If the file references a disk device, then fetch 2745 * the media size and use that to determine the ending 2746 * offset. 2747 */ 2748 if (fsize == 0 && vp->v_type == VCHR && 2749 fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) 2750 fsize = size; 2751 if (noneg && offset > 0 && fsize > OFF_MAX - offset) { 2752 error = EOVERFLOW; 2753 break; 2754 } 2755 offset += fsize; 2756 break; 2757 case L_SET: 2758 break; 2759 case SEEK_DATA: 2760 error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); 2761 if (error == ENOTTY) 2762 error = EINVAL; 2763 break; 2764 case SEEK_HOLE: 2765 error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); 2766 if (error == ENOTTY) 2767 error = EINVAL; 2768 break; 2769 default: 2770 error = EINVAL; 2771 } 2772 if (error == 0 && noneg && offset < 0) 2773 error = EINVAL; 2774 if (error != 0) 2775 goto drop; 2776 VFS_KNOTE_UNLOCKED(vp, 0); 2777 td->td_uretoff.tdu_off = offset; 2778 drop: 2779 foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); 2780 return (error); 2781 } 2782 2783 int 2784 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, 2785 struct thread *td) 2786 { 2787 int error; 2788 2789 /* 2790 * Grant permission if the caller is the owner of the file, or 2791 * the super-user, or has ACL_WRITE_ATTRIBUTES permission 2792 * on the file. If the time pointer is null, then write 2793 * permission on the file is also sufficient.
2794 * 2795 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: 2796 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES 2797 * will be allowed to set the times [..] to the current 2798 * server time. 2799 */ 2800 error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); 2801 if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) 2802 error = VOP_ACCESS(vp, VWRITE, cred, td); 2803 return (error); 2804 } 2805 2806 int 2807 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 2808 { 2809 struct vnode *vp; 2810 int error; 2811 2812 if (fp->f_type == DTYPE_FIFO) 2813 kif->kf_type = KF_TYPE_FIFO; 2814 else 2815 kif->kf_type = KF_TYPE_VNODE; 2816 vp = fp->f_vnode; 2817 vref(vp); 2818 FILEDESC_SUNLOCK(fdp); 2819 error = vn_fill_kinfo_vnode(vp, kif); 2820 vrele(vp); 2821 FILEDESC_SLOCK(fdp); 2822 return (error); 2823 } 2824 2825 static inline void 2826 vn_fill_junk(struct kinfo_file *kif) 2827 { 2828 size_t len, olen; 2829 2830 /* 2831 * Simulate vn_fullpath returning changing values for a given 2832 * vp during e.g. coredump. 2833 */ 2834 len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; 2835 olen = strlen(kif->kf_path); 2836 if (len < olen) 2837 strcpy(&kif->kf_path[len - 1], "$"); 2838 else 2839 for (; olen < len; olen++) 2840 strcpy(&kif->kf_path[olen], "A"); 2841 } 2842 2843 int 2844 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) 2845 { 2846 struct vattr va; 2847 char *fullpath, *freepath; 2848 int error; 2849 2850 kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); 2851 freepath = NULL; 2852 fullpath = "-"; 2853 error = vn_fullpath(vp, &fullpath, &freepath); 2854 if (error == 0) { 2855 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); 2856 } 2857 if (freepath != NULL) 2858 free(freepath, M_TEMP); 2859 2860 KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, 2861 vn_fill_junk(kif); 2862 ); 2863 2864 /* 2865 * Retrieve vnode attributes. 2866 */ 2867 va.va_fsid = VNOVAL; 2868 va.va_rdev = NODEV; 2869 vn_lock(vp, LK_SHARED | LK_RETRY); 2870 error = VOP_GETATTR(vp, &va, curthread->td_ucred); 2871 VOP_UNLOCK(vp); 2872 if (error != 0) 2873 return (error); 2874 if (va.va_fsid != VNOVAL) 2875 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; 2876 else 2877 kif->kf_un.kf_file.kf_file_fsid = 2878 vp->v_mount->mnt_stat.f_fsid.val[0]; 2879 kif->kf_un.kf_file.kf_file_fsid_freebsd11 = 2880 kif->kf_un.kf_file.kf_file_fsid; /* truncate */ 2881 kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; 2882 kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); 2883 kif->kf_un.kf_file.kf_file_size = va.va_size; 2884 kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; 2885 kif->kf_un.kf_file.kf_file_rdev_freebsd11 = 2886 kif->kf_un.kf_file.kf_file_rdev; /* truncate */ 2887 kif->kf_un.kf_file.kf_file_nlink = va.va_nlink; 2888 return (0); 2889 } 2890 2891 int 2892 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, 2893 vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, 2894 struct thread *td) 2895 { 2896 #ifdef HWPMC_HOOKS 2897 struct pmckern_map_in pkm; 2898 #endif 2899 struct mount *mp; 2900 struct vnode *vp; 2901 vm_object_t object; 2902 vm_prot_t maxprot; 2903 boolean_t writecounted; 2904 int error; 2905 2906 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ 2907 defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) 2908 /* 2909 * POSIX shared-memory objects are defined to have 2910 * kernel persistence, and are not defined to support 2911 * read(2)/write(2) -- or even open(2). 
Thus, we can 2912 * use MAP_ASYNC to trade on-disk coherence for speed. 2913 * The shm_open(3) library routine turns on the FPOSIXSHM 2914 * flag to request this behavior. 2915 */ 2916 if ((fp->f_flag & FPOSIXSHM) != 0) 2917 flags |= MAP_NOSYNC; 2918 #endif 2919 vp = fp->f_vnode; 2920 2921 /* 2922 * Ensure that file and memory protections are 2923 * compatible. Note that we only worry about 2924 * writability if mapping is shared; in this case, 2925 * current and max prot are dictated by the open file. 2926 * XXX use the vnode instead? Problem is: what 2927 * credentials do we use for determination? What if 2928 * proc does a setuid? 2929 */ 2930 mp = vp->v_mount; 2931 if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { 2932 maxprot = VM_PROT_NONE; 2933 if ((prot & VM_PROT_EXECUTE) != 0) 2934 return (EACCES); 2935 } else 2936 maxprot = VM_PROT_EXECUTE; 2937 if ((fp->f_flag & FREAD) != 0) 2938 maxprot |= VM_PROT_READ; 2939 else if ((prot & VM_PROT_READ) != 0) 2940 return (EACCES); 2941 2942 /* 2943 * If we are sharing potential changes via MAP_SHARED and we 2944 * are trying to get write permission although we opened it 2945 * without asking for it, bail out. 2946 */ 2947 if ((flags & MAP_SHARED) != 0) { 2948 if ((fp->f_flag & FWRITE) != 0) 2949 maxprot |= VM_PROT_WRITE; 2950 else if ((prot & VM_PROT_WRITE) != 0) 2951 return (EACCES); 2952 } else { 2953 maxprot |= VM_PROT_WRITE; 2954 cap_maxprot |= VM_PROT_WRITE; 2955 } 2956 maxprot &= cap_maxprot; 2957 2958 /* 2959 * For regular files and shared memory, POSIX requires that 2960 * the value of foff be a legitimate offset within the data 2961 * object. In particular, negative offsets are invalid. 2962 * Blocking negative offsets and overflows here avoids 2963 * possible wraparound or user-level access into reserved 2964 * ranges of the data object later. In contrast, POSIX does 2965 * not dictate how offsets are used by device drivers, so in 2966 * the case of a device mapping a negative offset is passed 2967 * on. 2968 */ 2969 if ( 2970 #ifdef _LP64 2971 size > OFF_MAX || 2972 #endif 2973 foff > OFF_MAX - size) 2974 return (EINVAL); 2975 2976 writecounted = FALSE; 2977 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, 2978 &foff, &object, &writecounted); 2979 if (error != 0) 2980 return (error); 2981 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, 2982 foff, writecounted, td); 2983 if (error != 0) { 2984 /* 2985 * If this mapping was accounted for in the vnode's 2986 * writecount, then undo that now. 2987 */ 2988 if (writecounted) 2989 vm_pager_release_writecount(object, 0, size); 2990 vm_object_deallocate(object); 2991 } 2992 #ifdef HWPMC_HOOKS 2993 /* Inform hwpmc(4) if an executable is being mapped. 
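	 * The map-in notification lets hwpmc(4) attribute profiling
	 * samples taken in this mapping back to the backing file.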
*/ 2994 if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { 2995 if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { 2996 pkm.pm_file = vp; 2997 pkm.pm_address = (uintptr_t) *addr; 2998 PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); 2999 } 3000 } 3001 #endif 3002 return (error); 3003 } 3004 3005 void 3006 vn_fsid(struct vnode *vp, struct vattr *va) 3007 { 3008 fsid_t *f; 3009 3010 f = &vp->v_mount->mnt_stat.f_fsid; 3011 va->va_fsid = (uint32_t)f->val[1]; 3012 va->va_fsid <<= sizeof(f->val[1]) * NBBY; 3013 va->va_fsid += (uint32_t)f->val[0]; 3014 } 3015 3016 int 3017 vn_fsync_buf(struct vnode *vp, int waitfor) 3018 { 3019 struct buf *bp, *nbp; 3020 struct bufobj *bo; 3021 struct mount *mp; 3022 int error, maxretry; 3023 3024 error = 0; 3025 maxretry = 10000; /* large, arbitrarily chosen */ 3026 mp = NULL; 3027 if (vp->v_type == VCHR) { 3028 VI_LOCK(vp); 3029 mp = vp->v_rdev->si_mountpt; 3030 VI_UNLOCK(vp); 3031 } 3032 bo = &vp->v_bufobj; 3033 BO_LOCK(bo); 3034 loop1: 3035 /* 3036 * MARK/SCAN initialization to avoid infinite loops. 3037 */ 3038 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 3039 bp->b_vflags &= ~BV_SCANNED; 3040 bp->b_error = 0; 3041 } 3042 3043 /* 3044 * Flush all dirty buffers associated with a vnode. 3045 */ 3046 loop2: 3047 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 3048 if ((bp->b_vflags & BV_SCANNED) != 0) 3049 continue; 3050 bp->b_vflags |= BV_SCANNED; 3051 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { 3052 if (waitfor != MNT_WAIT) 3053 continue; 3054 if (BUF_LOCK(bp, 3055 LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL, 3056 BO_LOCKPTR(bo)) != 0) { 3057 BO_LOCK(bo); 3058 goto loop1; 3059 } 3060 BO_LOCK(bo); 3061 } 3062 BO_UNLOCK(bo); 3063 KASSERT(bp->b_bufobj == bo, 3064 ("bp %p wrong b_bufobj %p should be %p", 3065 bp, bp->b_bufobj, bo)); 3066 if ((bp->b_flags & B_DELWRI) == 0) 3067 panic("fsync: not dirty"); 3068 if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { 3069 vfs_bio_awrite(bp); 3070 } else { 3071 bremfree(bp); 3072 bawrite(bp); 3073 } 3074 if (maxretry < 1000) 3075 pause("dirty", hz < 1000 ? 1 : hz / 1000); 3076 BO_LOCK(bo); 3077 goto loop2; 3078 } 3079 3080 /* 3081 * If synchronous the caller expects us to completely resolve all 3082 * dirty buffers in the system. Wait for in-progress I/O to 3083 * complete (which could include background bitmap writes), then 3084 * retry if dirty blocks still exist. 3085 */ 3086 if (waitfor == MNT_WAIT) { 3087 bufobj_wwait(bo, 0, 0); 3088 if (bo->bo_dirty.bv_cnt > 0) { 3089 /* 3090 * If we are unable to write any of these buffers 3091 * then we fail now rather than trying endlessly 3092 * to write them out. 3093 */ 3094 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) 3095 if ((error = bp->b_error) != 0) 3096 break; 3097 if ((mp != NULL && mp->mnt_secondary_writes > 0) || 3098 (error == 0 && --maxretry >= 0)) 3099 goto loop1; 3100 if (error == 0) 3101 error = EAGAIN; 3102 } 3103 } 3104 BO_UNLOCK(bo); 3105 if (error != 0) 3106 vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error); 3107 3108 return (error); 3109 } 3110 3111 /* 3112 * Copies a byte range from invp to outvp. Calls VOP_COPY_FILE_RANGE() 3113 * or vn_generic_copy_file_range() after rangelocking the byte ranges, 3114 * to do the actual copy. 3115 * vn_generic_copy_file_range() is factored out, so it can be called 3116 * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from 3117 * different file systems. 
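 *
 * For illustration, a caller holding references on both (unlocked)
 * vnodes might invoke it roughly as follows; nbytes, incred, outcred
 * and td stand in for the caller's own values, and on return *lenp
 * reflects the number of bytes actually copied with the offsets
 * advanced accordingly:
 *
 *	off_t inoff = 0, outoff = 0;
 *	size_t len = nbytes;
 *
 *	error = vn_copy_file_range(invp, &inoff, outvp, &outoff, &len,
 *	    0, incred, outcred, td);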
3118 */ 3119 int 3120 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, 3121 off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, 3122 struct ucred *outcred, struct thread *fsize_td) 3123 { 3124 struct mount *inmp, *outmp; 3125 struct vnode *invpl, *outvpl; 3126 int error; 3127 size_t len; 3128 uint64_t uval; 3129 3130 invpl = outvpl = NULL; 3131 len = *lenp; 3132 *lenp = 0; /* For error returns. */ 3133 error = 0; 3134 3135 /* Do some sanity checks on the arguments. */ 3136 if (invp->v_type == VDIR || outvp->v_type == VDIR) 3137 error = EISDIR; 3138 else if (*inoffp < 0 || *outoffp < 0 || 3139 invp->v_type != VREG || outvp->v_type != VREG) 3140 error = EINVAL; 3141 if (error != 0) 3142 goto out; 3143 3144 /* Ensure offset + len does not wrap around. */ 3145 uval = *inoffp; 3146 uval += len; 3147 if (uval > INT64_MAX) 3148 len = INT64_MAX - *inoffp; 3149 uval = *outoffp; 3150 uval += len; 3151 if (uval > INT64_MAX) 3152 len = INT64_MAX - *outoffp; 3153 if (len == 0) 3154 goto out; 3155 3156 error = VOP_GETLOWVNODE(invp, &invpl, FREAD); 3157 if (error != 0) 3158 goto out; 3159 error = VOP_GETLOWVNODE(outvp, &outvpl, FWRITE); 3160 if (error != 0) 3161 goto out1; 3162 3163 inmp = invpl->v_mount; 3164 outmp = outvpl->v_mount; 3165 if (inmp == NULL || outmp == NULL) 3166 goto out2; 3167 3168 for (;;) { 3169 error = vfs_busy(inmp, 0); 3170 if (error != 0) 3171 goto out2; 3172 if (inmp == outmp) 3173 break; 3174 error = vfs_busy(outmp, MBF_NOWAIT); 3175 if (error != 0) { 3176 vfs_unbusy(inmp); 3177 error = vfs_busy(outmp, 0); 3178 if (error == 0) { 3179 vfs_unbusy(outmp); 3180 continue; 3181 } 3182 goto out2; 3183 } 3184 break; 3185 } 3186 3187 /* 3188 * If the two vnodes are for the same file system type, call 3189 * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range() 3190 * which can handle copies across multiple file system types. 3191 */ 3192 *lenp = len; 3193 if (inmp == outmp || inmp->mnt_vfc == outmp->mnt_vfc) 3194 error = VOP_COPY_FILE_RANGE(invpl, inoffp, outvpl, outoffp, 3195 lenp, flags, incred, outcred, fsize_td); 3196 else 3197 error = ENOSYS; 3198 if (error == ENOSYS) 3199 error = vn_generic_copy_file_range(invpl, inoffp, outvpl, 3200 outoffp, lenp, flags, incred, outcred, fsize_td); 3201 vfs_unbusy(outmp); 3202 if (inmp != outmp) 3203 vfs_unbusy(inmp); 3204 out2: 3205 if (outvpl != NULL) 3206 vrele(outvpl); 3207 out1: 3208 if (invpl != NULL) 3209 vrele(invpl); 3210 out: 3211 return (error); 3212 } 3213 3214 /* 3215 * Test len bytes of data starting at dat for all bytes == 0. 3216 * Return true if all bytes are zero, false otherwise. 3217 * Expects dat to be well aligned. 3218 */ 3219 static bool 3220 mem_iszero(void *dat, int len) 3221 { 3222 int i; 3223 const u_int *p; 3224 const char *cp; 3225 3226 for (p = dat; len > 0; len -= sizeof(*p), p++) { 3227 if (len >= sizeof(*p)) { 3228 if (*p != 0) 3229 return (false); 3230 } else { 3231 cp = (const char *)p; 3232 for (i = 0; i < len; i++, cp++) 3233 if (*cp != '\0') 3234 return (false); 3235 } 3236 } 3237 return (true); 3238 } 3239 3240 /* 3241 * Look for a hole in the output file and, if found, adjust *outoffp 3242 * and *xferp to skip past the hole. 3243 * *xferp is the entire hole length to be written and xfer2 is how many bytes 3244 * to be written as 0's upon return. 
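 *
 * Hypothetical worked example: with *outoffp == 0, *xferp == 1M,
 * xfer2 == 128k, and the output file having data starting at 640k
 * followed by a hole at 704k, the function advances *outoffp to 640k,
 * reduces *xferp to 384k, and returns 64k so that the write stops
 * where the existing hole begins.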
3245 */ 3246 static off_t 3247 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp, 3248 off_t *dataoffp, off_t *holeoffp, struct ucred *cred) 3249 { 3250 int error; 3251 off_t delta; 3252 3253 if (*holeoffp == 0 || *holeoffp <= *outoffp) { 3254 *dataoffp = *outoffp; 3255 error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred, 3256 curthread); 3257 if (error == 0) { 3258 *holeoffp = *dataoffp; 3259 error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred, 3260 curthread); 3261 } 3262 if (error != 0 || *holeoffp == *dataoffp) { 3263 /* 3264 * Since outvp is unlocked, it may be possible for 3265 * another thread to do a truncate(), lseek(), write() 3266 * creating a hole at startoff between the above 3267 * VOP_IOCTL() calls, if the other thread does not do 3268 * rangelocking. 3269 * If that happens, *holeoffp == *dataoffp and finding 3270 * the hole has failed, so disable vn_skip_hole(). 3271 */ 3272 *holeoffp = -1; /* Disable use of vn_skip_hole(). */ 3273 return (xfer2); 3274 } 3275 KASSERT(*dataoffp >= *outoffp, 3276 ("vn_skip_hole: dataoff=%jd < outoff=%jd", 3277 (intmax_t)*dataoffp, (intmax_t)*outoffp)); 3278 KASSERT(*holeoffp > *dataoffp, 3279 ("vn_skip_hole: holeoff=%jd <= dataoff=%jd", 3280 (intmax_t)*holeoffp, (intmax_t)*dataoffp)); 3281 } 3282 3283 /* 3284 * If there is a hole before the data starts, advance *outoffp and 3285 * *xferp past the hole. 3286 */ 3287 if (*dataoffp > *outoffp) { 3288 delta = *dataoffp - *outoffp; 3289 if (delta >= *xferp) { 3290 /* Entire *xferp is a hole. */ 3291 *outoffp += *xferp; 3292 *xferp = 0; 3293 return (0); 3294 } 3295 *xferp -= delta; 3296 *outoffp += delta; 3297 xfer2 = MIN(xfer2, *xferp); 3298 } 3299 3300 /* 3301 * If a hole starts before the end of this xfer2, reduce this xfer2 so 3302 * that the write ends at the start of the hole. 3303 * *holeoffp should always be greater than *outoffp, but for the 3304 * non-INVARIANTS case, check this to make sure xfer2 remains a sane 3305 * value. 3306 */ 3307 if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2) 3308 xfer2 = *holeoffp - *outoffp; 3309 return (xfer2); 3310 } 3311 3312 /* 3313 * Write an xfer sized chunk to outvp in blksize blocks from dat. 3314 * dat is a maximum of blksize in length and can be written repeatedly in 3315 * the chunk. 3316 * If growfile == true, just grow the file via vn_truncate_locked() instead 3317 * of doing actual writes. 3318 * If checkhole == true, a hole is being punched, so skip over any hole 3319 * already in the output file. 3320 */ 3321 static int 3322 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer, 3323 u_long blksize, bool growfile, bool checkhole, struct ucred *cred) 3324 { 3325 struct mount *mp; 3326 off_t dataoff, holeoff, xfer2; 3327 int error; 3328 3329 /* 3330 * Loop around doing writes of blksize until write has been completed. 3331 * Lock/unlock on each loop iteration so that a bwillwrite() can be 3332 * done for each iteration, since the xfer argument can be very 3333 * large if there is a large hole to punch in the output file. 3334 */ 3335 error = 0; 3336 holeoff = 0; 3337 do { 3338 xfer2 = MIN(xfer, blksize); 3339 if (checkhole) { 3340 /* 3341 * Punching a hole. Skip writing if there is 3342 * already a hole in the output file. 
3343 */ 3344 xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer, 3345 &dataoff, &holeoff, cred); 3346 if (xfer == 0) 3347 break; 3348 if (holeoff < 0) 3349 checkhole = false; 3350 KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd", 3351 (intmax_t)xfer2)); 3352 } 3353 bwillwrite(); 3354 mp = NULL; 3355 error = vn_start_write(outvp, &mp, V_WAIT); 3356 if (error != 0) 3357 break; 3358 if (growfile) { 3359 error = vn_lock(outvp, LK_EXCLUSIVE); 3360 if (error == 0) { 3361 error = vn_truncate_locked(outvp, outoff + xfer, 3362 false, cred); 3363 VOP_UNLOCK(outvp); 3364 } 3365 } else { 3366 error = vn_lock(outvp, vn_lktype_write(mp, outvp)); 3367 if (error == 0) { 3368 error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2, 3369 outoff, UIO_SYSSPACE, IO_NODELOCKED, 3370 curthread->td_ucred, cred, NULL, curthread); 3371 outoff += xfer2; 3372 xfer -= xfer2; 3373 VOP_UNLOCK(outvp); 3374 } 3375 } 3376 if (mp != NULL) 3377 vn_finished_write(mp); 3378 } while (!growfile && xfer > 0 && error == 0); 3379 return (error); 3380 } 3381 3382 /* 3383 * Copy a byte range of one file to another. This function can handle the 3384 * case where invp and outvp are on different file systems. 3385 * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there 3386 * is no better file system specific way to do it. 3387 */ 3388 int 3389 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, 3390 struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, 3391 struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) 3392 { 3393 struct vattr inva; 3394 struct mount *mp; 3395 off_t startoff, endoff, xfer, xfer2; 3396 u_long blksize; 3397 int error, interrupted; 3398 bool cantseek, readzeros, eof, first, lastblock, holetoeof, sparse; 3399 ssize_t aresid, r = 0; 3400 size_t copylen, len, savlen; 3401 off_t outsize; 3402 char *dat; 3403 long holein, holeout; 3404 struct timespec curts, endts; 3405 3406 holein = holeout = 0; 3407 savlen = len = *lenp; 3408 error = 0; 3409 interrupted = 0; 3410 dat = NULL; 3411 3412 error = vn_lock(invp, LK_SHARED); 3413 if (error != 0) 3414 goto out; 3415 if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) 3416 holein = 0; 3417 error = VOP_GETATTR(invp, &inva, incred); 3418 if (error == 0 && inva.va_size > OFF_MAX) 3419 error = EFBIG; 3420 VOP_UNLOCK(invp); 3421 if (error != 0) 3422 goto out; 3423 3424 /* 3425 * Use va_bytes >= va_size as a hint that the file does not have 3426 * sufficient holes to justify the overhead of doing FIOSEEKHOLE. 3427 * This hint does not work well for file systems doing compression 3428 * and may fail when allocations for extended attributes increases 3429 * the value of va_bytes to >= va_size. 3430 */ 3431 sparse = true; 3432 if (holein != 0 && inva.va_bytes >= inva.va_size) { 3433 holein = 0; 3434 sparse = false; 3435 } 3436 3437 mp = NULL; 3438 error = vn_start_write(outvp, &mp, V_WAIT); 3439 if (error == 0) 3440 error = vn_lock(outvp, LK_EXCLUSIVE); 3441 if (error == 0) { 3442 /* 3443 * If fsize_td != NULL, do a vn_rlimit_fsizex() call, 3444 * now that outvp is locked. 3445 */ 3446 if (fsize_td != NULL) { 3447 struct uio io; 3448 3449 io.uio_offset = *outoffp; 3450 io.uio_resid = len; 3451 error = vn_rlimit_fsizex(outvp, &io, 0, &r, fsize_td); 3452 len = savlen = io.uio_resid; 3453 /* 3454 * No need to call vn_rlimit_fsizex_res before return, 3455 * since the uio is local. 
3456 */ 3457 } 3458 if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) 3459 holeout = 0; 3460 /* 3461 * Holes that are past EOF do not need to be written as a block 3462 * of zero bytes. So, truncate the output file as far as 3463 * possible and then use size to decide if writing 0 3464 * bytes is necessary in the loop below. 3465 */ 3466 if (error == 0) 3467 error = vn_getsize_locked(outvp, &outsize, outcred); 3468 if (error == 0 && outsize > *outoffp && 3469 *outoffp <= OFF_MAX - len && outsize <= *outoffp + len && 3470 *inoffp < inva.va_size && 3471 *outoffp <= OFF_MAX - (inva.va_size - *inoffp) && 3472 outsize <= *outoffp + (inva.va_size - *inoffp)) { 3473 #ifdef MAC 3474 error = mac_vnode_check_write(curthread->td_ucred, 3475 outcred, outvp); 3476 if (error == 0) 3477 #endif 3478 error = vn_truncate_locked(outvp, *outoffp, 3479 false, outcred); 3480 if (error == 0) 3481 outsize = *outoffp; 3482 } 3483 VOP_UNLOCK(outvp); 3484 } 3485 if (mp != NULL) 3486 vn_finished_write(mp); 3487 if (error != 0) 3488 goto out; 3489 3490 if (sparse && holein == 0 && holeout > 0) { 3491 /* 3492 * For this special case, the input data will be scanned 3493 * for blocks of all 0 bytes. For these blocks, the 3494 * write can be skipped for the output file to create 3495 * an unallocated region. 3496 * Therefore, use the appropriate size for the output file. 3497 */ 3498 blksize = holeout; 3499 if (blksize <= 512) { 3500 /* 3501 * Use f_iosize, since ZFS reports a _PC_MIN_HOLE_SIZE 3502 * of 512, although it actually only creates 3503 * unallocated regions for blocks >= f_iosize. 3504 */ 3505 blksize = outvp->v_mount->mnt_stat.f_iosize; 3506 } 3507 } else { 3508 /* 3509 * Use the larger of the two f_iosize values. If they are 3510 * not the same size, one will normally be an exact multiple of 3511 * the other, since they are both likely to be a power of 2. 3512 */ 3513 blksize = MAX(invp->v_mount->mnt_stat.f_iosize, 3514 outvp->v_mount->mnt_stat.f_iosize); 3515 } 3516 3517 /* Clip to sane limits. */ 3518 if (blksize < 4096) 3519 blksize = 4096; 3520 else if (blksize > maxphys) 3521 blksize = maxphys; 3522 dat = malloc(blksize, M_TEMP, M_WAITOK); 3523 3524 /* 3525 * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA 3526 * to find holes. Otherwise, just scan the read block for all 0s 3527 * in the inner loop where the data copying is done. 3528 * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may 3529 * support holes on the server, but do not support FIOSEEKHOLE. 3530 * The kernel flag COPY_FILE_RANGE_TIMEO1SEC is used to indicate 3531 * that this function should return after 1second with a partial 3532 * completion. 3533 */ 3534 if ((flags & COPY_FILE_RANGE_TIMEO1SEC) != 0) { 3535 getnanouptime(&endts); 3536 endts.tv_sec++; 3537 } else 3538 timespecclear(&endts); 3539 first = true; 3540 holetoeof = eof = false; 3541 while (len > 0 && error == 0 && !eof && interrupted == 0) { 3542 endoff = 0; /* To shut up compilers. */ 3543 cantseek = true; 3544 startoff = *inoffp; 3545 copylen = len; 3546 3547 /* 3548 * Find the next data area. If there is just a hole to EOF, 3549 * FIOSEEKDATA should fail with ENXIO. 3550 * (I do not know if any file system will report a hole to 3551 * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA 3552 * will fail for those file systems.) 3553 * 3554 * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, 3555 * the code just falls through to the inner copy loop. 
3556 */ 3557 error = EINVAL; 3558 if (holein > 0) { 3559 error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, 3560 incred, curthread); 3561 if (error == ENXIO) { 3562 startoff = endoff = inva.va_size; 3563 eof = holetoeof = true; 3564 error = 0; 3565 } 3566 } 3567 if (error == 0 && !holetoeof) { 3568 endoff = startoff; 3569 error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, 3570 incred, curthread); 3571 /* 3572 * Since invp is unlocked, it may be possible for 3573 * another thread to do a truncate(), lseek(), write() 3574 * creating a hole at startoff between the above 3575 * VOP_IOCTL() calls, if the other thread does not do 3576 * rangelocking. 3577 * If that happens, startoff == endoff and finding 3578 * the hole has failed, so set an error. 3579 */ 3580 if (error == 0 && startoff == endoff) 3581 error = EINVAL; /* Any error. Reset to 0. */ 3582 } 3583 if (error == 0) { 3584 if (startoff > *inoffp) { 3585 /* Found hole before data block. */ 3586 xfer = MIN(startoff - *inoffp, len); 3587 if (*outoffp < outsize) { 3588 /* Must write 0s to punch hole. */ 3589 xfer2 = MIN(outsize - *outoffp, 3590 xfer); 3591 memset(dat, 0, MIN(xfer2, blksize)); 3592 error = vn_write_outvp(outvp, dat, 3593 *outoffp, xfer2, blksize, false, 3594 holeout > 0, outcred); 3595 } 3596 3597 if (error == 0 && *outoffp + xfer > 3598 outsize && (xfer == len || holetoeof)) { 3599 /* Grow output file (hole at end). */ 3600 error = vn_write_outvp(outvp, dat, 3601 *outoffp, xfer, blksize, true, 3602 false, outcred); 3603 } 3604 if (error == 0) { 3605 *inoffp += xfer; 3606 *outoffp += xfer; 3607 len -= xfer; 3608 if (len < savlen) { 3609 interrupted = sig_intr(); 3610 if (timespecisset(&endts) && 3611 interrupted == 0) { 3612 getnanouptime(&curts); 3613 if (timespeccmp(&curts, 3614 &endts, >=)) 3615 interrupted = 3616 EINTR; 3617 } 3618 } 3619 } 3620 } 3621 copylen = MIN(len, endoff - startoff); 3622 cantseek = false; 3623 } else { 3624 cantseek = true; 3625 if (!sparse) 3626 cantseek = false; 3627 startoff = *inoffp; 3628 copylen = len; 3629 error = 0; 3630 } 3631 3632 xfer = blksize; 3633 if (cantseek) { 3634 /* 3635 * Set first xfer to end at a block boundary, so that 3636 * holes are more likely detected in the loop below via 3637 * the for all bytes 0 method. 3638 */ 3639 xfer -= (*inoffp % blksize); 3640 } 3641 3642 /* 3643 * Loop copying the data block. If this was our first attempt 3644 * to copy anything, allow a zero-length block so that the VOPs 3645 * get a chance to update metadata, specifically the atime. 3646 */ 3647 while (error == 0 && ((copylen > 0 && !eof) || first) && 3648 interrupted == 0) { 3649 if (copylen < xfer) 3650 xfer = copylen; 3651 first = false; 3652 error = vn_lock(invp, LK_SHARED); 3653 if (error != 0) 3654 goto out; 3655 error = vn_rdwr(UIO_READ, invp, dat, xfer, 3656 startoff, UIO_SYSSPACE, IO_NODELOCKED, 3657 curthread->td_ucred, incred, &aresid, 3658 curthread); 3659 VOP_UNLOCK(invp); 3660 lastblock = false; 3661 if (error == 0 && (xfer == 0 || aresid > 0)) { 3662 /* Stop the copy at EOF on the input file. */ 3663 xfer -= aresid; 3664 eof = true; 3665 lastblock = true; 3666 } 3667 if (error == 0) { 3668 /* 3669 * Skip the write for holes past the initial EOF 3670 * of the output file, unless this is the last 3671 * write of the output file at EOF. 3672 */ 3673 readzeros = cantseek ? 
mem_iszero(dat, xfer) : 3674 false; 3675 if (xfer == len) 3676 lastblock = true; 3677 if (!cantseek || *outoffp < outsize || 3678 lastblock || !readzeros) 3679 error = vn_write_outvp(outvp, dat, 3680 *outoffp, xfer, blksize, 3681 readzeros && lastblock && 3682 *outoffp >= outsize, false, 3683 outcred); 3684 if (error == 0) { 3685 *inoffp += xfer; 3686 startoff += xfer; 3687 *outoffp += xfer; 3688 copylen -= xfer; 3689 len -= xfer; 3690 if (len < savlen) { 3691 interrupted = sig_intr(); 3692 if (timespecisset(&endts) && 3693 interrupted == 0) { 3694 getnanouptime(&curts); 3695 if (timespeccmp(&curts, 3696 &endts, >=)) 3697 interrupted = 3698 EINTR; 3699 } 3700 } 3701 } 3702 } 3703 xfer = blksize; 3704 } 3705 } 3706 out: 3707 *lenp = savlen - len; 3708 free(dat, M_TEMP); 3709 return (error); 3710 } 3711 3712 static int 3713 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) 3714 { 3715 struct mount *mp; 3716 struct vnode *vp; 3717 off_t olen, ooffset; 3718 int error; 3719 #ifdef AUDIT 3720 int audited_vnode1 = 0; 3721 #endif 3722 3723 vp = fp->f_vnode; 3724 if (vp->v_type != VREG) 3725 return (ENODEV); 3726 3727 /* Allocating blocks may take a long time, so iterate. */ 3728 for (;;) { 3729 olen = len; 3730 ooffset = offset; 3731 3732 bwillwrite(); 3733 mp = NULL; 3734 error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); 3735 if (error != 0) 3736 break; 3737 error = vn_lock(vp, LK_EXCLUSIVE); 3738 if (error != 0) { 3739 vn_finished_write(mp); 3740 break; 3741 } 3742 #ifdef AUDIT 3743 if (!audited_vnode1) { 3744 AUDIT_ARG_VNODE1(vp); 3745 audited_vnode1 = 1; 3746 } 3747 #endif 3748 #ifdef MAC 3749 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); 3750 if (error == 0) 3751 #endif 3752 error = VOP_ALLOCATE(vp, &offset, &len, 0, 3753 td->td_ucred); 3754 VOP_UNLOCK(vp); 3755 vn_finished_write(mp); 3756 3757 if (olen + ooffset != offset + len) { 3758 panic("offset + len changed from %jx/%jx to %jx/%jx", 3759 ooffset, olen, offset, len); 3760 } 3761 if (error != 0 || len == 0) 3762 break; 3763 KASSERT(olen > len, ("Iteration did not make progress?")); 3764 maybe_yield(); 3765 } 3766 3767 return (error); 3768 } 3769 3770 static int 3771 vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags, 3772 int ioflag, struct ucred *cred, struct ucred *active_cred, 3773 struct ucred *file_cred) 3774 { 3775 struct mount *mp; 3776 void *rl_cookie; 3777 off_t off, len; 3778 int error; 3779 #ifdef AUDIT 3780 bool audited_vnode1 = false; 3781 #endif 3782 3783 rl_cookie = NULL; 3784 error = 0; 3785 mp = NULL; 3786 off = *offset; 3787 len = *length; 3788 3789 if ((ioflag & (IO_NODELOCKED | IO_RANGELOCKED)) == 0) 3790 rl_cookie = vn_rangelock_wlock(vp, off, off + len); 3791 while (len > 0 && error == 0) { 3792 /* 3793 * Try to deallocate the longest range in one pass. 3794 * If a pass takes too long to execute, it returns a 3795 * partial result. The residue will be processed in the next 3796 * pass.
3797 */ 3798 3799 if ((ioflag & IO_NODELOCKED) == 0) { 3800 bwillwrite(); 3801 if ((error = vn_start_write(vp, &mp, 3802 V_WAIT | V_PCATCH)) != 0) 3803 goto out; 3804 vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); 3805 } 3806 #ifdef AUDIT 3807 if (!audited_vnode1) { 3808 AUDIT_ARG_VNODE1(vp); 3809 audited_vnode1 = true; 3810 } 3811 #endif 3812 3813 #ifdef MAC 3814 if ((ioflag & IO_NOMACCHECK) == 0) 3815 error = mac_vnode_check_write(active_cred, file_cred, 3816 vp); 3817 #endif 3818 if (error == 0) 3819 error = VOP_DEALLOCATE(vp, &off, &len, flags, ioflag, 3820 cred); 3821 3822 if ((ioflag & IO_NODELOCKED) == 0) { 3823 VOP_UNLOCK(vp); 3824 if (mp != NULL) { 3825 vn_finished_write(mp); 3826 mp = NULL; 3827 } 3828 } 3829 if (error == 0 && len != 0) 3830 maybe_yield(); 3831 } 3832 out: 3833 if (rl_cookie != NULL) 3834 vn_rangelock_unlock(vp, rl_cookie); 3835 *offset = off; 3836 *length = len; 3837 return (error); 3838 } 3839 3840 /* 3841 * This function is supposed to be used in the situations where the deallocation 3842 * is not triggered by a user request. 3843 */ 3844 int 3845 vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags, 3846 int ioflag, struct ucred *active_cred, struct ucred *file_cred) 3847 { 3848 struct ucred *cred; 3849 3850 if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset || 3851 flags != 0) 3852 return (EINVAL); 3853 if (vp->v_type != VREG) 3854 return (ENODEV); 3855 3856 cred = file_cred != NOCRED ? file_cred : active_cred; 3857 return (vn_deallocate_impl(vp, offset, length, flags, ioflag, cred, 3858 active_cred, file_cred)); 3859 } 3860 3861 static int 3862 vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, 3863 struct ucred *active_cred, struct thread *td) 3864 { 3865 int error; 3866 struct vnode *vp; 3867 int ioflag; 3868 3869 KASSERT(cmd == SPACECTL_DEALLOC, ("vn_fspacectl: Invalid cmd")); 3870 KASSERT((flags & ~SPACECTL_F_SUPPORTED) == 0, 3871 ("vn_fspacectl: non-zero flags")); 3872 KASSERT(*offset >= 0 && *length > 0 && *length <= OFF_MAX - *offset, 3873 ("vn_fspacectl: offset/length overflow or underflow")); 3874 vp = fp->f_vnode; 3875 3876 if (vp->v_type != VREG) 3877 return (ENODEV); 3878 3879 ioflag = get_write_ioflag(fp); 3880 3881 switch (cmd) { 3882 case SPACECTL_DEALLOC: 3883 error = vn_deallocate_impl(vp, offset, length, flags, ioflag, 3884 active_cred, active_cred, fp->f_cred); 3885 break; 3886 default: 3887 panic("vn_fspacectl: unknown cmd %d", cmd); 3888 } 3889 3890 return (error); 3891 } 3892 3893 /* 3894 * Keep this assert as long as sizeof(struct dirent) is used as the maximum 3895 * entry size. 3896 */ 3897 _Static_assert(_GENERIC_MAXDIRSIZ == sizeof(struct dirent), 3898 "'struct dirent' size must be a multiple of its alignment " 3899 "(see _GENERIC_DIRLEN())"); 3900 3901 /* 3902 * Returns successive directory entries through some caller's provided buffer. 3903 * 3904 * This function automatically refills the provided buffer with calls to 3905 * VOP_READDIR() (after MAC permission checks). 3906 * 3907 * 'td' is used for credentials and passed to uiomove(). 'dirbuf' is the 3908 * caller's buffer to fill and 'dirbuflen' its allocated size. 'dirbuf' must 3909 * be properly aligned to access 'struct dirent' structures and 'dirbuflen' 3910 * must be greater than GENERIC_MAXDIRSIZ to avoid VOP_READDIR() returning 3911 * EINVAL (the latter is not a strong guarantee (yet); but EINVAL will always 3912 * be returned if this requirement is not verified). 
'*dpp' points to the 3913 * current directory entry in the buffer and '*len' contains the remaining 3914 * valid bytes in 'dirbuf' after 'dpp' (including the pointed entry). 3915 * 3916 * At first call (or when restarting the read), '*len' must have been set to 0, 3917 * '*off' to 0 (or any valid start offset) and '*eofflag' to 0. There are no 3918 * more entries as soon as '*len' is 0 after a call that returned 0. Calling 3919 * again this function after such a condition is considered an error and EINVAL 3920 * will be returned. Other possible error codes are those of VOP_READDIR(), 3921 * EINTEGRITY if the returned entries do not pass coherency tests, or EINVAL 3922 * (bad call). All errors are unrecoverable, i.e., the state ('*len', '*off' 3923 * and '*eofflag') must be re-initialized before a subsequent call. On error 3924 * or at end of directory, '*dpp' is reset to NULL. 3925 * 3926 * '*len', '*off' and '*eofflag' are internal state the caller should not 3927 * tamper with except as explained above. '*off' is the next directory offset 3928 * to read from to refill the buffer. '*eofflag' is set to 0 or 1 by the last 3929 * internal call to VOP_READDIR() that returned without error, indicating 3930 * whether it reached the end of the directory, and to 2 by this function after 3931 * all entries have been read. 3932 */ 3933 int 3934 vn_dir_next_dirent(struct vnode *vp, struct thread *td, 3935 char *dirbuf, size_t dirbuflen, 3936 struct dirent **dpp, size_t *len, off_t *off, int *eofflag) 3937 { 3938 struct dirent *dp = NULL; 3939 int reclen; 3940 int error; 3941 struct uio uio; 3942 struct iovec iov; 3943 3944 ASSERT_VOP_LOCKED(vp, "vnode not locked"); 3945 VNASSERT(vp->v_type == VDIR, vp, ("vnode is not a directory")); 3946 MPASS2((uintptr_t)dirbuf < (uintptr_t)dirbuf + dirbuflen, 3947 "Address space overflow"); 3948 3949 if (__predict_false(dirbuflen < GENERIC_MAXDIRSIZ)) { 3950 /* Don't take any chances in this case */ 3951 error = EINVAL; 3952 goto out; 3953 } 3954 3955 if (*len != 0) { 3956 dp = *dpp; 3957 3958 /* 3959 * The caller continued to call us after an error (we set dp to 3960 * NULL in a previous iteration). Bail out right now. 3961 */ 3962 if (__predict_false(dp == NULL)) 3963 return (EINVAL); 3964 3965 MPASS(*len <= dirbuflen); 3966 MPASS2((uintptr_t)dirbuf <= (uintptr_t)dp && 3967 (uintptr_t)dp + *len <= (uintptr_t)dirbuf + dirbuflen, 3968 "Filled range not inside buffer"); 3969 3970 reclen = dp->d_reclen; 3971 if (reclen >= *len) { 3972 /* End of buffer reached */ 3973 *len = 0; 3974 } else { 3975 dp = (struct dirent *)((char *)dp + reclen); 3976 *len -= reclen; 3977 } 3978 } 3979 3980 if (*len == 0) { 3981 dp = NULL; 3982 3983 /* Have to refill. */ 3984 switch (*eofflag) { 3985 case 0: 3986 break; 3987 3988 case 1: 3989 /* Nothing more to read. */ 3990 *eofflag = 2; /* Remember the caller reached EOF. */ 3991 goto success; 3992 3993 default: 3994 /* The caller didn't test for EOF. 
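	 * (*eofflag was left at 2 by a previous call that already
	 * reported the end of the directory, so treat the extra call
	 * as a caller bug.)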

/*
 * Checks whether a directory is empty or not.
 *
 * If the directory is empty, returns 0, and if it is not, ENOTEMPTY.  Other
 * values are genuine errors preventing the check.
 */
int
vn_dir_check_empty(struct vnode *vp)
{
	struct thread *const td = curthread;
	char *dirbuf;
	size_t dirbuflen, len;
	off_t off;
	int eofflag, error;
	struct dirent *dp;
	struct vattr va;

	ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
	VNPASS(vp->v_type == VDIR, vp);

	error = VOP_GETATTR(vp, &va, td->td_ucred);
	if (error != 0)
		return (error);

	dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ);
	if (dirbuflen < va.va_blocksize)
		dirbuflen = va.va_blocksize;
	dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);

	len = 0;
	off = 0;
	eofflag = 0;

	for (;;) {
		error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen,
		    &dp, &len, &off, &eofflag);
		if (error != 0)
			goto end;

		if (len == 0) {
			/* EOF */
			error = 0;
			goto end;
		}

		/*
		 * Skip whiteouts.  Unionfs operates on filesystems only and
		 * not on hierarchies, so these whiteouts would be shadowed on
		 * the system hierarchy but not for a union using the
		 * filesystem of their directories as the upper layer.
		 * Additionally, unionfs currently transparently exposes
		 * union-specific metadata of its upper layer, meaning that
		 * whiteouts can be seen through the union view in empty
		 * directories.  Taking into account these whiteouts would then
		 * prevent mounting another filesystem on such effectively
		 * empty directories.
		 */
		if (dp->d_type == DT_WHT)
			continue;

		/*
		 * Any file in the directory which is not '.' or '..' indicates
		 * the directory is not empty.
		 */
		switch (dp->d_namlen) {
		case 2:
			if (dp->d_name[1] != '.') {
				/* Can't be '..' (nor '.') */
				error = ENOTEMPTY;
				goto end;
			}
			/* FALLTHROUGH */
		case 1:
			if (dp->d_name[0] != '.') {
				/* Can't be '..' nor '.' */
				error = ENOTEMPTY;
				goto end;
			}
			break;

		default:
			error = ENOTEMPTY;
			goto end;
		}
	}

end:
	free(dirbuf, M_TEMP);
	return (error);
}
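
/*
 * An illustrative sketch, compiled out: a removal-style operation that
 * refuses to proceed on a non-empty directory.  example_require_empty() is
 * hypothetical and is assumed to be called with 'dvp' locked, as required
 * above.
 */
#if 0
static int
example_require_empty(struct vnode *dvp)
{
	int error;

	error = vn_dir_check_empty(dvp);
	if (error != 0)
		return (error);	/* ENOTEMPTY, or a genuine error. */

	/* ... proceed with the operation ... */
	return (0);
}
#endif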

static u_long vn_lock_pair_pause_cnt;
SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
    &vn_lock_pair_pause_cnt, 0,
    "Count of vn_lock_pair deadlocks");

u_int vn_lock_pair_pause_max;
SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
    &vn_lock_pair_pause_max, 0,
    "Max ticks for vn_lock_pair deadlock avoidance sleep");

static void
vn_lock_pair_pause(const char *wmesg)
{
	atomic_add_long(&vn_lock_pair_pause_cnt, 1);
	pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
}

/*
 * Lock the pair of (possibly identical) vnodes vp1 and vp2, avoiding lock
 * order reversal.  vp1_locked indicates whether vp1 is already locked on
 * entry; if not, vp1 must be unlocked.  The same applies to vp2 and
 * vp2_locked.  One of the vnodes can be NULL.
 *
 * The function returns with both vnodes exclusively or shared locked,
 * according to the corresponding lkflags, and guarantees that it does not
 * create a lock order reversal with other threads during its execution.
 * Both vnodes could be unlocked temporarily (and reclaimed).
 *
 * If shared locking is requested, the lock of an already-locked vnode must
 * not be recursed.
 *
 * Exactly one of LK_SHARED and LK_EXCLUSIVE must be specified for each
 * non-NULL vnode; for a NULL vnode the corresponding lkflags must be 0.
 * LK_NODDLKTREAT can be optionally passed.
 *
 * If vp1 == vp2, only a single lock is obtained on it, using the more
 * exclusive of the two requested lock types.
 */
void
vn_lock_pair(struct vnode *vp1, bool vp1_locked, int lkflags1,
    struct vnode *vp2, bool vp2_locked, int lkflags2)
{
	int error, locked1;

	MPASS((((lkflags1 & LK_SHARED) != 0) ^ ((lkflags1 & LK_EXCLUSIVE) != 0)) ||
	    (vp1 == NULL && lkflags1 == 0));
	MPASS((lkflags1 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0);
	MPASS((((lkflags2 & LK_SHARED) != 0) ^ ((lkflags2 & LK_EXCLUSIVE) != 0)) ||
	    (vp2 == NULL && lkflags2 == 0));
	MPASS((lkflags2 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0);

	if (vp1 == NULL && vp2 == NULL)
		return;

	if (vp1 == vp2) {
		MPASS(vp1_locked == vp2_locked);

		/* Select the most exclusive mode for lock. */
		if ((lkflags1 & LK_TYPE_MASK) != (lkflags2 & LK_TYPE_MASK))
			lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE;

		if (vp1_locked) {
			ASSERT_VOP_LOCKED(vp1, "vp1");

			/* No need to relock if any lock is exclusive. */
			if ((vp1->v_vnlock->lock_object.lo_flags &
			    LK_NOSHARE) != 0)
				return;

			locked1 = VOP_ISLOCKED(vp1);
			if (((lkflags1 & LK_SHARED) != 0 &&
			    locked1 != LK_EXCLUSIVE) ||
			    ((lkflags1 & LK_EXCLUSIVE) != 0 &&
			    locked1 == LK_EXCLUSIVE))
				return;
			VOP_UNLOCK(vp1);
		}

		ASSERT_VOP_UNLOCKED(vp1, "vp1");
		vn_lock(vp1, lkflags1 | LK_RETRY);
		return;
	}

	if (vp1 != NULL) {
		if ((lkflags1 & LK_SHARED) != 0 &&
		    (vp1->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0)
			lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE;
		if (vp1_locked && VOP_ISLOCKED(vp1) != LK_EXCLUSIVE) {
			ASSERT_VOP_LOCKED(vp1, "vp1");
			if ((lkflags1 & LK_EXCLUSIVE) != 0) {
				VOP_UNLOCK(vp1);
				ASSERT_VOP_UNLOCKED(vp1,
				    "vp1 shared recursed");
				vp1_locked = false;
			}
		} else if (!vp1_locked)
			ASSERT_VOP_UNLOCKED(vp1, "vp1");
	} else {
		vp1_locked = true;
	}

	if (vp2 != NULL) {
		if ((lkflags2 & LK_SHARED) != 0 &&
		    (vp2->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0)
			lkflags2 = (lkflags2 & ~LK_SHARED) | LK_EXCLUSIVE;
		if (vp2_locked && VOP_ISLOCKED(vp2) != LK_EXCLUSIVE) {
			ASSERT_VOP_LOCKED(vp2, "vp2");
			if ((lkflags2 & LK_EXCLUSIVE) != 0) {
				VOP_UNLOCK(vp2);
				ASSERT_VOP_UNLOCKED(vp2,
				    "vp2 shared recursed");
				vp2_locked = false;
			}
		} else if (!vp2_locked)
			ASSERT_VOP_UNLOCKED(vp2, "vp2");
	} else {
		vp2_locked = true;
	}

	if (!vp1_locked && !vp2_locked) {
		vn_lock(vp1, lkflags1 | LK_RETRY);
		vp1_locked = true;
	}

	while (!vp1_locked || !vp2_locked) {
		if (vp1_locked && vp2 != NULL) {
			if (vp1 != NULL) {
				error = VOP_LOCK1(vp2, lkflags2 | LK_NOWAIT,
				    __FILE__, __LINE__);
				if (error == 0)
					break;
				VOP_UNLOCK(vp1);
				vp1_locked = false;
				vn_lock_pair_pause("vlp1");
			}
			vn_lock(vp2, lkflags2 | LK_RETRY);
			vp2_locked = true;
		}
		if (vp2_locked && vp1 != NULL) {
			if (vp2 != NULL) {
				error = VOP_LOCK1(vp1, lkflags1 | LK_NOWAIT,
				    __FILE__, __LINE__);
				if (error == 0)
					break;
				VOP_UNLOCK(vp2);
				vp2_locked = false;
				vn_lock_pair_pause("vlp2");
			}
			vn_lock(vp1, lkflags1 | LK_RETRY);
			vp1_locked = true;
		}
	}
	if (vp1 != NULL) {
		if (lkflags1 == LK_EXCLUSIVE)
			ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
		else
			ASSERT_VOP_LOCKED(vp1, "vp1 ret");
	}
	if (vp2 != NULL) {
		if (lkflags2 == LK_EXCLUSIVE)
			ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
		else
			ASSERT_VOP_LOCKED(vp2, "vp2 ret");
	}
}
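
/*
 * An illustrative sketch of the common case for vn_lock_pair(), compiled out:
 * locking two (possibly identical), initially unlocked vnodes for a
 * rename-like operation.  example_lock_two() is hypothetical; the unlock
 * sequence accounts for the single lock taken when both pointers name the
 * same vnode.
 */
#if 0
static void
example_lock_two(struct vnode *fdvp, struct vnode *tdvp)
{
	vn_lock_pair(fdvp, false, LK_EXCLUSIVE, tdvp, false, LK_EXCLUSIVE);

	/* ... operate on both vnodes ... */

	VOP_UNLOCK(fdvp);
	if (tdvp != fdvp)
		VOP_UNLOCK(tdvp);
}
#endif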

int
vn_lktype_write(struct mount *mp, struct vnode *vp)
{
	if (MNT_SHARED_WRITES(mp) ||
	    (mp == NULL && MNT_SHARED_WRITES(vp->v_mount)))
		return (LK_SHARED);
	return (LK_EXCLUSIVE);
}

int
vn_cmp(struct file *fp1, struct file *fp2, struct thread *td)
{
	if (fp2->f_type != DTYPE_VNODE)
		return (3);
	return (kcmp_cmp((uintptr_t)fp1->f_vnode, (uintptr_t)fp2->f_vnode));
}
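
/*
 * An illustrative sketch of the pattern vn_lktype_write() is meant for,
 * compiled out and mirroring the write paths earlier in this file:
 * suspension-aware write bracketing with a lock type that honors
 * MNT_SHARED_WRITES.  example_write_locked() is hypothetical and elides the
 * actual write-side work.
 */
#if 0
static int
example_write_locked(struct vnode *vp)
{
	struct mount *mp;
	int error;

	error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
	if (error != 0)
		return (error);
	vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);

	/* ... perform the write-side VOPs ... */

	VOP_UNLOCK(vp);
	vn_finished_write(mp);
	return (0);
}
#endif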