1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org> 13 * Copyright (c) 2013, 2014 The FreeBSD Foundation 14 * 15 * Portions of this software were developed by Konstantin Belousov 16 * under sponsorship from the FreeBSD Foundation. 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 1. Redistributions of source code must retain the above copyright 22 * notice, this list of conditions and the following disclaimer. 23 * 2. Redistributions in binary form must reproduce the above copyright 24 * notice, this list of conditions and the following disclaimer in the 25 * documentation and/or other materials provided with the distribution. 26 * 3. Neither the name of the University nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 40 * SUCH DAMAGE. 
41 */ 42 43 #include <sys/cdefs.h> 44 #include "opt_hwpmc_hooks.h" 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/disk.h> 49 #include <sys/fail.h> 50 #include <sys/fcntl.h> 51 #include <sys/file.h> 52 #include <sys/kdb.h> 53 #include <sys/ktr.h> 54 #include <sys/stat.h> 55 #include <sys/priv.h> 56 #include <sys/proc.h> 57 #include <sys/limits.h> 58 #include <sys/lock.h> 59 #include <sys/mman.h> 60 #include <sys/mount.h> 61 #include <sys/mutex.h> 62 #include <sys/namei.h> 63 #include <sys/vnode.h> 64 #include <sys/dirent.h> 65 #include <sys/bio.h> 66 #include <sys/buf.h> 67 #include <sys/filio.h> 68 #include <sys/resourcevar.h> 69 #include <sys/rwlock.h> 70 #include <sys/prng.h> 71 #include <sys/sx.h> 72 #include <sys/sleepqueue.h> 73 #include <sys/sysctl.h> 74 #include <sys/ttycom.h> 75 #include <sys/conf.h> 76 #include <sys/syslog.h> 77 #include <sys/unistd.h> 78 #include <sys/user.h> 79 #include <sys/ktrace.h> 80 81 #include <security/audit/audit.h> 82 #include <security/mac/mac_framework.h> 83 84 #include <vm/vm.h> 85 #include <vm/vm_extern.h> 86 #include <vm/pmap.h> 87 #include <vm/vm_map.h> 88 #include <vm/vm_object.h> 89 #include <vm/vm_page.h> 90 #include <vm/vm_pager.h> 91 #include <vm/vnode_pager.h> 92 93 #ifdef HWPMC_HOOKS 94 #include <sys/pmckern.h> 95 #endif 96 97 static fo_rdwr_t vn_read; 98 static fo_rdwr_t vn_write; 99 static fo_rdwr_t vn_io_fault; 100 static fo_truncate_t vn_truncate; 101 static fo_ioctl_t vn_ioctl; 102 static fo_poll_t vn_poll; 103 static fo_kqfilter_t vn_kqfilter; 104 static fo_close_t vn_closefile; 105 static fo_mmap_t vn_mmap; 106 static fo_fallocate_t vn_fallocate; 107 static fo_fspacectl_t vn_fspacectl; 108 109 const struct fileops vnops = { 110 .fo_read = vn_io_fault, 111 .fo_write = vn_io_fault, 112 .fo_truncate = vn_truncate, 113 .fo_ioctl = vn_ioctl, 114 .fo_poll = vn_poll, 115 .fo_kqfilter = vn_kqfilter, 116 .fo_stat = vn_statfile, 117 .fo_close = vn_closefile, 118 .fo_chmod = vn_chmod, 119 .fo_chown = vn_chown, 120 .fo_sendfile = vn_sendfile, 121 .fo_seek = vn_seek, 122 .fo_fill_kinfo = vn_fill_kinfo, 123 .fo_mmap = vn_mmap, 124 .fo_fallocate = vn_fallocate, 125 .fo_fspacectl = vn_fspacectl, 126 .fo_cmp = vn_cmp, 127 .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE 128 }; 129 130 const u_int io_hold_cnt = 16; 131 static int vn_io_fault_enable = 1; 132 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN, 133 &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance"); 134 static int vn_io_fault_prefault = 0; 135 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN, 136 &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting"); 137 static int vn_io_pgcache_read_enable = 1; 138 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN, 139 &vn_io_pgcache_read_enable, 0, 140 "Enable copying from page cache for reads, avoiding fs"); 141 static u_long vn_io_faults_cnt; 142 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD, 143 &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers"); 144 145 static int vfs_allow_read_dir = 0; 146 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW, 147 &vfs_allow_read_dir, 0, 148 "Enable read(2) of directory by root for filesystems that support it"); 149 150 /* 151 * Returns true if vn_io_fault mode of handling the i/o request should 152 * be used. 
153 */ 154 static bool 155 do_vn_io_fault(struct vnode *vp, struct uio *uio) 156 { 157 struct mount *mp; 158 159 return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG && 160 (mp = vp->v_mount) != NULL && 161 (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable); 162 } 163 164 /* 165 * Structure used to pass arguments to vn_io_fault1(), to do either 166 * file- or vnode-based I/O calls. 167 */ 168 struct vn_io_fault_args { 169 enum { 170 VN_IO_FAULT_FOP, 171 VN_IO_FAULT_VOP 172 } kind; 173 struct ucred *cred; 174 int flags; 175 union { 176 struct fop_args_tag { 177 struct file *fp; 178 fo_rdwr_t *doio; 179 } fop_args; 180 struct vop_args_tag { 181 struct vnode *vp; 182 } vop_args; 183 } args; 184 }; 185 186 static int vn_io_fault1(struct vnode *vp, struct uio *uio, 187 struct vn_io_fault_args *args, struct thread *td); 188 189 int 190 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp) 191 { 192 struct thread *td = curthread; 193 194 return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp)); 195 } 196 197 static uint64_t 198 open2nameif(int fmode, u_int vn_open_flags) 199 { 200 uint64_t res; 201 202 res = ISOPEN | LOCKLEAF; 203 if ((fmode & O_RESOLVE_BENEATH) != 0) 204 res |= RBENEATH; 205 if ((fmode & O_EMPTY_PATH) != 0) 206 res |= EMPTYPATH; 207 if ((fmode & FREAD) != 0) 208 res |= OPENREAD; 209 if ((fmode & FWRITE) != 0) 210 res |= OPENWRITE; 211 if ((fmode & O_NAMEDATTR) != 0) { 212 res |= OPENNAMED; 213 if ((fmode & O_CREAT) != 0) 214 res |= CREATENAMED; 215 } 216 if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0) 217 res |= AUDITVNODE1; 218 if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0) 219 res |= NOCAPCHECK; 220 if ((vn_open_flags & VN_OPEN_WANTIOCTLCAPS) != 0) 221 res |= WANTIOCTLCAPS; 222 return (res); 223 } 224 225 /* 226 * Common code for vnode open operations via a name lookup. 227 * Lookup the vnode and invoke VOP_CREATE if needed. 228 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. 229 * 230 * Note that this does NOT free nameidata for the successful case, 231 * due to the NDINIT being done elsewhere. 232 */ 233 int 234 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, 235 struct ucred *cred, struct file *fp) 236 { 237 struct vnode *vp; 238 struct mount *mp; 239 struct vattr vat; 240 struct vattr *vap = &vat; 241 int fmode, error; 242 bool first_open; 243 244 restart: 245 first_open = false; 246 fmode = *flagp; 247 if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT | 248 O_EXCL | O_DIRECTORY) || 249 (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH)) 250 return (EINVAL); 251 else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) { 252 ndp->ni_cnd.cn_nameiop = CREATE; 253 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags); 254 /* 255 * Set NOCACHE to avoid flushing the cache when 256 * rolling in many files at once. 257 * 258 * Set NC_KEEPPOSENTRY to keep positive entries if they already 259 * exist despite NOCACHE. 
260 */ 261 ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY; 262 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) 263 ndp->ni_cnd.cn_flags |= FOLLOW; 264 if ((vn_open_flags & VN_OPEN_INVFS) == 0) 265 bwillwrite(); 266 if ((error = namei(ndp)) != 0) 267 return (error); 268 if (ndp->ni_vp == NULL) { 269 if ((fmode & O_NAMEDATTR) != 0) { 270 if ((ndp->ni_dvp->v_mount->mnt_flag & 271 MNT_NAMEDATTR) == 0) 272 error = EINVAL; 273 else if ((vn_irflag_read(ndp->ni_dvp) & 274 VIRF_NAMEDDIR) == 0) 275 error = ENOENT; 276 if (error != 0) { 277 vp = ndp->ni_dvp; 278 ndp->ni_dvp = NULL; 279 goto bad; 280 } 281 } 282 VATTR_NULL(vap); 283 vap->va_type = VREG; 284 vap->va_mode = cmode; 285 if (fmode & O_EXCL) 286 vap->va_vaflags |= VA_EXCLUSIVE; 287 if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { 288 NDFREE_PNBUF(ndp); 289 vput(ndp->ni_dvp); 290 if ((error = vn_start_write(NULL, &mp, 291 V_XSLEEP | V_PCATCH)) != 0) 292 return (error); 293 NDREINIT(ndp); 294 goto restart; 295 } 296 if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) 297 ndp->ni_cnd.cn_flags |= MAKEENTRY; 298 #ifdef MAC 299 error = mac_vnode_check_create(cred, ndp->ni_dvp, 300 &ndp->ni_cnd, vap); 301 if (error == 0) 302 #endif 303 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, 304 &ndp->ni_cnd, vap); 305 vp = ndp->ni_vp; 306 if (error == 0 && (fmode & O_EXCL) != 0 && 307 (fmode & (O_EXLOCK | O_SHLOCK)) != 0) { 308 VI_LOCK(vp); 309 vp->v_iflag |= VI_FOPENING; 310 VI_UNLOCK(vp); 311 first_open = true; 312 } 313 VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL, 314 false); 315 vn_finished_write(mp); 316 if (error) { 317 NDFREE_PNBUF(ndp); 318 if (error == ERELOOKUP) { 319 NDREINIT(ndp); 320 goto restart; 321 } 322 return (error); 323 } 324 fmode &= ~O_TRUNC; 325 } else { 326 if (ndp->ni_dvp == ndp->ni_vp) 327 vrele(ndp->ni_dvp); 328 else 329 vput(ndp->ni_dvp); 330 ndp->ni_dvp = NULL; 331 vp = ndp->ni_vp; 332 if (fmode & O_EXCL) { 333 error = EEXIST; 334 goto bad; 335 } 336 if ((fmode & O_NAMEDATTR) != 0) { 337 short irflag; 338 339 irflag = vn_irflag_read(vp); 340 if ((vp->v_mount->mnt_flag & 341 MNT_NAMEDATTR) == 0 || 342 ((irflag & VIRF_NAMEDATTR) != 0 && 343 vp->v_type != VREG)) 344 error = EINVAL; 345 else if ((irflag & (VIRF_NAMEDDIR | 346 VIRF_NAMEDATTR)) == 0) 347 error = ENOATTR; 348 if (error != 0) 349 goto bad; 350 } else if (vp->v_type == VDIR) { 351 error = EISDIR; 352 goto bad; 353 } 354 fmode &= ~O_CREAT; 355 } 356 } else { 357 ndp->ni_cnd.cn_nameiop = LOOKUP; 358 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags); 359 ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? 
NOFOLLOW : 360 FOLLOW; 361 if ((fmode & FWRITE) == 0) 362 ndp->ni_cnd.cn_flags |= LOCKSHARED; 363 if ((error = namei(ndp)) != 0) 364 return (error); 365 vp = ndp->ni_vp; 366 if ((fmode & O_NAMEDATTR) != 0 && (vp->v_mount->mnt_flag & 367 MNT_NAMEDATTR) == 0) { 368 error = EINVAL; 369 goto bad; 370 } 371 } 372 error = vn_open_vnode(vp, fmode, cred, curthread, fp); 373 if (first_open) { 374 VI_LOCK(vp); 375 vp->v_iflag &= ~VI_FOPENING; 376 wakeup(vp); 377 VI_UNLOCK(vp); 378 } 379 if (error) 380 goto bad; 381 *flagp = fmode; 382 return (0); 383 bad: 384 NDFREE_PNBUF(ndp); 385 vput(vp); 386 *flagp = fmode; 387 ndp->ni_vp = NULL; 388 return (error); 389 } 390 391 static int 392 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp) 393 { 394 struct flock lf; 395 int error, lock_flags, type; 396 397 ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock"); 398 if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0) 399 return (0); 400 KASSERT(fp != NULL, ("open with flock requires fp")); 401 if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) 402 return (EOPNOTSUPP); 403 404 lock_flags = VOP_ISLOCKED(vp); 405 VOP_UNLOCK(vp); 406 407 lf.l_whence = SEEK_SET; 408 lf.l_start = 0; 409 lf.l_len = 0; 410 lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK; 411 type = F_FLOCK; 412 if ((fmode & FNONBLOCK) == 0) 413 type |= F_WAIT; 414 if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) 415 type |= F_FIRSTOPEN; 416 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type); 417 if (error == 0) 418 fp->f_flag |= FHASLOCK; 419 420 vn_lock(vp, lock_flags | LK_RETRY); 421 return (error); 422 } 423 424 /* 425 * Common code for vnode open operations once a vnode is located. 426 * Check permissions, and call the VOP_OPEN routine. 427 */ 428 int 429 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, 430 struct thread *td, struct file *fp) 431 { 432 accmode_t accmode; 433 int error; 434 435 KASSERT((fmode & O_PATH) == 0 || (fmode & O_ACCMODE) == 0, 436 ("%s: O_PATH and O_ACCMODE are mutually exclusive", __func__)); 437 438 if (vp->v_type == VLNK) { 439 if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0) 440 return (EMLINK); 441 } 442 if (vp->v_type != VDIR && fmode & O_DIRECTORY) 443 return (ENOTDIR); 444 445 accmode = 0; 446 if ((fmode & O_PATH) == 0) { 447 if (vp->v_type == VSOCK) 448 return (EOPNOTSUPP); 449 if ((fmode & (FWRITE | O_TRUNC)) != 0) { 450 if (vp->v_type == VDIR) 451 return (EISDIR); 452 accmode |= VWRITE; 453 } 454 if ((fmode & FREAD) != 0) 455 accmode |= VREAD; 456 if ((fmode & O_APPEND) && (fmode & FWRITE)) 457 accmode |= VAPPEND; 458 #ifdef MAC 459 if ((fmode & O_CREAT) != 0) 460 accmode |= VCREAT; 461 #endif 462 } 463 if ((fmode & FEXEC) != 0) 464 accmode |= VEXEC; 465 #ifdef MAC 466 if ((fmode & O_VERIFY) != 0) 467 accmode |= VVERIFY; 468 error = mac_vnode_check_open(cred, vp, accmode); 469 if (error != 0) 470 return (error); 471 472 accmode &= ~(VCREAT | VVERIFY); 473 #endif 474 if ((fmode & O_CREAT) == 0 && accmode != 0) { 475 error = VOP_ACCESS(vp, accmode, cred, td); 476 if (error != 0) 477 return (error); 478 } 479 if ((fmode & O_PATH) != 0) { 480 if (vp->v_type != VFIFO && vp->v_type != VSOCK && 481 VOP_ACCESS(vp, VREAD, cred, td) == 0) 482 fp->f_flag |= FKQALLOWED; 483 return (0); 484 } 485 486 if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 487 vn_lock(vp, LK_UPGRADE | LK_RETRY); 488 error = VOP_OPEN(vp, fmode, cred, td, fp); 489 if (error != 0) 490 return (error); 491 492 error = vn_open_vnode_advlock(vp, fmode, fp); 493 if (error == 0 && (fmode & FWRITE) != 
0) { 494 error = VOP_ADD_WRITECOUNT(vp, 1); 495 if (error == 0) { 496 CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", 497 __func__, vp, vp->v_writecount); 498 } 499 } 500 501 /* 502 * Error from advlock or VOP_ADD_WRITECOUNT() still requires 503 * calling VOP_CLOSE() to pair with earlier VOP_OPEN(). 504 */ 505 if (error != 0) { 506 if (fp != NULL) { 507 /* 508 * Arrange the call by having fdrop() to use 509 * vn_closefile(). This is to satisfy 510 * filesystems like devfs or tmpfs, which 511 * override fo_close(). 512 */ 513 fp->f_flag |= FOPENFAILED; 514 fp->f_vnode = vp; 515 if (fp->f_ops == &badfileops) { 516 fp->f_type = DTYPE_VNODE; 517 fp->f_ops = &vnops; 518 } 519 vref(vp); 520 } else { 521 /* 522 * If there is no fp, due to kernel-mode open, 523 * we can call VOP_CLOSE() now. 524 */ 525 if ((vp->v_type == VFIFO || 526 !MNT_EXTENDED_SHARED(vp->v_mount)) && 527 VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 528 vn_lock(vp, LK_UPGRADE | LK_RETRY); 529 (void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC), 530 cred, td); 531 } 532 } 533 534 ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); 535 return (error); 536 537 } 538 539 /* 540 * Check for write permissions on the specified vnode. 541 * Prototype text segments cannot be written. 542 * It is racy. 543 */ 544 int 545 vn_writechk(struct vnode *vp) 546 { 547 548 ASSERT_VOP_LOCKED(vp, "vn_writechk"); 549 /* 550 * If there's shared text associated with 551 * the vnode, try to free it up once. If 552 * we fail, we can't allow writing. 553 */ 554 if (VOP_IS_TEXT(vp)) 555 return (ETXTBSY); 556 557 return (0); 558 } 559 560 /* 561 * Vnode close call 562 */ 563 static int 564 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred, 565 struct thread *td, bool keep_ref) 566 { 567 struct mount *mp; 568 int error, lock_flags; 569 570 lock_flags = vp->v_type != VFIFO && MNT_EXTENDED_SHARED(vp->v_mount) ? 571 LK_SHARED : LK_EXCLUSIVE; 572 573 vn_start_write(vp, &mp, V_WAIT); 574 vn_lock(vp, lock_flags | LK_RETRY); 575 AUDIT_ARG_VNODE1(vp); 576 if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) { 577 VOP_ADD_WRITECOUNT_CHECKED(vp, -1); 578 CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", 579 __func__, vp, vp->v_writecount); 580 } 581 error = VOP_CLOSE(vp, flags, file_cred, td); 582 if (keep_ref) 583 VOP_UNLOCK(vp); 584 else 585 vput(vp); 586 vn_finished_write(mp); 587 return (error); 588 } 589 590 int 591 vn_close(struct vnode *vp, int flags, struct ucred *file_cred, 592 struct thread *td) 593 { 594 595 return (vn_close1(vp, flags, file_cred, td, false)); 596 } 597 598 /* 599 * Heuristic to detect sequential operation. 600 */ 601 static int 602 sequential_heuristic(struct uio *uio, struct file *fp) 603 { 604 enum uio_rw rw; 605 606 ASSERT_VOP_LOCKED(fp->f_vnode, __func__); 607 608 rw = uio->uio_rw; 609 if (fp->f_flag & FRDAHEAD) 610 return (fp->f_seqcount[rw] << IO_SEQSHIFT); 611 612 /* 613 * Offset 0 is handled specially. open() sets f_seqcount to 1 so 614 * that the first I/O is normally considered to be slightly 615 * sequential. Seeking to offset 0 doesn't change sequentiality 616 * unless previous seeks have reduced f_seqcount to 0, in which 617 * case offset 0 is not special. 618 */ 619 if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) || 620 uio->uio_offset == fp->f_nextoff[rw]) { 621 /* 622 * f_seqcount is in units of fixed-size blocks so that it 623 * depends mainly on the amount of sequential I/O and not 624 * much on the number of sequential I/O's. 
The fixed size 625 * of 16384 is hard-coded here since it is (not quite) just 626 * a magic size that works well here. This size is more 627 * closely related to the best I/O size for real disks than 628 * to any block size used by software. 629 */ 630 if (uio->uio_resid >= IO_SEQMAX * 16384) 631 fp->f_seqcount[rw] = IO_SEQMAX; 632 else { 633 fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384); 634 if (fp->f_seqcount[rw] > IO_SEQMAX) 635 fp->f_seqcount[rw] = IO_SEQMAX; 636 } 637 return (fp->f_seqcount[rw] << IO_SEQSHIFT); 638 } 639 640 /* Not sequential. Quickly draw-down sequentiality. */ 641 if (fp->f_seqcount[rw] > 1) 642 fp->f_seqcount[rw] = 1; 643 else 644 fp->f_seqcount[rw] = 0; 645 return (0); 646 } 647 648 /* 649 * Package up an I/O request on a vnode into a uio and do it. 650 */ 651 int 652 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, 653 enum uio_seg segflg, int ioflg, struct ucred *active_cred, 654 struct ucred *file_cred, ssize_t *aresid, struct thread *td) 655 { 656 struct uio auio; 657 struct iovec aiov; 658 struct mount *mp; 659 struct ucred *cred; 660 void *rl_cookie; 661 struct vn_io_fault_args args; 662 int error, lock_flags; 663 664 if (offset < 0 && vp->v_type != VCHR) 665 return (EINVAL); 666 auio.uio_iov = &aiov; 667 auio.uio_iovcnt = 1; 668 aiov.iov_base = base; 669 aiov.iov_len = len; 670 auio.uio_resid = len; 671 auio.uio_offset = offset; 672 auio.uio_segflg = segflg; 673 auio.uio_rw = rw; 674 auio.uio_td = td; 675 error = 0; 676 677 if ((ioflg & IO_NODELOCKED) == 0) { 678 if ((ioflg & IO_RANGELOCKED) == 0) { 679 if (rw == UIO_READ) { 680 rl_cookie = vn_rangelock_rlock(vp, offset, 681 offset + len); 682 } else if ((ioflg & IO_APPEND) != 0) { 683 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); 684 } else { 685 rl_cookie = vn_rangelock_wlock(vp, offset, 686 offset + len); 687 } 688 } else 689 rl_cookie = NULL; 690 mp = NULL; 691 if (rw == UIO_WRITE) { 692 if (vp->v_type != VCHR && 693 (error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) 694 != 0) 695 goto out; 696 lock_flags = vn_lktype_write(mp, vp); 697 } else 698 lock_flags = LK_SHARED; 699 vn_lock(vp, lock_flags | LK_RETRY); 700 } else 701 rl_cookie = NULL; 702 703 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); 704 #ifdef MAC 705 if ((ioflg & IO_NOMACCHECK) == 0) { 706 if (rw == UIO_READ) 707 error = mac_vnode_check_read(active_cred, file_cred, 708 vp); 709 else 710 error = mac_vnode_check_write(active_cred, file_cred, 711 vp); 712 } 713 #endif 714 if (error == 0) { 715 if (file_cred != NULL) 716 cred = file_cred; 717 else 718 cred = active_cred; 719 if (do_vn_io_fault(vp, &auio)) { 720 args.kind = VN_IO_FAULT_VOP; 721 args.cred = cred; 722 args.flags = ioflg; 723 args.args.vop_args.vp = vp; 724 error = vn_io_fault1(vp, &auio, &args, td); 725 } else if (rw == UIO_READ) { 726 error = VOP_READ(vp, &auio, ioflg, cred); 727 } else /* if (rw == UIO_WRITE) */ { 728 error = VOP_WRITE(vp, &auio, ioflg, cred); 729 } 730 } 731 if (aresid) 732 *aresid = auio.uio_resid; 733 else 734 if (auio.uio_resid && error == 0) 735 error = EIO; 736 if ((ioflg & IO_NODELOCKED) == 0) { 737 VOP_UNLOCK(vp); 738 if (mp != NULL) 739 vn_finished_write(mp); 740 } 741 out: 742 if (rl_cookie != NULL) 743 vn_rangelock_unlock(vp, rl_cookie); 744 return (error); 745 } 746 747 /* 748 * Package up an I/O request on a vnode into a uio and do it. 
The I/O 749 * request is split up into smaller chunks and we try to avoid saturating 750 * the buffer cache while potentially holding a vnode locked, so we 751 * check bwillwrite() before calling vn_rdwr(). We also call kern_yield() 752 * to give other processes a chance to lock the vnode (either other processes 753 * core'ing the same binary, or unrelated processes scanning the directory). 754 */ 755 int 756 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, 757 off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, 758 struct ucred *file_cred, size_t *aresid, struct thread *td) 759 { 760 int error = 0; 761 ssize_t iaresid; 762 763 do { 764 int chunk; 765 766 /* 767 * Force `offset' to a multiple of MAXBSIZE except possibly 768 * for the first chunk, so that filesystems only need to 769 * write full blocks except possibly for the first and last 770 * chunks. 771 */ 772 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; 773 774 if (chunk > len) 775 chunk = len; 776 if (rw != UIO_READ && vp->v_type == VREG) 777 bwillwrite(); 778 iaresid = 0; 779 error = vn_rdwr(rw, vp, base, chunk, offset, segflg, 780 ioflg, active_cred, file_cred, &iaresid, td); 781 len -= chunk; /* aresid calc already includes length */ 782 if (error) 783 break; 784 offset += chunk; 785 base = (char *)base + chunk; 786 kern_yield(PRI_USER); 787 } while (len); 788 if (aresid) 789 *aresid = len + iaresid; 790 return (error); 791 } 792 793 #if OFF_MAX <= LONG_MAX 794 off_t 795 foffset_lock(struct file *fp, int flags) 796 { 797 volatile short *flagsp; 798 off_t res; 799 short state; 800 801 KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); 802 803 if ((flags & FOF_NOLOCK) != 0) 804 return (atomic_load_long(&fp->f_offset)); 805 806 /* 807 * According to McKusick the vn lock was protecting f_offset here. 808 * It is now protected by the FOFFSET_LOCKED flag. 
809 */ 810 flagsp = &fp->f_vnread_flags; 811 if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED)) 812 return (atomic_load_long(&fp->f_offset)); 813 814 sleepq_lock(&fp->f_vnread_flags); 815 state = atomic_load_16(flagsp); 816 for (;;) { 817 if ((state & FOFFSET_LOCKED) == 0) { 818 if (!atomic_fcmpset_acq_16(flagsp, &state, 819 FOFFSET_LOCKED)) 820 continue; 821 break; 822 } 823 if ((state & FOFFSET_LOCK_WAITING) == 0) { 824 if (!atomic_fcmpset_acq_16(flagsp, &state, 825 state | FOFFSET_LOCK_WAITING)) 826 continue; 827 } 828 DROP_GIANT(); 829 sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0); 830 sleepq_wait(&fp->f_vnread_flags, PUSER -1); 831 PICKUP_GIANT(); 832 sleepq_lock(&fp->f_vnread_flags); 833 state = atomic_load_16(flagsp); 834 } 835 res = atomic_load_long(&fp->f_offset); 836 sleepq_release(&fp->f_vnread_flags); 837 return (res); 838 } 839 840 void 841 foffset_unlock(struct file *fp, off_t val, int flags) 842 { 843 volatile short *flagsp; 844 short state; 845 846 KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); 847 848 if ((flags & FOF_NOUPDATE) == 0) 849 atomic_store_long(&fp->f_offset, val); 850 if ((flags & FOF_NEXTOFF_R) != 0) 851 fp->f_nextoff[UIO_READ] = val; 852 if ((flags & FOF_NEXTOFF_W) != 0) 853 fp->f_nextoff[UIO_WRITE] = val; 854 855 if ((flags & FOF_NOLOCK) != 0) 856 return; 857 858 flagsp = &fp->f_vnread_flags; 859 state = atomic_load_16(flagsp); 860 if ((state & FOFFSET_LOCK_WAITING) == 0 && 861 atomic_cmpset_rel_16(flagsp, state, 0)) 862 return; 863 864 sleepq_lock(&fp->f_vnread_flags); 865 MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0); 866 MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0); 867 fp->f_vnread_flags = 0; 868 sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0); 869 sleepq_release(&fp->f_vnread_flags); 870 } 871 872 static off_t 873 foffset_read(struct file *fp) 874 { 875 876 return (atomic_load_long(&fp->f_offset)); 877 } 878 #else 879 off_t 880 foffset_lock(struct file *fp, int flags) 881 { 882 struct mtx *mtxp; 883 off_t res; 884 885 KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); 886 887 mtxp = mtx_pool_find(mtxpool_sleep, fp); 888 mtx_lock(mtxp); 889 if ((flags & FOF_NOLOCK) == 0) { 890 while (fp->f_vnread_flags & FOFFSET_LOCKED) { 891 fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; 892 msleep(&fp->f_vnread_flags, mtxp, PUSER -1, 893 "vofflock", 0); 894 } 895 fp->f_vnread_flags |= FOFFSET_LOCKED; 896 } 897 res = fp->f_offset; 898 mtx_unlock(mtxp); 899 return (res); 900 } 901 902 void 903 foffset_unlock(struct file *fp, off_t val, int flags) 904 { 905 struct mtx *mtxp; 906 907 KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); 908 909 mtxp = mtx_pool_find(mtxpool_sleep, fp); 910 mtx_lock(mtxp); 911 if ((flags & FOF_NOUPDATE) == 0) 912 fp->f_offset = val; 913 if ((flags & FOF_NEXTOFF_R) != 0) 914 fp->f_nextoff[UIO_READ] = val; 915 if ((flags & FOF_NEXTOFF_W) != 0) 916 fp->f_nextoff[UIO_WRITE] = val; 917 if ((flags & FOF_NOLOCK) == 0) { 918 KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, 919 ("Lost FOFFSET_LOCKED")); 920 if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) 921 wakeup(&fp->f_vnread_flags); 922 fp->f_vnread_flags = 0; 923 } 924 mtx_unlock(mtxp); 925 } 926 927 static off_t 928 foffset_read(struct file *fp) 929 { 930 931 return (foffset_lock(fp, FOF_NOLOCK)); 932 } 933 #endif 934 935 void 936 foffset_lock_pair(struct file *fp1, off_t *off1p, struct file *fp2, off_t *off2p, 937 int flags) 938 { 939 KASSERT(fp1 != fp2, ("foffset_lock_pair: fp1 == fp2")); 940 941 /* Lock in a consistent order to 
avoid deadlock. */ 942 if ((uintptr_t)fp1 > (uintptr_t)fp2) { 943 struct file *tmpfp; 944 off_t *tmpoffp; 945 946 tmpfp = fp1, fp1 = fp2, fp2 = tmpfp; 947 tmpoffp = off1p, off1p = off2p, off2p = tmpoffp; 948 } 949 if (fp1 != NULL) 950 *off1p = foffset_lock(fp1, flags); 951 if (fp2 != NULL) 952 *off2p = foffset_lock(fp2, flags); 953 } 954 955 void 956 foffset_lock_uio(struct file *fp, struct uio *uio, int flags) 957 { 958 959 if ((flags & FOF_OFFSET) == 0) 960 uio->uio_offset = foffset_lock(fp, flags); 961 } 962 963 void 964 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags) 965 { 966 967 if ((flags & FOF_OFFSET) == 0) 968 foffset_unlock(fp, uio->uio_offset, flags); 969 } 970 971 static int 972 get_advice(struct file *fp, struct uio *uio) 973 { 974 struct mtx *mtxp; 975 int ret; 976 977 ret = POSIX_FADV_NORMAL; 978 if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG) 979 return (ret); 980 981 mtxp = mtx_pool_find(mtxpool_sleep, fp); 982 mtx_lock(mtxp); 983 if (fp->f_advice != NULL && 984 uio->uio_offset >= fp->f_advice->fa_start && 985 uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) 986 ret = fp->f_advice->fa_advice; 987 mtx_unlock(mtxp); 988 return (ret); 989 } 990 991 static int 992 get_write_ioflag(struct file *fp) 993 { 994 int ioflag; 995 struct mount *mp; 996 struct vnode *vp; 997 998 ioflag = 0; 999 vp = fp->f_vnode; 1000 mp = atomic_load_ptr(&vp->v_mount); 1001 1002 if ((fp->f_flag & O_DIRECT) != 0) 1003 ioflag |= IO_DIRECT; 1004 1005 if ((fp->f_flag & O_FSYNC) != 0 || 1006 (mp != NULL && (mp->mnt_flag & MNT_SYNCHRONOUS) != 0)) 1007 ioflag |= IO_SYNC; 1008 1009 /* 1010 * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE() 1011 * or VOP_DEALLOCATE() implementations that don't understand IO_DATASYNC 1012 * fall back to full O_SYNC behavior. 1013 */ 1014 if ((fp->f_flag & O_DSYNC) != 0) 1015 ioflag |= IO_SYNC | IO_DATASYNC; 1016 1017 return (ioflag); 1018 } 1019 1020 int 1021 vn_read_from_obj(struct vnode *vp, struct uio *uio) 1022 { 1023 vm_object_t obj; 1024 vm_page_t ma[io_hold_cnt + 2]; 1025 off_t off, vsz; 1026 ssize_t resid; 1027 int error, i, j; 1028 1029 MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2)); 1030 obj = atomic_load_ptr(&vp->v_object); 1031 if (obj == NULL) 1032 return (EJUSTRETURN); 1033 1034 /* 1035 * Depends on type stability of vm_objects. 1036 */ 1037 vm_object_pip_add(obj, 1); 1038 if ((obj->flags & OBJ_DEAD) != 0) { 1039 /* 1040 * Note that object might be already reused from the 1041 * vnode, and the OBJ_DEAD flag cleared. This is fine, 1042 * we recheck for DOOMED vnode state after all pages 1043 * are busied, and retract then. 1044 * 1045 * But we check for OBJ_DEAD to ensure that we do not 1046 * busy pages while vm_object_terminate_pages() 1047 * processes the queue. 1048 */ 1049 error = EJUSTRETURN; 1050 goto out_pip; 1051 } 1052 1053 resid = uio->uio_resid; 1054 off = uio->uio_offset; 1055 for (i = 0; resid > 0; i++) { 1056 MPASS(i < io_hold_cnt + 2); 1057 ma[i] = vm_page_grab_unlocked(obj, atop(off), 1058 VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | 1059 VM_ALLOC_NOWAIT); 1060 if (ma[i] == NULL) 1061 break; 1062 1063 /* 1064 * Skip invalid pages. Valid mask can be partial only 1065 * at EOF, and we clip later. 1066 */ 1067 if (vm_page_none_valid(ma[i])) { 1068 vm_page_sunbusy(ma[i]); 1069 break; 1070 } 1071 1072 resid -= PAGE_SIZE; 1073 off += PAGE_SIZE; 1074 } 1075 if (i == 0) { 1076 error = EJUSTRETURN; 1077 goto out_pip; 1078 } 1079 1080 /* 1081 * Check VIRF_DOOMED after we busied our pages. 
Since 1082 * vgonel() terminates the vnode' vm_object, it cannot 1083 * process past pages busied by us. 1084 */ 1085 if (VN_IS_DOOMED(vp)) { 1086 error = EJUSTRETURN; 1087 goto out; 1088 } 1089 1090 resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1); 1091 if (resid > uio->uio_resid) 1092 resid = uio->uio_resid; 1093 1094 /* 1095 * Unlocked read of vnp_size is safe because truncation cannot 1096 * pass busied page. But we load vnp_size into a local 1097 * variable so that possible concurrent extension does not 1098 * break calculation. 1099 */ 1100 #if defined(__powerpc__) && !defined(__powerpc64__) 1101 vsz = obj->un_pager.vnp.vnp_size; 1102 #else 1103 vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size); 1104 #endif 1105 if (uio->uio_offset >= vsz) { 1106 error = EJUSTRETURN; 1107 goto out; 1108 } 1109 if (uio->uio_offset + resid > vsz) 1110 resid = vsz - uio->uio_offset; 1111 1112 error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio); 1113 1114 out: 1115 for (j = 0; j < i; j++) { 1116 if (error == 0) 1117 vm_page_reference(ma[j]); 1118 vm_page_sunbusy(ma[j]); 1119 } 1120 out_pip: 1121 vm_object_pip_wakeup(obj); 1122 if (error != 0) 1123 return (error); 1124 return (uio->uio_resid == 0 ? 0 : EJUSTRETURN); 1125 } 1126 1127 /* 1128 * File table vnode read routine. 1129 */ 1130 static int 1131 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, 1132 struct thread *td) 1133 { 1134 struct vnode *vp; 1135 off_t orig_offset; 1136 int error, ioflag; 1137 int advice; 1138 1139 KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", 1140 uio->uio_td, td)); 1141 KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); 1142 vp = fp->f_vnode; 1143 ioflag = 0; 1144 if (fp->f_flag & FNONBLOCK) 1145 ioflag |= IO_NDELAY; 1146 if (fp->f_flag & O_DIRECT) 1147 ioflag |= IO_DIRECT; 1148 1149 /* 1150 * Try to read from page cache. VIRF_DOOMED check is racy but 1151 * allows us to avoid unneeded work outright. 1152 */ 1153 if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() && 1154 (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) { 1155 error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred); 1156 if (error == 0) { 1157 fp->f_nextoff[UIO_READ] = uio->uio_offset; 1158 return (0); 1159 } 1160 if (error != EJUSTRETURN) 1161 return (error); 1162 } 1163 1164 advice = get_advice(fp, uio); 1165 vn_lock(vp, LK_SHARED | LK_RETRY); 1166 1167 switch (advice) { 1168 case POSIX_FADV_NORMAL: 1169 case POSIX_FADV_SEQUENTIAL: 1170 case POSIX_FADV_NOREUSE: 1171 ioflag |= sequential_heuristic(uio, fp); 1172 break; 1173 case POSIX_FADV_RANDOM: 1174 /* Disable read-ahead for random I/O. */ 1175 break; 1176 } 1177 orig_offset = uio->uio_offset; 1178 1179 #ifdef MAC 1180 error = mac_vnode_check_read(active_cred, fp->f_cred, vp); 1181 if (error == 0) 1182 #endif 1183 error = VOP_READ(vp, uio, ioflag, fp->f_cred); 1184 fp->f_nextoff[UIO_READ] = uio->uio_offset; 1185 VOP_UNLOCK(vp); 1186 if (error == 0 && advice == POSIX_FADV_NOREUSE && 1187 orig_offset != uio->uio_offset) 1188 /* 1189 * Use POSIX_FADV_DONTNEED to flush pages and buffers 1190 * for the backing file after a POSIX_FADV_NOREUSE 1191 * read(2). 1192 */ 1193 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, 1194 POSIX_FADV_DONTNEED); 1195 return (error); 1196 } 1197 1198 /* 1199 * File table vnode write routine. 
1200 */ 1201 static int 1202 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, 1203 struct thread *td) 1204 { 1205 struct vnode *vp; 1206 struct mount *mp; 1207 off_t orig_offset; 1208 int error, ioflag; 1209 int advice; 1210 bool need_finished_write; 1211 1212 KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", 1213 uio->uio_td, td)); 1214 KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); 1215 vp = fp->f_vnode; 1216 if (vp->v_type == VREG) 1217 bwillwrite(); 1218 ioflag = IO_UNIT; 1219 if (vp->v_type == VREG && (fp->f_flag & O_APPEND) != 0) 1220 ioflag |= IO_APPEND; 1221 if ((fp->f_flag & FNONBLOCK) != 0) 1222 ioflag |= IO_NDELAY; 1223 ioflag |= get_write_ioflag(fp); 1224 1225 mp = NULL; 1226 need_finished_write = false; 1227 if (vp->v_type != VCHR) { 1228 error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); 1229 if (error != 0) 1230 goto unlock; 1231 need_finished_write = true; 1232 } 1233 1234 advice = get_advice(fp, uio); 1235 1236 vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); 1237 switch (advice) { 1238 case POSIX_FADV_NORMAL: 1239 case POSIX_FADV_SEQUENTIAL: 1240 case POSIX_FADV_NOREUSE: 1241 ioflag |= sequential_heuristic(uio, fp); 1242 break; 1243 case POSIX_FADV_RANDOM: 1244 /* XXX: Is this correct? */ 1245 break; 1246 } 1247 orig_offset = uio->uio_offset; 1248 1249 #ifdef MAC 1250 error = mac_vnode_check_write(active_cred, fp->f_cred, vp); 1251 if (error == 0) 1252 #endif 1253 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); 1254 fp->f_nextoff[UIO_WRITE] = uio->uio_offset; 1255 VOP_UNLOCK(vp); 1256 if (need_finished_write) 1257 vn_finished_write(mp); 1258 if (error == 0 && advice == POSIX_FADV_NOREUSE && 1259 orig_offset != uio->uio_offset) 1260 /* 1261 * Use POSIX_FADV_DONTNEED to flush pages and buffers 1262 * for the backing file after a POSIX_FADV_NOREUSE 1263 * write(2). 1264 */ 1265 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, 1266 POSIX_FADV_DONTNEED); 1267 unlock: 1268 return (error); 1269 } 1270 1271 /* 1272 * The vn_io_fault() is a wrapper around vn_read() and vn_write() to 1273 * prevent the following deadlock: 1274 * 1275 * Assume that the thread A reads from the vnode vp1 into userspace 1276 * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is 1277 * currently not resident, then system ends up with the call chain 1278 * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> 1279 * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) 1280 * which establishes lock order vp1->vn_lock, then vp2->vn_lock. 1281 * If, at the same time, thread B reads from vnode vp2 into buffer buf2 1282 * backed by the pages of vnode vp1, and some page in buf2 is not 1283 * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. 1284 * 1285 * To prevent the lock order reversal and deadlock, vn_io_fault() does 1286 * not allow page faults to happen during VOP_READ() or VOP_WRITE(). 1287 * Instead, it first tries to do the whole range i/o with pagefaults 1288 * disabled. If all pages in the i/o buffer are resident and mapped, 1289 * VOP will succeed (ignoring the genuine filesystem errors). 1290 * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do 1291 * i/o in chunks, with all pages in the chunk prefaulted and held 1292 * using vm_fault_quick_hold_pages(). 1293 * 1294 * Filesystems using this deadlock avoidance scheme should use the 1295 * array of the held pages from uio, saved in the curthread->td_ma, 1296 * instead of doing uiomove(). 
A helper function 1297 * vn_io_fault_uiomove() converts uiomove request into 1298 * uiomove_fromphys() over td_ma array. 1299 * 1300 * Since vnode locks do not cover the whole i/o anymore, rangelocks 1301 * make the current i/o request atomic with respect to other i/os and 1302 * truncations. 1303 */ 1304 1305 /* 1306 * Decode vn_io_fault_args and perform the corresponding i/o. 1307 */ 1308 static int 1309 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, 1310 struct thread *td) 1311 { 1312 int error, save; 1313 1314 error = 0; 1315 save = vm_fault_disable_pagefaults(); 1316 switch (args->kind) { 1317 case VN_IO_FAULT_FOP: 1318 error = (args->args.fop_args.doio)(args->args.fop_args.fp, 1319 uio, args->cred, args->flags, td); 1320 break; 1321 case VN_IO_FAULT_VOP: 1322 switch (uio->uio_rw) { 1323 case UIO_READ: 1324 error = VOP_READ(args->args.vop_args.vp, uio, 1325 args->flags, args->cred); 1326 break; 1327 case UIO_WRITE: 1328 error = VOP_WRITE(args->args.vop_args.vp, uio, 1329 args->flags, args->cred); 1330 break; 1331 } 1332 break; 1333 default: 1334 panic("vn_io_fault_doio: unknown kind of io %d %d", 1335 args->kind, uio->uio_rw); 1336 } 1337 vm_fault_enable_pagefaults(save); 1338 return (error); 1339 } 1340 1341 static int 1342 vn_io_fault_touch(char *base, const struct uio *uio) 1343 { 1344 int r; 1345 1346 r = fubyte(base); 1347 if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1)) 1348 return (EFAULT); 1349 return (0); 1350 } 1351 1352 static int 1353 vn_io_fault_prefault_user(const struct uio *uio) 1354 { 1355 char *base; 1356 const struct iovec *iov; 1357 size_t len; 1358 ssize_t resid; 1359 int error, i; 1360 1361 KASSERT(uio->uio_segflg == UIO_USERSPACE, 1362 ("vn_io_fault_prefault userspace")); 1363 1364 error = i = 0; 1365 iov = uio->uio_iov; 1366 resid = uio->uio_resid; 1367 base = iov->iov_base; 1368 len = iov->iov_len; 1369 while (resid > 0) { 1370 error = vn_io_fault_touch(base, uio); 1371 if (error != 0) 1372 break; 1373 if (len < PAGE_SIZE) { 1374 if (len != 0) { 1375 error = vn_io_fault_touch(base + len - 1, uio); 1376 if (error != 0) 1377 break; 1378 resid -= len; 1379 } 1380 if (++i >= uio->uio_iovcnt) 1381 break; 1382 iov = uio->uio_iov + i; 1383 base = iov->iov_base; 1384 len = iov->iov_len; 1385 } else { 1386 len -= PAGE_SIZE; 1387 base += PAGE_SIZE; 1388 resid -= PAGE_SIZE; 1389 } 1390 } 1391 return (error); 1392 } 1393 1394 /* 1395 * Common code for vn_io_fault(), agnostic to the kind of i/o request. 1396 * Uses vn_io_fault_doio() to make the call to an actual i/o function. 1397 * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request 1398 * into args and call vn_io_fault1() to handle faults during the user 1399 * mode buffer accesses. 1400 */ 1401 static int 1402 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, 1403 struct thread *td) 1404 { 1405 vm_page_t ma[io_hold_cnt + 2]; 1406 struct uio *uio_clone, short_uio; 1407 struct iovec short_iovec[1]; 1408 vm_page_t *prev_td_ma; 1409 vm_prot_t prot; 1410 vm_offset_t addr, end; 1411 size_t len, resid; 1412 ssize_t adv; 1413 int error, cnt, saveheld, prev_td_ma_cnt; 1414 1415 if (vn_io_fault_prefault) { 1416 error = vn_io_fault_prefault_user(uio); 1417 if (error != 0) 1418 return (error); /* Or ignore ? */ 1419 } 1420 1421 prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; 1422 1423 /* 1424 * The UFS follows IO_UNIT directive and replays back both 1425 * uio_offset and uio_resid if an error is encountered during the 1426 * operation. 
But, since the iovec may be already advanced, 1427 * uio is still in an inconsistent state. 1428 * 1429 * Cache a copy of the original uio, which is advanced to the redo 1430 * point using UIO_NOCOPY below. 1431 */ 1432 uio_clone = cloneuio(uio); 1433 resid = uio->uio_resid; 1434 1435 short_uio.uio_segflg = UIO_USERSPACE; 1436 short_uio.uio_rw = uio->uio_rw; 1437 short_uio.uio_td = uio->uio_td; 1438 1439 error = vn_io_fault_doio(args, uio, td); 1440 if (error != EFAULT) 1441 goto out; 1442 1443 atomic_add_long(&vn_io_faults_cnt, 1); 1444 uio_clone->uio_segflg = UIO_NOCOPY; 1445 uiomove(NULL, resid - uio->uio_resid, uio_clone); 1446 uio_clone->uio_segflg = uio->uio_segflg; 1447 1448 saveheld = curthread_pflags_set(TDP_UIOHELD); 1449 prev_td_ma = td->td_ma; 1450 prev_td_ma_cnt = td->td_ma_cnt; 1451 1452 while (uio_clone->uio_resid != 0) { 1453 len = uio_clone->uio_iov->iov_len; 1454 if (len == 0) { 1455 KASSERT(uio_clone->uio_iovcnt >= 1, 1456 ("iovcnt underflow")); 1457 uio_clone->uio_iov++; 1458 uio_clone->uio_iovcnt--; 1459 continue; 1460 } 1461 if (len > ptoa(io_hold_cnt)) 1462 len = ptoa(io_hold_cnt); 1463 addr = (uintptr_t)uio_clone->uio_iov->iov_base; 1464 end = round_page(addr + len); 1465 if (end < addr) { 1466 error = EFAULT; 1467 break; 1468 } 1469 /* 1470 * A perfectly misaligned address and length could cause 1471 * both the start and the end of the chunk to use partial 1472 * page. +2 accounts for such a situation. 1473 */ 1474 cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, 1475 addr, len, prot, ma, io_hold_cnt + 2); 1476 if (cnt == -1) { 1477 error = EFAULT; 1478 break; 1479 } 1480 short_uio.uio_iov = &short_iovec[0]; 1481 short_iovec[0].iov_base = (void *)addr; 1482 short_uio.uio_iovcnt = 1; 1483 short_uio.uio_resid = short_iovec[0].iov_len = len; 1484 short_uio.uio_offset = uio_clone->uio_offset; 1485 td->td_ma = ma; 1486 td->td_ma_cnt = cnt; 1487 1488 error = vn_io_fault_doio(args, &short_uio, td); 1489 vm_page_unhold_pages(ma, cnt); 1490 adv = len - short_uio.uio_resid; 1491 1492 uio_clone->uio_iov->iov_base = 1493 (char *)uio_clone->uio_iov->iov_base + adv; 1494 uio_clone->uio_iov->iov_len -= adv; 1495 uio_clone->uio_resid -= adv; 1496 uio_clone->uio_offset += adv; 1497 1498 uio->uio_resid -= adv; 1499 uio->uio_offset += adv; 1500 1501 if (error != 0 || adv == 0) 1502 break; 1503 } 1504 td->td_ma = prev_td_ma; 1505 td->td_ma_cnt = prev_td_ma_cnt; 1506 curthread_pflags_restore(saveheld); 1507 out: 1508 freeuio(uio_clone); 1509 return (error); 1510 } 1511 1512 static int 1513 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, 1514 int flags, struct thread *td) 1515 { 1516 fo_rdwr_t *doio; 1517 struct vnode *vp; 1518 void *rl_cookie; 1519 struct vn_io_fault_args args; 1520 int error; 1521 bool do_io_fault, do_rangelock; 1522 1523 doio = uio->uio_rw == UIO_READ ? vn_read : vn_write; 1524 vp = fp->f_vnode; 1525 1526 /* 1527 * The ability to read(2) on a directory has historically been 1528 * allowed for all users, but this can and has been the source of 1529 * at least one security issue in the past. As such, it is now hidden 1530 * away behind a sysctl for those that actually need it to use it, and 1531 * restricted to root when it's turned on to make it relatively safe to 1532 * leave on for longer sessions of need. 
1533 */ 1534 if (vp->v_type == VDIR) { 1535 KASSERT(uio->uio_rw == UIO_READ, 1536 ("illegal write attempted on a directory")); 1537 if (!vfs_allow_read_dir) 1538 return (EISDIR); 1539 if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0) 1540 return (EISDIR); 1541 } 1542 1543 do_io_fault = do_vn_io_fault(vp, uio); 1544 do_rangelock = do_io_fault || (vn_irflag_read(vp) & VIRF_PGREAD) != 0; 1545 foffset_lock_uio(fp, uio, flags); 1546 if (do_rangelock) { 1547 if (uio->uio_rw == UIO_READ) { 1548 rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, 1549 uio->uio_offset + uio->uio_resid); 1550 } else if ((fp->f_flag & O_APPEND) != 0 || 1551 (flags & FOF_OFFSET) == 0) { 1552 /* For appenders, punt and lock the whole range. */ 1553 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); 1554 } else { 1555 rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, 1556 uio->uio_offset + uio->uio_resid); 1557 } 1558 } 1559 if (do_io_fault) { 1560 args.kind = VN_IO_FAULT_FOP; 1561 args.args.fop_args.fp = fp; 1562 args.args.fop_args.doio = doio; 1563 args.cred = active_cred; 1564 args.flags = flags | FOF_OFFSET; 1565 error = vn_io_fault1(vp, uio, &args, td); 1566 } else { 1567 error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td); 1568 } 1569 if (do_rangelock) 1570 vn_rangelock_unlock(vp, rl_cookie); 1571 foffset_unlock_uio(fp, uio, flags); 1572 return (error); 1573 } 1574 1575 /* 1576 * Helper function to perform the requested uiomove operation using 1577 * the held pages for io->uio_iov[0].iov_base buffer instead of 1578 * copyin/copyout. Access to the pages with uiomove_fromphys() 1579 * instead of iov_base prevents page faults that could occur due to 1580 * pmap_collect() invalidating the mapping created by 1581 * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or 1582 * object cleanup revoking the write access from page mappings. 1583 * 1584 * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() 1585 * instead of plain uiomove(). 1586 */ 1587 int 1588 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) 1589 { 1590 struct uio transp_uio; 1591 struct iovec transp_iov[1]; 1592 struct thread *td; 1593 size_t adv; 1594 int error, pgadv; 1595 1596 td = curthread; 1597 if ((td->td_pflags & TDP_UIOHELD) == 0 || 1598 uio->uio_segflg != UIO_USERSPACE) 1599 return (uiomove(data, xfersize, uio)); 1600 1601 KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); 1602 transp_iov[0].iov_base = data; 1603 transp_uio.uio_iov = &transp_iov[0]; 1604 transp_uio.uio_iovcnt = 1; 1605 if (xfersize > uio->uio_resid) 1606 xfersize = uio->uio_resid; 1607 transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; 1608 transp_uio.uio_offset = 0; 1609 transp_uio.uio_segflg = UIO_SYSSPACE; 1610 /* 1611 * Since transp_iov points to data, and td_ma page array 1612 * corresponds to original uio->uio_iov, we need to invert the 1613 * direction of the i/o operation as passed to 1614 * uiomove_fromphys(). 
1615 */ 1616 switch (uio->uio_rw) { 1617 case UIO_WRITE: 1618 transp_uio.uio_rw = UIO_READ; 1619 break; 1620 case UIO_READ: 1621 transp_uio.uio_rw = UIO_WRITE; 1622 break; 1623 } 1624 transp_uio.uio_td = uio->uio_td; 1625 error = uiomove_fromphys(td->td_ma, 1626 ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, 1627 xfersize, &transp_uio); 1628 adv = xfersize - transp_uio.uio_resid; 1629 pgadv = 1630 (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - 1631 (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); 1632 td->td_ma += pgadv; 1633 KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, 1634 pgadv)); 1635 td->td_ma_cnt -= pgadv; 1636 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; 1637 uio->uio_iov->iov_len -= adv; 1638 uio->uio_resid -= adv; 1639 uio->uio_offset += adv; 1640 return (error); 1641 } 1642 1643 int 1644 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, 1645 struct uio *uio) 1646 { 1647 struct thread *td; 1648 vm_offset_t iov_base; 1649 int cnt, pgadv; 1650 1651 td = curthread; 1652 if ((td->td_pflags & TDP_UIOHELD) == 0 || 1653 uio->uio_segflg != UIO_USERSPACE) 1654 return (uiomove_fromphys(ma, offset, xfersize, uio)); 1655 1656 KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); 1657 cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize; 1658 iov_base = (vm_offset_t)uio->uio_iov->iov_base; 1659 switch (uio->uio_rw) { 1660 case UIO_WRITE: 1661 pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, 1662 offset, cnt); 1663 break; 1664 case UIO_READ: 1665 pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, 1666 cnt); 1667 break; 1668 } 1669 pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); 1670 td->td_ma += pgadv; 1671 KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, 1672 pgadv)); 1673 td->td_ma_cnt -= pgadv; 1674 uio->uio_iov->iov_base = (char *)(iov_base + cnt); 1675 uio->uio_iov->iov_len -= cnt; 1676 uio->uio_resid -= cnt; 1677 uio->uio_offset += cnt; 1678 return (0); 1679 } 1680 1681 /* 1682 * File table truncate routine. 1683 */ 1684 static int 1685 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, 1686 struct thread *td) 1687 { 1688 struct mount *mp; 1689 struct vnode *vp; 1690 void *rl_cookie; 1691 int error; 1692 1693 vp = fp->f_vnode; 1694 1695 retry: 1696 /* 1697 * Lock the whole range for truncation. Otherwise split i/o 1698 * might happen partly before and partly after the truncation. 1699 */ 1700 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); 1701 error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); 1702 if (error) 1703 goto out1; 1704 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1705 AUDIT_ARG_VNODE1(vp); 1706 if (vp->v_type == VDIR) { 1707 error = EISDIR; 1708 goto out; 1709 } 1710 #ifdef MAC 1711 error = mac_vnode_check_write(active_cred, fp->f_cred, vp); 1712 if (error) 1713 goto out; 1714 #endif 1715 error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0, 1716 fp->f_cred); 1717 out: 1718 VOP_UNLOCK(vp); 1719 vn_finished_write(mp); 1720 out1: 1721 vn_rangelock_unlock(vp, rl_cookie); 1722 if (error == ERELOOKUP) 1723 goto retry; 1724 return (error); 1725 } 1726 1727 /* 1728 * Truncate a file that is already locked. 
1729 */ 1730 int 1731 vn_truncate_locked(struct vnode *vp, off_t length, bool sync, 1732 struct ucred *cred) 1733 { 1734 struct vattr vattr; 1735 int error; 1736 1737 error = VOP_ADD_WRITECOUNT(vp, 1); 1738 if (error == 0) { 1739 VATTR_NULL(&vattr); 1740 vattr.va_size = length; 1741 if (sync) 1742 vattr.va_vaflags |= VA_SYNC; 1743 error = VOP_SETATTR(vp, &vattr, cred); 1744 VOP_ADD_WRITECOUNT_CHECKED(vp, -1); 1745 } 1746 return (error); 1747 } 1748 1749 /* 1750 * File table vnode stat routine. 1751 */ 1752 int 1753 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred) 1754 { 1755 struct vnode *vp = fp->f_vnode; 1756 int error; 1757 1758 vn_lock(vp, LK_SHARED | LK_RETRY); 1759 error = VOP_STAT(vp, sb, active_cred, fp->f_cred); 1760 VOP_UNLOCK(vp); 1761 1762 return (error); 1763 } 1764 1765 /* 1766 * File table vnode ioctl routine. 1767 */ 1768 static int 1769 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, 1770 struct thread *td) 1771 { 1772 struct vnode *vp; 1773 struct fiobmap2_arg *bmarg; 1774 off_t size; 1775 int error; 1776 1777 vp = fp->f_vnode; 1778 switch (vp->v_type) { 1779 case VDIR: 1780 case VREG: 1781 switch (com) { 1782 case FIONREAD: 1783 error = vn_getsize(vp, &size, active_cred); 1784 if (error == 0) 1785 *(int *)data = size - fp->f_offset; 1786 return (error); 1787 case FIOBMAP2: 1788 bmarg = (struct fiobmap2_arg *)data; 1789 vn_lock(vp, LK_SHARED | LK_RETRY); 1790 #ifdef MAC 1791 error = mac_vnode_check_read(active_cred, fp->f_cred, 1792 vp); 1793 if (error == 0) 1794 #endif 1795 error = VOP_BMAP(vp, bmarg->bn, NULL, 1796 &bmarg->bn, &bmarg->runp, &bmarg->runb); 1797 VOP_UNLOCK(vp); 1798 return (error); 1799 case FIONBIO: 1800 case FIOASYNC: 1801 return (0); 1802 default: 1803 return (VOP_IOCTL(vp, com, data, fp->f_flag, 1804 active_cred, td)); 1805 } 1806 break; 1807 case VCHR: 1808 return (VOP_IOCTL(vp, com, data, fp->f_flag, 1809 active_cred, td)); 1810 default: 1811 return (ENOTTY); 1812 } 1813 } 1814 1815 /* 1816 * File table vnode poll routine. 1817 */ 1818 static int 1819 vn_poll(struct file *fp, int events, struct ucred *active_cred, 1820 struct thread *td) 1821 { 1822 struct vnode *vp; 1823 int error; 1824 1825 vp = fp->f_vnode; 1826 #if defined(MAC) || defined(AUDIT) 1827 if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) { 1828 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1829 AUDIT_ARG_VNODE1(vp); 1830 error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); 1831 VOP_UNLOCK(vp); 1832 if (error != 0) 1833 return (error); 1834 } 1835 #endif 1836 error = VOP_POLL(vp, events, fp->f_cred, td); 1837 return (error); 1838 } 1839 1840 /* 1841 * Acquire the requested lock and then check for validity. LK_RETRY 1842 * permits vn_lock to return doomed vnodes. 1843 */ 1844 static int __noinline 1845 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line, 1846 int error) 1847 { 1848 1849 KASSERT((flags & LK_RETRY) == 0 || error == 0, 1850 ("vn_lock: error %d incompatible with flags %#x", error, flags)); 1851 1852 if (error == 0) 1853 VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed")); 1854 1855 if ((flags & LK_RETRY) == 0) { 1856 if (error == 0) { 1857 VOP_UNLOCK(vp); 1858 error = ENOENT; 1859 } 1860 return (error); 1861 } 1862 1863 /* 1864 * LK_RETRY case. 1865 * 1866 * Nothing to do if we got the lock. 1867 */ 1868 if (error == 0) 1869 return (0); 1870 1871 /* 1872 * Interlock was dropped by the call in _vn_lock. 
1873 */ 1874 flags &= ~LK_INTERLOCK; 1875 do { 1876 error = VOP_LOCK1(vp, flags, file, line); 1877 } while (error != 0); 1878 return (0); 1879 } 1880 1881 int 1882 _vn_lock(struct vnode *vp, int flags, const char *file, int line) 1883 { 1884 int error; 1885 1886 VNASSERT((flags & LK_TYPE_MASK) != 0, vp, 1887 ("vn_lock: no locktype (%d passed)", flags)); 1888 VNPASS(vp->v_holdcnt > 0, vp); 1889 error = VOP_LOCK1(vp, flags, file, line); 1890 if (__predict_false(error != 0 || VN_IS_DOOMED(vp))) 1891 return (_vn_lock_fallback(vp, flags, file, line, error)); 1892 return (0); 1893 } 1894 1895 /* 1896 * File table vnode close routine. 1897 */ 1898 static int 1899 vn_closefile(struct file *fp, struct thread *td) 1900 { 1901 struct vnode *vp; 1902 struct flock lf; 1903 int error; 1904 bool ref; 1905 1906 vp = fp->f_vnode; 1907 fp->f_ops = &badfileops; 1908 ref = (fp->f_flag & FHASLOCK) != 0; 1909 1910 error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); 1911 1912 if (__predict_false(ref)) { 1913 lf.l_whence = SEEK_SET; 1914 lf.l_start = 0; 1915 lf.l_len = 0; 1916 lf.l_type = F_UNLCK; 1917 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); 1918 vrele(vp); 1919 } 1920 return (error); 1921 } 1922 1923 /* 1924 * Preparing to start a filesystem write operation. If the operation is 1925 * permitted, then we bump the count of operations in progress and 1926 * proceed. If a suspend request is in progress, we wait until the 1927 * suspension is over, and then proceed. 1928 */ 1929 static int 1930 vn_start_write_refed(struct mount *mp, int flags, bool mplocked) 1931 { 1932 struct mount_pcpu *mpcpu; 1933 int error, mflags; 1934 1935 if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 && 1936 vfs_op_thread_enter(mp, mpcpu)) { 1937 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); 1938 vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1); 1939 vfs_op_thread_exit(mp, mpcpu); 1940 return (0); 1941 } 1942 1943 if (mplocked) 1944 mtx_assert(MNT_MTX(mp), MA_OWNED); 1945 else 1946 MNT_ILOCK(mp); 1947 1948 error = 0; 1949 1950 /* 1951 * Check on status of suspension. 1952 */ 1953 if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || 1954 mp->mnt_susp_owner != curthread) { 1955 mflags = 0; 1956 if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) { 1957 if (flags & V_PCATCH) 1958 mflags |= PCATCH; 1959 } 1960 mflags |= (PUSER - 1); 1961 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { 1962 if ((flags & V_NOWAIT) != 0) { 1963 error = EWOULDBLOCK; 1964 goto unlock; 1965 } 1966 error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, 1967 "suspfs", 0); 1968 if (error != 0) 1969 goto unlock; 1970 } 1971 } 1972 if ((flags & V_XSLEEP) != 0) 1973 goto unlock; 1974 mp->mnt_writeopcount++; 1975 unlock: 1976 if (error != 0 || (flags & V_XSLEEP) != 0) 1977 MNT_REL(mp); 1978 MNT_IUNLOCK(mp); 1979 return (error); 1980 } 1981 1982 int 1983 vn_start_write(struct vnode *vp, struct mount **mpp, int flags) 1984 { 1985 struct mount *mp; 1986 int error; 1987 1988 KASSERT((flags & ~V_VALID_FLAGS) == 0, 1989 ("%s: invalid flags passed %d\n", __func__, flags)); 1990 1991 error = 0; 1992 /* 1993 * If a vnode is provided, get and return the mount point that 1994 * to which it will write. 1995 */ 1996 if (vp != NULL) { 1997 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { 1998 *mpp = NULL; 1999 if (error != EOPNOTSUPP) 2000 return (error); 2001 return (0); 2002 } 2003 } 2004 if ((mp = *mpp) == NULL) 2005 return (0); 2006 2007 /* 2008 * VOP_GETWRITEMOUNT() returns with the mp refcount held through 2009 * a vfs_ref(). 
2010 * As long as a vnode is not provided we need to acquire a 2011 * refcount for the provided mountpoint too, in order to 2012 * emulate a vfs_ref(). 2013 */ 2014 if (vp == NULL) 2015 vfs_ref(mp); 2016 2017 error = vn_start_write_refed(mp, flags, false); 2018 if (error != 0 && (flags & V_NOWAIT) == 0) 2019 *mpp = NULL; 2020 return (error); 2021 } 2022 2023 /* 2024 * Secondary suspension. Used by operations such as vop_inactive 2025 * routines that are needed by the higher level functions. These 2026 * are allowed to proceed until all the higher level functions have 2027 * completed (indicated by mnt_writeopcount dropping to zero). At that 2028 * time, these operations are halted until the suspension is over. 2029 */ 2030 int 2031 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) 2032 { 2033 struct mount *mp; 2034 int error, mflags; 2035 2036 KASSERT((flags & (~V_VALID_FLAGS | V_XSLEEP)) == 0, 2037 ("%s: invalid flags passed %d\n", __func__, flags)); 2038 2039 retry: 2040 if (vp != NULL) { 2041 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { 2042 *mpp = NULL; 2043 if (error != EOPNOTSUPP) 2044 return (error); 2045 return (0); 2046 } 2047 } 2048 /* 2049 * If we are not suspended or have not yet reached suspended 2050 * mode, then let the operation proceed. 2051 */ 2052 if ((mp = *mpp) == NULL) 2053 return (0); 2054 2055 /* 2056 * VOP_GETWRITEMOUNT() returns with the mp refcount held through 2057 * a vfs_ref(). 2058 * As long as a vnode is not provided we need to acquire a 2059 * refcount for the provided mountpoint too, in order to 2060 * emulate a vfs_ref(). 2061 */ 2062 MNT_ILOCK(mp); 2063 if (vp == NULL) 2064 MNT_REF(mp); 2065 if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { 2066 mp->mnt_secondary_writes++; 2067 mp->mnt_secondary_accwrites++; 2068 MNT_IUNLOCK(mp); 2069 return (0); 2070 } 2071 if ((flags & V_NOWAIT) != 0) { 2072 MNT_REL(mp); 2073 MNT_IUNLOCK(mp); 2074 *mpp = NULL; 2075 return (EWOULDBLOCK); 2076 } 2077 /* 2078 * Wait for the suspension to finish. 2079 */ 2080 mflags = 0; 2081 if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) { 2082 if ((flags & V_PCATCH) != 0) 2083 mflags |= PCATCH; 2084 } 2085 mflags |= (PUSER - 1) | PDROP; 2086 error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); 2087 vfs_rel(mp); 2088 if (error == 0) 2089 goto retry; 2090 *mpp = NULL; 2091 return (error); 2092 } 2093 2094 /* 2095 * Filesystem write operation has completed. If we are suspending and this 2096 * operation is the last one, notify the suspender that the suspension is 2097 * now in effect. 2098 */ 2099 void 2100 vn_finished_write(struct mount *mp) 2101 { 2102 struct mount_pcpu *mpcpu; 2103 int c; 2104 2105 if (mp == NULL) 2106 return; 2107 2108 if (vfs_op_thread_enter(mp, mpcpu)) { 2109 vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1); 2110 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 2111 vfs_op_thread_exit(mp, mpcpu); 2112 return; 2113 } 2114 2115 MNT_ILOCK(mp); 2116 vfs_assert_mount_counters(mp); 2117 MNT_REL(mp); 2118 c = --mp->mnt_writeopcount; 2119 if (mp->mnt_vfs_ops == 0) { 2120 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); 2121 MNT_IUNLOCK(mp); 2122 return; 2123 } 2124 if (c < 0) 2125 vfs_dump_mount_counters(mp); 2126 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0) 2127 wakeup(&mp->mnt_writeopcount); 2128 MNT_IUNLOCK(mp); 2129 } 2130 2131 /* 2132 * Filesystem secondary write operation has completed. If we are 2133 * suspending and this operation is the last one, notify the suspender 2134 * that the suspension is now in effect. 
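 *
 * Illustrative aside, a sketch rather than code from this file: the
 * usual bracket that these start/finished routines pair into looks
 * like
 *
 *	bwillwrite();
 *	error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 *	if (error != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	... modify the file ...
 *	VOP_UNLOCK(vp);
 *	vn_finished_write(mp);
 *
 * Secondary writers use vn_start_secondary_write() and
 * vn_finished_secondary_write() the same way.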
2135 */ 2136 void 2137 vn_finished_secondary_write(struct mount *mp) 2138 { 2139 if (mp == NULL) 2140 return; 2141 MNT_ILOCK(mp); 2142 MNT_REL(mp); 2143 mp->mnt_secondary_writes--; 2144 if (mp->mnt_secondary_writes < 0) 2145 panic("vn_finished_secondary_write: neg cnt"); 2146 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && 2147 mp->mnt_secondary_writes <= 0) 2148 wakeup(&mp->mnt_secondary_writes); 2149 MNT_IUNLOCK(mp); 2150 } 2151 2152 /* 2153 * Request a filesystem to suspend write operations. 2154 */ 2155 int 2156 vfs_write_suspend(struct mount *mp, int flags) 2157 { 2158 int error; 2159 2160 vfs_op_enter(mp); 2161 2162 MNT_ILOCK(mp); 2163 vfs_assert_mount_counters(mp); 2164 if (mp->mnt_susp_owner == curthread) { 2165 vfs_op_exit_locked(mp); 2166 MNT_IUNLOCK(mp); 2167 return (EALREADY); 2168 } 2169 while (mp->mnt_kern_flag & MNTK_SUSPEND) 2170 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); 2171 2172 /* 2173 * Unmount holds a write reference on the mount point. If we 2174 * own busy reference and drain for writers, we deadlock with 2175 * the reference draining in the unmount path. Callers of 2176 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if 2177 * vfs_busy() reference is owned and caller is not in the 2178 * unmount context. 2179 */ 2180 if ((flags & VS_SKIP_UNMOUNT) != 0 && 2181 (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { 2182 vfs_op_exit_locked(mp); 2183 MNT_IUNLOCK(mp); 2184 return (EBUSY); 2185 } 2186 2187 mp->mnt_kern_flag |= MNTK_SUSPEND; 2188 mp->mnt_susp_owner = curthread; 2189 if (mp->mnt_writeopcount > 0) 2190 (void) msleep(&mp->mnt_writeopcount, 2191 MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); 2192 else 2193 MNT_IUNLOCK(mp); 2194 if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) { 2195 vfs_write_resume(mp, 0); 2196 /* vfs_write_resume does vfs_op_exit() for us */ 2197 } 2198 return (error); 2199 } 2200 2201 /* 2202 * Request a filesystem to resume write operations. 2203 */ 2204 void 2205 vfs_write_resume(struct mount *mp, int flags) 2206 { 2207 2208 MNT_ILOCK(mp); 2209 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { 2210 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); 2211 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | 2212 MNTK_SUSPENDED); 2213 mp->mnt_susp_owner = NULL; 2214 wakeup(&mp->mnt_writeopcount); 2215 wakeup(&mp->mnt_flag); 2216 curthread->td_pflags &= ~TDP_IGNSUSP; 2217 if ((flags & VR_START_WRITE) != 0) { 2218 MNT_REF(mp); 2219 mp->mnt_writeopcount++; 2220 } 2221 MNT_IUNLOCK(mp); 2222 if ((flags & VR_NO_SUSPCLR) == 0) 2223 VFS_SUSP_CLEAN(mp); 2224 vfs_op_exit(mp); 2225 } else if ((flags & VR_START_WRITE) != 0) { 2226 MNT_REF(mp); 2227 vn_start_write_refed(mp, 0, true); 2228 } else { 2229 MNT_IUNLOCK(mp); 2230 } 2231 } 2232 2233 /* 2234 * Helper loop around vfs_write_suspend() for filesystem unmount VFS 2235 * methods. 2236 */ 2237 int 2238 vfs_write_suspend_umnt(struct mount *mp) 2239 { 2240 int error; 2241 2242 KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, 2243 ("vfs_write_suspend_umnt: recursed")); 2244 2245 /* dounmount() already called vn_start_write(). 
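 *
 * Illustrative aside, a sketch rather than code from this file: a
 * filesystem unmount method typically brackets its final flush with
 * something like
 *
 *	error = vfs_write_suspend_umnt(mp);
 *	if (error != 0)
 *		return (error);
 *	... sync and invalidate filesystem state ...
 *	vfs_write_resume(mp, VR_START_WRITE);
 *
 * so that the write count dropped here is re-acquired before
 * dounmount() calls vn_finished_write().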
*/ 2246 for (;;) { 2247 vn_finished_write(mp); 2248 error = vfs_write_suspend(mp, 0); 2249 if (error != 0) { 2250 vn_start_write(NULL, &mp, V_WAIT); 2251 return (error); 2252 } 2253 MNT_ILOCK(mp); 2254 if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) 2255 break; 2256 MNT_IUNLOCK(mp); 2257 vn_start_write(NULL, &mp, V_WAIT); 2258 } 2259 mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); 2260 wakeup(&mp->mnt_flag); 2261 MNT_IUNLOCK(mp); 2262 curthread->td_pflags |= TDP_IGNSUSP; 2263 return (0); 2264 } 2265 2266 /* 2267 * Implement kqueues for files by translating it to vnode operation. 2268 */ 2269 static int 2270 vn_kqfilter(struct file *fp, struct knote *kn) 2271 { 2272 2273 return (VOP_KQFILTER(fp->f_vnode, kn)); 2274 } 2275 2276 int 2277 vn_kqfilter_opath(struct file *fp, struct knote *kn) 2278 { 2279 if ((fp->f_flag & FKQALLOWED) == 0) 2280 return (EBADF); 2281 return (vn_kqfilter(fp, kn)); 2282 } 2283 2284 /* 2285 * Simplified in-kernel wrapper calls for extended attribute access. 2286 * Both calls pass in a NULL credential, authorizing as "kernel" access. 2287 * Set IO_NODELOCKED in ioflg if the vnode is already locked. 2288 */ 2289 int 2290 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, 2291 const char *attrname, int *buflen, char *buf, struct thread *td) 2292 { 2293 struct uio auio; 2294 struct iovec iov; 2295 int error; 2296 2297 iov.iov_len = *buflen; 2298 iov.iov_base = buf; 2299 2300 auio.uio_iov = &iov; 2301 auio.uio_iovcnt = 1; 2302 auio.uio_rw = UIO_READ; 2303 auio.uio_segflg = UIO_SYSSPACE; 2304 auio.uio_td = td; 2305 auio.uio_offset = 0; 2306 auio.uio_resid = *buflen; 2307 2308 if ((ioflg & IO_NODELOCKED) == 0) 2309 vn_lock(vp, LK_SHARED | LK_RETRY); 2310 2311 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); 2312 2313 /* authorize attribute retrieval as kernel */ 2314 error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, 2315 td); 2316 2317 if ((ioflg & IO_NODELOCKED) == 0) 2318 VOP_UNLOCK(vp); 2319 2320 if (error == 0) { 2321 *buflen = *buflen - auio.uio_resid; 2322 } 2323 2324 return (error); 2325 } 2326 2327 /* 2328 * XXX failure mode if partially written? 
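 *
 * Illustrative aside, a sketch rather than code from this file (the
 * attribute name and buffer size are made up): an in-kernel consumer
 * of these wrappers typically looks like
 *
 *	char buf[128];
 *	int buflen = sizeof(buf);
 *
 *	error = vn_extattr_get(vp, IO_NODELOCKED,
 *	    EXTATTR_NAMESPACE_SYSTEM, "some.attribute", &buflen, buf,
 *	    td);
 *
 * with the vnode already locked, or with ioflg 0 to let the wrapper
 * take and drop the lock itself.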
2329 */ 2330 int 2331 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, 2332 const char *attrname, int buflen, char *buf, struct thread *td) 2333 { 2334 struct uio auio; 2335 struct iovec iov; 2336 struct mount *mp; 2337 int error; 2338 2339 iov.iov_len = buflen; 2340 iov.iov_base = buf; 2341 2342 auio.uio_iov = &iov; 2343 auio.uio_iovcnt = 1; 2344 auio.uio_rw = UIO_WRITE; 2345 auio.uio_segflg = UIO_SYSSPACE; 2346 auio.uio_td = td; 2347 auio.uio_offset = 0; 2348 auio.uio_resid = buflen; 2349 2350 if ((ioflg & IO_NODELOCKED) == 0) { 2351 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) 2352 return (error); 2353 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2354 } 2355 2356 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); 2357 2358 /* authorize attribute setting as kernel */ 2359 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); 2360 2361 if ((ioflg & IO_NODELOCKED) == 0) { 2362 vn_finished_write(mp); 2363 VOP_UNLOCK(vp); 2364 } 2365 2366 return (error); 2367 } 2368 2369 int 2370 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, 2371 const char *attrname, struct thread *td) 2372 { 2373 struct mount *mp; 2374 int error; 2375 2376 if ((ioflg & IO_NODELOCKED) == 0) { 2377 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) 2378 return (error); 2379 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2380 } 2381 2382 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); 2383 2384 /* authorize attribute removal as kernel */ 2385 error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); 2386 if (error == EOPNOTSUPP) 2387 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, 2388 NULL, td); 2389 2390 if ((ioflg & IO_NODELOCKED) == 0) { 2391 vn_finished_write(mp); 2392 VOP_UNLOCK(vp); 2393 } 2394 2395 return (error); 2396 } 2397 2398 static int 2399 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, 2400 struct vnode **rvp) 2401 { 2402 2403 return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); 2404 } 2405 2406 int 2407 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) 2408 { 2409 2410 return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, 2411 lkflags, rvp)); 2412 } 2413 2414 int 2415 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, 2416 int lkflags, struct vnode **rvp) 2417 { 2418 struct mount *mp; 2419 int ltype, error; 2420 2421 ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get"); 2422 mp = vp->v_mount; 2423 ltype = VOP_ISLOCKED(vp); 2424 KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, 2425 ("vn_vget_ino: vp not locked")); 2426 error = vfs_busy(mp, MBF_NOWAIT); 2427 if (error != 0) { 2428 vfs_ref(mp); 2429 VOP_UNLOCK(vp); 2430 error = vfs_busy(mp, 0); 2431 vn_lock(vp, ltype | LK_RETRY); 2432 vfs_rel(mp); 2433 if (error != 0) 2434 return (ENOENT); 2435 if (VN_IS_DOOMED(vp)) { 2436 vfs_unbusy(mp); 2437 return (ENOENT); 2438 } 2439 } 2440 VOP_UNLOCK(vp); 2441 error = alloc(mp, alloc_arg, lkflags, rvp); 2442 vfs_unbusy(mp); 2443 if (error != 0 || *rvp != vp) 2444 vn_lock(vp, ltype | LK_RETRY); 2445 if (VN_IS_DOOMED(vp)) { 2446 if (error == 0) { 2447 if (*rvp == vp) 2448 vunref(vp); 2449 else 2450 vput(*rvp); 2451 } 2452 error = ENOENT; 2453 } 2454 return (error); 2455 } 2456 2457 static void 2458 vn_send_sigxfsz(struct proc *p) 2459 { 2460 PROC_LOCK(p); 2461 kern_psignal(p, SIGXFSZ); 2462 PROC_UNLOCK(p); 2463 } 2464 2465 int 2466 vn_rlimit_trunc(u_quad_t size, struct thread *td) 2467 { 2468 if (size <= lim_cur(td, RLIMIT_FSIZE)) 2469 return (0); 2470 vn_send_sigxfsz(td->td_proc); 2471 return 
(EFBIG);
}

static int
vn_rlimit_fsizex1(const struct vnode *vp, struct uio *uio, off_t maxfsz,
    bool adj, struct thread *td)
{
	off_t lim;
	bool ktr_write;

	if (vp->v_type != VREG)
		return (0);

	/*
	 * Handle the maximum file size supported by the file system.
	 */
	if (maxfsz != 0 && uio->uio_offset + uio->uio_resid > maxfsz) {
		if (!adj || uio->uio_offset >= maxfsz)
			return (EFBIG);
		uio->uio_resid = maxfsz - uio->uio_offset;
	}

	/*
	 * This is a kernel write (e.g. vnode_pager) or an accounting
	 * write; ignore the limit.
	 */
	if (td == NULL || (td->td_pflags2 & TDP2_ACCT) != 0)
		return (0);

	/*
	 * Calculate the file size limit.
	 */
	ktr_write = (td->td_pflags & TDP_INKTRACE) != 0;
	lim = __predict_false(ktr_write) ? td->td_ktr_io_lim :
	    lim_cur(td, RLIMIT_FSIZE);

	/*
	 * Is the limit reached?
	 */
	if (__predict_true((uoff_t)uio->uio_offset + uio->uio_resid <= lim))
		return (0);

	/*
	 * Callers that are prepared for it (adj == true) can handle
	 * writes truncated to the file size limit.
	 */
	if (adj && (uoff_t)uio->uio_offset < lim) {
		uio->uio_resid = lim - (uoff_t)uio->uio_offset;
		return (0);
	}

	if (!ktr_write || ktr_filesize_limit_signal)
		vn_send_sigxfsz(td->td_proc);
	return (EFBIG);
}

/*
 * Helper for VOP_WRITE() implementations: the common code to handle
 * the maximum file size supported by the filesystem and RLIMIT_FSIZE,
 * except for special writes from the accounting subsystem and ktrace.
 *
 * For the maximum file size (maxfsz argument):
 * - return EFBIG if uio_offset is beyond it
 * - otherwise, clamp uio_resid if the write would extend the file
 *   beyond maxfsz.
 *
 * For RLIMIT_FSIZE:
 * - return EFBIG and send SIGXFSZ if uio_offset is beyond the limit
 * - otherwise, clamp uio_resid if the write would extend the file
 *   beyond the limit.
 *
 * If clamping occurred, the adjustment for uio_resid is stored in
 * *resid_adj, to be re-applied by vn_rlimit_fsizex_res() on return
 * from the VOP.
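 *
 * Illustrative aside, a sketch rather than code from this file: a
 * VOP_WRITE() implementation is expected to use the pair roughly as
 *
 *	ssize_t resid_adj;
 *
 *	error = vn_rlimit_fsizex(vp, uio, max_file_size, &resid_adj,
 *	    uio->uio_td);
 *	if (error == 0)
 *		error = ... perform the write ...;
 *	vn_rlimit_fsizex_res(uio, resid_adj);
 *	return (error);
 *
 * where max_file_size stands for the filesystem's own limit (pass 0
 * if there is none).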
2544 */ 2545 int 2546 vn_rlimit_fsizex(const struct vnode *vp, struct uio *uio, off_t maxfsz, 2547 ssize_t *resid_adj, struct thread *td) 2548 { 2549 ssize_t resid_orig; 2550 int error; 2551 bool adj; 2552 2553 resid_orig = uio->uio_resid; 2554 adj = resid_adj != NULL; 2555 error = vn_rlimit_fsizex1(vp, uio, maxfsz, adj, td); 2556 if (adj) 2557 *resid_adj = resid_orig - uio->uio_resid; 2558 return (error); 2559 } 2560 2561 void 2562 vn_rlimit_fsizex_res(struct uio *uio, ssize_t resid_adj) 2563 { 2564 uio->uio_resid += resid_adj; 2565 } 2566 2567 int 2568 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, 2569 struct thread *td) 2570 { 2571 return (vn_rlimit_fsizex(vp, __DECONST(struct uio *, uio), 0, NULL, 2572 td)); 2573 } 2574 2575 int 2576 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 2577 struct thread *td) 2578 { 2579 struct vnode *vp; 2580 2581 vp = fp->f_vnode; 2582 #ifdef AUDIT 2583 vn_lock(vp, LK_SHARED | LK_RETRY); 2584 AUDIT_ARG_VNODE1(vp); 2585 VOP_UNLOCK(vp); 2586 #endif 2587 return (setfmode(td, active_cred, vp, mode)); 2588 } 2589 2590 int 2591 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 2592 struct thread *td) 2593 { 2594 struct vnode *vp; 2595 2596 vp = fp->f_vnode; 2597 #ifdef AUDIT 2598 vn_lock(vp, LK_SHARED | LK_RETRY); 2599 AUDIT_ARG_VNODE1(vp); 2600 VOP_UNLOCK(vp); 2601 #endif 2602 return (setfown(td, active_cred, vp, uid, gid)); 2603 } 2604 2605 /* 2606 * Remove pages in the range ["start", "end") from the vnode's VM object. If 2607 * "end" is 0, then the range extends to the end of the object. 2608 */ 2609 void 2610 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) 2611 { 2612 vm_object_t object; 2613 2614 if ((object = vp->v_object) == NULL) 2615 return; 2616 VM_OBJECT_WLOCK(object); 2617 vm_object_page_remove(object, start, end, 0); 2618 VM_OBJECT_WUNLOCK(object); 2619 } 2620 2621 /* 2622 * Like vn_pages_remove(), but skips invalid pages, which by definition are not 2623 * mapped into any process' address space. Filesystems may use this in 2624 * preference to vn_pages_remove() to avoid blocking on pages busied in 2625 * preparation for a VOP_GETPAGES. 2626 */ 2627 void 2628 vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) 2629 { 2630 vm_object_t object; 2631 2632 if ((object = vp->v_object) == NULL) 2633 return; 2634 VM_OBJECT_WLOCK(object); 2635 vm_object_page_remove(object, start, end, OBJPR_VALIDONLY); 2636 VM_OBJECT_WUNLOCK(object); 2637 } 2638 2639 int 2640 vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off, 2641 struct ucred *cred) 2642 { 2643 off_t size; 2644 daddr_t bn, bnp; 2645 uint64_t bsize; 2646 off_t noff; 2647 int error; 2648 2649 KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, 2650 ("%s: Wrong command %lu", __func__, cmd)); 2651 ASSERT_VOP_ELOCKED(vp, "vn_bmap_seekhole_locked"); 2652 2653 if (vp->v_type != VREG) { 2654 error = ENOTTY; 2655 goto out; 2656 } 2657 error = vn_getsize_locked(vp, &size, cred); 2658 if (error != 0) 2659 goto out; 2660 noff = *off; 2661 if (noff < 0 || noff >= size) { 2662 error = ENXIO; 2663 goto out; 2664 } 2665 2666 /* See the comment in ufs_bmap_seekdata(). 
*/ 2667 vnode_pager_clean_sync(vp); 2668 2669 bsize = vp->v_mount->mnt_stat.f_iosize; 2670 for (bn = noff / bsize; noff < size; bn++, noff += bsize - 2671 noff % bsize) { 2672 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); 2673 if (error == EOPNOTSUPP) { 2674 error = ENOTTY; 2675 goto out; 2676 } 2677 if ((bnp == -1 && cmd == FIOSEEKHOLE) || 2678 (bnp != -1 && cmd == FIOSEEKDATA)) { 2679 noff = bn * bsize; 2680 if (noff < *off) 2681 noff = *off; 2682 goto out; 2683 } 2684 } 2685 if (noff > size) 2686 noff = size; 2687 /* noff == size. There is an implicit hole at the end of file. */ 2688 if (cmd == FIOSEEKDATA) 2689 error = ENXIO; 2690 out: 2691 if (error == 0) 2692 *off = noff; 2693 return (error); 2694 } 2695 2696 int 2697 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) 2698 { 2699 int error; 2700 2701 KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, 2702 ("%s: Wrong command %lu", __func__, cmd)); 2703 2704 if (vn_lock(vp, LK_EXCLUSIVE) != 0) 2705 return (EBADF); 2706 error = vn_bmap_seekhole_locked(vp, cmd, off, cred); 2707 VOP_UNLOCK(vp); 2708 return (error); 2709 } 2710 2711 int 2712 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) 2713 { 2714 struct ucred *cred; 2715 struct vnode *vp; 2716 off_t foffset, fsize, size; 2717 int error, noneg; 2718 2719 cred = td->td_ucred; 2720 vp = fp->f_vnode; 2721 noneg = (vp->v_type != VCHR); 2722 /* 2723 * Try to dodge locking for common case of querying the offset. 2724 */ 2725 if (whence == L_INCR && offset == 0) { 2726 foffset = foffset_read(fp); 2727 if (__predict_false(foffset < 0 && noneg)) { 2728 return (EOVERFLOW); 2729 } 2730 td->td_uretoff.tdu_off = foffset; 2731 return (0); 2732 } 2733 foffset = foffset_lock(fp, 0); 2734 error = 0; 2735 switch (whence) { 2736 case L_INCR: 2737 if (noneg && 2738 (foffset < 0 || 2739 (offset > 0 && foffset > OFF_MAX - offset))) { 2740 error = EOVERFLOW; 2741 break; 2742 } 2743 offset += foffset; 2744 break; 2745 case L_XTND: 2746 error = vn_getsize(vp, &fsize, cred); 2747 if (error != 0) 2748 break; 2749 2750 /* 2751 * If the file references a disk device, then fetch 2752 * the media size and use that to determine the ending 2753 * offset. 2754 */ 2755 if (fsize == 0 && vp->v_type == VCHR && 2756 fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) 2757 fsize = size; 2758 if (noneg && offset > 0 && fsize > OFF_MAX - offset) { 2759 error = EOVERFLOW; 2760 break; 2761 } 2762 offset += fsize; 2763 break; 2764 case L_SET: 2765 break; 2766 case SEEK_DATA: 2767 error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); 2768 if (error == ENOTTY) 2769 error = EINVAL; 2770 break; 2771 case SEEK_HOLE: 2772 error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); 2773 if (error == ENOTTY) 2774 error = EINVAL; 2775 break; 2776 default: 2777 error = EINVAL; 2778 } 2779 if (error == 0 && noneg && offset < 0) 2780 error = EINVAL; 2781 if (error != 0) 2782 goto drop; 2783 VFS_KNOTE_UNLOCKED(vp, 0); 2784 td->td_uretoff.tdu_off = offset; 2785 drop: 2786 foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); 2787 return (error); 2788 } 2789 2790 int 2791 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, 2792 struct thread *td) 2793 { 2794 int error; 2795 2796 /* 2797 * Grant permission if the caller is the owner of the file, or 2798 * the super-user, or has ACL_WRITE_ATTRIBUTES permission on 2799 * on the file. If the time pointer is null, then write 2800 * permission on the file is also sufficient. 
2801 * 2802 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: 2803 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES 2804 * will be allowed to set the times [..] to the current 2805 * server time. 2806 */ 2807 error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); 2808 if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) 2809 error = VOP_ACCESS(vp, VWRITE, cred, td); 2810 return (error); 2811 } 2812 2813 int 2814 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 2815 { 2816 struct vnode *vp; 2817 int error; 2818 2819 if (fp->f_type == DTYPE_FIFO) 2820 kif->kf_type = KF_TYPE_FIFO; 2821 else 2822 kif->kf_type = KF_TYPE_VNODE; 2823 vp = fp->f_vnode; 2824 vref(vp); 2825 FILEDESC_SUNLOCK(fdp); 2826 error = vn_fill_kinfo_vnode(vp, kif); 2827 vrele(vp); 2828 FILEDESC_SLOCK(fdp); 2829 return (error); 2830 } 2831 2832 static inline void 2833 vn_fill_junk(struct kinfo_file *kif) 2834 { 2835 size_t len, olen; 2836 2837 /* 2838 * Simulate vn_fullpath returning changing values for a given 2839 * vp during e.g. coredump. 2840 */ 2841 len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; 2842 olen = strlen(kif->kf_path); 2843 if (len < olen) 2844 strcpy(&kif->kf_path[len - 1], "$"); 2845 else 2846 for (; olen < len; olen++) 2847 strcpy(&kif->kf_path[olen], "A"); 2848 } 2849 2850 int 2851 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) 2852 { 2853 struct vattr va; 2854 char *fullpath, *freepath; 2855 int error; 2856 2857 kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); 2858 freepath = NULL; 2859 fullpath = "-"; 2860 error = vn_fullpath(vp, &fullpath, &freepath); 2861 if (error == 0) { 2862 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); 2863 } 2864 if (freepath != NULL) 2865 free(freepath, M_TEMP); 2866 2867 KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, 2868 vn_fill_junk(kif); 2869 ); 2870 2871 /* 2872 * Retrieve vnode attributes. 2873 */ 2874 va.va_fsid = VNOVAL; 2875 va.va_rdev = NODEV; 2876 vn_lock(vp, LK_SHARED | LK_RETRY); 2877 error = VOP_GETATTR(vp, &va, curthread->td_ucred); 2878 VOP_UNLOCK(vp); 2879 if (error != 0) 2880 return (error); 2881 if (va.va_fsid != VNOVAL) 2882 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; 2883 else 2884 kif->kf_un.kf_file.kf_file_fsid = 2885 vp->v_mount->mnt_stat.f_fsid.val[0]; 2886 kif->kf_un.kf_file.kf_file_fsid_freebsd11 = 2887 kif->kf_un.kf_file.kf_file_fsid; /* truncate */ 2888 kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; 2889 kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); 2890 kif->kf_un.kf_file.kf_file_size = va.va_size; 2891 kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; 2892 kif->kf_un.kf_file.kf_file_rdev_freebsd11 = 2893 kif->kf_un.kf_file.kf_file_rdev; /* truncate */ 2894 kif->kf_un.kf_file.kf_file_nlink = va.va_nlink; 2895 return (0); 2896 } 2897 2898 int 2899 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, 2900 vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, 2901 struct thread *td) 2902 { 2903 #ifdef HWPMC_HOOKS 2904 struct pmckern_map_in pkm; 2905 #endif 2906 struct mount *mp; 2907 struct vnode *vp; 2908 vm_object_t object; 2909 vm_prot_t maxprot; 2910 boolean_t writecounted; 2911 int error; 2912 2913 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ 2914 defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) 2915 /* 2916 * POSIX shared-memory objects are defined to have 2917 * kernel persistence, and are not defined to support 2918 * read(2)/write(2) -- or even open(2). 
Thus, we can 2919 * use MAP_ASYNC to trade on-disk coherence for speed. 2920 * The shm_open(3) library routine turns on the FPOSIXSHM 2921 * flag to request this behavior. 2922 */ 2923 if ((fp->f_flag & FPOSIXSHM) != 0) 2924 flags |= MAP_NOSYNC; 2925 #endif 2926 vp = fp->f_vnode; 2927 2928 /* 2929 * Ensure that file and memory protections are 2930 * compatible. Note that we only worry about 2931 * writability if mapping is shared; in this case, 2932 * current and max prot are dictated by the open file. 2933 * XXX use the vnode instead? Problem is: what 2934 * credentials do we use for determination? What if 2935 * proc does a setuid? 2936 */ 2937 mp = vp->v_mount; 2938 if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { 2939 maxprot = VM_PROT_NONE; 2940 if ((prot & VM_PROT_EXECUTE) != 0) 2941 return (EACCES); 2942 } else 2943 maxprot = VM_PROT_EXECUTE; 2944 if ((fp->f_flag & FREAD) != 0) 2945 maxprot |= VM_PROT_READ; 2946 else if ((prot & VM_PROT_READ) != 0) 2947 return (EACCES); 2948 2949 /* 2950 * If we are sharing potential changes via MAP_SHARED and we 2951 * are trying to get write permission although we opened it 2952 * without asking for it, bail out. 2953 */ 2954 if ((flags & MAP_SHARED) != 0) { 2955 if ((fp->f_flag & FWRITE) != 0) 2956 maxprot |= VM_PROT_WRITE; 2957 else if ((prot & VM_PROT_WRITE) != 0) 2958 return (EACCES); 2959 } else { 2960 maxprot |= VM_PROT_WRITE; 2961 cap_maxprot |= VM_PROT_WRITE; 2962 } 2963 maxprot &= cap_maxprot; 2964 2965 /* 2966 * For regular files and shared memory, POSIX requires that 2967 * the value of foff be a legitimate offset within the data 2968 * object. In particular, negative offsets are invalid. 2969 * Blocking negative offsets and overflows here avoids 2970 * possible wraparound or user-level access into reserved 2971 * ranges of the data object later. In contrast, POSIX does 2972 * not dictate how offsets are used by device drivers, so in 2973 * the case of a device mapping a negative offset is passed 2974 * on. 2975 */ 2976 if ( 2977 #ifdef _LP64 2978 size > OFF_MAX || 2979 #endif 2980 foff > OFF_MAX - size) 2981 return (EINVAL); 2982 2983 writecounted = FALSE; 2984 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, 2985 &foff, &object, &writecounted); 2986 if (error != 0) 2987 return (error); 2988 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, 2989 foff, writecounted, td); 2990 if (error != 0) { 2991 /* 2992 * If this mapping was accounted for in the vnode's 2993 * writecount, then undo that now. 2994 */ 2995 if (writecounted) 2996 vm_pager_release_writecount(object, 0, size); 2997 vm_object_deallocate(object); 2998 } 2999 #ifdef HWPMC_HOOKS 3000 /* Inform hwpmc(4) if an executable is being mapped. 
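 * hwpmc(4) records executable mappings so that sampled program
 * counter values can later be attributed to the file and offset being
 * executed.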
*/ 3001 if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { 3002 if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { 3003 pkm.pm_file = vp; 3004 pkm.pm_address = (uintptr_t) *addr; 3005 PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); 3006 } 3007 } 3008 #endif 3009 return (error); 3010 } 3011 3012 void 3013 vn_fsid(struct vnode *vp, struct vattr *va) 3014 { 3015 fsid_t *f; 3016 3017 f = &vp->v_mount->mnt_stat.f_fsid; 3018 va->va_fsid = (uint32_t)f->val[1]; 3019 va->va_fsid <<= sizeof(f->val[1]) * NBBY; 3020 va->va_fsid += (uint32_t)f->val[0]; 3021 } 3022 3023 int 3024 vn_fsync_buf(struct vnode *vp, int waitfor) 3025 { 3026 struct buf *bp, *nbp; 3027 struct bufobj *bo; 3028 struct mount *mp; 3029 int error, maxretry; 3030 3031 error = 0; 3032 maxretry = 10000; /* large, arbitrarily chosen */ 3033 mp = NULL; 3034 if (vp->v_type == VCHR) { 3035 VI_LOCK(vp); 3036 mp = vp->v_rdev->si_mountpt; 3037 VI_UNLOCK(vp); 3038 } 3039 bo = &vp->v_bufobj; 3040 BO_LOCK(bo); 3041 loop1: 3042 /* 3043 * MARK/SCAN initialization to avoid infinite loops. 3044 */ 3045 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 3046 bp->b_vflags &= ~BV_SCANNED; 3047 bp->b_error = 0; 3048 } 3049 3050 /* 3051 * Flush all dirty buffers associated with a vnode. 3052 */ 3053 loop2: 3054 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 3055 if ((bp->b_vflags & BV_SCANNED) != 0) 3056 continue; 3057 bp->b_vflags |= BV_SCANNED; 3058 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { 3059 if (waitfor != MNT_WAIT) 3060 continue; 3061 if (BUF_LOCK(bp, 3062 LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL, 3063 BO_LOCKPTR(bo)) != 0) { 3064 BO_LOCK(bo); 3065 goto loop1; 3066 } 3067 BO_LOCK(bo); 3068 } 3069 BO_UNLOCK(bo); 3070 KASSERT(bp->b_bufobj == bo, 3071 ("bp %p wrong b_bufobj %p should be %p", 3072 bp, bp->b_bufobj, bo)); 3073 if ((bp->b_flags & B_DELWRI) == 0) 3074 panic("fsync: not dirty"); 3075 if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { 3076 vfs_bio_awrite(bp); 3077 } else { 3078 bremfree(bp); 3079 bawrite(bp); 3080 } 3081 if (maxretry < 1000) 3082 pause("dirty", hz < 1000 ? 1 : hz / 1000); 3083 BO_LOCK(bo); 3084 goto loop2; 3085 } 3086 3087 /* 3088 * If synchronous the caller expects us to completely resolve all 3089 * dirty buffers in the system. Wait for in-progress I/O to 3090 * complete (which could include background bitmap writes), then 3091 * retry if dirty blocks still exist. 3092 */ 3093 if (waitfor == MNT_WAIT) { 3094 bufobj_wwait(bo, 0, 0); 3095 if (bo->bo_dirty.bv_cnt > 0) { 3096 /* 3097 * If we are unable to write any of these buffers 3098 * then we fail now rather than trying endlessly 3099 * to write them out. 3100 */ 3101 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) 3102 if ((error = bp->b_error) != 0) 3103 break; 3104 if ((mp != NULL && mp->mnt_secondary_writes > 0) || 3105 (error == 0 && --maxretry >= 0)) 3106 goto loop1; 3107 if (error == 0) 3108 error = EAGAIN; 3109 } 3110 } 3111 BO_UNLOCK(bo); 3112 if (error != 0) 3113 vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error); 3114 3115 return (error); 3116 } 3117 3118 /* 3119 * Copies a byte range from invp to outvp. Calls VOP_COPY_FILE_RANGE() 3120 * or vn_generic_copy_file_range() after rangelocking the byte ranges, 3121 * to do the actual copy. 3122 * vn_generic_copy_file_range() is factored out, so it can be called 3123 * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from 3124 * different file systems. 
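 *
 * Illustrative aside, a sketch rather than code from this file: the
 * copy_file_range(2) syscall layer ends up calling this roughly as
 *
 *	len = MIN(len, SSIZE_MAX);
 *	error = vn_copy_file_range(invp, &inoff, outvp, &outoff, &len,
 *	    flags, infp->f_cred, outfp->f_cred, td);
 *	if (error == 0)
 *		... report len bytes copied to the caller ...
 *
 * with *lenp updated to the number of bytes actually copied, which
 * may be less than requested.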
3125 */ 3126 int 3127 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, 3128 off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, 3129 struct ucred *outcred, struct thread *fsize_td) 3130 { 3131 struct mount *inmp, *outmp; 3132 struct vnode *invpl, *outvpl; 3133 int error; 3134 size_t len; 3135 uint64_t uval; 3136 3137 invpl = outvpl = NULL; 3138 len = *lenp; 3139 *lenp = 0; /* For error returns. */ 3140 error = 0; 3141 3142 /* Do some sanity checks on the arguments. */ 3143 if (invp->v_type == VDIR || outvp->v_type == VDIR) 3144 error = EISDIR; 3145 else if (*inoffp < 0 || *outoffp < 0 || 3146 invp->v_type != VREG || outvp->v_type != VREG) 3147 error = EINVAL; 3148 if (error != 0) 3149 goto out; 3150 3151 /* Ensure offset + len does not wrap around. */ 3152 uval = *inoffp; 3153 uval += len; 3154 if (uval > INT64_MAX) 3155 len = INT64_MAX - *inoffp; 3156 uval = *outoffp; 3157 uval += len; 3158 if (uval > INT64_MAX) 3159 len = INT64_MAX - *outoffp; 3160 if (len == 0) 3161 goto out; 3162 3163 error = VOP_GETLOWVNODE(invp, &invpl, FREAD); 3164 if (error != 0) 3165 goto out; 3166 error = VOP_GETLOWVNODE(outvp, &outvpl, FWRITE); 3167 if (error != 0) 3168 goto out1; 3169 3170 inmp = invpl->v_mount; 3171 outmp = outvpl->v_mount; 3172 if (inmp == NULL || outmp == NULL) 3173 goto out2; 3174 3175 for (;;) { 3176 error = vfs_busy(inmp, 0); 3177 if (error != 0) 3178 goto out2; 3179 if (inmp == outmp) 3180 break; 3181 error = vfs_busy(outmp, MBF_NOWAIT); 3182 if (error != 0) { 3183 vfs_unbusy(inmp); 3184 error = vfs_busy(outmp, 0); 3185 if (error == 0) { 3186 vfs_unbusy(outmp); 3187 continue; 3188 } 3189 goto out2; 3190 } 3191 break; 3192 } 3193 3194 /* 3195 * If the two vnodes are for the same file system type, call 3196 * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range() 3197 * which can handle copies across multiple file system types. 3198 */ 3199 *lenp = len; 3200 if (inmp == outmp || inmp->mnt_vfc == outmp->mnt_vfc) 3201 error = VOP_COPY_FILE_RANGE(invpl, inoffp, outvpl, outoffp, 3202 lenp, flags, incred, outcred, fsize_td); 3203 else 3204 error = ENOSYS; 3205 if (error == ENOSYS) 3206 error = vn_generic_copy_file_range(invpl, inoffp, outvpl, 3207 outoffp, lenp, flags, incred, outcred, fsize_td); 3208 vfs_unbusy(outmp); 3209 if (inmp != outmp) 3210 vfs_unbusy(inmp); 3211 out2: 3212 if (outvpl != NULL) 3213 vrele(outvpl); 3214 out1: 3215 if (invpl != NULL) 3216 vrele(invpl); 3217 out: 3218 return (error); 3219 } 3220 3221 /* 3222 * Test len bytes of data starting at dat for all bytes == 0. 3223 * Return true if all bytes are zero, false otherwise. 3224 * Expects dat to be well aligned. 3225 */ 3226 static bool 3227 mem_iszero(void *dat, int len) 3228 { 3229 int i; 3230 const u_int *p; 3231 const char *cp; 3232 3233 for (p = dat; len > 0; len -= sizeof(*p), p++) { 3234 if (len >= sizeof(*p)) { 3235 if (*p != 0) 3236 return (false); 3237 } else { 3238 cp = (const char *)p; 3239 for (i = 0; i < len; i++, cp++) 3240 if (*cp != '\0') 3241 return (false); 3242 } 3243 } 3244 return (true); 3245 } 3246 3247 /* 3248 * Look for a hole in the output file and, if found, adjust *outoffp 3249 * and *xferp to skip past the hole. 3250 * *xferp is the entire hole length to be written and xfer2 is how many bytes 3251 * to be written as 0's upon return. 
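 *
 * Worked example (illustrative numbers): with *outoffp = 100 and
 * xfer2 = 64k, if FIOSEEKDATA reports data starting at 160 and
 * FIOSEEKHOLE reports the next hole at 220, the already existing hole
 * over [100, 160) is skipped by advancing *outoffp and shrinking
 * *xferp by 60, and xfer2 is clipped to 60 so that the zero-filling
 * write stops exactly where the existing hole at 220 begins.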
3252 */ 3253 static off_t 3254 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp, 3255 off_t *dataoffp, off_t *holeoffp, struct ucred *cred) 3256 { 3257 int error; 3258 off_t delta; 3259 3260 if (*holeoffp == 0 || *holeoffp <= *outoffp) { 3261 *dataoffp = *outoffp; 3262 error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred, 3263 curthread); 3264 if (error == 0) { 3265 *holeoffp = *dataoffp; 3266 error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred, 3267 curthread); 3268 } 3269 if (error != 0 || *holeoffp == *dataoffp) { 3270 /* 3271 * Since outvp is unlocked, it may be possible for 3272 * another thread to do a truncate(), lseek(), write() 3273 * creating a hole at startoff between the above 3274 * VOP_IOCTL() calls, if the other thread does not do 3275 * rangelocking. 3276 * If that happens, *holeoffp == *dataoffp and finding 3277 * the hole has failed, so disable vn_skip_hole(). 3278 */ 3279 *holeoffp = -1; /* Disable use of vn_skip_hole(). */ 3280 return (xfer2); 3281 } 3282 KASSERT(*dataoffp >= *outoffp, 3283 ("vn_skip_hole: dataoff=%jd < outoff=%jd", 3284 (intmax_t)*dataoffp, (intmax_t)*outoffp)); 3285 KASSERT(*holeoffp > *dataoffp, 3286 ("vn_skip_hole: holeoff=%jd <= dataoff=%jd", 3287 (intmax_t)*holeoffp, (intmax_t)*dataoffp)); 3288 } 3289 3290 /* 3291 * If there is a hole before the data starts, advance *outoffp and 3292 * *xferp past the hole. 3293 */ 3294 if (*dataoffp > *outoffp) { 3295 delta = *dataoffp - *outoffp; 3296 if (delta >= *xferp) { 3297 /* Entire *xferp is a hole. */ 3298 *outoffp += *xferp; 3299 *xferp = 0; 3300 return (0); 3301 } 3302 *xferp -= delta; 3303 *outoffp += delta; 3304 xfer2 = MIN(xfer2, *xferp); 3305 } 3306 3307 /* 3308 * If a hole starts before the end of this xfer2, reduce this xfer2 so 3309 * that the write ends at the start of the hole. 3310 * *holeoffp should always be greater than *outoffp, but for the 3311 * non-INVARIANTS case, check this to make sure xfer2 remains a sane 3312 * value. 3313 */ 3314 if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2) 3315 xfer2 = *holeoffp - *outoffp; 3316 return (xfer2); 3317 } 3318 3319 /* 3320 * Write an xfer sized chunk to outvp in blksize blocks from dat. 3321 * dat is a maximum of blksize in length and can be written repeatedly in 3322 * the chunk. 3323 * If growfile == true, just grow the file via vn_truncate_locked() instead 3324 * of doing actual writes. 3325 * If checkhole == true, a hole is being punched, so skip over any hole 3326 * already in the output file. 3327 */ 3328 static int 3329 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer, 3330 u_long blksize, bool growfile, bool checkhole, struct ucred *cred) 3331 { 3332 struct mount *mp; 3333 off_t dataoff, holeoff, xfer2; 3334 int error; 3335 3336 /* 3337 * Loop around doing writes of blksize until write has been completed. 3338 * Lock/unlock on each loop iteration so that a bwillwrite() can be 3339 * done for each iteration, since the xfer argument can be very 3340 * large if there is a large hole to punch in the output file. 3341 */ 3342 error = 0; 3343 holeoff = 0; 3344 do { 3345 xfer2 = MIN(xfer, blksize); 3346 if (checkhole) { 3347 /* 3348 * Punching a hole. Skip writing if there is 3349 * already a hole in the output file. 
3350 */ 3351 xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer, 3352 &dataoff, &holeoff, cred); 3353 if (xfer == 0) 3354 break; 3355 if (holeoff < 0) 3356 checkhole = false; 3357 KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd", 3358 (intmax_t)xfer2)); 3359 } 3360 bwillwrite(); 3361 mp = NULL; 3362 error = vn_start_write(outvp, &mp, V_WAIT); 3363 if (error != 0) 3364 break; 3365 if (growfile) { 3366 error = vn_lock(outvp, LK_EXCLUSIVE); 3367 if (error == 0) { 3368 error = vn_truncate_locked(outvp, outoff + xfer, 3369 false, cred); 3370 VOP_UNLOCK(outvp); 3371 } 3372 } else { 3373 error = vn_lock(outvp, vn_lktype_write(mp, outvp)); 3374 if (error == 0) { 3375 error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2, 3376 outoff, UIO_SYSSPACE, IO_NODELOCKED, 3377 curthread->td_ucred, cred, NULL, curthread); 3378 outoff += xfer2; 3379 xfer -= xfer2; 3380 VOP_UNLOCK(outvp); 3381 } 3382 } 3383 if (mp != NULL) 3384 vn_finished_write(mp); 3385 } while (!growfile && xfer > 0 && error == 0); 3386 return (error); 3387 } 3388 3389 /* 3390 * Copy a byte range of one file to another. This function can handle the 3391 * case where invp and outvp are on different file systems. 3392 * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there 3393 * is no better file system specific way to do it. 3394 */ 3395 int 3396 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, 3397 struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, 3398 struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) 3399 { 3400 struct vattr inva; 3401 struct mount *mp; 3402 off_t startoff, endoff, xfer, xfer2; 3403 u_long blksize; 3404 int error, interrupted; 3405 bool cantseek, readzeros, eof, first, lastblock, holetoeof, sparse; 3406 ssize_t aresid, r = 0; 3407 size_t copylen, len, savlen; 3408 off_t outsize; 3409 char *dat; 3410 long holein, holeout; 3411 struct timespec curts, endts; 3412 3413 holein = holeout = 0; 3414 savlen = len = *lenp; 3415 error = 0; 3416 interrupted = 0; 3417 dat = NULL; 3418 3419 error = vn_lock(invp, LK_SHARED); 3420 if (error != 0) 3421 goto out; 3422 if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) 3423 holein = 0; 3424 error = VOP_GETATTR(invp, &inva, incred); 3425 if (error == 0 && inva.va_size > OFF_MAX) 3426 error = EFBIG; 3427 VOP_UNLOCK(invp); 3428 if (error != 0) 3429 goto out; 3430 3431 /* 3432 * Use va_bytes >= va_size as a hint that the file does not have 3433 * sufficient holes to justify the overhead of doing FIOSEEKHOLE. 3434 * This hint does not work well for file systems doing compression 3435 * and may fail when allocations for extended attributes increases 3436 * the value of va_bytes to >= va_size. 3437 */ 3438 sparse = true; 3439 if (holein != 0 && inva.va_bytes >= inva.va_size) { 3440 holein = 0; 3441 sparse = false; 3442 } 3443 3444 mp = NULL; 3445 error = vn_start_write(outvp, &mp, V_WAIT); 3446 if (error == 0) 3447 error = vn_lock(outvp, LK_EXCLUSIVE); 3448 if (error == 0) { 3449 /* 3450 * If fsize_td != NULL, do a vn_rlimit_fsizex() call, 3451 * now that outvp is locked. 3452 */ 3453 if (fsize_td != NULL) { 3454 struct uio io; 3455 3456 io.uio_offset = *outoffp; 3457 io.uio_resid = len; 3458 error = vn_rlimit_fsizex(outvp, &io, 0, &r, fsize_td); 3459 len = savlen = io.uio_resid; 3460 /* 3461 * No need to call vn_rlimit_fsizex_res before return, 3462 * since the uio is local. 
3463 */ 3464 } 3465 if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) 3466 holeout = 0; 3467 /* 3468 * Holes that are past EOF do not need to be written as a block 3469 * of zero bytes. So, truncate the output file as far as 3470 * possible and then use size to decide if writing 0 3471 * bytes is necessary in the loop below. 3472 */ 3473 if (error == 0) 3474 error = vn_getsize_locked(outvp, &outsize, outcred); 3475 if (error == 0 && outsize > *outoffp && 3476 *outoffp <= OFF_MAX - len && outsize <= *outoffp + len && 3477 *inoffp < inva.va_size && 3478 *outoffp <= OFF_MAX - (inva.va_size - *inoffp) && 3479 outsize <= *outoffp + (inva.va_size - *inoffp)) { 3480 #ifdef MAC 3481 error = mac_vnode_check_write(curthread->td_ucred, 3482 outcred, outvp); 3483 if (error == 0) 3484 #endif 3485 error = vn_truncate_locked(outvp, *outoffp, 3486 false, outcred); 3487 if (error == 0) 3488 outsize = *outoffp; 3489 } 3490 VOP_UNLOCK(outvp); 3491 } 3492 if (mp != NULL) 3493 vn_finished_write(mp); 3494 if (error != 0) 3495 goto out; 3496 3497 if (sparse && holein == 0 && holeout > 0) { 3498 /* 3499 * For this special case, the input data will be scanned 3500 * for blocks of all 0 bytes. For these blocks, the 3501 * write can be skipped for the output file to create 3502 * an unallocated region. 3503 * Therefore, use the appropriate size for the output file. 3504 */ 3505 blksize = holeout; 3506 if (blksize <= 512) { 3507 /* 3508 * Use f_iosize, since ZFS reports a _PC_MIN_HOLE_SIZE 3509 * of 512, although it actually only creates 3510 * unallocated regions for blocks >= f_iosize. 3511 */ 3512 blksize = outvp->v_mount->mnt_stat.f_iosize; 3513 } 3514 } else { 3515 /* 3516 * Use the larger of the two f_iosize values. If they are 3517 * not the same size, one will normally be an exact multiple of 3518 * the other, since they are both likely to be a power of 2. 3519 */ 3520 blksize = MAX(invp->v_mount->mnt_stat.f_iosize, 3521 outvp->v_mount->mnt_stat.f_iosize); 3522 } 3523 3524 /* Clip to sane limits. */ 3525 if (blksize < 4096) 3526 blksize = 4096; 3527 else if (blksize > maxphys) 3528 blksize = maxphys; 3529 dat = malloc(blksize, M_TEMP, M_WAITOK); 3530 3531 /* 3532 * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA 3533 * to find holes. Otherwise, just scan the read block for all 0s 3534 * in the inner loop where the data copying is done. 3535 * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may 3536 * support holes on the server, but do not support FIOSEEKHOLE. 3537 * The kernel flag COPY_FILE_RANGE_TIMEO1SEC is used to indicate 3538 * that this function should return after 1second with a partial 3539 * completion. 3540 */ 3541 if ((flags & COPY_FILE_RANGE_TIMEO1SEC) != 0) { 3542 getnanouptime(&endts); 3543 endts.tv_sec++; 3544 } else 3545 timespecclear(&endts); 3546 first = true; 3547 holetoeof = eof = false; 3548 while (len > 0 && error == 0 && !eof && interrupted == 0) { 3549 endoff = 0; /* To shut up compilers. */ 3550 cantseek = true; 3551 startoff = *inoffp; 3552 copylen = len; 3553 3554 /* 3555 * Find the next data area. If there is just a hole to EOF, 3556 * FIOSEEKDATA should fail with ENXIO. 3557 * (I do not know if any file system will report a hole to 3558 * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA 3559 * will fail for those file systems.) 3560 * 3561 * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, 3562 * the code just falls through to the inner copy loop. 
3563 */ 3564 error = EINVAL; 3565 if (holein > 0) { 3566 error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, 3567 incred, curthread); 3568 if (error == ENXIO) { 3569 startoff = endoff = inva.va_size; 3570 eof = holetoeof = true; 3571 error = 0; 3572 } 3573 } 3574 if (error == 0 && !holetoeof) { 3575 endoff = startoff; 3576 error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, 3577 incred, curthread); 3578 /* 3579 * Since invp is unlocked, it may be possible for 3580 * another thread to do a truncate(), lseek(), write() 3581 * creating a hole at startoff between the above 3582 * VOP_IOCTL() calls, if the other thread does not do 3583 * rangelocking. 3584 * If that happens, startoff == endoff and finding 3585 * the hole has failed, so set an error. 3586 */ 3587 if (error == 0 && startoff == endoff) 3588 error = EINVAL; /* Any error. Reset to 0. */ 3589 } 3590 if (error == 0) { 3591 if (startoff > *inoffp) { 3592 /* Found hole before data block. */ 3593 xfer = MIN(startoff - *inoffp, len); 3594 if (*outoffp < outsize) { 3595 /* Must write 0s to punch hole. */ 3596 xfer2 = MIN(outsize - *outoffp, 3597 xfer); 3598 memset(dat, 0, MIN(xfer2, blksize)); 3599 error = vn_write_outvp(outvp, dat, 3600 *outoffp, xfer2, blksize, false, 3601 holeout > 0, outcred); 3602 } 3603 3604 if (error == 0 && *outoffp + xfer > 3605 outsize && (xfer == len || holetoeof)) { 3606 /* Grow output file (hole at end). */ 3607 error = vn_write_outvp(outvp, dat, 3608 *outoffp, xfer, blksize, true, 3609 false, outcred); 3610 } 3611 if (error == 0) { 3612 *inoffp += xfer; 3613 *outoffp += xfer; 3614 len -= xfer; 3615 if (len < savlen) { 3616 interrupted = sig_intr(); 3617 if (timespecisset(&endts) && 3618 interrupted == 0) { 3619 getnanouptime(&curts); 3620 if (timespeccmp(&curts, 3621 &endts, >=)) 3622 interrupted = 3623 EINTR; 3624 } 3625 } 3626 } 3627 } 3628 copylen = MIN(len, endoff - startoff); 3629 cantseek = false; 3630 } else { 3631 cantseek = true; 3632 if (!sparse) 3633 cantseek = false; 3634 startoff = *inoffp; 3635 copylen = len; 3636 error = 0; 3637 } 3638 3639 xfer = blksize; 3640 if (cantseek) { 3641 /* 3642 * Set first xfer to end at a block boundary, so that 3643 * holes are more likely detected in the loop below via 3644 * the for all bytes 0 method. 3645 */ 3646 xfer -= (*inoffp % blksize); 3647 } 3648 3649 /* 3650 * Loop copying the data block. If this was our first attempt 3651 * to copy anything, allow a zero-length block so that the VOPs 3652 * get a chance to update metadata, specifically the atime. 3653 */ 3654 while (error == 0 && ((copylen > 0 && !eof) || first) && 3655 interrupted == 0) { 3656 if (copylen < xfer) 3657 xfer = copylen; 3658 first = false; 3659 error = vn_lock(invp, LK_SHARED); 3660 if (error != 0) 3661 goto out; 3662 error = vn_rdwr(UIO_READ, invp, dat, xfer, 3663 startoff, UIO_SYSSPACE, IO_NODELOCKED, 3664 curthread->td_ucred, incred, &aresid, 3665 curthread); 3666 VOP_UNLOCK(invp); 3667 lastblock = false; 3668 if (error == 0 && (xfer == 0 || aresid > 0)) { 3669 /* Stop the copy at EOF on the input file. */ 3670 xfer -= aresid; 3671 eof = true; 3672 lastblock = true; 3673 } 3674 if (error == 0) { 3675 /* 3676 * Skip the write for holes past the initial EOF 3677 * of the output file, unless this is the last 3678 * write of the output file at EOF. 3679 */ 3680 readzeros = cantseek ? 
mem_iszero(dat, xfer) : 3681 false; 3682 if (xfer == len) 3683 lastblock = true; 3684 if (!cantseek || *outoffp < outsize || 3685 lastblock || !readzeros) 3686 error = vn_write_outvp(outvp, dat, 3687 *outoffp, xfer, blksize, 3688 readzeros && lastblock && 3689 *outoffp >= outsize, false, 3690 outcred); 3691 if (error == 0) { 3692 *inoffp += xfer; 3693 startoff += xfer; 3694 *outoffp += xfer; 3695 copylen -= xfer; 3696 len -= xfer; 3697 if (len < savlen) { 3698 interrupted = sig_intr(); 3699 if (timespecisset(&endts) && 3700 interrupted == 0) { 3701 getnanouptime(&curts); 3702 if (timespeccmp(&curts, 3703 &endts, >=)) 3704 interrupted = 3705 EINTR; 3706 } 3707 } 3708 } 3709 } 3710 xfer = blksize; 3711 } 3712 } 3713 out: 3714 *lenp = savlen - len; 3715 free(dat, M_TEMP); 3716 return (error); 3717 } 3718 3719 static int 3720 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) 3721 { 3722 struct mount *mp; 3723 struct vnode *vp; 3724 off_t olen, ooffset; 3725 int error; 3726 #ifdef AUDIT 3727 int audited_vnode1 = 0; 3728 #endif 3729 3730 vp = fp->f_vnode; 3731 if (vp->v_type != VREG) 3732 return (ENODEV); 3733 3734 /* Allocating blocks may take a long time, so iterate. */ 3735 for (;;) { 3736 olen = len; 3737 ooffset = offset; 3738 3739 bwillwrite(); 3740 mp = NULL; 3741 error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); 3742 if (error != 0) 3743 break; 3744 error = vn_lock(vp, LK_EXCLUSIVE); 3745 if (error != 0) { 3746 vn_finished_write(mp); 3747 break; 3748 } 3749 #ifdef AUDIT 3750 if (!audited_vnode1) { 3751 AUDIT_ARG_VNODE1(vp); 3752 audited_vnode1 = 1; 3753 } 3754 #endif 3755 #ifdef MAC 3756 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); 3757 if (error == 0) 3758 #endif 3759 error = VOP_ALLOCATE(vp, &offset, &len, 0, 3760 td->td_ucred); 3761 VOP_UNLOCK(vp); 3762 vn_finished_write(mp); 3763 3764 if (olen + ooffset != offset + len) { 3765 panic("offset + len changed from %jx/%jx to %jx/%jx", 3766 ooffset, olen, offset, len); 3767 } 3768 if (error != 0 || len == 0) 3769 break; 3770 KASSERT(olen > len, ("Iteration did not make progress?")); 3771 maybe_yield(); 3772 } 3773 3774 return (error); 3775 } 3776 3777 static int 3778 vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags, 3779 int ioflag, struct ucred *cred, struct ucred *active_cred, 3780 struct ucred *file_cred) 3781 { 3782 struct mount *mp; 3783 void *rl_cookie; 3784 off_t off, len; 3785 int error; 3786 #ifdef AUDIT 3787 bool audited_vnode1 = false; 3788 #endif 3789 3790 rl_cookie = NULL; 3791 error = 0; 3792 mp = NULL; 3793 off = *offset; 3794 len = *length; 3795 3796 if ((ioflag & (IO_NODELOCKED | IO_RANGELOCKED)) == 0) 3797 rl_cookie = vn_rangelock_wlock(vp, off, off + len); 3798 while (len > 0 && error == 0) { 3799 /* 3800 * Try to deallocate the longest range in one pass. 3801 * In case a pass takes too long to be executed, it returns 3802 * partial result. The residue will be proceeded in the next 3803 * pass. 
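 *
 * Illustrative aside, a sketch rather than code from this file: a
 * kernel-internal caller punches a hole through the public wrapper
 * below as
 *
 *	off_t off = start, len = length;
 *
 *	error = vn_deallocate(vp, &off, &len, 0, IO_SYNC,
 *	    td->td_ucred, NOCRED);
 *
 * and, if the operation is cut short, finds the not yet deallocated
 * remainder in off and len.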
3804 */ 3805 3806 if ((ioflag & IO_NODELOCKED) == 0) { 3807 bwillwrite(); 3808 if ((error = vn_start_write(vp, &mp, 3809 V_WAIT | V_PCATCH)) != 0) 3810 goto out; 3811 vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); 3812 } 3813 #ifdef AUDIT 3814 if (!audited_vnode1) { 3815 AUDIT_ARG_VNODE1(vp); 3816 audited_vnode1 = true; 3817 } 3818 #endif 3819 3820 #ifdef MAC 3821 if ((ioflag & IO_NOMACCHECK) == 0) 3822 error = mac_vnode_check_write(active_cred, file_cred, 3823 vp); 3824 #endif 3825 if (error == 0) 3826 error = VOP_DEALLOCATE(vp, &off, &len, flags, ioflag, 3827 cred); 3828 3829 if ((ioflag & IO_NODELOCKED) == 0) { 3830 VOP_UNLOCK(vp); 3831 if (mp != NULL) { 3832 vn_finished_write(mp); 3833 mp = NULL; 3834 } 3835 } 3836 if (error == 0 && len != 0) 3837 maybe_yield(); 3838 } 3839 out: 3840 if (rl_cookie != NULL) 3841 vn_rangelock_unlock(vp, rl_cookie); 3842 *offset = off; 3843 *length = len; 3844 return (error); 3845 } 3846 3847 /* 3848 * This function is supposed to be used in the situations where the deallocation 3849 * is not triggered by a user request. 3850 */ 3851 int 3852 vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags, 3853 int ioflag, struct ucred *active_cred, struct ucred *file_cred) 3854 { 3855 struct ucred *cred; 3856 3857 if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset || 3858 flags != 0) 3859 return (EINVAL); 3860 if (vp->v_type != VREG) 3861 return (ENODEV); 3862 3863 cred = file_cred != NOCRED ? file_cred : active_cred; 3864 return (vn_deallocate_impl(vp, offset, length, flags, ioflag, cred, 3865 active_cred, file_cred)); 3866 } 3867 3868 static int 3869 vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, 3870 struct ucred *active_cred, struct thread *td) 3871 { 3872 int error; 3873 struct vnode *vp; 3874 int ioflag; 3875 3876 KASSERT(cmd == SPACECTL_DEALLOC, ("vn_fspacectl: Invalid cmd")); 3877 KASSERT((flags & ~SPACECTL_F_SUPPORTED) == 0, 3878 ("vn_fspacectl: non-zero flags")); 3879 KASSERT(*offset >= 0 && *length > 0 && *length <= OFF_MAX - *offset, 3880 ("vn_fspacectl: offset/length overflow or underflow")); 3881 vp = fp->f_vnode; 3882 3883 if (vp->v_type != VREG) 3884 return (ENODEV); 3885 3886 ioflag = get_write_ioflag(fp); 3887 3888 switch (cmd) { 3889 case SPACECTL_DEALLOC: 3890 error = vn_deallocate_impl(vp, offset, length, flags, ioflag, 3891 active_cred, active_cred, fp->f_cred); 3892 break; 3893 default: 3894 panic("vn_fspacectl: unknown cmd %d", cmd); 3895 } 3896 3897 return (error); 3898 } 3899 3900 /* 3901 * Keep this assert as long as sizeof(struct dirent) is used as the maximum 3902 * entry size. 3903 */ 3904 _Static_assert(_GENERIC_MAXDIRSIZ == sizeof(struct dirent), 3905 "'struct dirent' size must be a multiple of its alignment " 3906 "(see _GENERIC_DIRLEN())"); 3907 3908 /* 3909 * Returns successive directory entries through some caller's provided buffer. 3910 * 3911 * This function automatically refills the provided buffer with calls to 3912 * VOP_READDIR() (after MAC permission checks). 3913 * 3914 * 'td' is used for credentials and passed to uiomove(). 'dirbuf' is the 3915 * caller's buffer to fill and 'dirbuflen' its allocated size. 'dirbuf' must 3916 * be properly aligned to access 'struct dirent' structures and 'dirbuflen' 3917 * must be greater than GENERIC_MAXDIRSIZ to avoid VOP_READDIR() returning 3918 * EINVAL (the latter is not a strong guarantee (yet); but EINVAL will always 3919 * be returned if this requirement is not verified). 
'*dpp' points to the 3920 * current directory entry in the buffer and '*len' contains the remaining 3921 * valid bytes in 'dirbuf' after 'dpp' (including the pointed entry). 3922 * 3923 * At first call (or when restarting the read), '*len' must have been set to 0, 3924 * '*off' to 0 (or any valid start offset) and '*eofflag' to 0. There are no 3925 * more entries as soon as '*len' is 0 after a call that returned 0. Calling 3926 * again this function after such a condition is considered an error and EINVAL 3927 * will be returned. Other possible error codes are those of VOP_READDIR(), 3928 * EINTEGRITY if the returned entries do not pass coherency tests, or EINVAL 3929 * (bad call). All errors are unrecoverable, i.e., the state ('*len', '*off' 3930 * and '*eofflag') must be re-initialized before a subsequent call. On error 3931 * or at end of directory, '*dpp' is reset to NULL. 3932 * 3933 * '*len', '*off' and '*eofflag' are internal state the caller should not 3934 * tamper with except as explained above. '*off' is the next directory offset 3935 * to read from to refill the buffer. '*eofflag' is set to 0 or 1 by the last 3936 * internal call to VOP_READDIR() that returned without error, indicating 3937 * whether it reached the end of the directory, and to 2 by this function after 3938 * all entries have been read. 3939 */ 3940 int 3941 vn_dir_next_dirent(struct vnode *vp, struct thread *td, 3942 char *dirbuf, size_t dirbuflen, 3943 struct dirent **dpp, size_t *len, off_t *off, int *eofflag) 3944 { 3945 struct dirent *dp = NULL; 3946 int reclen; 3947 int error; 3948 struct uio uio; 3949 struct iovec iov; 3950 3951 ASSERT_VOP_LOCKED(vp, "vnode not locked"); 3952 VNASSERT(vp->v_type == VDIR, vp, ("vnode is not a directory")); 3953 MPASS2((uintptr_t)dirbuf < (uintptr_t)dirbuf + dirbuflen, 3954 "Address space overflow"); 3955 3956 if (__predict_false(dirbuflen < GENERIC_MAXDIRSIZ)) { 3957 /* Don't take any chances in this case */ 3958 error = EINVAL; 3959 goto out; 3960 } 3961 3962 if (*len != 0) { 3963 dp = *dpp; 3964 3965 /* 3966 * The caller continued to call us after an error (we set dp to 3967 * NULL in a previous iteration). Bail out right now. 3968 */ 3969 if (__predict_false(dp == NULL)) 3970 return (EINVAL); 3971 3972 MPASS(*len <= dirbuflen); 3973 MPASS2((uintptr_t)dirbuf <= (uintptr_t)dp && 3974 (uintptr_t)dp + *len <= (uintptr_t)dirbuf + dirbuflen, 3975 "Filled range not inside buffer"); 3976 3977 reclen = dp->d_reclen; 3978 if (reclen >= *len) { 3979 /* End of buffer reached */ 3980 *len = 0; 3981 } else { 3982 dp = (struct dirent *)((char *)dp + reclen); 3983 *len -= reclen; 3984 } 3985 } 3986 3987 if (*len == 0) { 3988 dp = NULL; 3989 3990 /* Have to refill. */ 3991 switch (*eofflag) { 3992 case 0: 3993 break; 3994 3995 case 1: 3996 /* Nothing more to read. */ 3997 *eofflag = 2; /* Remember the caller reached EOF. */ 3998 goto success; 3999 4000 default: 4001 /* The caller didn't test for EOF. 
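 * That is, vn_dir_next_dirent() was called again although a previous
 * call already reported the end of the directory and left *eofflag at
 * 2 (see vn_dir_check_empty() below for the canonical read loop).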

/*
 * Checks whether a directory is empty or not.
 *
 * If the directory is empty, returns 0, and if it is not, ENOTEMPTY.  Other
 * values are genuine errors preventing the check.
 */
int
vn_dir_check_empty(struct vnode *vp)
{
        struct thread *const td = curthread;
        char *dirbuf;
        size_t dirbuflen, len;
        off_t off;
        int eofflag, error;
        struct dirent *dp;
        struct vattr va;

        ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
        VNPASS(vp->v_type == VDIR, vp);

        error = VOP_GETATTR(vp, &va, td->td_ucred);
        if (error != 0)
                return (error);

        dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ);
        if (dirbuflen < va.va_blocksize)
                dirbuflen = va.va_blocksize;
        dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);

        len = 0;
        off = 0;
        eofflag = 0;

        for (;;) {
                error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen,
                    &dp, &len, &off, &eofflag);
                if (error != 0)
                        goto end;

                if (len == 0) {
                        /* EOF */
                        error = 0;
                        goto end;
                }

                /*
                 * Skip whiteouts.  Unionfs operates on filesystems only and
                 * not on hierarchies, so these whiteouts would be shadowed on
                 * the system hierarchy but not for a union using the
                 * filesystem of their directories as the upper layer.
                 * Additionally, unionfs currently transparently exposes
                 * union-specific metadata of its upper layer, meaning that
                 * whiteouts can be seen through the union view in empty
                 * directories.  Taking into account these whiteouts would then
                 * prevent mounting another filesystem on such effectively
                 * empty directories.
                 */
                if (dp->d_type == DT_WHT)
                        continue;

                /*
                 * Any file in the directory which is not '.' or '..' indicates
                 * the directory is not empty.
                 */
                switch (dp->d_namlen) {
                case 2:
                        if (dp->d_name[1] != '.') {
                                /* Can't be '..' (nor '.') */
                                error = ENOTEMPTY;
                                goto end;
                        }
                        /* FALLTHROUGH */
                case 1:
                        if (dp->d_name[0] != '.') {
                                /* Can't be '..' nor '.' */
                                error = ENOTEMPTY;
                                goto end;
                        }
                        break;

                default:
                        error = ENOTEMPTY;
                        goto end;
                }
        }

end:
        free(dirbuf, M_TEMP);
        return (error);
}
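
/*
 * Illustrative sketch (kept under "#if 0", i.e. not compiled): the tri-state
 * return convention of vn_dir_check_empty() -- 0 means empty, ENOTEMPTY means
 * not empty, anything else is a real error.  example_dir_is_empty() is a
 * hypothetical wrapper shown only for illustration; 'vp' must be a locked
 * directory vnode.
 */
#if 0
static bool
example_dir_is_empty(struct vnode *vp, int *errorp)
{
        int error;

        error = vn_dir_check_empty(vp);
        if (error == 0) {
                *errorp = 0;
                return (true);          /* Empty. */
        }
        if (error == ENOTEMPTY)
                error = 0;              /* Not empty, but not a failure. */
        *errorp = error;
        return (false);
}
#endif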

static u_long vn_lock_pair_pause_cnt;
SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
    &vn_lock_pair_pause_cnt, 0,
    "Count of vn_lock_pair deadlocks");

u_int vn_lock_pair_pause_max;
SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
    &vn_lock_pair_pause_max, 0,
    "Max ticks for vn_lock_pair deadlock avoidance sleep");

static void
vn_lock_pair_pause(const char *wmesg)
{
        atomic_add_long(&vn_lock_pair_pause_cnt, 1);
        pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
}

/*
 * Lock pair of (possibly same) vnodes vp1, vp2, avoiding lock order
 * reversal.  vp1_locked indicates whether vp1 is locked; if not, vp1
 * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
 * can be NULL.
 *
 * The function returns with both vnodes exclusively or shared locked,
 * according to the corresponding lkflags, and guarantees that it does
 * not create a lock order reversal with other threads during its
 * execution.  Both vnodes could be temporarily unlocked (and reclaimed).
 *
 * If requesting shared locking, the locked vnode's lock must not be
 * recursed.
 *
 * Exactly one of LK_SHARED and LK_EXCLUSIVE must be specified.
 * LK_NODDLKTREAT can be optionally passed.
 *
 * If vp1 == vp2, only one lock is obtained on it, using the most
 * exclusive of the two requested modes.
 */
void
vn_lock_pair(struct vnode *vp1, bool vp1_locked, int lkflags1,
    struct vnode *vp2, bool vp2_locked, int lkflags2)
{
        int error, locked1;

        MPASS((((lkflags1 & LK_SHARED) != 0) ^ ((lkflags1 & LK_EXCLUSIVE) != 0)) ||
            (vp1 == NULL && lkflags1 == 0));
        MPASS((lkflags1 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0);
        MPASS((((lkflags2 & LK_SHARED) != 0) ^ ((lkflags2 & LK_EXCLUSIVE) != 0)) ||
            (vp2 == NULL && lkflags2 == 0));
        MPASS((lkflags2 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0);

        if (vp1 == NULL && vp2 == NULL)
                return;

        if (vp1 == vp2) {
                MPASS(vp1_locked == vp2_locked);

                /* Select the most exclusive mode for lock. */
                if ((lkflags1 & LK_TYPE_MASK) != (lkflags2 & LK_TYPE_MASK))
                        lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE;

                if (vp1_locked) {
                        ASSERT_VOP_LOCKED(vp1, "vp1");

                        /* No need to relock if any lock is exclusive. */
                        if ((vp1->v_vnlock->lock_object.lo_flags &
                            LK_NOSHARE) != 0)
                                return;

                        locked1 = VOP_ISLOCKED(vp1);
                        if (((lkflags1 & LK_SHARED) != 0 &&
                            locked1 != LK_EXCLUSIVE) ||
                            ((lkflags1 & LK_EXCLUSIVE) != 0 &&
                            locked1 == LK_EXCLUSIVE))
                                return;
                        VOP_UNLOCK(vp1);
                }

                ASSERT_VOP_UNLOCKED(vp1, "vp1");
                vn_lock(vp1, lkflags1 | LK_RETRY);
                return;
        }

        if (vp1 != NULL) {
                if ((lkflags1 & LK_SHARED) != 0 &&
                    (vp1->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0)
                        lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE;
                if (vp1_locked && VOP_ISLOCKED(vp1) != LK_EXCLUSIVE) {
                        ASSERT_VOP_LOCKED(vp1, "vp1");
                        if ((lkflags1 & LK_EXCLUSIVE) != 0) {
                                VOP_UNLOCK(vp1);
                                ASSERT_VOP_UNLOCKED(vp1,
                                    "vp1 shared recursed");
                                vp1_locked = false;
                        }
                } else if (!vp1_locked)
                        ASSERT_VOP_UNLOCKED(vp1, "vp1");
        } else {
                vp1_locked = true;
        }

        if (vp2 != NULL) {
                if ((lkflags2 & LK_SHARED) != 0 &&
                    (vp2->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0)
                        lkflags2 = (lkflags2 & ~LK_SHARED) | LK_EXCLUSIVE;
                if (vp2_locked && VOP_ISLOCKED(vp2) != LK_EXCLUSIVE) {
                        ASSERT_VOP_LOCKED(vp2, "vp2");
                        if ((lkflags2 & LK_EXCLUSIVE) != 0) {
                                VOP_UNLOCK(vp2);
                                ASSERT_VOP_UNLOCKED(vp2,
                                    "vp2 shared recursed");
                                vp2_locked = false;
                        }
                } else if (!vp2_locked)
                        ASSERT_VOP_UNLOCKED(vp2, "vp2");
        } else {
                vp2_locked = true;
        }

        if (!vp1_locked && !vp2_locked) {
                vn_lock(vp1, lkflags1 | LK_RETRY);
                vp1_locked = true;
        }

        while (!vp1_locked || !vp2_locked) {
                if (vp1_locked && vp2 != NULL) {
                        if (vp1 != NULL) {
                                error = VOP_LOCK1(vp2, lkflags2 | LK_NOWAIT,
                                    __FILE__, __LINE__);
                                if (error == 0)
                                        break;
                                VOP_UNLOCK(vp1);
                                vp1_locked = false;
                                vn_lock_pair_pause("vlp1");
                        }
                        vn_lock(vp2, lkflags2 | LK_RETRY);
                        vp2_locked = true;
                }
                if (vp2_locked && vp1 != NULL) {
                        if (vp2 != NULL) {
                                error = VOP_LOCK1(vp1, lkflags1 | LK_NOWAIT,
                                    __FILE__, __LINE__);
                                if (error == 0)
                                        break;
                                VOP_UNLOCK(vp2);
                                vp2_locked = false;
                                vn_lock_pair_pause("vlp2");
                        }
                        vn_lock(vp1, lkflags1 | LK_RETRY);
                        vp1_locked = true;
                }
        }
        if (vp1 != NULL) {
                if (lkflags1 == LK_EXCLUSIVE)
                        ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
                else
                        ASSERT_VOP_LOCKED(vp1, "vp1 ret");
        }
        if (vp2 != NULL) {
                if (lkflags2 == LK_EXCLUSIVE)
                        ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
                else
                        ASSERT_VOP_LOCKED(vp2, "vp2 ret");
        }
}
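
/*
 * Illustrative sketch (kept under "#if 0", i.e. not compiled): taking two
 * vnode locks without risking a lock order reversal.  example_lock_two() is a
 * hypothetical helper; both vnodes must be referenced by the caller, and since
 * vn_lock_pair() may transiently drop the locks, the vnodes are re-validated
 * before use.
 */
#if 0
static int
example_lock_two(struct vnode *dvp, struct vnode *tvp)
{
        /* Both vnodes start out unlocked and are requested exclusive. */
        vn_lock_pair(dvp, false, LK_EXCLUSIVE, tvp, false, LK_EXCLUSIVE);

        if (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(tvp)) {
                /* One of them was reclaimed while transiently unlocked. */
                VOP_UNLOCK(dvp);
                if (tvp != dvp)
                        VOP_UNLOCK(tvp);
                return (ENOENT);
        }
        return (0);
}
#endif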

int
vn_lktype_write(struct mount *mp, struct vnode *vp)
{
        if (MNT_SHARED_WRITES(mp) ||
            (mp == NULL && MNT_SHARED_WRITES(vp->v_mount)))
                return (LK_SHARED);
        return (LK_EXCLUSIVE);
}

int
vn_cmp(struct file *fp1, struct file *fp2, struct thread *td)
{
        if (fp2->f_type != DTYPE_VNODE)
                return (3);
        return (kcmp_cmp((uintptr_t)fp1->f_vnode, (uintptr_t)fp2->f_vnode));
}
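
/*
 * Illustrative sketch (kept under "#if 0", i.e. not compiled): the usual
 * sequence for modifying a vnode, as used by vn_deallocate_impl() and others
 * above -- suspend-aware write start, then lock with the type returned by
 * vn_lktype_write().  example_modify_vnode() is a hypothetical helper shown
 * only to demonstrate the pattern.
 */
#if 0
static int
example_modify_vnode(struct vnode *vp)
{
        struct mount *mp;
        int error;

        bwillwrite();
        error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
        if (error != 0)
                return (error);
        vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);

        /* ... issue the modifying VOP here ... */

        VOP_UNLOCK(vp);
        vn_finished_write(mp);
        return (error);
}
#endif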