1 /*- 2 * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause) 3 * 4 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 60 * 61 * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 62 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... 63 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 64 */ 65 66 #include <sys/cdefs.h> 67 __FBSDID("$FreeBSD$"); 68 69 #include <sys/param.h> 70 #include <sys/bio.h> 71 #include <sys/systm.h> 72 #include <sys/buf.h> 73 #include <sys/conf.h> 74 #include <sys/extattr.h> 75 #include <sys/kernel.h> 76 #include <sys/limits.h> 77 #include <sys/malloc.h> 78 #include <sys/mount.h> 79 #include <sys/priv.h> 80 #include <sys/rwlock.h> 81 #include <sys/stat.h> 82 #include <sys/sysctl.h> 83 #include <sys/vmmeter.h> 84 #include <sys/vnode.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_param.h> 88 #include <vm/vm_extern.h> 89 #include <vm/vm_object.h> 90 #include <vm/vm_page.h> 91 #include <vm/vm_pager.h> 92 #include <vm/vnode_pager.h> 93 94 #include <ufs/ufs/extattr.h> 95 #include <ufs/ufs/quota.h> 96 #include <ufs/ufs/inode.h> 97 #include <ufs/ufs/ufs_extern.h> 98 #include <ufs/ufs/ufsmount.h> 99 100 #include <ufs/ffs/fs.h> 101 #include <ufs/ffs/ffs_extern.h> 102 #include "opt_directio.h" 103 #include "opt_ffs.h" 104 105 #define ALIGNED_TO(ptr, s) \ 106 (((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0) 107 108 #ifdef DIRECTIO 109 extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 110 #endif 111 static vop_fdatasync_t ffs_fdatasync; 112 static vop_fsync_t ffs_fsync; 113 static vop_getpages_t ffs_getpages; 114 static vop_getpages_async_t ffs_getpages_async; 115 static vop_lock1_t ffs_lock; 116 #ifdef INVARIANTS 117 static vop_unlock_t ffs_unlock_debug; 118 #endif 119 static vop_read_t ffs_read; 120 static vop_write_t ffs_write; 121 static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); 122 static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, 123 struct ucred *cred); 124 static vop_strategy_t ffsext_strategy; 125 static vop_closeextattr_t ffs_closeextattr; 126 static vop_deleteextattr_t ffs_deleteextattr; 127 static vop_getextattr_t ffs_getextattr; 128 static vop_listextattr_t ffs_listextattr; 129 static vop_openextattr_t ffs_openextattr; 130 static vop_setextattr_t ffs_setextattr; 131 static vop_vptofh_t ffs_vptofh; 132 133 /* Global vfs data structures for ufs. */ 134 struct vop_vector ffs_vnodeops1 = { 135 .vop_default = &ufs_vnodeops, 136 .vop_fsync = ffs_fsync, 137 .vop_fdatasync = ffs_fdatasync, 138 .vop_getpages = ffs_getpages, 139 .vop_getpages_async = ffs_getpages_async, 140 .vop_lock1 = ffs_lock, 141 #ifdef INVARIANTS 142 .vop_unlock = ffs_unlock_debug, 143 #endif 144 .vop_read = ffs_read, 145 .vop_reallocblks = ffs_reallocblks, 146 .vop_write = ffs_write, 147 .vop_vptofh = ffs_vptofh, 148 }; 149 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1); 150 151 struct vop_vector ffs_fifoops1 = { 152 .vop_default = &ufs_fifoops, 153 .vop_fsync = ffs_fsync, 154 .vop_fdatasync = ffs_fdatasync, 155 .vop_lock1 = ffs_lock, 156 #ifdef INVARIANTS 157 .vop_unlock = ffs_unlock_debug, 158 #endif 159 .vop_vptofh = ffs_vptofh, 160 }; 161 VFS_VOP_VECTOR_REGISTER(ffs_fifoops1); 162 163 /* Global vfs data structures for ufs. */ 164 struct vop_vector ffs_vnodeops2 = { 165 .vop_default = &ufs_vnodeops, 166 .vop_fsync = ffs_fsync, 167 .vop_fdatasync = ffs_fdatasync, 168 .vop_getpages = ffs_getpages, 169 .vop_getpages_async = ffs_getpages_async, 170 .vop_lock1 = ffs_lock, 171 #ifdef INVARIANTS 172 .vop_unlock = ffs_unlock_debug, 173 #endif 174 .vop_read = ffs_read, 175 .vop_reallocblks = ffs_reallocblks, 176 .vop_write = ffs_write, 177 .vop_closeextattr = ffs_closeextattr, 178 .vop_deleteextattr = ffs_deleteextattr, 179 .vop_getextattr = ffs_getextattr, 180 .vop_listextattr = ffs_listextattr, 181 .vop_openextattr = ffs_openextattr, 182 .vop_setextattr = ffs_setextattr, 183 .vop_vptofh = ffs_vptofh, 184 }; 185 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2); 186 187 struct vop_vector ffs_fifoops2 = { 188 .vop_default = &ufs_fifoops, 189 .vop_fsync = ffs_fsync, 190 .vop_fdatasync = ffs_fdatasync, 191 .vop_lock1 = ffs_lock, 192 #ifdef INVARIANTS 193 .vop_unlock = ffs_unlock_debug, 194 #endif 195 .vop_reallocblks = ffs_reallocblks, 196 .vop_strategy = ffsext_strategy, 197 .vop_closeextattr = ffs_closeextattr, 198 .vop_deleteextattr = ffs_deleteextattr, 199 .vop_getextattr = ffs_getextattr, 200 .vop_listextattr = ffs_listextattr, 201 .vop_openextattr = ffs_openextattr, 202 .vop_setextattr = ffs_setextattr, 203 .vop_vptofh = ffs_vptofh, 204 }; 205 VFS_VOP_VECTOR_REGISTER(ffs_fifoops2); 206 207 /* 208 * Synch an open file. 209 */ 210 /* ARGSUSED */ 211 static int 212 ffs_fsync(struct vop_fsync_args *ap) 213 { 214 struct vnode *vp; 215 struct bufobj *bo; 216 int error; 217 218 vp = ap->a_vp; 219 bo = &vp->v_bufobj; 220 retry: 221 error = ffs_syncvnode(vp, ap->a_waitfor, 0); 222 if (error) 223 return (error); 224 if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) { 225 error = softdep_fsync(vp); 226 if (error) 227 return (error); 228 229 /* 230 * The softdep_fsync() function may drop vp lock, 231 * allowing for dirty buffers to reappear on the 232 * bo_dirty list. Recheck and resync as needed. 233 */ 234 BO_LOCK(bo); 235 if ((vp->v_type == VREG || vp->v_type == VDIR) && 236 (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { 237 BO_UNLOCK(bo); 238 goto retry; 239 } 240 BO_UNLOCK(bo); 241 } 242 return (0); 243 } 244 245 int 246 ffs_syncvnode(struct vnode *vp, int waitfor, int flags) 247 { 248 struct inode *ip; 249 struct bufobj *bo; 250 struct buf *bp, *nbp; 251 ufs_lbn_t lbn; 252 int error, passes; 253 bool still_dirty, wait; 254 255 ip = VTOI(vp); 256 ip->i_flag &= ~IN_NEEDSYNC; 257 bo = &vp->v_bufobj; 258 259 /* 260 * When doing MNT_WAIT we must first flush all dependencies 261 * on the inode. 262 */ 263 if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && 264 (error = softdep_sync_metadata(vp)) != 0) 265 return (error); 266 267 /* 268 * Flush all dirty buffers associated with a vnode. 269 */ 270 error = 0; 271 passes = 0; 272 wait = false; /* Always do an async pass first. */ 273 lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); 274 BO_LOCK(bo); 275 loop: 276 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) 277 bp->b_vflags &= ~BV_SCANNED; 278 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 279 /* 280 * Reasons to skip this buffer: it has already been considered 281 * on this pass, the buffer has dependencies that will cause 282 * it to be redirtied and it has not already been deferred, 283 * or it is already being written. 284 */ 285 if ((bp->b_vflags & BV_SCANNED) != 0) 286 continue; 287 bp->b_vflags |= BV_SCANNED; 288 /* 289 * Flush indirects in order, if requested. 290 * 291 * Note that if only datasync is requested, we can 292 * skip indirect blocks when softupdates are not 293 * active. Otherwise we must flush them with data, 294 * since dependencies prevent data block writes. 295 */ 296 if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR && 297 (lbn_level(bp->b_lblkno) >= passes || 298 ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)))) 299 continue; 300 if (bp->b_lblkno > lbn) 301 panic("ffs_syncvnode: syncing truncated data."); 302 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) { 303 BO_UNLOCK(bo); 304 } else if (wait) { 305 if (BUF_LOCK(bp, 306 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 307 BO_LOCKPTR(bo)) != 0) { 308 bp->b_vflags &= ~BV_SCANNED; 309 goto next; 310 } 311 } else 312 continue; 313 if ((bp->b_flags & B_DELWRI) == 0) 314 panic("ffs_fsync: not dirty"); 315 /* 316 * Check for dependencies and potentially complete them. 317 */ 318 if (!LIST_EMPTY(&bp->b_dep) && 319 (error = softdep_sync_buf(vp, bp, 320 wait ? MNT_WAIT : MNT_NOWAIT)) != 0) { 321 /* I/O error. */ 322 if (error != EBUSY) { 323 BUF_UNLOCK(bp); 324 return (error); 325 } 326 /* If we deferred once, don't defer again. */ 327 if ((bp->b_flags & B_DEFERRED) == 0) { 328 bp->b_flags |= B_DEFERRED; 329 BUF_UNLOCK(bp); 330 goto next; 331 } 332 } 333 if (wait) { 334 bremfree(bp); 335 if ((error = bwrite(bp)) != 0) 336 return (error); 337 } else if ((bp->b_flags & B_CLUSTEROK)) { 338 (void) vfs_bio_awrite(bp); 339 } else { 340 bremfree(bp); 341 (void) bawrite(bp); 342 } 343 next: 344 /* 345 * Since we may have slept during the I/O, we need 346 * to start from a known point. 347 */ 348 BO_LOCK(bo); 349 nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd); 350 } 351 if (waitfor != MNT_WAIT) { 352 BO_UNLOCK(bo); 353 if ((flags & NO_INO_UPDT) != 0) 354 return (0); 355 else 356 return (ffs_update(vp, 0)); 357 } 358 /* Drain IO to see if we're done. */ 359 bufobj_wwait(bo, 0, 0); 360 /* 361 * Block devices associated with filesystems may have new I/O 362 * requests posted for them even if the vnode is locked, so no 363 * amount of trying will get them clean. We make several passes 364 * as a best effort. 365 * 366 * Regular files may need multiple passes to flush all dependency 367 * work as it is possible that we must write once per indirect 368 * level, once for the leaf, and once for the inode and each of 369 * these will be done with one sync and one async pass. 370 */ 371 if (bo->bo_dirty.bv_cnt > 0) { 372 if ((flags & DATA_ONLY) == 0) { 373 still_dirty = true; 374 } else { 375 /* 376 * For data-only sync, dirty indirect buffers 377 * are ignored. 378 */ 379 still_dirty = false; 380 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 381 if (bp->b_lblkno > -UFS_NDADDR) { 382 still_dirty = true; 383 break; 384 } 385 } 386 } 387 388 if (still_dirty) { 389 /* Write the inode after sync passes to flush deps. */ 390 if (wait && DOINGSOFTDEP(vp) && 391 (flags & NO_INO_UPDT) == 0) { 392 BO_UNLOCK(bo); 393 ffs_update(vp, 1); 394 BO_LOCK(bo); 395 } 396 /* switch between sync/async. */ 397 wait = !wait; 398 if (wait || ++passes < UFS_NIADDR + 2) 399 goto loop; 400 } 401 } 402 BO_UNLOCK(bo); 403 error = 0; 404 if ((flags & DATA_ONLY) == 0) { 405 if ((flags & NO_INO_UPDT) == 0) 406 error = ffs_update(vp, 1); 407 if (DOINGSUJ(vp)) 408 softdep_journal_fsync(VTOI(vp)); 409 } 410 return (error); 411 } 412 413 static int 414 ffs_fdatasync(struct vop_fdatasync_args *ap) 415 { 416 417 return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY)); 418 } 419 420 static int 421 ffs_lock(ap) 422 struct vop_lock1_args /* { 423 struct vnode *a_vp; 424 int a_flags; 425 struct thread *a_td; 426 char *file; 427 int line; 428 } */ *ap; 429 { 430 #ifndef NO_FFS_SNAPSHOT 431 struct vnode *vp; 432 int flags; 433 struct lock *lkp; 434 int result; 435 436 switch (ap->a_flags & LK_TYPE_MASK) { 437 case LK_SHARED: 438 case LK_UPGRADE: 439 case LK_EXCLUSIVE: 440 vp = ap->a_vp; 441 flags = ap->a_flags; 442 for (;;) { 443 #ifdef DEBUG_VFS_LOCKS 444 VNPASS(vp->v_holdcnt != 0, vp); 445 #endif 446 lkp = vp->v_vnlock; 447 result = lockmgr_lock_flags(lkp, flags, 448 &VI_MTX(vp)->lock_object, ap->a_file, ap->a_line); 449 if (lkp == vp->v_vnlock || result != 0) 450 break; 451 /* 452 * Apparent success, except that the vnode 453 * mutated between snapshot file vnode and 454 * regular file vnode while this process 455 * slept. The lock currently held is not the 456 * right lock. Release it, and try to get the 457 * new lock. 458 */ 459 lockmgr_unlock(lkp); 460 if ((flags & (LK_INTERLOCK | LK_NOWAIT)) == 461 (LK_INTERLOCK | LK_NOWAIT)) 462 return (EBUSY); 463 if ((flags & LK_TYPE_MASK) == LK_UPGRADE) 464 flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; 465 flags &= ~LK_INTERLOCK; 466 } 467 break; 468 default: 469 result = VOP_LOCK1_APV(&ufs_vnodeops, ap); 470 } 471 return (result); 472 #else 473 return (VOP_LOCK1_APV(&ufs_vnodeops, ap)); 474 #endif 475 } 476 477 #ifdef INVARIANTS 478 static int 479 ffs_unlock_debug(struct vop_unlock_args *ap) 480 { 481 struct vnode *vp = ap->a_vp; 482 struct inode *ip = VTOI(vp); 483 484 if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) { 485 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 486 VI_LOCK(vp); 487 VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp, 488 ("%s: modified vnode (%x) not on lazy list", 489 __func__, ip->i_flag)); 490 VI_UNLOCK(vp); 491 } 492 } 493 return (VOP_UNLOCK_APV(&ufs_vnodeops, ap)); 494 } 495 #endif 496 497 static int 498 ffs_read_hole(struct uio *uio, long xfersize, long *size) 499 { 500 ssize_t saved_resid, tlen; 501 int error; 502 503 while (xfersize > 0) { 504 tlen = min(xfersize, ZERO_REGION_SIZE); 505 saved_resid = uio->uio_resid; 506 error = vn_io_fault_uiomove(__DECONST(void *, zero_region), 507 tlen, uio); 508 if (error != 0) 509 return (error); 510 tlen = saved_resid - uio->uio_resid; 511 xfersize -= tlen; 512 *size -= tlen; 513 } 514 return (0); 515 } 516 517 /* 518 * Vnode op for reading. 519 */ 520 static int 521 ffs_read(ap) 522 struct vop_read_args /* { 523 struct vnode *a_vp; 524 struct uio *a_uio; 525 int a_ioflag; 526 struct ucred *a_cred; 527 } */ *ap; 528 { 529 struct vnode *vp; 530 struct inode *ip; 531 struct uio *uio; 532 struct fs *fs; 533 struct buf *bp; 534 ufs_lbn_t lbn, nextlbn; 535 off_t bytesinfile; 536 long size, xfersize, blkoffset; 537 ssize_t orig_resid; 538 int bflag, error, ioflag, seqcount; 539 540 vp = ap->a_vp; 541 uio = ap->a_uio; 542 ioflag = ap->a_ioflag; 543 if (ap->a_ioflag & IO_EXT) 544 #ifdef notyet 545 return (ffs_extread(vp, uio, ioflag)); 546 #else 547 panic("ffs_read+IO_EXT"); 548 #endif 549 #ifdef DIRECTIO 550 if ((ioflag & IO_DIRECT) != 0) { 551 int workdone; 552 553 error = ffs_rawread(vp, uio, &workdone); 554 if (error != 0 || workdone != 0) 555 return error; 556 } 557 #endif 558 559 seqcount = ap->a_ioflag >> IO_SEQSHIFT; 560 ip = VTOI(vp); 561 562 #ifdef INVARIANTS 563 if (uio->uio_rw != UIO_READ) 564 panic("ffs_read: mode"); 565 566 if (vp->v_type == VLNK) { 567 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) 568 panic("ffs_read: short symlink"); 569 } else if (vp->v_type != VREG && vp->v_type != VDIR) 570 panic("ffs_read: type %d", vp->v_type); 571 #endif 572 orig_resid = uio->uio_resid; 573 KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); 574 if (orig_resid == 0) 575 return (0); 576 KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); 577 fs = ITOFS(ip); 578 if (uio->uio_offset < ip->i_size && 579 uio->uio_offset >= fs->fs_maxfilesize) 580 return (EOVERFLOW); 581 582 bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE); 583 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 584 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) 585 break; 586 lbn = lblkno(fs, uio->uio_offset); 587 nextlbn = lbn + 1; 588 589 /* 590 * size of buffer. The buffer representing the 591 * end of the file is rounded up to the size of 592 * the block type ( fragment or full block, 593 * depending ). 594 */ 595 size = blksize(fs, ip, lbn); 596 blkoffset = blkoff(fs, uio->uio_offset); 597 598 /* 599 * The amount we want to transfer in this iteration is 600 * one FS block less the amount of the data before 601 * our startpoint (duh!) 602 */ 603 xfersize = fs->fs_bsize - blkoffset; 604 605 /* 606 * But if we actually want less than the block, 607 * or the file doesn't have a whole block more of data, 608 * then use the lesser number. 609 */ 610 if (uio->uio_resid < xfersize) 611 xfersize = uio->uio_resid; 612 if (bytesinfile < xfersize) 613 xfersize = bytesinfile; 614 615 if (lblktosize(fs, nextlbn) >= ip->i_size) { 616 /* 617 * Don't do readahead if this is the end of the file. 618 */ 619 error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp); 620 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { 621 /* 622 * Otherwise if we are allowed to cluster, 623 * grab as much as we can. 624 * 625 * XXX This may not be a win if we are not 626 * doing sequential access. 627 */ 628 error = cluster_read(vp, ip->i_size, lbn, 629 size, NOCRED, blkoffset + uio->uio_resid, 630 seqcount, bflag, &bp); 631 } else if (seqcount > 1) { 632 /* 633 * If we are NOT allowed to cluster, then 634 * if we appear to be acting sequentially, 635 * fire off a request for a readahead 636 * as well as a read. Note that the 4th and 5th 637 * arguments point to arrays of the size specified in 638 * the 6th argument. 639 */ 640 u_int nextsize = blksize(fs, ip, nextlbn); 641 error = breadn_flags(vp, lbn, lbn, size, &nextlbn, 642 &nextsize, 1, NOCRED, bflag, NULL, &bp); 643 } else { 644 /* 645 * Failing all of the above, just read what the 646 * user asked for. Interestingly, the same as 647 * the first option above. 648 */ 649 error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp); 650 } 651 if (error == EJUSTRETURN) { 652 error = ffs_read_hole(uio, xfersize, &size); 653 if (error == 0) 654 continue; 655 } 656 if (error != 0) { 657 brelse(bp); 658 bp = NULL; 659 break; 660 } 661 662 /* 663 * We should only get non-zero b_resid when an I/O error 664 * has occurred, which should cause us to break above. 665 * However, if the short read did not cause an error, 666 * then we want to ensure that we do not uiomove bad 667 * or uninitialized data. 668 */ 669 size -= bp->b_resid; 670 if (size < xfersize) { 671 if (size == 0) 672 break; 673 xfersize = size; 674 } 675 676 if (buf_mapped(bp)) { 677 error = vn_io_fault_uiomove((char *)bp->b_data + 678 blkoffset, (int)xfersize, uio); 679 } else { 680 error = vn_io_fault_pgmove(bp->b_pages, blkoffset, 681 (int)xfersize, uio); 682 } 683 if (error) 684 break; 685 686 vfs_bio_brelse(bp, ioflag); 687 } 688 689 /* 690 * This can only happen in the case of an error 691 * because the loop above resets bp to NULL on each iteration 692 * and on normal completion has not set a new value into it. 693 * so it must have come from a 'break' statement 694 */ 695 if (bp != NULL) 696 vfs_bio_brelse(bp, ioflag); 697 698 if ((error == 0 || uio->uio_resid != orig_resid) && 699 (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) 700 UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS); 701 return (error); 702 } 703 704 /* 705 * Vnode op for writing. 706 */ 707 static int 708 ffs_write(ap) 709 struct vop_write_args /* { 710 struct vnode *a_vp; 711 struct uio *a_uio; 712 int a_ioflag; 713 struct ucred *a_cred; 714 } */ *ap; 715 { 716 struct vnode *vp; 717 struct uio *uio; 718 struct inode *ip; 719 struct fs *fs; 720 struct buf *bp; 721 ufs_lbn_t lbn; 722 off_t osize; 723 ssize_t resid; 724 int seqcount; 725 int blkoffset, error, flags, ioflag, size, xfersize; 726 727 vp = ap->a_vp; 728 uio = ap->a_uio; 729 ioflag = ap->a_ioflag; 730 if (ap->a_ioflag & IO_EXT) 731 #ifdef notyet 732 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); 733 #else 734 panic("ffs_write+IO_EXT"); 735 #endif 736 737 seqcount = ap->a_ioflag >> IO_SEQSHIFT; 738 ip = VTOI(vp); 739 740 #ifdef INVARIANTS 741 if (uio->uio_rw != UIO_WRITE) 742 panic("ffs_write: mode"); 743 #endif 744 745 switch (vp->v_type) { 746 case VREG: 747 if (ioflag & IO_APPEND) 748 uio->uio_offset = ip->i_size; 749 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) 750 return (EPERM); 751 /* FALLTHROUGH */ 752 case VLNK: 753 break; 754 case VDIR: 755 panic("ffs_write: dir write"); 756 break; 757 default: 758 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, 759 (int)uio->uio_offset, 760 (int)uio->uio_resid 761 ); 762 } 763 764 KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); 765 KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); 766 fs = ITOFS(ip); 767 if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) 768 return (EFBIG); 769 /* 770 * Maybe this should be above the vnode op call, but so long as 771 * file servers have no limits, I don't think it matters. 772 */ 773 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) 774 return (EFBIG); 775 776 resid = uio->uio_resid; 777 osize = ip->i_size; 778 if (seqcount > BA_SEQMAX) 779 flags = BA_SEQMAX << BA_SEQSHIFT; 780 else 781 flags = seqcount << BA_SEQSHIFT; 782 if (ioflag & IO_SYNC) 783 flags |= IO_SYNC; 784 flags |= BA_UNMAPPED; 785 786 for (error = 0; uio->uio_resid > 0;) { 787 lbn = lblkno(fs, uio->uio_offset); 788 blkoffset = blkoff(fs, uio->uio_offset); 789 xfersize = fs->fs_bsize - blkoffset; 790 if (uio->uio_resid < xfersize) 791 xfersize = uio->uio_resid; 792 if (uio->uio_offset + xfersize > ip->i_size) 793 vnode_pager_setsize(vp, uio->uio_offset + xfersize); 794 795 /* 796 * We must perform a read-before-write if the transfer size 797 * does not cover the entire buffer. 798 */ 799 if (fs->fs_bsize > xfersize) 800 flags |= BA_CLRBUF; 801 else 802 flags &= ~BA_CLRBUF; 803 /* XXX is uio->uio_offset the right thing here? */ 804 error = UFS_BALLOC(vp, uio->uio_offset, xfersize, 805 ap->a_cred, flags, &bp); 806 if (error != 0) { 807 vnode_pager_setsize(vp, ip->i_size); 808 break; 809 } 810 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) 811 bp->b_flags |= B_NOCACHE; 812 813 if (uio->uio_offset + xfersize > ip->i_size) { 814 ip->i_size = uio->uio_offset + xfersize; 815 DIP_SET(ip, i_size, ip->i_size); 816 } 817 818 size = blksize(fs, ip, lbn) - bp->b_resid; 819 if (size < xfersize) 820 xfersize = size; 821 822 if (buf_mapped(bp)) { 823 error = vn_io_fault_uiomove((char *)bp->b_data + 824 blkoffset, (int)xfersize, uio); 825 } else { 826 error = vn_io_fault_pgmove(bp->b_pages, blkoffset, 827 (int)xfersize, uio); 828 } 829 /* 830 * If the buffer is not already filled and we encounter an 831 * error while trying to fill it, we have to clear out any 832 * garbage data from the pages instantiated for the buffer. 833 * If we do not, a failed uiomove() during a write can leave 834 * the prior contents of the pages exposed to a userland mmap. 835 * 836 * Note that we need only clear buffers with a transfer size 837 * equal to the block size because buffers with a shorter 838 * transfer size were cleared above by the call to UFS_BALLOC() 839 * with the BA_CLRBUF flag set. 840 * 841 * If the source region for uiomove identically mmaps the 842 * buffer, uiomove() performed the NOP copy, and the buffer 843 * content remains valid because the page fault handler 844 * validated the pages. 845 */ 846 if (error != 0 && (bp->b_flags & B_CACHE) == 0 && 847 fs->fs_bsize == xfersize) 848 vfs_bio_clrbuf(bp); 849 850 vfs_bio_set_flags(bp, ioflag); 851 852 /* 853 * If IO_SYNC each buffer is written synchronously. Otherwise 854 * if we have a severe page deficiency write the buffer 855 * asynchronously. Otherwise try to cluster, and if that 856 * doesn't do it then either do an async write (if O_DIRECT), 857 * or a delayed write (if not). 858 */ 859 if (ioflag & IO_SYNC) { 860 (void)bwrite(bp); 861 } else if (vm_page_count_severe() || 862 buf_dirty_count_severe() || 863 (ioflag & IO_ASYNC)) { 864 bp->b_flags |= B_CLUSTEROK; 865 bawrite(bp); 866 } else if (xfersize + blkoffset == fs->fs_bsize) { 867 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { 868 bp->b_flags |= B_CLUSTEROK; 869 cluster_write(vp, bp, ip->i_size, seqcount, 870 GB_UNMAPPED); 871 } else { 872 bawrite(bp); 873 } 874 } else if (ioflag & IO_DIRECT) { 875 bp->b_flags |= B_CLUSTEROK; 876 bawrite(bp); 877 } else { 878 bp->b_flags |= B_CLUSTEROK; 879 bdwrite(bp); 880 } 881 if (error || xfersize == 0) 882 break; 883 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 884 } 885 /* 886 * If we successfully wrote any data, and we are not the superuser 887 * we clear the setuid and setgid bits as a precaution against 888 * tampering. 889 */ 890 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && 891 ap->a_cred) { 892 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) { 893 ip->i_mode &= ~(ISUID | ISGID); 894 DIP_SET(ip, i_mode, ip->i_mode); 895 } 896 } 897 if (error) { 898 if (ioflag & IO_UNIT) { 899 (void)ffs_truncate(vp, osize, 900 IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred); 901 uio->uio_offset -= resid - uio->uio_resid; 902 uio->uio_resid = resid; 903 } 904 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 905 error = ffs_update(vp, 1); 906 return (error); 907 } 908 909 /* 910 * Extended attribute area reading. 911 */ 912 static int 913 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) 914 { 915 struct inode *ip; 916 struct ufs2_dinode *dp; 917 struct fs *fs; 918 struct buf *bp; 919 ufs_lbn_t lbn, nextlbn; 920 off_t bytesinfile; 921 long size, xfersize, blkoffset; 922 ssize_t orig_resid; 923 int error; 924 925 ip = VTOI(vp); 926 fs = ITOFS(ip); 927 dp = ip->i_din2; 928 929 #ifdef INVARIANTS 930 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) 931 panic("ffs_extread: mode"); 932 933 #endif 934 orig_resid = uio->uio_resid; 935 KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); 936 if (orig_resid == 0) 937 return (0); 938 KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); 939 940 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 941 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) 942 break; 943 lbn = lblkno(fs, uio->uio_offset); 944 nextlbn = lbn + 1; 945 946 /* 947 * size of buffer. The buffer representing the 948 * end of the file is rounded up to the size of 949 * the block type ( fragment or full block, 950 * depending ). 951 */ 952 size = sblksize(fs, dp->di_extsize, lbn); 953 blkoffset = blkoff(fs, uio->uio_offset); 954 955 /* 956 * The amount we want to transfer in this iteration is 957 * one FS block less the amount of the data before 958 * our startpoint (duh!) 959 */ 960 xfersize = fs->fs_bsize - blkoffset; 961 962 /* 963 * But if we actually want less than the block, 964 * or the file doesn't have a whole block more of data, 965 * then use the lesser number. 966 */ 967 if (uio->uio_resid < xfersize) 968 xfersize = uio->uio_resid; 969 if (bytesinfile < xfersize) 970 xfersize = bytesinfile; 971 972 if (lblktosize(fs, nextlbn) >= dp->di_extsize) { 973 /* 974 * Don't do readahead if this is the end of the info. 975 */ 976 error = bread(vp, -1 - lbn, size, NOCRED, &bp); 977 } else { 978 /* 979 * If we have a second block, then 980 * fire off a request for a readahead 981 * as well as a read. Note that the 4th and 5th 982 * arguments point to arrays of the size specified in 983 * the 6th argument. 984 */ 985 u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn); 986 987 nextlbn = -1 - nextlbn; 988 error = breadn(vp, -1 - lbn, 989 size, &nextlbn, &nextsize, 1, NOCRED, &bp); 990 } 991 if (error) { 992 brelse(bp); 993 bp = NULL; 994 break; 995 } 996 997 /* 998 * We should only get non-zero b_resid when an I/O error 999 * has occurred, which should cause us to break above. 1000 * However, if the short read did not cause an error, 1001 * then we want to ensure that we do not uiomove bad 1002 * or uninitialized data. 1003 */ 1004 size -= bp->b_resid; 1005 if (size < xfersize) { 1006 if (size == 0) 1007 break; 1008 xfersize = size; 1009 } 1010 1011 error = uiomove((char *)bp->b_data + blkoffset, 1012 (int)xfersize, uio); 1013 if (error) 1014 break; 1015 vfs_bio_brelse(bp, ioflag); 1016 } 1017 1018 /* 1019 * This can only happen in the case of an error 1020 * because the loop above resets bp to NULL on each iteration 1021 * and on normal completion has not set a new value into it. 1022 * so it must have come from a 'break' statement 1023 */ 1024 if (bp != NULL) 1025 vfs_bio_brelse(bp, ioflag); 1026 return (error); 1027 } 1028 1029 /* 1030 * Extended attribute area writing. 1031 */ 1032 static int 1033 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) 1034 { 1035 struct inode *ip; 1036 struct ufs2_dinode *dp; 1037 struct fs *fs; 1038 struct buf *bp; 1039 ufs_lbn_t lbn; 1040 off_t osize; 1041 ssize_t resid; 1042 int blkoffset, error, flags, size, xfersize; 1043 1044 ip = VTOI(vp); 1045 fs = ITOFS(ip); 1046 dp = ip->i_din2; 1047 1048 #ifdef INVARIANTS 1049 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) 1050 panic("ffs_extwrite: mode"); 1051 #endif 1052 1053 if (ioflag & IO_APPEND) 1054 uio->uio_offset = dp->di_extsize; 1055 KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0")); 1056 KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0")); 1057 if ((uoff_t)uio->uio_offset + uio->uio_resid > 1058 UFS_NXADDR * fs->fs_bsize) 1059 return (EFBIG); 1060 1061 resid = uio->uio_resid; 1062 osize = dp->di_extsize; 1063 flags = IO_EXT; 1064 if (ioflag & IO_SYNC) 1065 flags |= IO_SYNC; 1066 1067 for (error = 0; uio->uio_resid > 0;) { 1068 lbn = lblkno(fs, uio->uio_offset); 1069 blkoffset = blkoff(fs, uio->uio_offset); 1070 xfersize = fs->fs_bsize - blkoffset; 1071 if (uio->uio_resid < xfersize) 1072 xfersize = uio->uio_resid; 1073 1074 /* 1075 * We must perform a read-before-write if the transfer size 1076 * does not cover the entire buffer. 1077 */ 1078 if (fs->fs_bsize > xfersize) 1079 flags |= BA_CLRBUF; 1080 else 1081 flags &= ~BA_CLRBUF; 1082 error = UFS_BALLOC(vp, uio->uio_offset, xfersize, 1083 ucred, flags, &bp); 1084 if (error != 0) 1085 break; 1086 /* 1087 * If the buffer is not valid we have to clear out any 1088 * garbage data from the pages instantiated for the buffer. 1089 * If we do not, a failed uiomove() during a write can leave 1090 * the prior contents of the pages exposed to a userland 1091 * mmap(). XXX deal with uiomove() errors a better way. 1092 */ 1093 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) 1094 vfs_bio_clrbuf(bp); 1095 1096 if (uio->uio_offset + xfersize > dp->di_extsize) 1097 dp->di_extsize = uio->uio_offset + xfersize; 1098 1099 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; 1100 if (size < xfersize) 1101 xfersize = size; 1102 1103 error = 1104 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); 1105 1106 vfs_bio_set_flags(bp, ioflag); 1107 1108 /* 1109 * If IO_SYNC each buffer is written synchronously. Otherwise 1110 * if we have a severe page deficiency write the buffer 1111 * asynchronously. Otherwise try to cluster, and if that 1112 * doesn't do it then either do an async write (if O_DIRECT), 1113 * or a delayed write (if not). 1114 */ 1115 if (ioflag & IO_SYNC) { 1116 (void)bwrite(bp); 1117 } else if (vm_page_count_severe() || 1118 buf_dirty_count_severe() || 1119 xfersize + blkoffset == fs->fs_bsize || 1120 (ioflag & (IO_ASYNC | IO_DIRECT))) 1121 bawrite(bp); 1122 else 1123 bdwrite(bp); 1124 if (error || xfersize == 0) 1125 break; 1126 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 1127 } 1128 /* 1129 * If we successfully wrote any data, and we are not the superuser 1130 * we clear the setuid and setgid bits as a precaution against 1131 * tampering. 1132 */ 1133 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { 1134 if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) { 1135 ip->i_mode &= ~(ISUID | ISGID); 1136 dp->di_mode = ip->i_mode; 1137 } 1138 } 1139 if (error) { 1140 if (ioflag & IO_UNIT) { 1141 (void)ffs_truncate(vp, osize, 1142 IO_EXT | (ioflag&IO_SYNC), ucred); 1143 uio->uio_offset -= resid - uio->uio_resid; 1144 uio->uio_resid = resid; 1145 } 1146 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 1147 error = ffs_update(vp, 1); 1148 return (error); 1149 } 1150 1151 1152 /* 1153 * Vnode operating to retrieve a named extended attribute. 1154 * 1155 * Locate a particular EA (nspace:name) in the area (ptr:length), and return 1156 * the length of the EA, and possibly the pointer to the entry and to the data. 1157 */ 1158 static int 1159 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, 1160 struct extattr **eapp, u_char **eac) 1161 { 1162 struct extattr *eap, *eaend; 1163 size_t nlen; 1164 1165 nlen = strlen(name); 1166 KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned")); 1167 eap = (struct extattr *)ptr; 1168 eaend = (struct extattr *)(ptr + length); 1169 for (; eap < eaend; eap = EXTATTR_NEXT(eap)) { 1170 /* make sure this entry is complete */ 1171 if (EXTATTR_NEXT(eap) > eaend) 1172 break; 1173 if (eap->ea_namespace != nspace || eap->ea_namelength != nlen 1174 || memcmp(eap->ea_name, name, nlen) != 0) 1175 continue; 1176 if (eapp != NULL) 1177 *eapp = eap; 1178 if (eac != NULL) 1179 *eac = EXTATTR_CONTENT(eap); 1180 return (EXTATTR_CONTENT_SIZE(eap)); 1181 } 1182 return (-1); 1183 } 1184 1185 static int 1186 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) 1187 { 1188 struct inode *ip; 1189 struct ufs2_dinode *dp; 1190 struct fs *fs; 1191 struct uio luio; 1192 struct iovec liovec; 1193 u_int easize; 1194 int error; 1195 u_char *eae; 1196 1197 ip = VTOI(vp); 1198 fs = ITOFS(ip); 1199 dp = ip->i_din2; 1200 easize = dp->di_extsize; 1201 if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize) 1202 return (EFBIG); 1203 1204 eae = malloc(easize + extra, M_TEMP, M_WAITOK); 1205 1206 liovec.iov_base = eae; 1207 liovec.iov_len = easize; 1208 luio.uio_iov = &liovec; 1209 luio.uio_iovcnt = 1; 1210 luio.uio_offset = 0; 1211 luio.uio_resid = easize; 1212 luio.uio_segflg = UIO_SYSSPACE; 1213 luio.uio_rw = UIO_READ; 1214 luio.uio_td = td; 1215 1216 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); 1217 if (error) { 1218 free(eae, M_TEMP); 1219 return(error); 1220 } 1221 *p = eae; 1222 return (0); 1223 } 1224 1225 static void 1226 ffs_lock_ea(struct vnode *vp) 1227 { 1228 struct inode *ip; 1229 1230 ip = VTOI(vp); 1231 VI_LOCK(vp); 1232 while (ip->i_flag & IN_EA_LOCKED) { 1233 UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT); 1234 msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea", 1235 0); 1236 } 1237 UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED); 1238 VI_UNLOCK(vp); 1239 } 1240 1241 static void 1242 ffs_unlock_ea(struct vnode *vp) 1243 { 1244 struct inode *ip; 1245 1246 ip = VTOI(vp); 1247 VI_LOCK(vp); 1248 if (ip->i_flag & IN_EA_LOCKWAIT) 1249 wakeup(&ip->i_ea_refs); 1250 ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT); 1251 VI_UNLOCK(vp); 1252 } 1253 1254 static int 1255 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) 1256 { 1257 struct inode *ip; 1258 struct ufs2_dinode *dp; 1259 int error; 1260 1261 ip = VTOI(vp); 1262 1263 ffs_lock_ea(vp); 1264 if (ip->i_ea_area != NULL) { 1265 ip->i_ea_refs++; 1266 ffs_unlock_ea(vp); 1267 return (0); 1268 } 1269 dp = ip->i_din2; 1270 error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); 1271 if (error) { 1272 ffs_unlock_ea(vp); 1273 return (error); 1274 } 1275 ip->i_ea_len = dp->di_extsize; 1276 ip->i_ea_error = 0; 1277 ip->i_ea_refs++; 1278 ffs_unlock_ea(vp); 1279 return (0); 1280 } 1281 1282 /* 1283 * Vnode extattr transaction commit/abort 1284 */ 1285 static int 1286 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) 1287 { 1288 struct inode *ip; 1289 struct uio luio; 1290 struct iovec liovec; 1291 int error; 1292 struct ufs2_dinode *dp; 1293 1294 ip = VTOI(vp); 1295 1296 ffs_lock_ea(vp); 1297 if (ip->i_ea_area == NULL) { 1298 ffs_unlock_ea(vp); 1299 return (EINVAL); 1300 } 1301 dp = ip->i_din2; 1302 error = ip->i_ea_error; 1303 if (commit && error == 0) { 1304 ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit"); 1305 if (cred == NOCRED) 1306 cred = vp->v_mount->mnt_cred; 1307 liovec.iov_base = ip->i_ea_area; 1308 liovec.iov_len = ip->i_ea_len; 1309 luio.uio_iov = &liovec; 1310 luio.uio_iovcnt = 1; 1311 luio.uio_offset = 0; 1312 luio.uio_resid = ip->i_ea_len; 1313 luio.uio_segflg = UIO_SYSSPACE; 1314 luio.uio_rw = UIO_WRITE; 1315 luio.uio_td = td; 1316 /* XXX: I'm not happy about truncating to zero size */ 1317 if (ip->i_ea_len < dp->di_extsize) 1318 error = ffs_truncate(vp, 0, IO_EXT, cred); 1319 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); 1320 } 1321 if (--ip->i_ea_refs == 0) { 1322 free(ip->i_ea_area, M_TEMP); 1323 ip->i_ea_area = NULL; 1324 ip->i_ea_len = 0; 1325 ip->i_ea_error = 0; 1326 } 1327 ffs_unlock_ea(vp); 1328 return (error); 1329 } 1330 1331 /* 1332 * Vnode extattr strategy routine for fifos. 1333 * 1334 * We need to check for a read or write of the external attributes. 1335 * Otherwise we just fall through and do the usual thing. 1336 */ 1337 static int 1338 ffsext_strategy(struct vop_strategy_args *ap) 1339 /* 1340 struct vop_strategy_args { 1341 struct vnodeop_desc *a_desc; 1342 struct vnode *a_vp; 1343 struct buf *a_bp; 1344 }; 1345 */ 1346 { 1347 struct vnode *vp; 1348 daddr_t lbn; 1349 1350 vp = ap->a_vp; 1351 lbn = ap->a_bp->b_lblkno; 1352 if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR) 1353 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); 1354 if (vp->v_type == VFIFO) 1355 return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); 1356 panic("spec nodes went here"); 1357 } 1358 1359 /* 1360 * Vnode extattr transaction commit/abort 1361 */ 1362 static int 1363 ffs_openextattr(struct vop_openextattr_args *ap) 1364 /* 1365 struct vop_openextattr_args { 1366 struct vnodeop_desc *a_desc; 1367 struct vnode *a_vp; 1368 IN struct ucred *a_cred; 1369 IN struct thread *a_td; 1370 }; 1371 */ 1372 { 1373 1374 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1375 return (EOPNOTSUPP); 1376 1377 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); 1378 } 1379 1380 1381 /* 1382 * Vnode extattr transaction commit/abort 1383 */ 1384 static int 1385 ffs_closeextattr(struct vop_closeextattr_args *ap) 1386 /* 1387 struct vop_closeextattr_args { 1388 struct vnodeop_desc *a_desc; 1389 struct vnode *a_vp; 1390 int a_commit; 1391 IN struct ucred *a_cred; 1392 IN struct thread *a_td; 1393 }; 1394 */ 1395 { 1396 1397 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1398 return (EOPNOTSUPP); 1399 1400 if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) 1401 return (EROFS); 1402 1403 return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td)); 1404 } 1405 1406 /* 1407 * Vnode operation to remove a named attribute. 1408 */ 1409 static int 1410 ffs_deleteextattr(struct vop_deleteextattr_args *ap) 1411 /* 1412 vop_deleteextattr { 1413 IN struct vnode *a_vp; 1414 IN int a_attrnamespace; 1415 IN const char *a_name; 1416 IN struct ucred *a_cred; 1417 IN struct thread *a_td; 1418 }; 1419 */ 1420 { 1421 struct inode *ip; 1422 struct extattr *eap; 1423 uint32_t ul; 1424 int olen, error, i, easize; 1425 u_char *eae; 1426 void *tmp; 1427 1428 ip = VTOI(ap->a_vp); 1429 1430 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1431 return (EOPNOTSUPP); 1432 1433 if (strlen(ap->a_name) == 0) 1434 return (EINVAL); 1435 1436 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1437 return (EROFS); 1438 1439 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1440 ap->a_cred, ap->a_td, VWRITE); 1441 if (error) { 1442 1443 /* 1444 * ffs_lock_ea is not needed there, because the vnode 1445 * must be exclusively locked. 1446 */ 1447 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1448 ip->i_ea_error = error; 1449 return (error); 1450 } 1451 1452 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1453 if (error) 1454 return (error); 1455 1456 /* CEM: delete could be done in-place instead */ 1457 eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); 1458 bcopy(ip->i_ea_area, eae, ip->i_ea_len); 1459 easize = ip->i_ea_len; 1460 1461 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1462 &eap, NULL); 1463 if (olen == -1) { 1464 /* delete but nonexistent */ 1465 free(eae, M_TEMP); 1466 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1467 return (ENOATTR); 1468 } 1469 ul = eap->ea_length; 1470 i = (u_char *)EXTATTR_NEXT(eap) - eae; 1471 bcopy(EXTATTR_NEXT(eap), eap, easize - i); 1472 easize -= ul; 1473 1474 tmp = ip->i_ea_area; 1475 ip->i_ea_area = eae; 1476 ip->i_ea_len = easize; 1477 free(tmp, M_TEMP); 1478 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); 1479 return (error); 1480 } 1481 1482 /* 1483 * Vnode operation to retrieve a named extended attribute. 1484 */ 1485 static int 1486 ffs_getextattr(struct vop_getextattr_args *ap) 1487 /* 1488 vop_getextattr { 1489 IN struct vnode *a_vp; 1490 IN int a_attrnamespace; 1491 IN const char *a_name; 1492 INOUT struct uio *a_uio; 1493 OUT size_t *a_size; 1494 IN struct ucred *a_cred; 1495 IN struct thread *a_td; 1496 }; 1497 */ 1498 { 1499 struct inode *ip; 1500 u_char *eae, *p; 1501 unsigned easize; 1502 int error, ealen; 1503 1504 ip = VTOI(ap->a_vp); 1505 1506 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1507 return (EOPNOTSUPP); 1508 1509 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1510 ap->a_cred, ap->a_td, VREAD); 1511 if (error) 1512 return (error); 1513 1514 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1515 if (error) 1516 return (error); 1517 1518 eae = ip->i_ea_area; 1519 easize = ip->i_ea_len; 1520 1521 ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1522 NULL, &p); 1523 if (ealen >= 0) { 1524 error = 0; 1525 if (ap->a_size != NULL) 1526 *ap->a_size = ealen; 1527 else if (ap->a_uio != NULL) 1528 error = uiomove(p, ealen, ap->a_uio); 1529 } else 1530 error = ENOATTR; 1531 1532 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1533 return (error); 1534 } 1535 1536 /* 1537 * Vnode operation to retrieve extended attributes on a vnode. 1538 */ 1539 static int 1540 ffs_listextattr(struct vop_listextattr_args *ap) 1541 /* 1542 vop_listextattr { 1543 IN struct vnode *a_vp; 1544 IN int a_attrnamespace; 1545 INOUT struct uio *a_uio; 1546 OUT size_t *a_size; 1547 IN struct ucred *a_cred; 1548 IN struct thread *a_td; 1549 }; 1550 */ 1551 { 1552 struct inode *ip; 1553 struct extattr *eap, *eaend; 1554 int error, ealen; 1555 1556 ip = VTOI(ap->a_vp); 1557 1558 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1559 return (EOPNOTSUPP); 1560 1561 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1562 ap->a_cred, ap->a_td, VREAD); 1563 if (error) 1564 return (error); 1565 1566 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1567 if (error) 1568 return (error); 1569 1570 error = 0; 1571 if (ap->a_size != NULL) 1572 *ap->a_size = 0; 1573 1574 KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned")); 1575 eap = (struct extattr *)ip->i_ea_area; 1576 eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len); 1577 for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) { 1578 /* make sure this entry is complete */ 1579 if (EXTATTR_NEXT(eap) > eaend) 1580 break; 1581 if (eap->ea_namespace != ap->a_attrnamespace) 1582 continue; 1583 1584 ealen = eap->ea_namelength; 1585 if (ap->a_size != NULL) 1586 *ap->a_size += ealen + 1; 1587 else if (ap->a_uio != NULL) 1588 error = uiomove(&eap->ea_namelength, ealen + 1, 1589 ap->a_uio); 1590 } 1591 1592 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1593 return (error); 1594 } 1595 1596 /* 1597 * Vnode operation to set a named attribute. 1598 */ 1599 static int 1600 ffs_setextattr(struct vop_setextattr_args *ap) 1601 /* 1602 vop_setextattr { 1603 IN struct vnode *a_vp; 1604 IN int a_attrnamespace; 1605 IN const char *a_name; 1606 INOUT struct uio *a_uio; 1607 IN struct ucred *a_cred; 1608 IN struct thread *a_td; 1609 }; 1610 */ 1611 { 1612 struct inode *ip; 1613 struct fs *fs; 1614 struct extattr *eap; 1615 uint32_t ealength, ul; 1616 ssize_t ealen; 1617 int olen, eapad1, eapad2, error, i, easize; 1618 u_char *eae; 1619 void *tmp; 1620 1621 ip = VTOI(ap->a_vp); 1622 fs = ITOFS(ip); 1623 1624 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1625 return (EOPNOTSUPP); 1626 1627 if (strlen(ap->a_name) == 0) 1628 return (EINVAL); 1629 1630 /* XXX Now unsupported API to delete EAs using NULL uio. */ 1631 if (ap->a_uio == NULL) 1632 return (EOPNOTSUPP); 1633 1634 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1635 return (EROFS); 1636 1637 ealen = ap->a_uio->uio_resid; 1638 if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR)) 1639 return (EINVAL); 1640 1641 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1642 ap->a_cred, ap->a_td, VWRITE); 1643 if (error) { 1644 1645 /* 1646 * ffs_lock_ea is not needed there, because the vnode 1647 * must be exclusively locked. 1648 */ 1649 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1650 ip->i_ea_error = error; 1651 return (error); 1652 } 1653 1654 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1655 if (error) 1656 return (error); 1657 1658 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); 1659 eapad1 = roundup2(ealength, 8) - ealength; 1660 eapad2 = roundup2(ealen, 8) - ealen; 1661 ealength += eapad1 + ealen + eapad2; 1662 1663 /* 1664 * CEM: rewrites of the same size or smaller could be done in-place 1665 * instead. (We don't acquire any fine-grained locks in here either, 1666 * so we could also do bigger writes in-place.) 1667 */ 1668 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); 1669 bcopy(ip->i_ea_area, eae, ip->i_ea_len); 1670 easize = ip->i_ea_len; 1671 1672 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1673 &eap, NULL); 1674 if (olen == -1) { 1675 /* new, append at end */ 1676 KASSERT(ALIGNED_TO(eae + easize, struct extattr), 1677 ("unaligned")); 1678 eap = (struct extattr *)(eae + easize); 1679 easize += ealength; 1680 } else { 1681 ul = eap->ea_length; 1682 i = (u_char *)EXTATTR_NEXT(eap) - eae; 1683 if (ul != ealength) { 1684 bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength, 1685 easize - i); 1686 easize += (ealength - ul); 1687 } 1688 } 1689 if (easize > lblktosize(fs, UFS_NXADDR)) { 1690 free(eae, M_TEMP); 1691 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1692 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1693 ip->i_ea_error = ENOSPC; 1694 return (ENOSPC); 1695 } 1696 eap->ea_length = ealength; 1697 eap->ea_namespace = ap->a_attrnamespace; 1698 eap->ea_contentpadlen = eapad2; 1699 eap->ea_namelength = strlen(ap->a_name); 1700 memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name)); 1701 bzero(&eap->ea_name[strlen(ap->a_name)], eapad1); 1702 error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio); 1703 if (error) { 1704 free(eae, M_TEMP); 1705 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1706 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1707 ip->i_ea_error = error; 1708 return (error); 1709 } 1710 bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2); 1711 1712 tmp = ip->i_ea_area; 1713 ip->i_ea_area = eae; 1714 ip->i_ea_len = easize; 1715 free(tmp, M_TEMP); 1716 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); 1717 return (error); 1718 } 1719 1720 /* 1721 * Vnode pointer to File handle 1722 */ 1723 static int 1724 ffs_vptofh(struct vop_vptofh_args *ap) 1725 /* 1726 vop_vptofh { 1727 IN struct vnode *a_vp; 1728 IN struct fid *a_fhp; 1729 }; 1730 */ 1731 { 1732 struct inode *ip; 1733 struct ufid *ufhp; 1734 1735 ip = VTOI(ap->a_vp); 1736 ufhp = (struct ufid *)ap->a_fhp; 1737 ufhp->ufid_len = sizeof(struct ufid); 1738 ufhp->ufid_ino = ip->i_number; 1739 ufhp->ufid_gen = ip->i_gen; 1740 return (0); 1741 } 1742 1743 SYSCTL_DECL(_vfs_ffs); 1744 static int use_buf_pager = 1; 1745 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, 1746 "Always use buffer pager instead of bmap"); 1747 1748 static daddr_t 1749 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) 1750 { 1751 1752 return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off)); 1753 } 1754 1755 static int 1756 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn) 1757 { 1758 1759 return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn)); 1760 } 1761 1762 static int 1763 ffs_getpages(struct vop_getpages_args *ap) 1764 { 1765 struct vnode *vp; 1766 struct ufsmount *um; 1767 1768 vp = ap->a_vp; 1769 um = VFSTOUFS(vp->v_mount); 1770 1771 if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) 1772 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, 1773 ap->a_rbehind, ap->a_rahead, NULL, NULL)); 1774 return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, 1775 ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz)); 1776 } 1777 1778 static int 1779 ffs_getpages_async(struct vop_getpages_async_args *ap) 1780 { 1781 struct vnode *vp; 1782 struct ufsmount *um; 1783 int error; 1784 1785 vp = ap->a_vp; 1786 um = VFSTOUFS(vp->v_mount); 1787 1788 if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) 1789 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, 1790 ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg)); 1791 1792 error = vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, 1793 ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz); 1794 ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); 1795 1796 return (error); 1797 } 1798 1799