1 /*- 2 * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause) 3 * 4 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 60 * 61 * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 62 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... 63 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 64 */ 65 66 #include <sys/cdefs.h> 67 __FBSDID("$FreeBSD$"); 68 69 #include <sys/param.h> 70 #include <sys/bio.h> 71 #include <sys/systm.h> 72 #include <sys/buf.h> 73 #include <sys/conf.h> 74 #include <sys/extattr.h> 75 #include <sys/kernel.h> 76 #include <sys/limits.h> 77 #include <sys/malloc.h> 78 #include <sys/mount.h> 79 #include <sys/priv.h> 80 #include <sys/rwlock.h> 81 #include <sys/stat.h> 82 #include <sys/sysctl.h> 83 #include <sys/vmmeter.h> 84 #include <sys/vnode.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_param.h> 88 #include <vm/vm_extern.h> 89 #include <vm/vm_object.h> 90 #include <vm/vm_page.h> 91 #include <vm/vm_pager.h> 92 #include <vm/vnode_pager.h> 93 94 #include <ufs/ufs/extattr.h> 95 #include <ufs/ufs/quota.h> 96 #include <ufs/ufs/inode.h> 97 #include <ufs/ufs/ufs_extern.h> 98 #include <ufs/ufs/ufsmount.h> 99 100 #include <ufs/ffs/fs.h> 101 #include <ufs/ffs/ffs_extern.h> 102 #include "opt_directio.h" 103 #include "opt_ffs.h" 104 105 #define ALIGNED_TO(ptr, s) \ 106 (((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0) 107 108 #ifdef DIRECTIO 109 extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 110 #endif 111 static vop_fdatasync_t ffs_fdatasync; 112 static vop_fsync_t ffs_fsync; 113 static vop_getpages_t ffs_getpages; 114 static vop_getpages_async_t ffs_getpages_async; 115 static vop_lock1_t ffs_lock; 116 static vop_read_t ffs_read; 117 static vop_write_t ffs_write; 118 static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); 119 static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, 120 struct ucred *cred); 121 static vop_strategy_t ffsext_strategy; 122 static vop_closeextattr_t ffs_closeextattr; 123 static vop_deleteextattr_t ffs_deleteextattr; 124 static vop_getextattr_t ffs_getextattr; 125 static vop_listextattr_t ffs_listextattr; 126 static vop_openextattr_t ffs_openextattr; 127 static vop_setextattr_t ffs_setextattr; 128 static vop_vptofh_t ffs_vptofh; 129 130 /* Global vfs data structures for ufs. */ 131 struct vop_vector ffs_vnodeops1 = { 132 .vop_default = &ufs_vnodeops, 133 .vop_fsync = ffs_fsync, 134 .vop_fdatasync = ffs_fdatasync, 135 .vop_getpages = ffs_getpages, 136 .vop_getpages_async = ffs_getpages_async, 137 .vop_lock1 = ffs_lock, 138 .vop_read = ffs_read, 139 .vop_reallocblks = ffs_reallocblks, 140 .vop_write = ffs_write, 141 .vop_vptofh = ffs_vptofh, 142 }; 143 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1); 144 145 struct vop_vector ffs_fifoops1 = { 146 .vop_default = &ufs_fifoops, 147 .vop_fsync = ffs_fsync, 148 .vop_fdatasync = ffs_fdatasync, 149 .vop_lock1 = ffs_lock, 150 .vop_vptofh = ffs_vptofh, 151 }; 152 VFS_VOP_VECTOR_REGISTER(ffs_fifoops1); 153 154 /* Global vfs data structures for ufs. */ 155 struct vop_vector ffs_vnodeops2 = { 156 .vop_default = &ufs_vnodeops, 157 .vop_fsync = ffs_fsync, 158 .vop_fdatasync = ffs_fdatasync, 159 .vop_getpages = ffs_getpages, 160 .vop_getpages_async = ffs_getpages_async, 161 .vop_lock1 = ffs_lock, 162 .vop_read = ffs_read, 163 .vop_reallocblks = ffs_reallocblks, 164 .vop_write = ffs_write, 165 .vop_closeextattr = ffs_closeextattr, 166 .vop_deleteextattr = ffs_deleteextattr, 167 .vop_getextattr = ffs_getextattr, 168 .vop_listextattr = ffs_listextattr, 169 .vop_openextattr = ffs_openextattr, 170 .vop_setextattr = ffs_setextattr, 171 .vop_vptofh = ffs_vptofh, 172 }; 173 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2); 174 175 struct vop_vector ffs_fifoops2 = { 176 .vop_default = &ufs_fifoops, 177 .vop_fsync = ffs_fsync, 178 .vop_fdatasync = ffs_fdatasync, 179 .vop_lock1 = ffs_lock, 180 .vop_reallocblks = ffs_reallocblks, 181 .vop_strategy = ffsext_strategy, 182 .vop_closeextattr = ffs_closeextattr, 183 .vop_deleteextattr = ffs_deleteextattr, 184 .vop_getextattr = ffs_getextattr, 185 .vop_listextattr = ffs_listextattr, 186 .vop_openextattr = ffs_openextattr, 187 .vop_setextattr = ffs_setextattr, 188 .vop_vptofh = ffs_vptofh, 189 }; 190 VFS_VOP_VECTOR_REGISTER(ffs_fifoops2); 191 192 /* 193 * Synch an open file. 194 */ 195 /* ARGSUSED */ 196 static int 197 ffs_fsync(struct vop_fsync_args *ap) 198 { 199 struct vnode *vp; 200 struct bufobj *bo; 201 int error; 202 203 vp = ap->a_vp; 204 bo = &vp->v_bufobj; 205 retry: 206 error = ffs_syncvnode(vp, ap->a_waitfor, 0); 207 if (error) 208 return (error); 209 if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) { 210 error = softdep_fsync(vp); 211 if (error) 212 return (error); 213 214 /* 215 * The softdep_fsync() function may drop vp lock, 216 * allowing for dirty buffers to reappear on the 217 * bo_dirty list. Recheck and resync as needed. 218 */ 219 BO_LOCK(bo); 220 if ((vp->v_type == VREG || vp->v_type == VDIR) && 221 (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { 222 BO_UNLOCK(bo); 223 goto retry; 224 } 225 BO_UNLOCK(bo); 226 } 227 return (0); 228 } 229 230 int 231 ffs_syncvnode(struct vnode *vp, int waitfor, int flags) 232 { 233 struct inode *ip; 234 struct bufobj *bo; 235 struct buf *bp, *nbp; 236 ufs_lbn_t lbn; 237 int error, passes; 238 bool still_dirty, wait; 239 240 ip = VTOI(vp); 241 ip->i_flag &= ~IN_NEEDSYNC; 242 bo = &vp->v_bufobj; 243 244 /* 245 * When doing MNT_WAIT we must first flush all dependencies 246 * on the inode. 247 */ 248 if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && 249 (error = softdep_sync_metadata(vp)) != 0) 250 return (error); 251 252 /* 253 * Flush all dirty buffers associated with a vnode. 254 */ 255 error = 0; 256 passes = 0; 257 wait = false; /* Always do an async pass first. */ 258 lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); 259 BO_LOCK(bo); 260 loop: 261 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) 262 bp->b_vflags &= ~BV_SCANNED; 263 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 264 /* 265 * Reasons to skip this buffer: it has already been considered 266 * on this pass, the buffer has dependencies that will cause 267 * it to be redirtied and it has not already been deferred, 268 * or it is already being written. 269 */ 270 if ((bp->b_vflags & BV_SCANNED) != 0) 271 continue; 272 bp->b_vflags |= BV_SCANNED; 273 /* 274 * Flush indirects in order, if requested. 275 * 276 * Note that if only datasync is requested, we can 277 * skip indirect blocks when softupdates are not 278 * active. Otherwise we must flush them with data, 279 * since dependencies prevent data block writes. 280 */ 281 if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR && 282 (lbn_level(bp->b_lblkno) >= passes || 283 ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)))) 284 continue; 285 if (bp->b_lblkno > lbn) 286 panic("ffs_syncvnode: syncing truncated data."); 287 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) { 288 BO_UNLOCK(bo); 289 } else if (wait) { 290 if (BUF_LOCK(bp, 291 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 292 BO_LOCKPTR(bo)) != 0) { 293 bp->b_vflags &= ~BV_SCANNED; 294 goto next; 295 } 296 } else 297 continue; 298 if ((bp->b_flags & B_DELWRI) == 0) 299 panic("ffs_fsync: not dirty"); 300 /* 301 * Check for dependencies and potentially complete them. 302 */ 303 if (!LIST_EMPTY(&bp->b_dep) && 304 (error = softdep_sync_buf(vp, bp, 305 wait ? MNT_WAIT : MNT_NOWAIT)) != 0) { 306 /* I/O error. */ 307 if (error != EBUSY) { 308 BUF_UNLOCK(bp); 309 return (error); 310 } 311 /* If we deferred once, don't defer again. */ 312 if ((bp->b_flags & B_DEFERRED) == 0) { 313 bp->b_flags |= B_DEFERRED; 314 BUF_UNLOCK(bp); 315 goto next; 316 } 317 } 318 if (wait) { 319 bremfree(bp); 320 if ((error = bwrite(bp)) != 0) 321 return (error); 322 } else if ((bp->b_flags & B_CLUSTEROK)) { 323 (void) vfs_bio_awrite(bp); 324 } else { 325 bremfree(bp); 326 (void) bawrite(bp); 327 } 328 next: 329 /* 330 * Since we may have slept during the I/O, we need 331 * to start from a known point. 332 */ 333 BO_LOCK(bo); 334 nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd); 335 } 336 if (waitfor != MNT_WAIT) { 337 BO_UNLOCK(bo); 338 if ((flags & NO_INO_UPDT) != 0) 339 return (0); 340 else 341 return (ffs_update(vp, 0)); 342 } 343 /* Drain IO to see if we're done. */ 344 bufobj_wwait(bo, 0, 0); 345 /* 346 * Block devices associated with filesystems may have new I/O 347 * requests posted for them even if the vnode is locked, so no 348 * amount of trying will get them clean. We make several passes 349 * as a best effort. 350 * 351 * Regular files may need multiple passes to flush all dependency 352 * work as it is possible that we must write once per indirect 353 * level, once for the leaf, and once for the inode and each of 354 * these will be done with one sync and one async pass. 355 */ 356 if (bo->bo_dirty.bv_cnt > 0) { 357 if ((flags & DATA_ONLY) == 0) { 358 still_dirty = true; 359 } else { 360 /* 361 * For data-only sync, dirty indirect buffers 362 * are ignored. 363 */ 364 still_dirty = false; 365 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 366 if (bp->b_lblkno > -UFS_NDADDR) { 367 still_dirty = true; 368 break; 369 } 370 } 371 } 372 373 if (still_dirty) { 374 /* Write the inode after sync passes to flush deps. */ 375 if (wait && DOINGSOFTDEP(vp) && 376 (flags & NO_INO_UPDT) == 0) { 377 BO_UNLOCK(bo); 378 ffs_update(vp, 1); 379 BO_LOCK(bo); 380 } 381 /* switch between sync/async. */ 382 wait = !wait; 383 if (wait || ++passes < UFS_NIADDR + 2) 384 goto loop; 385 } 386 } 387 BO_UNLOCK(bo); 388 error = 0; 389 if ((flags & DATA_ONLY) == 0) { 390 if ((flags & NO_INO_UPDT) == 0) 391 error = ffs_update(vp, 1); 392 if (DOINGSUJ(vp)) 393 softdep_journal_fsync(VTOI(vp)); 394 } 395 return (error); 396 } 397 398 static int 399 ffs_fdatasync(struct vop_fdatasync_args *ap) 400 { 401 402 return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY)); 403 } 404 405 static int 406 ffs_lock(ap) 407 struct vop_lock1_args /* { 408 struct vnode *a_vp; 409 int a_flags; 410 struct thread *a_td; 411 char *file; 412 int line; 413 } */ *ap; 414 { 415 #ifndef NO_FFS_SNAPSHOT 416 struct vnode *vp; 417 int flags; 418 struct lock *lkp; 419 int result; 420 421 switch (ap->a_flags & LK_TYPE_MASK) { 422 case LK_SHARED: 423 case LK_UPGRADE: 424 case LK_EXCLUSIVE: 425 vp = ap->a_vp; 426 flags = ap->a_flags; 427 for (;;) { 428 #ifdef DEBUG_VFS_LOCKS 429 KASSERT(vp->v_holdcnt != 0, 430 ("ffs_lock %p: zero hold count", vp)); 431 #endif 432 lkp = vp->v_vnlock; 433 result = _lockmgr_args(lkp, flags, VI_MTX(vp), 434 LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, 435 ap->a_file, ap->a_line); 436 if (lkp == vp->v_vnlock || result != 0) 437 break; 438 /* 439 * Apparent success, except that the vnode 440 * mutated between snapshot file vnode and 441 * regular file vnode while this process 442 * slept. The lock currently held is not the 443 * right lock. Release it, and try to get the 444 * new lock. 445 */ 446 (void) _lockmgr_args(lkp, LK_RELEASE, NULL, 447 LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, 448 ap->a_file, ap->a_line); 449 if ((flags & (LK_INTERLOCK | LK_NOWAIT)) == 450 (LK_INTERLOCK | LK_NOWAIT)) 451 return (EBUSY); 452 if ((flags & LK_TYPE_MASK) == LK_UPGRADE) 453 flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; 454 flags &= ~LK_INTERLOCK; 455 } 456 break; 457 default: 458 result = VOP_LOCK1_APV(&ufs_vnodeops, ap); 459 } 460 return (result); 461 #else 462 return (VOP_LOCK1_APV(&ufs_vnodeops, ap)); 463 #endif 464 } 465 466 static int 467 ffs_read_hole(struct uio *uio, long xfersize, long *size) 468 { 469 ssize_t saved_resid, tlen; 470 int error; 471 472 while (xfersize > 0) { 473 tlen = min(xfersize, ZERO_REGION_SIZE); 474 saved_resid = uio->uio_resid; 475 error = vn_io_fault_uiomove(__DECONST(void *, zero_region), 476 tlen, uio); 477 if (error != 0) 478 return (error); 479 tlen = saved_resid - uio->uio_resid; 480 xfersize -= tlen; 481 *size -= tlen; 482 } 483 return (0); 484 } 485 486 /* 487 * Vnode op for reading. 488 */ 489 static int 490 ffs_read(ap) 491 struct vop_read_args /* { 492 struct vnode *a_vp; 493 struct uio *a_uio; 494 int a_ioflag; 495 struct ucred *a_cred; 496 } */ *ap; 497 { 498 struct vnode *vp; 499 struct inode *ip; 500 struct uio *uio; 501 struct fs *fs; 502 struct buf *bp; 503 ufs_lbn_t lbn, nextlbn; 504 off_t bytesinfile; 505 long size, xfersize, blkoffset; 506 ssize_t orig_resid; 507 int bflag, error, ioflag, seqcount; 508 509 vp = ap->a_vp; 510 uio = ap->a_uio; 511 ioflag = ap->a_ioflag; 512 if (ap->a_ioflag & IO_EXT) 513 #ifdef notyet 514 return (ffs_extread(vp, uio, ioflag)); 515 #else 516 panic("ffs_read+IO_EXT"); 517 #endif 518 #ifdef DIRECTIO 519 if ((ioflag & IO_DIRECT) != 0) { 520 int workdone; 521 522 error = ffs_rawread(vp, uio, &workdone); 523 if (error != 0 || workdone != 0) 524 return error; 525 } 526 #endif 527 528 seqcount = ap->a_ioflag >> IO_SEQSHIFT; 529 ip = VTOI(vp); 530 531 #ifdef INVARIANTS 532 if (uio->uio_rw != UIO_READ) 533 panic("ffs_read: mode"); 534 535 if (vp->v_type == VLNK) { 536 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) 537 panic("ffs_read: short symlink"); 538 } else if (vp->v_type != VREG && vp->v_type != VDIR) 539 panic("ffs_read: type %d", vp->v_type); 540 #endif 541 orig_resid = uio->uio_resid; 542 KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); 543 if (orig_resid == 0) 544 return (0); 545 KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); 546 fs = ITOFS(ip); 547 if (uio->uio_offset < ip->i_size && 548 uio->uio_offset >= fs->fs_maxfilesize) 549 return (EOVERFLOW); 550 551 bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE); 552 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 553 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) 554 break; 555 lbn = lblkno(fs, uio->uio_offset); 556 nextlbn = lbn + 1; 557 558 /* 559 * size of buffer. The buffer representing the 560 * end of the file is rounded up to the size of 561 * the block type ( fragment or full block, 562 * depending ). 563 */ 564 size = blksize(fs, ip, lbn); 565 blkoffset = blkoff(fs, uio->uio_offset); 566 567 /* 568 * The amount we want to transfer in this iteration is 569 * one FS block less the amount of the data before 570 * our startpoint (duh!) 571 */ 572 xfersize = fs->fs_bsize - blkoffset; 573 574 /* 575 * But if we actually want less than the block, 576 * or the file doesn't have a whole block more of data, 577 * then use the lesser number. 578 */ 579 if (uio->uio_resid < xfersize) 580 xfersize = uio->uio_resid; 581 if (bytesinfile < xfersize) 582 xfersize = bytesinfile; 583 584 if (lblktosize(fs, nextlbn) >= ip->i_size) { 585 /* 586 * Don't do readahead if this is the end of the file. 587 */ 588 error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp); 589 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { 590 /* 591 * Otherwise if we are allowed to cluster, 592 * grab as much as we can. 593 * 594 * XXX This may not be a win if we are not 595 * doing sequential access. 596 */ 597 error = cluster_read(vp, ip->i_size, lbn, 598 size, NOCRED, blkoffset + uio->uio_resid, 599 seqcount, bflag, &bp); 600 } else if (seqcount > 1) { 601 /* 602 * If we are NOT allowed to cluster, then 603 * if we appear to be acting sequentially, 604 * fire off a request for a readahead 605 * as well as a read. Note that the 4th and 5th 606 * arguments point to arrays of the size specified in 607 * the 6th argument. 608 */ 609 u_int nextsize = blksize(fs, ip, nextlbn); 610 error = breadn_flags(vp, lbn, lbn, size, &nextlbn, 611 &nextsize, 1, NOCRED, bflag, NULL, &bp); 612 } else { 613 /* 614 * Failing all of the above, just read what the 615 * user asked for. Interestingly, the same as 616 * the first option above. 617 */ 618 error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp); 619 } 620 if (error == EJUSTRETURN) { 621 error = ffs_read_hole(uio, xfersize, &size); 622 if (error == 0) 623 continue; 624 } 625 if (error != 0) { 626 brelse(bp); 627 bp = NULL; 628 break; 629 } 630 631 /* 632 * We should only get non-zero b_resid when an I/O error 633 * has occurred, which should cause us to break above. 634 * However, if the short read did not cause an error, 635 * then we want to ensure that we do not uiomove bad 636 * or uninitialized data. 637 */ 638 size -= bp->b_resid; 639 if (size < xfersize) { 640 if (size == 0) 641 break; 642 xfersize = size; 643 } 644 645 if (buf_mapped(bp)) { 646 error = vn_io_fault_uiomove((char *)bp->b_data + 647 blkoffset, (int)xfersize, uio); 648 } else { 649 error = vn_io_fault_pgmove(bp->b_pages, blkoffset, 650 (int)xfersize, uio); 651 } 652 if (error) 653 break; 654 655 vfs_bio_brelse(bp, ioflag); 656 } 657 658 /* 659 * This can only happen in the case of an error 660 * because the loop above resets bp to NULL on each iteration 661 * and on normal completion has not set a new value into it. 662 * so it must have come from a 'break' statement 663 */ 664 if (bp != NULL) 665 vfs_bio_brelse(bp, ioflag); 666 667 if ((error == 0 || uio->uio_resid != orig_resid) && 668 (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 && 669 (ip->i_flag & IN_ACCESS) == 0) { 670 VI_LOCK(vp); 671 ip->i_flag |= IN_ACCESS; 672 VI_UNLOCK(vp); 673 } 674 return (error); 675 } 676 677 /* 678 * Vnode op for writing. 679 */ 680 static int 681 ffs_write(ap) 682 struct vop_write_args /* { 683 struct vnode *a_vp; 684 struct uio *a_uio; 685 int a_ioflag; 686 struct ucred *a_cred; 687 } */ *ap; 688 { 689 struct vnode *vp; 690 struct uio *uio; 691 struct inode *ip; 692 struct fs *fs; 693 struct buf *bp; 694 ufs_lbn_t lbn; 695 off_t osize; 696 ssize_t resid; 697 int seqcount; 698 int blkoffset, error, flags, ioflag, size, xfersize; 699 700 vp = ap->a_vp; 701 uio = ap->a_uio; 702 ioflag = ap->a_ioflag; 703 if (ap->a_ioflag & IO_EXT) 704 #ifdef notyet 705 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); 706 #else 707 panic("ffs_write+IO_EXT"); 708 #endif 709 710 seqcount = ap->a_ioflag >> IO_SEQSHIFT; 711 ip = VTOI(vp); 712 713 #ifdef INVARIANTS 714 if (uio->uio_rw != UIO_WRITE) 715 panic("ffs_write: mode"); 716 #endif 717 718 switch (vp->v_type) { 719 case VREG: 720 if (ioflag & IO_APPEND) 721 uio->uio_offset = ip->i_size; 722 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) 723 return (EPERM); 724 /* FALLTHROUGH */ 725 case VLNK: 726 break; 727 case VDIR: 728 panic("ffs_write: dir write"); 729 break; 730 default: 731 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, 732 (int)uio->uio_offset, 733 (int)uio->uio_resid 734 ); 735 } 736 737 KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); 738 KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); 739 fs = ITOFS(ip); 740 if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) 741 return (EFBIG); 742 /* 743 * Maybe this should be above the vnode op call, but so long as 744 * file servers have no limits, I don't think it matters. 745 */ 746 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) 747 return (EFBIG); 748 749 resid = uio->uio_resid; 750 osize = ip->i_size; 751 if (seqcount > BA_SEQMAX) 752 flags = BA_SEQMAX << BA_SEQSHIFT; 753 else 754 flags = seqcount << BA_SEQSHIFT; 755 if (ioflag & IO_SYNC) 756 flags |= IO_SYNC; 757 flags |= BA_UNMAPPED; 758 759 for (error = 0; uio->uio_resid > 0;) { 760 lbn = lblkno(fs, uio->uio_offset); 761 blkoffset = blkoff(fs, uio->uio_offset); 762 xfersize = fs->fs_bsize - blkoffset; 763 if (uio->uio_resid < xfersize) 764 xfersize = uio->uio_resid; 765 if (uio->uio_offset + xfersize > ip->i_size) 766 vnode_pager_setsize(vp, uio->uio_offset + xfersize); 767 768 /* 769 * We must perform a read-before-write if the transfer size 770 * does not cover the entire buffer. 771 */ 772 if (fs->fs_bsize > xfersize) 773 flags |= BA_CLRBUF; 774 else 775 flags &= ~BA_CLRBUF; 776 /* XXX is uio->uio_offset the right thing here? */ 777 error = UFS_BALLOC(vp, uio->uio_offset, xfersize, 778 ap->a_cred, flags, &bp); 779 if (error != 0) { 780 vnode_pager_setsize(vp, ip->i_size); 781 break; 782 } 783 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) 784 bp->b_flags |= B_NOCACHE; 785 786 if (uio->uio_offset + xfersize > ip->i_size) { 787 ip->i_size = uio->uio_offset + xfersize; 788 DIP_SET(ip, i_size, ip->i_size); 789 } 790 791 size = blksize(fs, ip, lbn) - bp->b_resid; 792 if (size < xfersize) 793 xfersize = size; 794 795 if (buf_mapped(bp)) { 796 error = vn_io_fault_uiomove((char *)bp->b_data + 797 blkoffset, (int)xfersize, uio); 798 } else { 799 error = vn_io_fault_pgmove(bp->b_pages, blkoffset, 800 (int)xfersize, uio); 801 } 802 /* 803 * If the buffer is not already filled and we encounter an 804 * error while trying to fill it, we have to clear out any 805 * garbage data from the pages instantiated for the buffer. 806 * If we do not, a failed uiomove() during a write can leave 807 * the prior contents of the pages exposed to a userland mmap. 808 * 809 * Note that we need only clear buffers with a transfer size 810 * equal to the block size because buffers with a shorter 811 * transfer size were cleared above by the call to UFS_BALLOC() 812 * with the BA_CLRBUF flag set. 813 * 814 * If the source region for uiomove identically mmaps the 815 * buffer, uiomove() performed the NOP copy, and the buffer 816 * content remains valid because the page fault handler 817 * validated the pages. 818 */ 819 if (error != 0 && (bp->b_flags & B_CACHE) == 0 && 820 fs->fs_bsize == xfersize) 821 vfs_bio_clrbuf(bp); 822 823 vfs_bio_set_flags(bp, ioflag); 824 825 /* 826 * If IO_SYNC each buffer is written synchronously. Otherwise 827 * if we have a severe page deficiency write the buffer 828 * asynchronously. Otherwise try to cluster, and if that 829 * doesn't do it then either do an async write (if O_DIRECT), 830 * or a delayed write (if not). 831 */ 832 if (ioflag & IO_SYNC) { 833 (void)bwrite(bp); 834 } else if (vm_page_count_severe() || 835 buf_dirty_count_severe() || 836 (ioflag & IO_ASYNC)) { 837 bp->b_flags |= B_CLUSTEROK; 838 bawrite(bp); 839 } else if (xfersize + blkoffset == fs->fs_bsize) { 840 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { 841 bp->b_flags |= B_CLUSTEROK; 842 cluster_write(vp, bp, ip->i_size, seqcount, 843 GB_UNMAPPED); 844 } else { 845 bawrite(bp); 846 } 847 } else if (ioflag & IO_DIRECT) { 848 bp->b_flags |= B_CLUSTEROK; 849 bawrite(bp); 850 } else { 851 bp->b_flags |= B_CLUSTEROK; 852 bdwrite(bp); 853 } 854 if (error || xfersize == 0) 855 break; 856 ip->i_flag |= IN_CHANGE | IN_UPDATE; 857 } 858 /* 859 * If we successfully wrote any data, and we are not the superuser 860 * we clear the setuid and setgid bits as a precaution against 861 * tampering. 862 */ 863 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && 864 ap->a_cred) { 865 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) { 866 ip->i_mode &= ~(ISUID | ISGID); 867 DIP_SET(ip, i_mode, ip->i_mode); 868 } 869 } 870 if (error) { 871 if (ioflag & IO_UNIT) { 872 (void)ffs_truncate(vp, osize, 873 IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred); 874 uio->uio_offset -= resid - uio->uio_resid; 875 uio->uio_resid = resid; 876 } 877 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 878 error = ffs_update(vp, 1); 879 return (error); 880 } 881 882 /* 883 * Extended attribute area reading. 884 */ 885 static int 886 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) 887 { 888 struct inode *ip; 889 struct ufs2_dinode *dp; 890 struct fs *fs; 891 struct buf *bp; 892 ufs_lbn_t lbn, nextlbn; 893 off_t bytesinfile; 894 long size, xfersize, blkoffset; 895 ssize_t orig_resid; 896 int error; 897 898 ip = VTOI(vp); 899 fs = ITOFS(ip); 900 dp = ip->i_din2; 901 902 #ifdef INVARIANTS 903 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) 904 panic("ffs_extread: mode"); 905 906 #endif 907 orig_resid = uio->uio_resid; 908 KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); 909 if (orig_resid == 0) 910 return (0); 911 KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); 912 913 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 914 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) 915 break; 916 lbn = lblkno(fs, uio->uio_offset); 917 nextlbn = lbn + 1; 918 919 /* 920 * size of buffer. The buffer representing the 921 * end of the file is rounded up to the size of 922 * the block type ( fragment or full block, 923 * depending ). 924 */ 925 size = sblksize(fs, dp->di_extsize, lbn); 926 blkoffset = blkoff(fs, uio->uio_offset); 927 928 /* 929 * The amount we want to transfer in this iteration is 930 * one FS block less the amount of the data before 931 * our startpoint (duh!) 932 */ 933 xfersize = fs->fs_bsize - blkoffset; 934 935 /* 936 * But if we actually want less than the block, 937 * or the file doesn't have a whole block more of data, 938 * then use the lesser number. 939 */ 940 if (uio->uio_resid < xfersize) 941 xfersize = uio->uio_resid; 942 if (bytesinfile < xfersize) 943 xfersize = bytesinfile; 944 945 if (lblktosize(fs, nextlbn) >= dp->di_extsize) { 946 /* 947 * Don't do readahead if this is the end of the info. 948 */ 949 error = bread(vp, -1 - lbn, size, NOCRED, &bp); 950 } else { 951 /* 952 * If we have a second block, then 953 * fire off a request for a readahead 954 * as well as a read. Note that the 4th and 5th 955 * arguments point to arrays of the size specified in 956 * the 6th argument. 957 */ 958 u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn); 959 960 nextlbn = -1 - nextlbn; 961 error = breadn(vp, -1 - lbn, 962 size, &nextlbn, &nextsize, 1, NOCRED, &bp); 963 } 964 if (error) { 965 brelse(bp); 966 bp = NULL; 967 break; 968 } 969 970 /* 971 * We should only get non-zero b_resid when an I/O error 972 * has occurred, which should cause us to break above. 973 * However, if the short read did not cause an error, 974 * then we want to ensure that we do not uiomove bad 975 * or uninitialized data. 976 */ 977 size -= bp->b_resid; 978 if (size < xfersize) { 979 if (size == 0) 980 break; 981 xfersize = size; 982 } 983 984 error = uiomove((char *)bp->b_data + blkoffset, 985 (int)xfersize, uio); 986 if (error) 987 break; 988 vfs_bio_brelse(bp, ioflag); 989 } 990 991 /* 992 * This can only happen in the case of an error 993 * because the loop above resets bp to NULL on each iteration 994 * and on normal completion has not set a new value into it. 995 * so it must have come from a 'break' statement 996 */ 997 if (bp != NULL) 998 vfs_bio_brelse(bp, ioflag); 999 return (error); 1000 } 1001 1002 /* 1003 * Extended attribute area writing. 1004 */ 1005 static int 1006 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) 1007 { 1008 struct inode *ip; 1009 struct ufs2_dinode *dp; 1010 struct fs *fs; 1011 struct buf *bp; 1012 ufs_lbn_t lbn; 1013 off_t osize; 1014 ssize_t resid; 1015 int blkoffset, error, flags, size, xfersize; 1016 1017 ip = VTOI(vp); 1018 fs = ITOFS(ip); 1019 dp = ip->i_din2; 1020 1021 #ifdef INVARIANTS 1022 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) 1023 panic("ffs_extwrite: mode"); 1024 #endif 1025 1026 if (ioflag & IO_APPEND) 1027 uio->uio_offset = dp->di_extsize; 1028 KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0")); 1029 KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0")); 1030 if ((uoff_t)uio->uio_offset + uio->uio_resid > 1031 UFS_NXADDR * fs->fs_bsize) 1032 return (EFBIG); 1033 1034 resid = uio->uio_resid; 1035 osize = dp->di_extsize; 1036 flags = IO_EXT; 1037 if (ioflag & IO_SYNC) 1038 flags |= IO_SYNC; 1039 1040 for (error = 0; uio->uio_resid > 0;) { 1041 lbn = lblkno(fs, uio->uio_offset); 1042 blkoffset = blkoff(fs, uio->uio_offset); 1043 xfersize = fs->fs_bsize - blkoffset; 1044 if (uio->uio_resid < xfersize) 1045 xfersize = uio->uio_resid; 1046 1047 /* 1048 * We must perform a read-before-write if the transfer size 1049 * does not cover the entire buffer. 1050 */ 1051 if (fs->fs_bsize > xfersize) 1052 flags |= BA_CLRBUF; 1053 else 1054 flags &= ~BA_CLRBUF; 1055 error = UFS_BALLOC(vp, uio->uio_offset, xfersize, 1056 ucred, flags, &bp); 1057 if (error != 0) 1058 break; 1059 /* 1060 * If the buffer is not valid we have to clear out any 1061 * garbage data from the pages instantiated for the buffer. 1062 * If we do not, a failed uiomove() during a write can leave 1063 * the prior contents of the pages exposed to a userland 1064 * mmap(). XXX deal with uiomove() errors a better way. 1065 */ 1066 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) 1067 vfs_bio_clrbuf(bp); 1068 1069 if (uio->uio_offset + xfersize > dp->di_extsize) 1070 dp->di_extsize = uio->uio_offset + xfersize; 1071 1072 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; 1073 if (size < xfersize) 1074 xfersize = size; 1075 1076 error = 1077 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); 1078 1079 vfs_bio_set_flags(bp, ioflag); 1080 1081 /* 1082 * If IO_SYNC each buffer is written synchronously. Otherwise 1083 * if we have a severe page deficiency write the buffer 1084 * asynchronously. Otherwise try to cluster, and if that 1085 * doesn't do it then either do an async write (if O_DIRECT), 1086 * or a delayed write (if not). 1087 */ 1088 if (ioflag & IO_SYNC) { 1089 (void)bwrite(bp); 1090 } else if (vm_page_count_severe() || 1091 buf_dirty_count_severe() || 1092 xfersize + blkoffset == fs->fs_bsize || 1093 (ioflag & (IO_ASYNC | IO_DIRECT))) 1094 bawrite(bp); 1095 else 1096 bdwrite(bp); 1097 if (error || xfersize == 0) 1098 break; 1099 ip->i_flag |= IN_CHANGE; 1100 } 1101 /* 1102 * If we successfully wrote any data, and we are not the superuser 1103 * we clear the setuid and setgid bits as a precaution against 1104 * tampering. 1105 */ 1106 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { 1107 if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) { 1108 ip->i_mode &= ~(ISUID | ISGID); 1109 dp->di_mode = ip->i_mode; 1110 } 1111 } 1112 if (error) { 1113 if (ioflag & IO_UNIT) { 1114 (void)ffs_truncate(vp, osize, 1115 IO_EXT | (ioflag&IO_SYNC), ucred); 1116 uio->uio_offset -= resid - uio->uio_resid; 1117 uio->uio_resid = resid; 1118 } 1119 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 1120 error = ffs_update(vp, 1); 1121 return (error); 1122 } 1123 1124 1125 /* 1126 * Vnode operating to retrieve a named extended attribute. 1127 * 1128 * Locate a particular EA (nspace:name) in the area (ptr:length), and return 1129 * the length of the EA, and possibly the pointer to the entry and to the data. 1130 */ 1131 static int 1132 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, 1133 struct extattr **eapp, u_char **eac) 1134 { 1135 struct extattr *eap, *eaend; 1136 size_t nlen; 1137 1138 nlen = strlen(name); 1139 KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned")); 1140 eap = (struct extattr *)ptr; 1141 eaend = (struct extattr *)(ptr + length); 1142 for (; eap < eaend; eap = EXTATTR_NEXT(eap)) { 1143 /* make sure this entry is complete */ 1144 if (EXTATTR_NEXT(eap) > eaend) 1145 break; 1146 if (eap->ea_namespace != nspace || eap->ea_namelength != nlen 1147 || memcmp(eap->ea_name, name, nlen) != 0) 1148 continue; 1149 if (eapp != NULL) 1150 *eapp = eap; 1151 if (eac != NULL) 1152 *eac = EXTATTR_CONTENT(eap); 1153 return (EXTATTR_CONTENT_SIZE(eap)); 1154 } 1155 return (-1); 1156 } 1157 1158 static int 1159 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) 1160 { 1161 struct inode *ip; 1162 struct ufs2_dinode *dp; 1163 struct fs *fs; 1164 struct uio luio; 1165 struct iovec liovec; 1166 u_int easize; 1167 int error; 1168 u_char *eae; 1169 1170 ip = VTOI(vp); 1171 fs = ITOFS(ip); 1172 dp = ip->i_din2; 1173 easize = dp->di_extsize; 1174 if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize) 1175 return (EFBIG); 1176 1177 eae = malloc(easize + extra, M_TEMP, M_WAITOK); 1178 1179 liovec.iov_base = eae; 1180 liovec.iov_len = easize; 1181 luio.uio_iov = &liovec; 1182 luio.uio_iovcnt = 1; 1183 luio.uio_offset = 0; 1184 luio.uio_resid = easize; 1185 luio.uio_segflg = UIO_SYSSPACE; 1186 luio.uio_rw = UIO_READ; 1187 luio.uio_td = td; 1188 1189 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); 1190 if (error) { 1191 free(eae, M_TEMP); 1192 return(error); 1193 } 1194 *p = eae; 1195 return (0); 1196 } 1197 1198 static void 1199 ffs_lock_ea(struct vnode *vp) 1200 { 1201 struct inode *ip; 1202 1203 ip = VTOI(vp); 1204 VI_LOCK(vp); 1205 while (ip->i_flag & IN_EA_LOCKED) { 1206 ip->i_flag |= IN_EA_LOCKWAIT; 1207 msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea", 1208 0); 1209 } 1210 ip->i_flag |= IN_EA_LOCKED; 1211 VI_UNLOCK(vp); 1212 } 1213 1214 static void 1215 ffs_unlock_ea(struct vnode *vp) 1216 { 1217 struct inode *ip; 1218 1219 ip = VTOI(vp); 1220 VI_LOCK(vp); 1221 if (ip->i_flag & IN_EA_LOCKWAIT) 1222 wakeup(&ip->i_ea_refs); 1223 ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT); 1224 VI_UNLOCK(vp); 1225 } 1226 1227 static int 1228 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) 1229 { 1230 struct inode *ip; 1231 struct ufs2_dinode *dp; 1232 int error; 1233 1234 ip = VTOI(vp); 1235 1236 ffs_lock_ea(vp); 1237 if (ip->i_ea_area != NULL) { 1238 ip->i_ea_refs++; 1239 ffs_unlock_ea(vp); 1240 return (0); 1241 } 1242 dp = ip->i_din2; 1243 error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); 1244 if (error) { 1245 ffs_unlock_ea(vp); 1246 return (error); 1247 } 1248 ip->i_ea_len = dp->di_extsize; 1249 ip->i_ea_error = 0; 1250 ip->i_ea_refs++; 1251 ffs_unlock_ea(vp); 1252 return (0); 1253 } 1254 1255 /* 1256 * Vnode extattr transaction commit/abort 1257 */ 1258 static int 1259 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) 1260 { 1261 struct inode *ip; 1262 struct uio luio; 1263 struct iovec liovec; 1264 int error; 1265 struct ufs2_dinode *dp; 1266 1267 ip = VTOI(vp); 1268 1269 ffs_lock_ea(vp); 1270 if (ip->i_ea_area == NULL) { 1271 ffs_unlock_ea(vp); 1272 return (EINVAL); 1273 } 1274 dp = ip->i_din2; 1275 error = ip->i_ea_error; 1276 if (commit && error == 0) { 1277 ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit"); 1278 if (cred == NOCRED) 1279 cred = vp->v_mount->mnt_cred; 1280 liovec.iov_base = ip->i_ea_area; 1281 liovec.iov_len = ip->i_ea_len; 1282 luio.uio_iov = &liovec; 1283 luio.uio_iovcnt = 1; 1284 luio.uio_offset = 0; 1285 luio.uio_resid = ip->i_ea_len; 1286 luio.uio_segflg = UIO_SYSSPACE; 1287 luio.uio_rw = UIO_WRITE; 1288 luio.uio_td = td; 1289 /* XXX: I'm not happy about truncating to zero size */ 1290 if (ip->i_ea_len < dp->di_extsize) 1291 error = ffs_truncate(vp, 0, IO_EXT, cred); 1292 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); 1293 } 1294 if (--ip->i_ea_refs == 0) { 1295 free(ip->i_ea_area, M_TEMP); 1296 ip->i_ea_area = NULL; 1297 ip->i_ea_len = 0; 1298 ip->i_ea_error = 0; 1299 } 1300 ffs_unlock_ea(vp); 1301 return (error); 1302 } 1303 1304 /* 1305 * Vnode extattr strategy routine for fifos. 1306 * 1307 * We need to check for a read or write of the external attributes. 1308 * Otherwise we just fall through and do the usual thing. 1309 */ 1310 static int 1311 ffsext_strategy(struct vop_strategy_args *ap) 1312 /* 1313 struct vop_strategy_args { 1314 struct vnodeop_desc *a_desc; 1315 struct vnode *a_vp; 1316 struct buf *a_bp; 1317 }; 1318 */ 1319 { 1320 struct vnode *vp; 1321 daddr_t lbn; 1322 1323 vp = ap->a_vp; 1324 lbn = ap->a_bp->b_lblkno; 1325 if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR) 1326 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); 1327 if (vp->v_type == VFIFO) 1328 return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); 1329 panic("spec nodes went here"); 1330 } 1331 1332 /* 1333 * Vnode extattr transaction commit/abort 1334 */ 1335 static int 1336 ffs_openextattr(struct vop_openextattr_args *ap) 1337 /* 1338 struct vop_openextattr_args { 1339 struct vnodeop_desc *a_desc; 1340 struct vnode *a_vp; 1341 IN struct ucred *a_cred; 1342 IN struct thread *a_td; 1343 }; 1344 */ 1345 { 1346 1347 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1348 return (EOPNOTSUPP); 1349 1350 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); 1351 } 1352 1353 1354 /* 1355 * Vnode extattr transaction commit/abort 1356 */ 1357 static int 1358 ffs_closeextattr(struct vop_closeextattr_args *ap) 1359 /* 1360 struct vop_closeextattr_args { 1361 struct vnodeop_desc *a_desc; 1362 struct vnode *a_vp; 1363 int a_commit; 1364 IN struct ucred *a_cred; 1365 IN struct thread *a_td; 1366 }; 1367 */ 1368 { 1369 1370 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1371 return (EOPNOTSUPP); 1372 1373 if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) 1374 return (EROFS); 1375 1376 return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td)); 1377 } 1378 1379 /* 1380 * Vnode operation to remove a named attribute. 1381 */ 1382 static int 1383 ffs_deleteextattr(struct vop_deleteextattr_args *ap) 1384 /* 1385 vop_deleteextattr { 1386 IN struct vnode *a_vp; 1387 IN int a_attrnamespace; 1388 IN const char *a_name; 1389 IN struct ucred *a_cred; 1390 IN struct thread *a_td; 1391 }; 1392 */ 1393 { 1394 struct inode *ip; 1395 struct extattr *eap; 1396 uint32_t ul; 1397 int olen, error, i, easize; 1398 u_char *eae; 1399 void *tmp; 1400 1401 ip = VTOI(ap->a_vp); 1402 1403 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1404 return (EOPNOTSUPP); 1405 1406 if (strlen(ap->a_name) == 0) 1407 return (EINVAL); 1408 1409 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1410 return (EROFS); 1411 1412 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1413 ap->a_cred, ap->a_td, VWRITE); 1414 if (error) { 1415 1416 /* 1417 * ffs_lock_ea is not needed there, because the vnode 1418 * must be exclusively locked. 1419 */ 1420 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1421 ip->i_ea_error = error; 1422 return (error); 1423 } 1424 1425 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1426 if (error) 1427 return (error); 1428 1429 /* CEM: delete could be done in-place instead */ 1430 eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); 1431 bcopy(ip->i_ea_area, eae, ip->i_ea_len); 1432 easize = ip->i_ea_len; 1433 1434 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1435 &eap, NULL); 1436 if (olen == -1) { 1437 /* delete but nonexistent */ 1438 free(eae, M_TEMP); 1439 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1440 return (ENOATTR); 1441 } 1442 ul = eap->ea_length; 1443 i = (u_char *)EXTATTR_NEXT(eap) - eae; 1444 bcopy(EXTATTR_NEXT(eap), eap, easize - i); 1445 easize -= ul; 1446 1447 tmp = ip->i_ea_area; 1448 ip->i_ea_area = eae; 1449 ip->i_ea_len = easize; 1450 free(tmp, M_TEMP); 1451 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); 1452 return (error); 1453 } 1454 1455 /* 1456 * Vnode operation to retrieve a named extended attribute. 1457 */ 1458 static int 1459 ffs_getextattr(struct vop_getextattr_args *ap) 1460 /* 1461 vop_getextattr { 1462 IN struct vnode *a_vp; 1463 IN int a_attrnamespace; 1464 IN const char *a_name; 1465 INOUT struct uio *a_uio; 1466 OUT size_t *a_size; 1467 IN struct ucred *a_cred; 1468 IN struct thread *a_td; 1469 }; 1470 */ 1471 { 1472 struct inode *ip; 1473 u_char *eae, *p; 1474 unsigned easize; 1475 int error, ealen; 1476 1477 ip = VTOI(ap->a_vp); 1478 1479 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1480 return (EOPNOTSUPP); 1481 1482 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1483 ap->a_cred, ap->a_td, VREAD); 1484 if (error) 1485 return (error); 1486 1487 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1488 if (error) 1489 return (error); 1490 1491 eae = ip->i_ea_area; 1492 easize = ip->i_ea_len; 1493 1494 ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1495 NULL, &p); 1496 if (ealen >= 0) { 1497 error = 0; 1498 if (ap->a_size != NULL) 1499 *ap->a_size = ealen; 1500 else if (ap->a_uio != NULL) 1501 error = uiomove(p, ealen, ap->a_uio); 1502 } else 1503 error = ENOATTR; 1504 1505 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1506 return (error); 1507 } 1508 1509 /* 1510 * Vnode operation to retrieve extended attributes on a vnode. 1511 */ 1512 static int 1513 ffs_listextattr(struct vop_listextattr_args *ap) 1514 /* 1515 vop_listextattr { 1516 IN struct vnode *a_vp; 1517 IN int a_attrnamespace; 1518 INOUT struct uio *a_uio; 1519 OUT size_t *a_size; 1520 IN struct ucred *a_cred; 1521 IN struct thread *a_td; 1522 }; 1523 */ 1524 { 1525 struct inode *ip; 1526 struct extattr *eap, *eaend; 1527 int error, ealen; 1528 1529 ip = VTOI(ap->a_vp); 1530 1531 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1532 return (EOPNOTSUPP); 1533 1534 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1535 ap->a_cred, ap->a_td, VREAD); 1536 if (error) 1537 return (error); 1538 1539 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1540 if (error) 1541 return (error); 1542 1543 error = 0; 1544 if (ap->a_size != NULL) 1545 *ap->a_size = 0; 1546 1547 KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned")); 1548 eap = (struct extattr *)ip->i_ea_area; 1549 eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len); 1550 for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) { 1551 /* make sure this entry is complete */ 1552 if (EXTATTR_NEXT(eap) > eaend) 1553 break; 1554 if (eap->ea_namespace != ap->a_attrnamespace) 1555 continue; 1556 1557 ealen = eap->ea_namelength; 1558 if (ap->a_size != NULL) 1559 *ap->a_size += ealen + 1; 1560 else if (ap->a_uio != NULL) 1561 error = uiomove(&eap->ea_namelength, ealen + 1, 1562 ap->a_uio); 1563 } 1564 1565 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1566 return (error); 1567 } 1568 1569 /* 1570 * Vnode operation to set a named attribute. 1571 */ 1572 static int 1573 ffs_setextattr(struct vop_setextattr_args *ap) 1574 /* 1575 vop_setextattr { 1576 IN struct vnode *a_vp; 1577 IN int a_attrnamespace; 1578 IN const char *a_name; 1579 INOUT struct uio *a_uio; 1580 IN struct ucred *a_cred; 1581 IN struct thread *a_td; 1582 }; 1583 */ 1584 { 1585 struct inode *ip; 1586 struct fs *fs; 1587 struct extattr *eap; 1588 uint32_t ealength, ul; 1589 ssize_t ealen; 1590 int olen, eapad1, eapad2, error, i, easize; 1591 u_char *eae; 1592 void *tmp; 1593 1594 ip = VTOI(ap->a_vp); 1595 fs = ITOFS(ip); 1596 1597 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1598 return (EOPNOTSUPP); 1599 1600 if (strlen(ap->a_name) == 0) 1601 return (EINVAL); 1602 1603 /* XXX Now unsupported API to delete EAs using NULL uio. */ 1604 if (ap->a_uio == NULL) 1605 return (EOPNOTSUPP); 1606 1607 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1608 return (EROFS); 1609 1610 ealen = ap->a_uio->uio_resid; 1611 if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR)) 1612 return (EINVAL); 1613 1614 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1615 ap->a_cred, ap->a_td, VWRITE); 1616 if (error) { 1617 1618 /* 1619 * ffs_lock_ea is not needed there, because the vnode 1620 * must be exclusively locked. 1621 */ 1622 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1623 ip->i_ea_error = error; 1624 return (error); 1625 } 1626 1627 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1628 if (error) 1629 return (error); 1630 1631 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); 1632 eapad1 = roundup2(ealength, 8) - ealength; 1633 eapad2 = roundup2(ealen, 8) - ealen; 1634 ealength += eapad1 + ealen + eapad2; 1635 1636 /* 1637 * CEM: rewrites of the same size or smaller could be done in-place 1638 * instead. (We don't acquire any fine-grained locks in here either, 1639 * so we could also do bigger writes in-place.) 1640 */ 1641 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); 1642 bcopy(ip->i_ea_area, eae, ip->i_ea_len); 1643 easize = ip->i_ea_len; 1644 1645 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1646 &eap, NULL); 1647 if (olen == -1) { 1648 /* new, append at end */ 1649 KASSERT(ALIGNED_TO(eae + easize, struct extattr), 1650 ("unaligned")); 1651 eap = (struct extattr *)(eae + easize); 1652 easize += ealength; 1653 } else { 1654 ul = eap->ea_length; 1655 i = (u_char *)EXTATTR_NEXT(eap) - eae; 1656 if (ul != ealength) { 1657 bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength, 1658 easize - i); 1659 easize += (ealength - ul); 1660 } 1661 } 1662 if (easize > lblktosize(fs, UFS_NXADDR)) { 1663 free(eae, M_TEMP); 1664 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1665 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1666 ip->i_ea_error = ENOSPC; 1667 return (ENOSPC); 1668 } 1669 eap->ea_length = ealength; 1670 eap->ea_namespace = ap->a_attrnamespace; 1671 eap->ea_contentpadlen = eapad2; 1672 eap->ea_namelength = strlen(ap->a_name); 1673 memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name)); 1674 bzero(&eap->ea_name[strlen(ap->a_name)], eapad1); 1675 error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio); 1676 if (error) { 1677 free(eae, M_TEMP); 1678 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1679 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1680 ip->i_ea_error = error; 1681 return (error); 1682 } 1683 bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2); 1684 1685 tmp = ip->i_ea_area; 1686 ip->i_ea_area = eae; 1687 ip->i_ea_len = easize; 1688 free(tmp, M_TEMP); 1689 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); 1690 return (error); 1691 } 1692 1693 /* 1694 * Vnode pointer to File handle 1695 */ 1696 static int 1697 ffs_vptofh(struct vop_vptofh_args *ap) 1698 /* 1699 vop_vptofh { 1700 IN struct vnode *a_vp; 1701 IN struct fid *a_fhp; 1702 }; 1703 */ 1704 { 1705 struct inode *ip; 1706 struct ufid *ufhp; 1707 1708 ip = VTOI(ap->a_vp); 1709 ufhp = (struct ufid *)ap->a_fhp; 1710 ufhp->ufid_len = sizeof(struct ufid); 1711 ufhp->ufid_ino = ip->i_number; 1712 ufhp->ufid_gen = ip->i_gen; 1713 return (0); 1714 } 1715 1716 SYSCTL_DECL(_vfs_ffs); 1717 static int use_buf_pager = 1; 1718 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, 1719 "Always use buffer pager instead of bmap"); 1720 1721 static daddr_t 1722 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) 1723 { 1724 1725 return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off)); 1726 } 1727 1728 static int 1729 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn) 1730 { 1731 1732 return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn)); 1733 } 1734 1735 static int 1736 ffs_getpages(struct vop_getpages_args *ap) 1737 { 1738 struct vnode *vp; 1739 struct ufsmount *um; 1740 1741 vp = ap->a_vp; 1742 um = VFSTOUFS(vp->v_mount); 1743 1744 if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) 1745 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, 1746 ap->a_rbehind, ap->a_rahead, NULL, NULL)); 1747 return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, 1748 ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz)); 1749 } 1750 1751 static int 1752 ffs_getpages_async(struct vop_getpages_async_args *ap) 1753 { 1754 struct vnode *vp; 1755 struct ufsmount *um; 1756 int error; 1757 1758 vp = ap->a_vp; 1759 um = VFSTOUFS(vp->v_mount); 1760 1761 if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) 1762 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, 1763 ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg)); 1764 1765 error = vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, 1766 ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz); 1767 ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); 1768 1769 return (error); 1770 } 1771 1772