1 /*- 2 * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause) 3 * 4 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 
36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 60 * 61 * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 62 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... 
63 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 64 */ 65 66 #include <sys/cdefs.h> 67 __FBSDID("$FreeBSD$"); 68 69 #include <sys/param.h> 70 #include <sys/bio.h> 71 #include <sys/systm.h> 72 #include <sys/buf.h> 73 #include <sys/conf.h> 74 #include <sys/extattr.h> 75 #include <sys/kernel.h> 76 #include <sys/limits.h> 77 #include <sys/malloc.h> 78 #include <sys/mount.h> 79 #include <sys/priv.h> 80 #include <sys/rwlock.h> 81 #include <sys/stat.h> 82 #include <sys/sysctl.h> 83 #include <sys/vmmeter.h> 84 #include <sys/vnode.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_param.h> 88 #include <vm/vm_extern.h> 89 #include <vm/vm_object.h> 90 #include <vm/vm_page.h> 91 #include <vm/vm_pager.h> 92 #include <vm/vnode_pager.h> 93 94 #include <ufs/ufs/extattr.h> 95 #include <ufs/ufs/quota.h> 96 #include <ufs/ufs/inode.h> 97 #include <ufs/ufs/ufs_extern.h> 98 #include <ufs/ufs/ufsmount.h> 99 100 #include <ufs/ffs/fs.h> 101 #include <ufs/ffs/ffs_extern.h> 102 #include "opt_directio.h" 103 #include "opt_ffs.h" 104 105 #define ALIGNED_TO(ptr, s) \ 106 (((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0) 107 108 #ifdef DIRECTIO 109 extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 110 #endif 111 static vop_fdatasync_t ffs_fdatasync; 112 static vop_fsync_t ffs_fsync; 113 static vop_getpages_t ffs_getpages; 114 static vop_getpages_async_t ffs_getpages_async; 115 static vop_lock1_t ffs_lock; 116 static vop_read_t ffs_read; 117 static vop_write_t ffs_write; 118 static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); 119 static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, 120 struct ucred *cred); 121 static vop_strategy_t ffsext_strategy; 122 static vop_closeextattr_t ffs_closeextattr; 123 static vop_deleteextattr_t ffs_deleteextattr; 124 static vop_getextattr_t ffs_getextattr; 125 static vop_listextattr_t ffs_listextattr; 126 static vop_openextattr_t ffs_openextattr; 127 static vop_setextattr_t ffs_setextattr; 128 static vop_vptofh_t 
ffs_vptofh;

/*
 * Global vfs data structures for ufs.
 *
 * ffs_vnodeops1 names ufs_vnodeops as its vop_default and supplies the
 * FFS implementations of fsync, fdatasync, getpages, getpages_async,
 * lock1, read, reallocblks, write and vptofh.  No extended-attribute
 * operations are installed in this vector.
 */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Fifo counterpart of ffs_vnodeops1: vop_default is ufs_fifoops and only
 * the sync, lock and file-handle operations are replaced.
 */
struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Global vfs data structures for ufs.
 *
 * ffs_vnodeops2 carries the same entries as ffs_vnodeops1 and additionally
 * installs the six extended-attribute operations (close, delete, get,
 * list, open, set).
 */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Fifo counterpart of ffs_vnodeops2.  Besides the entries of ffs_fifoops1
 * it installs reallocblks, routes vop_strategy to ffsext_strategy and
 * provides the six extended-attribute operations.
 */
struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Synch an open file.
190 */ 191 /* ARGSUSED */ 192 static int 193 ffs_fsync(struct vop_fsync_args *ap) 194 { 195 struct vnode *vp; 196 struct bufobj *bo; 197 int error; 198 199 vp = ap->a_vp; 200 bo = &vp->v_bufobj; 201 retry: 202 error = ffs_syncvnode(vp, ap->a_waitfor, 0); 203 if (error) 204 return (error); 205 if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) { 206 error = softdep_fsync(vp); 207 if (error) 208 return (error); 209 210 /* 211 * The softdep_fsync() function may drop vp lock, 212 * allowing for dirty buffers to reappear on the 213 * bo_dirty list. Recheck and resync as needed. 214 */ 215 BO_LOCK(bo); 216 if ((vp->v_type == VREG || vp->v_type == VDIR) && 217 (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { 218 BO_UNLOCK(bo); 219 goto retry; 220 } 221 BO_UNLOCK(bo); 222 } 223 return (0); 224 } 225 226 int 227 ffs_syncvnode(struct vnode *vp, int waitfor, int flags) 228 { 229 struct inode *ip; 230 struct bufobj *bo; 231 struct buf *bp, *nbp; 232 ufs_lbn_t lbn; 233 int error, passes; 234 bool still_dirty, wait; 235 236 ip = VTOI(vp); 237 ip->i_flag &= ~IN_NEEDSYNC; 238 bo = &vp->v_bufobj; 239 240 /* 241 * When doing MNT_WAIT we must first flush all dependencies 242 * on the inode. 243 */ 244 if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && 245 (error = softdep_sync_metadata(vp)) != 0) 246 return (error); 247 248 /* 249 * Flush all dirty buffers associated with a vnode. 250 */ 251 error = 0; 252 passes = 0; 253 wait = false; /* Always do an async pass first. */ 254 lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); 255 BO_LOCK(bo); 256 loop: 257 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) 258 bp->b_vflags &= ~BV_SCANNED; 259 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 260 /* 261 * Reasons to skip this buffer: it has already been considered 262 * on this pass, the buffer has dependencies that will cause 263 * it to be redirtied and it has not already been deferred, 264 * or it is already being written. 
265 */ 266 if ((bp->b_vflags & BV_SCANNED) != 0) 267 continue; 268 bp->b_vflags |= BV_SCANNED; 269 /* 270 * Flush indirects in order, if requested. 271 * 272 * Note that if only datasync is requested, we can 273 * skip indirect blocks when softupdates are not 274 * active. Otherwise we must flush them with data, 275 * since dependencies prevent data block writes. 276 */ 277 if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR && 278 (lbn_level(bp->b_lblkno) >= passes || 279 ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)))) 280 continue; 281 if (bp->b_lblkno > lbn) 282 panic("ffs_syncvnode: syncing truncated data."); 283 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) { 284 BO_UNLOCK(bo); 285 } else if (wait) { 286 if (BUF_LOCK(bp, 287 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 288 BO_LOCKPTR(bo)) != 0) { 289 bp->b_vflags &= ~BV_SCANNED; 290 goto next; 291 } 292 } else 293 continue; 294 if ((bp->b_flags & B_DELWRI) == 0) 295 panic("ffs_fsync: not dirty"); 296 /* 297 * Check for dependencies and potentially complete them. 298 */ 299 if (!LIST_EMPTY(&bp->b_dep) && 300 (error = softdep_sync_buf(vp, bp, 301 wait ? MNT_WAIT : MNT_NOWAIT)) != 0) { 302 /* I/O error. */ 303 if (error != EBUSY) { 304 BUF_UNLOCK(bp); 305 return (error); 306 } 307 /* If we deferred once, don't defer again. */ 308 if ((bp->b_flags & B_DEFERRED) == 0) { 309 bp->b_flags |= B_DEFERRED; 310 BUF_UNLOCK(bp); 311 goto next; 312 } 313 } 314 if (wait) { 315 bremfree(bp); 316 if ((error = bwrite(bp)) != 0) 317 return (error); 318 } else if ((bp->b_flags & B_CLUSTEROK)) { 319 (void) vfs_bio_awrite(bp); 320 } else { 321 bremfree(bp); 322 (void) bawrite(bp); 323 } 324 next: 325 /* 326 * Since we may have slept during the I/O, we need 327 * to start from a known point. 
328 */ 329 BO_LOCK(bo); 330 nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd); 331 } 332 if (waitfor != MNT_WAIT) { 333 BO_UNLOCK(bo); 334 if ((flags & NO_INO_UPDT) != 0) 335 return (0); 336 else 337 return (ffs_update(vp, 0)); 338 } 339 /* Drain IO to see if we're done. */ 340 bufobj_wwait(bo, 0, 0); 341 /* 342 * Block devices associated with filesystems may have new I/O 343 * requests posted for them even if the vnode is locked, so no 344 * amount of trying will get them clean. We make several passes 345 * as a best effort. 346 * 347 * Regular files may need multiple passes to flush all dependency 348 * work as it is possible that we must write once per indirect 349 * level, once for the leaf, and once for the inode and each of 350 * these will be done with one sync and one async pass. 351 */ 352 if (bo->bo_dirty.bv_cnt > 0) { 353 if ((flags & DATA_ONLY) == 0) { 354 still_dirty = true; 355 } else { 356 /* 357 * For data-only sync, dirty indirect buffers 358 * are ignored. 359 */ 360 still_dirty = false; 361 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 362 if (bp->b_lblkno > -UFS_NDADDR) { 363 still_dirty = true; 364 break; 365 } 366 } 367 } 368 369 if (still_dirty) { 370 /* Write the inode after sync passes to flush deps. */ 371 if (wait && DOINGSOFTDEP(vp) && 372 (flags & NO_INO_UPDT) == 0) { 373 BO_UNLOCK(bo); 374 ffs_update(vp, 1); 375 BO_LOCK(bo); 376 } 377 /* switch between sync/async. 
*/ 378 wait = !wait; 379 if (wait || ++passes < UFS_NIADDR + 2) 380 goto loop; 381 } 382 } 383 BO_UNLOCK(bo); 384 error = 0; 385 if ((flags & DATA_ONLY) == 0) { 386 if ((flags & NO_INO_UPDT) == 0) 387 error = ffs_update(vp, 1); 388 if (DOINGSUJ(vp)) 389 softdep_journal_fsync(VTOI(vp)); 390 } 391 return (error); 392 } 393 394 static int 395 ffs_fdatasync(struct vop_fdatasync_args *ap) 396 { 397 398 return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY)); 399 } 400 401 static int 402 ffs_lock(ap) 403 struct vop_lock1_args /* { 404 struct vnode *a_vp; 405 int a_flags; 406 struct thread *a_td; 407 char *file; 408 int line; 409 } */ *ap; 410 { 411 #ifndef NO_FFS_SNAPSHOT 412 struct vnode *vp; 413 int flags; 414 struct lock *lkp; 415 int result; 416 417 switch (ap->a_flags & LK_TYPE_MASK) { 418 case LK_SHARED: 419 case LK_UPGRADE: 420 case LK_EXCLUSIVE: 421 vp = ap->a_vp; 422 flags = ap->a_flags; 423 for (;;) { 424 #ifdef DEBUG_VFS_LOCKS 425 KASSERT(vp->v_holdcnt != 0, 426 ("ffs_lock %p: zero hold count", vp)); 427 #endif 428 lkp = vp->v_vnlock; 429 result = _lockmgr_args(lkp, flags, VI_MTX(vp), 430 LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, 431 ap->a_file, ap->a_line); 432 if (lkp == vp->v_vnlock || result != 0) 433 break; 434 /* 435 * Apparent success, except that the vnode 436 * mutated between snapshot file vnode and 437 * regular file vnode while this process 438 * slept. The lock currently held is not the 439 * right lock. Release it, and try to get the 440 * new lock. 
441 */ 442 (void) _lockmgr_args(lkp, LK_RELEASE, NULL, 443 LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, 444 ap->a_file, ap->a_line); 445 if ((flags & (LK_INTERLOCK | LK_NOWAIT)) == 446 (LK_INTERLOCK | LK_NOWAIT)) 447 return (EBUSY); 448 if ((flags & LK_TYPE_MASK) == LK_UPGRADE) 449 flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; 450 flags &= ~LK_INTERLOCK; 451 } 452 break; 453 default: 454 result = VOP_LOCK1_APV(&ufs_vnodeops, ap); 455 } 456 return (result); 457 #else 458 return (VOP_LOCK1_APV(&ufs_vnodeops, ap)); 459 #endif 460 } 461 462 static int 463 ffs_read_hole(struct uio *uio, long xfersize, long *size) 464 { 465 ssize_t saved_resid, tlen; 466 int error; 467 468 while (xfersize > 0) { 469 tlen = min(xfersize, ZERO_REGION_SIZE); 470 saved_resid = uio->uio_resid; 471 error = vn_io_fault_uiomove(__DECONST(void *, zero_region), 472 tlen, uio); 473 if (error != 0) 474 return (error); 475 tlen = saved_resid - uio->uio_resid; 476 xfersize -= tlen; 477 *size -= tlen; 478 } 479 return (0); 480 } 481 482 /* 483 * Vnode op for reading. 
484 */ 485 static int 486 ffs_read(ap) 487 struct vop_read_args /* { 488 struct vnode *a_vp; 489 struct uio *a_uio; 490 int a_ioflag; 491 struct ucred *a_cred; 492 } */ *ap; 493 { 494 struct vnode *vp; 495 struct inode *ip; 496 struct uio *uio; 497 struct fs *fs; 498 struct buf *bp; 499 ufs_lbn_t lbn, nextlbn; 500 off_t bytesinfile; 501 long size, xfersize, blkoffset; 502 ssize_t orig_resid; 503 int bflag, error, ioflag, seqcount; 504 505 vp = ap->a_vp; 506 uio = ap->a_uio; 507 ioflag = ap->a_ioflag; 508 if (ap->a_ioflag & IO_EXT) 509 #ifdef notyet 510 return (ffs_extread(vp, uio, ioflag)); 511 #else 512 panic("ffs_read+IO_EXT"); 513 #endif 514 #ifdef DIRECTIO 515 if ((ioflag & IO_DIRECT) != 0) { 516 int workdone; 517 518 error = ffs_rawread(vp, uio, &workdone); 519 if (error != 0 || workdone != 0) 520 return error; 521 } 522 #endif 523 524 seqcount = ap->a_ioflag >> IO_SEQSHIFT; 525 ip = VTOI(vp); 526 527 #ifdef INVARIANTS 528 if (uio->uio_rw != UIO_READ) 529 panic("ffs_read: mode"); 530 531 if (vp->v_type == VLNK) { 532 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) 533 panic("ffs_read: short symlink"); 534 } else if (vp->v_type != VREG && vp->v_type != VDIR) 535 panic("ffs_read: type %d", vp->v_type); 536 #endif 537 orig_resid = uio->uio_resid; 538 KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); 539 if (orig_resid == 0) 540 return (0); 541 KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); 542 fs = ITOFS(ip); 543 if (uio->uio_offset < ip->i_size && 544 uio->uio_offset >= fs->fs_maxfilesize) 545 return (EOVERFLOW); 546 547 bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE); 548 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 549 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) 550 break; 551 lbn = lblkno(fs, uio->uio_offset); 552 nextlbn = lbn + 1; 553 554 /* 555 * size of buffer. 
The buffer representing the 556 * end of the file is rounded up to the size of 557 * the block type ( fragment or full block, 558 * depending ). 559 */ 560 size = blksize(fs, ip, lbn); 561 blkoffset = blkoff(fs, uio->uio_offset); 562 563 /* 564 * The amount we want to transfer in this iteration is 565 * one FS block less the amount of the data before 566 * our startpoint (duh!) 567 */ 568 xfersize = fs->fs_bsize - blkoffset; 569 570 /* 571 * But if we actually want less than the block, 572 * or the file doesn't have a whole block more of data, 573 * then use the lesser number. 574 */ 575 if (uio->uio_resid < xfersize) 576 xfersize = uio->uio_resid; 577 if (bytesinfile < xfersize) 578 xfersize = bytesinfile; 579 580 if (lblktosize(fs, nextlbn) >= ip->i_size) { 581 /* 582 * Don't do readahead if this is the end of the file. 583 */ 584 error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp); 585 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { 586 /* 587 * Otherwise if we are allowed to cluster, 588 * grab as much as we can. 589 * 590 * XXX This may not be a win if we are not 591 * doing sequential access. 592 */ 593 error = cluster_read(vp, ip->i_size, lbn, 594 size, NOCRED, blkoffset + uio->uio_resid, 595 seqcount, bflag, &bp); 596 } else if (seqcount > 1) { 597 /* 598 * If we are NOT allowed to cluster, then 599 * if we appear to be acting sequentially, 600 * fire off a request for a readahead 601 * as well as a read. Note that the 4th and 5th 602 * arguments point to arrays of the size specified in 603 * the 6th argument. 604 */ 605 u_int nextsize = blksize(fs, ip, nextlbn); 606 error = breadn_flags(vp, lbn, lbn, size, &nextlbn, 607 &nextsize, 1, NOCRED, bflag, NULL, &bp); 608 } else { 609 /* 610 * Failing all of the above, just read what the 611 * user asked for. Interestingly, the same as 612 * the first option above. 
613 */ 614 error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp); 615 } 616 if (error == EJUSTRETURN) { 617 error = ffs_read_hole(uio, xfersize, &size); 618 if (error == 0) 619 continue; 620 } 621 if (error != 0) { 622 brelse(bp); 623 bp = NULL; 624 break; 625 } 626 627 /* 628 * We should only get non-zero b_resid when an I/O error 629 * has occurred, which should cause us to break above. 630 * However, if the short read did not cause an error, 631 * then we want to ensure that we do not uiomove bad 632 * or uninitialized data. 633 */ 634 size -= bp->b_resid; 635 if (size < xfersize) { 636 if (size == 0) 637 break; 638 xfersize = size; 639 } 640 641 if (buf_mapped(bp)) { 642 error = vn_io_fault_uiomove((char *)bp->b_data + 643 blkoffset, (int)xfersize, uio); 644 } else { 645 error = vn_io_fault_pgmove(bp->b_pages, blkoffset, 646 (int)xfersize, uio); 647 } 648 if (error) 649 break; 650 651 vfs_bio_brelse(bp, ioflag); 652 } 653 654 /* 655 * This can only happen in the case of an error 656 * because the loop above resets bp to NULL on each iteration 657 * and on normal completion has not set a new value into it. 658 * so it must have come from a 'break' statement 659 */ 660 if (bp != NULL) 661 vfs_bio_brelse(bp, ioflag); 662 663 if ((error == 0 || uio->uio_resid != orig_resid) && 664 (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 && 665 (ip->i_flag & IN_ACCESS) == 0) { 666 VI_LOCK(vp); 667 ip->i_flag |= IN_ACCESS; 668 VI_UNLOCK(vp); 669 } 670 return (error); 671 } 672 673 /* 674 * Vnode op for writing. 
675 */ 676 static int 677 ffs_write(ap) 678 struct vop_write_args /* { 679 struct vnode *a_vp; 680 struct uio *a_uio; 681 int a_ioflag; 682 struct ucred *a_cred; 683 } */ *ap; 684 { 685 struct vnode *vp; 686 struct uio *uio; 687 struct inode *ip; 688 struct fs *fs; 689 struct buf *bp; 690 ufs_lbn_t lbn; 691 off_t osize; 692 ssize_t resid; 693 int seqcount; 694 int blkoffset, error, flags, ioflag, size, xfersize; 695 696 vp = ap->a_vp; 697 uio = ap->a_uio; 698 ioflag = ap->a_ioflag; 699 if (ap->a_ioflag & IO_EXT) 700 #ifdef notyet 701 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); 702 #else 703 panic("ffs_write+IO_EXT"); 704 #endif 705 706 seqcount = ap->a_ioflag >> IO_SEQSHIFT; 707 ip = VTOI(vp); 708 709 #ifdef INVARIANTS 710 if (uio->uio_rw != UIO_WRITE) 711 panic("ffs_write: mode"); 712 #endif 713 714 switch (vp->v_type) { 715 case VREG: 716 if (ioflag & IO_APPEND) 717 uio->uio_offset = ip->i_size; 718 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) 719 return (EPERM); 720 /* FALLTHROUGH */ 721 case VLNK: 722 break; 723 case VDIR: 724 panic("ffs_write: dir write"); 725 break; 726 default: 727 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, 728 (int)uio->uio_offset, 729 (int)uio->uio_resid 730 ); 731 } 732 733 KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); 734 KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); 735 fs = ITOFS(ip); 736 if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) 737 return (EFBIG); 738 /* 739 * Maybe this should be above the vnode op call, but so long as 740 * file servers have no limits, I don't think it matters. 
741 */ 742 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) 743 return (EFBIG); 744 745 resid = uio->uio_resid; 746 osize = ip->i_size; 747 if (seqcount > BA_SEQMAX) 748 flags = BA_SEQMAX << BA_SEQSHIFT; 749 else 750 flags = seqcount << BA_SEQSHIFT; 751 if (ioflag & IO_SYNC) 752 flags |= IO_SYNC; 753 flags |= BA_UNMAPPED; 754 755 for (error = 0; uio->uio_resid > 0;) { 756 lbn = lblkno(fs, uio->uio_offset); 757 blkoffset = blkoff(fs, uio->uio_offset); 758 xfersize = fs->fs_bsize - blkoffset; 759 if (uio->uio_resid < xfersize) 760 xfersize = uio->uio_resid; 761 if (uio->uio_offset + xfersize > ip->i_size) 762 vnode_pager_setsize(vp, uio->uio_offset + xfersize); 763 764 /* 765 * We must perform a read-before-write if the transfer size 766 * does not cover the entire buffer. 767 */ 768 if (fs->fs_bsize > xfersize) 769 flags |= BA_CLRBUF; 770 else 771 flags &= ~BA_CLRBUF; 772 /* XXX is uio->uio_offset the right thing here? */ 773 error = UFS_BALLOC(vp, uio->uio_offset, xfersize, 774 ap->a_cred, flags, &bp); 775 if (error != 0) { 776 vnode_pager_setsize(vp, ip->i_size); 777 break; 778 } 779 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) 780 bp->b_flags |= B_NOCACHE; 781 782 if (uio->uio_offset + xfersize > ip->i_size) { 783 ip->i_size = uio->uio_offset + xfersize; 784 DIP_SET(ip, i_size, ip->i_size); 785 } 786 787 size = blksize(fs, ip, lbn) - bp->b_resid; 788 if (size < xfersize) 789 xfersize = size; 790 791 if (buf_mapped(bp)) { 792 error = vn_io_fault_uiomove((char *)bp->b_data + 793 blkoffset, (int)xfersize, uio); 794 } else { 795 error = vn_io_fault_pgmove(bp->b_pages, blkoffset, 796 (int)xfersize, uio); 797 } 798 /* 799 * If the buffer is not already filled and we encounter an 800 * error while trying to fill it, we have to clear out any 801 * garbage data from the pages instantiated for the buffer. 802 * If we do not, a failed uiomove() during a write can leave 803 * the prior contents of the pages exposed to a userland mmap. 
804 * 805 * Note that we need only clear buffers with a transfer size 806 * equal to the block size because buffers with a shorter 807 * transfer size were cleared above by the call to UFS_BALLOC() 808 * with the BA_CLRBUF flag set. 809 * 810 * If the source region for uiomove identically mmaps the 811 * buffer, uiomove() performed the NOP copy, and the buffer 812 * content remains valid because the page fault handler 813 * validated the pages. 814 */ 815 if (error != 0 && (bp->b_flags & B_CACHE) == 0 && 816 fs->fs_bsize == xfersize) 817 vfs_bio_clrbuf(bp); 818 819 vfs_bio_set_flags(bp, ioflag); 820 821 /* 822 * If IO_SYNC each buffer is written synchronously. Otherwise 823 * if we have a severe page deficiency write the buffer 824 * asynchronously. Otherwise try to cluster, and if that 825 * doesn't do it then either do an async write (if O_DIRECT), 826 * or a delayed write (if not). 827 */ 828 if (ioflag & IO_SYNC) { 829 (void)bwrite(bp); 830 } else if (vm_page_count_severe() || 831 buf_dirty_count_severe() || 832 (ioflag & IO_ASYNC)) { 833 bp->b_flags |= B_CLUSTEROK; 834 bawrite(bp); 835 } else if (xfersize + blkoffset == fs->fs_bsize) { 836 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { 837 bp->b_flags |= B_CLUSTEROK; 838 cluster_write(vp, bp, ip->i_size, seqcount, 839 GB_UNMAPPED); 840 } else { 841 bawrite(bp); 842 } 843 } else if (ioflag & IO_DIRECT) { 844 bp->b_flags |= B_CLUSTEROK; 845 bawrite(bp); 846 } else { 847 bp->b_flags |= B_CLUSTEROK; 848 bdwrite(bp); 849 } 850 if (error || xfersize == 0) 851 break; 852 ip->i_flag |= IN_CHANGE | IN_UPDATE; 853 } 854 /* 855 * If we successfully wrote any data, and we are not the superuser 856 * we clear the setuid and setgid bits as a precaution against 857 * tampering. 
858 */ 859 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && 860 ap->a_cred) { 861 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) { 862 ip->i_mode &= ~(ISUID | ISGID); 863 DIP_SET(ip, i_mode, ip->i_mode); 864 } 865 } 866 if (error) { 867 if (ioflag & IO_UNIT) { 868 (void)ffs_truncate(vp, osize, 869 IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred); 870 uio->uio_offset -= resid - uio->uio_resid; 871 uio->uio_resid = resid; 872 } 873 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 874 error = ffs_update(vp, 1); 875 return (error); 876 } 877 878 /* 879 * Extended attribute area reading. 880 */ 881 static int 882 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) 883 { 884 struct inode *ip; 885 struct ufs2_dinode *dp; 886 struct fs *fs; 887 struct buf *bp; 888 ufs_lbn_t lbn, nextlbn; 889 off_t bytesinfile; 890 long size, xfersize, blkoffset; 891 ssize_t orig_resid; 892 int error; 893 894 ip = VTOI(vp); 895 fs = ITOFS(ip); 896 dp = ip->i_din2; 897 898 #ifdef INVARIANTS 899 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) 900 panic("ffs_extread: mode"); 901 902 #endif 903 orig_resid = uio->uio_resid; 904 KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); 905 if (orig_resid == 0) 906 return (0); 907 KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); 908 909 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 910 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) 911 break; 912 lbn = lblkno(fs, uio->uio_offset); 913 nextlbn = lbn + 1; 914 915 /* 916 * size of buffer. The buffer representing the 917 * end of the file is rounded up to the size of 918 * the block type ( fragment or full block, 919 * depending ). 920 */ 921 size = sblksize(fs, dp->di_extsize, lbn); 922 blkoffset = blkoff(fs, uio->uio_offset); 923 924 /* 925 * The amount we want to transfer in this iteration is 926 * one FS block less the amount of the data before 927 * our startpoint (duh!) 
928 */ 929 xfersize = fs->fs_bsize - blkoffset; 930 931 /* 932 * But if we actually want less than the block, 933 * or the file doesn't have a whole block more of data, 934 * then use the lesser number. 935 */ 936 if (uio->uio_resid < xfersize) 937 xfersize = uio->uio_resid; 938 if (bytesinfile < xfersize) 939 xfersize = bytesinfile; 940 941 if (lblktosize(fs, nextlbn) >= dp->di_extsize) { 942 /* 943 * Don't do readahead if this is the end of the info. 944 */ 945 error = bread(vp, -1 - lbn, size, NOCRED, &bp); 946 } else { 947 /* 948 * If we have a second block, then 949 * fire off a request for a readahead 950 * as well as a read. Note that the 4th and 5th 951 * arguments point to arrays of the size specified in 952 * the 6th argument. 953 */ 954 u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn); 955 956 nextlbn = -1 - nextlbn; 957 error = breadn(vp, -1 - lbn, 958 size, &nextlbn, &nextsize, 1, NOCRED, &bp); 959 } 960 if (error) { 961 brelse(bp); 962 bp = NULL; 963 break; 964 } 965 966 /* 967 * We should only get non-zero b_resid when an I/O error 968 * has occurred, which should cause us to break above. 969 * However, if the short read did not cause an error, 970 * then we want to ensure that we do not uiomove bad 971 * or uninitialized data. 972 */ 973 size -= bp->b_resid; 974 if (size < xfersize) { 975 if (size == 0) 976 break; 977 xfersize = size; 978 } 979 980 error = uiomove((char *)bp->b_data + blkoffset, 981 (int)xfersize, uio); 982 if (error) 983 break; 984 vfs_bio_brelse(bp, ioflag); 985 } 986 987 /* 988 * This can only happen in the case of an error 989 * because the loop above resets bp to NULL on each iteration 990 * and on normal completion has not set a new value into it. 991 * so it must have come from a 'break' statement 992 */ 993 if (bp != NULL) 994 vfs_bio_brelse(bp, ioflag); 995 return (error); 996 } 997 998 /* 999 * Extended attribute area writing. 
*/
/*
 *	vp	vnode whose extended attribute area is written.
 *	uio	source; uio_offset is an offset into that area and is set
 *		to di_extsize first when IO_APPEND is given.
 *	ioflag	IO_* flags (IO_APPEND, IO_SYNC, IO_ASYNC, IO_DIRECT and
 *		IO_UNIT are examined here).
 *	ucred	credential passed to UFS_BALLOC() and ffs_truncate(), and
 *		used for the set-id privilege check.
 *
 * Returns 0, EFBIG when the write would end beyond UFS_NXADDR blocks, or
 * the error from UFS_BALLOC(), uiomove() or the final ffs_update().  On
 * error with IO_UNIT set the area is truncated back to its original size
 * and the uio is rewound.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid >
	    UFS_NXADDR * fs->fs_bsize)
		return (EFBIG);

	/* Remember the starting state so an IO_UNIT failure can be undone. */
	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if (ioflag & IO_SYNC)
		flags |= IO_SYNC;

	/* One filesystem block of the area per iteration. */
	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);

		/* Grow the recorded area size before sizing the block. */
		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);

		vfs_bio_set_flags(bp, ioflag);

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		/* The buffer has been handed off above in every case. */
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}


/*
 * Vnode operating to retrieve a named extended attribute.
1123 * 1124 * Locate a particular EA (nspace:name) in the area (ptr:length), and return 1125 * the length of the EA, and possibly the pointer to the entry and to the data. 1126 */ 1127 static int 1128 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, 1129 struct extattr **eapp, u_char **eac) 1130 { 1131 struct extattr *eap, *eaend; 1132 size_t nlen; 1133 1134 nlen = strlen(name); 1135 KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned")); 1136 eap = (struct extattr *)ptr; 1137 eaend = (struct extattr *)(ptr + length); 1138 for (; eap < eaend; eap = EXTATTR_NEXT(eap)) { 1139 /* make sure this entry is complete */ 1140 if (EXTATTR_NEXT(eap) > eaend) 1141 break; 1142 if (eap->ea_namespace != nspace || eap->ea_namelength != nlen 1143 || memcmp(eap->ea_name, name, nlen) != 0) 1144 continue; 1145 if (eapp != NULL) 1146 *eapp = eap; 1147 if (eac != NULL) 1148 *eac = EXTATTR_CONTENT(eap); 1149 return (EXTATTR_CONTENT_SIZE(eap)); 1150 } 1151 return (-1); 1152 } 1153 1154 static int 1155 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) 1156 { 1157 struct inode *ip; 1158 struct ufs2_dinode *dp; 1159 struct fs *fs; 1160 struct uio luio; 1161 struct iovec liovec; 1162 u_int easize; 1163 int error; 1164 u_char *eae; 1165 1166 ip = VTOI(vp); 1167 fs = ITOFS(ip); 1168 dp = ip->i_din2; 1169 easize = dp->di_extsize; 1170 if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize) 1171 return (EFBIG); 1172 1173 eae = malloc(easize + extra, M_TEMP, M_WAITOK); 1174 1175 liovec.iov_base = eae; 1176 liovec.iov_len = easize; 1177 luio.uio_iov = &liovec; 1178 luio.uio_iovcnt = 1; 1179 luio.uio_offset = 0; 1180 luio.uio_resid = easize; 1181 luio.uio_segflg = UIO_SYSSPACE; 1182 luio.uio_rw = UIO_READ; 1183 luio.uio_td = td; 1184 1185 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); 1186 if (error) { 1187 free(eae, M_TEMP); 1188 return(error); 1189 } 1190 *p = eae; 1191 return (0); 1192 } 1193 1194 static void 1195 ffs_lock_ea(struct vnode *vp) 
1196 { 1197 struct inode *ip; 1198 1199 ip = VTOI(vp); 1200 VI_LOCK(vp); 1201 while (ip->i_flag & IN_EA_LOCKED) { 1202 ip->i_flag |= IN_EA_LOCKWAIT; 1203 msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea", 1204 0); 1205 } 1206 ip->i_flag |= IN_EA_LOCKED; 1207 VI_UNLOCK(vp); 1208 } 1209 1210 static void 1211 ffs_unlock_ea(struct vnode *vp) 1212 { 1213 struct inode *ip; 1214 1215 ip = VTOI(vp); 1216 VI_LOCK(vp); 1217 if (ip->i_flag & IN_EA_LOCKWAIT) 1218 wakeup(&ip->i_ea_refs); 1219 ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT); 1220 VI_UNLOCK(vp); 1221 } 1222 1223 static int 1224 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) 1225 { 1226 struct inode *ip; 1227 struct ufs2_dinode *dp; 1228 int error; 1229 1230 ip = VTOI(vp); 1231 1232 ffs_lock_ea(vp); 1233 if (ip->i_ea_area != NULL) { 1234 ip->i_ea_refs++; 1235 ffs_unlock_ea(vp); 1236 return (0); 1237 } 1238 dp = ip->i_din2; 1239 error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); 1240 if (error) { 1241 ffs_unlock_ea(vp); 1242 return (error); 1243 } 1244 ip->i_ea_len = dp->di_extsize; 1245 ip->i_ea_error = 0; 1246 ip->i_ea_refs++; 1247 ffs_unlock_ea(vp); 1248 return (0); 1249 } 1250 1251 /* 1252 * Vnode extattr transaction commit/abort 1253 */ 1254 static int 1255 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) 1256 { 1257 struct inode *ip; 1258 struct uio luio; 1259 struct iovec liovec; 1260 int error; 1261 struct ufs2_dinode *dp; 1262 1263 ip = VTOI(vp); 1264 1265 ffs_lock_ea(vp); 1266 if (ip->i_ea_area == NULL) { 1267 ffs_unlock_ea(vp); 1268 return (EINVAL); 1269 } 1270 dp = ip->i_din2; 1271 error = ip->i_ea_error; 1272 if (commit && error == 0) { 1273 ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit"); 1274 if (cred == NOCRED) 1275 cred = vp->v_mount->mnt_cred; 1276 liovec.iov_base = ip->i_ea_area; 1277 liovec.iov_len = ip->i_ea_len; 1278 luio.uio_iov = &liovec; 1279 luio.uio_iovcnt = 1; 1280 luio.uio_offset = 0; 1281 luio.uio_resid = ip->i_ea_len; 1282 
luio.uio_segflg = UIO_SYSSPACE; 1283 luio.uio_rw = UIO_WRITE; 1284 luio.uio_td = td; 1285 /* XXX: I'm not happy about truncating to zero size */ 1286 if (ip->i_ea_len < dp->di_extsize) 1287 error = ffs_truncate(vp, 0, IO_EXT, cred); 1288 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); 1289 } 1290 if (--ip->i_ea_refs == 0) { 1291 free(ip->i_ea_area, M_TEMP); 1292 ip->i_ea_area = NULL; 1293 ip->i_ea_len = 0; 1294 ip->i_ea_error = 0; 1295 } 1296 ffs_unlock_ea(vp); 1297 return (error); 1298 } 1299 1300 /* 1301 * Vnode extattr strategy routine for fifos. 1302 * 1303 * We need to check for a read or write of the external attributes. 1304 * Otherwise we just fall through and do the usual thing. 1305 */ 1306 static int 1307 ffsext_strategy(struct vop_strategy_args *ap) 1308 /* 1309 struct vop_strategy_args { 1310 struct vnodeop_desc *a_desc; 1311 struct vnode *a_vp; 1312 struct buf *a_bp; 1313 }; 1314 */ 1315 { 1316 struct vnode *vp; 1317 daddr_t lbn; 1318 1319 vp = ap->a_vp; 1320 lbn = ap->a_bp->b_lblkno; 1321 if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR) 1322 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); 1323 if (vp->v_type == VFIFO) 1324 return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); 1325 panic("spec nodes went here"); 1326 } 1327 1328 /* 1329 * Vnode extattr transaction commit/abort 1330 */ 1331 static int 1332 ffs_openextattr(struct vop_openextattr_args *ap) 1333 /* 1334 struct vop_openextattr_args { 1335 struct vnodeop_desc *a_desc; 1336 struct vnode *a_vp; 1337 IN struct ucred *a_cred; 1338 IN struct thread *a_td; 1339 }; 1340 */ 1341 { 1342 1343 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1344 return (EOPNOTSUPP); 1345 1346 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); 1347 } 1348 1349 1350 /* 1351 * Vnode extattr transaction commit/abort 1352 */ 1353 static int 1354 ffs_closeextattr(struct vop_closeextattr_args *ap) 1355 /* 1356 struct vop_closeextattr_args { 1357 struct vnodeop_desc *a_desc; 1358 struct vnode *a_vp; 1359 
int a_commit; 1360 IN struct ucred *a_cred; 1361 IN struct thread *a_td; 1362 }; 1363 */ 1364 { 1365 1366 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1367 return (EOPNOTSUPP); 1368 1369 if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) 1370 return (EROFS); 1371 1372 return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td)); 1373 } 1374 1375 /* 1376 * Vnode operation to remove a named attribute. 1377 */ 1378 static int 1379 ffs_deleteextattr(struct vop_deleteextattr_args *ap) 1380 /* 1381 vop_deleteextattr { 1382 IN struct vnode *a_vp; 1383 IN int a_attrnamespace; 1384 IN const char *a_name; 1385 IN struct ucred *a_cred; 1386 IN struct thread *a_td; 1387 }; 1388 */ 1389 { 1390 struct inode *ip; 1391 struct extattr *eap; 1392 uint32_t ul; 1393 int olen, error, i, easize; 1394 u_char *eae; 1395 void *tmp; 1396 1397 ip = VTOI(ap->a_vp); 1398 1399 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1400 return (EOPNOTSUPP); 1401 1402 if (strlen(ap->a_name) == 0) 1403 return (EINVAL); 1404 1405 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1406 return (EROFS); 1407 1408 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1409 ap->a_cred, ap->a_td, VWRITE); 1410 if (error) { 1411 1412 /* 1413 * ffs_lock_ea is not needed there, because the vnode 1414 * must be exclusively locked. 
1415 */ 1416 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1417 ip->i_ea_error = error; 1418 return (error); 1419 } 1420 1421 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1422 if (error) 1423 return (error); 1424 1425 /* CEM: delete could be done in-place instead */ 1426 eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); 1427 bcopy(ip->i_ea_area, eae, ip->i_ea_len); 1428 easize = ip->i_ea_len; 1429 1430 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1431 &eap, NULL); 1432 if (olen == -1) { 1433 /* delete but nonexistent */ 1434 free(eae, M_TEMP); 1435 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1436 return (ENOATTR); 1437 } 1438 ul = eap->ea_length; 1439 i = (u_char *)EXTATTR_NEXT(eap) - eae; 1440 bcopy(EXTATTR_NEXT(eap), eap, easize - i); 1441 easize -= ul; 1442 1443 tmp = ip->i_ea_area; 1444 ip->i_ea_area = eae; 1445 ip->i_ea_len = easize; 1446 free(tmp, M_TEMP); 1447 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); 1448 return (error); 1449 } 1450 1451 /* 1452 * Vnode operation to retrieve a named extended attribute. 
1453 */ 1454 static int 1455 ffs_getextattr(struct vop_getextattr_args *ap) 1456 /* 1457 vop_getextattr { 1458 IN struct vnode *a_vp; 1459 IN int a_attrnamespace; 1460 IN const char *a_name; 1461 INOUT struct uio *a_uio; 1462 OUT size_t *a_size; 1463 IN struct ucred *a_cred; 1464 IN struct thread *a_td; 1465 }; 1466 */ 1467 { 1468 struct inode *ip; 1469 u_char *eae, *p; 1470 unsigned easize; 1471 int error, ealen; 1472 1473 ip = VTOI(ap->a_vp); 1474 1475 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1476 return (EOPNOTSUPP); 1477 1478 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1479 ap->a_cred, ap->a_td, VREAD); 1480 if (error) 1481 return (error); 1482 1483 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1484 if (error) 1485 return (error); 1486 1487 eae = ip->i_ea_area; 1488 easize = ip->i_ea_len; 1489 1490 ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1491 NULL, &p); 1492 if (ealen >= 0) { 1493 error = 0; 1494 if (ap->a_size != NULL) 1495 *ap->a_size = ealen; 1496 else if (ap->a_uio != NULL) 1497 error = uiomove(p, ealen, ap->a_uio); 1498 } else 1499 error = ENOATTR; 1500 1501 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1502 return (error); 1503 } 1504 1505 /* 1506 * Vnode operation to retrieve extended attributes on a vnode. 
1507 */ 1508 static int 1509 ffs_listextattr(struct vop_listextattr_args *ap) 1510 /* 1511 vop_listextattr { 1512 IN struct vnode *a_vp; 1513 IN int a_attrnamespace; 1514 INOUT struct uio *a_uio; 1515 OUT size_t *a_size; 1516 IN struct ucred *a_cred; 1517 IN struct thread *a_td; 1518 }; 1519 */ 1520 { 1521 struct inode *ip; 1522 struct extattr *eap, *eaend; 1523 int error, ealen; 1524 1525 ip = VTOI(ap->a_vp); 1526 1527 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1528 return (EOPNOTSUPP); 1529 1530 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1531 ap->a_cred, ap->a_td, VREAD); 1532 if (error) 1533 return (error); 1534 1535 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1536 if (error) 1537 return (error); 1538 1539 error = 0; 1540 if (ap->a_size != NULL) 1541 *ap->a_size = 0; 1542 1543 KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned")); 1544 eap = (struct extattr *)ip->i_ea_area; 1545 eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len); 1546 for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) { 1547 /* make sure this entry is complete */ 1548 if (EXTATTR_NEXT(eap) > eaend) 1549 break; 1550 if (eap->ea_namespace != ap->a_attrnamespace) 1551 continue; 1552 1553 ealen = eap->ea_namelength; 1554 if (ap->a_size != NULL) 1555 *ap->a_size += ealen + 1; 1556 else if (ap->a_uio != NULL) 1557 error = uiomove(&eap->ea_namelength, ealen + 1, 1558 ap->a_uio); 1559 } 1560 1561 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1562 return (error); 1563 } 1564 1565 /* 1566 * Vnode operation to set a named attribute. 
1567 */ 1568 static int 1569 ffs_setextattr(struct vop_setextattr_args *ap) 1570 /* 1571 vop_setextattr { 1572 IN struct vnode *a_vp; 1573 IN int a_attrnamespace; 1574 IN const char *a_name; 1575 INOUT struct uio *a_uio; 1576 IN struct ucred *a_cred; 1577 IN struct thread *a_td; 1578 }; 1579 */ 1580 { 1581 struct inode *ip; 1582 struct fs *fs; 1583 struct extattr *eap; 1584 uint32_t ealength, ul; 1585 ssize_t ealen; 1586 int olen, eapad1, eapad2, error, i, easize; 1587 u_char *eae; 1588 void *tmp; 1589 1590 ip = VTOI(ap->a_vp); 1591 fs = ITOFS(ip); 1592 1593 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) 1594 return (EOPNOTSUPP); 1595 1596 if (strlen(ap->a_name) == 0) 1597 return (EINVAL); 1598 1599 /* XXX Now unsupported API to delete EAs using NULL uio. */ 1600 if (ap->a_uio == NULL) 1601 return (EOPNOTSUPP); 1602 1603 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1604 return (EROFS); 1605 1606 ealen = ap->a_uio->uio_resid; 1607 if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR)) 1608 return (EINVAL); 1609 1610 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1611 ap->a_cred, ap->a_td, VWRITE); 1612 if (error) { 1613 1614 /* 1615 * ffs_lock_ea is not needed there, because the vnode 1616 * must be exclusively locked. 1617 */ 1618 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1619 ip->i_ea_error = error; 1620 return (error); 1621 } 1622 1623 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1624 if (error) 1625 return (error); 1626 1627 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); 1628 eapad1 = roundup2(ealength, 8) - ealength; 1629 eapad2 = roundup2(ealen, 8) - ealen; 1630 ealength += eapad1 + ealen + eapad2; 1631 1632 /* 1633 * CEM: rewrites of the same size or smaller could be done in-place 1634 * instead. (We don't acquire any fine-grained locks in here either, 1635 * so we could also do bigger writes in-place.) 
1636 */ 1637 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); 1638 bcopy(ip->i_ea_area, eae, ip->i_ea_len); 1639 easize = ip->i_ea_len; 1640 1641 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1642 &eap, NULL); 1643 if (olen == -1) { 1644 /* new, append at end */ 1645 KASSERT(ALIGNED_TO(eae + easize, struct extattr), 1646 ("unaligned")); 1647 eap = (struct extattr *)(eae + easize); 1648 easize += ealength; 1649 } else { 1650 ul = eap->ea_length; 1651 i = (u_char *)EXTATTR_NEXT(eap) - eae; 1652 if (ul != ealength) { 1653 bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength, 1654 easize - i); 1655 easize += (ealength - ul); 1656 } 1657 } 1658 if (easize > lblktosize(fs, UFS_NXADDR)) { 1659 free(eae, M_TEMP); 1660 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1661 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1662 ip->i_ea_error = ENOSPC; 1663 return (ENOSPC); 1664 } 1665 eap->ea_length = ealength; 1666 eap->ea_namespace = ap->a_attrnamespace; 1667 eap->ea_contentpadlen = eapad2; 1668 eap->ea_namelength = strlen(ap->a_name); 1669 memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name)); 1670 bzero(&eap->ea_name[strlen(ap->a_name)], eapad1); 1671 error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio); 1672 if (error) { 1673 free(eae, M_TEMP); 1674 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1675 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1676 ip->i_ea_error = error; 1677 return (error); 1678 } 1679 bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2); 1680 1681 tmp = ip->i_ea_area; 1682 ip->i_ea_area = eae; 1683 ip->i_ea_len = easize; 1684 free(tmp, M_TEMP); 1685 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); 1686 return (error); 1687 } 1688 1689 /* 1690 * Vnode pointer to File handle 1691 */ 1692 static int 1693 ffs_vptofh(struct vop_vptofh_args *ap) 1694 /* 1695 vop_vptofh { 1696 IN struct vnode *a_vp; 1697 IN struct fid *a_fhp; 1698 }; 1699 */ 1700 { 1701 struct inode *ip; 1702 struct ufid *ufhp; 1703 1704 ip = 
VTOI(ap->a_vp); 1705 ufhp = (struct ufid *)ap->a_fhp; 1706 ufhp->ufid_len = sizeof(struct ufid); 1707 ufhp->ufid_ino = ip->i_number; 1708 ufhp->ufid_gen = ip->i_gen; 1709 return (0); 1710 } 1711 1712 SYSCTL_DECL(_vfs_ffs); 1713 static int use_buf_pager = 1; 1714 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, 1715 "Always use buffer pager instead of bmap"); 1716 1717 static daddr_t 1718 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) 1719 { 1720 1721 return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off)); 1722 } 1723 1724 static int 1725 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn) 1726 { 1727 1728 return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn)); 1729 } 1730 1731 static int 1732 ffs_getpages(struct vop_getpages_args *ap) 1733 { 1734 struct vnode *vp; 1735 struct ufsmount *um; 1736 1737 vp = ap->a_vp; 1738 um = VFSTOUFS(vp->v_mount); 1739 1740 if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) 1741 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, 1742 ap->a_rbehind, ap->a_rahead, NULL, NULL)); 1743 return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, 1744 ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz)); 1745 } 1746 1747 static int 1748 ffs_getpages_async(struct vop_getpages_async_args *ap) 1749 { 1750 struct vnode *vp; 1751 struct ufsmount *um; 1752 int error; 1753 1754 vp = ap->a_vp; 1755 um = VFSTOUFS(vp->v_mount); 1756 1757 if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) 1758 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, 1759 ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg)); 1760 1761 error = vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, 1762 ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz); 1763 ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); 1764 1765 return (error); 1766 } 1767 1768