/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 * @(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fdatasync_t	ffs_fdatasync;
static vop_fsync_t	ffs_fsync;
static vop_getpages_t	ffs_getpages;
static vop_lock1_t	ffs_lock;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	vnode_pager_local_getpages_async,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
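/*
 * Note: the "1" operation vectors above are used for UFS1 mounts, while the
 * "2" vectors below additionally wire up the extended attribute operations
 * (and, for fifos, ffsext_strategy) that only UFS2 supports; the appropriate
 * vector is installed when each vnode is set up by the mount code.
 */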
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	vnode_pager_local_getpages_async,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop vp lock,
		 * allowing for dirty buffers to reappear on the
		 * bo_dirty list.  Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if ((vp->v_type == VREG || vp->v_type == VDIR) &&
		    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}

int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp, *nbp;
	ufs_lbn_t lbn;
	int error, passes;
	bool still_dirty, wait;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = false;	/* Always do an async pass first. */
	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/*
		 * Flush indirects in order, if requested.
		 *
		 * Note that if only datasync is requested, we can
		 * skip indirect blocks when softupdates are not
		 * active.  Otherwise we must flush them with data,
		 * since dependencies prevent data block writes.
		 */
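		/*
		 * Indirect blocks are cached at negative logical block
		 * numbers (b_lblkno <= -NDADDR), and lbn_level() reports
		 * their depth of indirection, so deeper indirects are
		 * deferred to later passes of this loop.
		 */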
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
		    (lbn_level(bp->b_lblkno) >= passes ||
		    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
			BO_UNLOCK(bo);
		} else if (wait) {
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) != 0) {
				bp->b_vflags &= ~BV_SCANNED;
				goto next;
			}
		} else
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		if ((flags & NO_INO_UPDT) != 0)
			return (0);
		else
			return (ffs_update(vp, 0));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		if ((flags & DATA_ONLY) == 0) {
			still_dirty = true;
		} else {
			/*
			 * For data-only sync, dirty indirect buffers
			 * are ignored.
			 */
			still_dirty = false;
			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
				if (bp->b_lblkno > -NDADDR) {
					still_dirty = true;
					break;
				}
			}
		}

		if (still_dirty) {
			/* Write the inode after sync passes to flush deps. */
			if (wait && DOINGSOFTDEP(vp) &&
			    (flags & NO_INO_UPDT) == 0) {
				BO_UNLOCK(bo);
				ffs_update(vp, 1);
				BO_LOCK(bo);
			}
			/* switch between sync/async. */
			wait = !wait;
			if (wait || ++passes < NIADDR + 2)
				goto loop;
#ifdef INVARIANTS
			if (!vn_isdisk(vp, NULL))
				vn_printf(vp, "ffs_fsync: dirty ");
#endif
		}
	}
	BO_UNLOCK(bo);
	error = 0;
	if ((flags & DATA_ONLY) == 0) {
		if ((flags & NO_INO_UPDT) == 0)
			error = ffs_update(vp, 1);
		if (DOINGSUJ(vp))
			softdep_journal_fsync(VTOI(vp));
	}
	return (error);
}

static int
ffs_fdatasync(struct vop_fdatasync_args *ap)
{

	return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
}

static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

/*
 * Vnode op for reading.
 */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return error;
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread_gb(vp, lbn, size, NOCRED,
			    GB_UNMAPPED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, GB_UNMAPPED, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = blksize(fs, ip, nextlbn);
			error = breadn_flags(vp, lbn, size, &nextlbn,
			    &nextsize, 1, NOCRED, GB_UNMAPPED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread_gb(vp, lbn, size, NOCRED,
			    GB_UNMAPPED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it.
	 * so it must have come from a 'break' statement
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int seqcount;
	int blkoffset, error, flags, ioflag, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if (ioflag & IO_SYNC)
		flags |= IO_SYNC;
	flags |= BA_UNMAPPED;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		/*
		 * If the buffer is not already filled and we encounter an
		 * error while trying to fill it, we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland mmap.
		 *
		 * Note that we need only clear buffers with a transfer size
		 * equal to the block size because buffers with a shorter
		 * transfer size were cleared above by the call to UFS_BALLOC()
		 * with the BA_CLRBUF flag set.
		 *
		 * If the source region for uiomove identically mmaps the
		 * buffer, uiomove() performed the NOP copy, and the buffer
		 * content remains valid because the page fault handler
		 * validated the pages.
		 */
		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
		    fs->fs_bsize == xfersize)
			vfs_bio_clrbuf(bp);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount,
				    GB_UNMAPPED);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");

#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it.
	 * so it must have come from a 'break' statement
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
	return (error);
}

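/*
 * Note that the extended attribute area shares the file's vnode and buffer
 * cache but is addressed with negative logical block numbers (the -1 - lbn
 * reads above), so its buffers never alias the regular data blocks;
 * ffsext_strategy() below relies on that addressing to recognize them.
 */
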
/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if (ioflag & IO_SYNC)
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}


/*
 * Vnode operating to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	u_int easize;
	int error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}

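/*
 * The extended attribute operations below work on an in-memory copy of the
 * whole attribute area: ffs_open_ea() reads it into ip->i_ea_area (reference
 * counted by i_ea_refs, serialized by ffs_lock_ea() above), the set/delete
 * routines edit that copy, and ffs_close_ea() writes it back when called
 * with commit set, freeing the copy once the last reference is dropped.
 * Each record in the area is, roughly, a 32-bit total length, a namespace
 * byte, a content-padding byte, a name-length byte, the name, and the value,
 * padded to 8-byte boundaries (see ffs_findextattr() above and
 * ffs_setextattr() below).
 */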
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}


/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ITOFS(ip);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {

		/*
		 * ffs_lock_ea is not needed there, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for(p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	ssize_t ealen;
	int olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ITOFS(ip);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	ealen = ap->a_uio->uio_resid;
	if (ealen < 0 || ealen > lblktosize(fs, NXADDR))
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {

		/*
		 * ffs_lock_ea is not needed there, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > lblktosize(fs, NXADDR)) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}

SYSCTL_DECL(_vfs_ffs);
static int use_buf_pager = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
    "Always use buffer pager instead of bmap");

static daddr_t
ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{

	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
}

static int
ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
{

	return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
}

static int
ffs_getpages(struct vop_getpages_args *ap)
{
	struct vnode *vp;
	struct ufsmount *um;

	vp = ap->a_vp;
	um = VFSTOUFS(vp->v_mount);

	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
}