/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c   8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 * @(#)ffs_vnops.c     8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fdatasync_t  ffs_fdatasync;
static vop_fsync_t      ffs_fsync;
static vop_getpages_t   ffs_getpages;
static vop_lock1_t      ffs_lock;
static vop_read_t       ffs_read;
static vop_write_t      ffs_write;
static int      ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int      ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
                    struct ucred *cred);
static vop_strategy_t           ffsext_strategy;
static vop_closeextattr_t       ffs_closeextattr;
static vop_deleteextattr_t      ffs_deleteextattr;
static vop_getextattr_t         ffs_getextattr;
static vop_listextattr_t        ffs_listextattr;
static vop_openextattr_t        ffs_openextattr;
static vop_setextattr_t         ffs_setextattr;
static vop_vptofh_t             ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
        .vop_default =          &ufs_vnodeops,
        .vop_fsync =            ffs_fsync,
        .vop_fdatasync =        ffs_fdatasync,
        .vop_getpages =         ffs_getpages,
        .vop_getpages_async =   vnode_pager_local_getpages_async,
        .vop_lock1 =            ffs_lock,
        .vop_read =             ffs_read,
        .vop_reallocblks =      ffs_reallocblks,
        .vop_write =            ffs_write,
        .vop_vptofh =           ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
        .vop_default =          &ufs_fifoops,
        .vop_fsync =            ffs_fsync,
        .vop_fdatasync =        ffs_fdatasync,
        .vop_reallocblks =      ffs_reallocblks, /* XXX: really ??? */
        .vop_vptofh =           ffs_vptofh,
};

/* Global vfs data structures for ufs. */
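/*
 * Note that two sets of vectors are exported.  The "1" vectors above omit
 * the extattr entry points and appear to serve UFS1 vnodes, while the "2"
 * vectors below add the extended attribute operations available on UFS2;
 * the choice between them is made outside this file, in the ffs_vfsops.c
 * vget path.
 */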
struct vop_vector ffs_vnodeops2 = {
        .vop_default =          &ufs_vnodeops,
        .vop_fsync =            ffs_fsync,
        .vop_fdatasync =        ffs_fdatasync,
        .vop_getpages =         ffs_getpages,
        .vop_getpages_async =   vnode_pager_local_getpages_async,
        .vop_lock1 =            ffs_lock,
        .vop_read =             ffs_read,
        .vop_reallocblks =      ffs_reallocblks,
        .vop_write =            ffs_write,
        .vop_closeextattr =     ffs_closeextattr,
        .vop_deleteextattr =    ffs_deleteextattr,
        .vop_getextattr =       ffs_getextattr,
        .vop_listextattr =      ffs_listextattr,
        .vop_openextattr =      ffs_openextattr,
        .vop_setextattr =       ffs_setextattr,
        .vop_vptofh =           ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
        .vop_default =          &ufs_fifoops,
        .vop_fsync =            ffs_fsync,
        .vop_fdatasync =        ffs_fdatasync,
        .vop_lock1 =            ffs_lock,
        .vop_reallocblks =      ffs_reallocblks,
        .vop_strategy =         ffsext_strategy,
        .vop_closeextattr =     ffs_closeextattr,
        .vop_deleteextattr =    ffs_deleteextattr,
        .vop_getextattr =       ffs_getextattr,
        .vop_listextattr =      ffs_listextattr,
        .vop_openextattr =      ffs_openextattr,
        .vop_setextattr =       ffs_setextattr,
        .vop_vptofh =           ffs_vptofh,
};

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
        struct vnode *vp;
        struct bufobj *bo;
        int error;

        vp = ap->a_vp;
        bo = &vp->v_bufobj;
retry:
        error = ffs_syncvnode(vp, ap->a_waitfor, 0);
        if (error)
                return (error);
        if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
                error = softdep_fsync(vp);
                if (error)
                        return (error);

                /*
                 * The softdep_fsync() function may drop vp lock,
                 * allowing for dirty buffers to reappear on the
                 * bo_dirty list.  Recheck and resync as needed.
                 */
                BO_LOCK(bo);
                if ((vp->v_type == VREG || vp->v_type == VDIR) &&
                    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
                        BO_UNLOCK(bo);
                        goto retry;
                }
                BO_UNLOCK(bo);
        }
        return (0);
}

int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
        struct inode *ip;
        struct bufobj *bo;
        struct buf *bp, *nbp;
        ufs_lbn_t lbn;
        int error, passes;
        bool still_dirty, wait;

        ip = VTOI(vp);
        ip->i_flag &= ~IN_NEEDSYNC;
        bo = &vp->v_bufobj;

        /*
         * When doing MNT_WAIT we must first flush all dependencies
         * on the inode.
         */
        if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
            (error = softdep_sync_metadata(vp)) != 0)
                return (error);

        /*
         * Flush all dirty buffers associated with a vnode.
         */
        error = 0;
        passes = 0;
        wait = false;   /* Always do an async pass first. */
        lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
        BO_LOCK(bo);
loop:
        TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
                bp->b_vflags &= ~BV_SCANNED;
        TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
                /*
                 * Reasons to skip this buffer: it has already been considered
                 * on this pass, the buffer has dependencies that will cause
                 * it to be redirtied and it has not already been deferred,
                 * or it is already being written.
                 */
                if ((bp->b_vflags & BV_SCANNED) != 0)
                        continue;
                bp->b_vflags |= BV_SCANNED;
                /*
                 * Flush indirects in order, if requested.
                 *
                 * Note that if only datasync is requested, we can
                 * skip indirect blocks when softupdates are not
                 * active.  Otherwise we must flush them with data,
                 * since dependencies prevent data block writes.
                 */
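                /*
                 * (A sketch of the pass ordering enforced below, on the
                 * reading that lbn_level() yields the depth of an indirect
                 * block: pass 0 takes only data blocks, and each later
                 * pass admits one more level of indirects, so deeper
                 * indirects are deferred until the levels beneath them
                 * have been written.)
                 */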
                if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
                    (lbn_level(bp->b_lblkno) >= passes ||
                    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
                        continue;
                if (bp->b_lblkno > lbn)
                        panic("ffs_syncvnode: syncing truncated data.");
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
                        BO_UNLOCK(bo);
                } else if (wait) {
                        if (BUF_LOCK(bp,
                            LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                            BO_LOCKPTR(bo)) != 0) {
                                bp->b_vflags &= ~BV_SCANNED;
                                goto next;
                        }
                } else
                        continue;
                if ((bp->b_flags & B_DELWRI) == 0)
                        panic("ffs_fsync: not dirty");
                /*
                 * Check for dependencies and potentially complete them.
                 */
                if (!LIST_EMPTY(&bp->b_dep) &&
                    (error = softdep_sync_buf(vp, bp,
                    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
                        /* I/O error. */
                        if (error != EBUSY) {
                                BUF_UNLOCK(bp);
                                return (error);
                        }
                        /* If we deferred once, don't defer again. */
                        if ((bp->b_flags & B_DEFERRED) == 0) {
                                bp->b_flags |= B_DEFERRED;
                                BUF_UNLOCK(bp);
                                goto next;
                        }
                }
                if (wait) {
                        bremfree(bp);
                        if ((error = bwrite(bp)) != 0)
                                return (error);
                } else if ((bp->b_flags & B_CLUSTEROK)) {
                        (void) vfs_bio_awrite(bp);
                } else {
                        bremfree(bp);
                        (void) bawrite(bp);
                }
next:
                /*
                 * Since we may have slept during the I/O, we need
                 * to start from a known point.
                 */
                BO_LOCK(bo);
                nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
        }
        if (waitfor != MNT_WAIT) {
                BO_UNLOCK(bo);
                if ((flags & NO_INO_UPDT) != 0)
                        return (0);
                else
                        return (ffs_update(vp, 0));
        }
        /* Drain IO to see if we're done. */
        bufobj_wwait(bo, 0, 0);
        /*
         * Block devices associated with filesystems may have new I/O
         * requests posted for them even if the vnode is locked, so no
         * amount of trying will get them clean.  We make several passes
         * as a best effort.
         *
         * Regular files may need multiple passes to flush all dependency
         * work as it is possible that we must write once per indirect
         * level, once for the leaf, and once for the inode and each of
         * these will be done with one sync and one async pass.
         */
        if (bo->bo_dirty.bv_cnt > 0) {
                if ((flags & DATA_ONLY) == 0) {
                        still_dirty = true;
                } else {
                        /*
                         * For data-only sync, dirty indirect buffers
                         * are ignored.
                         */
                        still_dirty = false;
                        TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
                                if (bp->b_lblkno > -NDADDR) {
                                        still_dirty = true;
                                        break;
                                }
                        }
                }

                if (still_dirty) {
                        /* Write the inode after sync passes to flush deps. */
                        if (wait && DOINGSOFTDEP(vp) &&
                            (flags & NO_INO_UPDT) == 0) {
                                BO_UNLOCK(bo);
                                ffs_update(vp, 1);
                                BO_LOCK(bo);
                        }
                        /* Switch between sync/async. */
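                        /*
                         * Alternating here gives every pass count one sync
                         * and one async sweep; the NIADDR + 2 bound below
                         * corresponds to the worst case described above
                         * (one write per indirect level, plus the leaf and
                         * the inode).
                         */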
                        wait = !wait;
                        if (wait || ++passes < NIADDR + 2)
                                goto loop;
#ifdef INVARIANTS
                        if (!vn_isdisk(vp, NULL))
                                vn_printf(vp, "ffs_fsync: dirty ");
#endif
                }
        }
        BO_UNLOCK(bo);
        error = 0;
        if ((flags & DATA_ONLY) == 0) {
                if ((flags & NO_INO_UPDT) == 0)
                        error = ffs_update(vp, 1);
                if (DOINGSUJ(vp))
                        softdep_journal_fsync(VTOI(vp));
        }
        return (error);
}

static int
ffs_fdatasync(struct vop_fdatasync_args *ap)
{

        return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
}

static int
ffs_lock(ap)
        struct vop_lock1_args /* {
                struct vnode *a_vp;
                int a_flags;
                struct thread *a_td;
                char *file;
                int line;
        } */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
        struct vnode *vp;
        int flags;
        struct lock *lkp;
        int result;

        switch (ap->a_flags & LK_TYPE_MASK) {
        case LK_SHARED:
        case LK_UPGRADE:
        case LK_EXCLUSIVE:
                vp = ap->a_vp;
                flags = ap->a_flags;
                for (;;) {
#ifdef DEBUG_VFS_LOCKS
                        KASSERT(vp->v_holdcnt != 0,
                            ("ffs_lock %p: zero hold count", vp));
#endif
                        lkp = vp->v_vnlock;
                        result = _lockmgr_args(lkp, flags, VI_MTX(vp),
                            LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
                            ap->a_file, ap->a_line);
                        if (lkp == vp->v_vnlock || result != 0)
                                break;
                        /*
                         * Apparent success, except that the vnode
                         * mutated between snapshot file vnode and
                         * regular file vnode while this process
                         * slept.  The lock currently held is not the
                         * right lock.  Release it, and try to get the
                         * new lock.
                         */
                        (void) _lockmgr_args(lkp, LK_RELEASE, NULL,
                            LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
                            ap->a_file, ap->a_line);
                        if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
                            (LK_INTERLOCK | LK_NOWAIT))
                                return (EBUSY);
                        if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
                                flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
                        flags &= ~LK_INTERLOCK;
                }
                break;
        default:
                result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
        }
        return (result);
#else
        return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

/*
 * Vnode op for reading.
 */
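/*
 * The upper bits of a_ioflag carry the caller's sequential-access hint
 * (recovered below with IO_SEQSHIFT); ffs_read() uses it to decide whether
 * clustering and readahead are worthwhile.
 */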
static int
ffs_read(ap)
        struct vop_read_args /* {
                struct vnode *a_vp;
                struct uio *a_uio;
                int a_ioflag;
                struct ucred *a_cred;
        } */ *ap;
{
        struct vnode *vp;
        struct inode *ip;
        struct uio *uio;
        struct fs *fs;
        struct buf *bp;
        ufs_lbn_t lbn, nextlbn;
        off_t bytesinfile;
        long size, xfersize, blkoffset;
        ssize_t orig_resid;
        int error;
        int seqcount;
        int ioflag;

        vp = ap->a_vp;
        uio = ap->a_uio;
        ioflag = ap->a_ioflag;
        if (ap->a_ioflag & IO_EXT)
#ifdef notyet
                return (ffs_extread(vp, uio, ioflag));
#else
                panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
        if ((ioflag & IO_DIRECT) != 0) {
                int workdone;

                error = ffs_rawread(vp, uio, &workdone);
                if (error != 0 || workdone != 0)
                        return (error);
        }
#endif

        seqcount = ap->a_ioflag >> IO_SEQSHIFT;
        ip = VTOI(vp);

#ifdef INVARIANTS
        if (uio->uio_rw != UIO_READ)
                panic("ffs_read: mode");

        if (vp->v_type == VLNK) {
                if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
                        panic("ffs_read: short symlink");
        } else if (vp->v_type != VREG && vp->v_type != VDIR)
                panic("ffs_read: type %d", vp->v_type);
#endif
        orig_resid = uio->uio_resid;
        KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
        if (orig_resid == 0)
                return (0);
        KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
        fs = ITOFS(ip);
        if (uio->uio_offset < ip->i_size &&
            uio->uio_offset >= fs->fs_maxfilesize)
                return (EOVERFLOW);

        for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
                if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
                        break;
                lbn = lblkno(fs, uio->uio_offset);
                nextlbn = lbn + 1;

                /*
                 * size of buffer.  The buffer representing the
                 * end of the file is rounded up to the size of
                 * the block type (fragment or full block,
                 * depending).
                 */
                size = blksize(fs, ip, lbn);
                blkoffset = blkoff(fs, uio->uio_offset);

                /*
                 * The amount we want to transfer in this iteration is
                 * one FS block less the amount of the data before
                 * our startpoint (duh!)
                 */
                xfersize = fs->fs_bsize - blkoffset;

                /*
                 * But if we actually want less than the block,
                 * or the file doesn't have a whole block more of data,
                 * then use the lesser number.
                 */
                if (uio->uio_resid < xfersize)
                        xfersize = uio->uio_resid;
                if (bytesinfile < xfersize)
                        xfersize = bytesinfile;

                if (lblktosize(fs, nextlbn) >= ip->i_size) {
                        /*
                         * Don't do readahead if this is the end of the file.
                         */
                        error = bread_gb(vp, lbn, size, NOCRED,
                            GB_UNMAPPED, &bp);
                } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
                        /*
                         * Otherwise if we are allowed to cluster,
                         * grab as much as we can.
                         *
                         * XXX  This may not be a win if we are not
                         * doing sequential access.
                         */
                        error = cluster_read(vp, ip->i_size, lbn,
                            size, NOCRED, blkoffset + uio->uio_resid,
                            seqcount, GB_UNMAPPED, &bp);
                } else if (seqcount > 1) {
                        /*
                         * If we are NOT allowed to cluster, then
                         * if we appear to be acting sequentially,
                         * fire off a request for a readahead
                         * as well as a read.  Note that the 4th and 5th
                         * arguments point to arrays of the size specified in
                         * the 6th argument.
                         */
                        u_int nextsize = blksize(fs, ip, nextlbn);
                        error = breadn_flags(vp, lbn, size, &nextlbn,
                            &nextsize, 1, NOCRED, GB_UNMAPPED, &bp);
                } else {
                        /*
                         * Failing all of the above, just read what the
                         * user asked for.  Interestingly, the same as
                         * the first option above.
                         */
                        error = bread_gb(vp, lbn, size, NOCRED,
                            GB_UNMAPPED, &bp);
                }
                if (error) {
                        brelse(bp);
                        bp = NULL;
                        break;
                }

                /*
                 * We should only get non-zero b_resid when an I/O error
                 * has occurred, which should cause us to break above.
                 * However, if the short read did not cause an error,
                 * then we want to ensure that we do not uiomove bad
                 * or uninitialized data.
                 */
                size -= bp->b_resid;
                if (size < xfersize) {
                        if (size == 0)
                                break;
                        xfersize = size;
                }

                if (buf_mapped(bp)) {
                        error = vn_io_fault_uiomove((char *)bp->b_data +
                            blkoffset, (int)xfersize, uio);
                } else {
                        error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
                            (int)xfersize, uio);
                }
                if (error)
                        break;

                vfs_bio_brelse(bp, ioflag);
        }

        /*
         * This can only happen in the case of an error, because the loop
         * above resets bp to NULL on each iteration and on normal
         * completion has not set a new value into it, so it must have come
         * from a 'break' statement.
         */
        if (bp != NULL)
                vfs_bio_brelse(bp, ioflag);

        if ((error == 0 || uio->uio_resid != orig_resid) &&
            (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
            (ip->i_flag & IN_ACCESS) == 0) {
                VI_LOCK(vp);
                ip->i_flag |= IN_ACCESS;
                VI_UNLOCK(vp);
        }
        return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
        struct vop_write_args /* {
                struct vnode *a_vp;
                struct uio *a_uio;
                int a_ioflag;
                struct ucred *a_cred;
        } */ *ap;
{
        struct vnode *vp;
        struct uio *uio;
        struct inode *ip;
        struct fs *fs;
        struct buf *bp;
        ufs_lbn_t lbn;
        off_t osize;
        ssize_t resid;
        int seqcount;
        int blkoffset, error, flags, ioflag, size, xfersize;

        vp = ap->a_vp;
        uio = ap->a_uio;
        ioflag = ap->a_ioflag;
        if (ap->a_ioflag & IO_EXT)
#ifdef notyet
                return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
                panic("ffs_write+IO_EXT");
#endif

        seqcount = ap->a_ioflag >> IO_SEQSHIFT;
        ip = VTOI(vp);

#ifdef INVARIANTS
        if (uio->uio_rw != UIO_WRITE)
                panic("ffs_write: mode");
#endif

        switch (vp->v_type) {
        case VREG:
                if (ioflag & IO_APPEND)
                        uio->uio_offset = ip->i_size;
                if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
                        return (EPERM);
                /* FALLTHROUGH */
        case VLNK:
                break;
        case VDIR:
                panic("ffs_write: dir write");
                break;
        default:
                panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
                    (int)uio->uio_offset, (int)uio->uio_resid);
        }

        KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
        KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
        fs = ITOFS(ip);
        if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
                return (EFBIG);
        /*
         * Maybe this should be above the vnode op call, but so long as
         * file servers have no limits, I don't think it matters.
         */
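        /*
         * (vn_rlimit_fsize() enforces the process RLIMIT_FSIZE limit;
         * when the write would exceed it, the expected behavior is that
         * the process is signalled and the write fails with EFBIG.)
         */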
        if (vn_rlimit_fsize(vp, uio, uio->uio_td))
                return (EFBIG);

        resid = uio->uio_resid;
        osize = ip->i_size;
        if (seqcount > BA_SEQMAX)
                flags = BA_SEQMAX << BA_SEQSHIFT;
        else
                flags = seqcount << BA_SEQSHIFT;
        if (ioflag & IO_SYNC)
                flags |= IO_SYNC;
        flags |= BA_UNMAPPED;

        for (error = 0; uio->uio_resid > 0;) {
                lbn = lblkno(fs, uio->uio_offset);
                blkoffset = blkoff(fs, uio->uio_offset);
                xfersize = fs->fs_bsize - blkoffset;
                if (uio->uio_resid < xfersize)
                        xfersize = uio->uio_resid;
                if (uio->uio_offset + xfersize > ip->i_size)
                        vnode_pager_setsize(vp, uio->uio_offset + xfersize);

                /*
                 * We must perform a read-before-write if the transfer size
                 * does not cover the entire buffer.
                 */
                if (fs->fs_bsize > xfersize)
                        flags |= BA_CLRBUF;
                else
                        flags &= ~BA_CLRBUF;
                /* XXX is uio->uio_offset the right thing here? */
                error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
                    ap->a_cred, flags, &bp);
                if (error != 0) {
                        vnode_pager_setsize(vp, ip->i_size);
                        break;
                }
                if ((ioflag & (IO_SYNC | IO_INVAL)) == (IO_SYNC | IO_INVAL))
                        bp->b_flags |= B_NOCACHE;

                if (uio->uio_offset + xfersize > ip->i_size) {
                        ip->i_size = uio->uio_offset + xfersize;
                        DIP_SET(ip, i_size, ip->i_size);
                }

                size = blksize(fs, ip, lbn) - bp->b_resid;
                if (size < xfersize)
                        xfersize = size;

                if (buf_mapped(bp)) {
                        error = vn_io_fault_uiomove((char *)bp->b_data +
                            blkoffset, (int)xfersize, uio);
                } else {
                        error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
                            (int)xfersize, uio);
                }
                /*
                 * If the buffer is not already filled and we encounter an
                 * error while trying to fill it, we have to clear out any
                 * garbage data from the pages instantiated for the buffer.
                 * If we do not, a failed uiomove() during a write can leave
                 * the prior contents of the pages exposed to a userland mmap.
                 *
                 * Note that we need only clear buffers with a transfer size
                 * equal to the block size because buffers with a shorter
                 * transfer size were cleared above by the call to UFS_BALLOC()
                 * with the BA_CLRBUF flag set.
                 *
                 * If the source region for uiomove identically mmaps the
                 * buffer, uiomove() performed the NOP copy, and the buffer
                 * content remains valid because the page fault handler
                 * validated the pages.
                 */
                if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
                    fs->fs_bsize == xfersize)
                        vfs_bio_clrbuf(bp);

                vfs_bio_set_flags(bp, ioflag);

                /*
                 * If IO_SYNC each buffer is written synchronously.  Otherwise
                 * if we have a severe page deficiency write the buffer
                 * asynchronously.  Otherwise try to cluster, and if that
                 * doesn't do it then either do an async write (if O_DIRECT),
                 * or a delayed write (if not).
                 */
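                /*
                 * In summary, the dispatch below is:
                 *
                 *  IO_SYNC                        -> bwrite()
                 *  page/buffer shortage, IO_ASYNC -> bawrite()
                 *  full block completed           -> cluster_write(), or
                 *                                    bawrite() if MNT_NOCLUSTERW
                 *  IO_DIRECT                      -> bawrite()
                 *  otherwise                      -> bdwrite() (delayed)
                 */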
                if (ioflag & IO_SYNC) {
                        (void)bwrite(bp);
                } else if (vm_page_count_severe() ||
                            buf_dirty_count_severe() ||
                            (ioflag & IO_ASYNC)) {
                        bp->b_flags |= B_CLUSTEROK;
                        bawrite(bp);
                } else if (xfersize + blkoffset == fs->fs_bsize) {
                        if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
                                bp->b_flags |= B_CLUSTEROK;
                                cluster_write(vp, bp, ip->i_size, seqcount,
                                    GB_UNMAPPED);
                        } else {
                                bawrite(bp);
                        }
                } else if (ioflag & IO_DIRECT) {
                        bp->b_flags |= B_CLUSTEROK;
                        bawrite(bp);
                } else {
                        bp->b_flags |= B_CLUSTEROK;
                        bdwrite(bp);
                }
                if (error || xfersize == 0)
                        break;
                ip->i_flag |= IN_CHANGE | IN_UPDATE;
        }
        /*
         * If we successfully wrote any data, and we are not the superuser
         * we clear the setuid and setgid bits as a precaution against
         * tampering.
         */
        if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
            ap->a_cred) {
                if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
                        ip->i_mode &= ~(ISUID | ISGID);
                        DIP_SET(ip, i_mode, ip->i_mode);
                }
        }
        if (error) {
                if (ioflag & IO_UNIT) {
                        (void)ffs_truncate(vp, osize,
                            IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
                        uio->uio_offset -= resid - uio->uio_resid;
                        uio->uio_resid = resid;
                }
        } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
                error = ffs_update(vp, 1);
        return (error);
}

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
        struct inode *ip;
        struct ufs2_dinode *dp;
        struct fs *fs;
        struct buf *bp;
        ufs_lbn_t lbn, nextlbn;
        off_t bytesinfile;
        long size, xfersize, blkoffset;
        ssize_t orig_resid;
        int error;

        ip = VTOI(vp);
        fs = ITOFS(ip);
        dp = ip->i_din2;

#ifdef INVARIANTS
        if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
                panic("ffs_extread: mode");
#endif
        orig_resid = uio->uio_resid;
        KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
        if (orig_resid == 0)
                return (0);
        KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

        for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
                if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
                        break;
                lbn = lblkno(fs, uio->uio_offset);
                nextlbn = lbn + 1;

                /*
                 * size of buffer.  The buffer representing the
                 * end of the file is rounded up to the size of
                 * the block type (fragment or full block,
                 * depending).
                 */
                size = sblksize(fs, dp->di_extsize, lbn);
                blkoffset = blkoff(fs, uio->uio_offset);

                /*
                 * The amount we want to transfer in this iteration is
                 * one FS block less the amount of the data before
                 * our startpoint (duh!)
                 */
                xfersize = fs->fs_bsize - blkoffset;

                /*
                 * But if we actually want less than the block,
                 * or the file doesn't have a whole block more of data,
                 * then use the lesser number.
                 */
                if (uio->uio_resid < xfersize)
                        xfersize = uio->uio_resid;
                if (bytesinfile < xfersize)
                        xfersize = bytesinfile;

                if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
                        /*
                         * Don't do readahead if this is the end of the info.
                         */
                        error = bread(vp, -1 - lbn, size, NOCRED, &bp);
                } else {
                        /*
                         * If we have a second block, then
                         * fire off a request for a readahead
                         * as well as a read.  Note that the 4th and 5th
                         * arguments point to arrays of the size specified in
                         * the 6th argument.
                         */
                        u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

                        nextlbn = -1 - nextlbn;
                        error = breadn(vp, -1 - lbn,
                            size, &nextlbn, &nextsize, 1, NOCRED, &bp);
                }
                if (error) {
                        brelse(bp);
                        bp = NULL;
                        break;
                }

                /*
                 * We should only get non-zero b_resid when an I/O error
                 * has occurred, which should cause us to break above.
                 * However, if the short read did not cause an error,
                 * then we want to ensure that we do not uiomove bad
                 * or uninitialized data.
                 */
                size -= bp->b_resid;
                if (size < xfersize) {
                        if (size == 0)
                                break;
                        xfersize = size;
                }

                error = uiomove((char *)bp->b_data + blkoffset,
                    (int)xfersize, uio);
                if (error)
                        break;
                vfs_bio_brelse(bp, ioflag);
        }

        /*
         * This can only happen in the case of an error, because the loop
         * above resets bp to NULL on each iteration and on normal
         * completion has not set a new value into it, so it must have come
         * from a 'break' statement.
         */
        if (bp != NULL)
                vfs_bio_brelse(bp, ioflag);
        return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
        struct inode *ip;
        struct ufs2_dinode *dp;
        struct fs *fs;
        struct buf *bp;
        ufs_lbn_t lbn;
        off_t osize;
        ssize_t resid;
        int blkoffset, error, flags, size, xfersize;

        ip = VTOI(vp);
        fs = ITOFS(ip);
        dp = ip->i_din2;

#ifdef INVARIANTS
        if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
                panic("ffs_extwrite: mode");
#endif

        if (ioflag & IO_APPEND)
                uio->uio_offset = dp->di_extsize;
        KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
        KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
        if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
                return (EFBIG);

        resid = uio->uio_resid;
        osize = dp->di_extsize;
        flags = IO_EXT;
        if (ioflag & IO_SYNC)
                flags |= IO_SYNC;

        for (error = 0; uio->uio_resid > 0;) {
                lbn = lblkno(fs, uio->uio_offset);
                blkoffset = blkoff(fs, uio->uio_offset);
                xfersize = fs->fs_bsize - blkoffset;
                if (uio->uio_resid < xfersize)
                        xfersize = uio->uio_resid;

                /*
                 * We must perform a read-before-write if the transfer size
                 * does not cover the entire buffer.
                 */
                if (fs->fs_bsize > xfersize)
                        flags |= BA_CLRBUF;
                else
                        flags &= ~BA_CLRBUF;
                error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
                    ucred, flags, &bp);
                if (error != 0)
                        break;
                /*
                 * If the buffer is not valid we have to clear out any
                 * garbage data from the pages instantiated for the buffer.
                 * If we do not, a failed uiomove() during a write can leave
                 * the prior contents of the pages exposed to a userland
                 * mmap().  XXX deal with uiomove() errors a better way.
                 */
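                /*
                 * (Note that, unlike ffs_write() above, which clears a
                 * full-block buffer only after uiomove() fails, this path
                 * pre-clears any not-yet-valid full-block buffer before
                 * the copy.)
                 */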
                if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
                        vfs_bio_clrbuf(bp);

                if (uio->uio_offset + xfersize > dp->di_extsize)
                        dp->di_extsize = uio->uio_offset + xfersize;

                size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
                if (size < xfersize)
                        xfersize = size;

                error =
                    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);

                vfs_bio_set_flags(bp, ioflag);

                /*
                 * If IO_SYNC each buffer is written synchronously.  Otherwise
                 * if we have a severe page deficiency write the buffer
                 * asynchronously.  Otherwise try to cluster, and if that
                 * doesn't do it then either do an async write (if O_DIRECT),
                 * or a delayed write (if not).
                 */
                if (ioflag & IO_SYNC) {
                        (void)bwrite(bp);
                } else if (vm_page_count_severe() ||
                            buf_dirty_count_severe() ||
                            xfersize + blkoffset == fs->fs_bsize ||
                            (ioflag & (IO_ASYNC | IO_DIRECT)))
                        bawrite(bp);
                else
                        bdwrite(bp);
                if (error || xfersize == 0)
                        break;
                ip->i_flag |= IN_CHANGE;
        }
        /*
         * If we successfully wrote any data, and we are not the superuser
         * we clear the setuid and setgid bits as a precaution against
         * tampering.
         */
        if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
                if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
                        ip->i_mode &= ~(ISUID | ISGID);
                        dp->di_mode = ip->i_mode;
                }
        }
        if (error) {
                if (ioflag & IO_UNIT) {
                        (void)ffs_truncate(vp, osize,
                            IO_EXT | (ioflag & IO_SYNC), ucred);
                        uio->uio_offset -= resid - uio->uio_resid;
                        uio->uio_resid = resid;
                }
        } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
                error = ffs_update(vp, 1);
        return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
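/*
 * The extended attribute records parsed below have the following layout,
 * inferred from the encoder in ffs_setextattr() (see also struct extattr
 * in <ufs/ufs/extattr.h>):
 *
 *      uint32_t length         total record length, a multiple of 8
 *      uint8_t  namespace      attribute namespace
 *      uint8_t  contentpadlen  pad bytes following the content
 *      uint8_t  namelength     length of the name that follows
 *      char     name[]         name, padded so the header is 8-aligned
 *      char     content[]      value, followed by contentpadlen pad bytes
 */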
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
        u_char *p, *pe, *pn, *p0;
        int eapad1, eapad2, ealength, ealen, nlen;
        uint32_t ul;

        pe = ptr + length;
        nlen = strlen(name);

        for (p = ptr; p < pe; p = pn) {
                p0 = p;
                bcopy(p, &ul, sizeof(ul));
                pn = p + ul;
                /* make sure this entry is complete */
                if (pn > pe)
                        break;
                p += sizeof(uint32_t);
                if (*p != nspace)
                        continue;
                p++;
                eapad2 = *p++;
                if (*p != nlen)
                        continue;
                p++;
                if (bcmp(p, name, nlen))
                        continue;
                ealength = sizeof(uint32_t) + 3 + nlen;
                eapad1 = 8 - (ealength % 8);
                if (eapad1 == 8)
                        eapad1 = 0;
                ealength += eapad1;
                ealen = ul - ealength - eapad2;
                p += nlen + eapad1;
                if (eap != NULL)
                        *eap = p0;
                if (eac != NULL)
                        *eac = p;
                return (ealen);
        }
        return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
        struct inode *ip;
        struct ufs2_dinode *dp;
        struct fs *fs;
        struct uio luio;
        struct iovec liovec;
        u_int easize;
        int error;
        u_char *eae;

        ip = VTOI(vp);
        fs = ITOFS(ip);
        dp = ip->i_din2;
        easize = dp->di_extsize;
        if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
                return (EFBIG);

        eae = malloc(easize + extra, M_TEMP, M_WAITOK);

        liovec.iov_base = eae;
        liovec.iov_len = easize;
        luio.uio_iov = &liovec;
        luio.uio_iovcnt = 1;
        luio.uio_offset = 0;
        luio.uio_resid = easize;
        luio.uio_segflg = UIO_SYSSPACE;
        luio.uio_rw = UIO_READ;
        luio.uio_td = td;

        error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
        if (error) {
                free(eae, M_TEMP);
                return (error);
        }
        *p = eae;
        return (0);
}

static void
ffs_lock_ea(struct vnode *vp)
{
        struct inode *ip;

        ip = VTOI(vp);
        VI_LOCK(vp);
        while (ip->i_flag & IN_EA_LOCKED) {
                ip->i_flag |= IN_EA_LOCKWAIT;
                msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
                    0);
        }
        ip->i_flag |= IN_EA_LOCKED;
        VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
        struct inode *ip;

        ip = VTOI(vp);
        VI_LOCK(vp);
        if (ip->i_flag & IN_EA_LOCKWAIT)
                wakeup(&ip->i_ea_refs);
        ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
        VI_UNLOCK(vp);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
        struct inode *ip;
        struct ufs2_dinode *dp;
        int error;

        ip = VTOI(vp);

        ffs_lock_ea(vp);
        if (ip->i_ea_area != NULL) {
                ip->i_ea_refs++;
                ffs_unlock_ea(vp);
                return (0);
        }
        dp = ip->i_din2;
        error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
        if (error) {
                ffs_unlock_ea(vp);
                return (error);
        }
        ip->i_ea_len = dp->di_extsize;
        ip->i_ea_error = 0;
        ip->i_ea_refs++;
        ffs_unlock_ea(vp);
        return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred,
    struct thread *td)
{
        struct inode *ip;
        struct uio luio;
        struct iovec liovec;
        int error;
        struct ufs2_dinode *dp;

        ip = VTOI(vp);

        ffs_lock_ea(vp);
        if (ip->i_ea_area == NULL) {
                ffs_unlock_ea(vp);
                return (EINVAL);
        }
        dp = ip->i_din2;
        error = ip->i_ea_error;
        if (commit && error == 0) {
                ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
                if (cred == NOCRED)
                        cred = vp->v_mount->mnt_cred;
                liovec.iov_base = ip->i_ea_area;
                liovec.iov_len = ip->i_ea_len;
                luio.uio_iov = &liovec;
                luio.uio_iovcnt = 1;
                luio.uio_offset = 0;
                luio.uio_resid = ip->i_ea_len;
                luio.uio_segflg = UIO_SYSSPACE;
                luio.uio_rw = UIO_WRITE;
                luio.uio_td = td;
                /* XXX: I'm not happy about truncating to zero size */
                if (ip->i_ea_len < dp->di_extsize)
                        error = ffs_truncate(vp, 0, IO_EXT, cred);
                error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
        }
        if (--ip->i_ea_refs == 0) {
                free(ip->i_ea_area, M_TEMP);
                ip->i_ea_area = NULL;
                ip->i_ea_len = 0;
                ip->i_ea_error = 0;
        }
        ffs_unlock_ea(vp);
        return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
        struct vnodeop_desc *a_desc;
        struct vnode *a_vp;
        struct buf *a_bp;
};
*/
{
        struct vnode *vp;
        daddr_t lbn;

        vp = ap->a_vp;
        lbn = ap->a_bp->b_lblkno;
        if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -NXADDR)
                return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
        if (vp->v_type == VFIFO)
                return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
        panic("spec nodes went here");
}

/*
 * Vnode extattr transaction start (open).
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
        struct vnodeop_desc *a_desc;
        struct vnode *a_vp;
        IN struct ucred *a_cred;
        IN struct thread *a_td;
};
*/
{

        if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
                return (EOPNOTSUPP);

        return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
        struct vnodeop_desc *a_desc;
        struct vnode *a_vp;
        int a_commit;
        IN struct ucred *a_cred;
        IN struct thread *a_td;
};
*/
{

        if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
                return (EOPNOTSUPP);

        if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
                return (EROFS);

        return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
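/*
 * Deletion below works on a private copy of the EA area: the record is
 * located with ffs_findextattr(), the tail of the area is slid down over
 * it with bcopy(), the copy replaces i_ea_area, and the result is
 * committed through ffs_close_ea().
 */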
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
        IN struct vnode *a_vp;
        IN int a_attrnamespace;
        IN const char *a_name;
        IN struct ucred *a_cred;
        IN struct thread *a_td;
};
*/
{
        struct inode *ip;
        struct fs *fs;
        uint32_t ealength, ul;
        int ealen, olen, eapad1, eapad2, error, i, easize;
        u_char *eae, *p;

        ip = VTOI(ap->a_vp);
        fs = ITOFS(ip);

        if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
                return (EOPNOTSUPP);

        if (strlen(ap->a_name) == 0)
                return (EINVAL);

        if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
                return (EROFS);

        error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
            ap->a_cred, ap->a_td, VWRITE);
        if (error) {
                /*
                 * ffs_lock_ea is not needed there, because the vnode
                 * must be exclusively locked.
                 */
                if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
                        ip->i_ea_error = error;
                return (error);
        }

        error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
        if (error)
                return (error);

        ealength = eapad1 = ealen = eapad2 = 0;

        eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
        bcopy(ip->i_ea_area, eae, ip->i_ea_len);
        easize = ip->i_ea_len;

        olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
            &p, NULL);
        if (olen == -1) {
                /* delete but nonexistent */
                free(eae, M_TEMP);
                ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
                return (ENOATTR);
        }
        bcopy(p, &ul, sizeof(ul));
        i = p - eae + ul;
        if (ul != ealength) {
                bcopy(p + ul, p + ealength, easize - i);
                easize += (ealength - ul);
        }
        if (easize > NXADDR * fs->fs_bsize) {
                free(eae, M_TEMP);
                ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
                if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
                        ip->i_ea_error = ENOSPC;
                return (ENOSPC);
        }
        p = ip->i_ea_area;
        ip->i_ea_area = eae;
        ip->i_ea_len = easize;
        free(p, M_TEMP);
        error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
        return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
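/*
 * Lookups are served from the in-memory copy of the EA area maintained
 * by ffs_open_ea(); the matching ffs_close_ea() call with commit == 0
 * merely drops the reference and writes nothing back.
 */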
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
        IN struct vnode *a_vp;
        IN int a_attrnamespace;
        IN const char *a_name;
        INOUT struct uio *a_uio;
        OUT size_t *a_size;
        IN struct ucred *a_cred;
        IN struct thread *a_td;
};
*/
{
        struct inode *ip;
        u_char *eae, *p;
        unsigned easize;
        int error, ealen;

        ip = VTOI(ap->a_vp);

        if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
                return (EOPNOTSUPP);

        error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
            ap->a_cred, ap->a_td, VREAD);
        if (error)
                return (error);

        error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
        if (error)
                return (error);

        eae = ip->i_ea_area;
        easize = ip->i_ea_len;

        ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
            NULL, &p);
        if (ealen >= 0) {
                error = 0;
                if (ap->a_size != NULL)
                        *ap->a_size = ealen;
                else if (ap->a_uio != NULL)
                        error = uiomove(p, ealen, ap->a_uio);
        } else
                error = ENOATTR;

        ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
        return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
        IN struct vnode *a_vp;
        IN int a_attrnamespace;
        INOUT struct uio *a_uio;
        OUT size_t *a_size;
        IN struct ucred *a_cred;
        IN struct thread *a_td;
};
*/
{
        struct inode *ip;
        u_char *eae, *p, *pe, *pn;
        unsigned easize;
        uint32_t ul;
        int error, ealen;

        ip = VTOI(ap->a_vp);

        if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
                return (EOPNOTSUPP);

        error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
            ap->a_cred, ap->a_td, VREAD);
        if (error)
                return (error);

        error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
        if (error)
                return (error);
        eae = ip->i_ea_area;
        easize = ip->i_ea_len;

        error = 0;
        if (ap->a_size != NULL)
                *ap->a_size = 0;
        pe = eae + easize;
        for (p = eae; error == 0 && p < pe; p = pn) {
                bcopy(p, &ul, sizeof(ul));
                pn = p + ul;
                if (pn > pe)
                        break;
                p += sizeof(ul);
                if (*p++ != ap->a_attrnamespace)
                        continue;
                p++;    /* pad2 */
                ealen = *p;
                if (ap->a_size != NULL) {
                        *ap->a_size += ealen + 1;
                } else if (ap->a_uio != NULL) {
                        error = uiomove(p, ealen + 1, ap->a_uio);
                }
        }
        ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
        return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
        IN struct vnode *a_vp;
        IN int a_attrnamespace;
        IN const char *a_name;
        INOUT struct uio *a_uio;
        IN struct ucred *a_cred;
        IN struct thread *a_td;
};
*/
{
        struct inode *ip;
        struct fs *fs;
        uint32_t ealength, ul;
        ssize_t ealen;
        int olen, eapad1, eapad2, error, i, easize;
        u_char *eae, *p;

        ip = VTOI(ap->a_vp);
        fs = ITOFS(ip);

        if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
                return (EOPNOTSUPP);

        if (strlen(ap->a_name) == 0)
                return (EINVAL);

        /* XXX Now unsupported API to delete EAs using NULL uio. */
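        /*
         * (Deletion by passing a NULL uio was apparently once accepted
         * here; the supported path is now the separate VOP handled by
         * ffs_deleteextattr() above.)
         */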
        if (ap->a_uio == NULL)
                return (EOPNOTSUPP);

        if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
                return (EROFS);

        ealen = ap->a_uio->uio_resid;
        if (ealen < 0 || ealen > lblktosize(fs, NXADDR))
                return (EINVAL);

        error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
            ap->a_cred, ap->a_td, VWRITE);
        if (error) {
                /*
                 * ffs_lock_ea is not needed there, because the vnode
                 * must be exclusively locked.
                 */
                if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
                        ip->i_ea_error = error;
                return (error);
        }

        error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
        if (error)
                return (error);

        ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
        eapad1 = 8 - (ealength % 8);
        if (eapad1 == 8)
                eapad1 = 0;
        eapad2 = 8 - (ealen % 8);
        if (eapad2 == 8)
                eapad2 = 0;
        ealength += eapad1 + ealen + eapad2;

        eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
        bcopy(ip->i_ea_area, eae, ip->i_ea_len);
        easize = ip->i_ea_len;

        olen = ffs_findextattr(eae, easize,
            ap->a_attrnamespace, ap->a_name, &p, NULL);
        if (olen == -1) {
                /* new, append at end */
                p = eae + easize;
                easize += ealength;
        } else {
                bcopy(p, &ul, sizeof(ul));
                i = p - eae + ul;
                if (ul != ealength) {
                        bcopy(p + ul, p + ealength, easize - i);
                        easize += (ealength - ul);
                }
        }
        if (easize > lblktosize(fs, NXADDR)) {
                free(eae, M_TEMP);
                ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
                if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
                        ip->i_ea_error = ENOSPC;
                return (ENOSPC);
        }
        bcopy(&ealength, p, sizeof(ealength));
        p += sizeof(ealength);
        *p++ = ap->a_attrnamespace;
        *p++ = eapad2;
        *p++ = strlen(ap->a_name);
        strcpy(p, ap->a_name);
        p += strlen(ap->a_name);
        bzero(p, eapad1);
        p += eapad1;
        error = uiomove(p, ealen, ap->a_uio);
        if (error) {
                free(eae, M_TEMP);
                ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
                if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
                        ip->i_ea_error = error;
                return (error);
        }
        p += ealen;
        bzero(p, eapad2);

        p = ip->i_ea_area;
        ip->i_ea_area = eae;
        ip->i_ea_len = easize;
        free(p, M_TEMP);
        error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
        return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
        IN struct vnode *a_vp;
        IN struct fid *a_fhp;
};
*/
{
        struct inode *ip;
        struct ufid *ufhp;

        ip = VTOI(ap->a_vp);
        ufhp = (struct ufid *)ap->a_fhp;
        ufhp->ufid_len = sizeof(struct ufid);
        ufhp->ufid_ino = ip->i_number;
        ufhp->ufid_gen = ip->i_gen;
        return (0);
}

SYSCTL_DECL(_vfs_ffs);
static int use_buf_pager = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
    "Always use buffer pager instead of bmap");

static daddr_t
ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{

        return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
}

static int
ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
{

        return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
}
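/*
 * ffs_getpages() below normally routes page-in through the buffer cache,
 * via vfs_bio_getpages() and the two translators above.  The generic
 * bmap-based pager is used only when the vfs.ffs.use_buf_pager tunable is
 * cleared and the device block size does not exceed the page size.
 */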
static int
ffs_getpages(struct vop_getpages_args *ap)
{
        struct vnode *vp;
        struct ufsmount *um;

        vp = ap->a_vp;
        um = VFSTOUFS(vp->v_mount);

        if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
                return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
                    ap->a_rbehind, ap->a_rahead, NULL, NULL));
        return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
            ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
}