/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
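/*
 * The "1" vectors are intended for UFS1 mounts and the "2" vectors for
 * UFS2 mounts; UFS1 inodes have no extended attribute area, which is why
 * the extattr operations appear only in the second set.  The choice
 * between the two is presumed to be made when the vnode is instantiated
 * in ffs_vfsops.c, along the lines of:
 *
 *	vp->v_op = (fs->fs_magic == FS_UFS1_MAGIC) ?
 *	    &ffs_vnodeops1 : &ffs_vnodeops2;
 */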
/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop vp lock,
		 * allowing for dirty buffers to reappear on the
		 * bo_dirty list.  Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if (vp->v_type == VREG && (bo->bo_numoutput > 0 ||
		    bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}

int
ffs_syncvnode(struct vnode *vp, int waitfor)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp;
	struct buf *nbp;
	ufs_lbn_t lbn;
	int error, wait, passes;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = 0;	/* Always do an async pass first. */
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/* Flush indirects in order. */
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
		    lbn_level(bp->b_lblkno) >= passes)
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		BO_UNLOCK(bo);
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		return (ffs_update(vp, waitfor));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		/* Write the inode after sync passes to flush deps. */
		if (wait && DOINGSOFTDEP(vp)) {
			BO_UNLOCK(bo);
			ffs_update(vp, MNT_WAIT);
			BO_LOCK(bo);
		}
		/* switch between sync/async. */
		wait = !wait;
		if (wait == 1 || ++passes < NIADDR + 2)
			goto loop;
#ifdef INVARIANTS
		if (!vn_isdisk(vp, NULL))
			vprint("ffs_fsync: dirty", vp);
#endif
	}
	BO_UNLOCK(bo);
	error = ffs_update(vp, MNT_WAIT);
	if (DOINGSUJ(vp))
		softdep_journal_fsync(VTOI(vp));
	return (error);
}

static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}
/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);

			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it; so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}
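/*
 * Note that the read path above never updates the inode itself: it only
 * sets IN_ACCESS, and the actual access-time update is presumed to be
 * deferred to the generic UFS timestamp handling (ufs_itimes()), which
 * keeps the common read path cheap.
 */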
/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	VM_OBJECT_LOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_lock(ap->a_m[i]);
				vm_page_free(ap->a_m[i]);
				vm_page_unlock(ap->a_m[i]);
			}
		}
		VM_OBJECT_UNLOCK(mreq->object);
		return VM_PAGER_OK;
	}
	VM_OBJECT_UNLOCK(mreq->object);

	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage);
}
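/*
 * UFS2 inodes carry a second, small data stream (the extended attribute
 * area) of at most NXADDR blocks; its current size is kept in di_extsize.
 * As the routines below show, its blocks are addressed with negative
 * logical block numbers (-1 - lbn), which keeps them distinct from
 * regular file data in the buffer cache.
 */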
/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}
	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it; so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}
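/*
 * Each record in the extended attribute area, as parsed and built by the
 * routines below, has the following layout (total record length rounded
 * up to an 8-byte boundary):
 *
 *	uint32_t length;	total record length, including padding
 *	uint8_t  namespace;	attribute namespace
 *	uint8_t  contentpadlen;	number of pad bytes after the content
 *	uint8_t  namelength;	length of the name
 *	char     name[];	name, padded so the content is 8-byte aligned
 *	u_char   content[];	attribute data, then contentpadlen pad bytes
 *
 * The usable content length is thus length minus the header, the padded
 * name, and contentpadlen.  (The field names here are descriptive only;
 * the code below works on raw byte offsets.)
 */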
/*
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}
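/*
 * The extended attribute area is read into memory once and then shared by
 * all concurrent EA consumers: i_ea_area and i_ea_len cache the copy,
 * i_ea_refs counts the consumers that have it open, and the IN_EA_LOCKED
 * flag (with IN_EA_LOCKWAIT for sleepers) serializes the open/close
 * transitions below.  The last close frees the copy, after writing it
 * back if a commit was requested.
 */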
static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}
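/*
 * The strategy routine below relies on the negative logical block number
 * convention used by ffs_extread() and ffs_extwrite(): a buffer whose
 * b_lblkno falls in [-NXADDR, -1] belongs to the extended attribute area
 * and is handed to the regular UFS strategy code even on fifos.
 */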
/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction start.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}
/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}
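/*
 * The list produced by ffs_listextattr() below follows the usual
 * extattr_list_file(2) convention: a sequence of records, each a one-byte
 * name length followed by the name itself (not NUL-terminated), which is
 * why ealen + 1 bytes are accounted or copied per attribute.
 */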
/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}
/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}