/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 * @(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
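
/*
 * The "1"/"2" suffixes on the vectors above mirror the two on-disk
 * formats: the ffs_vnodeops2/ffs_fifoops2 tables add the extended
 * attribute operations (and, for fifos, the extattr-aware strategy
 * routine) that only make sense on UFS2, whose inodes carry an external
 * attribute area; mount-time code elsewhere in FFS presumably installs
 * the matching table for each vnode.  Any operation left unset here
 * falls through .vop_default to the generic ufs_vnodeops/ufs_fifoops.
 */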

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT &&
	    (vp->v_mount->mnt_flag & MNT_SOFTDEP)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop vp lock,
		 * allowing for dirty buffers to reappear on the
		 * bo_dirty list.  Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if (vp->v_type == VREG && (bo->bo_numoutput > 0 ||
		    bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}

int
ffs_syncvnode(struct vnode *vp, int waitfor)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp;
	struct buf *nbp;
	ufs_lbn_t lbn;
	int error, wait, passes;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = 0;	/* Always do an async pass first. */
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/* Flush indirects in order. */
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
		    lbn_level(bp->b_lblkno) >= passes)
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		BO_UNLOCK(bo);
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		return (ffs_update(vp, waitfor));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		/* Write the inode after sync passes to flush deps. */
		if (wait && DOINGSOFTDEP(vp)) {
			BO_UNLOCK(bo);
			ffs_update(vp, MNT_WAIT);
			BO_LOCK(bo);
		}
		/* Switch between sync/async. */
		wait = !wait;
		if (wait == 1 || ++passes < NIADDR + 2)
			goto loop;
#ifdef INVARIANTS
		if (!vn_isdisk(vp, NULL))
			vprint("ffs_fsync: dirty", vp);
#endif
	}
	BO_UNLOCK(bo);
	error = ffs_update(vp, MNT_WAIT);
	if (DOINGSUJ(vp))
		softdep_journal_fsync(VTOI(vp));
	return (error);
}
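
/*
 * A rough sketch of the flushing scheme above: the first pass over the
 * dirty list is always asynchronous (wait == 0), and "wait" then toggles
 * on every trip back to "loop", so sync and async passes alternate until
 * either the dirty list drains or NIADDR + 2 passes have been made.
 * That bound matches the worst case described above: one write per
 * indirect level, plus the leaf and the inode.
 */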

static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}
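
/*
 * The retry loop in ffs_lock() exists because vp->v_vnlock is not a
 * stable pointer on a snapshot-capable filesystem: converting a file to
 * or from a snapshot switches the vnode between its private lock and the
 * lock shared by the snapshot vnodes.  A lock obtained through a stale
 * pointer is therefore dropped and the acquisition retried until the
 * lock actually held is the one v_vnlock currently points at.
 */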

/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);

			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it; so it must have come
	 * from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}
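
/*
 * A worked example of the offset arithmetic above, for a hypothetical
 * filesystem with fs_bsize == 16384: a 4096-byte read at offset 20480
 * yields lbn = lblkno(fs, 20480) = 1 and blkoffset = blkoff(fs, 20480) =
 * 4096; xfersize starts at 16384 - 4096 = 12288 and is then clipped to
 * the 4096 bytes actually requested, so a single buffer is read and only
 * part of it copied out by uiomove().
 */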

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}
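
/*
 * The BA_CLRBUF handling above is what makes partial-block writes safe.
 * For example (again assuming a hypothetical 16K-block filesystem), a
 * 512-byte write into the middle of an existing block has xfersize <
 * fs_bsize, so UFS_BALLOC() is asked for a buffer with the block's prior
 * contents intact (a read-before-write); a full block-aligned 16K write
 * clears BA_CLRBUF and skips that read, since every byte of the buffer
 * is about to be overwritten anyway.
 */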

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	VM_OBJECT_LOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_lock(ap->a_m[i]);
				vm_page_free(ap->a_m[i]);
				vm_page_unlock(ap->a_m[i]);
			}
		}
		VM_OBJECT_UNLOCK(mreq->object);
		return (VM_PAGER_OK);
	}
	VM_OBJECT_UNLOCK(mreq->object);

	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage));
}
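
/*
 * The fast path above returns the request page without I/O whenever any
 * of its DEV_BSIZE-granular validity bits are set: the resident data is
 * already current, and vm_page_zero_invalid() zeroes the invalid ranges
 * (such as the tail beyond end of file) so a mapped page cannot expose
 * stale bytes.  The other pages of the request are simply freed; a page
 * with no valid bits falls through to vnode_pager_generic_getpages()
 * for real I/O.
 */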

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it; so it must have come
	 * from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
	return (error);
}
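
/*
 * Note the block numbering above: the extended attribute area is
 * addressed with negative logical block numbers, -1 - lbn, so its blocks
 * (lbns -1 down to -NXADDR) stay distinct from ordinary file data at
 * non-negative lbns while sharing the same vnode's buffers.
 * ffsext_strategy() later in this file keys on exactly that range.
 */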

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}
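
/*
 * The record format walked above, reconstructed from the parser itself
 * (records are padded so that each starts on an 8-byte boundary):
 *
 *	offset		size	contents
 *	0		4	ul: total record length in bytes
 *	4		1	attribute namespace
 *	5		1	eapad2: padding after the content
 *	6		1	length of the attribute name
 *	7		nlen	attribute name (not NUL-terminated)
 *	7 + nlen	eapad1	padding up to an 8-byte boundary
 *	(header)	ealen	attribute content
 *	(tail)		eapad2	padding up to an 8-byte boundary
 *
 * which is why ealen = ul - (header + eapad1) - eapad2 above.
 */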

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}
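
/*
 * ffs_lock_ea()/ffs_unlock_ea() and i_ea_refs together implement a small
 * transaction scheme around the in-memory attribute image: the first
 * ffs_open_ea() reads the whole area into ip->i_ea_area and later opens
 * merely gain a reference; ffs_close_ea() writes the image back only on
 * a committing close, and frees it when the last reference is dropped.
 * An error noticed mid-transaction is parked in ip->i_ea_error so the
 * eventual commit fails as a whole.
 */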

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}
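
/*
 * Deletion above works by compaction: with ealength forced to 0, the
 * second bcopy() slides everything that follows the doomed record (which
 * is ul bytes long and ends at offset i) down over it, and easize
 * shrinks by ul.  As a worked example, removing a 32-byte record that
 * starts at offset 64 of a 256-byte area moves bytes 96..255 down to
 * offset 64, leaving a 224-byte area.
 */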

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}
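
/*
 * The record-size arithmetic above, worked for a hypothetical 3-byte
 * name with a 16-byte value: the fixed header is sizeof(uint32_t) + 3 +
 * 3 = 10 bytes, eapad1 = 8 - (10 % 8) = 6 rounds header-plus-name up to
 * 16, the 16-byte value needs eapad2 = 0, and so the whole record
 * occupies 16 + 16 + 0 = 32 bytes, a multiple of 8 as the on-disk
 * format requires.
 */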

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}
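
/*
 * The ufid built above (inode number plus generation) is the stable
 * file handle used by NFS-style exports; the matching fhtovp path
 * presumably compares ufid_gen against the current inode generation so
 * that a handle kept across an inode's recycling is detected as stale.
 */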