/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop vp lock,
		 * allowing for dirty buffers to reappear on the
		 * bo_dirty list.  Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if (vp->v_type == VREG && (bo->bo_numoutput > 0 ||
		    bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}
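
/*
 * Write out the dirty buffers and dependency work associated with a
 * vnode.  An asynchronous pass is always made first.  With MNT_WAIT,
 * further alternating sync/async passes are made until the vnode is
 * clean or the pass limit is reached.
 */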
int
ffs_syncvnode(struct vnode *vp, int waitfor)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp;
	struct buf *nbp;
	ufs_lbn_t lbn;
	int error, wait, passes;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = 0;	/* Always do an async pass first. */
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/* Flush indirects in order. */
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
		    lbn_level(bp->b_lblkno) >= passes)
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		BO_UNLOCK(bo);
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		return (ffs_update(vp, waitfor));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		/* Write the inode after sync passes to flush deps. */
		if (wait && DOINGSOFTDEP(vp)) {
			BO_UNLOCK(bo);
			ffs_update(vp, MNT_WAIT);
			BO_LOCK(bo);
		}
		/* switch between sync/async. */
		wait = !wait;
		if (wait == 1 || ++passes < NIADDR + 2)
			goto loop;
#ifdef INVARIANTS
		if (!vn_isdisk(vp, NULL))
			vprint("ffs_fsync: dirty", vp);
#endif
	}
	BO_UNLOCK(bo);
	error = ffs_update(vp, MNT_WAIT);
	if (DOINGSUJ(vp))
		softdep_journal_fsync(VTOI(vp));
	return (error);
}

static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return error;
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the
	 * loop above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Vnode op for writing.
 */
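/*
 * The sequential access hint encoded in the upper bits of a_ioflag
 * (see IO_SEQSHIFT) is forwarded to UFS_BALLOC() via the BA_SEQSHIFT
 * flag bits, so that the block allocator can favor contiguous layout
 * for sequential writers.
 */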
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int seqcount;
	int blkoffset, error, flags, ioflag, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		/*
		 * If the buffer is not already filled and we encounter an
		 * error while trying to fill it, we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland mmap.
745 * 746 * Note that we need only clear buffers with a transfer size 747 * equal to the block size because buffers with a shorter 748 * transfer size were cleared above by the call to UFS_BALLOC() 749 * with the BA_CLRBUF flag set. 750 * 751 * If the source region for uiomove identically mmaps the 752 * buffer, uiomove() performed the NOP copy, and the buffer 753 * content remains valid because the page fault handler 754 * validated the pages. 755 */ 756 if (error != 0 && (bp->b_flags & B_CACHE) == 0 && 757 fs->fs_bsize == xfersize) 758 vfs_bio_clrbuf(bp); 759 if ((ioflag & (IO_VMIO|IO_DIRECT)) && 760 (LIST_EMPTY(&bp->b_dep))) { 761 bp->b_flags |= B_RELBUF; 762 } 763 764 /* 765 * If IO_SYNC each buffer is written synchronously. Otherwise 766 * if we have a severe page deficiency write the buffer 767 * asynchronously. Otherwise try to cluster, and if that 768 * doesn't do it then either do an async write (if O_DIRECT), 769 * or a delayed write (if not). 770 */ 771 if (ioflag & IO_SYNC) { 772 (void)bwrite(bp); 773 } else if (vm_page_count_severe() || 774 buf_dirty_count_severe() || 775 (ioflag & IO_ASYNC)) { 776 bp->b_flags |= B_CLUSTEROK; 777 bawrite(bp); 778 } else if (xfersize + blkoffset == fs->fs_bsize) { 779 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { 780 bp->b_flags |= B_CLUSTEROK; 781 cluster_write(vp, bp, ip->i_size, seqcount); 782 } else { 783 bawrite(bp); 784 } 785 } else if (ioflag & IO_DIRECT) { 786 bp->b_flags |= B_CLUSTEROK; 787 bawrite(bp); 788 } else { 789 bp->b_flags |= B_CLUSTEROK; 790 bdwrite(bp); 791 } 792 if (error || xfersize == 0) 793 break; 794 ip->i_flag |= IN_CHANGE | IN_UPDATE; 795 } 796 /* 797 * If we successfully wrote any data, and we are not the superuser 798 * we clear the setuid and setgid bits as a precaution against 799 * tampering. 800 */ 801 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && 802 ap->a_cred) { 803 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) { 804 ip->i_mode &= ~(ISUID | ISGID); 805 DIP_SET(ip, i_mode, ip->i_mode); 806 } 807 } 808 if (error) { 809 if (ioflag & IO_UNIT) { 810 (void)ffs_truncate(vp, osize, 811 IO_NORMAL | (ioflag & IO_SYNC), 812 ap->a_cred, uio->uio_td); 813 uio->uio_offset -= resid - uio->uio_resid; 814 uio->uio_resid = resid; 815 } 816 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 817 error = ffs_update(vp, 1); 818 return (error); 819 } 820 821 /* 822 * get page routine 823 */ 824 static int 825 ffs_getpages(ap) 826 struct vop_getpages_args *ap; 827 { 828 int i; 829 vm_page_t mreq; 830 int pcount; 831 832 pcount = round_page(ap->a_count) / PAGE_SIZE; 833 mreq = ap->a_m[ap->a_reqpage]; 834 835 /* 836 * if ANY DEV_BSIZE blocks are valid on a large filesystem block, 837 * then the entire page is valid. Since the page may be mapped, 838 * user programs might reference data beyond the actual end of file 839 * occuring within the page. We have to zero that data. 840 */ 841 VM_OBJECT_LOCK(mreq->object); 842 if (mreq->valid) { 843 if (mreq->valid != VM_PAGE_BITS_ALL) 844 vm_page_zero_invalid(mreq, TRUE); 845 for (i = 0; i < pcount; i++) { 846 if (i != ap->a_reqpage) { 847 vm_page_lock(ap->a_m[i]); 848 vm_page_free(ap->a_m[i]); 849 vm_page_unlock(ap->a_m[i]); 850 } 851 } 852 VM_OBJECT_UNLOCK(mreq->object); 853 return VM_PAGER_OK; 854 } 855 VM_OBJECT_UNLOCK(mreq->object); 856 857 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, 858 ap->a_count, 859 ap->a_reqpage); 860 } 861 862 863 /* 864 * Extended attribute area reading. 
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the
	 * loop above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
	return (error);
}
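
/*
 * The extended attribute area is limited to NXADDR * fs_bsize bytes,
 * the blocks directly mapped from the UFS2 dinode; writes beyond that
 * limit fail with EFBIG.
 */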
/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}
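
/*
 * Each record in the extended attribute area has the following layout,
 * padded so that the total record length is a multiple of 8 bytes:
 *
 *	uint32_t  record length, including all padding
 *	uint8_t   attribute namespace
 *	uint8_t   length of the padding after the attribute data
 *	uint8_t   length of the attribute name
 *	          name, zero-padded to an 8-byte boundary
 *	          data, zero-padded to an 8-byte boundary
 */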
/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}
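
/*
 * The in-memory copy of the EA area (i_ea_area) is shared by all openers
 * of a vnode and reference-counted through i_ea_refs.  Access to it is
 * serialized by the IN_EA_LOCKED flag under the vnode interlock; waiters
 * set IN_EA_LOCKWAIT and sleep on i_ea_refs until the holder drops the
 * lock.
 */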
static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction begin.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
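/*
 * The list returned to the caller is a sequence of records, each a
 * one-byte name length followed by the name itself, matching the
 * format documented in extattr_list_file(2).
 */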
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}