/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
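
/*
 * Editor's note: the "1" op vectors above serve UFS1 mounts, which lack
 * the extended attribute area, while the "2" vectors add the extattr and
 * fifo strategy operations used for UFS2 mounts.  The choice between the
 * two is made when the vnode is set up (in ffs_vfsops.c).
 */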

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop vp lock,
		 * allowing for dirty buffers to reappear on the
		 * bo_dirty list.  Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if (vp->v_type == VREG && (bo->bo_numoutput > 0 ||
		    bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}
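
/*
 * Flush all the dirty buffers associated with a vnode and, unless the
 * NO_INO_UPDT flag is given, write back its inode.  This is the workhorse
 * behind ffs_fsync() above and is also called directly by other parts of
 * FFS when a partial sync is sufficient.
 */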
int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp;
	struct buf *nbp;
	ufs_lbn_t lbn;
	int error, wait, passes;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = 0;	/* Always do an async pass first. */
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/* Flush indirects in order. */
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
		    lbn_level(bp->b_lblkno) >= passes)
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		BO_UNLOCK(bo);
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		if ((flags & NO_INO_UPDT) != 0)
			return (0);
		else
			return (ffs_update(vp, 0));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		/* Write the inode after sync passes to flush deps. */
		if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) {
			BO_UNLOCK(bo);
			ffs_update(vp, 1);
			BO_LOCK(bo);
		}
		/* switch between sync/async. */
		wait = !wait;
		if (wait == 1 || ++passes < NIADDR + 2)
			goto loop;
#ifdef INVARIANTS
		if (!vn_isdisk(vp, NULL))
			vprint("ffs_fsync: dirty", vp);
#endif
	}
	BO_UNLOCK(bo);
	error = 0;
	if ((flags & NO_INO_UPDT) == 0)
		error = ffs_update(vp, 1);
	if (DOINGSUJ(vp))
		softdep_journal_fsync(VTOI(vp));
	return (error);
}
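
/*
 * Vnode lock operation.  Snapshot vnodes share their lock with the
 * snapshot data, so v_vnlock may be changed out from under us while we
 * sleep; retry until the lock we acquired is still the vnode's current
 * lock.
 */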
static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

/*
 * Vnode op for reading.
 */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return error;
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;
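
		/*
		 * Pick a read strategy: plain bread() when at the end of
		 * the file, cluster_read() when clustering is allowed,
		 * breadn() read-ahead when the access pattern looks
		 * sequential, and plain bread() otherwise.
		 */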
		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
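
	/*
	 * Mark the inode for a deferred access time update, unless the
	 * mount is noatime or the flag is already set.
	 */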
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int seqcount;
	int blkoffset, error, flags, ioflag, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;
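
		/*
		 * The write extends the file; record the new size in both
		 * the in-memory inode and its on-disk dinode.
		 */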
		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		/*
		 * If the buffer is not already filled and we encounter an
		 * error while trying to fill it, we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland mmap.
		 *
		 * Note that we need only clear buffers with a transfer size
		 * equal to the block size because buffers with a shorter
		 * transfer size were cleared above by the call to UFS_BALLOC()
		 * with the BA_CLRBUF flag set.
		 *
		 * If the source region for uiomove identically mmaps the
		 * buffer, uiomove() performed the NOP copy, and the buffer
		 * content remains valid because the page fault handler
		 * validated the pages.
		 */
		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
		    fs->fs_bsize == xfersize)
			vfs_bio_clrbuf(bp);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	VM_OBJECT_LOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_lock(ap->a_m[i]);
				vm_page_free(ap->a_m[i]);
				vm_page_unlock(ap->a_m[i]);
			}
		}
		VM_OBJECT_UNLOCK(mreq->object);
		return VM_PAGER_OK;
	}
	VM_OBJECT_UNLOCK(mreq->object);

	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage);
}
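
/*
 * The UFS2 extended attribute area lives in a separate set of blocks
 * addressed by negative logical block numbers (-1 - lbn), so the two
 * routines below mirror ffs_read() and ffs_write() with that mapping
 * and with di_extsize in place of the file size.
 */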

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");

#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}
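
/*
 * Each extended attribute record, as handled by the routines below, has
 * the following layout (derived from the parsing code rather than a
 * formal specification):
 *
 *	uint32_t  total length of this record, 8-byte aligned
 *	uint8_t   attribute namespace
 *	uint8_t   number of padding bytes after the content (eapad2)
 *	uint8_t   length of the attribute name
 *	          name, zero padded to an 8 byte boundary (eapad1)
 *	          content, zero padded to an 8 byte boundary (eapad2)
 */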

/*
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}
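
/*
 * Read the whole extended attribute area of a vnode into a malloc(9)ed
 * buffer, leaving room for "extra" bytes at the end.
 */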
static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}
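
/*
 * The in-memory extended attribute area is serialized with an inode flag
 * rather than the vnode lock: ffs_lock_ea() sleeps on i_ea_refs until the
 * IN_EA_LOCKED flag can be taken, and ffs_unlock_ea() wakes any waiter.
 */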
static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}
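
/*
 * Vnode extattr transaction begin: read the extattr area into memory,
 * or take another reference on an already loaded one.
 */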
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction begin.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed there, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed there, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
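
	/*
	 * Write the new record in place: length word, namespace, content
	 * pad length and name length, then the zero padded name, then the
	 * content copied in from the caller's uio and zero padded as well.
	 */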
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}