/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 * @(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;
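/*
 * Two sets of vnode operation vectors follow.  The "1" vectors are used
 * for UFS1 mounts and the "2" vectors for UFS2 mounts, which additionally
 * support the extended attribute operations; the appropriate vector is
 * selected when a vnode is first instantiated for an inode.
 */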
/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		vnode_pager_local_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		vnode_pager_local_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop vp lock,
		 * allowing for dirty buffers to reappear on the
		 * bo_dirty list.  Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if (vp->v_type == VREG && (bo->bo_numoutput > 0 ||
		    bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}
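/*
 * Flush all dirty buffers for a vnode and, unless NO_INO_UPDT is set in
 * flags, write the inode back as well.  With soft updates, a MNT_WAIT
 * sync may need several passes over the dirty buffer list because
 * completing one dependency can dirty further buffers.
 */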
int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp;
	struct buf *nbp;
	ufs_lbn_t lbn;
	int error, wait, passes;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = 0;	/* Always do an async pass first. */
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/* Flush indirects in order. */
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
		    lbn_level(bp->b_lblkno) >= passes)
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
			BO_UNLOCK(bo);
		} else if (wait != 0) {
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) != 0) {
				bp->b_vflags &= ~BV_SCANNED;
				goto next;
			}
		} else
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		if ((flags & NO_INO_UPDT) != 0)
			return (0);
		else
			return (ffs_update(vp, 0));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		/* Write the inode after sync passes to flush deps. */
		if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) {
			BO_UNLOCK(bo);
			ffs_update(vp, 1);
			BO_LOCK(bo);
		}
		/* switch between sync/async. */
		wait = !wait;
		if (wait == 1 || ++passes < NIADDR + 2)
			goto loop;
#ifdef INVARIANTS
		if (!vn_isdisk(vp, NULL))
			vprint("ffs_fsync: dirty", vp);
#endif
	}
	BO_UNLOCK(bo);
	error = 0;
	if ((flags & NO_INO_UPDT) == 0)
		error = ffs_update(vp, 1);
	if (DOINGSUJ(vp))
		softdep_journal_fsync(VTOI(vp));
	return (error);
}
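/*
 * Lock a vnode.  When snapshots are compiled in, v_vnlock may change
 * underneath us while we sleep, because a vnode can be converted
 * between a snapshot vnode (which shares the snapshot lock) and a
 * regular vnode.  Detect that case and retry with the new lock.
 */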
static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

/*
 * Vnode op for reading.
 */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif
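	/*
	 * The upper layers pack a sequential-access heuristic into the
	 * high bits of the ioflag; it is used below to decide how much
	 * read-ahead to issue.
	 */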
	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread_gb(vp, lbn, size, NOCRED,
			    GB_UNMAPPED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, GB_UNMAPPED, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = blksize(fs, ip, nextlbn);
			error = breadn_flags(vp, lbn, size, &nextlbn,
			    &nextsize, 1, NOCRED, GB_UNMAPPED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread_gb(vp, lbn, size, NOCRED,
			    GB_UNMAPPED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		if ((bp->b_flags & B_UNMAPPED) == 0) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
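	/*
	 * Successful reads are recorded by marking the inode for a
	 * deferred access-time update; the timestamp itself is filled
	 * in later when the inode times are next updated.
	 */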
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int seqcount;
	int blkoffset, error, flags, ioflag, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset, (int)uio->uio_resid);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;
	flags |= BA_UNMAPPED;
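	/*
	 * Worked example of the per-iteration arithmetic below: with an
	 * 8K block size, a write at offset 10000 maps to lbn 1 with
	 * blkoffset 1808, so at most 8192 - 1808 = 6384 bytes can be
	 * copied into that block's buffer on one pass through the loop.
	 */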
	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		if ((bp->b_flags & B_UNMAPPED) == 0) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		/*
		 * If the buffer is not already filled and we encounter an
		 * error while trying to fill it, we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland mmap.
		 *
		 * Note that we need only clear buffers with a transfer size
		 * equal to the block size because buffers with a shorter
		 * transfer size were cleared above by the call to UFS_BALLOC()
		 * with the BA_CLRBUF flag set.
		 *
		 * If the source region for uiomove identically mmaps the
		 * buffer, uiomove() performed the NOP copy, and the buffer
		 * content remains valid because the page fault handler
		 * validated the pages.
		 */
		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
		    fs->fs_bsize == xfersize)
			vfs_bio_clrbuf(bp);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount,
				    GB_UNMAPPED);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
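	/*
	 * On error, the IO_UNIT handling below undoes any partial
	 * progress: the file is truncated back to its original size and
	 * the uio is rewound, so the write appears all-or-nothing to
	 * the caller.
	 */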
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}
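/*
 * UFS2 inodes carry a small, separately addressed extended attribute
 * area (at most NXADDR blocks, recorded in the inode's di_extb[] array)
 * that stores all of an inode's extended attributes as one packed blob.
 * The routines below read and write that area; note that it is
 * addressed with negative logical block numbers (-1 - lbn).
 */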
/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
	return (error);
}

/*
 * Extended attribute area writing.
 */
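/*
 * The area can occupy at most NXADDR blocks, so a write extending past
 * NXADDR * fs_bsize bytes fails with EFBIG rather than growing it.
 */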
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}
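/*
 * On-disk layout of a single extended attribute record, as constructed
 * by ffs_setextattr() below (records are packed back to back in the
 * attribute area):
 *
 *	uint32_t  total record length, including all padding
 *	u_char    attribute namespace
 *	u_char    length of the padding after the content
 *	u_char    length of the attribute name
 *	char[]    name, zero-padded so the header ends on an 8-byte
 *	          boundary
 *	u_char[]  content, zero-padded to an 8-byte boundary
 */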
/*
 * Helper routine for the extended attribute vnode operations below.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	u_int easize;
	int error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}
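/*
 * The in-memory copy of the extended attribute area (i_ea_area) is
 * shared by all concurrent EA operations on a vnode.  Access to it is
 * serialized with the IN_EA_LOCKED flag, and i_ea_refs counts the
 * openers so that the area is written back and freed only when the
 * last of them calls ffs_close_ea().
 */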
static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction open.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}
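/*
 * Deletion works on a private copy of the attribute area: the record
 * is spliced out of the copy, the copy is installed as the new area,
 * and ffs_close_ea() writes it back.
 */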
/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}
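/*
 * Retrieval supports a size-only probe: a caller that passes a NULL
 * uio and a non-NULL a_size gets just the attribute's length back.
 */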
/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}
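/*
 * Example of the record-size arithmetic below: for the name "foo" the
 * fixed header is 4 + 3 + 3 = 10 bytes, so eapad1 = 6 pads the header
 * out to 16; a 5-byte value then gets eapad2 = 3, for a total record
 * length of 16 + 5 + 3 = 24 bytes.
 */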
/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	ssize_t ealen;
	int olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	ealen = ap->a_uio->uio_resid;
	if (ealen < 0 || ealen > lblktosize(fs, NXADDR))
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > lblktosize(fs, NXADDR)) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}
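/*
 * The file handle produced here is what the NFS server hands out to
 * clients; the generation number allows a handle to be recognized as
 * stale after the inode has been freed and reused.
 */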
/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}