/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

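/*
 * Note: the "1" vectors above serve UFS1 mounts and the "2" vectors
 * UFS2 mounts.  Only UFS2 inodes carry an extended attribute area,
 * which is why only the "2" vectors wire up the extattr operations
 * (and, for fifos, the strategy routine that routes extattr I/O).
 */
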
/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop vp lock,
		 * allowing for dirty buffers to reappear on the
		 * bo_dirty list.  Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if (vp->v_type == VREG && (bo->bo_numoutput > 0 ||
		    bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}

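/*
 * Flush all dirty buffers on a vnode and, unless NO_INO_UPDT is set in
 * flags, update its inode.  With waitfor == MNT_WAIT the flush is
 * synchronous and soft dependency metadata is resolved first; with
 * MNT_NOWAIT a single async pass is made and the inode update is not
 * waited for.
 */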
int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp;
	struct buf *nbp;
	ufs_lbn_t lbn;
	int error, wait, passes;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = 0;	/* Always do an async pass first. */
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/* Flush indirects in order. */
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
		    lbn_level(bp->b_lblkno) >= passes)
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		BO_UNLOCK(bo);
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		if ((flags & NO_INO_UPDT) != 0)
			return (0);
		else
			return (ffs_update(vp, 0));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		/* Write the inode after sync passes to flush deps. */
		if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) {
			BO_UNLOCK(bo);
			ffs_update(vp, 1);
			BO_LOCK(bo);
		}
		/* switch between sync/async. */
		wait = !wait;
		if (wait == 1 || ++passes < NIADDR + 2)
			goto loop;
#ifdef INVARIANTS
		if (!vn_isdisk(vp, NULL))
			vprint("ffs_fsync: dirty", vp);
#endif
	}
	BO_UNLOCK(bo);
	error = 0;
	if ((flags & NO_INO_UPDT) == 0)
		error = ffs_update(vp, 1);
	if (DOINGSUJ(vp))
		softdep_journal_fsync(VTOI(vp));
	return (error);
}

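/*
 * An illustration of the pass ordering above, assuming UFS's usual
 * NIADDR == 3: the first MNT_WAIT pass (passes == 0) skips every
 * indirect block, the next admits level-0 indirects, and so on, so
 * that data blocks are flushed before the indirect blocks referencing
 * them.  The wait flag alternates async and sync passes, and passes is
 * bounded by NIADDR + 2 to cover each indirect level, the leaf, and
 * the inode.
 */
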
static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

/*
 * Vnode op for reading.
 */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return error;
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * Size of the buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block, depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

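/*
 * A worked example of the transfer arithmetic above, assuming an
 * 8192-byte block size: a read at uio_offset 10000 maps to lbn 1 with
 * blkoffset 1808, so at most xfersize = 8192 - 1808 = 6384 bytes come
 * out of that buffer before the loop advances to lbn 2, after which
 * transfers proceed a full block at a time.
 */
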
/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int seqcount;
	int blkoffset, error, flags, ioflag, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset, (int)uio->uio_resid);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		/*
		 * If the buffer is not already filled and we encounter an
		 * error while trying to fill it, we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland mmap.
		 *
		 * Note that we need only clear buffers with a transfer size
		 * equal to the block size because buffers with a shorter
		 * transfer size were cleared above by the call to UFS_BALLOC()
		 * with the BA_CLRBUF flag set.
		 *
		 * If the source region for uiomove identically mmaps the
		 * buffer, uiomove() performed the NOP copy, and the buffer
		 * content remains valid because the page fault handler
		 * validated the pages.
		 */
		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
		    fs->fs_bsize == xfersize)
			vfs_bio_clrbuf(bp);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	VM_OBJECT_LOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_lock(ap->a_m[i]);
				vm_page_free(ap->a_m[i]);
				vm_page_unlock(ap->a_m[i]);
			}
		}
		VM_OBJECT_UNLOCK(mreq->object);
		return VM_PAGER_OK;
	}
	VM_OBJECT_UNLOCK(mreq->object);

	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage);
}

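/*
 * The UFS2 extended attribute area occupies up to NXADDR blocks of its
 * own, addressed with negative logical block numbers: block N of the
 * area is read and written as lbn -1 - N.  This is how the two
 * routines below share the vnode's buffer space with the file data,
 * and how ffsext_strategy() recognizes extattr I/O.
 */
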
/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * Size of the buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block, depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
	return (error);
}

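/*
 * The writer below is also used by ffs_close_ea() to commit a whole
 * in-memory copy of the area; the area can never grow beyond
 * NXADDR * fs_bsize, which is checked up front.
 */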
/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors in a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

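/*
 * On-disk layout of a single record in the extended attribute area, as
 * decoded by ffs_findextattr() below (field names are descriptive,
 * matching the parsing code rather than a structure declared here):
 *
 *	uint32_t length;        total record length, a multiple of 8
 *	uint8_t  namespace;     attribute namespace
 *	uint8_t  contentpadlen; pad bytes after the content (eapad2)
 *	uint8_t  namelength;    length of the name that follows
 *	char     name[];        name plus eapad1 pad bytes, so that the
 *	                        content starts on an 8-byte boundary
 *	u_char   content[];     value, followed by contentpadlen pad bytes
 *
 * The usable content length is therefore
 *	length - (4 + 3 + namelength + eapad1) - contentpadlen,
 * which is the ealen computed below.
 */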
/*
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}

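/*
 * Together the routines above implement a small transaction scheme for
 * the EA area: the first ffs_open_ea() reads the whole area into
 * i_ea_area and takes a reference, later openers just bump i_ea_refs,
 * modifications are made against the in-memory copy, and the final
 * ffs_close_ea() either discards the copy or, on commit, truncates the
 * on-disk area (when it shrank) and synchronously rewrites it.
 */
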
/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction start.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

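/*
 * The listing below produces the format extattr_list_file(2) returns:
 * for each attribute in the requested namespace, one length byte
 * followed by that many bytes of name, with no terminating NUL.
 */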
/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

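/*
 * When ffs_setextattr() below builds a record, both pads are computed
 * modulo 8: eapad1 rounds the fixed header plus name up to an 8-byte
 * boundary so the content is aligned, and eapad2 rounds the content up
 * so the next record's length word is aligned as well.
 */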
/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	ssize_t ealen;
	int olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	ealen = ap->a_uio->uio_resid;
	if (ealen < 0 || ealen > lblktosize(fs, NXADDR))
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > lblktosize(fs, NXADDR)) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}