/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;


/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
};
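
/*
 * Note: the "1" and "2" vector pairs above appear to correspond to UFS1
 * and UFS2 vnodes respectively; only the "2" vectors carry the extended
 * attribute operations, since the attribute area exists only in the UFS2
 * dinode.  The choice between them is presumably made at vnode setup time
 * (in ffs_vfsops.c, based on the superblock magic), not in this file.
 */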

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	int error;

	error = ffs_syncvnode(ap->a_vp, ap->a_waitfor);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT &&
	    (ap->a_vp->v_mount->mnt_flag & MNT_SOFTDEP))
		error = softdep_fsync(ap->a_vp);
	return (error);
}

int
ffs_syncvnode(struct vnode *vp, int waitfor)
{
	struct inode *ip = VTOI(vp);
	struct buf *bp;
	struct buf *nbp;
	int s, error, wait, passes, skipmeta;
	ufs_lbn_t lbn;

	wait = (waitfor == MNT_WAIT);
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	passes = NIADDR + 1;
	skipmeta = 0;
	if (wait)
		skipmeta = 1;
	s = splbio();
	VI_LOCK(vp);
loop:
	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, this pass is the first time through on a
		 * synchronous flush request and the buffer being considered
		 * is metadata, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		if (skipmeta == 1 && bp->b_lblkno < 0)
			continue;
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		VI_UNLOCK(vp);
		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
		    (bp->b_flags & B_DEFERRED) == 0 &&
		    buf_countdeps(bp, 0)) {
			bp->b_flags |= B_DEFERRED;
			BUF_UNLOCK(bp);
			VI_LOCK(vp);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * If this is a synchronous flush request, or it is not a
		 * file or device, start the write on this buffer immediately.
		 */
		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {

			/*
			 * On our final pass through, do all I/O synchronously
			 * so that we can find out if our flush is failing
			 * because of write errors.
			 */
			if (passes > 0 || !wait) {
				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
					(void) vfs_bio_awrite(bp);
				} else {
					bremfree(bp);
					splx(s);
					(void) bawrite(bp);
					s = splbio();
				}
			} else {
				bremfree(bp);
				splx(s);
				if ((error = bwrite(bp)) != 0)
					return (error);
				s = splbio();
			}
		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
			/*
			 * If the buffer is for data that has been truncated
			 * off the file, then throw it away.
			 */
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_NOCACHE;
			splx(s);
			brelse(bp);
			s = splbio();
		} else
			vfs_bio_awrite(bp);

		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		VI_LOCK(vp);
		nbp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd);
	}
	/*
	 * If we were asked to do this synchronously, then go back for
	 * another pass, this time doing the metadata.
	 */
	if (skipmeta) {
		skipmeta = 0;
		goto loop;
	}

	if (wait) {
		bufobj_wwait(&vp->v_bufobj, 3, 0);
		VI_UNLOCK(vp);

		/*
		 * Ensure that any filesystem metadata associated
		 * with the vnode has been written.
		 */
		splx(s);
		if ((error = softdep_sync_metadata(vp)) != 0)
			return (error);
		s = splbio();

		VI_LOCK(vp);
		if (vp->v_bufobj.bo_dirty.bv_cnt > 0) {
			/*
			 * Block devices associated with filesystems may
			 * have new I/O requests posted for them even if
			 * the vnode is locked, so no amount of trying will
			 * get them clean. Thus we give block devices a
			 * good effort, then just give up. For all other file
			 * types, go around and try again until it is clean.
			 */
			if (passes > 0) {
				passes -= 1;
				goto loop;
			}
#ifdef DIAGNOSTIC
			if (!vn_isdisk(vp, NULL))
				vprint("ffs_fsync: dirty", vp);
#endif
		}
	}
	VI_UNLOCK(vp);
	splx(s);
	return (ffs_update(vp, wait));
}
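
/*
 * Note for the locking routine below: vp->v_vnlock is not stable here.
 * When a file mutates between a snapshot vnode and a regular vnode, the
 * lock that v_vnlock points at can be swapped while a locker sleeps, so
 * ffs_lock() re-checks v_vnlock after each lockmgr() call and retries
 * against the new lock if it changed; the snapshot side of this protocol
 * lives in ffs_snapshot.c.
 */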

static int
ffs_lock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
			/*
			 * vnode interlock must be held to ensure that
			 * the possibly external lock isn't freed,
			 * e.g. when mutating from snapshot file vnode
			 * to regular file vnode.
			 */
			if ((flags & LK_INTERLOCK) == 0) {
				VI_LOCK(vp);
				flags |= LK_INTERLOCK;
			}
			lkp = vp->v_vnlock;
			result = lockmgr(lkp, flags, VI_MTX(vp), ap->a_td);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) lockmgr(lkp, LK_RELEASE, VI_MTX(vp), ap->a_td);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK_APV(&ufs_vnodeops, ap));
#endif
}
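
/*
 * A summary of the read loop below: each iteration transfers at most one
 * filesystem block.  xfersize starts as fs_bsize minus the offset within
 * the block, then is clipped to both the remaining request (uio_resid)
 * and the remaining file data (bytesinfile).  The block itself is brought
 * in with plain bread(), with cluster_read(), or with breadn() readahead,
 * depending on the mount flags and the sequential-access heuristic.
 */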

/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it; so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}
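
/*
 * A note on error handling in the write path below: when IO_UNIT is set
 * and the write fails partway through, the file is truncated back to its
 * original size and the uio is rewound, so the caller sees an
 * all-or-nothing transfer.  Partial progress is exposed only when IO_UNIT
 * is not requested.
 */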

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td != NULL) {
		PROC_LOCK(td->td_proc);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
			psignal(td->td_proc, SIGXFSZ);
			PROC_UNLOCK(td->td_proc);
			return (EFBIG);
		}
		PROC_UNLOCK(td->td_proc);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ap->a_cred &&
	    priv_check_cred(ap->a_cred, PRIV_VFS_CLEARSUGID,
	    SUSER_ALLOWJAIL)) {
		ip->i_mode &= ~(ISUID | ISGID);
		DIP_SET(ip, i_mode, ip->i_mode);
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	VM_OBJECT_LOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(mreq->object);
		return (VM_PAGER_OK);
	}
	VM_OBJECT_UNLOCK(mreq->object);

	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage));
}
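
/*
 * UFS2 inodes carry a separate extended attribute area alongside the
 * normal data blocks.  The routines below address it with negative
 * logical block numbers (-1 - lbn), and its size is capped at
 * NXADDR * fs_bsize, since only NXADDR external attribute block
 * pointers exist in the dinode.
 */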

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it; so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}
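
/*
 * The extended attribute writer below mirrors ffs_write(), but operates
 * on the external attribute block pointers: UFS_BALLOC() is called with
 * IO_EXT in the flags so that allocation happens in the attribute area,
 * and the area may grow only up to NXADDR * fs_bsize.
 */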

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ucred &&
	    priv_check_cred(ucred, PRIV_VFS_CLEARSUGID, SUSER_ALLOWJAIL)) {
		ip->i_mode &= ~(ISUID | ISGID);
		dp->di_mode = ip->i_mode;
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}
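
/*
 * On-disk extended attribute records, as parsed and emitted by the
 * routines below, have the following layout (implied by the arithmetic
 * in ffs_findextattr() and ffs_setextattr()):
 *
 *	uint32_t ul		total record length, including all padding
 *	u_char   namespace	attribute namespace
 *	u_char   eapad2		bytes of padding after the content, 0-7
 *	u_char   nlen		length of the attribute name
 *	u_char   name[nlen]	attribute name, not NUL-terminated
 *	         pad1		zeros to an 8-byte boundary (header + name)
 *	         content[ealen]	attribute data
 *	         pad2		eapad2 zero bytes to an 8-byte boundary
 */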

/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	dp = ip->i_din2;
	easize = dp->di_extsize;

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	if (ip->i_ea_area != NULL)
		return (EBUSY);
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error)
		return (error);
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);
	if (ip->i_ea_area == NULL)
		return (EINVAL);
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	free(ip->i_ea_area, M_TEMP);
	ip->i_ea_area = NULL;
	ip->i_ea_len = 0;
	ip->i_ea_error = 0;
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction begin.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}


/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}
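
/*
 * The attribute operations below can run either inside an explicit EA
 * transaction (ffs_openextattr() ... ffs_closeextattr()) or stand alone,
 * in which case they open and commit/abort a private transaction around
 * the single operation (the "stand_alone" flag).  All editing is done on
 * a malloc'd copy of the attribute area; deletion splices a record out by
 * copying the tail of the area down over it (ealength is zero in the
 * delete path, so the bcopy below removes exactly ul bytes).
 */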

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}
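
/*
 * ffs_setextattr() below builds the new record in place in an enlarged
 * copy of the EA area: an existing record of the same name is spliced
 * out first, the new record is written at the resulting position (or
 * appended at the end), and swapping ip->i_ea_area makes the change
 * visible; a stand-alone call then commits it through ffs_close_ea().
 */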

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}