/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
};
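
/*
 * Illustrative note: the "1" vectors above serve UFS1 vnodes and the "2"
 * vectors serve UFS2 vnodes; only the latter wire up the extended
 * attribute operations.  A minimal sketch of the selection, which in the
 * real tree happens at vnode allocation time in ffs_vfsops.c (hedged,
 * shown here only for orientation):
 *
 *	if (ip->i_ump->um_fstype == UFS1)
 *		vp->v_op = &ffs_vnodeops1;
 *	else
 *		vp->v_op = &ffs_vnodeops2;
 */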

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	int error;

	error = ffs_syncvnode(ap->a_vp, ap->a_waitfor);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT &&
	    (ap->a_vp->v_mount->mnt_flag & MNT_SOFTDEP))
		error = softdep_fsync(ap->a_vp);
	return (error);
}

int
ffs_syncvnode(struct vnode *vp, int waitfor)
{
	struct inode *ip = VTOI(vp);
	struct buf *bp;
	struct buf *nbp;
	int s, error, wait, passes, skipmeta;
	ufs_lbn_t lbn;

	wait = (waitfor == MNT_WAIT);
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	passes = NIADDR + 1;
	skipmeta = 0;
	if (wait)
		skipmeta = 1;
	s = splbio();
	VI_LOCK(vp);
loop:
	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, this pass is the first time through on a
		 * synchronous flush request and the buffer being considered
		 * is metadata, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		if ((skipmeta == 1 && bp->b_lblkno < 0))
			continue;
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		VI_UNLOCK(vp);
		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
		    (bp->b_flags & B_DEFERRED) == 0 &&
		    buf_countdeps(bp, 0)) {
			bp->b_flags |= B_DEFERRED;
			BUF_UNLOCK(bp);
			VI_LOCK(vp);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * If this is a synchronous flush request, or it is not a
		 * file or device, start the write on this buffer immediately.
		 */
		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {

			/*
			 * On our final pass through, do all I/O synchronously
			 * so that we can find out if our flush is failing
			 * because of write errors.
			 */
			if (passes > 0 || !wait) {
				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
					(void) vfs_bio_awrite(bp);
				} else {
					bremfree(bp);
					splx(s);
					(void) bawrite(bp);
					s = splbio();
				}
			} else {
				bremfree(bp);
				splx(s);
				if ((error = bwrite(bp)) != 0)
					return (error);
				s = splbio();
			}
		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
			/*
			 * If the buffer is for data that has been truncated
			 * off the file, then throw it away.
			 */
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_NOCACHE;
			splx(s);
			brelse(bp);
			s = splbio();
		} else
			vfs_bio_awrite(bp);

		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		VI_LOCK(vp);
		nbp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd);
	}
	/*
	 * If we were asked to do this synchronously, then go back for
	 * another pass, this time doing the metadata.
	 */
	if (skipmeta) {
		skipmeta = 0;
		goto loop;
	}

	if (wait) {
		bufobj_wwait(&vp->v_bufobj, 3, 0);
		VI_UNLOCK(vp);

		/*
		 * Ensure that any filesystem metadata associated
		 * with the vnode has been written.
		 */
		splx(s);
		if ((error = softdep_sync_metadata(vp)) != 0)
			return (error);
		s = splbio();

		VI_LOCK(vp);
		if (vp->v_bufobj.bo_dirty.bv_cnt > 0) {
			/*
			 * Block devices associated with filesystems may
			 * have new I/O requests posted for them even if
			 * the vnode is locked, so no amount of trying will
			 * get them clean. Thus we give block devices a
			 * good effort, then just give up. For all other file
			 * types, go around and try again until it is clean.
			 */
			if (passes > 0) {
				passes -= 1;
				goto loop;
			}
#ifdef DIAGNOSTIC
			if (!vn_isdisk(vp, NULL))
				vprint("ffs_fsync: dirty", vp);
#endif
		}
	}
	VI_UNLOCK(vp);
	splx(s);
	return (ffs_update(vp, wait));
}

static int
ffs_lock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
			/*
			 * vnode interlock must be held to ensure that
			 * the possibly external lock isn't freed,
			 * e.g. when mutating from snapshot file vnode
			 * to regular file vnode.
			 */
			if ((flags & LK_INTERLOCK) == 0) {
				VI_LOCK(vp);
				flags |= LK_INTERLOCK;
			}
			lkp = vp->v_vnlock;
			result = lockmgr(lkp, flags, VI_MTX(vp), ap->a_td);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept. The lock currently held is not the
			 * right lock. Release it, and try to get the
			 * new lock.
			 */
			(void) lockmgr(lkp, LK_RELEASE, VI_MTX(vp), ap->a_td);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK_APV(&ufs_vnodeops, ap));
#endif
}
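
/*
 * Illustrative sketch of the retry pattern used by ffs_lock() above,
 * restated as a simplified userland analogue with hypothetical names
 * (not kernel code; it glosses over the interlock that makes the
 * unlocked re-read of the pointer safe in the kernel version):
 *
 *	#include <pthread.h>
 *
 *	struct obj {
 *		pthread_mutex_t *lockp;	-- may be swapped concurrently
 *	};
 *
 *	static void
 *	obj_lock(struct obj *o)
 *	{
 *		pthread_mutex_t *lkp;
 *
 *		for (;;) {
 *			lkp = o->lockp;
 *			pthread_mutex_lock(lkp);
 *			if (lkp == o->lockp)
 *				return;		-- still the right lock
 *			pthread_mutex_unlock(lkp);	-- mutated; retry
 *		}
 *	}
 */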

/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return error;
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);

			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have come
	 * from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}
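
/*
 * Worked example (illustrative) of the block arithmetic used by the read
 * loop above, assuming fs_bsize = 16384 and uio_offset = 20000:
 * lblkno() yields lbn = 1, blkoff() yields blkoffset = 3616, and the
 * initial xfersize is 16384 - 3616 = 12768 bytes, subsequently clipped
 * to the caller's residual count and to the bytes left in the file.
 */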

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td != NULL) {
		PROC_LOCK(td->td_proc);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
			psignal(td->td_proc, SIGXFSZ);
			PROC_UNLOCK(td->td_proc);
			return (EFBIG);
		}
		PROC_UNLOCK(td->td_proc);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ap->a_cred &&
	    suser_cred(ap->a_cred, SUSER_ALLOWJAIL)) {
		ip->i_mode &= ~(ISUID | ISGID);
		DIP_SET(ip, i_mode, ip->i_mode);
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	VM_OBJECT_LOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(mreq->object);
		return VM_PAGER_OK;
	}
	VM_OBJECT_UNLOCK(mreq->object);

	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage);
}
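
/*
 * Note on addressing (illustrative): the extended attribute area handled
 * by ffs_extread()/ffs_extwrite() below lives in its own small block
 * range and is addressed with negative logical block numbers: ext block
 * N is requested as bread(vp, -1 - N, ...), so ext blocks 0, 1, 2 map to
 * lbns -1, -2, -3.  ffsext_strategy() later recognizes such requests by
 * checking lbn < 0 && lbn >= -NXADDR.
 */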

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have come
	 * from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
    struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ucred &&
	    suser_cred(ucred, SUSER_ALLOWJAIL)) {
		ip->i_mode &= ~(ISUID | ISGID);
		dp->di_mode = ip->i_mode;
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	dp = ip->i_din2;
	easize = dp->di_extsize;

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	if (ip->i_ea_area != NULL)
		return (EBUSY);
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error)
		return (error);
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	return (0);
}
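
/*
 * Worked example (illustrative) of the record layout parsed by
 * ffs_findextattr() above and built by ffs_setextattr() below:
 *
 *	uint32_t ul;		-- total record length, padding included
 *	uint8_t  nspace;	-- attribute namespace
 *	uint8_t  eapad2;	-- pad bytes after the content
 *	uint8_t  nlen;		-- name length
 *	char     name[nlen];	-- followed by eapad1 pad bytes so that
 *				   header plus name is 8-byte aligned
 *	char     content[ealen]; -- followed by eapad2 pad bytes
 *
 * For a 4-byte name with a 10-byte value: header plus name is
 * 4 + 3 + 4 = 11 bytes, so eapad1 = 5 (16 in total); eapad2 =
 * 8 - (10 % 8) = 6; and ul = 16 + 10 + 6 = 32.
 */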

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred,
    struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);
	if (ip->i_ea_area == NULL)
		return (EINVAL);
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	free(ip->i_ea_area, M_TEMP);
	ip->i_ea_area = NULL;
	ip->i_ea_len = 0;
	ip->i_ea_error = 0;
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode operation to open an extended attribute transaction.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}
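
/*
 * Usage note (illustrative): ffs_open_ea() and ffs_close_ea() bracket an
 * EA "transaction".  Opening snapshots the whole attribute area into
 * ip->i_ea_area, the get/set/delete operations edit that in-memory copy,
 * and closing either writes it back (commit) or discards it.  A caller
 * that has not opened the area explicitly gets a degenerate single
 * operation transaction through the stand_alone path below.  Roughly:
 *
 *	VOP_OPENEXTATTR(vp, cred, td);
 *	VOP_SETEXTATTR(vp, ns, "a", uio1, cred, td);
 *	VOP_SETEXTATTR(vp, ns, "b", uio2, cred, td);
 *	VOP_CLOSEEXTATTR(vp, 1, cred, td);	-- commits both
 */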

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}
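
/*
 * Note (illustrative): with ealength left at 0, the bcopy() splice in
 * ffs_deleteextattr() above degenerates to pure removal: the bytes that
 * follow the ul-byte record are slid down over it and easize shrinks by
 * ul.  ffs_setextattr() below reuses the same splice with a nonzero
 * ealength to replace a record in place.
 */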

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}
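
/*
 * Worked example (illustrative): ffs_listextattr() above emits
 * length-prefixed, unterminated names.  Two attributes named "md5" and
 * "test" in the requested namespace produce the 9 bytes
 *
 *	\3 m d 5 \4 t e s t
 *
 * which is the same format documented for extattr_list_file(2).
 */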

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}
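
/*
 * Illustrative sketch (not compiled as part of this file): the extended
 * attribute vnops above are normally reached through the extattr system
 * calls.  A minimal userland exercise, assuming a UFS2 file /mnt/file on
 * a mounted filesystem (path and attribute name are arbitrary):
 *
 *	#include <sys/types.h>
 *	#include <sys/extattr.h>
 *
 *	#include <err.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		const char *path = "/mnt/file";
 *		char buf[64];
 *		ssize_t n;
 *
 *		-- create/replace: ends up in ffs_setextattr()
 *		if (extattr_set_file(path, EXTATTR_NAMESPACE_USER,
 *		    "comment", "hello", 5) < 0)
 *			err(1, "extattr_set_file");
 *
 *		-- read back: ends up in ffs_getextattr()
 *		n = extattr_get_file(path, EXTATTR_NAMESPACE_USER,
 *		    "comment", buf, sizeof(buf));
 *		if (n < 0)
 *			err(1, "extattr_get_file");
 *		printf("%.*s\n", (int)n, buf);
 *
 *		-- remove: ends up in ffs_deleteextattr()
 *		if (extattr_delete_file(path, EXTATTR_NAMESPACE_USER,
 *		    "comment") < 0)
 *			err(1, "extattr_delete_file");
 *		return (0);
 *	}
 */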