/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 * @(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
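
/*
 * The "1" vectors above serve UFS1 vnodes and the "2" vectors serve
 * UFS2 vnodes; only UFS2 inodes carry an extended attribute area, so
 * only the "2" vectors include the extattr operations.  The proper
 * vector is selected when the vnode is set up for its file system.
 */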

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	int error;

	error = ffs_syncvnode(ap->a_vp, ap->a_waitfor);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT &&
	    (ap->a_vp->v_mount->mnt_flag & MNT_SOFTDEP))
		error = softdep_fsync(ap->a_vp);
	return (error);
}
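
/*
 * Flush all dirty buffers of a vnode, then update its inode.  On a
 * synchronous request (MNT_WAIT) the first pass below skips metadata
 * buffers (negative logical block numbers) so that file data goes out
 * first, and up to NIADDR + 1 further passes are made to catch buffers
 * redirtied by dependency processing before giving up.
 */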
int
ffs_syncvnode(struct vnode *vp, int waitfor)
{
	struct inode *ip = VTOI(vp);
	struct bufobj *bo;
	struct buf *bp;
	struct buf *nbp;
	int s, error, wait, passes, skipmeta;
	ufs_lbn_t lbn;

	wait = (waitfor == MNT_WAIT);
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	bo = &vp->v_bufobj;

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	passes = NIADDR + 1;
	skipmeta = 0;
	if (wait)
		skipmeta = 1;
	s = splbio();
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, this pass is the first time through on a
		 * synchronous flush request and the buffer being considered
		 * is metadata, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		if ((skipmeta == 1 && bp->b_lblkno < 0))
			continue;
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		BO_UNLOCK(bo);
		if (!wait && !LIST_EMPTY(&bp->b_dep) &&
		    (bp->b_flags & B_DEFERRED) == 0 &&
		    buf_countdeps(bp, 0)) {
			bp->b_flags |= B_DEFERRED;
			BUF_UNLOCK(bp);
			BO_LOCK(bo);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * If this is a synchronous flush request, or it is not a
		 * file or device, start the write on this buffer immediately.
		 */
		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {

			/*
			 * On our final pass through, do all I/O synchronously
			 * so that we can find out if our flush is failing
			 * because of write errors.
			 */
			if (passes > 0 || !wait) {
				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
					(void) vfs_bio_awrite(bp);
				} else {
					bremfree(bp);
					splx(s);
					(void) bawrite(bp);
					s = splbio();
				}
			} else {
				bremfree(bp);
				splx(s);
				if ((error = bwrite(bp)) != 0)
					return (error);
				s = splbio();
			}
		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
			/*
			 * If the buffer is for data that has been truncated
			 * off the file, then throw it away.
			 */
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_NOCACHE;
			splx(s);
			brelse(bp);
			s = splbio();
		} else
			vfs_bio_awrite(bp);

		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	/*
	 * If we were asked to do this synchronously, then go back for
	 * another pass, this time doing the metadata.
	 */
	if (skipmeta) {
		skipmeta = 0;
		goto loop;
	}

	if (wait) {
		bufobj_wwait(bo, 3, 0);
		BO_UNLOCK(bo);

		/*
		 * Ensure that any filesystem metadata associated
		 * with the vnode has been written.
		 */
		splx(s);
		if ((error = softdep_sync_metadata(vp)) != 0)
			return (error);
		s = splbio();

		BO_LOCK(bo);
		if (bo->bo_dirty.bv_cnt > 0) {
			/*
			 * Block devices associated with filesystems may
			 * have new I/O requests posted for them even if
			 * the vnode is locked, so no amount of trying will
			 * get them clean.  Thus we give block devices a
			 * good effort, then just give up.  For all other file
			 * types, go around and try again until it is clean.
			 */
			if (passes > 0) {
				passes -= 1;
				goto loop;
			}
#ifdef INVARIANTS
			if (!vn_isdisk(vp, NULL))
				vprint("ffs_fsync: dirty", vp);
#endif
		}
	}
	BO_UNLOCK(bo);
	splx(s);
	return (ffs_update(vp, wait));
}
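
/*
 * Vnode lock operation.  Snapshot activity can change which lock
 * guards a vnode: v_vnlock is redirected when a file becomes, or
 * ceases to be, a snapshot.  The loop below therefore rechecks the
 * lock it acquired and retries if the vnode mutated while we slept.
 */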
static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

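	/*
	 * Main read loop: for each file system block touched by the
	 * request, clamp the transfer to the block and to the bytes
	 * remaining in the file, bring the block in (clustering or
	 * reading ahead where that helps), and copy it out to the
	 * caller with uiomove().
	 */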
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the
	 * loop above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td != NULL) {
		PROC_LOCK(td->td_proc);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
			psignal(td->td_proc, SIGXFSZ);
			PROC_UNLOCK(td->td_proc);
			return (EFBIG);
		}
		PROC_UNLOCK(td->td_proc);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

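	/*
	 * Main write loop: allocate or find the block backing the
	 * current offset with UFS_BALLOC(), copy the caller's data
	 * into it with uiomove(), and push the buffer out according
	 * to the IO_SYNC/IO_ASYNC/clustering policy described below.
	 */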
	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	VM_OBJECT_LOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(mreq->object);
		return VM_PAGER_OK;
	}
	VM_OBJECT_UNLOCK(mreq->object);

	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage);
}

/*
 * Extended attribute area reading.
 */
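/*
 * The extended attribute area is addressed with negative logical
 * block numbers (-1 - lbn), which keeps its blocks distinct from the
 * file data blocks of the same vnode in the buffer cache; see the
 * bread()/breadn() calls below.
 */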
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the
	 * loop above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

	KASSERT(!(ip->i_flag & IN_SPACECOUNTED), ("inode %u: inode is dead",
	    ip->i_number));

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC, each buffer is written synchronously.
		 * Otherwise, if we have a severe page deficiency, if the
		 * transfer filled the block, or if an async or direct
		 * write was requested, write the buffer asynchronously;
		 * failing all of that, do a delayed write.
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
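/*
 * As inferred from the parsing here and the construction in
 * ffs_setextattr() below, each EA record in the area is laid out as:
 *
 *	uint32_t  total record length, including all padding
 *	uint8_t   attribute namespace
 *	uint8_t   length of the padding after the content (eapad2)
 *	uint8_t   length of the name
 *	char[]    name, zero-padded (eapad1) to an 8-byte boundary
 *	u_char[]  content, zero-padded (eapad2) to an 8-byte boundary
 */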
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}
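
/*
 * Extended attribute area transactions: ffs_open_ea() reads the whole
 * EA area into an in-memory copy hung off the inode (i_ea_area,
 * i_ea_len), the operations below work on that copy, and
 * ffs_close_ea() either writes it back (commit) or discards it.
 */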
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	if (ip->i_ea_area != NULL)
		return (EBUSY);
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error)
		return (error);
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);
	if (ip->i_ea_area == NULL)
		return (EINVAL);
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	free(ip->i_ea_area, M_TEMP);
	ip->i_ea_area = NULL;
	ip->i_ea_len = 0;
	ip->i_ea_error = 0;
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction begin.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}
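
/*
 * ffs_deleteextattr() above and ffs_setextattr() below share one
 * pattern: copy the EA area into a scratch buffer sized for the
 * change, splice the record in or out with bcopy(), check the result
 * against the NXADDR * fs_bsize limit, and swap the scratch buffer in
 * as the new i_ea_area.
 */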

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
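/*
 * The handle records the inode number and generation; the generation
 * lets a stale handle (for example, one held by an NFS client after
 * the inode has been freed and reused) be detected and rejected.
 */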
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}