/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

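/*
 * Illustrative sketch only (not part of this file): the "1" tables above
 * are assumed to serve UFS1 vnodes and the "2" tables UFS2 vnodes (whose
 * dinodes carry an extended attribute area), with the selection made when
 * the vnode is instantiated, e.g. in ffs_vfsops.c:
 *
 *	vp->v_op = (fs->fs_magic == FS_UFS2_MAGIC) ?
 *	    &ffs_vnodeops2 : &ffs_vnodeops1;
 */
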
/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	int error;

	error = ffs_syncvnode(ap->a_vp, ap->a_waitfor);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT &&
	    (ap->a_vp->v_mount->mnt_flag & MNT_SOFTDEP))
		error = softdep_fsync(ap->a_vp);
	return (error);
}

int
ffs_syncvnode(struct vnode *vp, int waitfor)
{
	struct inode *ip = VTOI(vp);
	struct buf *bp;
	struct buf *nbp;
	int s, error, wait, passes, skipmeta;
	ufs_lbn_t lbn;

	wait = (waitfor == MNT_WAIT);
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	passes = NIADDR + 1;
	skipmeta = 0;
	if (wait)
		skipmeta = 1;
	s = splbio();
	VI_LOCK(vp);
loop:
	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, this pass is the first time through on a
		 * synchronous flush request and the buffer being considered
		 * is metadata, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		if ((skipmeta == 1 && bp->b_lblkno < 0))
			continue;
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		VI_UNLOCK(vp);
		if (!wait && !LIST_EMPTY(&bp->b_dep) &&
		    (bp->b_flags & B_DEFERRED) == 0 &&
		    buf_countdeps(bp, 0)) {
			bp->b_flags |= B_DEFERRED;
			BUF_UNLOCK(bp);
			VI_LOCK(vp);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * If this is a synchronous flush request, or it is not a
		 * file or device, start the write on this buffer immediately.
		 */
		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {

			/*
			 * On our final pass through, do all I/O synchronously
			 * so that we can find out if our flush is failing
			 * because of write errors.
			 */
			if (passes > 0 || !wait) {
				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
					(void) vfs_bio_awrite(bp);
				} else {
					bremfree(bp);
					splx(s);
					(void) bawrite(bp);
					s = splbio();
				}
			} else {
				bremfree(bp);
				splx(s);
				if ((error = bwrite(bp)) != 0)
					return (error);
				s = splbio();
			}
		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
			/*
			 * If the buffer is for data that has been truncated
			 * off the file, then throw it away.
			 */
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_NOCACHE;
			splx(s);
			brelse(bp);
			s = splbio();
		} else
			vfs_bio_awrite(bp);

		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		VI_LOCK(vp);
		nbp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd);
	}
	/*
	 * If we were asked to do this synchronously, then go back for
	 * another pass, this time doing the metadata.
	 */
	if (skipmeta) {
		skipmeta = 0;
		goto loop;
	}

	if (wait) {
		bufobj_wwait(&vp->v_bufobj, 3, 0);
		VI_UNLOCK(vp);

		/*
		 * Ensure that any filesystem metadata associated
		 * with the vnode has been written.
		 */
		splx(s);
		if ((error = softdep_sync_metadata(vp)) != 0)
			return (error);
		s = splbio();

		VI_LOCK(vp);
		if (vp->v_bufobj.bo_dirty.bv_cnt > 0) {
			/*
			 * Block devices associated with filesystems may
			 * have new I/O requests posted for them even if
			 * the vnode is locked, so no amount of trying will
			 * get them clean.  Thus we give block devices a
			 * good effort, then just give up.  For all other file
			 * types, go around and try again until it is clean.
			 */
			if (passes > 0) {
				passes -= 1;
				goto loop;
			}
#ifdef INVARIANTS
			if (!vn_isdisk(vp, NULL))
				vprint("ffs_fsync: dirty", vp);
#endif
		}
	}
	VI_UNLOCK(vp);
	splx(s);
	return (ffs_update(vp, wait));
}

static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
			/*
			 * vnode interlock must be held to ensure that
			 * the possibly external lock isn't freed,
			 * e.g. when mutating from snapshot file vnode
			 * to regular file vnode.
			 */
			if ((flags & LK_INTERLOCK) == 0) {
				VI_LOCK(vp);
				flags |= LK_INTERLOCK;
			}
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return error;
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

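/*
 * Worked example of the per-block transfer arithmetic above (illustrative
 * numbers only, assuming a 16K-block filesystem): for a read starting at
 * uio_offset 20000 with ample resid, lblkno() yields lbn 1, blkoff()
 * yields blkoffset 20000 - 16384 = 3616, and the iteration transfers
 * xfersize = 16384 - 3616 = 12768 bytes, i.e. the remainder of block 1;
 * the next iteration then starts block-aligned.
 */
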
/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td != NULL) {
		PROC_LOCK(td->td_proc);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
			psignal(td->td_proc, SIGXFSZ);
			PROC_UNLOCK(td->td_proc);
			return (EFBIG);
		}
		PROC_UNLOCK(td->td_proc);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	VM_OBJECT_LOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(mreq->object);
		return VM_PAGER_OK;
	}
	VM_OBJECT_UNLOCK(mreq->object);

	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage);
}

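/*
 * Note on the extended attribute helpers below: the UFS2 extended
 * attribute area is addressed with negative logical block numbers, so
 * ext-area block N is read through the buffer cache as lbn -1 - N
 * (block 0 -> -1, block 1 -> -2, ...), which is why ffsext_strategy()
 * checks for lbn < 0 && lbn >= -NXADDR.  A minimal sketch of the
 * mapping used by ffs_extread():
 *
 *	ext_lbn = -1 - lbn;	(forward)
 *	lbn = -1 - ext_lbn;	(inverse; the same expression)
 */
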
/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

	KASSERT(!(ip->i_flag & IN_SPACECOUNTED), ("inode %u: inode is dead",
	    ip->i_number));

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

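/*
 * For reference, each record in the extended attribute area has the
 * following layout, as derived from the parsing in ffs_findextattr()
 * above and the construction in ffs_setextattr() below (the field names
 * here are descriptive only, not an on-disk structure in this file):
 *
 *	uint32_t ul;		total record length, including all padding
 *	u_char	nspace;		attribute namespace
 *	u_char	contpad;	bytes of padding after the content (eapad2)
 *	u_char	nlen;		name length
 *	u_char	name[];		name, followed by eapad1 zero bytes so
 *				that the header is a multiple of 8 bytes
 *	u_char	content[];	ealen = ul - header - contpad data bytes
 */
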
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	if (ip->i_ea_area != NULL)
		return (EBUSY);
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error)
		return (error);
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);
	if (ip->i_ea_area == NULL)
		return (EINVAL);
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	free(ip->i_ea_area, M_TEMP);
	ip->i_ea_area = NULL;
	ip->i_ea_len = 0;
	ip->i_ea_error = 0;
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction begin.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

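/*
 * A hypothetical caller's view of the transaction protocol above (sketch
 * only; the VOP_* wrappers are the standard vnode-op entry points): a
 * series of EA operations can be batched under one open/close pair, with
 * the in-memory copy written back only on a committing close:
 *
 *	VOP_OPENEXTATTR(vp, cred, td);		      (ffs_open_ea)
 *	VOP_SETEXTATTR(vp, ns, "name", uio, cred, td); (edits ip->i_ea_area)
 *	VOP_CLOSEEXTATTR(vp, 1, cred, td);	      (commit: ffs_extwrite)
 *
 * The stand_alone path in the operations below covers callers that skip
 * the explicit open/close.
 */
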
/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	/* Deleting: the replacement record is empty. */
	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

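/*
 * Worked example of the record-size computation in ffs_setextattr()
 * below (illustrative numbers only): for name "foo" (nlen 3) the bare
 * header is sizeof(uint32_t) + 3 + 3 = 10 bytes, so eapad1 =
 * 8 - (10 % 8) = 6 pads the header to 16 bytes; a 5-byte value then
 * gets eapad2 = 8 - (5 % 8) = 3, giving a total record length ul of
 * 16 + 5 + 3 = 24, keeping every record 8-byte aligned.
 */
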
/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}

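/*
 * Sketch of the consumer side (assumed; the counterpart ffs_fhtovp()
 * lives in ffs_vfsops.c, not in this file): an NFS export takes the
 * ufid produced above and later turns it back into a vnode, using
 * ufid_gen to reject handles for inodes that have since been recycled:
 *
 *	VFS_FHTOVP(mp, (struct fid *)&ufh, &vp);
 */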