/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 * @(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;


/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

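/*
 * Note on the vector pairs above: the "1" and "2" suffixes select the
 * per-format operation tables.  Only the "2" vectors carry the extended
 * attribute operations, since those rely on the extended attribute area
 * that exists only in UFS2 inodes (see ffs_extread()/ffs_extwrite() below).
 */
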
/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	int error;

	error = ffs_syncvnode(ap->a_vp, ap->a_waitfor);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT &&
	    (ap->a_vp->v_mount->mnt_flag & MNT_SOFTDEP))
		error = softdep_fsync(ap->a_vp);
	return (error);
}

int
ffs_syncvnode(struct vnode *vp, int waitfor)
{
	struct inode *ip = VTOI(vp);
	struct buf *bp;
	struct buf *nbp;
	int s, error, wait, passes, skipmeta;
	ufs_lbn_t lbn;

	wait = (waitfor == MNT_WAIT);
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	passes = NIADDR + 1;
	skipmeta = 0;
	if (wait)
		skipmeta = 1;
	s = splbio();
	VI_LOCK(vp);
loop:
	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, this pass is the first time through on a
		 * synchronous flush request and the buffer being considered
		 * is metadata, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		if ((skipmeta == 1 && bp->b_lblkno < 0))
			continue;
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		VI_UNLOCK(vp);
		if (!wait && !LIST_EMPTY(&bp->b_dep) &&
		    (bp->b_flags & B_DEFERRED) == 0 &&
		    buf_countdeps(bp, 0)) {
			bp->b_flags |= B_DEFERRED;
			BUF_UNLOCK(bp);
			VI_LOCK(vp);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * If this is a synchronous flush request, or it is not a
		 * file or device, start the write on this buffer immediately.
		 */
		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {

			/*
			 * On our final pass through, do all I/O synchronously
			 * so that we can find out if our flush is failing
			 * because of write errors.
			 */
			if (passes > 0 || !wait) {
				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
					(void) vfs_bio_awrite(bp);
				} else {
					bremfree(bp);
					splx(s);
					(void) bawrite(bp);
					s = splbio();
				}
			} else {
				bremfree(bp);
				splx(s);
				if ((error = bwrite(bp)) != 0)
					return (error);
				s = splbio();
			}
		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
			/*
			 * If the buffer is for data that has been truncated
			 * off the file, then throw it away.
			 */
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_NOCACHE;
			splx(s);
			brelse(bp);
			s = splbio();
		} else
			vfs_bio_awrite(bp);

		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		VI_LOCK(vp);
		nbp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd);
	}
	/*
	 * If we were asked to do this synchronously, then go back for
	 * another pass, this time doing the metadata.
	 */
	if (skipmeta) {
		skipmeta = 0;
		goto loop;
	}

	if (wait) {
		bufobj_wwait(&vp->v_bufobj, 3, 0);
		VI_UNLOCK(vp);

		/*
		 * Ensure that any filesystem metadata associated
		 * with the vnode has been written.
		 */
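		/*
		 * Note: softdep_sync_metadata() may itself initiate
		 * additional writes while resolving dependency chains,
		 * which is why the dirty-buffer count is rechecked below
		 * and the flush loop may be retried.
		 */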
		splx(s);
		if ((error = softdep_sync_metadata(vp)) != 0)
			return (error);
		s = splbio();

		VI_LOCK(vp);
		if (vp->v_bufobj.bo_dirty.bv_cnt > 0) {
			/*
			 * Block devices associated with filesystems may
			 * have new I/O requests posted for them even if
			 * the vnode is locked, so no amount of trying will
			 * get them clean. Thus we give block devices a
			 * good effort, then just give up. For all other file
			 * types, go around and try again until it is clean.
			 */
			if (passes > 0) {
				passes -= 1;
				goto loop;
			}
#ifdef INVARIANTS
			if (!vn_isdisk(vp, NULL))
				vprint("ffs_fsync: dirty", vp);
#endif
		}
	}
	VI_UNLOCK(vp);
	splx(s);
	return (ffs_update(vp, wait));
}

static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
			/*
			 * vnode interlock must be held to ensure that
			 * the possibly external lock isn't freed,
			 * e.g. when mutating from snapshot file vnode
			 * to regular file vnode.
			 */
			if ((flags & LK_INTERLOCK) == 0) {
				VI_LOCK(vp);
				flags |= LK_INTERLOCK;
			}
			lkp = vp->v_vnlock;
			result = _lockmgr(lkp, flags, VI_MTX(vp), curthread,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept. The lock currently held is not the
			 * right lock. Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr(lkp, LK_RELEASE, VI_MTX(vp), curthread,
			    ap->a_file, ap->a_line);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

/*
 * Vnode op for reading.
 */
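/*
 * Overview of the read loop below: for each file system block touched by
 * the request, a read strategy is chosen (plain bread() at end of file,
 * cluster_read() when clustering is permitted, breadn() readahead when the
 * access pattern looks sequential), then the data is copied out of the
 * buffer with uiomove().
 */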
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error,
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Vnode op for writing.
 */
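/*
 * Overview of the loop below: each iteration allocates (or finds) the
 * buffer covering one file system block with UFS_BALLOC(), copies the
 * user data in with uiomove(), then pushes the buffer out synchronously,
 * asynchronously, clustered, or delayed, depending on IO_SYNC, memory
 * pressure, and whether a full block was written.
 */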
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td != NULL) {
		PROC_LOCK(td->td_proc);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
			psignal(td->td_proc, SIGXFSZ);
			PROC_UNLOCK(td->td_proc);
			return (EFBIG);
		}
		PROC_UNLOCK(td->td_proc);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	VM_OBJECT_LOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(mreq->object);
		return (VM_PAGER_OK);
	}
	VM_OBJECT_UNLOCK(mreq->object);

	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count,
	    ap->a_reqpage));
}


/*
 * Extended attribute area reading.
 */
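/*
 * The UFS2 extended attribute area is a small per-inode region addressed
 * with negative logical block numbers (-1 - lbn, as used below); its size
 * is kept in di_extsize and is limited to NXADDR * fs_bsize bytes.
 */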
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");

#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error,
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

	KASSERT(!(ip->i_flag & IN_SPACECOUNTED), ("inode %u: inode is dead",
	    ip->i_number));

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}


/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	if (ip->i_ea_area != NULL)
		return (EBUSY);
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error)
		return (error);
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);
	if (ip->i_ea_area == NULL)
		return (EINVAL);
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	free(ip->i_ea_area, M_TEMP);
	ip->i_ea_area = NULL;
	ip->i_ea_len = 0;
	ip->i_ea_error = 0;
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction begin.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}


/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
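/*
 * Layout of an extended attribute record in the EA area, as walked by
 * ffs_findextattr() above and constructed by ffs_setextattr() below:
 * a 4-byte total record length, a 1-byte namespace, a 1-byte content pad
 * length (eapad2), a 1-byte name length, the name itself, zero padding to
 * an 8-byte boundary (eapad1), then the content followed by eapad2 bytes
 * of trailing pad.
 */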
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
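/*
 * Note on the stand_alone pattern used by the attribute operations below
 * (and by ffs_deleteextattr() above): if no EA transaction is already open
 * on the inode, the operation opens one itself, works against the
 * in-memory copy of the EA area, and then commits or aborts it; otherwise
 * it runs within the caller's open transaction and records any failure in
 * i_ea_error for the eventual close.
 */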
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
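/*
 * Record sizing in the routine below: the fixed header is
 * sizeof(uint32_t) + 3 bytes (namespace, pad length, name length) plus the
 * name, padded to an 8-byte boundary (eapad1); the content is likewise
 * padded (eapad2).  As a worked example with a hypothetical 9-byte name
 * and 5 bytes of content: header = 4 + 3 + 9 = 16, so eapad1 = 0;
 * eapad2 = 8 - (5 % 8) = 3; total record length = 16 + 0 + 5 + 3 = 24.
 */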
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}