/*
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static int	ffs_fsync(struct vop_fsync_args *);
static int	ffs_getpages(struct vop_getpages_args *);
static int	ffs_read(struct vop_read_args *);
static int	ffs_write(struct vop_write_args *);
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static int	ffsext_strategy(struct vop_strategy_args *);
static int	ffs_closeextattr(struct vop_closeextattr_args *);
static int	ffs_getextattr(struct vop_getextattr_args *);
static int	ffs_openextattr(struct vop_openextattr_args *);
static int	ffs_setextattr(struct vop_setextattr_args *);


/* Global vfs data structures for ufs. */
vop_t **ffs_vnodeop_p;
static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
	{ &vop_default_desc, (vop_t *) ufs_vnoperate },
	{ &vop_fsync_desc, (vop_t *) ffs_fsync },
	{ &vop_getpages_desc, (vop_t *) ffs_getpages },
	{ &vop_read_desc, (vop_t *) ffs_read },
	{ &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
	{ &vop_write_desc, (vop_t *) ffs_write },
	{ &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
	{ &vop_getextattr_desc, (vop_t *) ffs_getextattr },
	{ &vop_openextattr_desc, (vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc, (vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_vnodeop_opv_desc =
	{ &ffs_vnodeop_p, ffs_vnodeop_entries };

vop_t **ffs_specop_p;
static struct vnodeopv_entry_desc ffs_specop_entries[] = {
	{ &vop_default_desc, (vop_t *) ufs_vnoperatespec },
	{ &vop_fsync_desc, (vop_t *) ffs_fsync },
	{ &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
	{ &vop_strategy_desc, (vop_t *) ffsext_strategy },
	{ &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
	{ &vop_getextattr_desc, (vop_t *) ffs_getextattr },
	{ &vop_openextattr_desc, (vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc, (vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_specop_opv_desc =
	{ &ffs_specop_p, ffs_specop_entries };

vop_t **ffs_fifoop_p;
static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
	{ &vop_default_desc, (vop_t *) ufs_vnoperatefifo },
	{ &vop_fsync_desc, (vop_t *) ffs_fsync },
	{ &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
	{ &vop_strategy_desc, (vop_t *) ffsext_strategy },
	{ &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
	{ &vop_getextattr_desc, (vop_t *) ffs_getextattr },
	{ &vop_openextattr_desc, (vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc, (vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_fifoop_opv_desc =
	{ &ffs_fifoop_p, ffs_fifoop_entries };

VNODEOP_SET(ffs_vnodeop_opv_desc);
VNODEOP_SET(ffs_specop_opv_desc);
VNODEOP_SET(ffs_fifoop_opv_desc);
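
/*
 * Three operation vectors are registered above: one for regular FFS
 * vnodes, one for special devices, and one for fifos.  Each starts from
 * the corresponding generic UFS handler (the vop_default_desc entry)
 * and overrides only what FFS provides itself: fsync, paging and
 * read/write for regular files, block reallocation, the strategy
 * routine used for the extended attribute area, and the extended
 * attribute operations.
 */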

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct thread *a_td;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct buf *bp;
	struct buf *nbp;
	int s, error, wait, passes, skipmeta;
	ufs_lbn_t lbn;

	wait = (ap->a_waitfor == MNT_WAIT);
	if (vn_isdisk(vp, NULL)) {
		lbn = INT_MAX;
		if (vp->v_rdev->si_mountpoint != NULL &&
		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
			softdep_fsync_mountdev(vp);
	} else {
		lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	}

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	passes = NIADDR + 1;
	skipmeta = 0;
	if (wait)
		skipmeta = 1;
	s = splbio();
	VI_LOCK(vp);
loop:
	TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
		bp->b_vflags &= ~BV_SCANNED;
	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = TAILQ_NEXT(bp, b_vnbufs);
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, this pass is the first time through on a
		 * synchronous flush request and the buffer being considered
		 * is metadata, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		if ((skipmeta == 1 && bp->b_lblkno < 0))
			continue;
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
		    (bp->b_flags & B_DEFERRED) == 0 &&
		    buf_countdeps(bp, 0)) {
			bp->b_flags |= B_DEFERRED;
			BUF_UNLOCK(bp);
			continue;
		}
		VI_UNLOCK(vp);
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		if (vp != bp->b_vp)
			panic("ffs_fsync: vp != vp->b_vp");
		/*
		 * If this is a synchronous flush request, or it is not a
		 * file or device, start the write on this buffer immediately.
		 */
		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {

			/*
			 * On our final pass through, do all I/O synchronously
			 * so that we can find out if our flush is failing
			 * because of write errors.
			 */
			if (passes > 0 || !wait) {
				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
					(void) vfs_bio_awrite(bp);
				} else {
					bremfree(bp);
					splx(s);
					(void) bawrite(bp);
					s = splbio();
				}
			} else {
				bremfree(bp);
				splx(s);
				if ((error = bwrite(bp)) != 0)
					return (error);
				s = splbio();
			}
		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
			/*
			 * If the buffer is for data that has been truncated
			 * off the file, then throw it away.
			 */
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_NOCACHE;
			splx(s);
			brelse(bp);
			s = splbio();
		} else
			vfs_bio_awrite(bp);

		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		VI_LOCK(vp);
		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
	}
	/*
	 * If we were asked to do this synchronously, then go back for
	 * another pass, this time doing the metadata.
	 */
	if (skipmeta) {
		skipmeta = 0;
		goto loop;
	}

	if (wait) {
		while (vp->v_numoutput) {
			vp->v_iflag |= VI_BWAIT;
			msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
			    PRIBIO + 4, "ffsfsn", 0);
		}
		VI_UNLOCK(vp);

		/*
		 * Ensure that any filesystem metadata associated
		 * with the vnode has been written.
		 */
		splx(s);
		if ((error = softdep_sync_metadata(ap)) != 0)
			return (error);
		s = splbio();

		VI_LOCK(vp);
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			/*
			 * Block devices associated with filesystems may
			 * have new I/O requests posted for them even if
			 * the vnode is locked, so no amount of trying will
			 * get them clean. Thus we give block devices a
			 * good effort, then just give up. For all other file
			 * types, go around and try again until it is clean.
			 */
			if (passes > 0) {
				passes -= 1;
				goto loop;
			}
#ifdef DIAGNOSTIC
			if (!vn_isdisk(vp, NULL))
				vprint("ffs_fsync: dirty", vp);
#endif
		}
	}
	VI_UNLOCK(vp);
	splx(s);
	return (UFS_UPDATE(vp, wait));
}

/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	mode_t mode;
	int seqcount;
	int ioflag;
	vm_object_t object;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return error;
	}
#endif

	GIANT_REQUIRED;

	seqcount = ap->a_ioflag >> 16;
	ip = VTOI(vp);
	mode = ip->i_mode;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	fs = ip->i_fs;
	if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
		return (EFBIG);

	orig_resid = uio->uio_resid;
	if (orig_resid <= 0)
		return (0);

	object = vp->v_object;

	bytesinfile = ip->i_size - uio->uio_offset;
	if (bytesinfile <= 0) {
		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
			ip->i_flag |= IN_ACCESS;
		return 0;
	}

	if (object) {
		vm_object_reference(object);
	}

	/*
	 * Ok so we couldn't do it all in one vm trick...
	 * so cycle around trying smaller bites..
	 */
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;

		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, uio->uio_resid, seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		{
			/*
			 * otherwise use the general form
			 */
			error =
			    uiomove((char *)bp->b_data + blkoffset,
			    (int)xfersize, uio);
		}

		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing. The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have come
	 * from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if (object) {
		VM_OBJECT_LOCK(object);
		vm_object_vndeallocate(object);
	}
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
	vm_object_t object;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	GIANT_REQUIRED;

	extended = 0;
	seqcount = ap->a_ioflag >> 16;
	ip = VTOI(vp);

	object = vp->v_object;
	if (object) {
		vm_object_reference(object);
	}

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("ffswrite: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
			if (object) {
				VM_OBJECT_LOCK(object);
				vm_object_vndeallocate(object);
			}
			return (EPERM);
		}
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffswrite: dir write");
		break;
	default:
		panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	fs = ip->i_fs;
	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
		if (object) {
			VM_OBJECT_LOCK(object);
			vm_object_vndeallocate(object);
		}
		return (EFBIG);
	}
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td &&
	    uio->uio_offset + uio->uio_resid >
	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		PROC_LOCK(td->td_proc);
		psignal(td->td_proc, SIGXFSZ);
		PROC_UNLOCK(td->td_proc);
		if (object) {
			VM_OBJECT_LOCK(object);
			vm_object_vndeallocate(object);
		}
		return (EFBIG);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if (ioflag & IO_NOWDRAIN)
			bp->b_flags |= B_NOWDRAIN;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP(ip, i_size) = ip->i_size;
			extended = 1;
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ap->a_cred &&
	    suser_cred(ap->a_cred, PRISON_ROOT)) {
		ip->i_mode &= ~(ISUID | ISGID);
		DIP(ip, i_mode) = ip->i_mode;
	}
	if (resid > uio->uio_resid)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);

	if (object) {
		VM_OBJECT_LOCK(object);
		vm_object_vndeallocate(object);
	}

	return (error);
}
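
/*
 * In outline, the getpages routine below maps the requested page's file
 * offset to a filesystem block with ufs_bmaparray(), frees any pages in
 * the request that are not physically contiguous with it on disk, and
 * hands the remaining run to the device vnode's VOP_GETPAGES() at the
 * corresponding physical offset.  Holes are returned as zero-filled
 * pages without any device I/O.
 */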

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	off_t foff, physoffset;
	int i, size, bsize;
	struct vnode *dp, *vp;
	vm_object_t obj;
	vm_pindex_t pindex, firstindex;
	vm_page_t mreq;
	int bbackwards, bforwards;
	int pbackwards, pforwards;
	int firstpage;
	ufs2_daddr_t reqblkno, reqlblkno;
	int poff;
	int pcount;
	int rtval;
	int pagesperblock;

	GIANT_REQUIRED;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];
	firstindex = ap->a_m[0]->pindex;

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		vm_page_unlock_queues();
		return VM_PAGER_OK;
	}

	vp = ap->a_vp;
	obj = vp->v_object;
	bsize = vp->v_mount->mnt_stat.f_iosize;
	pindex = mreq->pindex;
	foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;

	if (bsize < PAGE_SIZE)
		return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
		    ap->a_count,
		    ap->a_reqpage);

	/*
	 * foff is the file offset of the required page
	 * reqlblkno is the logical block that contains the page
	 * poff is the index of the page into the logical block
	 */
	reqlblkno = foff / bsize;
	poff = (foff % bsize) / PAGE_SIZE;

	dp = VTOI(vp)->i_devvp;
	if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
	    || (reqblkno == -1)) {
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage)
				vm_page_free(ap->a_m[i]);
		}
		vm_page_unlock_queues();
		if (reqblkno == -1) {
			if ((mreq->flags & PG_ZERO) == 0)
				pmap_zero_page(mreq);
			vm_page_undirty(mreq);
			mreq->valid = VM_PAGE_BITS_ALL;
			return VM_PAGER_OK;
		} else {
			return VM_PAGER_ERROR;
		}
	}

	physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
	pagesperblock = bsize / PAGE_SIZE;
	/*
	 * find the first page that is contiguous...
	 * note that pbackwards is the number of pages that are contiguous
	 * backwards.
	 */
	firstpage = 0;
	if (ap->a_count) {
		pbackwards = poff + bbackwards * pagesperblock;
		if (ap->a_reqpage > pbackwards) {
			firstpage = ap->a_reqpage - pbackwards;
			vm_page_lock_queues();
			for (i = 0; i < firstpage; i++)
				vm_page_free(ap->a_m[i]);
			vm_page_unlock_queues();
		}

		/*
		 * pforwards is the number of pages that are contiguous
		 * after the current page.
		 */
		pforwards = (pagesperblock - (poff + 1)) +
		    bforwards * pagesperblock;
		if (pforwards < (pcount - (ap->a_reqpage + 1))) {
			vm_page_lock_queues();
			for (i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
				vm_page_free(ap->a_m[i]);
			vm_page_unlock_queues();
			pcount = ap->a_reqpage + pforwards + 1;
		}

		/*
		 * number of pages for I/O corrected for the non-contig pages at
		 * the beginning of the array.
		 */
		pcount -= firstpage;
	}

	/*
	 * calculate the size of the transfer
	 */

	size = pcount * PAGE_SIZE;

	if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
	    obj->un_pager.vnp.vnp_size)
		size = obj->un_pager.vnp.vnp_size -
		    IDX_TO_OFF(ap->a_m[firstpage]->pindex);

	physoffset -= foff;
	rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
	    (ap->a_reqpage - firstpage), physoffset);

	return (rtval);
}
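
/*
 * The extended attribute area of a UFS2 inode is backed by up to NXADDR
 * filesystem blocks of its own and is addressed through negative
 * logical block numbers (-1 - lbn), which is how the routines below
 * reach it without disturbing the regular data blocks; its current
 * length is kept in di_extsize.
 */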

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	mode_t mode;

	GIANT_REQUIRED;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	mode = ip->i_mode;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");

#endif
	orig_resid = uio->uio_resid;
	if (orig_resid <= 0)
		return (0);

	bytesinfile = dp->di_extsize - uio->uio_offset;
	if (bytesinfile <= 0) {
		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
			ip->i_flag |= IN_ACCESS;
		return 0;
	}

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;

		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing. The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have come
	 * from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	GIANT_REQUIRED;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ext_write: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;

	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if (ioflag & IO_NOWDRAIN)
			bp->b_flags |= B_NOWDRAIN;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ucred &&
	    suser_cred(ucred, PRISON_ROOT)) {
		ip->i_mode &= ~(ISUID | ISGID);
		dp->di_mode = ip->i_mode;
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);
	return (error);
}


/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
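
/*
 * Layout of each record in the extended attribute area, as implied by
 * ffs_findextattr() below and by ffs_setextattr(): a 32-bit total
 * record length, a one-byte namespace, a one-byte count of padding
 * after the data (eapad2), a one-byte name length, the name, enough
 * zero padding (eapad1) to bring the header and name to an 8-byte
 * boundary, and finally the attribute data followed by the eapad2
 * bytes of padding.
 */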
static int
ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return(-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct fs *fs;
	struct ufs2_dinode *dp;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return(error);
	}
	*p = eae;
	return (0);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct fs *fs;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);
	fs = ip->i_fs;

	if (ip->i_ea_area != NULL)
		return (EBUSY);
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error)
		return (error);
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);
	fs = ip->i_fs;
	if (ip->i_ea_area == NULL)
		return (EINVAL);
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	free(ip->i_ea_area, M_TEMP);
	ip->i_ea_area = NULL;
	ip->i_ea_len = 0;
	ip->i_ea_error = 0;
	return (error);
}

/*
 * Vnode extattr strategy routine for special devices and fifos.
 *
 * We need to check for a read or write of the extended attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	if (vp->v_type == VFIFO)
		return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
	return (ufs_vnoperatespec((struct vop_generic_args *)ap));
}

/*
 * Vnode extattr transaction start.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}


/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}
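
/*
 * Note on the get/set routines below: the extended attribute area is
 * normally opened and closed around each call (the "stand_alone" case).
 * If a caller has already opened it with VOP_OPENEXTATTR(), the
 * in-memory copy in ip->i_ea_area is reused and left open, and any
 * error is latched in ip->i_ea_error for the eventual close.
 */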

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	struct ufs2_dinode *dp;
	unsigned easize;
	uint32_t ul;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	dp = ip->i_din2;
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;
	if (strlen(ap->a_name) > 0) {
		ealen = ffs_findextattr(eae, easize,
		    ap->a_attrnamespace, ap->a_name, NULL, &p);
		if (ealen >= 0) {
			error = 0;
			if (ap->a_size != NULL)
				*ap->a_size = ealen;
			else if (ap->a_uio != NULL)
				error = uiomove(p, ealen, ap->a_uio);
		} else {
			error = ENOATTR;
		}
	} else {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = 0;
		pe = eae + easize;
		for (p = eae; error == 0 && p < pe; p = pn) {
			bcopy(p, &ul, sizeof(ul));
			pn = p + ul;
			if (pn > pe)
				break;
			p += sizeof(ul);
			if (*p++ != ap->a_attrnamespace)
				continue;
			p++;	/* pad2 */
			ealen = *p;
			if (ap->a_size != NULL) {
				*ap->a_size += ealen + 1;
			} else if (ap->a_uio != NULL) {
				error = uiomove(p, ealen + 1, ap->a_uio);
			}
		}
	}
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return(error);
}
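
/*
 * A NULL a_uio below means "delete the attribute".  The update is
 * staged in a malloc()ed copy of the attribute area and only swapped
 * into ip->i_ea_area once it has been assembled successfully, so a
 * failure part way through leaves the original area untouched.
 */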

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eacont, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	struct ufs2_dinode *dp;
	struct ucred *cred;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ap->a_cred != NOCRED)
		cred = ap->a_cred;
	else
		cred = ap->a_vp->v_mount->mnt_cred;

	dp = ip->i_din2;

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	/* Calculate the length of the EA entry */
	if (ap->a_uio == NULL) {
		/* delete */
		ealength = eapad1 = ealen = eapad2 = eacont = 0;
	} else {
		ealen = ap->a_uio->uio_resid;
		ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		eacont = ealength + eapad1;
		eapad2 = 8 - (ealen % 8);
		if (eapad2 == 8)
			eapad2 = 0;
		ealength += eapad1 + ealen + eapad2;
	}

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1 && ealength == 0) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return(ENOATTR);
	}
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return(ENOSPC);
	}
	if (ealength != 0) {
		bcopy(&ealength, p, sizeof(ealength));
		p += sizeof(ealength);
		*p++ = ap->a_attrnamespace;
		*p++ = eapad2;
		*p++ = strlen(ap->a_name);
		strcpy(p, ap->a_name);
		p += strlen(ap->a_name);
		bzero(p, eapad1);
		p += eapad1;
		error = uiomove(p, ealen, ap->a_uio);
		if (error) {
			free(eae, M_TEMP);
			if (stand_alone)
				ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
			else if (ip->i_ea_error == 0)
				ip->i_ea_error = error;
			return(error);
		}
		p += ealen;
		bzero(p, eapad2);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return(error);
}