1 /* 2 * Copyright (c) 2002 Networks Associates Technology, Inc. 3 * All rights reserved. 4 * 5 * This software was developed for the FreeBSD Project by Marshall 6 * Kirk McKusick and Network Associates Laboratories, the Security 7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 9 * research program 10 * 11 * Copyright (c) 1982, 1986, 1989, 1993 12 * The Regents of the University of California. All rights reserved. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. All advertising materials mentioning features or use of this software 23 * must display the following acknowledgement: 24 * This product includes software developed by the University of 25 * California, Berkeley and its contributors. 26 * 4. Neither the name of the University nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 40 * SUCH DAMAGE. 
41 * 42 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 43 * $FreeBSD$ 44 */ 45 46 #include <sys/param.h> 47 #include <sys/bio.h> 48 #include <sys/systm.h> 49 #include <sys/buf.h> 50 #include <sys/conf.h> 51 #include <sys/extattr.h> 52 #include <sys/kernel.h> 53 #include <sys/malloc.h> 54 #include <sys/mount.h> 55 #include <sys/proc.h> 56 #include <sys/resourcevar.h> 57 #include <sys/signalvar.h> 58 #include <sys/stat.h> 59 #include <sys/vmmeter.h> 60 #include <sys/vnode.h> 61 62 #include <machine/limits.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_extern.h> 66 #include <vm/vm_object.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_pager.h> 69 #include <vm/vnode_pager.h> 70 71 #include <ufs/ufs/extattr.h> 72 #include <ufs/ufs/quota.h> 73 #include <ufs/ufs/inode.h> 74 #include <ufs/ufs/ufs_extern.h> 75 #include <ufs/ufs/ufsmount.h> 76 77 #include <ufs/ffs/fs.h> 78 #include <ufs/ffs/ffs_extern.h> 79 80 static int ffs_fsync(struct vop_fsync_args *); 81 static int ffs_getpages(struct vop_getpages_args *); 82 static int ffs_read(struct vop_read_args *); 83 static int ffs_write(struct vop_write_args *); 84 static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); 85 static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, 86 struct ucred *cred); 87 static int ffsext_strategy(struct vop_strategy_args *); 88 static int ffs_closeextattr(struct vop_closeextattr_args *); 89 static int ffs_getextattr(struct vop_getextattr_args *); 90 static int ffs_openextattr(struct vop_openextattr_args *); 91 static int ffs_setextattr(struct vop_setextattr_args *); 92 93 94 /* Global vfs data structures for ufs. */ 95 vop_t **ffs_vnodeop_p; 96 static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = { 97 { &vop_default_desc, (vop_t *) ufs_vnoperate }, 98 { &vop_fsync_desc, (vop_t *) ffs_fsync }, 99 { &vop_getpages_desc, (vop_t *) ffs_getpages }, 100 { &vop_read_desc, (vop_t *) ffs_read }, 101 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks }, 102 { &vop_write_desc, (vop_t *) ffs_write }, 103 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr }, 104 { &vop_getextattr_desc, (vop_t *) ffs_getextattr }, 105 { &vop_openextattr_desc, (vop_t *) ffs_openextattr }, 106 { &vop_setextattr_desc, (vop_t *) ffs_setextattr }, 107 { NULL, NULL } 108 }; 109 static struct vnodeopv_desc ffs_vnodeop_opv_desc = 110 { &ffs_vnodeop_p, ffs_vnodeop_entries }; 111 112 vop_t **ffs_specop_p; 113 static struct vnodeopv_entry_desc ffs_specop_entries[] = { 114 { &vop_default_desc, (vop_t *) ufs_vnoperatespec }, 115 { &vop_fsync_desc, (vop_t *) ffs_fsync }, 116 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks }, 117 { &vop_strategy_desc, (vop_t *) ffsext_strategy }, 118 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr }, 119 { &vop_getextattr_desc, (vop_t *) ffs_getextattr }, 120 { &vop_openextattr_desc, (vop_t *) ffs_openextattr }, 121 { &vop_setextattr_desc, (vop_t *) ffs_setextattr }, 122 { NULL, NULL } 123 }; 124 static struct vnodeopv_desc ffs_specop_opv_desc = 125 { &ffs_specop_p, ffs_specop_entries }; 126 127 vop_t **ffs_fifoop_p; 128 static struct vnodeopv_entry_desc ffs_fifoop_entries[] = { 129 { &vop_default_desc, (vop_t *) ufs_vnoperatefifo }, 130 { &vop_fsync_desc, (vop_t *) ffs_fsync }, 131 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks }, 132 { &vop_strategy_desc, (vop_t *) ffsext_strategy }, 133 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr }, 134 { &vop_getextattr_desc, (vop_t *) ffs_getextattr }, 135 { &vop_openextattr_desc, (vop_t *) ffs_openextattr }, 136 { &vop_setextattr_desc, 
(vop_t *) ffs_setextattr }, 137 { NULL, NULL } 138 }; 139 static struct vnodeopv_desc ffs_fifoop_opv_desc = 140 { &ffs_fifoop_p, ffs_fifoop_entries }; 141 142 VNODEOP_SET(ffs_vnodeop_opv_desc); 143 VNODEOP_SET(ffs_specop_opv_desc); 144 VNODEOP_SET(ffs_fifoop_opv_desc); 145 146 /* 147 * Synch an open file. 148 */ 149 /* ARGSUSED */ 150 static int 151 ffs_fsync(ap) 152 struct vop_fsync_args /* { 153 struct vnode *a_vp; 154 struct ucred *a_cred; 155 int a_waitfor; 156 struct thread *a_td; 157 } */ *ap; 158 { 159 struct vnode *vp = ap->a_vp; 160 struct inode *ip = VTOI(vp); 161 struct buf *bp; 162 struct buf *nbp; 163 int s, error, wait, passes, skipmeta; 164 ufs_lbn_t lbn; 165 166 wait = (ap->a_waitfor == MNT_WAIT); 167 if (vn_isdisk(vp, NULL)) { 168 lbn = INT_MAX; 169 if (vp->v_rdev->si_mountpoint != NULL && 170 (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP)) 171 softdep_fsync_mountdev(vp); 172 } else { 173 lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); 174 } 175 176 /* 177 * Flush all dirty buffers associated with a vnode. 178 */ 179 passes = NIADDR + 1; 180 skipmeta = 0; 181 if (wait) 182 skipmeta = 1; 183 s = splbio(); 184 VI_LOCK(vp); 185 loop: 186 TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) 187 bp->b_vflags &= ~BV_SCANNED; 188 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 189 nbp = TAILQ_NEXT(bp, b_vnbufs); 190 /* 191 * Reasons to skip this buffer: it has already been considered 192 * on this pass, this pass is the first time through on a 193 * synchronous flush request and the buffer being considered 194 * is metadata, the buffer has dependencies that will cause 195 * it to be redirtied and it has not already been deferred, 196 * or it is already being written. 197 */ 198 if ((bp->b_vflags & BV_SCANNED) != 0) 199 continue; 200 bp->b_vflags |= BV_SCANNED; 201 if ((skipmeta == 1 && bp->b_lblkno < 0)) 202 continue; 203 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 204 continue; 205 if (!wait && LIST_FIRST(&bp->b_dep) != NULL && 206 (bp->b_flags & B_DEFERRED) == 0 && 207 buf_countdeps(bp, 0)) { 208 bp->b_flags |= B_DEFERRED; 209 BUF_UNLOCK(bp); 210 continue; 211 } 212 VI_UNLOCK(vp); 213 if ((bp->b_flags & B_DELWRI) == 0) 214 panic("ffs_fsync: not dirty"); 215 if (vp != bp->b_vp) 216 panic("ffs_fsync: vp != vp->b_vp"); 217 /* 218 * If this is a synchronous flush request, or it is not a 219 * file or device, start the write on this buffer immediatly. 220 */ 221 if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) { 222 223 /* 224 * On our final pass through, do all I/O synchronously 225 * so that we can find out if our flush is failing 226 * because of write errors. 227 */ 228 if (passes > 0 || !wait) { 229 if ((bp->b_flags & B_CLUSTEROK) && !wait) { 230 BUF_UNLOCK(bp); 231 (void) vfs_bio_awrite(bp); 232 } else { 233 bremfree(bp); 234 splx(s); 235 (void) bawrite(bp); 236 s = splbio(); 237 } 238 } else { 239 bremfree(bp); 240 splx(s); 241 if ((error = bwrite(bp)) != 0) 242 return (error); 243 s = splbio(); 244 } 245 } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) { 246 /* 247 * If the buffer is for data that has been truncated 248 * off the file, then throw it away. 249 */ 250 bremfree(bp); 251 bp->b_flags |= B_INVAL | B_NOCACHE; 252 splx(s); 253 brelse(bp); 254 s = splbio(); 255 } else { 256 BUF_UNLOCK(bp); 257 vfs_bio_awrite(bp); 258 } 259 /* 260 * Since we may have slept during the I/O, we need 261 * to start from a known point. 
262 */ 263 VI_LOCK(vp); 264 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd); 265 } 266 /* 267 * If we were asked to do this synchronously, then go back for 268 * another pass, this time doing the metadata. 269 */ 270 if (skipmeta) { 271 skipmeta = 0; 272 goto loop; 273 } 274 275 if (wait) { 276 while (vp->v_numoutput) { 277 vp->v_iflag |= VI_BWAIT; 278 msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp), 279 PRIBIO + 4, "ffsfsn", 0); 280 } 281 VI_UNLOCK(vp); 282 283 /* 284 * Ensure that any filesystem metatdata associated 285 * with the vnode has been written. 286 */ 287 splx(s); 288 if ((error = softdep_sync_metadata(ap)) != 0) 289 return (error); 290 s = splbio(); 291 292 VI_LOCK(vp); 293 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 294 /* 295 * Block devices associated with filesystems may 296 * have new I/O requests posted for them even if 297 * the vnode is locked, so no amount of trying will 298 * get them clean. Thus we give block devices a 299 * good effort, then just give up. For all other file 300 * types, go around and try again until it is clean. 301 */ 302 if (passes > 0) { 303 passes -= 1; 304 goto loop; 305 } 306 #ifdef DIAGNOSTIC 307 if (!vn_isdisk(vp, NULL)) 308 vprint("ffs_fsync: dirty", vp); 309 #endif 310 } 311 } 312 VI_UNLOCK(vp); 313 splx(s); 314 return (UFS_UPDATE(vp, wait)); 315 } 316 317 318 /* 319 * Vnode op for reading. 320 */ 321 /* ARGSUSED */ 322 static int 323 ffs_read(ap) 324 struct vop_read_args /* { 325 struct vnode *a_vp; 326 struct uio *a_uio; 327 int a_ioflag; 328 struct ucred *a_cred; 329 } */ *ap; 330 { 331 struct vnode *vp; 332 struct inode *ip; 333 struct uio *uio; 334 struct fs *fs; 335 struct buf *bp; 336 ufs_lbn_t lbn, nextlbn; 337 off_t bytesinfile; 338 long size, xfersize, blkoffset; 339 int error, orig_resid; 340 mode_t mode; 341 int seqcount; 342 int ioflag; 343 vm_object_t object; 344 345 vp = ap->a_vp; 346 uio = ap->a_uio; 347 ioflag = ap->a_ioflag; 348 if (ap->a_ioflag & IO_EXT) 349 #ifdef notyet 350 return (ffs_extread(vp, uio, ioflag)); 351 #else 352 panic("ffs_read+IO_EXT"); 353 #endif 354 355 GIANT_REQUIRED; 356 357 seqcount = ap->a_ioflag >> 16; 358 ip = VTOI(vp); 359 mode = ip->i_mode; 360 361 #ifdef DIAGNOSTIC 362 if (uio->uio_rw != UIO_READ) 363 panic("ffs_read: mode"); 364 365 if (vp->v_type == VLNK) { 366 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) 367 panic("ffs_read: short symlink"); 368 } else if (vp->v_type != VREG && vp->v_type != VDIR) 369 panic("ffs_read: type %d", vp->v_type); 370 #endif 371 fs = ip->i_fs; 372 if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize) 373 return (EFBIG); 374 375 orig_resid = uio->uio_resid; 376 if (orig_resid <= 0) 377 return (0); 378 379 object = vp->v_object; 380 381 bytesinfile = ip->i_size - uio->uio_offset; 382 if (bytesinfile <= 0) { 383 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) 384 ip->i_flag |= IN_ACCESS; 385 return 0; 386 } 387 388 if (object) { 389 vm_object_reference(object); 390 } 391 392 #ifdef ENABLE_VFS_IOOPT 393 /* 394 * If IO optimisation is turned on, 395 * and we are NOT a VM based IO request, 396 * (i.e. not headed for the buffer cache) 397 * but there IS a vm object associated with it. 
398 */ 399 if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) { 400 int nread, toread; 401 402 toread = uio->uio_resid; 403 if (toread > bytesinfile) 404 toread = bytesinfile; 405 if (toread >= PAGE_SIZE) { 406 /* 407 * Then if it's at least a page in size, try 408 * get the data from the object using vm tricks 409 */ 410 error = uioread(toread, uio, object, &nread); 411 if ((uio->uio_resid == 0) || (error != 0)) { 412 /* 413 * If we finished or there was an error 414 * then finish up (the reference previously 415 * obtained on object must be released). 416 */ 417 if ((error == 0 || 418 uio->uio_resid != orig_resid) && 419 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) 420 ip->i_flag |= IN_ACCESS; 421 422 if (object) { 423 vm_object_vndeallocate(object); 424 } 425 return error; 426 } 427 } 428 } 429 #endif 430 431 /* 432 * Ok so we couldn't do it all in one vm trick... 433 * so cycle around trying smaller bites.. 434 */ 435 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 436 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) 437 break; 438 #ifdef ENABLE_VFS_IOOPT 439 if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) { 440 /* 441 * Obviously we didn't finish above, but we 442 * didn't get an error either. Try the same trick again. 443 * but this time we are looping. 444 */ 445 int nread, toread; 446 toread = uio->uio_resid; 447 if (toread > bytesinfile) 448 toread = bytesinfile; 449 450 /* 451 * Once again, if there isn't enough for a 452 * whole page, don't try optimising. 453 */ 454 if (toread >= PAGE_SIZE) { 455 error = uioread(toread, uio, object, &nread); 456 if ((uio->uio_resid == 0) || (error != 0)) { 457 /* 458 * If we finished or there was an 459 * error then finish up (the reference 460 * previously obtained on object must 461 * be released). 462 */ 463 if ((error == 0 || 464 uio->uio_resid != orig_resid) && 465 (vp->v_mount->mnt_flag & 466 MNT_NOATIME) == 0) 467 ip->i_flag |= IN_ACCESS; 468 if (object) { 469 vm_object_vndeallocate(object); 470 } 471 return error; 472 } 473 /* 474 * To get here we didnt't finish or err. 475 * If we did get some data, 476 * loop to try another bite. 477 */ 478 if (nread > 0) { 479 continue; 480 } 481 } 482 } 483 #endif 484 485 lbn = lblkno(fs, uio->uio_offset); 486 nextlbn = lbn + 1; 487 488 /* 489 * size of buffer. The buffer representing the 490 * end of the file is rounded up to the size of 491 * the block type ( fragment or full block, 492 * depending ). 493 */ 494 size = blksize(fs, ip, lbn); 495 blkoffset = blkoff(fs, uio->uio_offset); 496 497 /* 498 * The amount we want to transfer in this iteration is 499 * one FS block less the amount of the data before 500 * our startpoint (duh!) 501 */ 502 xfersize = fs->fs_bsize - blkoffset; 503 504 /* 505 * But if we actually want less than the block, 506 * or the file doesn't have a whole block more of data, 507 * then use the lesser number. 508 */ 509 if (uio->uio_resid < xfersize) 510 xfersize = uio->uio_resid; 511 if (bytesinfile < xfersize) 512 xfersize = bytesinfile; 513 514 if (lblktosize(fs, nextlbn) >= ip->i_size) { 515 /* 516 * Don't do readahead if this is the end of the file. 517 */ 518 error = bread(vp, lbn, size, NOCRED, &bp); 519 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { 520 /* 521 * Otherwise if we are allowed to cluster, 522 * grab as much as we can. 523 * 524 * XXX This may not be a win if we are not 525 * doing sequential access. 
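			 *
			 * The seqcount hint extracted above from the upper
			 * bits of a_ioflag is passed to cluster_read() here,
			 * and is also what gates the single-block readahead
			 * via breadn() in the non-clustered case below.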
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, uio->uio_resid, seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer. This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

#ifdef ENABLE_VFS_IOOPT
		if (vfs_ioopt && object &&
		    (bp->b_flags & B_VMIO) &&
		    ((blkoffset & PAGE_MASK) == 0) &&
		    ((xfersize & PAGE_MASK) == 0)) {
			/*
			 * If VFS IO optimisation is turned on,
			 * and it's an exact page multiple,
			 * and a normal VM based op,
			 * then use uiomoveco().
			 */
			error =
			    uiomoveco((char *)bp->b_data + blkoffset,
			    (int)xfersize, uio, object, 0);
		} else
#endif
		{
			/*
			 * Otherwise use the general form.
			 */
			error =
			    uiomove((char *)bp->b_data + blkoffset,
			    (int)xfersize, uio);
		}

		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing. The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal completion
	 * has not set a new value into it, so it must have come from a
	 * 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if (object) {
		vm_object_vndeallocate(object);
	}
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
	vm_object_t object;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	GIANT_REQUIRED;

	extended = 0;
	seqcount = ap->a_ioflag >> 16;
	ip = VTOI(vp);

	object = vp->v_object;
	if (object) {
		vm_object_reference(object);
	}

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("ffswrite: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
			if (object) {
				vm_object_vndeallocate(object);
			}
			return (EPERM);
		}
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffswrite: dir write");
		break;
	default:
		panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	fs = ip->i_fs;
	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
		if (object) {
			vm_object_vndeallocate(object);
		}
		return (EFBIG);
	}
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td &&
	    uio->uio_offset + uio->uio_resid >
	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		PROC_LOCK(td->td_proc);
		psignal(td->td_proc, SIGXFSZ);
		PROC_UNLOCK(td->td_proc);
		if (object) {
			vm_object_vndeallocate(object);
		}
		return (EFBIG);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

#ifdef ENABLE_VFS_IOOPT
	if (object && (object->flags & OBJ_OPT)) {
		vm_freeze_copyopts(object,
		    OFF_TO_IDX(uio->uio_offset),
		    OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK));
	}
#endif
	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
791 * If we do not, a failed uiomove() during a write can leave 792 * the prior contents of the pages exposed to a userland 793 * mmap(). XXX deal with uiomove() errors a better way. 794 */ 795 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) 796 vfs_bio_clrbuf(bp); 797 if (ioflag & IO_DIRECT) 798 bp->b_flags |= B_DIRECT; 799 if (ioflag & IO_NOWDRAIN) 800 bp->b_flags |= B_NOWDRAIN; 801 802 if (uio->uio_offset + xfersize > ip->i_size) { 803 ip->i_size = uio->uio_offset + xfersize; 804 DIP(ip, i_size) = ip->i_size; 805 extended = 1; 806 } 807 808 size = blksize(fs, ip, lbn) - bp->b_resid; 809 if (size < xfersize) 810 xfersize = size; 811 812 error = 813 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); 814 if ((ioflag & (IO_VMIO|IO_DIRECT)) && 815 (LIST_FIRST(&bp->b_dep) == NULL)) { 816 bp->b_flags |= B_RELBUF; 817 } 818 819 /* 820 * If IO_SYNC each buffer is written synchronously. Otherwise 821 * if we have a severe page deficiency write the buffer 822 * asynchronously. Otherwise try to cluster, and if that 823 * doesn't do it then either do an async write (if O_DIRECT), 824 * or a delayed write (if not). 825 */ 826 if (ioflag & IO_SYNC) { 827 (void)bwrite(bp); 828 } else if (vm_page_count_severe() || 829 buf_dirty_count_severe() || 830 (ioflag & IO_ASYNC)) { 831 bp->b_flags |= B_CLUSTEROK; 832 bawrite(bp); 833 } else if (xfersize + blkoffset == fs->fs_bsize) { 834 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { 835 bp->b_flags |= B_CLUSTEROK; 836 cluster_write(bp, ip->i_size, seqcount); 837 } else { 838 bawrite(bp); 839 } 840 } else if (ioflag & IO_DIRECT) { 841 bp->b_flags |= B_CLUSTEROK; 842 bawrite(bp); 843 } else { 844 bp->b_flags |= B_CLUSTEROK; 845 bdwrite(bp); 846 } 847 if (error || xfersize == 0) 848 break; 849 ip->i_flag |= IN_CHANGE | IN_UPDATE; 850 } 851 /* 852 * If we successfully wrote any data, and we are not the superuser 853 * we clear the setuid and setgid bits as a precaution against 854 * tampering. 855 */ 856 if (resid > uio->uio_resid && ap->a_cred && 857 suser_cred(ap->a_cred, PRISON_ROOT)) { 858 ip->i_mode &= ~(ISUID | ISGID); 859 DIP(ip, i_mode) = ip->i_mode; 860 } 861 if (resid > uio->uio_resid) 862 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0)); 863 if (error) { 864 if (ioflag & IO_UNIT) { 865 (void)UFS_TRUNCATE(vp, osize, 866 IO_NORMAL | (ioflag & IO_SYNC), 867 ap->a_cred, uio->uio_td); 868 uio->uio_offset -= resid - uio->uio_resid; 869 uio->uio_resid = resid; 870 } 871 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 872 error = UFS_UPDATE(vp, 1); 873 874 if (object) { 875 vm_object_vndeallocate(object); 876 } 877 878 return (error); 879 } 880 881 /* 882 * get page routine 883 */ 884 static int 885 ffs_getpages(ap) 886 struct vop_getpages_args *ap; 887 { 888 off_t foff, physoffset; 889 int i, size, bsize; 890 struct vnode *dp, *vp; 891 vm_object_t obj; 892 vm_pindex_t pindex, firstindex; 893 vm_page_t mreq; 894 int bbackwards, bforwards; 895 int pbackwards, pforwards; 896 int firstpage; 897 ufs2_daddr_t reqblkno, reqlblkno; 898 int poff; 899 int pcount; 900 int rtval; 901 int pagesperblock; 902 903 GIANT_REQUIRED; 904 905 pcount = round_page(ap->a_count) / PAGE_SIZE; 906 mreq = ap->a_m[ap->a_reqpage]; 907 firstindex = ap->a_m[0]->pindex; 908 909 /* 910 * if ANY DEV_BSIZE blocks are valid on a large filesystem block, 911 * then the entire page is valid. Since the page may be mapped, 912 * user programs might reference data beyond the actual end of file 913 * occuring within the page. We have to zero that data. 
914 */ 915 if (mreq->valid) { 916 if (mreq->valid != VM_PAGE_BITS_ALL) 917 vm_page_zero_invalid(mreq, TRUE); 918 vm_page_lock_queues(); 919 for (i = 0; i < pcount; i++) { 920 if (i != ap->a_reqpage) { 921 vm_page_free(ap->a_m[i]); 922 } 923 } 924 vm_page_unlock_queues(); 925 return VM_PAGER_OK; 926 } 927 928 vp = ap->a_vp; 929 obj = vp->v_object; 930 bsize = vp->v_mount->mnt_stat.f_iosize; 931 pindex = mreq->pindex; 932 foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */; 933 934 if (bsize < PAGE_SIZE) 935 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, 936 ap->a_count, 937 ap->a_reqpage); 938 939 /* 940 * foff is the file offset of the required page 941 * reqlblkno is the logical block that contains the page 942 * poff is the index of the page into the logical block 943 */ 944 reqlblkno = foff / bsize; 945 poff = (foff % bsize) / PAGE_SIZE; 946 947 dp = VTOI(vp)->i_devvp; 948 if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards) 949 || (reqblkno == -1)) { 950 vm_page_lock_queues(); 951 for(i = 0; i < pcount; i++) { 952 if (i != ap->a_reqpage) 953 vm_page_free(ap->a_m[i]); 954 } 955 vm_page_unlock_queues(); 956 if (reqblkno == -1) { 957 if ((mreq->flags & PG_ZERO) == 0) 958 pmap_zero_page(mreq); 959 vm_page_undirty(mreq); 960 mreq->valid = VM_PAGE_BITS_ALL; 961 return VM_PAGER_OK; 962 } else { 963 return VM_PAGER_ERROR; 964 } 965 } 966 967 physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE; 968 pagesperblock = bsize / PAGE_SIZE; 969 /* 970 * find the first page that is contiguous... 971 * note that pbackwards is the number of pages that are contiguous 972 * backwards. 973 */ 974 firstpage = 0; 975 if (ap->a_count) { 976 pbackwards = poff + bbackwards * pagesperblock; 977 if (ap->a_reqpage > pbackwards) { 978 firstpage = ap->a_reqpage - pbackwards; 979 vm_page_lock_queues(); 980 for(i=0;i<firstpage;i++) 981 vm_page_free(ap->a_m[i]); 982 vm_page_unlock_queues(); 983 } 984 985 /* 986 * pforwards is the number of pages that are contiguous 987 * after the current page. 988 */ 989 pforwards = (pagesperblock - (poff + 1)) + 990 bforwards * pagesperblock; 991 if (pforwards < (pcount - (ap->a_reqpage + 1))) { 992 vm_page_lock_queues(); 993 for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++) 994 vm_page_free(ap->a_m[i]); 995 vm_page_unlock_queues(); 996 pcount = ap->a_reqpage + pforwards + 1; 997 } 998 999 /* 1000 * number of pages for I/O corrected for the non-contig pages at 1001 * the beginning of the array. 1002 */ 1003 pcount -= firstpage; 1004 } 1005 1006 /* 1007 * calculate the size of the transfer 1008 */ 1009 1010 size = pcount * PAGE_SIZE; 1011 1012 if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) > 1013 obj->un_pager.vnp.vnp_size) 1014 size = obj->un_pager.vnp.vnp_size - 1015 IDX_TO_OFF(ap->a_m[firstpage]->pindex); 1016 1017 physoffset -= foff; 1018 rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size, 1019 (ap->a_reqpage - firstpage), physoffset); 1020 1021 return (rtval); 1022 } 1023 1024 /* 1025 * Extended attribute area reading. 
1026 */ 1027 static int 1028 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) 1029 { 1030 struct inode *ip; 1031 struct ufs2_dinode *dp; 1032 struct fs *fs; 1033 struct buf *bp; 1034 ufs_lbn_t lbn, nextlbn; 1035 off_t bytesinfile; 1036 long size, xfersize, blkoffset; 1037 int error, orig_resid; 1038 mode_t mode; 1039 1040 GIANT_REQUIRED; 1041 1042 ip = VTOI(vp); 1043 fs = ip->i_fs; 1044 dp = ip->i_din2; 1045 mode = ip->i_mode; 1046 1047 #ifdef DIAGNOSTIC 1048 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) 1049 panic("ffs_extread: mode"); 1050 1051 #endif 1052 orig_resid = uio->uio_resid; 1053 if (orig_resid <= 0) 1054 return (0); 1055 1056 bytesinfile = dp->di_extsize - uio->uio_offset; 1057 if (bytesinfile <= 0) { 1058 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) 1059 ip->i_flag |= IN_ACCESS; 1060 return 0; 1061 } 1062 1063 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 1064 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) 1065 break; 1066 1067 lbn = lblkno(fs, uio->uio_offset); 1068 nextlbn = lbn + 1; 1069 1070 /* 1071 * size of buffer. The buffer representing the 1072 * end of the file is rounded up to the size of 1073 * the block type ( fragment or full block, 1074 * depending ). 1075 */ 1076 size = sblksize(fs, dp->di_extsize, lbn); 1077 blkoffset = blkoff(fs, uio->uio_offset); 1078 1079 /* 1080 * The amount we want to transfer in this iteration is 1081 * one FS block less the amount of the data before 1082 * our startpoint (duh!) 1083 */ 1084 xfersize = fs->fs_bsize - blkoffset; 1085 1086 /* 1087 * But if we actually want less than the block, 1088 * or the file doesn't have a whole block more of data, 1089 * then use the lesser number. 1090 */ 1091 if (uio->uio_resid < xfersize) 1092 xfersize = uio->uio_resid; 1093 if (bytesinfile < xfersize) 1094 xfersize = bytesinfile; 1095 1096 if (lblktosize(fs, nextlbn) >= dp->di_extsize) { 1097 /* 1098 * Don't do readahead if this is the end of the info. 1099 */ 1100 error = bread(vp, -1 - lbn, size, NOCRED, &bp); 1101 } else { 1102 /* 1103 * If we have a second block, then 1104 * fire off a request for a readahead 1105 * as well as a read. Note that the 4th and 5th 1106 * arguments point to arrays of the size specified in 1107 * the 6th argument. 1108 */ 1109 int nextsize = sblksize(fs, dp->di_extsize, nextlbn); 1110 1111 nextlbn = -1 - nextlbn; 1112 error = breadn(vp, -1 - lbn, 1113 size, &nextlbn, &nextsize, 1, NOCRED, &bp); 1114 } 1115 if (error) { 1116 brelse(bp); 1117 bp = NULL; 1118 break; 1119 } 1120 1121 /* 1122 * If IO_DIRECT then set B_DIRECT for the buffer. This 1123 * will cause us to attempt to release the buffer later on 1124 * and will cause the buffer cache to attempt to free the 1125 * underlying pages. 1126 */ 1127 if (ioflag & IO_DIRECT) 1128 bp->b_flags |= B_DIRECT; 1129 1130 /* 1131 * We should only get non-zero b_resid when an I/O error 1132 * has occurred, which should cause us to break above. 1133 * However, if the short read did not cause an error, 1134 * then we want to ensure that we do not uiomove bad 1135 * or uninitialized data. 
1136 */ 1137 size -= bp->b_resid; 1138 if (size < xfersize) { 1139 if (size == 0) 1140 break; 1141 xfersize = size; 1142 } 1143 1144 error = uiomove((char *)bp->b_data + blkoffset, 1145 (int)xfersize, uio); 1146 if (error) 1147 break; 1148 1149 if ((ioflag & (IO_VMIO|IO_DIRECT)) && 1150 (LIST_FIRST(&bp->b_dep) == NULL)) { 1151 /* 1152 * If there are no dependencies, and it's VMIO, 1153 * then we don't need the buf, mark it available 1154 * for freeing. The VM has the data. 1155 */ 1156 bp->b_flags |= B_RELBUF; 1157 brelse(bp); 1158 } else { 1159 /* 1160 * Otherwise let whoever 1161 * made the request take care of 1162 * freeing it. We just queue 1163 * it onto another list. 1164 */ 1165 bqrelse(bp); 1166 } 1167 } 1168 1169 /* 1170 * This can only happen in the case of an error 1171 * because the loop above resets bp to NULL on each iteration 1172 * and on normal completion has not set a new value into it. 1173 * so it must have come from a 'break' statement 1174 */ 1175 if (bp != NULL) { 1176 if ((ioflag & (IO_VMIO|IO_DIRECT)) && 1177 (LIST_FIRST(&bp->b_dep) == NULL)) { 1178 bp->b_flags |= B_RELBUF; 1179 brelse(bp); 1180 } else { 1181 bqrelse(bp); 1182 } 1183 } 1184 1185 if ((error == 0 || uio->uio_resid != orig_resid) && 1186 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) 1187 ip->i_flag |= IN_ACCESS; 1188 return (error); 1189 } 1190 1191 /* 1192 * Extended attribute area writing. 1193 */ 1194 static int 1195 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) 1196 { 1197 struct inode *ip; 1198 struct ufs2_dinode *dp; 1199 struct fs *fs; 1200 struct buf *bp; 1201 ufs_lbn_t lbn; 1202 off_t osize; 1203 int blkoffset, error, flags, resid, size, xfersize; 1204 1205 GIANT_REQUIRED; 1206 1207 ip = VTOI(vp); 1208 fs = ip->i_fs; 1209 dp = ip->i_din2; 1210 1211 #ifdef DIAGNOSTIC 1212 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) 1213 panic("ext_write: mode"); 1214 #endif 1215 1216 if (ioflag & IO_APPEND) 1217 uio->uio_offset = dp->di_extsize; 1218 1219 if (uio->uio_offset < 0 || 1220 (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize) 1221 return (EFBIG); 1222 1223 resid = uio->uio_resid; 1224 osize = dp->di_extsize; 1225 flags = IO_EXT; 1226 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) 1227 flags |= IO_SYNC; 1228 1229 for (error = 0; uio->uio_resid > 0;) { 1230 lbn = lblkno(fs, uio->uio_offset); 1231 blkoffset = blkoff(fs, uio->uio_offset); 1232 xfersize = fs->fs_bsize - blkoffset; 1233 if (uio->uio_resid < xfersize) 1234 xfersize = uio->uio_resid; 1235 1236 /* 1237 * We must perform a read-before-write if the transfer size 1238 * does not cover the entire buffer. 1239 */ 1240 if (fs->fs_bsize > xfersize) 1241 flags |= BA_CLRBUF; 1242 else 1243 flags &= ~BA_CLRBUF; 1244 error = UFS_BALLOC(vp, uio->uio_offset, xfersize, 1245 ucred, flags, &bp); 1246 if (error != 0) 1247 break; 1248 /* 1249 * If the buffer is not valid we have to clear out any 1250 * garbage data from the pages instantiated for the buffer. 1251 * If we do not, a failed uiomove() during a write can leave 1252 * the prior contents of the pages exposed to a userland 1253 * mmap(). XXX deal with uiomove() errors a better way. 
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if (ioflag & IO_NOWDRAIN)
			bp->b_flags |= B_NOWDRAIN;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency, the block is complete,
		 * or an async or direct write was requested, write the buffer
		 * asynchronously; failing all of that, do a delayed write.
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ucred &&
	    suser_cred(ucred, PRISON_ROOT)) {
		ip->i_mode &= ~(ISUID | ISGID);
		dp->di_mode = ip->i_mode;
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);
	return (error);
}


/*
 * Extended attribute area lookup helper.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
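 *
 * Each record in the EA area, as built by ffs_setextattr() and parsed here,
 * is laid out as follows, with the total record length a multiple of 8:
 *
 *	32-bit record length (covers the entire record, including padding)
 *	8-bit attribute namespace
 *	8-bit length of the padding that follows the content (eapad2)
 *	8-bit name length
 *	name, followed by enough padding (eapad1) to put the content on
 *	    an 8-byte boundary
 *	content (ealen bytes), followed by eapad2 bytes of padding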
1324 */ 1325 static int 1326 ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac) 1327 { 1328 u_char *p, *pe, *pn, *p0; 1329 int eapad1, eapad2, ealength, ealen, nlen; 1330 uint32_t ul; 1331 1332 pe = ptr + length; 1333 nlen = strlen(name); 1334 1335 for (p = ptr; p < pe; p = pn) { 1336 p0 = p; 1337 bcopy(p, &ul, sizeof(ul)); 1338 pn = p + ul; 1339 /* make sure this entry is complete */ 1340 if (pn > pe) 1341 break; 1342 p += sizeof(uint32_t); 1343 if (*p != nspace) 1344 continue; 1345 p++; 1346 eapad2 = *p++; 1347 if (*p != nlen) 1348 continue; 1349 p++; 1350 if (bcmp(p, name, nlen)) 1351 continue; 1352 ealength = sizeof(uint32_t) + 3 + nlen; 1353 eapad1 = 8 - (ealength % 8); 1354 if (eapad1 == 8) 1355 eapad1 = 0; 1356 ealength += eapad1; 1357 ealen = ul - ealength - eapad2; 1358 p += nlen + eapad1; 1359 if (eap != NULL) 1360 *eap = p0; 1361 if (eac != NULL) 1362 *eac = p; 1363 return (ealen); 1364 } 1365 return(-1); 1366 } 1367 1368 static int 1369 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) 1370 { 1371 struct inode *ip; 1372 struct fs *fs; 1373 struct ufs2_dinode *dp; 1374 struct uio luio; 1375 struct iovec liovec; 1376 int easize, error; 1377 u_char *eae; 1378 1379 ip = VTOI(vp); 1380 fs = ip->i_fs; 1381 dp = ip->i_din2; 1382 easize = dp->di_extsize; 1383 1384 eae = malloc(easize + extra, M_TEMP, M_WAITOK); 1385 1386 liovec.iov_base = eae; 1387 liovec.iov_len = easize; 1388 luio.uio_iov = &liovec; 1389 luio.uio_iovcnt = 1; 1390 luio.uio_offset = 0; 1391 luio.uio_resid = easize; 1392 luio.uio_segflg = UIO_SYSSPACE; 1393 luio.uio_rw = UIO_READ; 1394 luio.uio_td = td; 1395 1396 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); 1397 if (error) { 1398 free(eae, M_TEMP); 1399 return(error); 1400 } 1401 *p = eae; 1402 return (0); 1403 } 1404 1405 static int 1406 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) 1407 { 1408 struct inode *ip; 1409 struct fs *fs; 1410 struct ufs2_dinode *dp; 1411 int error; 1412 1413 ip = VTOI(vp); 1414 fs = ip->i_fs; 1415 1416 if (ip->i_ea_area != NULL) 1417 return (EBUSY); 1418 dp = ip->i_din2; 1419 error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); 1420 if (error) 1421 return (error); 1422 ip->i_ea_len = dp->di_extsize; 1423 ip->i_ea_error = 0; 1424 return (0); 1425 } 1426 1427 /* 1428 * Vnode extattr transaction commit/abort 1429 */ 1430 static int 1431 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) 1432 { 1433 struct inode *ip; 1434 struct fs *fs; 1435 struct uio luio; 1436 struct iovec liovec; 1437 int error; 1438 struct ufs2_dinode *dp; 1439 1440 ip = VTOI(vp); 1441 fs = ip->i_fs; 1442 if (ip->i_ea_area == NULL) 1443 return (EINVAL); 1444 dp = ip->i_din2; 1445 error = ip->i_ea_error; 1446 if (commit && error == 0) { 1447 if (cred == NOCRED) 1448 cred = vp->v_mount->mnt_cred; 1449 liovec.iov_base = ip->i_ea_area; 1450 liovec.iov_len = ip->i_ea_len; 1451 luio.uio_iov = &liovec; 1452 luio.uio_iovcnt = 1; 1453 luio.uio_offset = 0; 1454 luio.uio_resid = ip->i_ea_len; 1455 luio.uio_segflg = UIO_SYSSPACE; 1456 luio.uio_rw = UIO_WRITE; 1457 luio.uio_td = td; 1458 /* XXX: I'm not happy about truncating to zero size */ 1459 if (ip->i_ea_len < dp->di_extsize) 1460 error = ffs_truncate(vp, 0, IO_EXT, cred, td); 1461 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); 1462 } 1463 free(ip->i_ea_area, M_TEMP); 1464 ip->i_ea_area = NULL; 1465 ip->i_ea_len = 0; 1466 ip->i_ea_error = 0; 1467 return (error); 1468 } 1469 1470 /* 1471 * Vnode 
extattr strategy routine for special devices and fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	if (vp->v_type == VFIFO)
		return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
	return (ufs_vnoperatespec((struct vop_generic_args *)ap));
}

/*
 * Vnode extattr transaction open: read the EA area into memory.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}


/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}



/*
 * Vnode operation to retrieve a named extended attribute.
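 *
 * If a name is given, the attribute's value is copied out through a_uio, or
 * only its length is reported when a_size is non-null.  With an empty name,
 * the attribute names in the requested namespace are returned instead, each
 * prefixed by a one-byte length.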
1553 */ 1554 static int 1555 ffs_getextattr(struct vop_getextattr_args *ap) 1556 /* 1557 vop_getextattr { 1558 IN struct vnode *a_vp; 1559 IN int a_attrnamespace; 1560 IN const char *a_name; 1561 INOUT struct uio *a_uio; 1562 OUT size_t *a_size; 1563 IN struct ucred *a_cred; 1564 IN struct thread *a_td; 1565 }; 1566 */ 1567 { 1568 struct inode *ip; 1569 struct fs *fs; 1570 u_char *eae, *p, *pe, *pn; 1571 struct ufs2_dinode *dp; 1572 unsigned easize; 1573 uint32_t ul; 1574 int error, ealen, stand_alone; 1575 1576 ip = VTOI(ap->a_vp); 1577 fs = ip->i_fs; 1578 1579 if (fs->fs_magic == FS_UFS1_MAGIC) 1580 return (ufs_vnoperate((struct vop_generic_args *)ap)); 1581 1582 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1583 ap->a_cred, ap->a_td, IREAD); 1584 if (error) 1585 return (error); 1586 1587 if (ip->i_ea_area == NULL) { 1588 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1589 if (error) 1590 return (error); 1591 stand_alone = 1; 1592 } else { 1593 stand_alone = 0; 1594 } 1595 dp = ip->i_din2; 1596 eae = ip->i_ea_area; 1597 easize = ip->i_ea_len; 1598 if (strlen(ap->a_name) > 0) { 1599 ealen = ffs_findextattr(eae, easize, 1600 ap->a_attrnamespace, ap->a_name, NULL, &p); 1601 if (ealen >= 0) { 1602 error = 0; 1603 if (ap->a_size != NULL) 1604 *ap->a_size = ealen; 1605 else if (ap->a_uio != NULL) 1606 error = uiomove(p, ealen, ap->a_uio); 1607 } else { 1608 error = ENOATTR; 1609 } 1610 } else { 1611 error = 0; 1612 if (ap->a_size != NULL) 1613 *ap->a_size = 0; 1614 pe = eae + easize; 1615 for(p = eae; error == 0 && p < pe; p = pn) { 1616 bcopy(p, &ul, sizeof(ul)); 1617 pn = p + ul; 1618 if (pn > pe) 1619 break; 1620 p += sizeof(ul); 1621 if (*p++ != ap->a_attrnamespace) 1622 continue; 1623 p++; /* pad2 */ 1624 ealen = *p; 1625 if (ap->a_size != NULL) { 1626 *ap->a_size += ealen + 1; 1627 } else if (ap->a_uio != NULL) { 1628 error = uiomove(p, ealen + 1, ap->a_uio); 1629 } 1630 } 1631 } 1632 if (stand_alone) 1633 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1634 return(error); 1635 } 1636 1637 /* 1638 * Vnode operation to set a named attribute. 
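 *
 * A null a_uio deletes the attribute.  Otherwise the in-memory copy of the
 * EA area is grown or rewritten in place, size-checked against the space
 * available in the external block pointers (NXADDR * fs_bsize), and, when
 * this call is not part of an open extattr transaction, committed with
 * ffs_close_ea().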
1639 */ 1640 static int 1641 ffs_setextattr(struct vop_setextattr_args *ap) 1642 /* 1643 vop_setextattr { 1644 IN struct vnode *a_vp; 1645 IN int a_attrnamespace; 1646 IN const char *a_name; 1647 INOUT struct uio *a_uio; 1648 IN struct ucred *a_cred; 1649 IN struct thread *a_td; 1650 }; 1651 */ 1652 { 1653 struct inode *ip; 1654 struct fs *fs; 1655 uint32_t ealength, ul; 1656 int ealen, olen, eacont, eapad1, eapad2, error, i, easize; 1657 u_char *eae, *p; 1658 struct ufs2_dinode *dp; 1659 struct ucred *cred; 1660 int stand_alone; 1661 1662 ip = VTOI(ap->a_vp); 1663 fs = ip->i_fs; 1664 1665 if (fs->fs_magic == FS_UFS1_MAGIC) 1666 return (ufs_vnoperate((struct vop_generic_args *)ap)); 1667 1668 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1669 ap->a_cred, ap->a_td, IWRITE); 1670 if (error) { 1671 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1672 ip->i_ea_error = error; 1673 return (error); 1674 } 1675 1676 if (ap->a_cred != NOCRED) 1677 cred = ap->a_cred; 1678 else 1679 cred = ap->a_vp->v_mount->mnt_cred; 1680 1681 dp = ip->i_din2; 1682 1683 if (ip->i_ea_area == NULL) { 1684 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1685 if (error) 1686 return (error); 1687 stand_alone = 1; 1688 } else { 1689 stand_alone = 0; 1690 } 1691 1692 /* Calculate the length of the EA entry */ 1693 if (ap->a_uio == NULL) { 1694 /* delete */ 1695 ealength = eapad1 = ealen = eapad2 = eacont = 0; 1696 } else { 1697 ealen = ap->a_uio->uio_resid; 1698 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); 1699 eapad1 = 8 - (ealength % 8); 1700 if (eapad1 == 8) 1701 eapad1 = 0; 1702 eacont = ealength + eapad1; 1703 eapad2 = 8 - (ealen % 8); 1704 if (eapad2 == 8) 1705 eapad2 = 0; 1706 ealength += eapad1 + ealen + eapad2; 1707 } 1708 1709 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); 1710 bcopy(ip->i_ea_area, eae, ip->i_ea_len); 1711 easize = ip->i_ea_len; 1712 1713 olen = ffs_findextattr(eae, easize, 1714 ap->a_attrnamespace, ap->a_name, &p, NULL); 1715 if (olen == -1 && ealength == 0) { 1716 /* delete but nonexistent */ 1717 free(eae, M_TEMP); 1718 if (stand_alone) 1719 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1720 return(ENOATTR); 1721 } 1722 if (olen == -1) { 1723 /* new, append at end */ 1724 p = eae + easize; 1725 easize += ealength; 1726 } else { 1727 bcopy(p, &ul, sizeof ul); 1728 i = p - eae + ul; 1729 if (ul != ealength) { 1730 bcopy(p + ul, p + ealength, easize - i); 1731 easize += (ealength - ul); 1732 } 1733 } 1734 if (easize > NXADDR * fs->fs_bsize) { 1735 free(eae, M_TEMP); 1736 if (stand_alone) 1737 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1738 else if (ip->i_ea_error == 0) 1739 ip->i_ea_error = ENOSPC; 1740 return(ENOSPC); 1741 } 1742 if (ealength != 0) { 1743 bcopy(&ealength, p, sizeof(ealength)); 1744 p += sizeof(ealength); 1745 *p++ = ap->a_attrnamespace; 1746 *p++ = eapad2; 1747 *p++ = strlen(ap->a_name); 1748 strcpy(p, ap->a_name); 1749 p += strlen(ap->a_name); 1750 bzero(p, eapad1); 1751 p += eapad1; 1752 error = uiomove(p, ealen, ap->a_uio); 1753 if (error) { 1754 free(eae, M_TEMP); 1755 if (stand_alone) 1756 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1757 else if (ip->i_ea_error == 0) 1758 ip->i_ea_error = error; 1759 return(error); 1760 } 1761 p += ealen; 1762 bzero(p, eapad2); 1763 } 1764 p = ip->i_ea_area; 1765 ip->i_ea_area = eae; 1766 ip->i_ea_len = easize; 1767 free(p, M_TEMP); 1768 if (stand_alone) 1769 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); 1770 return(error); 1771 } 1772