/*
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/conf.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

int	ffs_fsync(struct vop_fsync_args *);
static int	ffs_getpages(struct vop_getpages_args *);
static int	ffs_read(struct vop_read_args *);
static int	ffs_write(struct vop_write_args *);
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static int	ffs_getextattr(struct vop_getextattr_args *);
static int	ffs_setextattr(struct vop_setextattr_args *);

/* Global vfs data structures for ufs. */
vop_t **ffs_vnodeop_p;
static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperate },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_getpages_desc,		(vop_t *) ffs_getpages },
	{ &vop_read_desc,		(vop_t *) ffs_read },
	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
	{ &vop_write_desc,		(vop_t *) ffs_write },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_vnodeop_opv_desc =
	{ &ffs_vnodeop_p, ffs_vnodeop_entries };

vop_t **ffs_specop_p;
static struct vnodeopv_entry_desc ffs_specop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperatespec },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_specop_opv_desc =
	{ &ffs_specop_p, ffs_specop_entries };

vop_t **ffs_fifoop_p;
static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperatefifo },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_fifoop_opv_desc =
	{ &ffs_fifoop_p, ffs_fifoop_entries };

VNODEOP_SET(ffs_vnodeop_opv_desc);
VNODEOP_SET(ffs_specop_opv_desc);
VNODEOP_SET(ffs_fifoop_opv_desc);

/*
 * Synch an open file.
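 *
 * Walk the vnode's dirty buffer list and write the buffers out, making an
 * extra pass for metadata on synchronous requests, then wait for any
 * remaining output to drain and push the inode itself with UFS_UPDATE().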
 */
/* ARGSUSED */
int
ffs_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct thread *a_td;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct buf *bp;
	struct buf *nbp;
	int s, error, wait, passes, skipmeta;
	ufs_lbn_t lbn;

	wait = (ap->a_waitfor == MNT_WAIT);
	if (vn_isdisk(vp, NULL)) {
		lbn = INT_MAX;
		if (vp->v_rdev->si_mountpoint != NULL &&
		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
			softdep_fsync_mountdev(vp);
	} else {
		lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	}

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	passes = NIADDR + 1;
	skipmeta = 0;
	if (wait)
		skipmeta = 1;
	s = splbio();
loop:
	TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
		bp->b_flags &= ~B_SCANNED;
	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = TAILQ_NEXT(bp, b_vnbufs);
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, this pass is the first time through on a
		 * synchronous flush request and the buffer being considered
		 * is metadata, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_flags & B_SCANNED) != 0)
			continue;
		bp->b_flags |= B_SCANNED;
		if ((skipmeta == 1 && bp->b_lblkno < 0))
			continue;
		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
		    (bp->b_flags & B_DEFERRED) == 0 &&
		    buf_countdeps(bp, 0)) {
			bp->b_flags |= B_DEFERRED;
			continue;
		}
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		if (vp != bp->b_vp)
			panic("ffs_fsync: vp != vp->b_vp");
		/*
		 * If this is a synchronous flush request, or it is not a
		 * file or device, start the write on this buffer immediately.
		 */
		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {

			/*
			 * On our final pass through, do all I/O synchronously
			 * so that we can find out if our flush is failing
			 * because of write errors.
			 */
			if (passes > 0 || !wait) {
				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
					BUF_UNLOCK(bp);
					(void) vfs_bio_awrite(bp);
				} else {
					bremfree(bp);
					splx(s);
					(void) bawrite(bp);
					s = splbio();
				}
			} else {
				bremfree(bp);
				splx(s);
				if ((error = bwrite(bp)) != 0)
					return (error);
				s = splbio();
			}
		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
			/*
			 * If the buffer is for data that has been truncated
			 * off the file, then throw it away.
			 */
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_NOCACHE;
			splx(s);
			brelse(bp);
			s = splbio();
		} else {
			BUF_UNLOCK(bp);
			vfs_bio_awrite(bp);
		}
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
	}
	/*
	 * If we were asked to do this synchronously, then go back for
	 * another pass, this time doing the metadata.
	 */
	if (skipmeta) {
		skipmeta = 0;
		goto loop;
	}

	if (wait) {
		VI_LOCK(vp);
		while (vp->v_numoutput) {
			vp->v_iflag |= VI_BWAIT;
			msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
			    PRIBIO + 4, "ffsfsn", 0);
		}
		VI_UNLOCK(vp);

		/*
		 * Ensure that any filesystem metadata associated
		 * with the vnode has been written.
		 */
		splx(s);
		if ((error = softdep_sync_metadata(ap)) != 0)
			return (error);
		s = splbio();

		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			/*
			 * Block devices associated with filesystems may
			 * have new I/O requests posted for them even if
			 * the vnode is locked, so no amount of trying will
			 * get them clean. Thus we give block devices a
			 * good effort, then just give up. For all other file
			 * types, go around and try again until it is clean.
			 */
			if (passes > 0) {
				passes -= 1;
				goto loop;
			}
#ifdef DIAGNOSTIC
			if (!vn_isdisk(vp, NULL))
				vprint("ffs_fsync: dirty", vp);
#endif
		}
	}
	splx(s);
	return (UFS_UPDATE(vp, wait));
}


/*
 * Vnode op for reading.
 */
/* ARGSUSED */
int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	mode_t mode;
	int seqcount;
	int ioflag;
	vm_object_t object;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
		return (ffs_extread(vp, uio, ioflag, ap->a_cred));

	GIANT_REQUIRED;

	seqcount = ap->a_ioflag >> 16;
	ip = VTOI(vp);
	mode = ip->i_mode;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	fs = ip->i_fs;
	if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
		return (EFBIG);

	orig_resid = uio->uio_resid;
	if (orig_resid <= 0)
		return (0);

	object = vp->v_object;

	bytesinfile = ip->i_size - uio->uio_offset;
	if (bytesinfile <= 0) {
		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
			ip->i_flag |= IN_ACCESS;
		return 0;
	}

	if (object) {
		vm_object_reference(object);
	}

#ifdef ENABLE_VFS_IOOPT
	/*
	 * If IO optimisation is turned on,
	 * and we are NOT a VM based IO request,
	 * (i.e. not headed for the buffer cache)
	 * but there IS a vm object associated with it.
	 */
	if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
		int nread, toread;

		toread = uio->uio_resid;
		if (toread > bytesinfile)
			toread = bytesinfile;
		if (toread >= PAGE_SIZE) {
			/*
			 * Then if it's at least a page in size, try
			 * to get the data from the object using vm tricks.
			 */
			error = uioread(toread, uio, object, &nread);
			if ((uio->uio_resid == 0) || (error != 0)) {
				/*
				 * If we finished or there was an error
				 * then finish up (the reference previously
				 * obtained on object must be released).
				 */
				if ((error == 0 ||
				    uio->uio_resid != orig_resid) &&
				    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
					ip->i_flag |= IN_ACCESS;

				if (object) {
					vm_object_vndeallocate(object);
				}
				return error;
			}
		}
	}
#endif

	/*
	 * Ok so we couldn't do it all in one vm trick...
	 * so cycle around trying smaller bites..
	 */
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
#ifdef ENABLE_VFS_IOOPT
		if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
			/*
			 * Obviously we didn't finish above, but we
			 * didn't get an error either.  Try the same trick
			 * again, but this time we are looping.
			 */
			int nread, toread;
			toread = uio->uio_resid;
			if (toread > bytesinfile)
				toread = bytesinfile;

			/*
			 * Once again, if there isn't enough for a
			 * whole page, don't try optimising.
			 */
			if (toread >= PAGE_SIZE) {
				error = uioread(toread, uio, object, &nread);
				if ((uio->uio_resid == 0) || (error != 0)) {
					/*
					 * If we finished or there was an
					 * error then finish up (the reference
					 * previously obtained on object must
					 * be released).
					 */
					if ((error == 0 ||
					    uio->uio_resid != orig_resid) &&
					    (vp->v_mount->mnt_flag &
					    MNT_NOATIME) == 0)
						ip->i_flag |= IN_ACCESS;
					if (object) {
						vm_object_vndeallocate(object);
					}
					return error;
				}
				/*
				 * To get here we didn't finish or err.
				 * If we did get some data,
				 * loop to try another bite.
				 */
				if (nread > 0) {
					continue;
				}
			}
		}
#endif

		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, uio->uio_resid, seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

#ifdef ENABLE_VFS_IOOPT
		if (vfs_ioopt && object &&
		    (bp->b_flags & B_VMIO) &&
		    ((blkoffset & PAGE_MASK) == 0) &&
		    ((xfersize & PAGE_MASK) == 0)) {
			/*
			 * If VFS IO optimisation is turned on,
			 * and it's an exact page multiple
			 * and a normal VM based op,
			 * then use uiomoveco().
			 */
			error =
			    uiomoveco((char *)bp->b_data + blkoffset,
			    (int)xfersize, uio, object, 0);
		} else
#endif
		{
			/*
			 * otherwise use the general form
			 */
			error =
			    uiomove((char *)bp->b_data + blkoffset,
			    (int)xfersize, uio);
		}

		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing. The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if (object) {
		vm_object_vndeallocate(object);
	}
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Vnode op for writing.
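 *
 * Each iteration of the main loop below handles one filesystem block of
 * the transfer: UFS_BALLOC() allocates or finds the backing block,
 * uiomove() copies the user data into it, and the buffer is then written
 * synchronously, asynchronously, clustered, or delayed depending on the
 * I/O flags and on memory pressure.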
 */
int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
	vm_object_t object;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));

	GIANT_REQUIRED;

	extended = 0;
	seqcount = ap->a_ioflag >> 16;
	ip = VTOI(vp);

	object = vp->v_object;
	if (object) {
		vm_object_reference(object);
	}

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("ffswrite: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
			if (object) {
				vm_object_vndeallocate(object);
			}
			return (EPERM);
		}
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffswrite: dir write");
		break;
	default:
		panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	fs = ip->i_fs;
	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
		if (object) {
			vm_object_vndeallocate(object);
		}
		return (EFBIG);
	}
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td &&
	    uio->uio_offset + uio->uio_resid >
	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		PROC_LOCK(td->td_proc);
		psignal(td->td_proc, SIGXFSZ);
		PROC_UNLOCK(td->td_proc);
		if (object) {
			vm_object_vndeallocate(object);
		}
		return (EFBIG);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	flags = 0;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags = IO_SYNC;

#ifdef ENABLE_VFS_IOOPT
	if (object && (object->flags & OBJ_OPT)) {
		vm_freeze_copyopts(object,
		    OFF_TO_IDX(uio->uio_offset),
		    OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK));
	}
#endif
	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap(). XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if (ioflag & IO_NOWDRAIN)
			bp->b_flags |= B_NOWDRAIN;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP(ip, i_size) = ip->i_size;
			extended = 1;
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ap->a_cred &&
	    suser_cred(ap->a_cred, PRISON_ROOT)) {
		ip->i_mode &= ~(ISUID | ISGID);
		DIP(ip, i_mode) = ip->i_mode;
	}
	if (resid > uio->uio_resid)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);

	if (object) {
		vm_object_vndeallocate(object);
	}

	return (error);
}

/*
 * get page routine
 */
int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	off_t foff, physoffset;
	int i, size, bsize;
	struct vnode *dp, *vp;
	vm_object_t obj;
	vm_pindex_t pindex, firstindex;
	vm_page_t mreq;
	int bbackwards, bforwards;
	int pbackwards, pforwards;
	int firstpage;
	ufs2_daddr_t reqblkno, reqlblkno;
	int poff;
	int pcount;
	int rtval;
	int pagesperblock;

	GIANT_REQUIRED;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];
	firstindex = ap->a_m[0]->pindex;

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		vm_page_unlock_queues();
		return VM_PAGER_OK;
	}

	vp = ap->a_vp;
	obj = vp->v_object;
	bsize = vp->v_mount->mnt_stat.f_iosize;
	pindex = mreq->pindex;
	foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;

	if (bsize < PAGE_SIZE)
		return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
		    ap->a_count,
		    ap->a_reqpage);

	/*
	 * foff is the file offset of the required page
	 * reqlblkno is the logical block that contains the page
	 * poff is the index of the page into the logical block
	 */
	reqlblkno = foff / bsize;
	poff = (foff % bsize) / PAGE_SIZE;

	dp = VTOI(vp)->i_devvp;
	if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
	    || (reqblkno == -1)) {
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage)
				vm_page_free(ap->a_m[i]);
		}
		vm_page_unlock_queues();
		if (reqblkno == -1) {
			if ((mreq->flags & PG_ZERO) == 0)
				vm_page_zero_fill(mreq);
			vm_page_undirty(mreq);
			mreq->valid = VM_PAGE_BITS_ALL;
			return VM_PAGER_OK;
		} else {
			return VM_PAGER_ERROR;
		}
	}

	physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
	pagesperblock = bsize / PAGE_SIZE;
	/*
	 * find the first page that is contiguous...
	 * note that pbackwards is the number of pages that are contiguous
	 * backwards.
	 */
	firstpage = 0;
	if (ap->a_count) {
		pbackwards = poff + bbackwards * pagesperblock;
		if (ap->a_reqpage > pbackwards) {
			firstpage = ap->a_reqpage - pbackwards;
			vm_page_lock_queues();
			for (i = 0; i < firstpage; i++)
				vm_page_free(ap->a_m[i]);
			vm_page_unlock_queues();
		}

		/*
		 * pforwards is the number of pages that are contiguous
		 * after the current page.
		 */
		pforwards = (pagesperblock - (poff + 1)) +
		    bforwards * pagesperblock;
		if (pforwards < (pcount - (ap->a_reqpage + 1))) {
			vm_page_lock_queues();
			for (i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
				vm_page_free(ap->a_m[i]);
			vm_page_unlock_queues();
			pcount = ap->a_reqpage + pforwards + 1;
		}

		/*
		 * number of pages for I/O corrected for the non-contig pages
		 * at the beginning of the array.
		 */
		pcount -= firstpage;
	}

	/*
	 * calculate the size of the transfer
	 */

	size = pcount * PAGE_SIZE;

	if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
	    obj->un_pager.vnp.vnp_size)
		size = obj->un_pager.vnp.vnp_size -
		    IDX_TO_OFF(ap->a_m[firstpage]->pindex);

	physoffset -= foff;
	rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
	    (ap->a_reqpage - firstpage), physoffset);

	return (rtval);
}

/*
 * Vnode op for extended attribute reading.
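 *
 * The extended attribute area of a UFS2 inode is addressed with negative
 * logical block numbers (-1 - lbn), which is why the bread() and breadn()
 * calls below translate the block numbers that way.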
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	mode_t mode;

	GIANT_REQUIRED;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	mode = ip->i_mode;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");

#endif
	orig_resid = uio->uio_resid;
	if (orig_resid <= 0)
		return (0);

	bytesinfile = dp->di_extsize - uio->uio_offset;
	if (bytesinfile <= 0) {
		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
			ip->i_flag |= IN_ACCESS;
		return 0;
	}

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;

		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing. The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Vnode op for external attribute writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	GIANT_REQUIRED;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ext_write: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;

	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap(). XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if (ioflag & IO_NOWDRAIN)
			bp->b_flags |= B_NOWDRAIN;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ucred &&
	    suser_cred(ucred, PRISON_ROOT)) {
		ip->i_mode &= ~(ISUID | ISGID);
		dp->di_mode = ip->i_mode;
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	return (ufs_vnoperate((struct vop_generic_args *)ap));
}

/*
 * Vnode operation to set a named attribute.
 */
int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	return (ufs_vnoperate((struct vop_generic_args *)ap));
}