/*
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991 The Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1993,1994 John S. Dyson
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
 * $Id: vnode_pager.c,v 1.5 1994/08/06 10:25:50 davidg Exp $
 */

/*
 * Page to/from files (vnodes).
 *
 * TODO:
 *	pageouts
 *	fix credential use (uses current process credentials now)
 */

/*
 * MODIFICATIONS:
 * John S. Dyson  08 Dec 93
 *
 * This file, in conjunction with some vm_fault mods, eliminates the
 * performance advantage of using the buffer cache and minimizes memory
 * copies.
 *
 * 1) Supports multiple-block reads
 * 2) Bypasses buffer cache for reads
 *
 * TODO:
 *
 * 1) Totally bypass buffer cache for reads
 *    (Currently will still sometimes use buffer cache for reads)
 * 2) Bypass buffer cache for writes
 *    (Code does not support it, but mods are simple)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/mount.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#include <sys/buf.h>
#include <miscfs/specfs/specdev.h>

int vnode_pager_putmulti();

void vnode_pager_init();
vm_pager_t vnode_pager_alloc(caddr_t, vm_offset_t, vm_prot_t, vm_offset_t);
void vnode_pager_dealloc();
int vnode_pager_getpage();
int vnode_pager_getmulti();
int vnode_pager_putpage();
boolean_t vnode_pager_haspage();

struct pagerops vnodepagerops = {
	vnode_pager_init,
	vnode_pager_alloc,
	vnode_pager_dealloc,
	vnode_pager_getpage,
	vnode_pager_getmulti,
	vnode_pager_putpage,
	vnode_pager_putmulti,
	vnode_pager_haspage
};

static int vnode_pager_input(vn_pager_t vnp, vm_page_t *m, int count, int reqpage);
static int vnode_pager_output(vn_pager_t vnp, vm_page_t *m, int count, int *rtvals);

struct buf *getpbuf();
void relpbuf(struct buf *bp);

extern vm_map_t pager_map;

struct pagerlst vnode_pager_list;	/* list of managed vnodes */

#define MAXBP (PAGE_SIZE/DEV_BSIZE)

void
vnode_pager_init()
{
	TAILQ_INIT(&vnode_pager_list);
}

/*
 * Allocate (or lookup) pager for a vnode.
 * Handle is a vnode pointer.
 */
vm_pager_t
vnode_pager_alloc(handle, size, prot, offset)
	caddr_t handle;
	vm_size_t size;
	vm_prot_t prot;
	vm_offset_t offset;
{
	register vm_pager_t pager;
	register vn_pager_t vnp;
	vm_object_t object;
	struct vattr vattr;
	struct vnode *vp;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Pageout to vnode, no can do yet.
	 */
	if (handle == NULL)
		return (NULL);

	/*
	 * Vnodes keep a pointer to any associated pager so no need to lookup
	 * with vm_pager_lookup.
	 */
	vp = (struct vnode *) handle;
	pager = (vm_pager_t) vp->v_vmdata;
	if (pager == NULL) {

		/*
		 * Allocate pager structures
		 */
		pager = (vm_pager_t) malloc(sizeof *pager, M_VMPAGER, M_WAITOK);
		if (pager == NULL)
			return (NULL);
		vnp = (vn_pager_t) malloc(sizeof *vnp, M_VMPGDATA, M_WAITOK);
		if (vnp == NULL) {
			free((caddr_t) pager, M_VMPAGER);
			return (NULL);
		}

		/*
		 * And an object of the appropriate size
		 */
		if (VOP_GETATTR(vp, &vattr, p->p_ucred, p) == 0) {
			object = vm_object_allocate(round_page(vattr.va_size));
			vm_object_enter(object, pager);
			vm_object_setpager(object, pager, 0, TRUE);
		} else {
			free((caddr_t) vnp, M_VMPGDATA);
			free((caddr_t) pager, M_VMPAGER);
			return (NULL);
		}

		/*
		 * Hold a reference to the vnode and initialize pager data.
		 */
		VREF(vp);
		vnp->vnp_flags = 0;
		vnp->vnp_vp = vp;
		vnp->vnp_size = vattr.va_size;

		TAILQ_INSERT_TAIL(&vnode_pager_list, pager, pg_list);
		pager->pg_handle = handle;
		pager->pg_type = PG_VNODE;
		pager->pg_ops = &vnodepagerops;
		pager->pg_data = (caddr_t) vnp;
		vp->v_vmdata = (caddr_t) pager;
	} else {

		/*
		 * vm_object_lookup() will remove the object from the cache if
		 * found and also gain a reference to the object.
		 */
		object = vm_object_lookup(pager);
	}
	return (pager);
}

void
vnode_pager_dealloc(pager)
	vm_pager_t pager;
{
	register vn_pager_t vnp = (vn_pager_t) pager->pg_data;
	register struct vnode *vp;
	struct proc *p = curproc;	/* XXX */

	if (vp = vnp->vnp_vp) {
		vp->v_vmdata = NULL;
		vp->v_flag &= ~VTEXT;
#if 0
		/* can hang if done at reboot on NFS FS */
		(void) VOP_FSYNC(vp, p->p_ucred, p);
#endif
		vrele(vp);
	}
	TAILQ_REMOVE(&vnode_pager_list, pager, pg_list);
	free((caddr_t) vnp, M_VMPGDATA);
	free((caddr_t) pager, M_VMPAGER);
}

int
vnode_pager_getmulti(pager, m, count, reqpage, sync)
	vm_pager_t pager;
	vm_page_t *m;
	int count;
	int reqpage;
	boolean_t sync;
{
	return vnode_pager_input((vn_pager_t) pager->pg_data, m, count, reqpage);
}

int
vnode_pager_getpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	int err;
	vm_page_t marray[1];

	if (pager == NULL)
		return FALSE;
	marray[0] = m;

	return vnode_pager_input((vn_pager_t) pager->pg_data, marray, 1, 0);
}

boolean_t
vnode_pager_putpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	int err;
	vm_page_t marray[1];
	int rtvals[1];

	if (pager == NULL)
		return FALSE;
	marray[0] = m;
	vnode_pager_output((vn_pager_t) pager->pg_data, marray, 1, rtvals);
	return rtvals[0];
}

int
vnode_pager_putmulti(pager, m, c, sync, rtvals)
	vm_pager_t pager;
	vm_page_t *m;
	int c;
	boolean_t sync;
	int *rtvals;
{
	return vnode_pager_output((vn_pager_t) pager->pg_data, m, c, rtvals);
}

boolean_t
vnode_pager_haspage(pager, offset)
	vm_pager_t pager;
	vm_offset_t offset;
{
	register vn_pager_t vnp = (vn_pager_t) pager->pg_data;
	daddr_t bn;
	int err;

	/*
	 * Offset beyond end of file, do not have the page
	 */
	if (offset >= vnp->vnp_size) {
		return (FALSE);
	}

	/*
	 * Read the index to find the disk block to read from.  If there is no
	 * block, report that we don't have this data.
	 *
	 * Assumes that the vnode has whole page or nothing.
	 */
	err = VOP_BMAP(vnp->vnp_vp,
		       offset / vnp->vnp_vp->v_mount->mnt_stat.f_iosize,
		       (struct vnode **) 0, &bn, 0);
	if (err) {
		return (TRUE);
	}
	return ((long) bn < 0 ? FALSE : TRUE);
}
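
/*
 * Illustrative example (not part of the original code): with an 8K-block
 * filesystem (f_iosize == 8192) and offset == 20000, the logical block
 * queried above is 20000 / 8192 == 2.  VOP_BMAP() then either maps block 2
 * to a physical block number, returns a negative block number for a hole
 * (no backing store, hence FALSE), or fails, in which case we conservatively
 * claim to have the page.
 */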

/*
 * Lets the VM system know about a change in size for a file.
 * If this vnode is mapped into some address space (i.e. we have a pager
 * for it) we adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * Note: this routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 */
void
vnode_pager_setsize(vp, nsize)
	struct vnode *vp;
	u_long nsize;
{
	register vn_pager_t vnp;
	register vm_object_t object;
	vm_pager_t pager;

	/*
	 * Not a mapped vnode
	 */
	if (vp == NULL || vp->v_type != VREG || vp->v_vmdata == NULL)
		return;

	/*
	 * Hasn't changed size
	 */
	pager = (vm_pager_t) vp->v_vmdata;
	vnp = (vn_pager_t) pager->pg_data;
	if (nsize == vnp->vnp_size)
		return;

	/*
	 * No object.  This can happen during object termination since
	 * vm_object_page_clean is called after the object has been removed
	 * from the hash table, and clean may cause vnode write operations
	 * which can wind up back here.
	 */
	object = vm_object_lookup(pager);
	if (object == NULL)
		return;

	/*
	 * File has shrunk.  Toss any cached pages beyond the new EOF.
	 */
	if (nsize < vnp->vnp_size) {
		vm_object_lock(object);
		vm_object_page_remove(object,
			 round_page((vm_offset_t) nsize), vnp->vnp_size);
		vm_object_unlock(object);

		/*
		 * this gets rid of garbage at the end of a page that is now
		 * only partially backed by the vnode...
		 */
		if (nsize & PAGE_MASK) {
			vm_offset_t kva;
			vm_page_t m;

			m = vm_page_lookup(object, trunc_page((vm_offset_t) nsize));
			if (m) {
				kva = vm_pager_map_page(m);
				bzero((caddr_t) kva + (nsize & PAGE_MASK),
				      round_page(nsize) - nsize);
				vm_pager_unmap_page(kva);
			}
		}
	} else {

		/*
		 * this allows the filesystem and VM cache to stay in sync if
		 * the VM page hasn't been modified...  After the page is
		 * removed -- it will be faulted back in from the filesystem
		 * cache.
		 */
		if (vnp->vnp_size & PAGE_MASK) {
			vm_page_t m;

			m = vm_page_lookup(object, trunc_page(vnp->vnp_size));
			if (m && (m->flags & PG_CLEAN)) {
				vm_object_lock(object);
				vm_object_page_remove(object,
					       vnp->vnp_size, vnp->vnp_size);
				vm_object_unlock(object);
			}
		}
	}
	vnp->vnp_size = (vm_offset_t) nsize;
	object->size = round_page(nsize);

	vm_object_deallocate(object);
}
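
/*
 * Worked example of the partial-page handling above, assuming
 * PAGE_SIZE == 4096: if a file shrinks to nsize == 10240, pages at and
 * beyond round_page(10240) == 12288 are discarded, and since
 * nsize & PAGE_MASK == 2048, bytes 2048..4095 of the page at
 * trunc_page(10240) == 8192 are zeroed so that stale data beyond EOF
 * cannot be seen through a mapping.
 */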

void
vnode_pager_umount(mp)
	register struct mount *mp;
{
	register vm_pager_t pager, npager;
	struct vnode *vp;

	pager = vnode_pager_list.tqh_first;
	while (pager) {

		/*
		 * Save the next pointer now since uncaching may terminate the
		 * object and render pager invalid
		 */
		vp = ((vn_pager_t) pager->pg_data)->vnp_vp;
		npager = pager->pg_list.tqe_next;
		if (mp == (struct mount *) 0 || vp->v_mount == mp)
			(void) vnode_pager_uncache(vp);
		pager = npager;
	}
}

/*
 * Remove vnode associated object from the object cache.
 *
 * Note: this routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 */
boolean_t
vnode_pager_uncache(vp)
	register struct vnode *vp;
{
	register vm_object_t object;
	boolean_t uncached, locked;
	vm_pager_t pager;

	/*
	 * Not a mapped vnode
	 */
	pager = (vm_pager_t) vp->v_vmdata;
	if (pager == NULL)
		return (TRUE);

	/*
	 * Unlock the vnode if it is currently locked.  We do this since
	 * uncaching the object may result in its destruction which may
	 * initiate paging activity which may necessitate locking the vnode.
	 */
	locked = VOP_ISLOCKED(vp);
	if (locked)
		VOP_UNLOCK(vp);

	/*
	 * Must use vm_object_lookup() as it actually removes the object from
	 * the cache list.
	 */
	object = vm_object_lookup(pager);
	if (object) {
		uncached = (object->ref_count <= 1);
		pager_cache(object, FALSE);
	} else
		uncached = TRUE;
	if (locked)
		VOP_LOCK(vp);
	return (uncached);
}

void
vnode_pager_freepage(m)
	vm_page_t m;
{
	PAGE_WAKEUP(m);
	vm_page_free(m);
}

/*
 * calculate the linear (byte) disk address of the specified virtual
 * file address
 */
vm_offset_t
vnode_pager_addr(vp, address)
	struct vnode *vp;
	vm_offset_t address;
{
	int rtaddress;
	int bsize;
	vm_offset_t block;
	struct vnode *rtvp;
	int err;
	int vblock, voffset;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	vblock = address / bsize;
	voffset = address % bsize;

	err = VOP_BMAP(vp, vblock, &rtvp, &block, 0);

	if (err)
		rtaddress = -1;
	else
		rtaddress = block * DEV_BSIZE + voffset;

	return rtaddress;
}
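
/*
 * Example of the arithmetic above, assuming f_iosize == 8192 and
 * DEV_BSIZE == 512: for file address 20480, vblock == 2 and
 * voffset == 4096.  If VOP_BMAP() maps logical block 2 to physical
 * block 100, the byte address on the device is
 * 100 * 512 + 4096 == 55296.
 */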

/*
 * interrupt routine for I/O completion
 */
void
vnode_pager_iodone(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	wakeup((caddr_t) bp);
	if (bp->b_flags & B_ASYNC) {
		vm_offset_t paddr;
		vm_page_t m;
		vm_object_t obj = 0;
		int i;
		int npages;

		paddr = (vm_offset_t) bp->b_data;
		if (bp->b_bufsize != bp->b_bcount)
			bzero(bp->b_data + bp->b_bcount,
			      bp->b_bufsize - bp->b_bcount);

		npages = (bp->b_bufsize + PAGE_SIZE - 1) / PAGE_SIZE;
/*
		printf("bcount: %d, bufsize: %d, npages: %d\n",
		       bp->b_bcount, bp->b_bufsize, npages);
*/
		for (i = 0; i < npages; i++) {
			m = PHYS_TO_VM_PAGE(pmap_kextract(paddr + i * PAGE_SIZE));
			if (m) {
				obj = m->object;
				m->flags |= PG_CLEAN;
				m->flags &= ~(PG_LAUNDRY|PG_FAKE);
				PAGE_WAKEUP(m);
			} else {
				panic("vnode_pager_iodone: page is gone!!!");
			}
		}
		pmap_qremove(paddr, npages);
		if (obj) {
			--obj->paging_in_progress;
			if (obj->paging_in_progress == 0)
				wakeup((caddr_t) obj);
		} else {
			panic("vnode_pager_iodone: object is gone???");
		}
		HOLDRELE(bp->b_vp);
		relpbuf(bp);
	}
}

/*
 * small block file system vnode pager input
 */
int
vnode_pager_input_smlfs(vnp, m)
	vn_pager_t vnp;
	vm_page_t m;
{
	int i;
	int s;
	vm_offset_t paging_offset;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t foff;
	vm_offset_t kva;
	int fileaddr;
	int block;
	vm_offset_t bsize;
	int error = 0;

	paging_offset = m->object->paging_offset;
	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;
	foff = m->offset + paging_offset;

	VOP_BMAP(vp, foff, &dp, 0, 0);

	kva = vm_pager_map_page(m);

	for (i = 0; i < PAGE_SIZE / bsize; i++) {

		/*
		 * calculate logical block and offset
		 */
		block = foff / bsize + i;
		s = splbio();
		while (bp = incore(vp, block)) {
			int amount;

			/*
			 * wait until the buffer is avail or gone
			 */
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				tsleep((caddr_t) bp, PVM, "vnwblk", 0);
				continue;
			}
			amount = bsize;
			if ((foff + bsize) > vnp->vnp_size)
				amount = vnp->vnp_size - foff;

			/*
			 * make sure that this page is in the buffer
			 */
			if ((amount > 0) && amount <= bp->b_bcount) {
				bp->b_flags |= B_BUSY;
				splx(s);

				/*
				 * copy the data from the buffer
				 */
				bcopy(bp->b_un.b_addr, (caddr_t) kva + i * bsize, amount);
				if (amount < bsize) {
					bzero((caddr_t) kva + i * bsize + amount, bsize - amount);
				}
				bp->b_flags &= ~B_BUSY;
				wakeup((caddr_t) bp);
				goto nextblock;
			}
			break;
		}
		splx(s);
		fileaddr = vnode_pager_addr(vp, foff + i * bsize);
		if (fileaddr != -1) {
			bp = getpbuf();
			VHOLD(vp);

			/* build a minimal buffer header */
			bp->b_flags = B_BUSY | B_READ | B_CALL;
			bp->b_iodone = vnode_pager_iodone;
			bp->b_proc = curproc;
			bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
			if (bp->b_rcred != NOCRED)
				crhold(bp->b_rcred);
			if (bp->b_wcred != NOCRED)
				crhold(bp->b_wcred);
			bp->b_un.b_addr = (caddr_t) kva + i * bsize;
			bp->b_blkno = fileaddr / DEV_BSIZE;
			bgetvp(dp, bp);
			bp->b_bcount = bsize;
			bp->b_bufsize = bsize;

			/* do the input */
			VOP_STRATEGY(bp);

			/* we definitely need to be at splbio here */
			s = splbio();
			while ((bp->b_flags & B_DONE) == 0) {
				tsleep((caddr_t) bp, PVM, "vnsrd", 0);
			}
			splx(s);
			if ((bp->b_flags & B_ERROR) != 0)
				error = EIO;

			/*
			 * free the buffer header back to the swap buffer pool
			 */
			relpbuf(bp);
			HOLDRELE(vp);
			if (error)
				break;
		} else {
			bzero((caddr_t) kva + i * bsize, bsize);
		}
nextblock:
		;
	}
	vm_pager_unmap_page(kva);
	if (error) {
		return VM_PAGER_FAIL;
	}
	pmap_clear_modify(VM_PAGE_TO_PHYS(m));
	m->flags |= PG_CLEAN;
	m->flags &= ~PG_LAUNDRY;
	return VM_PAGER_OK;
}

/*
 * old style vnode pager input routine
 */
int
vnode_pager_input_old(vnp, m)
	vn_pager_t vnp;
	vm_page_t m;
{
	int i;
	struct uio auio;
	struct iovec aiov;
	int error;
	int size;
	vm_offset_t foff;
	vm_offset_t kva;

	error = 0;
	foff = m->offset + m->object->paging_offset;

	/*
	 * Return failure if beyond current EOF
	 */
	if (foff >= vnp->vnp_size) {
		return VM_PAGER_BAD;
	} else {
		size = PAGE_SIZE;
		if (foff + size > vnp->vnp_size)
			size = vnp->vnp_size - foff;
		/*
		 * Allocate a kernel virtual address and initialize so that
		 * we can use VOP_READ/WRITE routines.
		 */
		kva = vm_pager_map_page(m);
		aiov.iov_base = (caddr_t) kva;
		aiov.iov_len = size;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = foff;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_READ;
		auio.uio_resid = size;
		auio.uio_procp = (struct proc *) 0;

		error = VOP_READ(vnp->vnp_vp, &auio, 0, curproc->p_ucred);
		if (!error) {
			register int count = size - auio.uio_resid;

			if (count == 0)
				error = EINVAL;
			else if (count != PAGE_SIZE)
				bzero((caddr_t) kva + count, PAGE_SIZE - count);
		}
		vm_pager_unmap_page(kva);
	}
	pmap_clear_modify(VM_PAGE_TO_PHYS(m));
	m->flags |= PG_CLEAN;
	m->flags &= ~PG_LAUNDRY;
	return error ? VM_PAGER_FAIL : VM_PAGER_OK;
}
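
/*
 * Aside (not part of the original code): the uio setup in
 * vnode_pager_input_old() above is roughly what the standard kernel
 * helper vn_rdwr() builds internally.  A sketch of the equivalent call,
 * using the same kva/size/foff values, would be:
 *
 *	int resid;
 *
 *	error = vn_rdwr(UIO_READ, vnp->vnp_vp, (caddr_t) kva, size,
 *	    (off_t) foff, UIO_SYSSPACE, 0, curproc->p_ucred, &resid,
 *	    curproc);
 *
 * The hand-rolled uio is used here presumably to avoid vn_rdwr()'s
 * additional vnode locking.
 */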

/*
 * generic vnode pager input routine
 */
int
vnode_pager_input(vnp, m, count, reqpage)
	register vn_pager_t vnp;
	vm_page_t *m;
	int count, reqpage;
{
	int i, j;
	vm_offset_t kva, foff;
	int size, sizea;
	struct proc *p = curproc;	/* XXX */
	vm_object_t object;
	vm_offset_t paging_offset;
	struct vnode *dp, *vp;
	int bsize;

	int first, last;
	int reqaddr, firstaddr;
	int block, offset;

	int nbp;
	struct buf *bp, *bpa;
	int counta;
	int s;
	int failflag;

	int errtype = 0;	/* 0 is file type otherwise vm type */
	int error = 0;

	object = m[reqpage]->object;	/* all vm_page_t items are in same object */
	paging_offset = object->paging_offset;

	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	/* get the UNDERLYING device for the file with VOP_BMAP() */

	/*
	 * originally, we did not check for an error return value -- assuming
	 * an fs always has a bmap entry point -- that assumption is wrong!!!
	 */
	foff = m[reqpage]->offset + paging_offset;

	/*
	 * if we can't bmap, use old VOP code
	 */
	if (VOP_BMAP(vp, foff, &dp, 0, 0)) {
		for (i = 0; i < count; i++) {
			if (i != reqpage) {
				vnode_pager_freepage(m[i]);
			}
		}
		return vnode_pager_input_old(vnp, m[reqpage]);

		/*
		 * if the blocksize is smaller than a page size, then use
		 * special small filesystem code.  NFS sometimes has a small
		 * blocksize, but it can handle large reads itself.
		 */
	} else if ((PAGE_SIZE / bsize) > 1 &&
		   (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) {

		for (i = 0; i < count; i++) {
			if (i != reqpage) {
				vnode_pager_freepage(m[i]);
			}
		}
		return vnode_pager_input_smlfs(vnp, m[reqpage]);
	}
	/*
	 * here on direct device I/O
	 */
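
	/*
	 * Summary of the three read paths dispatched above: filesystems
	 * without a usable VOP_BMAP() fall back to vnode_pager_input_old()
	 * (a plain VOP_READ through a uio); filesystems with blocks smaller
	 * than a page (except NFS, which handles large reads itself) go
	 * through vnode_pager_input_smlfs() one block at a time; everything
	 * else falls through to the code below, which reads directly from
	 * the underlying device into the pages.
	 */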

	/*
	 * This pathetic hack gets data from the buffer cache, if it's there.
	 * I believe that this is not really necessary, and the ends can be
	 * gotten by defaulting to the normal vfs read behavior, but this
	 * might be more efficient, because it will NOT invoke read-aheads
	 * and one of the purposes of this code is to bypass the buffer cache
	 * and keep from flushing it by reading in a program.
	 */

	/*
	 * calculate logical block and offset
	 */
	block = foff / bsize;
	offset = foff % bsize;
	s = splbio();

	/*
	 * if we have a buffer in core, then try to use it
	 */
	while (bp = incore(vp, block)) {
		int amount;

		/*
		 * wait until the buffer is avail or gone
		 */
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t) bp, PVM, "vnwblk", 0);
			continue;
		}
		amount = PAGE_SIZE;
		if ((foff + amount) > vnp->vnp_size)
			amount = vnp->vnp_size - foff;

		/*
		 * make sure that this page is in the buffer
		 */
		if ((amount > 0) && (offset + amount) <= bp->b_bcount) {
			bp->b_flags |= B_BUSY;
			splx(s);
			kva = kmem_alloc_pageable(pager_map, PAGE_SIZE);

			/*
			 * map the requested page
			 */
			pmap_qenter(kva, &m[reqpage], 1);

			/*
			 * copy the data from the buffer
			 */
			bcopy(bp->b_un.b_addr + offset, (caddr_t) kva, amount);
			if (amount < PAGE_SIZE) {
				bzero((caddr_t) kva + amount, PAGE_SIZE - amount);
			}

			/*
			 * unmap the page and free the kva
			 */
			pmap_qremove(kva, 1);
			kmem_free_wakeup(pager_map, kva, PAGE_SIZE);

			/*
			 * release the buffer back to the block subsystem
			 */
			bp->b_flags &= ~B_BUSY;
			wakeup((caddr_t) bp);

			/*
			 * we did not have to do any work to get the requested
			 * page, the read behind/ahead does not justify a read
			 */
			for (i = 0; i < count; i++) {
				if (i != reqpage) {
					vnode_pager_freepage(m[i]);
				}
			}
			m[0] = m[reqpage];
			count = 1;
			reqpage = 0;

			/*
			 * sorry for the goto
			 */
			goto finishup;
		}

		/*
		 * buffer is nowhere to be found, read from the disk
		 */
		break;
	}
	splx(s);

	reqaddr = vnode_pager_addr(vp, foff);
	s = splbio();

	/*
	 * Make sure that our I/O request is contiguous.  Scan backward and
	 * stop for the first discontiguous entry or stop for a page being in
	 * buffer cache.
	 */
	failflag = 0;
	first = reqpage;
	for (i = reqpage - 1; i >= 0; --i) {
		if (failflag ||
		    incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) ||
		    (vnode_pager_addr(vp, m[i]->offset + paging_offset))
		    != reqaddr + (i - reqpage) * PAGE_SIZE) {
			vnode_pager_freepage(m[i]);
			failflag = 1;
		} else {
			first = i;
		}
	}

	/*
	 * Scan forward and stop for the first non-contiguous entry or stop
	 * for a page being in buffer cache.
	 */
	failflag = 0;
	last = reqpage + 1;
	for (i = reqpage + 1; i < count; i++) {
		if (failflag ||
		    incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) ||
		    (vnode_pager_addr(vp, m[i]->offset + paging_offset))
		    != reqaddr + (i - reqpage) * PAGE_SIZE) {
			vnode_pager_freepage(m[i]);
			failflag = 1;
		} else {
			last = i + 1;
		}
	}
	splx(s);
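
	/*
	 * Example of the contiguity scans above, assuming PAGE_SIZE == 4096:
	 * with count == 5 and reqpage == 2, a neighbor m[i] is kept only if
	 * its disk address is exactly reqaddr + (i - 2) * 4096 and its block
	 * is not already in the buffer cache.  The first miss in either
	 * direction frees that page and all pages beyond it, leaving one
	 * physically contiguous run [first, last) around the requested page.
	 */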

	/*
	 * the first and last page have been calculated now, move input pages
	 * to be zero based...
	 */
	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i - first] = m[i];
		}
		count -= first;
		reqpage -= first;
	}

	/*
	 * calculate the file virtual address for the transfer
	 */
	foff = m[0]->offset + paging_offset;

	/*
	 * and get the disk physical address (in bytes)
	 */
	firstaddr = vnode_pager_addr(vp, foff);

	/*
	 * calculate the size of the transfer
	 */
	size = count * PAGE_SIZE;
	if ((foff + size) > vnp->vnp_size)
		size = vnp->vnp_size - foff;

	/*
	 * round up physical size for real devices
	 */
	if (dp->v_type == VBLK || dp->v_type == VCHR)
		size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
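
	/*
	 * Example of the rounding above, assuming DEV_BSIZE == 512: a
	 * transfer size of 6000 bytes becomes (6000 + 511) & ~511 == 6144,
	 * since raw block devices transfer whole sectors only.
	 */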

	counta = 0;
	if (count * PAGE_SIZE > bsize)
		counta = (count - reqpage) - 1;
	bpa = 0;
	sizea = 0;
	if (counta) {
		bpa = getpbuf();
		count -= counta;
		sizea = size - count * PAGE_SIZE;
		size = count * PAGE_SIZE;
	}
	bp = getpbuf();
	kva = (vm_offset_t) bp->b_data;

	/*
	 * and map the pages to be read into the kva
	 */
	pmap_qenter(kva, m, count);
	VHOLD(vp);

	/* build a minimal buffer header */
	bp->b_flags = B_BUSY | B_READ | B_CALL;
	bp->b_iodone = vnode_pager_iodone;
	/* B_PHYS is not set, but it is nice to fill this in */
	bp->b_proc = curproc;
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if (bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_blkno = firstaddr / DEV_BSIZE;
	bgetvp(dp, bp);
	bp->b_bcount = size;
	bp->b_bufsize = size;

	/* do the input */
	VOP_STRATEGY(bp);
	if (counta) {
		for (i = 0; i < counta; i++) {
			vm_page_deactivate(m[count + i]);
		}
		pmap_qenter((vm_offset_t) bpa->b_data, &m[count], counta);
		++m[count]->object->paging_in_progress;
		VHOLD(vp);
		bpa->b_flags = B_BUSY | B_READ | B_CALL | B_ASYNC;
		bpa->b_iodone = vnode_pager_iodone;
		/* B_PHYS is not set, but it is nice to fill this in */
		bpa->b_proc = curproc;
		bpa->b_rcred = bpa->b_wcred = bpa->b_proc->p_ucred;
		if (bpa->b_rcred != NOCRED)
			crhold(bpa->b_rcred);
		if (bpa->b_wcred != NOCRED)
			crhold(bpa->b_wcred);
		bpa->b_blkno = (firstaddr + count * PAGE_SIZE) / DEV_BSIZE;
		bgetvp(dp, bpa);
		bpa->b_bcount = sizea;
		bpa->b_bufsize = counta * PAGE_SIZE;

		VOP_STRATEGY(bpa);
	}
	s = splbio();
	/* we definitely need to be at splbio here */

	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t) bp, PVM, "vnread", 0);
	}
	splx(s);
	if ((bp->b_flags & B_ERROR) != 0)
		error = EIO;

	if (!error) {
		if (size != count * PAGE_SIZE)
			bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
	}
	pmap_qremove(kva, count);

	/*
	 * free the buffer header back to the swap buffer pool
	 */
	relpbuf(bp);
	HOLDRELE(vp);

finishup:
	for (i = 0; i < count; i++) {
		m[i]->flags |= PG_CLEAN;
		m[i]->flags &= ~PG_LAUNDRY;
		if (i != reqpage) {

			/*
			 * whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Empirical
			 * results show that deactivating pages is best.
			 */

			/*
			 * just in case someone was asking for this page we
			 * now tell them that it is ok to use
			 */
			if (!error) {
				vm_page_deactivate(m[i]);
				PAGE_WAKEUP(m[i]);
				m[i]->flags &= ~PG_FAKE;
			} else {
				vnode_pager_freepage(m[i]);
			}
		}
	}
	if (error) {
		printf("vnode pager read error: %d\n", error);
	}
	if (errtype)
		return error;
	return (error ? VM_PAGER_FAIL : VM_PAGER_OK);
}

/*
 * old-style vnode pager output routine
 */
int
vnode_pager_output_old(vnp, m)
	register vn_pager_t vnp;
	vm_page_t m;
{
	vm_offset_t foff;
	vm_offset_t kva;
	vm_offset_t size;
	struct iovec aiov;
	struct uio auio;
	struct vnode *vp;
	int error;

	vp = vnp->vnp_vp;
	foff = m->offset + m->object->paging_offset;

	/*
	 * Return failure if beyond current EOF
	 */
	if (foff >= vnp->vnp_size) {
		return VM_PAGER_BAD;
	} else {
		size = PAGE_SIZE;
		if (foff + size > vnp->vnp_size)
			size = vnp->vnp_size - foff;
		/*
		 * Allocate a kernel virtual address and initialize so that
		 * we can use VOP_WRITE routines.
		 */
		kva = vm_pager_map_page(m);
		aiov.iov_base = (caddr_t) kva;
		aiov.iov_len = size;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = foff;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_WRITE;
		auio.uio_resid = size;
		auio.uio_procp = (struct proc *) 0;

		error = VOP_WRITE(vp, &auio, 0, curproc->p_ucred);

		if (!error) {
			if ((size - auio.uio_resid) == 0) {
				error = EINVAL;
			}
		}
		vm_pager_unmap_page(kva);
		return error ? VM_PAGER_FAIL : VM_PAGER_OK;
	}
}

/*
 * vnode pager output on a small-block file system
 */
int
vnode_pager_output_smlfs(vnp, m)
	vn_pager_t vnp;
	vm_page_t m;
{
	int i;
	int s;
	vm_offset_t paging_offset;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t foff;
	vm_offset_t kva;
	int fileaddr;
	int block;
	vm_offset_t bsize;
	int error = 0;

	paging_offset = m->object->paging_offset;
	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;
	foff = m->offset + paging_offset;

	VOP_BMAP(vp, foff, &dp, 0, 0);
	kva = vm_pager_map_page(m);
	for (i = 0; !error && i < (PAGE_SIZE / bsize); i++) {

		/*
		 * calculate the disk byte address of this block
		 */
		fileaddr = vnode_pager_addr(vp, foff + i * bsize);
		if (fileaddr != -1) {
			s = splbio();
			if (bp = incore(vp, (foff / bsize) + i)) {
				bp = getblk(vp, (foff / bsize) + i, bp->b_bufsize, 0, 0);
				bp->b_flags |= B_INVAL;
				brelse(bp);
			}
			splx(s);

			bp = getpbuf();
			VHOLD(vp);

			/* build a minimal buffer header */
			bp->b_flags = B_BUSY | B_CALL | B_WRITE;
			bp->b_iodone = vnode_pager_iodone;
			bp->b_proc = curproc;
			bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
			if (bp->b_rcred != NOCRED)
				crhold(bp->b_rcred);
			if (bp->b_wcred != NOCRED)
				crhold(bp->b_wcred);
			bp->b_un.b_addr = (caddr_t) kva + i * bsize;
			bp->b_blkno = fileaddr / DEV_BSIZE;
			bgetvp(dp, bp);
			++dp->v_numoutput;
			/* for NFS */
			bp->b_dirtyoff = 0;
			bp->b_dirtyend = bsize;
			bp->b_bcount = bsize;
			bp->b_bufsize = bsize;

			/* do the output */
			VOP_STRATEGY(bp);

			/* we definitely need to be at splbio here */
			s = splbio();
			while ((bp->b_flags & B_DONE) == 0) {
				tsleep((caddr_t) bp, PVM, "vnswrt", 0);
			}
			splx(s);
			if ((bp->b_flags & B_ERROR) != 0)
				error = EIO;

			/*
			 * free the buffer header back to the swap buffer pool
			 */
			relpbuf(bp);
			HOLDRELE(vp);
		}
	}
	vm_pager_unmap_page(kva);
	if (error)
		return VM_PAGER_FAIL;
	else
		return VM_PAGER_OK;
}

/*
 * generic vnode pager output routine
 */
int
vnode_pager_output(vnp, m, count, rtvals)
	vn_pager_t vnp;
	vm_page_t *m;
	int count;
	int *rtvals;
{
	int i, j;
	vm_offset_t kva, foff;
	int size;
	struct proc *p = curproc;	/* XXX */
	vm_object_t object;
	vm_offset_t paging_offset;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t reqaddr;
	int bsize;
	int s;

	int error = 0;

retryoutput:
	object = m[0]->object;	/* all vm_page_t items are in same object */
	paging_offset = object->paging_offset;

	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	/*
	 * if the filesystem does not have a bmap, then use the old code
	 */
	if (VOP_BMAP(vp, m[0]->offset + paging_offset, &dp, 0, 0)) {

		rtvals[0] = vnode_pager_output_old(vnp, m[0]);

		pmap_clear_modify(VM_PAGE_TO_PHYS(m[0]));
		m[0]->flags |= PG_CLEAN;
		m[0]->flags &= ~PG_LAUNDRY;
		return rtvals[0];
	}

	/*
	 * if the filesystem has a small blocksize, then use the small block
	 * filesystem output code
	 */
	if ((bsize < PAGE_SIZE) &&
	    (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) {

		for (i = 0; i < count; i++) {
			rtvals[i] = vnode_pager_output_smlfs(vnp, m[i]);
			if (rtvals[i] == VM_PAGER_OK) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
			}
		}
		return rtvals[0];
	}

	for (i = 0; i < count; i++) {
		foff = m[i]->offset + paging_offset;
		if (foff >= vnp->vnp_size) {
			for (j = i; j < count; j++)
				rtvals[j] = VM_PAGER_BAD;
			count = i;
			break;
		}
	}
	if (count == 0) {
		return rtvals[0];
	}
	foff = m[0]->offset + paging_offset;
	reqaddr = vnode_pager_addr(vp, foff);

	/*
	 * Scan forward and stop for the first non-contiguous entry or stop
	 * for a page being in buffer cache.
	 */
	for (i = 1; i < count; i++) {
		if (vnode_pager_addr(vp, m[i]->offset + paging_offset)
		    != reqaddr + i * PAGE_SIZE) {
			count = i;
			break;
		}
	}

	/*
	 * calculate the size of the transfer
	 */
	size = count * PAGE_SIZE;
	if ((foff + size) > vnp->vnp_size)
		size = vnp->vnp_size - foff;

	/*
	 * round up physical size for real devices
	 */
	if (dp->v_type == VBLK || dp->v_type == VCHR)
		size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);

	bp = getpbuf();
	kva = (vm_offset_t) bp->b_data;

	/*
	 * and map the pages to be written into the kva
	 */
	pmap_qenter(kva, m, count);
/*
	printf("vnode: writing foff: %d, devoff: %d, size: %d\n",
	       foff, reqaddr, size);
*/

	/*
	 * next invalidate the incore vfs_bio data
	 */
	for (i = 0; i < count; i++) {
		int filblock = (foff + i * PAGE_SIZE) / bsize;
		struct buf *fbp;

		s = splbio();
		if (fbp = incore(vp, filblock)) {
			fbp = getblk(vp, filblock, fbp->b_bufsize, 0, 0);
			if (fbp->b_flags & B_DELWRI) {
				if (fbp->b_bufsize <= PAGE_SIZE)
					fbp->b_flags &= ~B_DELWRI;
				else {
					bwrite(fbp);
					fbp = getblk(vp, filblock,
						     fbp->b_bufsize, 0, 0);
				}
			}
			fbp->b_flags |= B_INVAL;
			brelse(fbp);
		}
		splx(s);
	}

	VHOLD(vp);
	/* build a minimal buffer header */
	bp->b_flags = B_BUSY | B_WRITE | B_CALL;
	bp->b_iodone = vnode_pager_iodone;
	/* B_PHYS is not set, but it is nice to fill this in */
	bp->b_proc = curproc;
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;

	if (bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_blkno = reqaddr / DEV_BSIZE;
	bgetvp(dp, bp);
	++dp->v_numoutput;

	/* for NFS */
	bp->b_dirtyoff = 0;
	bp->b_dirtyend = size;

	bp->b_bcount = size;
	bp->b_bufsize = size;

	/* do the output */
	VOP_STRATEGY(bp);

	s = splbio();

	/* we definitely need to be at splbio here */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t) bp, PVM, "vnwrite", 0);
	}
	splx(s);

	if ((bp->b_flags & B_ERROR) != 0)
		error = EIO;

	pmap_qremove(kva, count);

	/*
	 * free the buffer header back to the swap buffer pool
	 */
	relpbuf(bp);
	HOLDRELE(vp);

	if (!error) {
		for (i = 0; i < count; i++) {
			pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
			m[i]->flags |= PG_CLEAN;
			m[i]->flags &= ~PG_LAUNDRY;
			rtvals[i] = VM_PAGER_OK;
		}
	} else if (count != 1) {
		error = 0;
		count = 1;
		goto retryoutput;
	}
	if (error) {
		printf("vnode pager write error: %d\n", error);
	}
	return (error ? VM_PAGER_FAIL : VM_PAGER_OK);
}