1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * VM - generic vnode page mapping interfaces. 30 * 31 * Mechanism to provide temporary mappings to vnode pages. 32 * The typical use would be to copy/access file data. 33 */ 34 35 #include <sys/types.h> 36 #include <sys/t_lock.h> 37 #include <sys/param.h> 38 #include <sys/sysmacros.h> 39 #include <sys/buf.h> 40 #include <sys/systm.h> 41 #include <sys/vnode.h> 42 #include <sys/mman.h> 43 #include <sys/errno.h> 44 #include <sys/cred.h> 45 #include <sys/kmem.h> 46 #include <sys/vtrace.h> 47 #include <sys/cmn_err.h> 48 #include <sys/debug.h> 49 #include <sys/thread.h> 50 #include <sys/dumphdr.h> 51 #include <sys/bitmap.h> 52 #include <sys/lgrp.h> 53 54 #include <vm/seg_kmem.h> 55 #include <vm/hat.h> 56 #include <vm/as.h> 57 #include <vm/seg.h> 58 #include <vm/seg_kpm.h> 59 #include <vm/seg_map.h> 60 #include <vm/page.h> 61 #include <vm/pvn.h> 62 #include <vm/rm.h> 63 #include <vm/vpm.h> 64 65 /* 66 * Needs to be enabled by each platform. 67 */ 68 int vpm_enable = 0; 69 70 #ifdef SEGKPM_SUPPORT 71 72 73 int vpm_cache_enable = 1; 74 long vpm_cache_percent = 12; 75 long vpm_cache_size; 76 int vpm_nfreelist = 0; 77 int vpmd_freemsk = 0; 78 79 #define VPM_S_PAD 64 80 union vpm_cpu { 81 struct { 82 int vcpu_free_ndx; 83 ulong_t vcpu_hits; 84 ulong_t vcpu_misses; 85 } vcpu; 86 char vpm_pad[VPM_S_PAD]; 87 }; 88 static union vpm_cpu *vpmd_cpu; 89 90 #define vfree_ndx vcpu.vcpu_free_ndx 91 92 int vpm_cachemode = VPMCACHE_LRU; 93 94 #define PPMTX(pp) (&(pp)->p_ilock) 95 96 static struct vpmap *vpmd_vpmap; /* list of vpmap structs preallocated */ 97 static struct vpmfree *vpmd_free; 98 #define VPMAPMTX(vpm) (&vpm->vpm_mtx) 99 #define VPMAP2VMF(vpm) (&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk]) 100 #define VPMAP2VMF_NDX(vpm) (ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk) 101 #define VPMP(id) (&vpmd_vpmap[id - 1]) 102 #define VPMID(vpm) (uint_t)((vpm - vpmd_vpmap) + 1) 103 104 105 #ifdef DEBUG 106 107 struct vpm_debug { 108 int vpmd_steals; 109 int vpmd_contend; 110 int vpmd_prevpagelocked; 111 int vpmd_getpagefailed; 112 int vpmd_zerostart; 113 int vpmd_emptyfreelist; 114 int vpmd_nofreevpms; 115 } vpm_debug; 116 117 #define VPM_DEBUG(x) ((vpm_debug.x)++) 118 119 int steals; 120 int steals_mtbf = 7; 121 int contend; 122 int contend_mtbf = 127; 123 124 #define VPM_MTBF(v, f) (((++(v)) & (f)) != (f)) 125 126 #else /* DEBUG */ 127 128 #define VPM_MTBF(v, f) (1) 129 #define VPM_DEBUG(x) /* nothing */ 130 131 #endif 132 133 /* 134 * The vpm cache. 135 * 136 * The main purpose of having a cache here is to speed up page_lookup() 137 * operations and also provide an LRU(default) behaviour of file pages. The 138 * page_lookup() operation tends to be expensive if a page has to be 139 * reclaimed from the system page cache("cachelist"). Once we speed up the 140 * page_lookup()->page_reclaim() path then there there should be no need for 141 * this cache. The system page cache(cachelist) should effectively serve the 142 * purpose of caching file pages. 143 * 144 * This cache is very similar to segmap's smap cache. Each page in the 145 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no 146 * hash table. The page_t has a reference to the vpmap_t when cached. For a 147 * given vnode, offset the page is found by means of a page_lookup() operation. 148 * Any page which has a mapping(i.e when cached) will not be in the 149 * system 'cachelist'. Hence the page_lookup() will not have to do a 150 * page_reclaim(). That is how the cache serves to speed up page_lookup() 151 * operations. 152 * 153 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system. 154 */ 155 156 void 157 vpm_init() 158 { 159 long npages; 160 struct vpmap *vpm; 161 struct vpmfree *vpmflp; 162 int i, ndx; 163 extern void prefetch_smap_w(void *); 164 165 if (!vpm_cache_enable) { 166 return; 167 } 168 169 /* 170 * Set the size of the cache. 171 */ 172 vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100); 173 if (vpm_cache_size < VPMAP_MINCACHE) { 174 vpm_cache_size = VPMAP_MINCACHE; 175 } 176 177 /* 178 * Number of freelists. 179 */ 180 if (vpm_nfreelist == 0) { 181 vpm_nfreelist = max_ncpus; 182 } else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) { 183 cmn_err(CE_WARN, "vpmap create : number of freelist " 184 "vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus); 185 vpm_nfreelist = 2 * max_ncpus; 186 } 187 188 /* 189 * Round it up to the next power of 2 190 */ 191 if (vpm_nfreelist & (vpm_nfreelist - 1)) { 192 vpm_nfreelist = 1 << (highbit(vpm_nfreelist)); 193 } 194 vpmd_freemsk = vpm_nfreelist - 1; 195 196 /* 197 * Use a per cpu rotor index to spread the allocations evenly 198 * across the available vpm freelists. 199 */ 200 vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP); 201 ndx = 0; 202 for (i = 0; i < max_ncpus; i++) { 203 204 vpmd_cpu[i].vfree_ndx = ndx; 205 ndx = (ndx + 1) & vpmd_freemsk; 206 } 207 208 /* 209 * Allocate and initialize the freelist. 210 */ 211 vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree), 212 KM_SLEEP); 213 for (i = 0; i < vpm_nfreelist; i++) { 214 215 vpmflp = &vpmd_free[i]; 216 /* 217 * Set up initial queue pointers. They will get flipped 218 * back and forth. 219 */ 220 vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ]; 221 vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ]; 222 } 223 224 npages = mmu_btop(vpm_cache_size); 225 226 227 /* 228 * Allocate and initialize the vpmap structs. 229 */ 230 vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP); 231 for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) { 232 struct vpmfree *vpmflp; 233 union vpm_freeq *releq; 234 struct vpmap *vpmapf; 235 236 /* 237 * Use prefetch as we have to walk thru a large number of 238 * these data structures. We just use the smap's prefetch 239 * routine as it does the same. This should work fine 240 * for x64(this needs to be modified when enabled on sparc). 241 */ 242 prefetch_smap_w((void *)vpm); 243 244 vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm); 245 246 vpmflp = VPMAP2VMF(vpm); 247 releq = vpmflp->vpm_releq; 248 249 vpmapf = releq->vpmq_free; 250 if (vpmapf == NULL) { 251 releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm; 252 } else { 253 vpm->vpm_next = vpmapf; 254 vpm->vpm_prev = vpmapf->vpm_prev; 255 vpmapf->vpm_prev = vpm; 256 vpm->vpm_prev->vpm_next = vpm; 257 releq->vpmq_free = vpm->vpm_next; 258 } 259 260 /* 261 * Indicate that the vpmap is on the releq at start 262 */ 263 vpm->vpm_ndxflg = VPMRELEQ; 264 } 265 } 266 267 268 /* 269 * unhooks vpm from the freelist if it is still on the freelist. 270 */ 271 #define VPMAP_RMFREELIST(vpm) \ 272 { \ 273 if (vpm->vpm_next != NULL) { \ 274 union vpm_freeq *freeq; \ 275 struct vpmfree *vpmflp; \ 276 vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \ 277 freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \ 278 mutex_enter(&freeq->vpmq_mtx); \ 279 if (freeq->vpmq_free != vpm) { \ 280 vpm->vpm_prev->vpm_next = vpm->vpm_next; \ 281 vpm->vpm_next->vpm_prev = vpm->vpm_prev; \ 282 } else if (vpm == vpm->vpm_next) { \ 283 freeq->vpmq_free = NULL; \ 284 } else { \ 285 freeq->vpmq_free = vpm->vpm_next; \ 286 vpm->vpm_prev->vpm_next = vpm->vpm_next; \ 287 vpm->vpm_next->vpm_prev = vpm->vpm_prev; \ 288 } \ 289 mutex_exit(&freeq->vpmq_mtx); \ 290 vpm->vpm_next = vpm->vpm_prev = NULL; \ 291 } \ 292 } 293 294 static int 295 get_freelndx(int mode) 296 { 297 int ndx; 298 299 ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk; 300 switch (mode) { 301 302 case VPMCACHE_LRU: 303 default: 304 vpmd_cpu[CPU->cpu_seqid].vfree_ndx++; 305 break; 306 } 307 return (ndx); 308 } 309 310 311 /* 312 * Find one vpmap structure from the free lists and use it for the newpage. 313 * The previous page it cached is dissociated and released. The page_t's 314 * p_vpmref is cleared only when the vpm it is pointing to is locked(or 315 * for AMD64 when the page is exclusively locked in page_unload. That is 316 * because the p_vpmref is treated as mapping). 317 * 318 * The page's p_vpmref is set when the page is 319 * locked(at least SHARED locked). 320 */ 321 static struct vpmap * 322 get_free_vpmap(page_t *newpage) 323 { 324 struct vpmfree *vpmflp; 325 kmutex_t *vmtx; 326 struct vpmap *vpm, *first; 327 union vpm_freeq *allocq, *releq; 328 page_t *pp = NULL; 329 int end_ndx, page_locked = 0; 330 int free_ndx; 331 332 /* 333 * get the freelist bin index. 334 */ 335 free_ndx = get_freelndx(vpm_cachemode); 336 337 end_ndx = free_ndx; 338 vpmflp = &vpmd_free[free_ndx]; 339 340 retry_queue: 341 allocq = vpmflp->vpm_allocq; 342 mutex_enter(&allocq->vpmq_mtx); 343 344 if ((vpm = allocq->vpmq_free) == NULL) { 345 346 skip_queue: 347 /* 348 * The alloc list is empty or this queue is being skipped; 349 * first see if the allocq toggled. 350 */ 351 if (vpmflp->vpm_allocq != allocq) { 352 /* queue changed */ 353 mutex_exit(&allocq->vpmq_mtx); 354 goto retry_queue; 355 } 356 releq = vpmflp->vpm_releq; 357 if (!mutex_tryenter(&releq->vpmq_mtx)) { 358 /* cannot get releq; a free vpmap may be there now */ 359 mutex_exit(&allocq->vpmq_mtx); 360 361 /* 362 * This loop could spin forever if this thread has 363 * higher priority than the thread that is holding 364 * releq->vpmq_mtx. In order to force the other thread 365 * to run, we'll lock/unlock the mutex which is safe 366 * since we just unlocked the allocq mutex. 367 */ 368 mutex_enter(&releq->vpmq_mtx); 369 mutex_exit(&releq->vpmq_mtx); 370 goto retry_queue; 371 } 372 if (releq->vpmq_free == NULL) { 373 VPM_DEBUG(vpmd_emptyfreelist); 374 /* 375 * This freelist is empty. 376 * This should not happen unless clients 377 * are failing to release the vpmap after 378 * accessing the data. Before resorting 379 * to sleeping, try the next list of the same color. 380 */ 381 free_ndx = (free_ndx + 1) & vpmd_freemsk; 382 if (free_ndx != end_ndx) { 383 mutex_exit(&releq->vpmq_mtx); 384 mutex_exit(&allocq->vpmq_mtx); 385 vpmflp = &vpmd_free[free_ndx]; 386 goto retry_queue; 387 } 388 /* 389 * Tried all freelists. 390 * wait on this list and hope something gets freed. 391 */ 392 vpmflp->vpm_want++; 393 mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx); 394 cv_wait(&vpmflp->vpm_free_cv, 395 &vpmflp->vpm_freeq[0].vpmq_mtx); 396 vpmflp->vpm_want--; 397 mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx); 398 vpmflp = &vpmd_free[free_ndx]; 399 VPM_DEBUG(vpmd_nofreevpms); 400 goto retry_queue; 401 } else { 402 /* 403 * Something on the rele queue; flip the alloc 404 * and rele queues and retry. 405 */ 406 vpmflp->vpm_allocq = releq; 407 vpmflp->vpm_releq = allocq; 408 mutex_exit(&allocq->vpmq_mtx); 409 mutex_exit(&releq->vpmq_mtx); 410 if (page_locked) { 411 delay(hz >> 2); 412 page_locked = 0; 413 } 414 goto retry_queue; 415 } 416 } else { 417 int gotnewvpm; 418 kmutex_t *pmtx; 419 uint_t vpmref; 420 421 /* 422 * Fastpath the case we get the vpmap mutex 423 * on the first try. 424 */ 425 first = vpm; 426 next_vpmap: 427 vmtx = VPMAPMTX(vpm); 428 if (!mutex_tryenter(vmtx)) { 429 /* 430 * Another thread is trying to reclaim this slot. 431 * Skip to the next queue or vpmap. 432 */ 433 if ((vpm = vpm->vpm_next) == first) { 434 goto skip_queue; 435 } else { 436 goto next_vpmap; 437 } 438 } 439 440 /* 441 * Assign this vpm to the newpage. 442 */ 443 pmtx = PPMTX(newpage); 444 gotnewvpm = 0; 445 mutex_enter(pmtx); 446 447 /* 448 * Check if some other thread already assigned a vpm to 449 * this page. 450 */ 451 if ((vpmref = newpage->p_vpmref) == 0) { 452 newpage->p_vpmref = VPMID(vpm); 453 gotnewvpm = 1; 454 } else { 455 VPM_DEBUG(vpmd_contend); 456 mutex_exit(vmtx); 457 } 458 mutex_exit(pmtx); 459 460 if (gotnewvpm) { 461 462 /* 463 * At this point, we've selected the vpm. Remove vpm 464 * from its freelist. If vpm is the first one in 465 * the freelist, update the head of the freelist. 466 */ 467 if (first == vpm) { 468 ASSERT(first == allocq->vpmq_free); 469 allocq->vpmq_free = vpm->vpm_next; 470 } 471 472 /* 473 * If the head of the freelist still points to vpm, 474 * then there are no more free vpmaps in that list. 475 */ 476 if (allocq->vpmq_free == vpm) 477 /* 478 * Took the last one 479 */ 480 allocq->vpmq_free = NULL; 481 else { 482 vpm->vpm_prev->vpm_next = vpm->vpm_next; 483 vpm->vpm_next->vpm_prev = vpm->vpm_prev; 484 } 485 mutex_exit(&allocq->vpmq_mtx); 486 vpm->vpm_prev = vpm->vpm_next = NULL; 487 488 /* 489 * Disassociate the previous page. On x64 systems 490 * p_vpmref is used as a mapping reference to the page. 491 */ 492 if ((pp = vpm->vpm_pp) != NULL && 493 vpm->vpm_vp == pp->p_vnode && 494 vpm->vpm_off == pp->p_offset) { 495 496 pmtx = PPMTX(pp); 497 if (page_trylock(pp, SE_SHARED)) { 498 /* 499 * Now verify that it is the correct 500 * page. If not someone else stole it, 501 * so just unlock it and leave. 502 */ 503 mutex_enter(pmtx); 504 if (PP_ISFREE(pp) || 505 vpm->vpm_vp != pp->p_vnode || 506 vpm->vpm_off != pp->p_offset || 507 pp->p_vpmref != VPMID(vpm)) { 508 mutex_exit(pmtx); 509 510 page_unlock(pp); 511 } else { 512 /* 513 * Release the page. 514 */ 515 pp->p_vpmref = 0; 516 mutex_exit(pmtx); 517 hat_kpm_mapout(pp, 0, 518 hat_kpm_page2va(pp, 1)); 519 (void) page_release(pp, 1); 520 } 521 } else { 522 /* 523 * If the page cannot be locked, just 524 * clear the p_vpmref and go. 525 */ 526 mutex_enter(pmtx); 527 if (pp->p_vpmref == VPMID(vpm)) { 528 pp->p_vpmref = 0; 529 } 530 mutex_exit(pmtx); 531 VPM_DEBUG(vpmd_prevpagelocked); 532 } 533 } 534 535 /* 536 * Setup vpm to point to the new page. 537 */ 538 vpm->vpm_pp = newpage; 539 vpm->vpm_vp = newpage->p_vnode; 540 vpm->vpm_off = newpage->p_offset; 541 542 } else { 543 int steal = !VPM_MTBF(steals, steals_mtbf); 544 /* 545 * Page already has a vpm assigned just use that. 546 * Grab the vpm mutex and verify that it is still 547 * the correct one. The pp->p_vpmref should not change 548 * once we have the vpm mutex and the page lock. 549 */ 550 mutex_exit(&allocq->vpmq_mtx); 551 vpm = VPMP(vpmref); 552 vmtx = VPMAPMTX(vpm); 553 mutex_enter(vmtx); 554 if ((steal && vpm->vpm_refcnt == 0) || 555 vpm->vpm_pp != newpage) { 556 /* 557 * The vpm got stolen, retry. 558 * clear the p_vpmref. 559 */ 560 pmtx = PPMTX(newpage); 561 mutex_enter(pmtx); 562 if (newpage->p_vpmref == vpmref) { 563 newpage->p_vpmref = 0; 564 } 565 mutex_exit(pmtx); 566 567 mutex_exit(vmtx); 568 VPM_DEBUG(vpmd_steals); 569 goto retry_queue; 570 } else if (vpm->vpm_refcnt == 0) { 571 /* 572 * Remove it from the free list if it 573 * exists there. 574 */ 575 VPMAP_RMFREELIST(vpm); 576 } 577 } 578 return (vpm); 579 } 580 } 581 582 static void 583 free_vpmap(struct vpmap *vpm) 584 { 585 struct vpmfree *vpmflp; 586 struct vpmap *vpmfreelist; 587 union vpm_freeq *releq; 588 589 ASSERT(MUTEX_HELD(VPMAPMTX(vpm))); 590 591 if (vpm->vpm_refcnt != 0) { 592 panic("free_vpmap"); 593 /*NOTREACHED*/ 594 } 595 596 vpmflp = &vpmd_free[vpm->vpm_free_ndx]; 597 /* 598 * Add to the tail of the release queue 599 * Note that vpm_releq and vpm_allocq could toggle 600 * before we get the lock. This does not affect 601 * correctness as the 2 queues are only maintained 602 * to reduce lock pressure. 603 */ 604 releq = vpmflp->vpm_releq; 605 if (releq == &vpmflp->vpm_freeq[0]) { 606 vpm->vpm_ndxflg = 0; 607 } else { 608 vpm->vpm_ndxflg = 1; 609 } 610 mutex_enter(&releq->vpmq_mtx); 611 vpmfreelist = releq->vpmq_free; 612 if (vpmfreelist == 0) { 613 int want; 614 615 releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm; 616 /* 617 * Both queue mutexes are held to set vpm_want; 618 * snapshot the value before dropping releq mutex. 619 * If vpm_want appears after the releq mutex is dropped, 620 * then the vpmap just freed is already gone. 621 */ 622 want = vpmflp->vpm_want; 623 mutex_exit(&releq->vpmq_mtx); 624 /* 625 * See if there was a waiter before dropping the releq mutex 626 * then recheck after obtaining vpm_freeq[0] mutex as 627 * the another thread may have already signaled. 628 */ 629 if (want) { 630 mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx); 631 if (vpmflp->vpm_want) 632 cv_signal(&vpmflp->vpm_free_cv); 633 mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx); 634 } 635 } else { 636 vpm->vpm_next = vpmfreelist; 637 vpm->vpm_prev = vpmfreelist->vpm_prev; 638 vpmfreelist->vpm_prev = vpm; 639 vpm->vpm_prev->vpm_next = vpm; 640 mutex_exit(&releq->vpmq_mtx); 641 } 642 } 643 644 /* 645 * Get the vpmap for the page. 646 * The refcnt of this vpm is incremented. 647 */ 648 static struct vpmap * 649 get_vpmap(page_t *pp) 650 { 651 struct vpmap *vpm = NULL; 652 kmutex_t *vmtx; 653 kmutex_t *pmtx; 654 unsigned int refid; 655 656 ASSERT((pp != NULL) && PAGE_LOCKED(pp)); 657 658 if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) { 659 vpm = VPMP(refid); 660 vmtx = VPMAPMTX(vpm); 661 mutex_enter(vmtx); 662 /* 663 * Since we have the page lock and the vpm mutex, the 664 * pp->p_vpmref cannot change. 665 */ 666 if (vpm->vpm_pp != pp) { 667 pmtx = PPMTX(pp); 668 669 /* 670 * Clear the p_vpmref as it is incorrect. 671 * This can happen if the page was stolen. 672 * On x64 this should not happen as p_vpmref 673 * is treated as a mapping on the page. So 674 * if the page is stolen, the mapping would have 675 * been cleared in page_unload(). 676 */ 677 mutex_enter(pmtx); 678 if (pp->p_vpmref == refid) 679 pp->p_vpmref = 0; 680 mutex_exit(pmtx); 681 682 mutex_exit(vmtx); 683 vpm = NULL; 684 } else if (vpm->vpm_refcnt == 0) { 685 /* 686 * Got the vpm, remove it from the free 687 * list if it exists there. 688 */ 689 VPMAP_RMFREELIST(vpm); 690 } 691 } 692 if (vpm == NULL) { 693 /* 694 * get_free_vpmap() returns with the vpmap mutex held. 695 */ 696 vpm = get_free_vpmap(pp); 697 vmtx = VPMAPMTX(vpm); 698 vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++; 699 } else { 700 vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++; 701 } 702 703 vpm->vpm_refcnt++; 704 mutex_exit(vmtx); 705 706 return (vpm); 707 } 708 709 /* END --- vpm cache ---- */ 710 711 /* 712 * The vnode page mapping(vpm) interface routines. 713 */ 714 715 /* 716 * Find or create the pages starting form baseoff for specified 717 * length 'len'. 718 */ 719 static int 720 vpm_pagecreate( 721 struct vnode *vp, 722 u_offset_t baseoff, 723 size_t len, 724 vmap_t vml[], 725 int nseg, 726 int *newpage) 727 { 728 729 page_t *pp = NULL; 730 caddr_t base; 731 u_offset_t off = baseoff; 732 int i; 733 ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS); 734 735 for (i = 0; len > 0; len -= PAGESIZE, i++) { 736 struct vpmap *vpm; 737 738 739 if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) { 740 741 base = segkpm_create_va(off); 742 743 /* 744 * the seg pointer passed in is just advisor. Just 745 * pass segkmap for now like segmap does with 746 * segmap_kpm enabled. 747 */ 748 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, 749 segkmap, base)) == NULL) { 750 panic("segmap_pagecreate_vpm: " 751 "page_create failed"); 752 /*NOTREACHED*/ 753 } 754 if (newpage != NULL) 755 *newpage = 1; 756 757 page_io_unlock(pp); 758 } 759 760 /* 761 * Get the vpm for this page_t. 762 */ 763 if (vpm_cache_enable) { 764 vpm = get_vpmap(pp); 765 vml[i].vs_data = (void *)&vpm->vpm_pp; 766 } else { 767 vml[i].vs_data = (void *)pp; 768 pp->p_vpmref = 0; 769 } 770 771 vml[i].vs_addr = hat_kpm_mapin(pp, 0); 772 vml[i].vs_len = PAGESIZE; 773 774 off += PAGESIZE; 775 } 776 vml[i].vs_data = NULL; 777 vml[i].vs_addr = (caddr_t)NULL; 778 return (0); 779 } 780 781 782 /* 783 * Returns vpm mappings of pages in the range [off, off+len], where 784 * len is rounded up to the PAGESIZE boundary. The list of pages and 785 * the page addresses are returned in the SGL vml (vmap_t) array passed in. 786 * The nseg is the number of vmap_t entries in the array. 787 * 788 * Currently max len allowed is MAXBSIZE therefore, it will either 789 * fetch/create one or two pages depending on what is the PAGESIZE. 790 * 791 * The segmap's SM_LOCKPROTO usage is not supported by these interfaces. 792 * For such cases, use the seg_map interfaces. 793 */ 794 int 795 vpm_map_pages( 796 struct vnode *vp, 797 u_offset_t off, 798 size_t len, 799 int fetchpage, 800 vmap_t *vml, 801 int nseg, 802 int *newpage, 803 enum seg_rw rw) 804 { 805 extern struct vnode *common_specvp(); 806 u_offset_t baseoff; 807 uint_t prot; 808 caddr_t base; 809 page_t *pp, *pplist[MAXVMAPS]; 810 struct vpmap *vpm; 811 int i, error = 0; 812 813 ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS); 814 baseoff = off & (offset_t)PAGEMASK; 815 vml[0].vs_data = NULL; 816 vml[0].vs_addr = (caddr_t)NULL; 817 /* 818 * For now, lets restrict it to MAXBSIZE. XXX - We can allow 819 * len longer then MAXBSIZE, but there should be a limit 820 * which should be determined by how many pages the VOP_GETPAGE() 821 * can fetch. 822 */ 823 if (off + len > baseoff + MAXBSIZE) { 824 panic("vpm_map_pages bad len"); 825 /*NOTREACHED*/ 826 } 827 828 /* 829 * If this is a block device we have to be sure to use the 830 * "common" block device vnode for the mapping. 831 */ 832 if (vp->v_type == VBLK) 833 vp = common_specvp(vp); 834 835 /* 836 * round up len to a multiple of PAGESIZE. 837 */ 838 len = ((off + len - baseoff + PAGESIZE - 1) & (uintptr_t)PAGEMASK); 839 840 if (!fetchpage) 841 return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage)); 842 843 for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) { 844 845 pp = page_lookup(vp, baseoff, SE_SHARED); 846 847 /* 848 * If we did not find the page or if this page was not 849 * in our cache, then let VOP_GETPAGE get all the pages. 850 * We need to call VOP_GETPAGE so that filesytems can do some 851 * (un)necessary tracking for sequential access. 852 */ 853 854 if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) || 855 (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF) 856 != (P_MOD | P_REF))) { 857 if (pp != NULL) { 858 page_unlock(pp); 859 } 860 861 /* 862 * Pass a dummy address as it will be required 863 * by page_create_va(). We pass segkmap as the seg 864 * as some file systems(UFS) check it. 865 */ 866 base = segkpm_create_va(baseoff); 867 868 error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i], 869 len, segkmap, base, rw, CRED(), NULL); 870 if (error) { 871 VPM_DEBUG(vpmd_getpagefailed); 872 pplist[i] = NULL; 873 } 874 break; 875 } else { 876 pplist[i] = pp; 877 baseoff += PAGESIZE; 878 } 879 } 880 881 if (error) { 882 for (i = 0; pplist[i] != NULL; i++) { 883 page_unlock(pplist[i]); 884 pplist[i] = NULL; 885 } 886 vml[0].vs_addr = NULL; 887 vml[0].vs_data = NULL; 888 return (error); 889 } 890 891 /* 892 * Get the vpm's for pages. 893 */ 894 for (i = 0; pplist[i] != NULL; i++) { 895 if (vpm_cache_enable) { 896 vpm = get_vpmap(pplist[i]); 897 vml[i].vs_data = (void *)&(vpm->vpm_pp); 898 } else { 899 vml[i].vs_data = (void *)pplist[i]; 900 pplist[i]->p_vpmref = 0; 901 } 902 903 vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0); 904 vml[i].vs_len = PAGESIZE; 905 } 906 907 vml[i].vs_data = NULL; 908 vml[i].vs_addr = (caddr_t)NULL; 909 910 return (0); 911 } 912 913 /* 914 * Release the vpm mappings on the pages and unlock them. 915 */ 916 void 917 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw) 918 { 919 int i; 920 struct vpmap *vpm; 921 kmutex_t *mtx; 922 page_t *pp; 923 924 for (i = 0; vml[i].vs_data != NULL; i++) { 925 ASSERT(IS_KPM_ADDR(vml[i].vs_addr)); 926 927 if (vpm_cache_enable) { 928 pp = *(((page_t **)vml[i].vs_data)); 929 } else { 930 pp = (page_t *)vml[i].vs_data; 931 } 932 933 /* 934 * Mark page as being modified or referenced, bacause vpm pages 935 * would not cause faults where it would be set normally. 936 */ 937 if (rw == S_WRITE) { 938 hat_setrefmod(pp); 939 } else { 940 ASSERT(rw == S_READ); 941 hat_setref(pp); 942 } 943 944 if (vpm_cache_enable) { 945 page_unlock(pp); 946 vpm = (struct vpmap *)((char *)vml[i].vs_data 947 - offsetof(struct vpmap, vpm_pp)); 948 mtx = VPMAPMTX(vpm); 949 mutex_enter(mtx); 950 951 if (--vpm->vpm_refcnt == 0) { 952 free_vpmap(vpm); 953 } 954 mutex_exit(mtx); 955 } else { 956 hat_kpm_mapout(pp, 0, vml[i].vs_addr); 957 (void) page_release(pp, 1); 958 } 959 vml[i].vs_data = NULL; 960 vml[i].vs_addr = NULL; 961 } 962 } 963 964 /* 965 * Given the vp, off and the uio structure, this routine will do the 966 * the copy (uiomove). If the last page created is partially written, 967 * the rest of the page is zeroed out. It also zeros the beginning of 968 * the first page till the start offset if requested(zerostart). 969 * If pages are to be fetched, it will call the filesystem's getpage 970 * function (VOP_GETPAGE) to get them, otherwise they will be created if 971 * not already present in the page cache. 972 */ 973 int 974 vpm_data_copy(struct vnode *vp, 975 u_offset_t off, 976 size_t len, 977 struct uio *uio, 978 int fetchpage, 979 int *newpage, 980 int zerostart, 981 enum seg_rw rw) 982 { 983 int error; 984 struct vmap vml[MINVMAPS]; 985 enum uio_rw uiorw; 986 int npages = 0; 987 988 uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ; 989 /* 990 * 'off' will be the offset where the I/O starts. 991 * We get the pages starting at the (off & PAGEMASK) 992 * page boundary. 993 */ 994 error = vpm_map_pages(vp, off, (uint_t)len, 995 fetchpage, vml, MINVMAPS, &npages, rw); 996 997 if (newpage != NULL) 998 *newpage = npages; 999 if (!error) { 1000 int i, pn, slen = len; 1001 int pon = off & PAGEOFFSET; 1002 1003 /* 1004 * Clear from the beginning of the page to start offset 1005 * if requested. 1006 */ 1007 if (!fetchpage && zerostart) { 1008 (void) kzero(vml[0].vs_addr, (uint_t)pon); 1009 VPM_DEBUG(vpmd_zerostart); 1010 } 1011 1012 for (i = 0; !error && slen > 0 && 1013 vml[i].vs_addr != NULL; i++) { 1014 pn = (int)MIN(slen, (PAGESIZE - pon)); 1015 error = uiomove(vml[i].vs_addr + pon, 1016 (long)pn, uiorw, uio); 1017 slen -= pn; 1018 pon = 0; 1019 } 1020 1021 /* 1022 * When new pages are created, zero out part of the 1023 * page we did not copy to. 1024 */ 1025 if (!fetchpage && npages && 1026 uio->uio_loffset < roundup(off + len, PAGESIZE)) { 1027 int nzero; 1028 1029 pon = (uio->uio_loffset & PAGEOFFSET); 1030 nzero = PAGESIZE - pon; 1031 i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE; 1032 (void) kzero(vml[i].vs_addr + pon, (uint_t)nzero); 1033 } 1034 vpm_unmap_pages(vml, rw); 1035 } 1036 return (error); 1037 } 1038 1039 /* 1040 * called to flush pages for the given vnode covering 1041 * [off, off+len] range. 1042 */ 1043 int 1044 vpm_sync_pages(struct vnode *vp, 1045 u_offset_t off, 1046 size_t len, 1047 uint_t flags) 1048 { 1049 extern struct vnode *common_specvp(); 1050 int bflags = 0; 1051 int error = 0; 1052 size_t psize = roundup(len, PAGESIZE); 1053 1054 /* 1055 * If this is a block device we have to be sure to use the 1056 * "common" block device vnode for the mapping. 1057 */ 1058 if (vp->v_type == VBLK) 1059 vp = common_specvp(vp); 1060 1061 if ((flags & ~SM_DONTNEED) != 0) { 1062 if (flags & SM_ASYNC) 1063 bflags |= B_ASYNC; 1064 if (flags & SM_INVAL) 1065 bflags |= B_INVAL; 1066 if (flags & SM_DESTROY) 1067 bflags |= (B_INVAL|B_TRUNC); 1068 if (flags & SM_FREE) 1069 bflags |= B_FREE; 1070 if (flags & SM_DONTNEED) 1071 bflags |= B_DONTNEED; 1072 1073 error = VOP_PUTPAGE(vp, off, psize, bflags, CRED(), NULL); 1074 } 1075 1076 return (error); 1077 } 1078 1079 1080 #else /* SEGKPM_SUPPORT */ 1081 1082 /* vpm stubs */ 1083 void 1084 vpm_init() 1085 { 1086 } 1087 1088 /*ARGSUSED*/ 1089 int 1090 vpm_pagecreate( 1091 struct vnode *vp, 1092 u_offset_t baseoff, 1093 size_t len, 1094 vmap_t vml[], 1095 int nseg, 1096 int *newpage) 1097 { 1098 return (0); 1099 } 1100 1101 /*ARGSUSED*/ 1102 int 1103 vpm_map_pages( 1104 struct vnode *vp, 1105 u_offset_t off, 1106 size_t len, 1107 int fetchpage, 1108 vmap_t vml[], 1109 int nseg, 1110 int *newpage, 1111 enum seg_rw rw) 1112 { 1113 return (0); 1114 } 1115 1116 /*ARGSUSED*/ 1117 int 1118 vpm_data_copy(struct vnode *vp, 1119 u_offset_t off, 1120 size_t len, 1121 struct uio *uio, 1122 int fetchpage, 1123 int *newpage, 1124 int zerostart, 1125 enum seg_rw rw) 1126 { 1127 return (0); 1128 } 1129 1130 /*ARGSUSED*/ 1131 void 1132 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw) 1133 { 1134 } 1135 /*ARGSUSED*/ 1136 int 1137 vpm_sync_pages(struct vnode *vp, 1138 u_offset_t off, 1139 size_t len, 1140 uint_t flags) 1141 { 1142 return (0); 1143 } 1144 #endif /* SEGKPM_SUPPORT */ 1145