1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 27 /* 28 * VM - generic vnode page mapping interfaces. 29 * 30 * Mechanism to provide temporary mappings to vnode pages. 31 * The typical use would be to copy/access file data. 32 */ 33 34 #include <sys/types.h> 35 #include <sys/t_lock.h> 36 #include <sys/param.h> 37 #include <sys/sysmacros.h> 38 #include <sys/buf.h> 39 #include <sys/systm.h> 40 #include <sys/vnode.h> 41 #include <sys/mman.h> 42 #include <sys/errno.h> 43 #include <sys/cred.h> 44 #include <sys/kmem.h> 45 #include <sys/vtrace.h> 46 #include <sys/cmn_err.h> 47 #include <sys/debug.h> 48 #include <sys/thread.h> 49 #include <sys/dumphdr.h> 50 #include <sys/bitmap.h> 51 #include <sys/lgrp.h> 52 53 #include <vm/seg_kmem.h> 54 #include <vm/hat.h> 55 #include <vm/as.h> 56 #include <vm/seg.h> 57 #include <vm/seg_kpm.h> 58 #include <vm/seg_map.h> 59 #include <vm/page.h> 60 #include <vm/pvn.h> 61 #include <vm/rm.h> 62 #include <vm/vpm.h> 63 64 65 #ifdef SEGKPM_SUPPORT 66 /* 67 * VPM can be disabled by setting vpm_enable = 0 in 68 * /etc/system. 
 *
 */
int vpm_enable = 1;

#else

int vpm_enable = 0;

#endif

#ifdef SEGKPM_SUPPORT


/* The cache itself can be disabled independently of VPM. */
int vpm_cache_enable = 1;
/* Percentage of physmem used for the cache (clamped to MIN/MAXCACHE). */
long vpm_cache_percent = 12;
/* Actual cache size in bytes, computed in vpm_init(). */
long vpm_cache_size;
/* Number of freelists; 0 means "pick a default based on max_ncpus". */
int vpm_nfreelist = 0;
/* Mask used to hash a vpmap (or cpu rotor) to its freelist. */
int vpmd_freemsk = 0;

/*
 * Per-cpu state, padded to VPM_S_PAD bytes to keep each cpu's
 * rotor index and hit/miss counters on its own cache line(s).
 */
#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		int vcpu_free_ndx;	/* rotor: next freelist to allocate from */
		ulong_t	vcpu_hits;	/* page already had a usable vpmap */
		ulong_t vcpu_misses;	/* had to take a vpmap off a freelist */
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu	*vpmd_cpu;

#define	vfree_ndx	vcpu.vcpu_free_ndx

int vpm_cachemode = VPMCACHE_LRU;

/* Per-page mutex protecting p_vpmref; borrows the page's p_ilock. */
#define	PPMTX(pp) (&(pp)->p_ilock)

static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
static struct vpmfree *vpmd_free;	/* array of vpm_nfreelist freelists */
#define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
#define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
#define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
/*
 * p_vpmref stores VPMID(vpm), a 1-based index into vpmd_vpmap, so that
 * 0 can mean "no vpmap associated". VPMP() maps the id back.
 */
#define	VPMP(id)	(&vpmd_vpmap[id - 1])
#define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)


#ifdef	DEBUG

/* Event counters for the rare/contended paths; DEBUG kernels only. */
struct vpm_debug {
	int vpmd_steals;
	int vpmd_contend;
	int vpmd_prevpagelocked;
	int vpmd_getpagefailed;
	int vpmd_zerostart;
	int vpmd_emptyfreelist;
	int vpmd_nofreevpms;
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

/*
 * Mean-time-between-failures style fault injection: evaluates false
 * once every (f)+1 calls so DEBUG kernels exercise the rare paths.
 */
#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif

/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also provide an LRU(default) behaviour of file pages. The
 * page_lookup() operation tends to be expensive if a page has to be
 * reclaimed from the system page cache("cachelist"). Once we speed up the
 * page_lookup()->page_reclaim() path then there should be no need for
 * this cache.
The system page cache(cachelist) should effectively serve the 151 * purpose of caching file pages. 152 * 153 * This cache is very similar to segmap's smap cache. Each page in the 154 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no 155 * hash table. The page_t has a reference to the vpmap_t when cached. For a 156 * given vnode, offset the page is found by means of a page_lookup() operation. 157 * Any page which has a mapping(i.e when cached) will not be in the 158 * system 'cachelist'. Hence the page_lookup() will not have to do a 159 * page_reclaim(). That is how the cache serves to speed up page_lookup() 160 * operations. 161 * 162 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system. 163 */ 164 165 void 166 vpm_init() 167 { 168 long npages; 169 struct vpmap *vpm; 170 struct vpmfree *vpmflp; 171 int i, ndx; 172 extern void prefetch_smap_w(void *); 173 174 if (!kpm_enable) { 175 vpm_enable = 0; 176 } 177 178 if (!vpm_enable || !vpm_cache_enable) { 179 return; 180 } 181 182 /* 183 * Set the size of the cache. 184 */ 185 vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100); 186 if (vpm_cache_size < VPMAP_MINCACHE) { 187 vpm_cache_size = VPMAP_MINCACHE; 188 } 189 190 if (vpm_cache_size > VPMAP_MAXCACHE) { 191 vpm_cache_size = VPMAP_MAXCACHE; 192 } 193 194 /* 195 * Number of freelists. 196 */ 197 if (vpm_nfreelist == 0) { 198 vpm_nfreelist = max_ncpus; 199 } else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) { 200 cmn_err(CE_WARN, "vpmap create : number of freelist " 201 "vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus); 202 vpm_nfreelist = 2 * max_ncpus; 203 } 204 205 /* 206 * Round it up to the next power of 2 207 */ 208 if (!ISP2(vpm_nfreelist)) { 209 vpm_nfreelist = 1 << (highbit(vpm_nfreelist)); 210 } 211 vpmd_freemsk = vpm_nfreelist - 1; 212 213 /* 214 * Use a per cpu rotor index to spread the allocations evenly 215 * across the available vpm freelists. 
216 */ 217 vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP); 218 ndx = 0; 219 for (i = 0; i < max_ncpus; i++) { 220 221 vpmd_cpu[i].vfree_ndx = ndx; 222 ndx = (ndx + 1) & vpmd_freemsk; 223 } 224 225 /* 226 * Allocate and initialize the freelist. 227 */ 228 vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree), 229 KM_SLEEP); 230 for (i = 0; i < vpm_nfreelist; i++) { 231 232 vpmflp = &vpmd_free[i]; 233 /* 234 * Set up initial queue pointers. They will get flipped 235 * back and forth. 236 */ 237 vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ]; 238 vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ]; 239 } 240 241 npages = mmu_btop(vpm_cache_size); 242 243 244 /* 245 * Allocate and initialize the vpmap structs. We need to 246 * walk the array backwards as the prefetch happens in reverse 247 * order. 248 */ 249 vpmd_vpmap = kmem_alloc(sizeof (struct vpmap) * npages, KM_SLEEP); 250 for (vpm = &vpmd_vpmap[npages - 1]; vpm >= vpmd_vpmap; vpm--) { 251 struct vpmfree *vpmflp; 252 union vpm_freeq *releq; 253 struct vpmap *vpmapf; 254 255 /* 256 * Use prefetch as we have to walk thru a large number of 257 * these data structures. We just use the smap's prefetch 258 * routine as it does the same. 
259 */ 260 prefetch_smap_w((void *)vpm); 261 262 vpm->vpm_vp = NULL; 263 vpm->vpm_off = 0; 264 vpm->vpm_pp = NULL; 265 vpm->vpm_refcnt = 0; 266 mutex_init(&vpm->vpm_mtx, NULL, MUTEX_DEFAULT, NULL); 267 vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm); 268 269 vpmflp = VPMAP2VMF(vpm); 270 releq = vpmflp->vpm_releq; 271 272 vpmapf = releq->vpmq_free; 273 if (vpmapf == NULL) { 274 releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm; 275 } else { 276 vpm->vpm_next = vpmapf; 277 vpm->vpm_prev = vpmapf->vpm_prev; 278 vpmapf->vpm_prev = vpm; 279 vpm->vpm_prev->vpm_next = vpm; 280 releq->vpmq_free = vpm->vpm_next; 281 } 282 283 /* 284 * Indicate that the vpmap is on the releq at start 285 */ 286 vpm->vpm_ndxflg = VPMRELEQ; 287 } 288 } 289 290 291 /* 292 * unhooks vpm from the freelist if it is still on the freelist. 293 */ 294 #define VPMAP_RMFREELIST(vpm) \ 295 { \ 296 if (vpm->vpm_next != NULL) { \ 297 union vpm_freeq *freeq; \ 298 struct vpmfree *vpmflp; \ 299 vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \ 300 freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \ 301 mutex_enter(&freeq->vpmq_mtx); \ 302 if (freeq->vpmq_free != vpm) { \ 303 vpm->vpm_prev->vpm_next = vpm->vpm_next; \ 304 vpm->vpm_next->vpm_prev = vpm->vpm_prev; \ 305 } else if (vpm == vpm->vpm_next) { \ 306 freeq->vpmq_free = NULL; \ 307 } else { \ 308 freeq->vpmq_free = vpm->vpm_next; \ 309 vpm->vpm_prev->vpm_next = vpm->vpm_next; \ 310 vpm->vpm_next->vpm_prev = vpm->vpm_prev; \ 311 } \ 312 mutex_exit(&freeq->vpmq_mtx); \ 313 vpm->vpm_next = vpm->vpm_prev = NULL; \ 314 } \ 315 } 316 317 static int 318 get_freelndx(int mode) 319 { 320 int ndx; 321 322 ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk; 323 switch (mode) { 324 325 case VPMCACHE_LRU: 326 default: 327 vpmd_cpu[CPU->cpu_seqid].vfree_ndx++; 328 break; 329 } 330 return (ndx); 331 } 332 333 334 /* 335 * Find one vpmap structure from the free lists and use it for the newpage. 336 * The previous page it cached is dissociated and released. 
The page_t's
 * p_vpmref is cleared only when the vpm it is pointing to is locked(or
 * for AMD64 when the page is exclusively locked in page_unload. That is
 * because the p_vpmref is treated as mapping).
 *
 * The page's p_vpmref is set when the page is
 * locked(at least SHARED locked).
 *
 * Returns with the selected vpmap's mutex held.
 */
static struct vpmap *
get_free_vpmap(page_t *newpage)
{
	struct vpmfree *vpmflp;
	kmutex_t *vmtx;
	struct vpmap *vpm, *first;
	union vpm_freeq *allocq, *releq;
	page_t *pp = NULL;
	int end_ndx, page_locked = 0;
	int free_ndx;

	/*
	 * get the freelist bin index.
	 */
	free_ndx = get_freelndx(vpm_cachemode);

	end_ndx = free_ndx;
	vpmflp = &vpmd_free[free_ndx];

retry_queue:
	allocq = vpmflp->vpm_allocq;
	mutex_enter(&allocq->vpmq_mtx);

	if ((vpm = allocq->vpmq_free) == NULL) {

skip_queue:
		/*
		 * The alloc list is empty or this queue is being skipped;
		 * first see if the allocq toggled.
		 */
		if (vpmflp->vpm_allocq != allocq) {
			/* queue changed */
			mutex_exit(&allocq->vpmq_mtx);
			goto retry_queue;
		}
		releq = vpmflp->vpm_releq;
		if (!mutex_tryenter(&releq->vpmq_mtx)) {
			/* cannot get releq; a free vpmap may be there now */
			mutex_exit(&allocq->vpmq_mtx);

			/*
			 * This loop could spin forever if this thread has
			 * higher priority than the thread that is holding
			 * releq->vpmq_mtx. In order to force the other thread
			 * to run, we'll lock/unlock the mutex which is safe
			 * since we just unlocked the allocq mutex.
			 */
			mutex_enter(&releq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			goto retry_queue;
		}
		if (releq->vpmq_free == NULL) {
			VPM_DEBUG(vpmd_emptyfreelist);
			/*
			 * This freelist is empty.
			 * This should not happen unless clients
			 * are failing to release the vpmap after
			 * accessing the data. Before resorting
			 * to sleeping, try the next list of the same color.
			 */
			free_ndx = (free_ndx + 1) & vpmd_freemsk;
			if (free_ndx != end_ndx) {
				mutex_exit(&releq->vpmq_mtx);
				mutex_exit(&allocq->vpmq_mtx);
				vpmflp = &vpmd_free[free_ndx];
				goto retry_queue;
			}
			/*
			 * Tried all freelists.
			 * wait on this list and hope something gets freed.
			 * Both queue mutexes are held here; drop freeq[1]'s
			 * and sleep on freeq[0]'s, which free_vpmap() uses
			 * when signalling waiters.
			 */
			vpmflp->vpm_want++;
			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
			cv_wait(&vpmflp->vpm_free_cv,
			    &vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp->vpm_want--;
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp = &vpmd_free[free_ndx];
			VPM_DEBUG(vpmd_nofreevpms);
			goto retry_queue;
		} else {
			/*
			 * Something on the rele queue; flip the alloc
			 * and rele queues and retry.
			 */
			vpmflp->vpm_allocq = releq;
			vpmflp->vpm_releq = allocq;
			mutex_exit(&allocq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			if (page_locked) {
				delay(hz >> 2);
				page_locked = 0;
			}
			goto retry_queue;
		}
	} else {
		int gotnewvpm;
		kmutex_t *pmtx;
		uint_t vpmref;

		/*
		 * Fastpath the case we get the vpmap mutex
		 * on the first try.
		 */
		first = vpm;
next_vpmap:
		vmtx = VPMAPMTX(vpm);
		if (!mutex_tryenter(vmtx)) {
			/*
			 * Another thread is trying to reclaim this slot.
			 * Skip to the next queue or vpmap.
			 */
			if ((vpm = vpm->vpm_next) == first) {
				goto skip_queue;
			} else {
				goto next_vpmap;
			}
		}

		/*
		 * Assign this vpm to the newpage.
		 */
		pmtx = PPMTX(newpage);
		gotnewvpm = 0;
		mutex_enter(pmtx);

		/*
		 * Check if some other thread already assigned a vpm to
		 * this page.
		 */
		if ((vpmref = newpage->p_vpmref) == 0) {
			newpage->p_vpmref = VPMID(vpm);
			gotnewvpm = 1;
		} else {
			VPM_DEBUG(vpmd_contend);
			mutex_exit(vmtx);
		}
		mutex_exit(pmtx);

		if (gotnewvpm) {

			/*
			 * At this point, we've selected the vpm. Remove vpm
			 * from its freelist. If vpm is the first one in
			 * the freelist, update the head of the freelist.
			 */
			if (first == vpm) {
				ASSERT(first == allocq->vpmq_free);
				allocq->vpmq_free = vpm->vpm_next;
			}

			/*
			 * If the head of the freelist still points to vpm,
			 * then there are no more free vpmaps in that list.
			 */
			if (allocq->vpmq_free == vpm)
				/*
				 * Took the last one
				 */
				allocq->vpmq_free = NULL;
			else {
				vpm->vpm_prev->vpm_next = vpm->vpm_next;
				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
			}
			mutex_exit(&allocq->vpmq_mtx);
			vpm->vpm_prev = vpm->vpm_next = NULL;

			/*
			 * Disassociate the previous page.
			 * p_vpmref is used as a mapping reference to the page.
			 */
			if ((pp = vpm->vpm_pp) != NULL &&
			    vpm->vpm_vp == pp->p_vnode &&
			    vpm->vpm_off == pp->p_offset) {

				pmtx = PPMTX(pp);
				if (page_trylock(pp, SE_SHARED)) {
					/*
					 * Now verify that it is the correct
					 * page. If not someone else stole it,
					 * so just unlock it and leave.
					 */
					mutex_enter(pmtx);
					if (PP_ISFREE(pp) ||
					    vpm->vpm_vp != pp->p_vnode ||
					    vpm->vpm_off != pp->p_offset ||
					    pp->p_vpmref != VPMID(vpm)) {
						mutex_exit(pmtx);

						page_unlock(pp);
					} else {
						/*
						 * Release the page.
						 */
						pp->p_vpmref = 0;
						mutex_exit(pmtx);
						(void) page_release(pp, 1);
					}
				} else {
					/*
					 * If the page cannot be locked, just
					 * clear the p_vpmref and go.
					 */
					mutex_enter(pmtx);
					if (pp->p_vpmref == VPMID(vpm)) {
						pp->p_vpmref = 0;
					}
					mutex_exit(pmtx);
					VPM_DEBUG(vpmd_prevpagelocked);
				}
			}

			/*
			 * Setup vpm to point to the new page.
			 */
			vpm->vpm_pp = newpage;
			vpm->vpm_vp = newpage->p_vnode;
			vpm->vpm_off = newpage->p_offset;

		} else {
			int steal = !VPM_MTBF(steals, steals_mtbf);
			/*
			 * Page already has a vpm assigned just use that.
			 * Grab the vpm mutex and verify that it is still
			 * the correct one. The pp->p_vpmref should not change
			 * once we have the vpm mutex and the page lock.
			 */
			mutex_exit(&allocq->vpmq_mtx);
			vpm = VPMP(vpmref);
			vmtx = VPMAPMTX(vpm);
			mutex_enter(vmtx);
			if ((steal && vpm->vpm_refcnt == 0) ||
			    vpm->vpm_pp != newpage) {
				/*
				 * The vpm got stolen, retry.
				 * clear the p_vpmref.
				 */
				pmtx = PPMTX(newpage);
				mutex_enter(pmtx);
				if (newpage->p_vpmref == vpmref) {
					newpage->p_vpmref = 0;
				}
				mutex_exit(pmtx);

				mutex_exit(vmtx);
				VPM_DEBUG(vpmd_steals);
				goto retry_queue;
			} else if (vpm->vpm_refcnt == 0) {
				/*
				 * Remove it from the free list if it
				 * exists there.
				 */
				VPMAP_RMFREELIST(vpm);
			}
		}
		return (vpm);
	}
}

/*
 * Return an unreferenced vpmap to the tail of its freelist's release
 * queue and wake any thread sleeping for a free vpmap.
 * Caller must hold the vpmap's mutex; vpm_refcnt must be zero.
 */
static void
free_vpmap(struct vpmap *vpm)
{
	struct vpmfree *vpmflp;
	struct vpmap *vpmfreelist;
	union vpm_freeq *releq;

	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));

	if (vpm->vpm_refcnt != 0) {
		panic("free_vpmap");
		/*NOTREACHED*/
	}

	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
	/*
	 * Add to the tail of the release queue
	 * Note that vpm_releq and vpm_allocq could toggle
	 * before we get the lock. This does not affect
	 * correctness as the 2 queues are only maintained
	 * to reduce lock pressure.
	 */
	releq = vpmflp->vpm_releq;
	if (releq == &vpmflp->vpm_freeq[0]) {
		vpm->vpm_ndxflg = 0;
	} else {
		vpm->vpm_ndxflg = 1;
	}
	mutex_enter(&releq->vpmq_mtx);
	vpmfreelist = releq->vpmq_free;
	if (vpmfreelist == 0) {
		int want;

		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		/*
		 * Both queue mutexes are held to set vpm_want;
		 * snapshot the value before dropping releq mutex.
		 * If vpm_want appears after the releq mutex is dropped,
		 * then the vpmap just freed is already gone.
		 */
		want = vpmflp->vpm_want;
		mutex_exit(&releq->vpmq_mtx);
		/*
		 * See if there was a waiter before dropping the releq mutex
		 * then recheck after obtaining vpm_freeq[0] mutex as
		 * the another thread may have already signaled.
		 */
		if (want) {
			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
			if (vpmflp->vpm_want)
				cv_signal(&vpmflp->vpm_free_cv);
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
		}
	} else {
		vpm->vpm_next = vpmfreelist;
		vpm->vpm_prev = vpmfreelist->vpm_prev;
		vpmfreelist->vpm_prev = vpm;
		vpm->vpm_prev->vpm_next = vpm;
		mutex_exit(&releq->vpmq_mtx);
	}
}

/*
 * Get the vpmap for the page.
 * The refcnt of this vpm is incremented.
 */
static struct vpmap *
get_vpmap(page_t *pp)
{
	struct vpmap *vpm = NULL;
	kmutex_t *vmtx;
	kmutex_t *pmtx;
	unsigned int refid;

	ASSERT((pp != NULL) && PAGE_LOCKED(pp));

	/* Fast path: the page still points at a vpmap from a prior use. */
	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
		vpm = VPMP(refid);
		vmtx = VPMAPMTX(vpm);
		mutex_enter(vmtx);
		/*
		 * Since we have the page lock and the vpm mutex, the
		 * pp->p_vpmref cannot change.
		 */
		if (vpm->vpm_pp != pp) {
			pmtx = PPMTX(pp);

			/*
			 * Clear the p_vpmref as it is incorrect.
			 * This can happen if the page was stolen.
			 * On x64 this should not happen as p_vpmref
			 * is treated as a mapping on the page. So
			 * if the page is stolen, the mapping would have
			 * been cleared in page_unload().
			 */
			mutex_enter(pmtx);
			if (pp->p_vpmref == refid)
				pp->p_vpmref = 0;
			mutex_exit(pmtx);

			mutex_exit(vmtx);
			vpm = NULL;
		} else if (vpm->vpm_refcnt == 0) {
			/*
			 * Got the vpm, remove it from the free
			 * list if it exists there.
			 */
			VPMAP_RMFREELIST(vpm);
		}
	}
	if (vpm == NULL) {
		/*
		 * get_free_vpmap() returns with the vpmap mutex held.
		 */
		vpm = get_free_vpmap(pp);
		vmtx = VPMAPMTX(vpm);
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
	} else {
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
	}

	vpm->vpm_refcnt++;
	mutex_exit(vmtx);

	return (vpm);
}

/* END --- vpm cache ---- */

/*
 * The vnode page mapping(vpm) interface routines.
 */

/*
 * Find or create the pages starting form baseoff for specified
 * length 'len'.  Each mapped page is SE_SHARED locked and entered
 * into vml[]; the list is NULL-terminated.
 */
static int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{

	page_t *pp = NULL;
	caddr_t base;
	u_offset_t off = baseoff;
	int i;
	ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);

	for (i = 0; len > 0; len -= PAGESIZE, i++) {
		struct vpmap *vpm;


		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {

			base = segkpm_create_va(off);

			/*
			 * the seg pointer passed in is just advisor. Just
			 * pass segkmap for now like segmap does with
			 * segmap_kpm enabled.
			 */
			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
			    segkmap, base)) == NULL) {
				panic("segmap_pagecreate_vpm: "
				    "page_create failed");
				/*NOTREACHED*/
			}
			if (newpage != NULL)
				*newpage = 1;

			page_io_unlock(pp);
		}

		/*
		 * Get the vpm for this page_t.
		 */
		if (vpm_cache_enable) {
			vpm = get_vpmap(pp);
			/*
			 * vs_data points at vpm_pp (not the vpm itself);
			 * vpm_unmap_pages() recovers the vpm via offsetof.
			 */
			vml[i].vs_data = (void *)&vpm->vpm_pp;
		} else {
			vml[i].vs_data = (void *)pp;
			pp->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
		vml[i].vs_len = PAGESIZE;

		off += PAGESIZE;
	}
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;
	return (0);
}


/*
 * Returns vpm mappings of pages in the range [off, off+len], where
 * len is rounded up to the PAGESIZE boundary. The list of pages and
 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
The nseg is the number of vmap_t entries in the array.
 *
 * The segmap's SM_LOCKPROTO usage is not supported by these interfaces.
 * For such cases, use the seg_map interfaces.
 */
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t *vml,
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	extern struct vnode *common_specvp();
	u_offset_t baseoff;
	uint_t prot;
	caddr_t base;
	page_t *pp, *pplist[MAXVMAPS];
	struct vpmap *vpm;
	int i, error = 0;
	size_t tlen;

	ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
	baseoff = off & (offset_t)PAGEMASK;
	vml[0].vs_data = NULL;
	vml[0].vs_addr = (caddr_t)NULL;

	tlen = P2ROUNDUP(off + len, PAGESIZE) - baseoff;
	/*
	 * Restrict it to VPMMAXLEN.
	 */
	if (tlen > (VPMMAXPGS * PAGESIZE)) {
		tlen = VPMMAXPGS * PAGESIZE;
	}
	/*
	 * Ensure length fits within the vml[] array. One element of
	 * the array is used to mark the end of the scatter/gather list
	 * of valid mappings by setting its vs_addr = NULL. Leave space
	 * for this element.
	 */
	if (tlen > ((nseg - 1) * PAGESIZE)) {
		tlen = ((nseg - 1) * PAGESIZE);
	}
	len = tlen;

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);


	if (!fetchpage)
		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));

	/*
	 * Note: the increment clause NULL-terminates pplist[] after each
	 * stored entry, so pplist is always terminated on normal exit.
	 */
	for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {

		pp = page_lookup(vp, baseoff, SE_SHARED);

		/*
		 * If we did not find the page or if this page was not
		 * in vpm cache(p_vpmref == 0), then let VOP_GETPAGE get
		 * all the pages.
		 * We need to call VOP_GETPAGE so that filesystems can do some
		 * (un)necessary tracking for sequential access.
		 */

		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
		    (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
		    != (P_MOD | P_REF))) {
			int j;
			if (pp != NULL) {
				page_unlock(pp);
			}
			/*
			 * If we did not find the desired set of pages,
			 * from the page cache, just call VOP_GETPAGE to get
			 * all the pages.
			 */
			for (j = 0; j < i; j++) {
				page_unlock(pplist[j]);
			}


			baseoff = off & (offset_t)PAGEMASK;
			/*
			 * Pass a dummy address as it will be required
			 * by page_create_va(). We pass segkmap as the seg
			 * as some file systems(UFS) check it.
			 */
			base = segkpm_create_va(baseoff);

			error = VOP_GETPAGE(vp, baseoff, tlen, &prot, pplist,
			    tlen, segkmap, base, rw, CRED(), NULL);
			if (error) {
				VPM_DEBUG(vpmd_getpagefailed);
				pplist[0] = NULL;
			}
			break;
		} else {
			pplist[i] = pp;
			baseoff += PAGESIZE;
		}
	}

	if (error) {
		for (i = 0; pplist[i] != NULL; i++) {
			page_unlock(pplist[i]);
			pplist[i] = NULL;
		}
		vml[0].vs_addr = NULL;
		vml[0].vs_data = NULL;
		return (error);
	}

	/*
	 * Get the vpm's for pages.
	 */
	for (i = 0; pplist[i] != NULL; i++) {
		if (vpm_cache_enable) {
			vpm = get_vpmap(pplist[i]);
			vml[i].vs_data = (void *)&(vpm->vpm_pp);
		} else {
			vml[i].vs_data = (void *)pplist[i];
			pplist[i]->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
		vml[i].vs_len = PAGESIZE;
	}

	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;

	return (0);
}

/*
 * Release the vpm mappings on the pages and unlock them.
 */
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
	int i;
	struct vpmap *vpm;
	kmutex_t *mtx;
	page_t *pp;

	for (i = 0; vml[i].vs_data != NULL; i++) {
		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));

		if (vpm_cache_enable) {
			/* vs_data points at the vpm's vpm_pp slot. */
			pp = *(((page_t **)vml[i].vs_data));
		} else {
			pp = (page_t *)vml[i].vs_data;
		}

		/*
		 * Mark page as being modified or referenced, because vpm pages
		 * would not cause faults where it would be set normally.
		 */
		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else {
			ASSERT(rw == S_READ);
			hat_setref(pp);
		}

		if (vpm_cache_enable) {
			/* Recover the vpmap from the embedded vpm_pp pointer. */
			vpm = (struct vpmap *)((char *)vml[i].vs_data
			    - offsetof(struct vpmap, vpm_pp));
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			page_unlock(pp);
			mtx = VPMAPMTX(vpm);
			mutex_enter(mtx);

			if (--vpm->vpm_refcnt == 0) {
				free_vpmap(vpm);
			}
			mutex_exit(mtx);
		} else {
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			(void) page_release(pp, 1);
		}
		vml[i].vs_data = NULL;
		vml[i].vs_addr = NULL;
	}
}

/*
 * Given the vp, off and the uio structure, this routine will do the
 * the copy (uiomove). If the last page created is partially written,
 * the rest of the page is zeroed out. It also zeros the beginning of
 * the first page till the start offset if requested(zerostart).
 * If pages are to be fetched, it will call the filesystem's getpage
 * function (VOP_GETPAGE) to get them, otherwise they will be created if
 * not already present in the page cache.
 */
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	int error;
	struct vmap vml[MINVMAPS];
	enum uio_rw uiorw;
	int npages = 0;

	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
	/*
	 * 'off' will be the offset where the I/O starts.
	 * We get the pages starting at the (off & PAGEMASK)
	 * page boundary.
	 */
	error = vpm_map_pages(vp, off, (uint_t)len,
	    fetchpage, vml, MINVMAPS, &npages,  rw);

	if (newpage != NULL)
		*newpage = npages;
	if (!error) {
		int i, pn, slen = len;
		int pon = off & PAGEOFFSET;

		/*
		 * Clear from the beginning of the page to start offset
		 * if requested.
		 */
		if (!fetchpage && zerostart) {
			(void)kzero(vml[0].vs_addr,  (uint_t)pon);
			VPM_DEBUG(vpmd_zerostart);
		}

		for (i = 0; !error && slen > 0 &&
		    vml[i].vs_addr != NULL; i++) {
			pn = (int)MIN(slen, (PAGESIZE - pon));
			error = uiomove(vml[i].vs_addr + pon,
			    (long)pn, uiorw, uio);
			slen -= pn;
			pon = 0;
		}

		/*
		 * When new pages are created, zero out part of the
		 * page we did not copy to.
		 */
		if (!fetchpage && npages &&
		    uio->uio_loffset < roundup(off + len, PAGESIZE)) {
			int nzero;

			pon = (uio->uio_loffset & PAGEOFFSET);
			nzero = PAGESIZE  - pon;
			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
			(void)kzero(vml[i].vs_addr + pon, (uint_t)nzero);
		}
		vpm_unmap_pages(vml, rw);
	}
	return (error);
}

/*
 * called to flush pages for the given vnode covering
 * [off, off+len] range.
 */
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	extern struct vnode *common_specvp();
	int bflags = 0;
	int error = 0;
	size_t psize = roundup(len, PAGESIZE);

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	if ((flags & ~SM_DONTNEED) != 0) {
		if (flags & SM_ASYNC)
			bflags |= B_ASYNC;
		if (flags & SM_INVAL)
			bflags |= B_INVAL;
		if (flags & SM_DESTROY)
			bflags |= (B_INVAL|B_TRUNC);
		if (flags & SM_FREE)
			bflags |= B_FREE;
		if (flags & SM_DONTNEED)
			bflags |= B_DONTNEED;

		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED(), NULL);
	}

	return (error);
}


#else	/* SEGKPM_SUPPORT */

/* vpm stubs - no-op implementations for kernels without segkpm */
void
vpm_init()
{
}

/*ARGSUSED*/
int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t vml[],
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
}
/*ARGSUSED*/
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	return (0);
}
#endif	/* SEGKPM_SUPPORT */