/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * VM - generic vnode page mapping interfaces.
 *
 * Mechanism to provide temporary mappings to vnode pages.
 * The typical use would be to copy/access file data.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>
#include <sys/lgrp.h>

#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/rm.h>
#include <vm/vpm.h>

/*
 * Needs to be enabled by each platform.
 */
int vpm_enable = 0;

#ifdef	SEGKPM_SUPPORT


int vpm_cache_enable = 1;
long vpm_cache_percent = 12;
long vpm_cache_size;
int vpm_nfreelist = 0;
int vpmd_freemsk = 0;

/*
 * The per-CPU state is padded to 64 bytes so that each CPU's counters sit
 * on their own cache line and do not false-share with a neighbour's.
 */
#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		int vcpu_free_ndx;
		ulong_t vcpu_hits;
		ulong_t vcpu_misses;
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu	*vpmd_cpu;

#define	vfree_ndx	vcpu.vcpu_free_ndx

int vpm_cachemode = VPMCACHE_LRU;

#define	PPMTX(pp)	(&(pp)->p_ilock)

static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
static struct vpmfree *vpmd_free;
#define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
#define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
#define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)

/*
 * vpmap ids are 1 based; a p_vpmref of zero means the page has no
 * vpmap associated with it.
 */
#define	VPMP(id)	(&vpmd_vpmap[id - 1])
#define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)


#ifdef	DEBUG

struct vpm_debug {
	int vpmd_steals;
	int vpmd_contend;
	int vpmd_prevpagelocked;
	int vpmd_getpagefailed;
	int vpmd_zerostart;
	int vpmd_emptyfreelist;
	int vpmd_nofreevpms;
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif

/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also provide an LRU (the default) behaviour of file pages.
 * The page_lookup() operation tends to be expensive if a page has to be
 * reclaimed from the system page cache ("cachelist"). Once we speed up the
 * page_lookup()->page_reclaim() path then there should be no need for
 * this cache. The system page cache (cachelist) should effectively serve the
 * purpose of caching file pages.
 *
 * This cache is very similar to segmap's smap cache. Each page in the
 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
 * hash table. The page_t has a reference to the vpmap_t when cached.
 * For a given vnode and offset, the page is found by means of a page_lookup()
 * operation. Any page which has a mapping (i.e., when cached) will not be in
 * the system 'cachelist'. Hence the page_lookup() will not have to do a
 * page_reclaim(). That is how the cache serves to speed up page_lookup()
 * operations.
 *
 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
 */

void
vpm_init()
{
	long npages;
	struct vpmap *vpm;
	struct vpmfree *vpmflp;
	int i, ndx;
	extern void prefetch_smap_w(void *);

	if (!vpm_cache_enable) {
		return;
	}

	/*
	 * Set the size of the cache.
	 */
	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
	if (vpm_cache_size < VPMAP_MINCACHE) {
		vpm_cache_size = VPMAP_MINCACHE;
	}

	/*
	 * Number of freelists.
	 */
	if (vpm_nfreelist == 0) {
		vpm_nfreelist = max_ncpus;
	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
		cmn_err(CE_WARN, "vpmap create : number of freelist "
		    "vpm_nfreelist %d using %d", vpm_nfreelist, 2 * max_ncpus);
		vpm_nfreelist = 2 * max_ncpus;
	}

	/*
	 * Round it up to the next power of 2.
	 */
	if (vpm_nfreelist & (vpm_nfreelist - 1)) {
		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
	}
	vpmd_freemsk = vpm_nfreelist - 1;

	/*
	 * Use a per-CPU rotor index to spread the allocations evenly
	 * across the available vpm freelists.
	 */
	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
	ndx = 0;
	for (i = 0; i < max_ncpus; i++) {

		vpmd_cpu[i].vfree_ndx = ndx;
		ndx = (ndx + 1) & vpmd_freemsk;
	}

	/*
	 * Allocate and initialize the freelist.
	 */
	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
	    KM_SLEEP);
	for (i = 0; i < vpm_nfreelist; i++) {

		vpmflp = &vpmd_free[i];
		/*
		 * Set up initial queue pointers. They will get flipped
		 * back and forth.
		 */
		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
	}

	npages = mmu_btop(vpm_cache_size);


	/*
	 * Allocate and initialize the vpmap structs.
	 */
	vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
	for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
		struct vpmfree *vpmflp;
		union vpm_freeq *releq;
		struct vpmap *vpmapf;

		/*
		 * Use prefetch as we have to walk through a large number of
		 * these data structures. We just use the smap's prefetch
		 * routine as it does the same. This should work fine
		 * for x64 (this needs to be modified when enabled on sparc).
		 */
		prefetch_smap_w((void *)vpm);

		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);

		vpmflp = VPMAP2VMF(vpm);
		releq = vpmflp->vpm_releq;

		vpmapf = releq->vpmq_free;
		if (vpmapf == NULL) {
			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		} else {
			vpm->vpm_next = vpmapf;
			vpm->vpm_prev = vpmapf->vpm_prev;
			vpmapf->vpm_prev = vpm;
			vpm->vpm_prev->vpm_next = vpm;
			releq->vpmq_free = vpm->vpm_next;
		}

		/*
		 * Indicate that the vpmap is on the releq at start.
		 */
		vpm->vpm_ndxflg = VPMRELEQ;
	}
}


/*
 * unhooks vpm from the freelist if it is still on the freelist.
 */
#define	VPMAP_RMFREELIST(vpm) \
	{ \
		if (vpm->vpm_next != NULL) { \
			union vpm_freeq *freeq; \
			struct vpmfree *vpmflp; \
			vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
			freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
			mutex_enter(&freeq->vpmq_mtx); \
			if (freeq->vpmq_free != vpm) { \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} else if (vpm == vpm->vpm_next) { \
				freeq->vpmq_free = NULL; \
			} else { \
				freeq->vpmq_free = vpm->vpm_next; \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} \
			mutex_exit(&freeq->vpmq_mtx); \
			vpm->vpm_next = vpm->vpm_prev = NULL; \
		} \
	}

/*
 * Return the freelist bin index to allocate from, advancing this CPU's
 * rotor for the (default) LRU cache mode.
 */
static int
get_freelndx(int mode)
{
	int ndx;

	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
	switch (mode) {

	case VPMCACHE_LRU:
	default:
		vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
		break;
	}
	return (ndx);
}

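/*
 * Each freelist bin keeps two circular queues of vpmaps, an alloc queue
 * and a release queue.  Allocations are taken from the alloc queue and
 * freed vpmaps are appended to the release queue; when the alloc queue
 * runs dry the two queues are flipped (see get_free_vpmap() below).
 * Keeping allocation and release on separate mutexes reduces lock
 * contention, and the flipping yields an approximate LRU ordering of
 * the cached pages.
 */
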
/*
 * Find one vpmap structure from the free lists and use it for the newpage.
 * The previous page it cached is dissociated and released. The page_t's
 * p_vpmref is cleared only when the vpm it points to is locked (or, for
 * AMD64, when the page is exclusively locked in page_unload(), because
 * there the p_vpmref is treated as a mapping).
 *
 * The page's p_vpmref is set when the page is
 * locked (at least SHARED locked).
 */
static struct vpmap *
get_free_vpmap(page_t *newpage)
{
	struct vpmfree *vpmflp;
	kmutex_t *vmtx;
	struct vpmap *vpm, *first;
	union vpm_freeq *allocq, *releq;
	page_t *pp = NULL;
	int end_ndx, page_locked = 0;
	int free_ndx;

	/*
	 * get the freelist bin index.
	 */
	free_ndx = get_freelndx(vpm_cachemode);

	end_ndx = free_ndx;
	vpmflp = &vpmd_free[free_ndx];

retry_queue:
	allocq = vpmflp->vpm_allocq;
	mutex_enter(&allocq->vpmq_mtx);

	if ((vpm = allocq->vpmq_free) == NULL) {

skip_queue:
		/*
		 * The alloc list is empty or this queue is being skipped;
		 * first see if the allocq toggled.
		 */
		if (vpmflp->vpm_allocq != allocq) {
			/* queue changed */
			mutex_exit(&allocq->vpmq_mtx);
			goto retry_queue;
		}
		releq = vpmflp->vpm_releq;
		if (!mutex_tryenter(&releq->vpmq_mtx)) {
			/* cannot get releq; a free vpmap may be there now */
			mutex_exit(&allocq->vpmq_mtx);

			/*
			 * This loop could spin forever if this thread has
			 * higher priority than the thread that is holding
			 * releq->vpmq_mtx. In order to force the other thread
			 * to run, we'll lock/unlock the mutex which is safe
			 * since we just unlocked the allocq mutex.
			 */
			mutex_enter(&releq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			goto retry_queue;
		}
		if (releq->vpmq_free == NULL) {
			VPM_DEBUG(vpmd_emptyfreelist);
			/*
			 * This freelist is empty.
			 * This should not happen unless clients
			 * are failing to release the vpmap after
			 * accessing the data. Before resorting
			 * to sleeping, try the next list of the same color.
			 */
			free_ndx = (free_ndx + 1) & vpmd_freemsk;
			if (free_ndx != end_ndx) {
				mutex_exit(&releq->vpmq_mtx);
				mutex_exit(&allocq->vpmq_mtx);
				vpmflp = &vpmd_free[free_ndx];
				goto retry_queue;
			}
			/*
			 * Tried all freelists.
			 * wait on this list and hope something gets freed.
			 */
			vpmflp->vpm_want++;
			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
			cv_wait(&vpmflp->vpm_free_cv,
			    &vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp->vpm_want--;
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp = &vpmd_free[free_ndx];
			VPM_DEBUG(vpmd_nofreevpms);
			goto retry_queue;
		} else {
			/*
			 * Something on the rele queue; flip the alloc
			 * and rele queues and retry.
			 */
			vpmflp->vpm_allocq = releq;
			vpmflp->vpm_releq = allocq;
			mutex_exit(&allocq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			if (page_locked) {
				delay(hz >> 2);
				page_locked = 0;
			}
			goto retry_queue;
		}
	} else {
		int gotnewvpm;
		kmutex_t *pmtx;
		uint_t vpmref;

		/*
		 * Fastpath the case we get the vpmap mutex
		 * on the first try.
		 */
		first = vpm;
next_vpmap:
		vmtx = VPMAPMTX(vpm);
		if (!mutex_tryenter(vmtx)) {
			/*
			 * Another thread is trying to reclaim this slot.
			 * Skip to the next queue or vpmap.
			 */
			if ((vpm = vpm->vpm_next) == first) {
				goto skip_queue;
			} else {
				goto next_vpmap;
			}
		}

		/*
		 * Assign this vpm to the newpage.
		 */
		pmtx = PPMTX(newpage);
		gotnewvpm = 0;
		mutex_enter(pmtx);

		/*
		 * Check if some other thread already assigned a vpm to
		 * this page.
		 */
		if ((vpmref = newpage->p_vpmref) == 0) {
			newpage->p_vpmref = VPMID(vpm);
			gotnewvpm = 1;
		} else {
			VPM_DEBUG(vpmd_contend);
			mutex_exit(vmtx);
		}
		mutex_exit(pmtx);

		if (gotnewvpm) {

			/*
			 * At this point, we've selected the vpm. Remove vpm
			 * from its freelist. If vpm is the first one in
			 * the freelist, update the head of the freelist.
			 */
			if (first == vpm) {
				ASSERT(first == allocq->vpmq_free);
				allocq->vpmq_free = vpm->vpm_next;
			}

			/*
			 * If the head of the freelist still points to vpm,
			 * then there are no more free vpmaps in that list.
			 */
			if (allocq->vpmq_free == vpm)
				/*
				 * Took the last one
				 */
				allocq->vpmq_free = NULL;
			else {
				vpm->vpm_prev->vpm_next = vpm->vpm_next;
				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
			}
			mutex_exit(&allocq->vpmq_mtx);
			vpm->vpm_prev = vpm->vpm_next = NULL;

			/*
			 * Disassociate the previous page. On x64 systems
			 * p_vpmref is used as a mapping reference to the page.
			 */
			if ((pp = vpm->vpm_pp) != NULL &&
			    vpm->vpm_vp == pp->p_vnode &&
			    vpm->vpm_off == pp->p_offset) {

				pmtx = PPMTX(pp);
				if (page_trylock(pp, SE_SHARED)) {
					/*
					 * Now verify that it is the correct
					 * page. If not, someone else stole it,
					 * so just unlock it and leave.
					 */
					mutex_enter(pmtx);
					if (PP_ISFREE(pp) ||
					    vpm->vpm_vp != pp->p_vnode ||
					    vpm->vpm_off != pp->p_offset ||
					    pp->p_vpmref != VPMID(vpm)) {
						mutex_exit(pmtx);

						page_unlock(pp);
					} else {
						/*
						 * Release the page.
						 */
						pp->p_vpmref = 0;
						mutex_exit(pmtx);
						hat_kpm_mapout(pp, 0,
						    hat_kpm_page2va(pp, 1));
						(void) page_release(pp, 1);
					}
				} else {
					/*
					 * If the page cannot be locked, just
					 * clear the p_vpmref and go.
					 */
					mutex_enter(pmtx);
					if (pp->p_vpmref == VPMID(vpm)) {
						pp->p_vpmref = 0;
					}
					mutex_exit(pmtx);
					VPM_DEBUG(vpmd_prevpagelocked);
				}
			}

			/*
			 * Setup vpm to point to the new page.
			 */
			vpm->vpm_pp = newpage;
			vpm->vpm_vp = newpage->p_vnode;
			vpm->vpm_off = newpage->p_offset;

		} else {
			int steal = !VPM_MTBF(steals, steals_mtbf);
			/*
			 * Page already has a vpm assigned, just use that.
			 * Grab the vpm mutex and verify that it is still
			 * the correct one. The pp->p_vpmref should not change
			 * once we have the vpm mutex and the page lock.
			 */
			mutex_exit(&allocq->vpmq_mtx);
			vpm = VPMP(vpmref);
			vmtx = VPMAPMTX(vpm);
			mutex_enter(vmtx);
			if ((steal && vpm->vpm_refcnt == 0) ||
			    vpm->vpm_pp != newpage) {
				/*
				 * The vpm got stolen, retry.
				 * clear the p_vpmref.
				 */
				pmtx = PPMTX(newpage);
				mutex_enter(pmtx);
				if (newpage->p_vpmref == vpmref) {
					newpage->p_vpmref = 0;
				}
				mutex_exit(pmtx);

				mutex_exit(vmtx);
				VPM_DEBUG(vpmd_steals);
				goto retry_queue;
			} else if (vpm->vpm_refcnt == 0) {
				/*
				 * Remove it from the free list if it
				 * exists there.
				 */
				VPMAP_RMFREELIST(vpm);
			}
		}
		return (vpm);
	}
}

static void
free_vpmap(struct vpmap *vpm)
{
	struct vpmfree *vpmflp;
	struct vpmap *vpmfreelist;
	union vpm_freeq *releq;

	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));

	if (vpm->vpm_refcnt != 0) {
		panic("free_vpmap");
		/*NOTREACHED*/
	}

	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
	/*
	 * Add to the tail of the release queue.
	 * Note that vpm_releq and vpm_allocq could toggle
	 * before we get the lock. This does not affect
	 * correctness as the 2 queues are only maintained
	 * to reduce lock pressure.
	 */
	releq = vpmflp->vpm_releq;
	if (releq == &vpmflp->vpm_freeq[0]) {
		vpm->vpm_ndxflg = 0;
	} else {
		vpm->vpm_ndxflg = 1;
	}
	mutex_enter(&releq->vpmq_mtx);
	vpmfreelist = releq->vpmq_free;
	if (vpmfreelist == 0) {
		int want;

		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		/*
		 * Both queue mutexes are held to set vpm_want;
		 * snapshot the value before dropping releq mutex.
		 * If vpm_want appears after the releq mutex is dropped,
		 * then the vpmap just freed is already gone.
		 */
		want = vpmflp->vpm_want;
		mutex_exit(&releq->vpmq_mtx);
		/*
		 * See if there was a waiter before dropping the releq mutex,
		 * then recheck after obtaining the vpm_freeq[0] mutex, as
		 * another thread may have already signaled.
		 */
		if (want) {
			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
			if (vpmflp->vpm_want)
				cv_signal(&vpmflp->vpm_free_cv);
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
		}
	} else {
		vpm->vpm_next = vpmfreelist;
		vpm->vpm_prev = vpmfreelist->vpm_prev;
		vpmfreelist->vpm_prev = vpm;
		vpm->vpm_prev->vpm_next = vpm;
		mutex_exit(&releq->vpmq_mtx);
	}
}

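/*
 * Note on p_vpmref: the field is read without the vpm mutex held, so a
 * nonzero value is only a hint.  Before trusting it, the vpm mutex must
 * be taken and vpm_pp re-verified against the page, since the vpmap may
 * have been stolen for another page in the meantime; get_vpmap() below
 * and get_free_vpmap() above both perform this re-check.
 */
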
/*
 * Get the vpmap for the page.
 * The refcnt of this vpm is incremented.
 */
static struct vpmap *
get_vpmap(page_t *pp)
{
	struct vpmap *vpm = NULL;
	kmutex_t *vmtx;
	kmutex_t *pmtx;
	unsigned int refid;

	ASSERT((pp != NULL) && PAGE_LOCKED(pp));

	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
		vpm = VPMP(refid);
		vmtx = VPMAPMTX(vpm);
		mutex_enter(vmtx);
		/*
		 * Since we have the page lock and the vpm mutex, the
		 * pp->p_vpmref cannot change.
		 */
		if (vpm->vpm_pp != pp) {
			pmtx = PPMTX(pp);

			/*
			 * Clear the p_vpmref as it is incorrect.
			 * This can happen if the page was stolen.
			 * On x64 this should not happen as p_vpmref
			 * is treated as a mapping on the page. So
			 * if the page is stolen, the mapping would have
			 * been cleared in page_unload().
			 */
			mutex_enter(pmtx);
			if (pp->p_vpmref == refid)
				pp->p_vpmref = 0;
			mutex_exit(pmtx);

			mutex_exit(vmtx);
			vpm = NULL;
		} else if (vpm->vpm_refcnt == 0) {
			/*
			 * Got the vpm, remove it from the free
			 * list if it exists there.
			 */
			VPMAP_RMFREELIST(vpm);
		}
	}
	if (vpm == NULL) {
		/*
		 * get_free_vpmap() returns with the vpmap mutex held.
		 */
		vpm = get_free_vpmap(pp);
		vmtx = VPMAPMTX(vpm);
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
	} else {
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
	}

	vpm->vpm_refcnt++;
	mutex_exit(vmtx);

	return (vpm);
}

/* END --- vpm cache ---- */

/*
 * The vnode page mapping (vpm) interface routines.
 */

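/*
 * The usual calling sequence is vpm_map_pages() to obtain a NULL
 * terminated list of kpm mappings in vml[], access to the file data
 * through vml[i].vs_addr, and finally vpm_unmap_pages() to drop the
 * mappings and page locks.  vpm_data_copy() combines these steps with a
 * uiomove() for the common file read/write case.
 */
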
/*
 * Find or create the pages starting from baseoff for the specified
 * length 'len'.
 */
static int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{

	page_t *pp = NULL;
	caddr_t base;
	u_offset_t off = baseoff;
	int i;
	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);

	for (i = 0; len > 0; len -= MIN(len, PAGESIZE), i++) {
		struct vpmap *vpm;


		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {

			base = segkpm_create_va(off);

			/*
			 * The seg pointer passed in is just advisory. Just
			 * pass segkmap for now like segmap does with
			 * segmap_kpm enabled.
			 */
			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
			    segkmap, base)) == NULL) {
				panic("segmap_pagecreate_vpm: "
				    "page_create failed");
				/*NOTREACHED*/
			}
			if (newpage != NULL)
				*newpage = 1;

			page_io_unlock(pp);
		}

		/*
		 * Get the vpm for this page_t.
		 */
		if (vpm_cache_enable) {
			vpm = get_vpmap(pp);
			vml[i].vs_data = (void *)&vpm->vpm_pp;
		} else {
			vml[i].vs_data = (void *)pp;
			pp->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
		vml[i].vs_len = PAGESIZE;

		off += PAGESIZE;
	}
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;
	return (0);
}


/*
 * Returns vpm mappings of pages in the range [off, off+len], where
 * len is rounded up to the PAGESIZE boundary. The list of pages and
 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
 * The nseg is the number of vmap_t entries in the array.
 *
 * Currently the max len allowed is MAXBSIZE; therefore it will fetch or
 * create either one or two pages depending on the PAGESIZE.
 *
 * The segmap's SM_LOCKPROTO usage is not supported by these interfaces.
 * For such cases, use the seg_map interfaces.
 */
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t *vml,
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	extern struct vnode *common_specvp();
	u_offset_t baseoff;
	uint_t prot;
	caddr_t base;
	page_t *pp, *pplist[MAXVMAPS];
	struct vpmap *vpm;
	int i, error = 0;

	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
	baseoff = off & (offset_t)PAGEMASK;
	vml[0].vs_data = NULL;
	vml[0].vs_addr = (caddr_t)NULL;
	/*
	 * For now, let's restrict it to MAXBSIZE. XXX - We could allow
	 * len longer than MAXBSIZE, but there should be a limit
	 * determined by how many pages the VOP_GETPAGE() can fetch.
	 */
	if (off + len > baseoff + MAXBSIZE) {
		panic("vpm_map_pages bad len");
		/*NOTREACHED*/
	}

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);


	if (!fetchpage)
		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));

	/*
	 * The increment expression also NULLs out the next pplist[] entry,
	 * keeping the list NULL terminated as it is filled in.
	 */
	for (i = 0; len > 0; len -= MIN(len, PAGESIZE), i++,
	    pplist[i] = NULL) {

		pp = page_lookup(vp, baseoff, SE_SHARED);

		/*
		 * If we did not find the page or if this page was not
		 * in our cache, then let VOP_GETPAGE get all the pages.
		 * We need to call VOP_GETPAGE so that filesystems can do some
		 * (un)necessary tracking for sequential access.
		 */

		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
		    (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
		    != (P_MOD | P_REF))) {
			if (pp != NULL) {
				page_unlock(pp);
			}

			/*
			 * Pass a dummy address as it will be required
			 * by page_create_va(). We pass segkmap as the seg
			 * as some file systems (UFS) check it.
			 */
			base = segkpm_create_va(baseoff);

			error = VOP_GETPAGE(vp, baseoff, len, &prot,
			    &pplist[i], roundup(len, PAGESIZE),
			    segkmap, base, rw, CRED());
			if (error) {
				VPM_DEBUG(vpmd_getpagefailed);
				pplist[i] = NULL;
			}
			break;
		} else {
			pplist[i] = pp;
			baseoff += PAGESIZE;
		}
	}

	if (error) {
		for (i = 0; pplist[i] != NULL; i++) {
			page_unlock(pplist[i]);
			pplist[i] = NULL;
		}
		vml[0].vs_addr = NULL;
		vml[0].vs_data = NULL;
		return (FC_MAKE_ERR(error));
	}

	/*
	 * Get the vpm's for pages.
	 */
	for (i = 0; pplist[i] != NULL; i++) {
		if (vpm_cache_enable) {
			vpm = get_vpmap(pplist[i]);
			vml[i].vs_data = (void *)&(vpm->vpm_pp);
		} else {
			vml[i].vs_data = (void *)pplist[i];
			pplist[i]->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
		vml[i].vs_len = PAGESIZE;
	}

	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;

	return (0);
}

/*
 * Release the vpm mappings on the pages and unlock them.
 */
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
	int i;
	struct vpmap *vpm;
	kmutex_t *mtx;
	page_t *pp;

	for (i = 0; vml[i].vs_data != NULL; i++) {
		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));

		if (vpm_cache_enable) {
			pp = *(((page_t **)vml[i].vs_data));
		} else {
			pp = (page_t *)vml[i].vs_data;
		}

		/*
		 * Mark the page as modified or referenced, because vpm
		 * accesses do not take the faults through which these bits
		 * would normally get set.
		 */
		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else {
			ASSERT(rw == S_READ);
			hat_setref(pp);
		}

		if (vpm_cache_enable) {
			page_unlock(pp);
			/*
			 * vs_data points at the vpm_pp field inside the
			 * vpmap, so back up to the enclosing structure.
			 */
			vpm = (struct vpmap *)((char *)vml[i].vs_data
			    - offsetof(struct vpmap, vpm_pp));
			mtx = VPMAPMTX(vpm);
			mutex_enter(mtx);

			if (--vpm->vpm_refcnt == 0) {
				free_vpmap(vpm);
			}
			mutex_exit(mtx);
		} else {
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			(void) page_release(pp, 1);
		}
		vml[i].vs_data = NULL;
		vml[i].vs_addr = NULL;
	}
}

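/*
 * Example usage of the pair above (a minimal sketch, not lifted from any
 * particular caller; 'vp', 'off', 'len' and 'uio' are assumed to be set
 * up by the caller, with the request contained within a single page):
 *
 *	vmap_t vml[MINVMAPS];
 *	int err, pon = off & PAGEOFFSET;
 *
 *	err = vpm_map_pages(vp, off, len, 1, vml, MINVMAPS, NULL, S_READ);
 *	if (err == 0) {
 *		err = uiomove(vml[0].vs_addr + pon, (long)len, UIO_READ, uio);
 *		vpm_unmap_pages(vml, S_READ);
 *	}
 *
 * Requests spanning a page boundary must loop over vml[].  vpm_data_copy()
 * below packages this sequence, including the multi-page and zero-fill
 * handling, so most callers do not need to do it by hand.
 */
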
/*
 * Given the vp, off and the uio structure, this routine will do the
 * copy (uiomove). If the last page created is partially written, the
 * rest of the page is zeroed out. It also zeros the beginning of the
 * first page up to the start offset if requested (zerostart).
 * If pages are to be fetched, it will call the filesystem's getpage
 * function (VOP_GETPAGE) to get them, otherwise they will be created if
 * not already present in the page cache.
 */
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	int error;
	struct vmap vml[MINVMAPS];
	enum uio_rw uiorw;
	int npages = 0;

	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
	/*
	 * 'off' will be the offset where the I/O starts.
	 * We get the pages starting at the (off & PAGEMASK)
	 * page boundary.
	 */
	error = vpm_map_pages(vp, off, (uint_t)len,
	    fetchpage, vml, MINVMAPS, &npages, rw);

	if (newpage != NULL)
		*newpage = npages;
	if (!error) {
		int i, pn, slen = len;
		int pon = off & PAGEOFFSET;

		/*
		 * Clear from the beginning of the page to start offset
		 * if requested.
		 */
		if (!fetchpage && zerostart) {
			(void) kzero(vml[0].vs_addr, (uint_t)pon);
			VPM_DEBUG(vpmd_zerostart);
		}

		for (i = 0; !error && slen > 0 &&
		    vml[i].vs_addr != NULL; i++) {
			pn = (int)MIN(slen, (PAGESIZE - pon));
			error = uiomove(vml[i].vs_addr + pon,
			    (long)pn, uiorw, uio);
			slen -= pn;
			pon = 0;
		}

		/*
		 * When new pages are created, zero out part of the
		 * page we did not copy to.
		 */
		if (!fetchpage && npages &&
		    uio->uio_loffset < roundup(off + len, PAGESIZE)) {
			int nzero;

			pon = (uio->uio_loffset & PAGEOFFSET);
			nzero = PAGESIZE - pon;
			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
		}
		vpm_unmap_pages(vml, rw);
	}
	return (error);
}

/*
 * Called to flush pages of the given vnode covering the
 * range [off, off+len].
 */
int
vpm_sync_pages(struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t flags)
{
	extern struct vnode *common_specvp();
	int bflags = 0;
	int error = 0;
	size_t psize = roundup(len, PAGESIZE);

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	if ((flags & ~SM_DONTNEED) != 0) {
		if (flags & SM_ASYNC)
			bflags |= B_ASYNC;
		if (flags & SM_INVAL)
			bflags |= B_INVAL;
		if (flags & SM_DESTROY)
			bflags |= (B_INVAL|B_TRUNC);
		if (flags & SM_FREE)
			bflags |= B_FREE;
		if (flags & SM_DONTNEED)
			bflags |= B_DONTNEED;

		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED());
	}

	return (error);
}


#else	/* SEGKPM_SUPPORT */

/* vpm stubs */
void
vpm_init()
{
}

/*ARGSUSED*/
int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t vml[],
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
}

/*ARGSUSED*/
int
vpm_sync_pages(struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t flags)
{
	return (0);
}
#endif	/* SEGKPM_SUPPORT */