/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2015, Joyent, Inc.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - segment management.
 */

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
#include <sys/mman.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>

/*
 * kstats for segment advise
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
};

kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);

/*
 * entry in the segment page cache
 */
struct seg_pcache {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	pcache_link_t		p_plink;	/* per segment/amp list */
	void			*p_htag0;	/* segment/amp pointer */
	caddr_t			p_addr;		/* base address/anon_idx */
	size_t			p_len;		/* total bytes */
	size_t			p_wlen;		/* writable bytes at p_addr */
	struct page		**p_pp;		/* pp shadow list */
	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
	clock_t			p_lbolt;	/* lbolt from last use */
	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
	uint_t			p_active;	/* active count */
	uchar_t			p_write;	/* true if S_WRITE */
	uchar_t			p_ref;		/* reference byte */
	ushort_t		p_flags;	/* bit flags */
};

struct seg_phash {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
	pcache_link_t		p_halink[2];	/* active bucket linkages */
};

struct seg_phash_wired {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
};

/*
 * A parameter to control a maximum number of bytes that can be
 * purged from pcache at a time.
 */
#define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)

/*
 * log2(fraction of pcache to reclaim at a time).
 */
#define	P_SHRINK_SHFT		(5)

/*
 * The following variables can be tuned via /etc/system.
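 *
 * As an illustrative example (not a recommendation), adding a line such as
 *
 *	set segpcache_enabled = 0
 *
 * to /etc/system turns shadow list caching off at the next boot; the other
 * variables below can be set the same way.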
*/ int segpcache_enabled = 1; /* if 1, shadow lists are cached */ pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */ ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */ ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */ int segpcache_reap_sec = 1; /* reap check rate in secs */ clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */ int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */ clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */ int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */ pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */ static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */ static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */ static kcondvar_t seg_pasync_cv; #pragma align 64(pctrl1) #pragma align 64(pctrl2) #pragma align 64(pctrl3) /* * Keep frequently used variables together in one cache line. */ static struct p_ctrl1 { uint_t p_disabled; /* if not 0, caching temporarily off */ pgcnt_t p_maxwin; /* max # of pages that can be cached */ size_t p_hashwin_sz; /* # of non wired buckets */ struct seg_phash *p_htabwin; /* hash table for non wired entries */ size_t p_hashwired_sz; /* # of wired buckets */ struct seg_phash_wired *p_htabwired; /* hash table for wired entries */ kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */ #ifdef _LP64 ulong_t pad[1]; #endif /* _LP64 */ } pctrl1; static struct p_ctrl2 { kmutex_t p_mem_mtx; /* protects window counter and p_halinks */ pgcnt_t p_locked_win; /* # pages from window */ pgcnt_t p_locked; /* # of pages cached by pagelock */ uchar_t p_ahcur; /* current active links for insert/delete */ uchar_t p_athr_on; /* async reclaim thread is running. */ pcache_link_t p_ahhead[2]; /* active buckets linkages */ } pctrl2; static struct p_ctrl3 { clock_t p_pcp_maxage; /* max pcp age in ticks */ ulong_t p_athr_empty_ahb; /* athread walk stats */ ulong_t p_athr_full_ahb; /* athread walk stats */ pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */ int p_shrink_shft; /* reap shift factor */ #ifdef _LP64 ulong_t pad[3]; #endif /* _LP64 */ } pctrl3; #define seg_pdisabled pctrl1.p_disabled #define seg_pmaxwindow pctrl1.p_maxwin #define seg_phashsize_win pctrl1.p_hashwin_sz #define seg_phashtab_win pctrl1.p_htabwin #define seg_phashsize_wired pctrl1.p_hashwired_sz #define seg_phashtab_wired pctrl1.p_htabwired #define seg_pkmcache pctrl1.p_kmcache #define seg_pmem_mtx pctrl2.p_mem_mtx #define seg_plocked_window pctrl2.p_locked_win #define seg_plocked pctrl2.p_locked #define seg_pahcur pctrl2.p_ahcur #define seg_pathr_on pctrl2.p_athr_on #define seg_pahhead pctrl2.p_ahhead #define seg_pmax_pcpage pctrl3.p_pcp_maxage #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb #define seg_pshrink_shift pctrl3.p_shrink_shft #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages #define P_HASHWIN_MASK (seg_phashsize_win - 1) #define P_HASHWIRED_MASK (seg_phashsize_wired - 1) #define P_BASESHIFT (6) kthread_t *seg_pasync_thr; extern struct seg_ops segvn_ops; extern struct seg_ops segspt_shmops; #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED) #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags) #define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t))) #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt) /* * htag0 argument can be a seg or amp pointer. */ #define P_HASHBP(seg, htag0, addr, flags) \ (IS_PFLAGS_WIRED((flags)) ? 
\ ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \ ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \ (&seg_phashtab_win[P_HASHWIN_MASK & \ (((uintptr_t)(htag0) >> 3) ^ \ ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \ (flags >> 16) : page_get_shift((seg)->s_szc))))])) /* * htag0 argument can be a seg or amp pointer. */ #define P_MATCH(pcp, htag0, addr, len) \ ((pcp)->p_htag0 == (htag0) && \ (pcp)->p_addr == (addr) && \ (pcp)->p_len >= (len)) #define P_MATCH_PP(pcp, htag0, addr, len, pp) \ ((pcp)->p_pp == (pp) && \ (pcp)->p_htag0 == (htag0) && \ (pcp)->p_addr == (addr) && \ (pcp)->p_len >= (len)) #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \ offsetof(struct seg_pcache, p_plink))) #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \ offsetof(struct seg_phash, p_halink[l]))) /* * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from * active hash bucket lists. We maintain active bucket lists to reduce the * overhead of finding active buckets during asynchronous purging since there * can be 10s of millions of buckets on a large system but only a small subset * of them in actual use. * * There're 2 active bucket lists. Current active list (as per seg_pahcur) is * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete * buckets. The other list is used by asynchronous purge thread. This allows * the purge thread to walk its active list without holding seg_pmem_mtx for a * long time. When asynchronous thread is done with its list it switches to * current active list and makes the list it just finished processing as * current active list. * * seg_padd_abuck() only adds the bucket to current list if the bucket is not * yet on any list. seg_premove_abuck() may remove the bucket from either * list. If the bucket is on current list it will be always removed. Otherwise * the bucket is only removed if asynchronous purge thread is not currently * running or seg_premove_abuck() is called by asynchronous purge thread * itself. A given bucket can only be on one of active lists at a time. These * routines should be called with per bucket lock held. The routines use * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after * the first entry is added to the bucket chain and seg_premove_abuck() must * be called after the last pcp entry is deleted from its chain. Per bucket * lock should be held by the callers. This avoids a potential race condition * when seg_premove_abuck() removes a bucket after pcp entries are added to * its list after the caller checked that the bucket has no entries. (this * race would cause a loss of an active bucket from the active lists). * * Both lists are circular doubly linked lists anchored at seg_pahhead heads. * New entries are added to the end of the list since LRU is used as the * purging policy. */ static void seg_padd_abuck(struct seg_phash *hp) { int lix; ASSERT(MUTEX_HELD(&hp->p_hmutex)); ASSERT((struct seg_phash *)hp->p_hnext != hp); ASSERT((struct seg_phash *)hp->p_hprev != hp); ASSERT(hp->p_hnext == hp->p_hprev); ASSERT(!IS_PCP_WIRED(hp->p_hnext)); ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp); ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp); ASSERT(hp >= seg_phashtab_win && hp < &seg_phashtab_win[seg_phashsize_win]); /* * This bucket can already be on one of active lists * since seg_premove_abuck() may have failed to remove it * before. 
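	 * (As described above, seg_premove_abuck() leaves a bucket on the
	 * non-current list while the asynchronous purge thread is running,
	 * so the bucket may still be linked when entries are re-added.)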
*/ mutex_enter(&seg_pmem_mtx); lix = seg_pahcur; ASSERT(lix >= 0 && lix <= 1); if (hp->p_halink[lix].p_lnext != NULL) { ASSERT(hp->p_halink[lix].p_lprev != NULL); ASSERT(hp->p_halink[!lix].p_lnext == NULL); ASSERT(hp->p_halink[!lix].p_lprev == NULL); mutex_exit(&seg_pmem_mtx); return; } ASSERT(hp->p_halink[lix].p_lprev == NULL); /* * If this bucket is still on list !lix async thread can't yet remove * it since we hold here per bucket lock. In this case just return * since async thread will eventually find and process this bucket. */ if (hp->p_halink[!lix].p_lnext != NULL) { ASSERT(hp->p_halink[!lix].p_lprev != NULL); mutex_exit(&seg_pmem_mtx); return; } ASSERT(hp->p_halink[!lix].p_lprev == NULL); /* * This bucket is not on any active bucket list yet. * Add the bucket to the tail of current active list. */ hp->p_halink[lix].p_lnext = &seg_pahhead[lix]; hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev; seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix]; seg_pahhead[lix].p_lprev = &hp->p_halink[lix]; mutex_exit(&seg_pmem_mtx); } static void seg_premove_abuck(struct seg_phash *hp, int athr) { int lix; ASSERT(MUTEX_HELD(&hp->p_hmutex)); ASSERT((struct seg_phash *)hp->p_hnext == hp); ASSERT((struct seg_phash *)hp->p_hprev == hp); ASSERT(hp >= seg_phashtab_win && hp < &seg_phashtab_win[seg_phashsize_win]); if (athr) { ASSERT(seg_pathr_on); ASSERT(seg_pahcur <= 1); /* * We are called by asynchronous thread that found this bucket * on not currently active (i.e. !seg_pahcur) list. Remove it * from there. Per bucket lock we are holding makes sure * seg_pinsert() can't sneak in and add pcp entries to this * bucket right before we remove the bucket from its list. */ lix = !seg_pahcur; ASSERT(hp->p_halink[lix].p_lnext != NULL); ASSERT(hp->p_halink[lix].p_lprev != NULL); ASSERT(hp->p_halink[!lix].p_lnext == NULL); ASSERT(hp->p_halink[!lix].p_lprev == NULL); hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; hp->p_halink[lix].p_lnext = NULL; hp->p_halink[lix].p_lprev = NULL; return; } mutex_enter(&seg_pmem_mtx); lix = seg_pahcur; ASSERT(lix >= 0 && lix <= 1); /* * If the bucket is on currently active list just remove it from * there. */ if (hp->p_halink[lix].p_lnext != NULL) { ASSERT(hp->p_halink[lix].p_lprev != NULL); ASSERT(hp->p_halink[!lix].p_lnext == NULL); ASSERT(hp->p_halink[!lix].p_lprev == NULL); hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; hp->p_halink[lix].p_lnext = NULL; hp->p_halink[lix].p_lprev = NULL; mutex_exit(&seg_pmem_mtx); return; } ASSERT(hp->p_halink[lix].p_lprev == NULL); /* * If asynchronous thread is not running we can remove the bucket from * not currently active list. The bucket must be on this list since we * already checked that it's not on the other list and the bucket from * which we just deleted the last pcp entry must be still on one of the * active bucket lists. */ lix = !lix; ASSERT(hp->p_halink[lix].p_lnext != NULL); ASSERT(hp->p_halink[lix].p_lprev != NULL); if (!seg_pathr_on) { hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; hp->p_halink[lix].p_lnext = NULL; hp->p_halink[lix].p_lprev = NULL; } mutex_exit(&seg_pmem_mtx); } /* * Check if bucket pointed by hp already has a pcp entry that matches request * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise. 
* Also delete matching entries that cover smaller address range but start * at the same address as addr argument. Return the list of deleted entries if * any. This is an internal helper function called from seg_pinsert() only * for non wired shadow lists. The caller already holds a per seg/amp list * lock. */ static struct seg_pcache * seg_plookup_checkdup(struct seg_phash *hp, void *htag0, caddr_t addr, size_t len, int *found) { struct seg_pcache *pcp; struct seg_pcache *delcallb_list = NULL; ASSERT(MUTEX_HELD(&hp->p_hmutex)); *found = 0; for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; pcp = pcp->p_hnext) { ASSERT(pcp->p_hashp == hp); if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) { ASSERT(!IS_PCP_WIRED(pcp)); if (pcp->p_len < len) { pcache_link_t *plinkp; if (pcp->p_active) { continue; } plinkp = &pcp->p_plink; plinkp->p_lprev->p_lnext = plinkp->p_lnext; plinkp->p_lnext->p_lprev = plinkp->p_lprev; pcp->p_hprev->p_hnext = pcp->p_hnext; pcp->p_hnext->p_hprev = pcp->p_hprev; pcp->p_hprev = delcallb_list; delcallb_list = pcp; } else { *found = 1; break; } } } return (delcallb_list); } /* * lookup an address range in pagelock cache. Return shadow list and bump up * active count. If amp is not NULL use amp as a lookup tag otherwise use seg * as a lookup tag. */ struct page ** seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, enum seg_rw rw, uint_t flags) { struct seg_pcache *pcp; struct seg_phash *hp; void *htag0; ASSERT(seg != NULL); ASSERT(rw == S_READ || rw == S_WRITE); /* * Skip pagelock cache, while DR is in progress or * seg_pcache is off. */ if (seg_pdisabled) { return (NULL); } ASSERT(seg_phashsize_win != 0); htag0 = (amp == NULL ? (void *)seg : (void *)amp); hp = P_HASHBP(seg, htag0, addr, flags); mutex_enter(&hp->p_hmutex); for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; pcp = pcp->p_hnext) { ASSERT(pcp->p_hashp == hp); if (P_MATCH(pcp, htag0, addr, len)) { ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); /* * If this request wants to write pages * but write permissions starting from * addr don't cover the entire length len * return lookup failure back to the caller. * It will check protections and fail this * pagelock operation with EACCESS error. */ if (rw == S_WRITE && pcp->p_wlen < len) { break; } if (pcp->p_active == UINT_MAX) { break; } pcp->p_active++; if (rw == S_WRITE && !pcp->p_write) { pcp->p_write = 1; } mutex_exit(&hp->p_hmutex); return (pcp->p_pp); } } mutex_exit(&hp->p_hmutex); return (NULL); } /* * mark address range inactive. If the cache is off or the address range is * not in the cache or another shadow list that covers bigger range is found * we call the segment driver to reclaim the pages. Otherwise just decrement * active count and set ref bit. If amp is not NULL use amp as a lookup tag * otherwise use seg as a lookup tag. */ void seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, struct page **pp, enum seg_rw rw, uint_t flags, seg_preclaim_cbfunc_t callback) { struct seg_pcache *pcp; struct seg_phash *hp; kmutex_t *pmtx = NULL; pcache_link_t *pheadp; void *htag0; pgcnt_t npages = 0; int keep = 0; ASSERT(seg != NULL); ASSERT(rw == S_READ || rw == S_WRITE); htag0 = (amp == NULL ? (void *)seg : (void *)amp); /* * Skip lookup if pcache is not configured. */ if (seg_phashsize_win == 0) { goto out; } /* * Grab per seg/amp lock before hash lock if we are going to remove * inactive entry from pcache. 
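	 * The per seg/amp list lock is above the hash bucket lock in the
	 * lock order (see the mutex_tryenter() comment further below), so
	 * it has to be taken first, before the bucket is locked.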
	 */
	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
		}
		mutex_enter(pmtx);
	}

	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
again:
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_active);
			if (keep) {
				/*
				 * Don't remove this pcp entry
				 * if we didn't find duplicate
				 * shadow lists on second search.
				 * Somebody removed those duplicates
				 * since we dropped hash lock after first
				 * search.
				 */
				ASSERT(pmtx != NULL);
				ASSERT(!IS_PFLAGS_WIRED(flags));
				mutex_exit(pmtx);
				pmtx = NULL;
			}
			pcp->p_active--;
			if (pcp->p_active == 0 && (pmtx != NULL ||
			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
				/*
				 * This entry is no longer active.  Remove it
				 * now either because pcaching is temporarily
				 * disabled or there're other pcp entries that
				 * can match this pagelock request (i.e. this
				 * entry is a duplicate).
				 */
				ASSERT(callback == pcp->p_callback);
				if (pmtx != NULL) {
					pcache_link_t *plinkp = &pcp->p_plink;
					ASSERT(!IS_PCP_WIRED(pcp));
					ASSERT(pheadp->p_lnext != pheadp);
					ASSERT(pheadp->p_lprev != pheadp);
					plinkp->p_lprev->p_lnext =
					    plinkp->p_lnext;
					plinkp->p_lnext->p_lprev =
					    plinkp->p_lprev;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				if (!IS_PCP_WIRED(pcp) &&
				    hp->p_hnext == (struct seg_pcache *)hp) {
					/*
					 * We removed the last entry from this
					 * bucket.  Now remove the bucket from
					 * its active list.
					 */
					seg_premove_abuck(hp, 0);
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				len = pcp->p_len;
				npages = btop(len);
				if (rw != S_WRITE && pcp->p_write) {
					rw = S_WRITE;
				}
				kmem_cache_free(seg_pkmcache, pcp);
				goto out;
			} else {
				/*
				 * We found a matching pcp entry but will not
				 * free it right away even if it's no longer
				 * active.
				 */
				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
					/*
					 * Set the reference bit and mark the
					 * time of last access to this pcp
					 * so that asynchronous thread doesn't
					 * free it immediately since
					 * it may be reactivated very soon.
					 */
					pcp->p_lbolt = ddi_get_lbolt();
					pcp->p_ref = 1;
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				return;
			}
		} else if (!IS_PFLAGS_WIRED(flags) &&
		    P_MATCH(pcp, htag0, addr, len)) {
			/*
			 * This is a duplicate pcp entry.  This situation may
			 * happen if a bigger shadow list that covers our
			 * range was added while our entry was still active.
			 * Now we can free our pcp entry if it becomes
			 * inactive.
			 */
			if (!pcp->p_active) {
				/*
				 * Mark this entry as referenced just in case
				 * we'll free our own pcp entry soon.
				 */
				pcp->p_lbolt = ddi_get_lbolt();
				pcp->p_ref = 1;
			}
			if (pmtx != NULL) {
				/*
				 * we are already holding pmtx and found a
				 * duplicate.  Don't keep our own pcp entry.
				 */
				keep = 0;
				continue;
			}
			/*
			 * We have to use mutex_tryenter to attempt to lock
			 * seg/amp list lock since we already hold hash lock
			 * and seg/amp list lock is above hash lock in lock
			 * order.  If mutex_tryenter fails drop hash lock and
			 * retake both locks in correct order and re-search
			 * this hash chain.
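			 * (Entries may have been added to or removed from
			 * this chain while neither lock was held, hence the
			 * restart from the bucket head.)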
			 */
			ASSERT(keep == 0);
			if (amp == NULL) {
				pheadp = &seg->s_phead;
				pmtx = &seg->s_pmtx;
			} else {
				pheadp = &amp->a_phead;
				pmtx = &amp->a_pmtx;
			}
			if (!mutex_tryenter(pmtx)) {
				mutex_exit(&hp->p_hmutex);
				mutex_enter(pmtx);
				mutex_enter(&hp->p_hmutex);
				/*
				 * If we don't find bigger shadow list on
				 * second search (it may happen since we
				 * dropped bucket lock) keep the entry that
				 * matches our own shadow list.
				 */
				keep = 1;
				goto again;
			}
		}
	}
	mutex_exit(&hp->p_hmutex);
	if (pmtx != NULL) {
		mutex_exit(pmtx);
	}
out:
	(*callback)(htag0, addr, len, pp, rw, 0);
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		seg_plocked -= npages;
		if (!IS_PFLAGS_WIRED(flags)) {
			ASSERT(seg_plocked_window >= npages);
			seg_plocked_window -= npages;
		}
		mutex_exit(&seg_pmem_mtx);
	}
}

#ifdef DEBUG
static uint32_t p_insert_chk_mtbf = 0;
#endif

/*
 * The seg_pinsert_check() is used by segment drivers to predict whether
 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
 */
/*ARGSUSED*/
int
seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, uint_t flags)
{
	ASSERT(seg != NULL);

#ifdef DEBUG
	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
		return (SEGP_FAIL);
	}
#endif

	if (seg_pdisabled) {
		return (SEGP_FAIL);
	}
	ASSERT(seg_phashsize_win != 0);

	if (IS_PFLAGS_WIRED(flags)) {
		return (SEGP_SUCCESS);
	}

	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
		return (SEGP_FAIL);
	}

	if (freemem < desfree) {
		return (SEGP_FAIL);
	}

	return (SEGP_SUCCESS);
}

#ifdef DEBUG
static uint32_t p_insert_mtbf = 0;
#endif

/*
 * Insert address range with shadow list into pagelock cache if there's no
 * shadow list already cached for this address range. If the cache is off or
 * caching is temporarily disabled or the allowed 'window' is exceeded return
 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
 *
 * For non wired shadow lists (segvn case) include address in the hashing
 * function to avoid linking all the entries from the same segment or amp on
 * the same bucket.  amp is used instead of seg if amp is not NULL. Non wired
 * pcache entries are also linked on a per segment/amp list so that all
 * entries can be found quickly during seg/amp purge without walking the
 * entire pcache hash table.  For wired shadow lists (segspt case) we
 * don't use address hashing and per segment linking because the caller
 * currently inserts only one entry per segment that covers the entire
 * segment. If we used per segment linking even for segspt it would complicate
 * seg_ppurge_wiredpp() locking.
 *
 * Both hash bucket and per seg/amp locks need to be held before adding a non
 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
 * first.
 *
 * This function will also remove from pcache old inactive shadow lists that
 * overlap with this request but cover smaller range for the same start
 * address.
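 *
 * A rough, illustrative sketch of how a segment driver is expected to use
 * this interface on its pagelock fast path (identifiers such as pplist and
 * reclaim_cb are hypothetical, not taken from any particular driver):
 *
 *	pplist = seg_plookup(seg, amp, addr, len, rw, 0);
 *	if (pplist != NULL)
 *		return (pplist);	(cache hit, active count bumped)
 *	if (seg_pinsert_check(seg, amp, addr, len, 0) == SEGP_SUCCESS) {
 *		... lock the pages and build the shadow list pplist ...
 *		(void) seg_pinsert(seg, amp, addr, len, wlen, pplist, rw, 0,
 *		    reclaim_cb);
 *	}
 *
 * The matching unlock path passes the same tag, range and shadow list to
 * seg_pinactive(), which drops the active count and invokes the reclaim
 * callback if the entry cannot be kept.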
 */
int
seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages;
	pcache_link_t *pheadp;
	kmutex_t *pmtx;
	struct seg_pcache *delcallb_list = NULL;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);
	ASSERT(rw == S_READ || wlen == len);
	ASSERT(rw == S_WRITE || wlen <= len);
	ASSERT(amp == NULL || wlen == len);

#ifdef DEBUG
	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
		return (SEGP_FAIL);
	}
#endif

	if (seg_pdisabled) {
		return (SEGP_FAIL);
	}
	ASSERT(seg_phashsize_win != 0);

	ASSERT((len & PAGEOFFSET) == 0);
	npages = btop(len);
	mutex_enter(&seg_pmem_mtx);
	if (!IS_PFLAGS_WIRED(flags)) {
		if (seg_plocked_window + npages > seg_pmaxwindow) {
			mutex_exit(&seg_pmem_mtx);
			return (SEGP_FAIL);
		}
		seg_plocked_window += npages;
	}
	seg_plocked += npages;
	mutex_exit(&seg_pmem_mtx);

	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
	/*
	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
	 */
	if (amp == NULL) {
		pcp->p_htag0 = (void *)seg;
		pcp->p_flags = flags & 0xffff;
	} else {
		pcp->p_htag0 = (void *)amp;
		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
	}
	pcp->p_addr = addr;
	pcp->p_len = len;
	pcp->p_wlen = wlen;
	pcp->p_pp = pp;
	pcp->p_write = (rw == S_WRITE);
	pcp->p_callback = callback;
	pcp->p_active = 1;

	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
	if (!IS_PFLAGS_WIRED(flags)) {
		int found;
		void *htag0;
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
			htag0 = (void *)seg;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
			htag0 = (void *)amp;
		}
		mutex_enter(pmtx);
		mutex_enter(&hp->p_hmutex);
		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
		    len, &found);
		if (found) {
			mutex_exit(&hp->p_hmutex);
			mutex_exit(pmtx);
			mutex_enter(&seg_pmem_mtx);
			seg_plocked -= npages;
			seg_plocked_window -= npages;
			mutex_exit(&seg_pmem_mtx);
			kmem_cache_free(seg_pkmcache, pcp);
			goto out;
		}
		pcp->p_plink.p_lnext = pheadp->p_lnext;
		pcp->p_plink.p_lprev = pheadp;
		pheadp->p_lnext->p_lprev = &pcp->p_plink;
		pheadp->p_lnext = &pcp->p_plink;
	} else {
		mutex_enter(&hp->p_hmutex);
	}

	pcp->p_hashp = hp;
	pcp->p_hnext = hp->p_hnext;
	pcp->p_hprev = (struct seg_pcache *)hp;
	hp->p_hnext->p_hprev = pcp;
	hp->p_hnext = pcp;
	if (!IS_PFLAGS_WIRED(flags) &&
	    hp->p_hprev == pcp) {
		seg_padd_abuck(hp);
	}
	mutex_exit(&hp->p_hmutex);
	if (!IS_PFLAGS_WIRED(flags)) {
		mutex_exit(pmtx);
	}

out:
	npages = 0;
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
		npages += btop(pcp->p_len);
		kmem_cache_free(seg_pkmcache, pcp);
	}
	if (npages) {
		ASSERT(!IS_PFLAGS_WIRED(flags));
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		ASSERT(seg_plocked_window >= npages);
		seg_plocked -= npages;
		seg_plocked_window -= npages;
		mutex_exit(&seg_pmem_mtx);
	}

	return (SEGP_SUCCESS);
}

/*
 * purge entries from the pagelock cache if not active
 * and not recently used.
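 *
 * Called with force == 0 from seg_pasync_thread() on its periodic wakeups
 * (and indirectly via seg_preap() when pageout signals memory pressure),
 * and with force != 0 from seg_p_disable() when the cache has to be drained
 * completely, e.g. ahead of a DR memory delete.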
*/ static void seg_ppurge_async(int force) { struct seg_pcache *delcallb_list = NULL; struct seg_pcache *pcp; struct seg_phash *hp; pgcnt_t npages = 0; pgcnt_t npages_window = 0; pgcnt_t npgs_to_purge; pgcnt_t npgs_purged = 0; int hlinks = 0; int hlix; pcache_link_t *hlinkp; pcache_link_t *hlnextp = NULL; int lowmem; int trim; ASSERT(seg_phashsize_win != 0); /* * if the cache is off or empty, return */ if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) { return; } if (!force) { lowmem = 0; trim = 0; if (freemem < lotsfree + needfree) { spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0); if (fmem <= 5 * (desfree >> 2)) { lowmem = 1; } else if (fmem <= 7 * (lotsfree >> 3)) { if (seg_plocked_window >= (availrmem_initial >> 1)) { lowmem = 1; } } else if (fmem < lotsfree) { if (seg_plocked_window >= 3 * (availrmem_initial >> 2)) { lowmem = 1; } } } if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) { trim = 1; } if (!lowmem && !trim) { return; } npgs_to_purge = seg_plocked_window >> seg_pshrink_shift; if (lowmem) { npgs_to_purge = MIN(npgs_to_purge, MAX(seg_pmaxapurge_npages, desfree)); } else { npgs_to_purge = MIN(npgs_to_purge, seg_pmaxapurge_npages); } if (npgs_to_purge == 0) { return; } } else { struct seg_phash_wired *hpw; ASSERT(seg_phashsize_wired != 0); for (hpw = seg_phashtab_wired; hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) { if (hpw->p_hnext == (struct seg_pcache *)hpw) { continue; } mutex_enter(&hpw->p_hmutex); for (pcp = hpw->p_hnext; pcp != (struct seg_pcache *)hpw; pcp = pcp->p_hnext) { ASSERT(IS_PCP_WIRED(pcp)); ASSERT(pcp->p_hashp == (struct seg_phash *)hpw); if (pcp->p_active) { continue; } pcp->p_hprev->p_hnext = pcp->p_hnext; pcp->p_hnext->p_hprev = pcp->p_hprev; pcp->p_hprev = delcallb_list; delcallb_list = pcp; } mutex_exit(&hpw->p_hmutex); } } mutex_enter(&seg_pmem_mtx); if (seg_pathr_on) { mutex_exit(&seg_pmem_mtx); goto runcb; } seg_pathr_on = 1; mutex_exit(&seg_pmem_mtx); ASSERT(seg_pahcur <= 1); hlix = !seg_pahcur; again: for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix]; hlinkp = hlnextp) { hlnextp = hlinkp->p_lnext; ASSERT(hlnextp != NULL); hp = hlink2phash(hlinkp, hlix); if (hp->p_hnext == (struct seg_pcache *)hp) { seg_pathr_empty_ahb++; continue; } seg_pathr_full_ahb++; mutex_enter(&hp->p_hmutex); for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; pcp = pcp->p_hnext) { pcache_link_t *pheadp; pcache_link_t *plinkp; void *htag0; kmutex_t *pmtx; ASSERT(!IS_PCP_WIRED(pcp)); ASSERT(pcp->p_hashp == hp); if (pcp->p_active) { continue; } if (!force && pcp->p_ref && PCP_AGE(pcp) < seg_pmax_pcpage) { pcp->p_ref = 0; continue; } plinkp = &pcp->p_plink; htag0 = pcp->p_htag0; if (pcp->p_flags & SEGP_AMP) { pheadp = &((amp_t *)htag0)->a_phead; pmtx = &((amp_t *)htag0)->a_pmtx; } else { pheadp = &((seg_t *)htag0)->s_phead; pmtx = &((seg_t *)htag0)->s_pmtx; } if (!mutex_tryenter(pmtx)) { continue; } ASSERT(pheadp->p_lnext != pheadp); ASSERT(pheadp->p_lprev != pheadp); plinkp->p_lprev->p_lnext = plinkp->p_lnext; plinkp->p_lnext->p_lprev = plinkp->p_lprev; pcp->p_hprev->p_hnext = pcp->p_hnext; pcp->p_hnext->p_hprev = pcp->p_hprev; mutex_exit(pmtx); pcp->p_hprev = delcallb_list; delcallb_list = pcp; npgs_purged += btop(pcp->p_len); } if (hp->p_hnext == (struct seg_pcache *)hp) { seg_premove_abuck(hp, 1); } mutex_exit(&hp->p_hmutex); if (npgs_purged >= seg_plocked_window) { break; } if (!force) { if (npgs_purged >= npgs_to_purge) { break; } if (!trim && !(seg_pathr_full_ahb & 15)) { ASSERT(lowmem); if (freemem >= lotsfree + 
needfree) { break; } } } } if (hlinkp == &seg_pahhead[hlix]) { /* * We processed the entire hlix active bucket list * but didn't find enough pages to reclaim. * Switch the lists and walk the other list * if we haven't done it yet. */ mutex_enter(&seg_pmem_mtx); ASSERT(seg_pathr_on); ASSERT(seg_pahcur == !hlix); seg_pahcur = hlix; mutex_exit(&seg_pmem_mtx); if (++hlinks < 2) { hlix = !hlix; goto again; } } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] && seg_pahhead[hlix].p_lnext != hlinkp) { ASSERT(hlinkp != NULL); ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]); ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]); ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]); /* * Reinsert the header to point to hlinkp * so that we start from hlinkp bucket next time around. */ seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev; seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext; seg_pahhead[hlix].p_lnext = hlinkp; seg_pahhead[hlix].p_lprev = hlinkp->p_lprev; hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix]; hlinkp->p_lprev = &seg_pahhead[hlix]; } mutex_enter(&seg_pmem_mtx); ASSERT(seg_pathr_on); seg_pathr_on = 0; mutex_exit(&seg_pmem_mtx); runcb: /* * Run the delayed callback list. segments/amps can't go away until * callback is executed since they must have non 0 softlockcnt. That's * why we don't need to hold as/seg/amp locks to execute the callback. */ while (delcallb_list != NULL) { pcp = delcallb_list; delcallb_list = pcp->p_hprev; ASSERT(!pcp->p_active); (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1); npages += btop(pcp->p_len); if (!IS_PCP_WIRED(pcp)) { npages_window += btop(pcp->p_len); } kmem_cache_free(seg_pkmcache, pcp); } if (npages) { mutex_enter(&seg_pmem_mtx); ASSERT(seg_plocked >= npages); ASSERT(seg_plocked_window >= npages_window); seg_plocked -= npages; seg_plocked_window -= npages_window; mutex_exit(&seg_pmem_mtx); } } /* * Remove cached pages for segment(s) entries from hashtable. The segments * are identified by pp array. This is useful for multiple seg's cached on * behalf of dummy segment (ISM/DISM) with common pp array. */ void seg_ppurge_wiredpp(struct page **pp) { struct seg_pcache *pcp; struct seg_phash_wired *hp; pgcnt_t npages = 0; struct seg_pcache *delcallb_list = NULL; /* * if the cache is empty, return */ if (seg_plocked == 0) { return; } ASSERT(seg_phashsize_wired != 0); for (hp = seg_phashtab_wired; hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) { if (hp->p_hnext == (struct seg_pcache *)hp) { continue; } mutex_enter(&hp->p_hmutex); pcp = hp->p_hnext; while (pcp != (struct seg_pcache *)hp) { ASSERT(pcp->p_hashp == (struct seg_phash *)hp); ASSERT(IS_PCP_WIRED(pcp)); /* * purge entries which are not active */ if (!pcp->p_active && pcp->p_pp == pp) { ASSERT(pcp->p_htag0 != NULL); pcp->p_hprev->p_hnext = pcp->p_hnext; pcp->p_hnext->p_hprev = pcp->p_hprev; pcp->p_hprev = delcallb_list; delcallb_list = pcp; } pcp = pcp->p_hnext; } mutex_exit(&hp->p_hmutex); /* * segments can't go away until callback is executed since * they must have non 0 softlockcnt. That's why we don't * need to hold as/seg locks to execute the callback. */ while (delcallb_list != NULL) { int done; pcp = delcallb_list; delcallb_list = pcp->p_hprev; ASSERT(!pcp->p_active); done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len, pcp->p_pp, pcp->p_write ? 
			    S_WRITE : S_READ, 1);
			npages += btop(pcp->p_len);
			ASSERT(IS_PCP_WIRED(pcp));
			kmem_cache_free(seg_pkmcache, pcp);
			if (done) {
				ASSERT(delcallb_list == NULL);
				goto out;
			}
		}
	}

out:
	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_plocked >= npages);
	seg_plocked -= npages;
	mutex_exit(&seg_pmem_mtx);
}

/*
 * purge all entries for a given segment. Since we
 * callback into the segment driver directly for page
 * reclaim the caller needs to hold the right locks.
 */
void
seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	void *htag0;

	if (seg_plocked == 0) {
		return;
	}
	ASSERT(seg_phashsize_win != 0);

	/*
	 * If amp is not NULL use amp as a lookup tag otherwise use seg
	 * as a lookup tag.
	 */
	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
	ASSERT(htag0 != NULL);
	if (IS_PFLAGS_WIRED(flags)) {
		hp = P_HASHBP(seg, htag0, 0, flags);
		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;
		while (pcp != (struct seg_pcache *)hp) {
			ASSERT(pcp->p_hashp == hp);
			ASSERT(IS_PCP_WIRED(pcp));
			if (pcp->p_htag0 == htag0) {
				if (pcp->p_active) {
					break;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			pcp = pcp->p_hnext;
		}
		mutex_exit(&hp->p_hmutex);
	} else {
		pcache_link_t *plinkp;
		pcache_link_t *pheadp;
		kmutex_t *pmtx;

		if (amp == NULL) {
			ASSERT(seg != NULL);
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
		}
		mutex_enter(pmtx);
		while ((plinkp = pheadp->p_lnext) != pheadp) {
			pcp = plink2pcache(plinkp);
			ASSERT(!IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_htag0 == htag0);
			hp = pcp->p_hashp;
			mutex_enter(&hp->p_hmutex);
			if (pcp->p_active) {
				mutex_exit(&hp->p_hmutex);
				break;
			}
			ASSERT(plinkp->p_lprev == pheadp);
			pheadp->p_lnext = plinkp->p_lnext;
			plinkp->p_lnext->p_lprev = pheadp;
			pcp->p_hprev->p_hnext = pcp->p_hnext;
			pcp->p_hnext->p_hprev = pcp->p_hprev;
			pcp->p_hprev = delcallb_list;
			delcallb_list = pcp;
			if (hp->p_hnext == (struct seg_pcache *)hp) {
				seg_premove_abuck(hp, 0);
			}
			mutex_exit(&hp->p_hmutex);
		}
		mutex_exit(pmtx);
	}
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_write ?
S_WRITE : S_READ, 0); npages += btop(pcp->p_len); kmem_cache_free(seg_pkmcache, pcp); } mutex_enter(&seg_pmem_mtx); ASSERT(seg_plocked >= npages); seg_plocked -= npages; if (!IS_PFLAGS_WIRED(flags)) { ASSERT(seg_plocked_window >= npages); seg_plocked_window -= npages; } mutex_exit(&seg_pmem_mtx); } static void seg_pinit_mem_config(void); /* * setup the pagelock cache */ static void seg_pinit(void) { struct seg_phash *hp; ulong_t i; pgcnt_t physmegs; seg_plocked = 0; seg_plocked_window = 0; if (segpcache_enabled == 0) { seg_phashsize_win = 0; seg_phashsize_wired = 0; seg_pdisabled = 1; return; } seg_pdisabled = 0; seg_pkmcache = kmem_cache_create("seg_pcache", sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0); if (segpcache_pcp_maxage_ticks <= 0) { segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz; } seg_pmax_pcpage = segpcache_pcp_maxage_ticks; seg_pathr_empty_ahb = 0; seg_pathr_full_ahb = 0; seg_pshrink_shift = segpcache_shrink_shift; seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes); mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL); physmegs = physmem >> (20 - PAGESHIFT); /* * If segpcache_hashsize_win was not set in /etc/system or it has * absurd value set it to a default. */ if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) { /* * Create one bucket per 32K (or at least per 8 pages) of * available memory. */ pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8); segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket); } if (!ISP2(segpcache_hashsize_win)) { ulong_t rndfac = ~(1UL << (highbit(segpcache_hashsize_win) - 1)); rndfac &= segpcache_hashsize_win; segpcache_hashsize_win += rndfac; segpcache_hashsize_win = 1 << (highbit(segpcache_hashsize_win) - 1); } seg_phashsize_win = segpcache_hashsize_win; seg_phashtab_win = kmem_zalloc( seg_phashsize_win * sizeof (struct seg_phash), KM_SLEEP); for (i = 0; i < seg_phashsize_win; i++) { hp = &seg_phashtab_win[i]; hp->p_hnext = (struct seg_pcache *)hp; hp->p_hprev = (struct seg_pcache *)hp; mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); } seg_pahcur = 0; seg_pathr_on = 0; seg_pahhead[0].p_lnext = &seg_pahhead[0]; seg_pahhead[0].p_lprev = &seg_pahhead[0]; seg_pahhead[1].p_lnext = &seg_pahhead[1]; seg_pahhead[1].p_lprev = &seg_pahhead[1]; /* * If segpcache_hashsize_wired was not set in /etc/system or it has * absurd value set it to a default. */ if (segpcache_hashsize_wired == 0 || segpcache_hashsize_wired > physmem / 4) { /* * Choose segpcache_hashsize_wired based on physmem. * Create a bucket per 128K bytes upto 256K buckets. 
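		 * (The sizing is rounded to a power of two immediately after
		 * this block so that P_HASHWIRED_MASK can select a bucket
		 * with a simple bitwise AND.)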
*/ if (physmegs < 20 * 1024) { segpcache_hashsize_wired = MAX(1024, physmegs << 3); } else { segpcache_hashsize_wired = 256 * 1024; } } if (!ISP2(segpcache_hashsize_wired)) { segpcache_hashsize_wired = 1 << highbit(segpcache_hashsize_wired); } seg_phashsize_wired = segpcache_hashsize_wired; seg_phashtab_wired = kmem_zalloc( seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP); for (i = 0; i < seg_phashsize_wired; i++) { hp = (struct seg_phash *)&seg_phashtab_wired[i]; hp->p_hnext = (struct seg_pcache *)hp; hp->p_hprev = (struct seg_pcache *)hp; mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); } if (segpcache_maxwindow == 0) { if (physmegs < 64) { /* 3% of memory */ segpcache_maxwindow = availrmem >> 5; } else if (physmegs < 512) { /* 12% of memory */ segpcache_maxwindow = availrmem >> 3; } else if (physmegs < 1024) { /* 25% of memory */ segpcache_maxwindow = availrmem >> 2; } else if (physmegs < 2048) { /* 50% of memory */ segpcache_maxwindow = availrmem >> 1; } else { /* no limit */ segpcache_maxwindow = (pgcnt_t)-1; } } seg_pmaxwindow = segpcache_maxwindow; seg_pinit_mem_config(); } /* * called by pageout if memory is low */ void seg_preap(void) { /* * if the cache is off or empty, return */ if (seg_plocked_window == 0) { return; } ASSERT(seg_phashsize_win != 0); /* * If somebody is already purging pcache * just return. */ if (seg_pdisabled) { return; } cv_signal(&seg_pasync_cv); } /* * run as a backgroud thread and reclaim pagelock * pages which have not been used recently */ void seg_pasync_thread(void) { callb_cpr_t cpr_info; if (seg_phashsize_win == 0) { thread_exit(); /*NOTREACHED*/ } seg_pasync_thr = curthread; CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx, callb_generic_cpr, "seg_pasync"); if (segpcache_reap_ticks <= 0) { segpcache_reap_ticks = segpcache_reap_sec * hz; } mutex_enter(&seg_pasync_mtx); for (;;) { CALLB_CPR_SAFE_BEGIN(&cpr_info); (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx, segpcache_reap_ticks, TR_CLOCK_TICK); CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx); if (seg_pdisabled == 0) { seg_ppurge_async(0); } } } static struct kmem_cache *seg_cache; /* * Initialize segment management data structures. */ void seg_init(void) { kstat_t *ksp; seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), 0, NULL, NULL, NULL, NULL, NULL, 0); ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED, segadvstat_ndata, KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *)segadvstat_ptr; kstat_install(ksp); } seg_pinit(); } /* * Allocate a segment to cover [base, base+size] * and attach it to the specified address space. 
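 * The requested range is rounded out to page boundaries and validated
 * before the segment is created; on success the caller is responsible for
 * filling in s_ops and s_data.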
 */
struct seg *
seg_alloc(struct as *as, caddr_t base, size_t size)
{
	struct seg *new;
	caddr_t segbase;
	size_t segsize;

	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
	    (uintptr_t)segbase;

	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
		return ((struct seg *)NULL);	/* bad virtual addr range */

	if (as != &kas &&
	    valid_usr_range(segbase, segsize, 0, as,
	    as->a_userlimit) != RANGE_OKAY)
		return ((struct seg *)NULL);	/* bad virtual addr range */

	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
	new->s_ops = NULL;
	new->s_data = NULL;
	new->s_szc = 0;
	new->s_flags = 0;
	mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
	new->s_phead.p_lnext = &new->s_phead;
	new->s_phead.p_lprev = &new->s_phead;
	if (seg_attach(as, segbase, segsize, new) < 0) {
		kmem_cache_free(seg_cache, new);
		return ((struct seg *)NULL);
	}
	/* caller must fill in ops, data */
	return (new);
}

/*
 * Attach a segment to the address space.  Used by seg_alloc()
 * and for kernel startup to attach to static segments.
 */
int
seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
{
	seg->s_as = as;
	seg->s_base = base;
	seg->s_size = size;

	/*
	 * as_addseg() will add the segment at the appropriate point
	 * in the list. It will return -1 if there is overlap with
	 * an already existing segment.
	 */
	return (as_addseg(as, seg));
}

/*
 * Unmap a segment and free it from its associated address space.
 * This should be called by anybody who's finished with a whole segment's
 * mapping.  Just calls SEGOP_UNMAP() on the whole mapping.  It is the
 * responsibility of the segment driver to unlink the segment
 * from the address space, and to free public and private data structures
 * associated with the segment.  (This is typically done by a call to
 * seg_free()).
 */
void
seg_unmap(struct seg *seg)
{
#ifdef DEBUG
	int ret;
#endif /* DEBUG */

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/* Shouldn't have called seg_unmap if mapping isn't yet established */
	ASSERT(seg->s_data != NULL);

	/* Unmap the whole mapping */
#ifdef DEBUG
	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
	ASSERT(ret == 0);
#else
	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
#endif /* DEBUG */
}

/*
 * Free the segment from its associated as. This should only be called
 * if a mapping to the segment has not yet been established (e.g., if
 * an error occurs in the middle of doing an as_map when the segment
 * has already been partially set up) or if it has already been deleted
 * (e.g., from a segment driver unmap routine if the unmap applies to the
 * entire segment). If the mapping is currently set up then seg_unmap() should
 * be called instead.
 */
void
seg_free(struct seg *seg)
{
	register struct as *as = seg->s_as;
	struct seg *tseg = as_removeseg(as, seg);

	ASSERT(tseg == seg);

	/*
	 * If the segment private data field is NULL,
	 * then segment driver is not attached yet.
	 */
	if (seg->s_data != NULL)
		SEGOP_FREE(seg);

	mutex_destroy(&seg->s_pmtx);
	ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
	ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
	kmem_cache_free(seg_cache, seg);
}

/*ARGSUSED*/
static void
seg_p_mem_config_post_add(
	void *arg,
	pgcnt_t delta_pages)
{
	/* Nothing to do. */
}

void
seg_p_enable(void)
{
	mutex_enter(&seg_pcache_mtx);
	ASSERT(seg_pdisabled != 0);
	seg_pdisabled--;
	mutex_exit(&seg_pcache_mtx);
}

/*
 * seg_p_disable - disables seg_pcache, and then attempts to empty the
 * cache.
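 * Note that seg_pdisabled is a counter, not a boolean: the increment done
 * here is not undone on failure, so each call should be balanced by a later
 * seg_p_enable().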
* Returns SEGP_SUCCESS if the cache was successfully emptied, or * SEGP_FAIL if the cache could not be emptied. */ int seg_p_disable(void) { pgcnt_t old_plocked; int stall_count = 0; mutex_enter(&seg_pcache_mtx); seg_pdisabled++; ASSERT(seg_pdisabled != 0); mutex_exit(&seg_pcache_mtx); /* * Attempt to empty the cache. Terminate if seg_plocked does not * diminish with SEGP_STALL_THRESHOLD consecutive attempts. */ while (seg_plocked != 0) { ASSERT(seg_phashsize_win != 0); old_plocked = seg_plocked; seg_ppurge_async(1); if (seg_plocked == old_plocked) { if (stall_count++ > SEGP_STALL_THRESHOLD) { return (SEGP_FAIL); } } else stall_count = 0; if (seg_plocked != 0) delay(hz/SEGP_PREDEL_DELAY_FACTOR); } return (SEGP_SUCCESS); } /* * Attempt to purge seg_pcache. May need to return before this has * completed to allow other pre_del callbacks to unlock pages. This is * ok because: * 1) The seg_pdisabled flag has been set so at least we won't * cache anymore locks and the locks we couldn't purge * will not be held if they do get released by a subsequent * pre-delete callback. * * 2) The rest of the memory delete thread processing does not * depend on the changes made in this pre-delete callback. No * panics will result, the worst that will happen is that the * DR code will timeout and cancel the delete. */ /*ARGSUSED*/ static int seg_p_mem_config_pre_del( void *arg, pgcnt_t delta_pages) { if (seg_phashsize_win == 0) { return (0); } if (seg_p_disable() != SEGP_SUCCESS) cmn_err(CE_NOTE, "!Pre-delete couldn't purge"" pagelock cache - continuing"); return (0); } /*ARGSUSED*/ static void seg_p_mem_config_post_del( void *arg, pgcnt_t delta_pages, int cancelled) { if (seg_phashsize_win == 0) { return; } seg_p_enable(); } static kphysm_setup_vector_t seg_p_mem_config_vec = { KPHYSM_SETUP_VECTOR_VERSION, seg_p_mem_config_post_add, seg_p_mem_config_pre_del, seg_p_mem_config_post_del, }; static void seg_pinit_mem_config(void) { int ret; ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL); /* * Want to catch this in the debug kernel. At run time, if the * callbacks don't get run all will be OK as the disable just makes * it more likely that the pages can be collected. */ ASSERT(ret == 0); } /* * Verify that segment is not a shared anonymous segment which reserves * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered * from one zone to another if any segments are shared. This is because the * last process to exit will credit the swap reservation. This could lead * to the swap being reserved by one zone, and credited to another. */ boolean_t seg_can_change_zones(struct seg *seg) { struct segvn_data *svd; if (seg->s_ops == &segspt_shmops) return (B_FALSE); if (seg->s_ops == &segvn_ops) { svd = (struct segvn_data *)seg->s_data; if (svd->type == MAP_SHARED && svd->amp != NULL && svd->amp->swresv > 0) return (B_FALSE); } return (B_TRUE); } /* * Return swap reserved by a segment backing a private mapping. */ size_t seg_swresv(struct seg *seg) { struct segvn_data *svd; size_t swap = 0; if (seg->s_ops == &segvn_ops) { svd = (struct segvn_data *)seg->s_data; if (svd->type == MAP_PRIVATE && svd->swresv > 0) swap = svd->swresv; } return (swap); } /* * General not supported function for SEGOP_INHERIT */ /* ARGSUSED */ int seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op) { return (ENOTSUP); }