/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2015, Joyent, Inc.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - segment management.
 */

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
#include <sys/mman.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>

/*
 * kstats for segment advise
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit", KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss", KSTAT_DATA_ULONG },
};

kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);

/*
 * entry in the segment page cache
 */
struct seg_pcache {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	pcache_link_t		p_plink;	/* per segment/amp list */
	void			*p_htag0;	/* segment/amp pointer */
	caddr_t			p_addr;		/* base address/anon_idx */
	size_t			p_len;		/* total bytes */
	size_t			p_wlen;		/* writable bytes at p_addr */
	struct page		**p_pp;		/* pp shadow list */
	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
	clock_t			p_lbolt;	/* lbolt from last use */
	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
	uint_t			p_active;	/* active count */
	uchar_t			p_write;	/* true if S_WRITE */
	uchar_t			p_ref;		/* reference byte */
	ushort_t		p_flags;	/* bit flags */
};

struct seg_phash {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
	pcache_link_t		p_halink[2];	/* active bucket linkages */
};

struct seg_phash_wired {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
};
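/*
 * A note on the structures above (illustrative, not compiled code): each
 * bucket's p_hnext/p_hprev pair doubles as the head of a circular doubly
 * linked chain, so an empty bucket simply points back at itself. That is
 * why the code below tests for emptiness by comparing a chain pointer
 * against the bucket's own address:
 *
 *	if (hp->p_hnext == (struct seg_pcache *)hp)
 *		... bucket holds no cached shadow lists ...
 */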
/*
 * A parameter to control a maximum number of bytes that can be
 * purged from pcache at a time.
 */
#define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)

/*
 * log2(fraction of pcache to reclaim at a time).
 */
#define	P_SHRINK_SHFT		(5)

/*
 * The following variables can be tuned via /etc/system.
 */

int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
int	segpcache_reap_sec = 1;		/* reap check rate in secs */
clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */

static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
static kcondvar_t seg_pasync_cv;

#pragma align 64(pctrl1)
#pragma align 64(pctrl2)
#pragma align 64(pctrl3)

/*
 * Keep frequently used variables together in one cache line.
 */
static struct p_ctrl1 {
	uint_t p_disabled;		/* if not 0, caching temporarily off */
	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
	size_t p_hashwin_sz;		/* # of non wired buckets */
	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
	size_t p_hashwired_sz;		/* # of wired buckets */
	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
#ifdef _LP64
	ulong_t pad[1];
#endif /* _LP64 */
} pctrl1;

static struct p_ctrl2 {
	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
	pgcnt_t p_locked_win;	/* # pages from window */
	pgcnt_t p_locked;	/* # of pages cached by pagelock */
	uchar_t p_ahcur;	/* current active links for insert/delete */
	uchar_t p_athr_on;	/* async reclaim thread is running */
	pcache_link_t p_ahhead[2]; /* active buckets linkages */
} pctrl2;

static struct p_ctrl3 {
	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
	ulong_t p_athr_full_ahb;	/* athread walk stats */
	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
	int	p_shrink_shft;		/* reap shift factor */
#ifdef _LP64
	ulong_t pad[3];
#endif /* _LP64 */
} pctrl3;

#define	seg_pdisabled			pctrl1.p_disabled
#define	seg_pmaxwindow			pctrl1.p_maxwin
#define	seg_phashsize_win		pctrl1.p_hashwin_sz
#define	seg_phashtab_win		pctrl1.p_htabwin
#define	seg_phashsize_wired		pctrl1.p_hashwired_sz
#define	seg_phashtab_wired		pctrl1.p_htabwired
#define	seg_pkmcache			pctrl1.p_kmcache
#define	seg_pmem_mtx			pctrl2.p_mem_mtx
#define	seg_plocked_window		pctrl2.p_locked_win
#define	seg_plocked			pctrl2.p_locked
#define	seg_pahcur			pctrl2.p_ahcur
#define	seg_pathr_on			pctrl2.p_athr_on
#define	seg_pahhead			pctrl2.p_ahhead
#define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
#define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
#define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
#define	seg_pshrink_shift		pctrl3.p_shrink_shft
#define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages

#define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
#define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
#define	P_BASESHIFT			(6)

kthread_t *seg_pasync_thr;

extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

#define	IS_PFLAGS_WIRED(flags)	((flags) & SEGP_FORCE_WIRED)
#define	IS_PCP_WIRED(pcp)	IS_PFLAGS_WIRED((pcp)->p_flags)

#define	LBOLT_DELTA(t)	((ulong_t)(ddi_get_lbolt() - (t)))

#define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)

/*
 * htag0 argument can be a seg or amp pointer.
 */
#define	P_HASHBP(seg, htag0, addr, flags)				\
	(IS_PFLAGS_WIRED((flags)) ?					\
	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
	    (((uintptr_t)(htag0) >> 3) ^				\
	    ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?		\
	    (flags >> 16) : page_get_shift((seg)->s_szc))))]))
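/*
 * P_HASHBP() above, unrolled as an equivalent (illustrative) function for
 * readability; the macro form is what the code actually uses. Wired
 * entries hash on the seg/amp pointer alone; non wired entries also fold
 * in the shifted base address so that the many entries of a single
 * segment or amp spread across buckets:
 *
 *	static struct seg_phash *
 *	p_hashbp(struct seg *seg, void *htag0, caddr_t addr, uint_t flags)
 *	{
 *		if (IS_PFLAGS_WIRED(flags))
 *			return ((struct seg_phash *)
 *			    &seg_phashtab_wired[P_HASHWIRED_MASK &
 *			    ((uintptr_t)htag0 >> P_BASESHIFT)]);
 *		return (&seg_phashtab_win[P_HASHWIN_MASK &
 *		    (((uintptr_t)htag0 >> 3) ^
 *		    ((uintptr_t)addr >> ((flags & SEGP_PSHIFT) ?
 *		    (flags >> 16) : page_get_shift(seg->s_szc))))]);
 *	}
 */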
/*
 * htag0 argument can be a seg or amp pointer.
 */
#define	P_MATCH(pcp, htag0, addr, len)					\
	((pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

#define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
	((pcp)->p_pp == (pp) &&						\
	(pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

#define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
    offsetof(struct seg_pcache, p_plink)))

#define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) - \
    offsetof(struct seg_phash, p_halink[l])))
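/*
 * plink2pcache() and hlink2phash() are offsetof() based "container of"
 * conversions: given a pointer to a pcache_link_t embedded in a larger
 * structure, they back up by the member's offset to recover the enclosing
 * structure. Illustration (hypothetical list walk, not compiled code):
 *
 *	pcache_link_t *plinkp = pheadp->p_lnext;
 *	struct seg_pcache *pcp = plink2pcache(plinkp);
 *	ASSERT(&pcp->p_plink == plinkp);
 */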
/*
 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
 * active hash bucket lists. We maintain active bucket lists to reduce the
 * overhead of finding active buckets during asynchronous purging since there
 * can be 10s of millions of buckets on a large system but only a small subset
 * of them in actual use.
 *
 * There are 2 active bucket lists. The current active list (as per
 * seg_pahcur) is used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add
 * and delete buckets. The other list is used by the asynchronous purge
 * thread. This allows the purge thread to walk its active list without
 * holding seg_pmem_mtx for a long time. When the asynchronous thread is done
 * with its list it switches to the current active list and makes the list it
 * just finished processing the current active list.
 *
 * seg_padd_abuck() only adds the bucket to the current list if the bucket is
 * not yet on any list. seg_premove_abuck() may remove the bucket from either
 * list. If the bucket is on the current list it will always be removed.
 * Otherwise the bucket is only removed if the asynchronous purge thread is
 * not currently running or seg_premove_abuck() is called by the asynchronous
 * purge thread itself. A given bucket can only be on one of the active lists
 * at a time. These routines use seg_pmem_mtx to protect list updates.
 * seg_padd_abuck() must be called after the first entry is added to the
 * bucket chain and seg_premove_abuck() must be called after the last pcp
 * entry is deleted from its chain. The per bucket lock must be held by the
 * callers. This avoids a potential race condition where seg_premove_abuck()
 * removes a bucket after pcp entries were added to its list after the caller
 * checked that the bucket has no entries (this race would cause a loss of an
 * active bucket from the active lists).
 *
 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
 * New entries are added to the end of the list since LRU is used as the
 * purging policy.
 */
static void
seg_padd_abuck(struct seg_phash *hp)
{
	int lix;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext != hp);
	ASSERT((struct seg_phash *)hp->p_hprev != hp);
	ASSERT(hp->p_hnext == hp->p_hprev);
	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	/*
	 * This bucket can already be on one of active lists
	 * since seg_premove_abuck() may have failed to remove it
	 * before.
	 */
	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);
	if (hp->p_halink[lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If this bucket is still on list !lix the async thread can't yet
	 * remove it since we hold the per bucket lock here. In this case just
	 * return since the async thread will eventually find and process this
	 * bucket.
	 */
	if (hp->p_halink[!lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
	/*
	 * This bucket is not on any active bucket list yet.
	 * Add the bucket to the tail of the current active list.
	 */
	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
	mutex_exit(&seg_pmem_mtx);
}

static void
seg_premove_abuck(struct seg_phash *hp, int athr)
{
	int lix;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext == hp);
	ASSERT((struct seg_phash *)hp->p_hprev == hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	if (athr) {
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur <= 1);
		/*
		 * We are called by the asynchronous thread that found this
		 * bucket on the not currently active (i.e. !seg_pahcur)
		 * list. Remove it from there. The per bucket lock we are
		 * holding makes sure seg_pinsert() can't sneak in and add
		 * pcp entries to this bucket right before we remove the
		 * bucket from its list.
		 */
		lix = !seg_pahcur;
		ASSERT(hp->p_halink[lix].p_lnext != NULL);
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		return;
	}

	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);

	/*
	 * If the bucket is on the currently active list just remove it from
	 * there.
	 */
	if (hp->p_halink[lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If the asynchronous thread is not running we can remove the bucket
	 * from the not currently active list. The bucket must be on this list
	 * since we already checked that it's not on the other list, and the
	 * bucket from which we just deleted the last pcp entry must still be
	 * on one of the active bucket lists.
	 */
	lix = !lix;
	ASSERT(hp->p_halink[lix].p_lnext != NULL);
	ASSERT(hp->p_halink[lix].p_lprev != NULL);

	if (!seg_pathr_on) {
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
	}
	mutex_exit(&seg_pmem_mtx);
}
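/*
 * For reference, the list handoff described above is driven from
 * seg_ppurge_async() below; a condensed (illustrative) view of that logic:
 *
 *	mutex_enter(&seg_pmem_mtx);
 *	seg_pathr_on = 1;
 *	mutex_exit(&seg_pmem_mtx);
 *	hlix = !seg_pahcur;
 *	... walk buckets on seg_pahhead[hlix], calling
 *	    seg_premove_abuck(hp, 1) for buckets that empty out ...
 *	mutex_enter(&seg_pmem_mtx);
 *	seg_pahcur = hlix;	(after the whole list was processed)
 *	mutex_exit(&seg_pmem_mtx);
 *
 * While that walk is in progress, non async callers of seg_premove_abuck()
 * leave buckets on the non current list alone, which is why
 * seg_padd_abuck() tolerates finding a bucket already linked.
 */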
/*
 * Check if the bucket pointed to by hp already has a pcp entry that matches
 * the request htag0, addr and len. Set *found to 1 if a match is found and
 * to 0 otherwise. Also delete matching entries that cover a smaller address
 * range but start at the same address as the addr argument. Return the list
 * of deleted entries if any. This is an internal helper function called from
 * seg_pinsert() only for non wired shadow lists. The caller already holds a
 * per seg/amp list lock.
 */
static struct seg_pcache *
seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
    caddr_t addr, size_t len, int *found)
{
	struct seg_pcache *pcp;
	struct seg_pcache *delcallb_list = NULL;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));

	*found = 0;
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
			ASSERT(!IS_PCP_WIRED(pcp));
			if (pcp->p_len < len) {
				pcache_link_t *plinkp;
				if (pcp->p_active) {
					continue;
				}
				plinkp = &pcp->p_plink;
				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			} else {
				*found = 1;
				break;
			}
		}
	}
	return (delcallb_list);
}

/*
 * Lookup an address range in the pagelock cache. Return the shadow list and
 * bump up the active count. If amp is not NULL use amp as the lookup tag,
 * otherwise use seg as the lookup tag.
 */
struct page **
seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    enum seg_rw rw, uint_t flags)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	void *htag0;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);

	/*
	 * Skip the pagelock cache while DR is in progress or
	 * seg_pcache is off.
	 */
	if (seg_pdisabled) {
		return (NULL);
	}
	ASSERT(seg_phashsize_win != 0);

	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH(pcp, htag0, addr, len)) {
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
			/*
			 * If this request wants to write pages
			 * but write permissions starting from
			 * addr don't cover the entire length len
			 * return lookup failure back to the caller.
			 * It will check protections and fail this
			 * pagelock operation with EACCES error.
			 */
			if (rw == S_WRITE && pcp->p_wlen < len) {
				break;
			}
			if (pcp->p_active == UINT_MAX) {
				break;
			}
			pcp->p_active++;
			if (rw == S_WRITE && !pcp->p_write) {
				pcp->p_write = 1;
			}
			mutex_exit(&hp->p_hmutex);
			return (pcp->p_pp);
		}
	}
	mutex_exit(&hp->p_hmutex);
	return (NULL);
}
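/*
 * A hypothetical caller's fast path, to illustrate the matching rules
 * above (sketch only): a cached shadow list satisfies any lookup with the
 * same start address and an equal or smaller length (P_MATCH uses
 * p_len >= len), but a write request is honored only when p_wlen also
 * covers it.
 *
 *	struct page **pplist;
 *
 *	pplist = seg_plookup(seg, NULL, addr, len, rw, 0);
 *	if (pplist != NULL)
 *		return (0);	(cache hit; p_active was bumped)
 *	... slow path: lock the pages and build a new shadow list ...
 */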
/*
 * Mark an address range inactive. If the cache is off, or the address range
 * is not in the cache, or another shadow list that covers a bigger range is
 * found, we call the segment driver to reclaim the pages. Otherwise just
 * decrement the active count and set the ref bit. If amp is not NULL use amp
 * as the lookup tag, otherwise use seg as the lookup tag.
 */
void
seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	kmutex_t *pmtx = NULL;
	pcache_link_t *pheadp;
	void *htag0;
	pgcnt_t npages = 0;
	int keep = 0;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);

	htag0 = (amp == NULL ? (void *)seg : (void *)amp);

	/*
	 * Skip lookup if pcache is not configured.
	 */
	if (seg_phashsize_win == 0) {
		goto out;
	}

	/*
	 * Grab per seg/amp lock before hash lock if we are going to remove
	 * inactive entry from pcache.
	 */
	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
		}
		mutex_enter(pmtx);
	}

	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
again:
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_active);
			if (keep) {
				/*
				 * Don't remove this pcp entry
				 * if we didn't find duplicate
				 * shadow lists on second search.
				 * Somebody removed those duplicates
				 * since we dropped hash lock after first
				 * search.
				 */
				ASSERT(pmtx != NULL);
				ASSERT(!IS_PFLAGS_WIRED(flags));
				mutex_exit(pmtx);
				pmtx = NULL;
			}
			pcp->p_active--;
			if (pcp->p_active == 0 && (pmtx != NULL ||
			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {

				/*
				 * This entry is no longer active. Remove it
				 * now either because pcaching is temporarily
				 * disabled or there are other pcp entries
				 * that can match this pagelock request (i.e.
				 * this entry is a duplicate).
				 */

				ASSERT(callback == pcp->p_callback);
				if (pmtx != NULL) {
					pcache_link_t *plinkp = &pcp->p_plink;
					ASSERT(!IS_PCP_WIRED(pcp));
					ASSERT(pheadp->p_lnext != pheadp);
					ASSERT(pheadp->p_lprev != pheadp);
					plinkp->p_lprev->p_lnext =
					    plinkp->p_lnext;
					plinkp->p_lnext->p_lprev =
					    plinkp->p_lprev;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				if (!IS_PCP_WIRED(pcp) &&
				    hp->p_hnext == (struct seg_pcache *)hp) {
					/*
					 * We removed the last entry from this
					 * bucket. Now remove the bucket from
					 * its active list.
					 */
					seg_premove_abuck(hp, 0);
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				len = pcp->p_len;
				npages = btop(len);
				if (rw != S_WRITE && pcp->p_write) {
					rw = S_WRITE;
				}
				kmem_cache_free(seg_pkmcache, pcp);
				goto out;
			} else {
				/*
				 * We found a matching pcp entry but will not
				 * free it right away even if it's no longer
				 * active.
				 */
				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
					/*
					 * Set the reference bit and mark the
					 * time of last access to this pcp
					 * so that asynchronous thread doesn't
					 * free it immediately since
					 * it may be reactivated very soon.
					 */
					pcp->p_lbolt = ddi_get_lbolt();
					pcp->p_ref = 1;
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				return;
			}
		} else if (!IS_PFLAGS_WIRED(flags) &&
		    P_MATCH(pcp, htag0, addr, len)) {
			/*
			 * This is a duplicate pcp entry. This situation may
			 * happen if a bigger shadow list that covers our
			 * range was added while our entry was still active.
			 * Now we can free our pcp entry if it becomes
			 * inactive.
			 */
			if (!pcp->p_active) {
				/*
				 * Mark this entry as referenced just in case
				 * we'll free our own pcp entry soon.
				 */
				pcp->p_lbolt = ddi_get_lbolt();
				pcp->p_ref = 1;
			}
			if (pmtx != NULL) {
				/*
				 * We are already holding pmtx and found a
				 * duplicate. Don't keep our own pcp entry.
				 */
				keep = 0;
				continue;
			}
			/*
			 * We have to use mutex_tryenter to attempt to lock
			 * the seg/amp list lock since we already hold the
			 * hash lock and the seg/amp list lock is above the
			 * hash lock in the lock order. If mutex_tryenter
			 * fails drop the hash lock, retake both locks in the
			 * correct order and re-search this hash chain.
			 */
			ASSERT(keep == 0);
			if (amp == NULL) {
				pheadp = &seg->s_phead;
				pmtx = &seg->s_pmtx;
			} else {
				pheadp = &amp->a_phead;
				pmtx = &amp->a_pmtx;
			}
			if (!mutex_tryenter(pmtx)) {
				mutex_exit(&hp->p_hmutex);
				mutex_enter(pmtx);
				mutex_enter(&hp->p_hmutex);
				/*
				 * If we don't find bigger shadow list on
				 * second search (it may happen since we
				 * dropped bucket lock) keep the entry that
				 * matches our own shadow list.
				 */
				keep = 1;
				goto again;
			}
		}
	}
	mutex_exit(&hp->p_hmutex);
	if (pmtx != NULL) {
		mutex_exit(pmtx);
	}
out:
	(*callback)(htag0, addr, len, pp, rw, 0);
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		seg_plocked -= npages;
		if (!IS_PFLAGS_WIRED(flags)) {
			ASSERT(seg_plocked_window >= npages);
			seg_plocked_window -= npages;
		}
		mutex_exit(&seg_pmem_mtx);
	}
}
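/*
 * Taken together, seg_plookup()/seg_pinsert()/seg_pinactive() implement
 * the pagelock protocol. A sketch of a hypothetical non wired caller
 * (segvn_pagelock() is the real example of this pattern):
 *
 *	case L_PAGELOCK:
 *		pplist = seg_plookup(seg, amp, addr, len, rw, 0);
 *		if (pplist != NULL)
 *			return (0);	(cache hit)
 *		... build pplist ...
 *		(void) seg_pinsert(seg, amp, addr, len, wlen, pplist,
 *		    rw, 0, reclaim_callback);
 *		return (0);
 *	case L_PAGEUNLOCK:
 *		seg_pinactive(seg, amp, addr, len, pplist, rw, 0,
 *		    reclaim_callback);
 *		return (0);
 *
 * Every successful seg_plookup() must eventually be paired with a
 * seg_pinactive() call; otherwise p_active never returns to zero and the
 * entry can never be reclaimed.
 */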
#ifdef DEBUG
static uint32_t p_insert_chk_mtbf = 0;
#endif

/*
 * The seg_pinsert_check() is used by segment drivers to predict whether
 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
 */
/*ARGSUSED*/
int
seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, uint_t flags)
{
	ASSERT(seg != NULL);

#ifdef DEBUG
	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
		return (SEGP_FAIL);
	}
#endif

	if (seg_pdisabled) {
		return (SEGP_FAIL);
	}
	ASSERT(seg_phashsize_win != 0);

	if (IS_PFLAGS_WIRED(flags)) {
		return (SEGP_SUCCESS);
	}

	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
		return (SEGP_FAIL);
	}

	if (freemem < desfree) {
		return (SEGP_FAIL);
	}

	return (SEGP_SUCCESS);
}
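/*
 * seg_pinsert_check() is meant as a cheap pre-flight test, e.g.
 * (hypothetical caller):
 *
 *	if (seg_pinsert_check(seg, amp, addr, len, flags) != SEGP_SUCCESS) {
 *		... skip building a shadow list that seg_pinsert()
 *		    would be likely to refuse anyway ...
 *	}
 *
 * The answer is only advisory: freemem and the pcache window can change
 * before seg_pinsert() runs, so the insert itself may still fail.
 */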
#ifdef DEBUG
static uint32_t p_insert_mtbf = 0;
#endif

/*
 * Insert address range with shadow list into pagelock cache if there's no
 * shadow list already cached for this address range. If the cache is off or
 * caching is temporarily disabled or the allowed 'window' is exceeded return
 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
 *
 * For non wired shadow lists (segvn case) include address in the hashing
 * function to avoid linking all the entries from the same segment or amp on
 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
 * pcache entries are also linked on a per segment/amp list so that all
 * entries can be found quickly during seg/amp purge without walking the
 * entire pcache hash table. For wired shadow lists (segspt case) we
 * don't use address hashing and per segment linking because the caller
 * currently inserts only one entry per segment that covers the entire
 * segment. If we used per segment linking even for segspt it would complicate
 * seg_ppurge_wiredpp() locking.
 *
 * Both hash bucket and per seg/amp locks need to be held before adding a non
 * wired entry to hash and per seg/amp lists. The per seg/amp lock should be
 * taken first.
 *
 * This function will also remove from pcache old inactive shadow lists that
 * overlap with this request but cover a smaller range for the same start
 * address.
 */
int
seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages;
	pcache_link_t *pheadp;
	kmutex_t *pmtx;
	struct seg_pcache *delcallb_list = NULL;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);
	ASSERT(rw == S_READ || wlen == len);
	ASSERT(rw == S_WRITE || wlen <= len);
	ASSERT(amp == NULL || wlen == len);

#ifdef DEBUG
	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
		return (SEGP_FAIL);
	}
#endif

	if (seg_pdisabled) {
		return (SEGP_FAIL);
	}
	ASSERT(seg_phashsize_win != 0);

	ASSERT((len & PAGEOFFSET) == 0);
	npages = btop(len);
	mutex_enter(&seg_pmem_mtx);
	if (!IS_PFLAGS_WIRED(flags)) {
		if (seg_plocked_window + npages > seg_pmaxwindow) {
			mutex_exit(&seg_pmem_mtx);
			return (SEGP_FAIL);
		}
		seg_plocked_window += npages;
	}
	seg_plocked += npages;
	mutex_exit(&seg_pmem_mtx);

	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
	/*
	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
	 */
	if (amp == NULL) {
		pcp->p_htag0 = (void *)seg;
		pcp->p_flags = flags & 0xffff;
	} else {
		pcp->p_htag0 = (void *)amp;
		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
	}
	pcp->p_addr = addr;
	pcp->p_len = len;
	pcp->p_wlen = wlen;
	pcp->p_pp = pp;
	pcp->p_write = (rw == S_WRITE);
	pcp->p_callback = callback;
	pcp->p_active = 1;

	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
	if (!IS_PFLAGS_WIRED(flags)) {
		int found;
		void *htag0;
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
			htag0 = (void *)seg;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
			htag0 = (void *)amp;
		}
		mutex_enter(pmtx);
		mutex_enter(&hp->p_hmutex);
		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
		    len, &found);
		if (found) {
			mutex_exit(&hp->p_hmutex);
			mutex_exit(pmtx);
			mutex_enter(&seg_pmem_mtx);
			seg_plocked -= npages;
			seg_plocked_window -= npages;
			mutex_exit(&seg_pmem_mtx);
			kmem_cache_free(seg_pkmcache, pcp);
			goto out;
		}
		pcp->p_plink.p_lnext = pheadp->p_lnext;
		pcp->p_plink.p_lprev = pheadp;
		pheadp->p_lnext->p_lprev = &pcp->p_plink;
		pheadp->p_lnext = &pcp->p_plink;
	} else {
		mutex_enter(&hp->p_hmutex);
	}
	pcp->p_hashp = hp;
	pcp->p_hnext = hp->p_hnext;
	pcp->p_hprev = (struct seg_pcache *)hp;
	hp->p_hnext->p_hprev = pcp;
	hp->p_hnext = pcp;
	if (!IS_PFLAGS_WIRED(flags) &&
	    hp->p_hprev == pcp) {
		seg_padd_abuck(hp);
	}
	mutex_exit(&hp->p_hmutex);
	if (!IS_PFLAGS_WIRED(flags)) {
		mutex_exit(pmtx);
	}

out:
	npages = 0;
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
		npages += btop(pcp->p_len);
		kmem_cache_free(seg_pkmcache, pcp);
	}
	if (npages) {
		ASSERT(!IS_PFLAGS_WIRED(flags));
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		ASSERT(seg_plocked_window >= npages);
		seg_plocked -= npages;
		seg_plocked_window -= npages;
		mutex_exit(&seg_pmem_mtx);
	}

	return (SEGP_SUCCESS);
}

/*
 * Purge entries from the pagelock cache if not active
 * and not recently used.
 */
static void
seg_ppurge_async(int force)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;
	pgcnt_t	npgs_to_purge;
	pgcnt_t npgs_purged = 0;
	int hlinks = 0;
	int hlix;
	pcache_link_t *hlinkp;
	pcache_link_t *hlnextp = NULL;
	int lowmem;
	int trim;

	ASSERT(seg_phashsize_win != 0);

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
		return;
	}

	if (!force) {
		lowmem = 0;
		trim = 0;
		if (freemem < lotsfree + needfree) {
			spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
			if (fmem <= 5 * (desfree >> 2)) {
				lowmem = 1;
			} else if (fmem <= 7 * (lotsfree >> 3)) {
				if (seg_plocked_window >=
				    (availrmem_initial >> 1)) {
					lowmem = 1;
				}
			} else if (fmem < lotsfree) {
				if (seg_plocked_window >=
				    3 * (availrmem_initial >> 2)) {
					lowmem = 1;
				}
			}
		}
		if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
			trim = 1;
		}
		if (!lowmem && !trim) {
			return;
		}
		npgs_to_purge = seg_plocked_window >>
		    seg_pshrink_shift;
		if (lowmem) {
			npgs_to_purge = MIN(npgs_to_purge,
			    MAX(seg_pmaxapurge_npages, desfree));
		} else {
			npgs_to_purge = MIN(npgs_to_purge,
			    seg_pmaxapurge_npages);
		}
		if (npgs_to_purge == 0) {
			return;
		}
	} else {
		struct seg_phash_wired *hpw;

		ASSERT(seg_phashsize_wired != 0);

		for (hpw = seg_phashtab_wired;
		    hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {

			if (hpw->p_hnext == (struct seg_pcache *)hpw) {
				continue;
			}

			mutex_enter(&hpw->p_hmutex);

			for (pcp = hpw->p_hnext;
			    pcp != (struct seg_pcache *)hpw;
			    pcp = pcp->p_hnext) {

				ASSERT(IS_PCP_WIRED(pcp));
				ASSERT(pcp->p_hashp ==
				    (struct seg_phash *)hpw);

				if (pcp->p_active) {
					continue;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			mutex_exit(&hpw->p_hmutex);
		}
	}

	mutex_enter(&seg_pmem_mtx);
	if (seg_pathr_on) {
		mutex_exit(&seg_pmem_mtx);
		goto runcb;
	}
	seg_pathr_on = 1;
	mutex_exit(&seg_pmem_mtx);
	ASSERT(seg_pahcur <= 1);
	hlix = !seg_pahcur;

again:
	for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
	    hlinkp = hlnextp) {

		hlnextp = hlinkp->p_lnext;
		ASSERT(hlnextp != NULL);

		hp = hlink2phash(hlinkp, hlix);
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_pathr_empty_ahb++;
			continue;
		}
		seg_pathr_full_ahb++;
		mutex_enter(&hp->p_hmutex);

		for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
		    pcp = pcp->p_hnext) {
			pcache_link_t *pheadp;
			pcache_link_t *plinkp;
			void *htag0;
			kmutex_t *pmtx;

			ASSERT(!IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_hashp == hp);

			if (pcp->p_active) {
				continue;
			}
			if (!force && pcp->p_ref &&
			    PCP_AGE(pcp) < seg_pmax_pcpage) {
				pcp->p_ref = 0;
				continue;
			}
			plinkp = &pcp->p_plink;
			htag0 = pcp->p_htag0;
			if (pcp->p_flags & SEGP_AMP) {
				pheadp = &((amp_t *)htag0)->a_phead;
				pmtx = &((amp_t *)htag0)->a_pmtx;
			} else {
				pheadp = &((seg_t *)htag0)->s_phead;
				pmtx = &((seg_t *)htag0)->s_pmtx;
			}
			if (!mutex_tryenter(pmtx)) {
				continue;
			}
			ASSERT(pheadp->p_lnext != pheadp);
			ASSERT(pheadp->p_lprev != pheadp);
			plinkp->p_lprev->p_lnext =
			    plinkp->p_lnext;
			plinkp->p_lnext->p_lprev =
			    plinkp->p_lprev;
			pcp->p_hprev->p_hnext = pcp->p_hnext;
			pcp->p_hnext->p_hprev = pcp->p_hprev;
			mutex_exit(pmtx);
			pcp->p_hprev = delcallb_list;
			delcallb_list = pcp;
			npgs_purged += btop(pcp->p_len);
		}
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_premove_abuck(hp, 1);
		}
		mutex_exit(&hp->p_hmutex);
		if (npgs_purged >= seg_plocked_window) {
			break;
		}
		if (!force) {
			if (npgs_purged >= npgs_to_purge) {
				break;
			}
			if (!trim && !(seg_pathr_full_ahb & 15)) {
				ASSERT(lowmem);
				if (freemem >= lotsfree + needfree) {
					break;
				}
			}
		}
	}

	if (hlinkp == &seg_pahhead[hlix]) {
		/*
		 * We processed the entire hlix active bucket list
		 * but didn't find enough pages to reclaim.
		 * Switch the lists and walk the other list
		 * if we haven't done it yet.
		 */
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur == !hlix);
		seg_pahcur = hlix;
		mutex_exit(&seg_pmem_mtx);
		if (++hlinks < 2) {
			hlix = !hlix;
			goto again;
		}
	} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
	    seg_pahhead[hlix].p_lnext != hlinkp) {
		ASSERT(hlinkp != NULL);
		ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);

		/*
		 * Reinsert the header to point to hlinkp
		 * so that we start from hlinkp bucket next time around.
		 */
		seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
		seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
		seg_pahhead[hlix].p_lnext = hlinkp;
		seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
		hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
		hlinkp->p_lprev = &seg_pahhead[hlix];
	}

	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_pathr_on);
	seg_pathr_on = 0;
	mutex_exit(&seg_pmem_mtx);

runcb:
	/*
	 * Run the delayed callback list. segments/amps can't go away until
	 * callback is executed since they must have non 0 softlockcnt. That's
	 * why we don't need to hold as/seg/amp locks to execute the callback.
	 */
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
		npages += btop(pcp->p_len);
		if (!IS_PCP_WIRED(pcp)) {
			npages_window += btop(pcp->p_len);
		}
		kmem_cache_free(seg_pkmcache, pcp);
	}
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		ASSERT(seg_plocked_window >= npages_window);
		seg_plocked -= npages;
		seg_plocked_window -= npages_window;
		mutex_exit(&seg_pmem_mtx);
	}
}
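/*
 * Worked example of the purge sizing above (numbers illustrative only):
 * with the default seg_pshrink_shift of 5 each pass targets 1/32 of
 * seg_plocked_window, so a window of 1M cached pages yields a target of
 * 32K pages. The target is then capped at seg_pmaxapurge_npages (derived
 * from segpcache_maxapurge_bytes, 1GB by default), except that under low
 * memory the cap is raised to at least desfree so that reclaim can make
 * forward progress.
 */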
1193a98e9dbfSaguzovsk  */
1194a98e9dbfSaguzovsk void
1195a98e9dbfSaguzovsk seg_ppurge_wiredpp(struct page **pp)
1196a98e9dbfSaguzovsk {
1197a98e9dbfSaguzovsk 	struct seg_pcache *pcp;
1198a98e9dbfSaguzovsk 	struct seg_phash_wired *hp;
1199a98e9dbfSaguzovsk 	pgcnt_t npages = 0;
1200a98e9dbfSaguzovsk 	struct seg_pcache *delcallb_list = NULL;
1201a98e9dbfSaguzovsk
1202a98e9dbfSaguzovsk 	/*
1203a98e9dbfSaguzovsk 	 * if the cache is empty, return
1204a98e9dbfSaguzovsk 	 */
1205a98e9dbfSaguzovsk 	if (seg_plocked == 0) {
1206a98e9dbfSaguzovsk 		return;
1207a98e9dbfSaguzovsk 	}
1208a98e9dbfSaguzovsk 	ASSERT(seg_phashsize_wired != 0);
1209a98e9dbfSaguzovsk
1210a98e9dbfSaguzovsk 	for (hp = seg_phashtab_wired;
1211a98e9dbfSaguzovsk 	    hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1212a98e9dbfSaguzovsk 		if (hp->p_hnext == (struct seg_pcache *)hp) {
1213a98e9dbfSaguzovsk 			continue;
1214a98e9dbfSaguzovsk 		}
1215a98e9dbfSaguzovsk 		mutex_enter(&hp->p_hmutex);
1216a98e9dbfSaguzovsk 		pcp = hp->p_hnext;
1217a98e9dbfSaguzovsk 		while (pcp != (struct seg_pcache *)hp) {
1218a98e9dbfSaguzovsk 			ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1219a98e9dbfSaguzovsk 			ASSERT(IS_PCP_WIRED(pcp));
1220a98e9dbfSaguzovsk 			/*
1221a98e9dbfSaguzovsk 			 * purge entries that are not active
1222a98e9dbfSaguzovsk 			 */
1223a98e9dbfSaguzovsk 			if (!pcp->p_active && pcp->p_pp == pp) {
1224a98e9dbfSaguzovsk 				ASSERT(pcp->p_htag0 != NULL);
1225a98e9dbfSaguzovsk 				pcp->p_hprev->p_hnext = pcp->p_hnext;
1226a98e9dbfSaguzovsk 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1227a98e9dbfSaguzovsk 				pcp->p_hprev = delcallb_list;
1228a98e9dbfSaguzovsk 				delcallb_list = pcp;
12297c478bd9Sstevel@tonic-gate 			}
12307c478bd9Sstevel@tonic-gate 			pcp = pcp->p_hnext;
12317c478bd9Sstevel@tonic-gate 		}
12327c478bd9Sstevel@tonic-gate 		mutex_exit(&hp->p_hmutex);
12337c478bd9Sstevel@tonic-gate 		/*
1234a98e9dbfSaguzovsk 		 * segments can't go away until the callback is executed since
1235a98e9dbfSaguzovsk 		 * they must have a nonzero softlockcnt. That's why we don't
1236a98e9dbfSaguzovsk 		 * need to hold as/seg locks to execute the callback.
12377c478bd9Sstevel@tonic-gate 		 */
12387c478bd9Sstevel@tonic-gate 		while (delcallb_list != NULL) {
1239a98e9dbfSaguzovsk 			int done;
12407c478bd9Sstevel@tonic-gate 			pcp = delcallb_list;
12417c478bd9Sstevel@tonic-gate 			delcallb_list = pcp->p_hprev;
1242a98e9dbfSaguzovsk 			ASSERT(!pcp->p_active);
1243a98e9dbfSaguzovsk 			done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1244a98e9dbfSaguzovsk 			    pcp->p_len, pcp->p_pp,
1245a98e9dbfSaguzovsk 			    pcp->p_write ? S_WRITE : S_READ, 1);
1246a98e9dbfSaguzovsk 			npages += btop(pcp->p_len);
1247a98e9dbfSaguzovsk 			ASSERT(IS_PCP_WIRED(pcp));
1248a98e9dbfSaguzovsk 			kmem_cache_free(seg_pkmcache, pcp);
1249a98e9dbfSaguzovsk 			if (done) {
1250a98e9dbfSaguzovsk 				ASSERT(delcallb_list == NULL);
1251a98e9dbfSaguzovsk 				goto out;
12527c478bd9Sstevel@tonic-gate 			}
12537c478bd9Sstevel@tonic-gate 		}
1254a98e9dbfSaguzovsk 	}
1255a98e9dbfSaguzovsk
1256a98e9dbfSaguzovsk out:
1257a98e9dbfSaguzovsk 	mutex_enter(&seg_pmem_mtx);
1258a98e9dbfSaguzovsk 	ASSERT(seg_plocked >= npages);
12597c478bd9Sstevel@tonic-gate 	seg_plocked -= npages;
1260a98e9dbfSaguzovsk 	mutex_exit(&seg_pmem_mtx);
12617c478bd9Sstevel@tonic-gate }
12627c478bd9Sstevel@tonic-gate
12637c478bd9Sstevel@tonic-gate /*
12647c478bd9Sstevel@tonic-gate  * Purge all entries for a given segment. Since we call back into
12657c478bd9Sstevel@tonic-gate  * the segment driver directly for page reclaim, the caller needs
12667c478bd9Sstevel@tonic-gate  * to hold the right locks.
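 * Wired entries are looked up through the hash table; non-wired entries
 * through the per-segment/amp pcache list.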
12677c478bd9Sstevel@tonic-gate  */
12687c478bd9Sstevel@tonic-gate void
1269a98e9dbfSaguzovsk seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
12707c478bd9Sstevel@tonic-gate {
12717c478bd9Sstevel@tonic-gate 	struct seg_pcache *delcallb_list = NULL;
12727c478bd9Sstevel@tonic-gate 	struct seg_pcache *pcp;
12737c478bd9Sstevel@tonic-gate 	struct seg_phash *hp;
12747c478bd9Sstevel@tonic-gate 	pgcnt_t npages = 0;
1275a98e9dbfSaguzovsk 	void *htag0;
12767c478bd9Sstevel@tonic-gate
1277a98e9dbfSaguzovsk 	if (seg_plocked == 0) {
12787c478bd9Sstevel@tonic-gate 		return;
12797c478bd9Sstevel@tonic-gate 	}
1280a98e9dbfSaguzovsk 	ASSERT(seg_phashsize_win != 0);
1281a98e9dbfSaguzovsk
1282a98e9dbfSaguzovsk 	/*
1283a98e9dbfSaguzovsk 	 * If amp is not NULL, use amp as the lookup tag;
1284a98e9dbfSaguzovsk 	 * otherwise use seg as the lookup tag.
1285a98e9dbfSaguzovsk 	 */
1286a98e9dbfSaguzovsk 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1287a98e9dbfSaguzovsk 	ASSERT(htag0 != NULL);
1288a98e9dbfSaguzovsk 	if (IS_PFLAGS_WIRED(flags)) {
1289a98e9dbfSaguzovsk 		hp = P_HASHBP(seg, htag0, 0, flags);
12907c478bd9Sstevel@tonic-gate 		mutex_enter(&hp->p_hmutex);
12917c478bd9Sstevel@tonic-gate 		pcp = hp->p_hnext;
12927c478bd9Sstevel@tonic-gate 		while (pcp != (struct seg_pcache *)hp) {
1293a98e9dbfSaguzovsk 			ASSERT(pcp->p_hashp == hp);
1294a98e9dbfSaguzovsk 			ASSERT(IS_PCP_WIRED(pcp));
1295a98e9dbfSaguzovsk 			if (pcp->p_htag0 == htag0) {
12967c478bd9Sstevel@tonic-gate 				if (pcp->p_active) {
12977c478bd9Sstevel@tonic-gate 					break;
12987c478bd9Sstevel@tonic-gate 				}
12997c478bd9Sstevel@tonic-gate 				pcp->p_hprev->p_hnext = pcp->p_hnext;
13007c478bd9Sstevel@tonic-gate 				pcp->p_hnext->p_hprev = pcp->p_hprev;
13017c478bd9Sstevel@tonic-gate 				pcp->p_hprev = delcallb_list;
13027c478bd9Sstevel@tonic-gate 				delcallb_list = pcp;
13037c478bd9Sstevel@tonic-gate 			}
13047c478bd9Sstevel@tonic-gate 			pcp = pcp->p_hnext;
13057c478bd9Sstevel@tonic-gate 		}
13067c478bd9Sstevel@tonic-gate 		mutex_exit(&hp->p_hmutex);
1307a98e9dbfSaguzovsk 	} else {
1308a98e9dbfSaguzovsk 		pcache_link_t *plinkp;
1309a98e9dbfSaguzovsk 		pcache_link_t *pheadp;
1310a98e9dbfSaguzovsk 		kmutex_t *pmtx;
1311a98e9dbfSaguzovsk
1312a98e9dbfSaguzovsk 		if (amp == NULL) {
1313a98e9dbfSaguzovsk 			ASSERT(seg != NULL);
1314a98e9dbfSaguzovsk 			pheadp = &seg->s_phead;
1315a98e9dbfSaguzovsk 			pmtx = &seg->s_pmtx;
1316a98e9dbfSaguzovsk 		} else {
1317a98e9dbfSaguzovsk 			pheadp = &amp->a_phead;
1318a98e9dbfSaguzovsk 			pmtx = &amp->a_pmtx;
1319a98e9dbfSaguzovsk 		}
1320a98e9dbfSaguzovsk 		mutex_enter(pmtx);
1321a98e9dbfSaguzovsk 		while ((plinkp = pheadp->p_lnext) != pheadp) {
1322a98e9dbfSaguzovsk 			pcp = plink2pcache(plinkp);
1323a98e9dbfSaguzovsk 			ASSERT(!IS_PCP_WIRED(pcp));
1324a98e9dbfSaguzovsk 			ASSERT(pcp->p_htag0 == htag0);
1325a98e9dbfSaguzovsk 			hp = pcp->p_hashp;
1326a98e9dbfSaguzovsk 			mutex_enter(&hp->p_hmutex);
1327a98e9dbfSaguzovsk 			if (pcp->p_active) {
1328a98e9dbfSaguzovsk 				mutex_exit(&hp->p_hmutex);
1329a98e9dbfSaguzovsk 				break;
1330a98e9dbfSaguzovsk 			}
1331a98e9dbfSaguzovsk 			ASSERT(plinkp->p_lprev == pheadp);
1332a98e9dbfSaguzovsk 			pheadp->p_lnext = plinkp->p_lnext;
1333a98e9dbfSaguzovsk 			plinkp->p_lnext->p_lprev = pheadp;
1334a98e9dbfSaguzovsk 			pcp->p_hprev->p_hnext = pcp->p_hnext;
1335a98e9dbfSaguzovsk 			pcp->p_hnext->p_hprev = pcp->p_hprev;
1336a98e9dbfSaguzovsk 			pcp->p_hprev = delcallb_list;
1337a98e9dbfSaguzovsk 			delcallb_list = pcp;
1338a98e9dbfSaguzovsk 			if (hp->p_hnext == (struct seg_pcache *)hp) {
1339a98e9dbfSaguzovsk 				seg_premove_abuck(hp, 0);
1340a98e9dbfSaguzovsk 			}
1341a98e9dbfSaguzovsk 			mutex_exit(&hp->p_hmutex);
1342a98e9dbfSaguzovsk 		}
1343a98e9dbfSaguzovsk 		mutex_exit(pmtx);
1344a98e9dbfSaguzovsk 	}
13457c478bd9Sstevel@tonic-gate 	while (delcallb_list != NULL) {
13467c478bd9Sstevel@tonic-gate 		pcp = delcallb_list;
13477c478bd9Sstevel@tonic-gate 		delcallb_list = pcp->p_hprev;
1348a98e9dbfSaguzovsk 		ASSERT(!pcp->p_active);
1349a98e9dbfSaguzovsk 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1350a98e9dbfSaguzovsk 		    pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1351a98e9dbfSaguzovsk 		npages += btop(pcp->p_len);
1352a98e9dbfSaguzovsk 		kmem_cache_free(seg_pkmcache, pcp);
13537c478bd9Sstevel@tonic-gate 	}
1354a98e9dbfSaguzovsk 	mutex_enter(&seg_pmem_mtx);
1355a98e9dbfSaguzovsk 	ASSERT(seg_plocked >= npages);
13567c478bd9Sstevel@tonic-gate 	seg_plocked -= npages;
1357a98e9dbfSaguzovsk 	if (!IS_PFLAGS_WIRED(flags)) {
1358a98e9dbfSaguzovsk 		ASSERT(seg_plocked_window >= npages);
1359a98e9dbfSaguzovsk 		seg_plocked_window -= npages;
1360a98e9dbfSaguzovsk 	}
1361a98e9dbfSaguzovsk 	mutex_exit(&seg_pmem_mtx);
13627c478bd9Sstevel@tonic-gate }
13637c478bd9Sstevel@tonic-gate
13647c478bd9Sstevel@tonic-gate static void seg_pinit_mem_config(void);
13657c478bd9Sstevel@tonic-gate
13667c478bd9Sstevel@tonic-gate /*
13677c478bd9Sstevel@tonic-gate  * set up the pagelock cache
13687c478bd9Sstevel@tonic-gate  */
13697c478bd9Sstevel@tonic-gate static void
13707c478bd9Sstevel@tonic-gate seg_pinit(void)
13717c478bd9Sstevel@tonic-gate {
13727c478bd9Sstevel@tonic-gate 	struct seg_phash *hp;
1373a98e9dbfSaguzovsk 	ulong_t i;
1374a98e9dbfSaguzovsk 	pgcnt_t physmegs;
13757c478bd9Sstevel@tonic-gate
1376a98e9dbfSaguzovsk 	seg_plocked = 0;
1377a98e9dbfSaguzovsk 	seg_plocked_window = 0;
13787c478bd9Sstevel@tonic-gate
1379a98e9dbfSaguzovsk 	if (segpcache_enabled == 0) {
1380a98e9dbfSaguzovsk 		seg_phashsize_win = 0;
1381a98e9dbfSaguzovsk 		seg_phashsize_wired = 0;
1382a98e9dbfSaguzovsk 		seg_pdisabled = 1;
1383a98e9dbfSaguzovsk 		return;
1384a98e9dbfSaguzovsk 	}
1385a98e9dbfSaguzovsk
1386a98e9dbfSaguzovsk 	seg_pdisabled = 0;
1387a98e9dbfSaguzovsk 	seg_pkmcache = kmem_cache_create("seg_pcache",
1388a98e9dbfSaguzovsk 	    sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1389a98e9dbfSaguzovsk 	if (segpcache_pcp_maxage_ticks <= 0) {
1390a98e9dbfSaguzovsk 		segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1391a98e9dbfSaguzovsk 	}
1392a98e9dbfSaguzovsk 	seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1393a98e9dbfSaguzovsk 	seg_pathr_empty_ahb = 0;
1394a98e9dbfSaguzovsk 	seg_pathr_full_ahb = 0;
1395a98e9dbfSaguzovsk 	seg_pshrink_shift = segpcache_shrink_shift;
1396a98e9dbfSaguzovsk 	seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1397a98e9dbfSaguzovsk
1398a98e9dbfSaguzovsk 	mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1399a98e9dbfSaguzovsk 	mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1400a98e9dbfSaguzovsk 	mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1401a98e9dbfSaguzovsk 	cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1402a98e9dbfSaguzovsk
14037c478bd9Sstevel@tonic-gate 	physmegs = physmem >> (20 - PAGESHIFT);
14047c478bd9Sstevel@tonic-gate
14057c478bd9Sstevel@tonic-gate 	/*
1406a98e9dbfSaguzovsk 	 * If segpcache_hashsize_win was not set in /etc/system, or was set
1407a98e9dbfSaguzovsk 	 * to an absurd value, use a default.
14087c478bd9Sstevel@tonic-gate 	 */
1409a98e9dbfSaguzovsk 	if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1410a98e9dbfSaguzovsk 		/*
1411a98e9dbfSaguzovsk 		 * Create one bucket per 32K (or at least per 8 pages) of
1412a98e9dbfSaguzovsk 		 * available memory.
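		 * With 4K pages that works out to one bucket per 8 pages;
		 * e.g. a 4GB system (1M pages) gets 128K hash buckets.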
1413a98e9dbfSaguzovsk 		 */
1414a98e9dbfSaguzovsk 		pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1415a98e9dbfSaguzovsk 		segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1416a98e9dbfSaguzovsk 	}
1417a98e9dbfSaguzovsk 	if (!ISP2(segpcache_hashsize_win)) {
1418a98e9dbfSaguzovsk 		ulong_t rndfac = ~(1UL <<
1419a98e9dbfSaguzovsk 		    (highbit(segpcache_hashsize_win) - 1));
1420a98e9dbfSaguzovsk 		rndfac &= segpcache_hashsize_win;
1421a98e9dbfSaguzovsk 		segpcache_hashsize_win += rndfac;
1422a98e9dbfSaguzovsk 		segpcache_hashsize_win = 1 <<
1423a98e9dbfSaguzovsk 		    (highbit(segpcache_hashsize_win) - 1);
1424a98e9dbfSaguzovsk 	}
1425a98e9dbfSaguzovsk 	seg_phashsize_win = segpcache_hashsize_win;
1426a98e9dbfSaguzovsk 	seg_phashtab_win = kmem_zalloc(
1427a98e9dbfSaguzovsk 	    seg_phashsize_win * sizeof (struct seg_phash),
1428c6f08383Sjj204856 	    KM_SLEEP);
1429a98e9dbfSaguzovsk 	for (i = 0; i < seg_phashsize_win; i++) {
1430a98e9dbfSaguzovsk 		hp = &seg_phashtab_win[i];
14317c478bd9Sstevel@tonic-gate 		hp->p_hnext = (struct seg_pcache *)hp;
14327c478bd9Sstevel@tonic-gate 		hp->p_hprev = (struct seg_pcache *)hp;
14337c478bd9Sstevel@tonic-gate 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
14347c478bd9Sstevel@tonic-gate 	}
14357c478bd9Sstevel@tonic-gate
1436a98e9dbfSaguzovsk 	seg_pahcur = 0;
1437a98e9dbfSaguzovsk 	seg_pathr_on = 0;
1438a98e9dbfSaguzovsk 	seg_pahhead[0].p_lnext = &seg_pahhead[0];
1439a98e9dbfSaguzovsk 	seg_pahhead[0].p_lprev = &seg_pahhead[0];
1440a98e9dbfSaguzovsk 	seg_pahhead[1].p_lnext = &seg_pahhead[1];
1441a98e9dbfSaguzovsk 	seg_pahhead[1].p_lprev = &seg_pahhead[1];
1442a98e9dbfSaguzovsk
1443a98e9dbfSaguzovsk 	/*
1444a98e9dbfSaguzovsk 	 * If segpcache_hashsize_wired was not set in /etc/system, or was
1445a98e9dbfSaguzovsk 	 * set to an absurd value, use a default.
1446a98e9dbfSaguzovsk 	 */
1447a98e9dbfSaguzovsk 	if (segpcache_hashsize_wired == 0 ||
1448a98e9dbfSaguzovsk 	    segpcache_hashsize_wired > physmem / 4) {
1449a98e9dbfSaguzovsk 		/*
1450a98e9dbfSaguzovsk 		 * Choose segpcache_hashsize_wired based on physmem:
1451a98e9dbfSaguzovsk 		 * create a bucket per 128K bytes, up to 256K buckets.
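		 * E.g. a 4GB system (4096 megs) gets 4096 << 3 = 32K
		 * buckets; the flat 256K value applies once physmem
		 * reaches 20GB.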
1452a98e9dbfSaguzovsk 		 */
1453a98e9dbfSaguzovsk 		if (physmegs < 20 * 1024) {
1454a98e9dbfSaguzovsk 			segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1455a98e9dbfSaguzovsk 		} else {
1456a98e9dbfSaguzovsk 			segpcache_hashsize_wired = 256 * 1024;
1457a98e9dbfSaguzovsk 		}
1458a98e9dbfSaguzovsk 	}
1459a98e9dbfSaguzovsk 	if (!ISP2(segpcache_hashsize_wired)) {
1460a98e9dbfSaguzovsk 		segpcache_hashsize_wired = 1 <<
1461a98e9dbfSaguzovsk 		    highbit(segpcache_hashsize_wired);
1462a98e9dbfSaguzovsk 	}
1463a98e9dbfSaguzovsk 	seg_phashsize_wired = segpcache_hashsize_wired;
1464a98e9dbfSaguzovsk 	seg_phashtab_wired = kmem_zalloc(
1465a98e9dbfSaguzovsk 	    seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1466a98e9dbfSaguzovsk 	for (i = 0; i < seg_phashsize_wired; i++) {
1467a98e9dbfSaguzovsk 		hp = (struct seg_phash *)&seg_phashtab_wired[i];
1468a98e9dbfSaguzovsk 		hp->p_hnext = (struct seg_pcache *)hp;
1469a98e9dbfSaguzovsk 		hp->p_hprev = (struct seg_pcache *)hp;
1470a98e9dbfSaguzovsk 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1471a98e9dbfSaguzovsk 	}
1472a98e9dbfSaguzovsk
1473a98e9dbfSaguzovsk 	if (segpcache_maxwindow == 0) {
1474a98e9dbfSaguzovsk 		if (physmegs < 64) {
1475a98e9dbfSaguzovsk 			/* 3% of memory */
1476a98e9dbfSaguzovsk 			segpcache_maxwindow = availrmem >> 5;
1477a98e9dbfSaguzovsk 		} else if (physmegs < 512) {
1478a98e9dbfSaguzovsk 			/* 12% of memory */
1479a98e9dbfSaguzovsk 			segpcache_maxwindow = availrmem >> 3;
1480a98e9dbfSaguzovsk 		} else if (physmegs < 1024) {
1481a98e9dbfSaguzovsk 			/* 25% of memory */
1482a98e9dbfSaguzovsk 			segpcache_maxwindow = availrmem >> 2;
1483a98e9dbfSaguzovsk 		} else if (physmegs < 2048) {
1484a98e9dbfSaguzovsk 			/* 50% of memory */
1485a98e9dbfSaguzovsk 			segpcache_maxwindow = availrmem >> 1;
1486a98e9dbfSaguzovsk 		} else {
1487a98e9dbfSaguzovsk 			/* no limit */
1488a98e9dbfSaguzovsk 			segpcache_maxwindow = (pgcnt_t)-1;
1489a98e9dbfSaguzovsk 		}
1490a98e9dbfSaguzovsk 	}
1491a98e9dbfSaguzovsk 	seg_pmaxwindow = segpcache_maxwindow;
14927c478bd9Sstevel@tonic-gate 	seg_pinit_mem_config();
14937c478bd9Sstevel@tonic-gate }
14947c478bd9Sstevel@tonic-gate
14957c478bd9Sstevel@tonic-gate /*
14967c478bd9Sstevel@tonic-gate  * called by pageout if memory is low
14977c478bd9Sstevel@tonic-gate  */
14987c478bd9Sstevel@tonic-gate void
14997c478bd9Sstevel@tonic-gate seg_preap(void)
15007c478bd9Sstevel@tonic-gate {
15017c478bd9Sstevel@tonic-gate 	/*
1502a98e9dbfSaguzovsk 	 * if the cache is off or empty, return
15037c478bd9Sstevel@tonic-gate 	 */
1504a98e9dbfSaguzovsk 	if (seg_plocked_window == 0) {
15057c478bd9Sstevel@tonic-gate 		return;
15067c478bd9Sstevel@tonic-gate 	}
1507a98e9dbfSaguzovsk 	ASSERT(seg_phashsize_win != 0);
1508a98e9dbfSaguzovsk
1509a98e9dbfSaguzovsk 	/*
1510a98e9dbfSaguzovsk 	 * If somebody is already purging pcache,
1511a98e9dbfSaguzovsk 	 * just return.
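	 * (seg_pdisabled is raised by callers of seg_p_disable(), such as
	 * the DR pre-delete callback, which drain the cache themselves.)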
1512a98e9dbfSaguzovsk 	 */
1513a98e9dbfSaguzovsk 	if (seg_pdisabled) {
1514a98e9dbfSaguzovsk 		return;
15157c478bd9Sstevel@tonic-gate 	}
15167c478bd9Sstevel@tonic-gate
1517a98e9dbfSaguzovsk 	cv_signal(&seg_pasync_cv);
1518a98e9dbfSaguzovsk }
15197c478bd9Sstevel@tonic-gate
15207c478bd9Sstevel@tonic-gate /*
15217c478bd9Sstevel@tonic-gate  * run as a background thread and reclaim pagelock
15227c478bd9Sstevel@tonic-gate  * pages that have not been used recently
15237c478bd9Sstevel@tonic-gate  */
15247c478bd9Sstevel@tonic-gate void
15257c478bd9Sstevel@tonic-gate seg_pasync_thread(void)
15267c478bd9Sstevel@tonic-gate {
15277c478bd9Sstevel@tonic-gate 	callb_cpr_t cpr_info;
15287c478bd9Sstevel@tonic-gate
1529a98e9dbfSaguzovsk 	if (seg_phashsize_win == 0) {
1530a98e9dbfSaguzovsk 		thread_exit();
1531a98e9dbfSaguzovsk 		/*NOTREACHED*/
15327c478bd9Sstevel@tonic-gate 	}
15337c478bd9Sstevel@tonic-gate
1534a98e9dbfSaguzovsk 	seg_pasync_thr = curthread;
1535a98e9dbfSaguzovsk
1536a98e9dbfSaguzovsk 	CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1537a98e9dbfSaguzovsk 	    callb_generic_cpr, "seg_pasync");
1538a98e9dbfSaguzovsk
1539a98e9dbfSaguzovsk 	if (segpcache_reap_ticks <= 0) {
1540a98e9dbfSaguzovsk 		segpcache_reap_ticks = segpcache_reap_sec * hz;
1541a98e9dbfSaguzovsk 	}
1542a98e9dbfSaguzovsk
1543a98e9dbfSaguzovsk 	mutex_enter(&seg_pasync_mtx);
15447c478bd9Sstevel@tonic-gate 	for (;;) {
15457c478bd9Sstevel@tonic-gate 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
1546d3d50737SRafael Vanoni 		(void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1547d3d50737SRafael Vanoni 		    segpcache_reap_ticks, TR_CLOCK_TICK);
1548a98e9dbfSaguzovsk 		CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1549a98e9dbfSaguzovsk 		if (seg_pdisabled == 0) {
1550a98e9dbfSaguzovsk 			seg_ppurge_async(0);
15517c478bd9Sstevel@tonic-gate 		}
15527c478bd9Sstevel@tonic-gate 	}
15537c478bd9Sstevel@tonic-gate }
15547c478bd9Sstevel@tonic-gate
15557c478bd9Sstevel@tonic-gate static struct kmem_cache *seg_cache;
15567c478bd9Sstevel@tonic-gate
15577c478bd9Sstevel@tonic-gate /*
15587c478bd9Sstevel@tonic-gate  * Initialize segment management data structures.
15597c478bd9Sstevel@tonic-gate  */
15607c478bd9Sstevel@tonic-gate void
15617c478bd9Sstevel@tonic-gate seg_init(void)
15627c478bd9Sstevel@tonic-gate {
15637c478bd9Sstevel@tonic-gate 	kstat_t *ksp;
15647c478bd9Sstevel@tonic-gate
1565a98e9dbfSaguzovsk 	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1566a98e9dbfSaguzovsk 	    0, NULL, NULL, NULL, NULL, NULL, 0);
15677c478bd9Sstevel@tonic-gate
15687c478bd9Sstevel@tonic-gate 	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
15697c478bd9Sstevel@tonic-gate 	    segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
15707c478bd9Sstevel@tonic-gate 	if (ksp) {
15717c478bd9Sstevel@tonic-gate 		ksp->ks_data = (void *)segadvstat_ptr;
15727c478bd9Sstevel@tonic-gate 		kstat_install(ksp);
15737c478bd9Sstevel@tonic-gate 	}
15747c478bd9Sstevel@tonic-gate
15757c478bd9Sstevel@tonic-gate 	seg_pinit();
15767c478bd9Sstevel@tonic-gate }
15777c478bd9Sstevel@tonic-gate
15787c478bd9Sstevel@tonic-gate /*
15797c478bd9Sstevel@tonic-gate  * Allocate a segment to cover [base, base+size]
15807c478bd9Sstevel@tonic-gate  * and attach it to the specified address space.
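 * The range is rounded out to page boundaries and validated; NULL is
 * returned if the range is invalid or overlaps an existing segment.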
15817c478bd9Sstevel@tonic-gate  */
15827c478bd9Sstevel@tonic-gate struct seg *
15837c478bd9Sstevel@tonic-gate seg_alloc(struct as *as, caddr_t base, size_t size)
15847c478bd9Sstevel@tonic-gate {
15857c478bd9Sstevel@tonic-gate 	struct seg *new;
15867c478bd9Sstevel@tonic-gate 	caddr_t segbase;
15877c478bd9Sstevel@tonic-gate 	size_t segsize;
15887c478bd9Sstevel@tonic-gate
15897c478bd9Sstevel@tonic-gate 	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
15907c478bd9Sstevel@tonic-gate 	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
15917c478bd9Sstevel@tonic-gate 	    (uintptr_t)segbase;
15927c478bd9Sstevel@tonic-gate
15937c478bd9Sstevel@tonic-gate 	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
15947c478bd9Sstevel@tonic-gate 		return ((struct seg *)NULL);	/* bad virtual addr range */
15957c478bd9Sstevel@tonic-gate
15967c478bd9Sstevel@tonic-gate 	if (as != &kas &&
15977c478bd9Sstevel@tonic-gate 	    valid_usr_range(segbase, segsize, 0, as,
15987c478bd9Sstevel@tonic-gate 	    as->a_userlimit) != RANGE_OKAY)
15997c478bd9Sstevel@tonic-gate 		return ((struct seg *)NULL);	/* bad virtual addr range */
16007c478bd9Sstevel@tonic-gate
16017c478bd9Sstevel@tonic-gate 	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
16027c478bd9Sstevel@tonic-gate 	new->s_ops = NULL;
16037c478bd9Sstevel@tonic-gate 	new->s_data = NULL;
16047c478bd9Sstevel@tonic-gate 	new->s_szc = 0;
16057c478bd9Sstevel@tonic-gate 	new->s_flags = 0;
1606a98e9dbfSaguzovsk 	mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1607a98e9dbfSaguzovsk 	new->s_phead.p_lnext = &new->s_phead;
1608a98e9dbfSaguzovsk 	new->s_phead.p_lprev = &new->s_phead;
16097c478bd9Sstevel@tonic-gate 	if (seg_attach(as, segbase, segsize, new) < 0) {
16107c478bd9Sstevel@tonic-gate 		kmem_cache_free(seg_cache, new);
16117c478bd9Sstevel@tonic-gate 		return ((struct seg *)NULL);
16127c478bd9Sstevel@tonic-gate 	}
16137c478bd9Sstevel@tonic-gate 	/* caller must fill in ops, data */
16147c478bd9Sstevel@tonic-gate 	return (new);
16157c478bd9Sstevel@tonic-gate }
16167c478bd9Sstevel@tonic-gate
16177c478bd9Sstevel@tonic-gate /*
16187c478bd9Sstevel@tonic-gate  * Attach a segment to the address space. Used by seg_alloc()
16197c478bd9Sstevel@tonic-gate  * and for kernel startup to attach to static segments.
16207c478bd9Sstevel@tonic-gate  */
16217c478bd9Sstevel@tonic-gate int
16227c478bd9Sstevel@tonic-gate seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
16237c478bd9Sstevel@tonic-gate {
16247c478bd9Sstevel@tonic-gate 	seg->s_as = as;
16257c478bd9Sstevel@tonic-gate 	seg->s_base = base;
16267c478bd9Sstevel@tonic-gate 	seg->s_size = size;
16277c478bd9Sstevel@tonic-gate
16287c478bd9Sstevel@tonic-gate 	/*
16297c478bd9Sstevel@tonic-gate 	 * as_addseg() will add the segment at the appropriate point
16307c478bd9Sstevel@tonic-gate 	 * in the list. It will return -1 if there is overlap with
16317c478bd9Sstevel@tonic-gate 	 * an already existing segment.
16327c478bd9Sstevel@tonic-gate 	 */
16337c478bd9Sstevel@tonic-gate 	return (as_addseg(as, seg));
16347c478bd9Sstevel@tonic-gate }
16357c478bd9Sstevel@tonic-gate
16367c478bd9Sstevel@tonic-gate /*
16377c478bd9Sstevel@tonic-gate  * Unmap a segment and free it from its associated address space.
16387c478bd9Sstevel@tonic-gate  * This should be called by anybody who's finished with a whole segment's
16397c478bd9Sstevel@tonic-gate  * mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the
16407c478bd9Sstevel@tonic-gate  * responsibility of the segment driver to unlink the segment
16417c478bd9Sstevel@tonic-gate  * from the address space, and to free public and private data structures
16427c478bd9Sstevel@tonic-gate  * associated with the segment. (This is typically done by a call to
16437c478bd9Sstevel@tonic-gate  * seg_free()).
16447c478bd9Sstevel@tonic-gate  */
16457c478bd9Sstevel@tonic-gate void
16467c478bd9Sstevel@tonic-gate seg_unmap(struct seg *seg)
16477c478bd9Sstevel@tonic-gate {
16487c478bd9Sstevel@tonic-gate #ifdef DEBUG
16497c478bd9Sstevel@tonic-gate 	int ret;
16507c478bd9Sstevel@tonic-gate #endif /* DEBUG */
16517c478bd9Sstevel@tonic-gate
1652*dc32d872SJosef 'Jeff' Sipek 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
16537c478bd9Sstevel@tonic-gate
16547c478bd9Sstevel@tonic-gate 	/* Shouldn't have called seg_unmap if mapping isn't yet established */
16557c478bd9Sstevel@tonic-gate 	ASSERT(seg->s_data != NULL);
16567c478bd9Sstevel@tonic-gate
16577c478bd9Sstevel@tonic-gate 	/* Unmap the whole mapping */
16587c478bd9Sstevel@tonic-gate #ifdef DEBUG
16597c478bd9Sstevel@tonic-gate 	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
16607c478bd9Sstevel@tonic-gate 	ASSERT(ret == 0);
16617c478bd9Sstevel@tonic-gate #else
16627c478bd9Sstevel@tonic-gate 	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
16637c478bd9Sstevel@tonic-gate #endif /* DEBUG */
16647c478bd9Sstevel@tonic-gate }
16657c478bd9Sstevel@tonic-gate
16667c478bd9Sstevel@tonic-gate /*
16677c478bd9Sstevel@tonic-gate  * Free the segment from its associated as. This should only be called
16687c478bd9Sstevel@tonic-gate  * if a mapping to the segment has not yet been established (e.g., if
16697c478bd9Sstevel@tonic-gate  * an error occurs in the middle of doing an as_map when the segment
16707c478bd9Sstevel@tonic-gate  * has already been partially set up) or if it has already been deleted
16717c478bd9Sstevel@tonic-gate  * (e.g., from a segment driver unmap routine if the unmap applies to the
16727c478bd9Sstevel@tonic-gate  * entire segment). If the mapping is currently set up then seg_unmap() should
16737c478bd9Sstevel@tonic-gate  * be called instead.
16747c478bd9Sstevel@tonic-gate  */
16757c478bd9Sstevel@tonic-gate void
16767c478bd9Sstevel@tonic-gate seg_free(struct seg *seg)
16777c478bd9Sstevel@tonic-gate {
16787c478bd9Sstevel@tonic-gate 	register struct as *as = seg->s_as;
16797c478bd9Sstevel@tonic-gate 	struct seg *tseg = as_removeseg(as, seg);
16807c478bd9Sstevel@tonic-gate
16817c478bd9Sstevel@tonic-gate 	ASSERT(tseg == seg);
16827c478bd9Sstevel@tonic-gate
16837c478bd9Sstevel@tonic-gate 	/*
16847c478bd9Sstevel@tonic-gate 	 * If the segment private data field is NULL,
16857c478bd9Sstevel@tonic-gate 	 * then the segment driver is not attached yet.
16867c478bd9Sstevel@tonic-gate 	 */
16877c478bd9Sstevel@tonic-gate 	if (seg->s_data != NULL)
16887c478bd9Sstevel@tonic-gate 		SEGOP_FREE(seg);
16897c478bd9Sstevel@tonic-gate
1690a98e9dbfSaguzovsk 	mutex_destroy(&seg->s_pmtx);
1691a98e9dbfSaguzovsk 	ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1692a98e9dbfSaguzovsk 	ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
16937c478bd9Sstevel@tonic-gate 	kmem_cache_free(seg_cache, seg);
16947c478bd9Sstevel@tonic-gate }
16957c478bd9Sstevel@tonic-gate
16967c478bd9Sstevel@tonic-gate /*ARGSUSED*/
16977c478bd9Sstevel@tonic-gate static void
16987c478bd9Sstevel@tonic-gate seg_p_mem_config_post_add(
16997c478bd9Sstevel@tonic-gate 	void *arg,
17007c478bd9Sstevel@tonic-gate 	pgcnt_t delta_pages)
17017c478bd9Sstevel@tonic-gate {
17027c478bd9Sstevel@tonic-gate 	/* Nothing to do. */
17037c478bd9Sstevel@tonic-gate }
17047c478bd9Sstevel@tonic-gate
1705cee1d74bSjfrank void
1706cee1d74bSjfrank seg_p_enable(void)
1707cee1d74bSjfrank {
1708a98e9dbfSaguzovsk 	mutex_enter(&seg_pcache_mtx);
1709a98e9dbfSaguzovsk 	ASSERT(seg_pdisabled != 0);
1710a98e9dbfSaguzovsk 	seg_pdisabled--;
1711a98e9dbfSaguzovsk 	mutex_exit(&seg_pcache_mtx);
1712cee1d74bSjfrank }
1713cee1d74bSjfrank
1714cee1d74bSjfrank /*
1715cee1d74bSjfrank  * seg_p_disable - disables seg_pcache, and then attempts to empty the
1716cee1d74bSjfrank  * cache.
1717cee1d74bSjfrank  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1718cee1d74bSjfrank  * SEGP_FAIL if the cache could not be emptied.
1719cee1d74bSjfrank  */
1720cee1d74bSjfrank int
1721cee1d74bSjfrank seg_p_disable(void)
1722cee1d74bSjfrank {
1723cee1d74bSjfrank 	pgcnt_t old_plocked;
1724cee1d74bSjfrank 	int stall_count = 0;
1725cee1d74bSjfrank
1726a98e9dbfSaguzovsk 	mutex_enter(&seg_pcache_mtx);
1727a98e9dbfSaguzovsk 	seg_pdisabled++;
1728a98e9dbfSaguzovsk 	ASSERT(seg_pdisabled != 0);
1729a98e9dbfSaguzovsk 	mutex_exit(&seg_pcache_mtx);
1730cee1d74bSjfrank
1731cee1d74bSjfrank 	/*
1732cee1d74bSjfrank 	 * Attempt to empty the cache. Terminate if seg_plocked does not
1733cee1d74bSjfrank 	 * diminish within SEGP_STALL_THRESHOLD consecutive attempts.
1734cee1d74bSjfrank 	 */
1735cee1d74bSjfrank 	while (seg_plocked != 0) {
1736a98e9dbfSaguzovsk 		ASSERT(seg_phashsize_win != 0);
1737cee1d74bSjfrank 		old_plocked = seg_plocked;
1738a98e9dbfSaguzovsk 		seg_ppurge_async(1);
1739cee1d74bSjfrank 		if (seg_plocked == old_plocked) {
1740cee1d74bSjfrank 			if (stall_count++ > SEGP_STALL_THRESHOLD) {
1741cee1d74bSjfrank 				return (SEGP_FAIL);
1742cee1d74bSjfrank 			}
1743cee1d74bSjfrank 		} else
1744cee1d74bSjfrank 			stall_count = 0;
1745cee1d74bSjfrank 		if (seg_plocked != 0)
1746cee1d74bSjfrank 			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1747cee1d74bSjfrank 	}
1748cee1d74bSjfrank 	return (SEGP_SUCCESS);
1749cee1d74bSjfrank }
1750cee1d74bSjfrank
17517c478bd9Sstevel@tonic-gate /*
17527c478bd9Sstevel@tonic-gate  * Attempt to purge seg_pcache. May need to return before this has
17537c478bd9Sstevel@tonic-gate  * completed to allow other pre_del callbacks to unlock pages. This is
17547c478bd9Sstevel@tonic-gate  * ok because:
1755a98e9dbfSaguzovsk  * 1) The seg_pdisabled flag has been set so at least we won't
17567c478bd9Sstevel@tonic-gate  *	cache any more locks and the locks we couldn't purge
17577c478bd9Sstevel@tonic-gate  *	will not be held if they do get released by a subsequent
17587c478bd9Sstevel@tonic-gate  *	pre-delete callback.
17597c478bd9Sstevel@tonic-gate  *
17607c478bd9Sstevel@tonic-gate  * 2) The rest of the memory delete thread processing does not
17617c478bd9Sstevel@tonic-gate  *	depend on the changes made in this pre-delete callback. No
17627c478bd9Sstevel@tonic-gate  *	panics will result; the worst that will happen is that the
17637c478bd9Sstevel@tonic-gate  *	DR code will time out and cancel the delete.
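 *
 * A sketch of the disable/enable protocol, mirroring the pre/post-delete
 * pair below (illustrative only, not an additional API):
 *
 *	(void) seg_p_disable();		disable + best-effort drain
 *	...memory delete processing runs...
 *	seg_p_enable();			allow caching again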
17647c478bd9Sstevel@tonic-gate  */
17657c478bd9Sstevel@tonic-gate /*ARGSUSED*/
17667c478bd9Sstevel@tonic-gate static int
17677c478bd9Sstevel@tonic-gate seg_p_mem_config_pre_del(
17687c478bd9Sstevel@tonic-gate 	void *arg,
17697c478bd9Sstevel@tonic-gate 	pgcnt_t delta_pages)
17707c478bd9Sstevel@tonic-gate {
1771a98e9dbfSaguzovsk 	if (seg_phashsize_win == 0) {
1772a98e9dbfSaguzovsk 		return (0);
1773a98e9dbfSaguzovsk 	}
1774cee1d74bSjfrank 	if (seg_p_disable() != SEGP_SUCCESS)
1775cee1d74bSjfrank 		cmn_err(CE_NOTE,
1776cee1d74bSjfrank 		    "!Pre-delete couldn't purge pagelock cache - continuing");
17777c478bd9Sstevel@tonic-gate 	return (0);
17787c478bd9Sstevel@tonic-gate }
17797c478bd9Sstevel@tonic-gate
17807c478bd9Sstevel@tonic-gate /*ARGSUSED*/
17817c478bd9Sstevel@tonic-gate static void
17827c478bd9Sstevel@tonic-gate seg_p_mem_config_post_del(
17837c478bd9Sstevel@tonic-gate 	void *arg,
17847c478bd9Sstevel@tonic-gate 	pgcnt_t delta_pages,
17857c478bd9Sstevel@tonic-gate 	int cancelled)
17867c478bd9Sstevel@tonic-gate {
1787a98e9dbfSaguzovsk 	if (seg_phashsize_win == 0) {
1788a98e9dbfSaguzovsk 		return;
1789a98e9dbfSaguzovsk 	}
1790cee1d74bSjfrank 	seg_p_enable();
17917c478bd9Sstevel@tonic-gate }
17927c478bd9Sstevel@tonic-gate
17937c478bd9Sstevel@tonic-gate static kphysm_setup_vector_t seg_p_mem_config_vec = {
17947c478bd9Sstevel@tonic-gate 	KPHYSM_SETUP_VECTOR_VERSION,
17957c478bd9Sstevel@tonic-gate 	seg_p_mem_config_post_add,
17967c478bd9Sstevel@tonic-gate 	seg_p_mem_config_pre_del,
17977c478bd9Sstevel@tonic-gate 	seg_p_mem_config_post_del,
17987c478bd9Sstevel@tonic-gate };
17997c478bd9Sstevel@tonic-gate
18007c478bd9Sstevel@tonic-gate static void
18017c478bd9Sstevel@tonic-gate seg_pinit_mem_config(void)
18027c478bd9Sstevel@tonic-gate {
18037c478bd9Sstevel@tonic-gate 	int ret;
18047c478bd9Sstevel@tonic-gate
18057c478bd9Sstevel@tonic-gate 	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
18067c478bd9Sstevel@tonic-gate 	/*
18077c478bd9Sstevel@tonic-gate 	 * Want to catch this in the debug kernel. At run time, if the
18087c478bd9Sstevel@tonic-gate 	 * callbacks don't get run, all will be OK as the disable just makes
18097c478bd9Sstevel@tonic-gate 	 * it more likely that the pages can be collected.
18107c478bd9Sstevel@tonic-gate 	 */
18117c478bd9Sstevel@tonic-gate 	ASSERT(ret == 0);
18127c478bd9Sstevel@tonic-gate }
18130209230bSgjelinek
18140209230bSgjelinek /*
18150209230bSgjelinek  * Verify that the segment is not a shared anonymous segment which reserves
18160209230bSgjelinek  * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
18170209230bSgjelinek  * from one zone to another if any segments are shared. This is because the
18180209230bSgjelinek  * last process to exit will credit the swap reservation. This could lead
18190209230bSgjelinek  * to the swap being reserved by one zone, and credited to another.
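 * Both ISM/DISM segments (segspt_shmops) and MAP_SHARED segvn segments
 * whose anon map reserves swap are therefore rejected below.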
18200209230bSgjelinek  */
18210209230bSgjelinek boolean_t
18220209230bSgjelinek seg_can_change_zones(struct seg *seg)
18230209230bSgjelinek {
18240209230bSgjelinek 	struct segvn_data *svd;
18250209230bSgjelinek
18260209230bSgjelinek 	if (seg->s_ops == &segspt_shmops)
18270209230bSgjelinek 		return (B_FALSE);
18280209230bSgjelinek
18290209230bSgjelinek 	if (seg->s_ops == &segvn_ops) {
18300209230bSgjelinek 		svd = (struct segvn_data *)seg->s_data;
18310209230bSgjelinek 		if (svd->type == MAP_SHARED &&
18320209230bSgjelinek 		    svd->amp != NULL &&
18330209230bSgjelinek 		    svd->amp->swresv > 0)
18340209230bSgjelinek 			return (B_FALSE);
18350209230bSgjelinek 	}
18360209230bSgjelinek 	return (B_TRUE);
18370209230bSgjelinek }
18380209230bSgjelinek
18390209230bSgjelinek /*
18400209230bSgjelinek  * Return swap reserved by a segment backing a private mapping.
18410209230bSgjelinek  */
18420209230bSgjelinek size_t
18430209230bSgjelinek seg_swresv(struct seg *seg)
18440209230bSgjelinek {
18450209230bSgjelinek 	struct segvn_data *svd;
18460209230bSgjelinek 	size_t swap = 0;
18470209230bSgjelinek
18480209230bSgjelinek 	if (seg->s_ops == &segvn_ops) {
18490209230bSgjelinek 		svd = (struct segvn_data *)seg->s_data;
18500209230bSgjelinek 		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
18510209230bSgjelinek 			swap = svd->swresv;
18520209230bSgjelinek 	}
18530209230bSgjelinek 	return (swap);
18540209230bSgjelinek }
18559d12795fSRobert Mustacchi
18569d12795fSRobert Mustacchi /*
18579d12795fSRobert Mustacchi  * Generic not-supported function for SEGOP_INHERIT
18589d12795fSRobert Mustacchi  */
18599d12795fSRobert Mustacchi /* ARGSUSED */
18609d12795fSRobert Mustacchi int
18619d12795fSRobert Mustacchi seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
18629d12795fSRobert Mustacchi {
18639d12795fSRobert Mustacchi 	return (ENOTSUP);
18649d12795fSRobert Mustacchi }
1865
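/*
 * A segment driver that does not support inheritance control can plug
 * seg_inherit_notsup directly into its ops vector. A hypothetical
 * "segfoo" driver (not part of this file; assumes the seg_ops member
 * is named inherit) would do:
 *
 *	static struct seg_ops segfoo_ops = {
 *		...
 *		.inherit = seg_inherit_notsup,
 *	};
 */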