/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2018, Joyent, Inc.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - segment management.
 */

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
#include <sys/mman.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>

/*
 * kstats for segment advise
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit", KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss", KSTAT_DATA_ULONG },
};

kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);

/*
 * entry in the segment page cache
 */
struct seg_pcache {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	pcache_link_t p_plink;		/* per segment/amp list */
	void *p_htag0;			/* segment/amp pointer */
	caddr_t p_addr;			/* base address/anon_idx */
	size_t p_len;			/* total bytes */
	size_t p_wlen;			/* writable bytes at p_addr */
	struct page **p_pp;		/* pp shadow list */
	seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
	clock_t p_lbolt;		/* lbolt from last use */
	struct seg_phash *p_hashp;	/* our pcache hash bucket */
	uint_t p_active;		/* active count */
	uchar_t p_write;		/* true if S_WRITE */
	uchar_t p_ref;			/* reference byte */
	ushort_t p_flags;		/* bit flags */
};

struct seg_phash {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	kmutex_t p_hmutex;		/* protects hash bucket */
	pcache_link_t p_halink[2];	/* active bucket linkages */
};

struct seg_phash_wired {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	kmutex_t p_hmutex;		/* protects hash bucket */
};
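
/*
 * Note: struct seg_phash_wired deliberately has no p_halink member. Wired
 * (SEGP_FORCE_WIRED) buckets are never placed on the active bucket lists
 * walked by the asynchronous purge thread, so only the non wired bucket
 * type needs the extra linkage (seg_padd_abuck()/seg_premove_abuck() below
 * assert that they are only ever passed seg_phashtab_win buckets).
 */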

/*
 * A parameter to control a maximum number of bytes that can be
 * purged from pcache at a time.
 */
#define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)

/*
 * log2(fraction of pcache to reclaim at a time).
 */
#define	P_SHRINK_SHFT		(5)

/*
 * The following variables can be tuned via /etc/system.
 */

int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
int	segpcache_reap_sec = 1;		/* reap check rate in secs */
clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */

static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
static kcondvar_t seg_pasync_cv;

#pragma align 64(pctrl1)
#pragma align 64(pctrl2)
#pragma align 64(pctrl3)

/*
 * Keep frequently used variables together in one cache line.
 */
static struct p_ctrl1 {
	uint_t p_disabled;		/* if not 0, caching temporarily off */
	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
	size_t p_hashwin_sz;		/* # of non wired buckets */
	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
	size_t p_hashwired_sz;		/* # of wired buckets */
	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
#ifdef _LP64
	ulong_t pad[1];
#endif /* _LP64 */
} pctrl1;

static struct p_ctrl2 {
	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
	pgcnt_t p_locked_win;	/* # pages from window */
	pgcnt_t p_locked;	/* # of pages cached by pagelock */
	uchar_t	p_ahcur;	/* current active links for insert/delete */
	uchar_t	p_athr_on;	/* async reclaim thread is running. */
	pcache_link_t p_ahhead[2]; /* active buckets linkages */
} pctrl2;

static struct p_ctrl3 {
	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
	ulong_t p_athr_full_ahb;	/* athread walk stats */
	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
	int	p_shrink_shft;		/* reap shift factor */
#ifdef _LP64
	ulong_t pad[3];
#endif /* _LP64 */
} pctrl3;

#define	seg_pdisabled			pctrl1.p_disabled
#define	seg_pmaxwindow			pctrl1.p_maxwin
#define	seg_phashsize_win		pctrl1.p_hashwin_sz
#define	seg_phashtab_win		pctrl1.p_htabwin
#define	seg_phashsize_wired		pctrl1.p_hashwired_sz
#define	seg_phashtab_wired		pctrl1.p_htabwired
#define	seg_pkmcache			pctrl1.p_kmcache
#define	seg_pmem_mtx			pctrl2.p_mem_mtx
#define	seg_plocked_window		pctrl2.p_locked_win
#define	seg_plocked			pctrl2.p_locked
#define	seg_pahcur			pctrl2.p_ahcur
#define	seg_pathr_on			pctrl2.p_athr_on
#define	seg_pahhead			pctrl2.p_ahhead
#define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
#define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
#define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
#define	seg_pshrink_shift		pctrl3.p_shrink_shft
#define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages

#define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
#define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
#define	P_BASESHIFT			(6)

kthread_t *seg_pasync_thr;

extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

#define	IS_PFLAGS_WIRED(flags)	((flags) & SEGP_FORCE_WIRED)
#define	IS_PCP_WIRED(pcp)	IS_PFLAGS_WIRED((pcp)->p_flags)

#define	LBOLT_DELTA(t)	((ulong_t)(ddi_get_lbolt() - (t)))

#define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)

/*
 * htag0 argument can be a seg or amp pointer.
 */
#define	P_HASHBP(seg, htag0, addr, flags)				\
	(IS_PFLAGS_WIRED((flags)) ?					\
	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
	    (((uintptr_t)(htag0) >> 3) ^				\
	    ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?		\
	    (flags >> 16) : page_get_shift((seg)->s_szc))))]))

/*
 * htag0 argument can be a seg or amp pointer.
 */
#define	P_MATCH(pcp, htag0, addr, len)					\
	((pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

#define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
	((pcp)->p_pp == (pp) &&						\
	(pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

#define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
	offsetof(struct seg_pcache, p_plink)))

#define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) - \
	offsetof(struct seg_phash, p_halink[l])))
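
/*
 * Bucket selection: wired (segspt) entries are hashed on the seg/amp
 * pointer alone, shifted by P_BASESHIFT to drop the low (presumably
 * non-varying) bits, so every entry for a given wired segment shares one
 * bucket.  Non wired (segvn) entries also fold in the page-shifted address,
 * so the many shadow lists of a single large segment or amp spread across
 * the non wired hash table.
 */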

/*
 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
 * active hash bucket lists. We maintain active bucket lists to reduce the
 * overhead of finding active buckets during asynchronous purging since there
 * can be 10s of millions of buckets on a large system but only a small subset
 * of them in actual use.
 *
 * There are two active bucket lists. The current active list (as per
 * seg_pahcur) is used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add
 * and delete buckets. The other list is used by the asynchronous purge
 * thread. This allows the purge thread to walk its active list without
 * holding seg_pmem_mtx for a long time. When the asynchronous thread is done
 * with its list it switches to the current active list and makes the list it
 * just finished processing the new current active list.
 *
 * seg_padd_abuck() only adds the bucket to the current list if the bucket is
 * not yet on any list. seg_premove_abuck() may remove the bucket from either
 * list. If the bucket is on the current list it is always removed. Otherwise
 * the bucket is only removed if the asynchronous purge thread is not
 * currently running or seg_premove_abuck() is called by the asynchronous
 * purge thread itself. A given bucket can only be on one of the active lists
 * at a time. These routines use seg_pmem_mtx to protect list updates.
 * seg_padd_abuck() must be called after the first entry is added to the
 * bucket chain and seg_premove_abuck() must be called after the last pcp
 * entry is deleted from its chain. The per bucket lock must be held by the
 * callers. This avoids a potential race condition in which
 * seg_premove_abuck() removes a bucket after pcp entries are added to its
 * list after the caller checked that the bucket has no entries. (This race
 * would cause the loss of an active bucket from the active lists.)
 *
 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
 * New entries are added to the end of the list since LRU is used as the
 * purging policy.
 */
static void
seg_padd_abuck(struct seg_phash *hp)
{
	int lix;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext != hp);
	ASSERT((struct seg_phash *)hp->p_hprev != hp);
	ASSERT(hp->p_hnext == hp->p_hprev);
	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	/*
	 * This bucket can already be on one of active lists
	 * since seg_premove_abuck() may have failed to remove it
	 * before.
	 */
	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);
	if (hp->p_halink[lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If this bucket is still on list !lix the async thread can't yet
	 * remove it since we hold the per bucket lock here. In this case just
	 * return since the async thread will eventually find and process
	 * this bucket.
	 */
	if (hp->p_halink[!lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
	/*
	 * This bucket is not on any active bucket list yet.
	 * Add the bucket to the tail of current active list.
	 */
	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
	mutex_exit(&seg_pmem_mtx);
}

static void
seg_premove_abuck(struct seg_phash *hp, int athr)
{
	int lix;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext == hp);
	ASSERT((struct seg_phash *)hp->p_hprev == hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	if (athr) {
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur <= 1);
		/*
		 * We are called by the asynchronous thread that found this
		 * bucket on the not currently active (i.e. !seg_pahcur) list.
		 * Remove it from there. The per bucket lock we are holding
		 * makes sure seg_pinsert() can't sneak in and add pcp entries
		 * to this bucket right before we remove the bucket from its
		 * list.
		 */
		lix = !seg_pahcur;
		ASSERT(hp->p_halink[lix].p_lnext != NULL);
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		return;
	}

	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);

	/*
	 * If the bucket is on currently active list just remove it from
	 * there.
	 */
	if (hp->p_halink[lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If asynchronous thread is not running we can remove the bucket from
	 * not currently active list. The bucket must be on this list since we
	 * already checked that it's not on the other list and the bucket from
	 * which we just deleted the last pcp entry must be still on one of the
	 * active bucket lists.
	 */
	lix = !lix;
	ASSERT(hp->p_halink[lix].p_lnext != NULL);
	ASSERT(hp->p_halink[lix].p_lprev != NULL);

	if (!seg_pathr_on) {
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
	}
	mutex_exit(&seg_pmem_mtx);
}

/*
 * Check if the bucket pointed to by hp already has a pcp entry that matches
 * the request htag0, addr and len. Set *found to 1 if a match is found and
 * to 0 otherwise. Also delete matching entries that cover a smaller address
 * range but start at the same address as the addr argument. Return the list
 * of deleted entries if any. This is an internal helper function called from
 * seg_pinsert() only for non wired shadow lists. The caller already holds a
 * per seg/amp list lock.
 */
static struct seg_pcache *
seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
    caddr_t addr, size_t len, int *found)
{
	struct seg_pcache *pcp;
	struct seg_pcache *delcallb_list = NULL;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));

	*found = 0;
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
			ASSERT(!IS_PCP_WIRED(pcp));
			if (pcp->p_len < len) {
				pcache_link_t *plinkp;
				if (pcp->p_active) {
					continue;
				}
				plinkp = &pcp->p_plink;
				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			} else {
				*found = 1;
				break;
			}
		}
	}
	return (delcallb_list);
}

/*
 * Look up an address range in the pagelock cache. Return the shadow list and
 * bump up the active count. If amp is not NULL use amp as the lookup tag,
 * otherwise use seg as the lookup tag.
 */
struct page **
seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    enum seg_rw rw, uint_t flags)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	void *htag0;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);

	/*
	 * Skip pagelock cache, while DR is in progress or
	 * seg_pcache is off.
	 */
	if (seg_pdisabled) {
		return (NULL);
	}
	ASSERT(seg_phashsize_win != 0);

	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH(pcp, htag0, addr, len)) {
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
			/*
			 * If this request wants to write pages
			 * but write permissions starting from
			 * addr don't cover the entire length len
			 * return lookup failure back to the caller.
			 * It will check protections and fail this
			 * pagelock operation with EACCES error.
			 */
			if (rw == S_WRITE && pcp->p_wlen < len) {
				break;
			}
			if (pcp->p_active == UINT_MAX) {
				break;
			}
			pcp->p_active++;
			if (rw == S_WRITE && !pcp->p_write) {
				pcp->p_write = 1;
			}
			mutex_exit(&hp->p_hmutex);
			return (pcp->p_pp);
		}
	}
	mutex_exit(&hp->p_hmutex);
	return (NULL);
}

/*
 * Mark an address range inactive. If the cache is off or the address range
 * is not in the cache or another shadow list that covers a bigger range is
 * found we call the segment driver to reclaim the pages. Otherwise just
 * decrement the active count and set the ref bit. If amp is not NULL use amp
 * as the lookup tag, otherwise use seg as the lookup tag.
 */
void
seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	kmutex_t *pmtx = NULL;
	pcache_link_t *pheadp;
	void *htag0;
	pgcnt_t npages = 0;
	int keep = 0;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);

	htag0 = (amp == NULL ? (void *)seg : (void *)amp);

	/*
	 * Skip lookup if pcache is not configured.
	 */
	if (seg_phashsize_win == 0) {
		goto out;
	}

	/*
	 * Grab per seg/amp lock before hash lock if we are going to remove
	 * inactive entry from pcache.
	 */
	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
		}
		mutex_enter(pmtx);
	}

	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
again:
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_active);
			if (keep) {
				/*
				 * Don't remove this pcp entry
				 * if we didn't find duplicate
				 * shadow lists on second search.
				 * Somebody removed those duplicates
				 * since we dropped hash lock after first
				 * search.
				 */
				ASSERT(pmtx != NULL);
				ASSERT(!IS_PFLAGS_WIRED(flags));
				mutex_exit(pmtx);
				pmtx = NULL;
			}
			pcp->p_active--;
			if (pcp->p_active == 0 && (pmtx != NULL ||
			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {

				/*
				 * This entry is no longer active. Remove it
				 * now either because pcaching is temporarily
				 * disabled or there're other pcp entries that
				 * can match this pagelock request (i.e. this
				 * entry is a duplicate).
				 */

				ASSERT(callback == pcp->p_callback);
				if (pmtx != NULL) {
					pcache_link_t *plinkp = &pcp->p_plink;
					ASSERT(!IS_PCP_WIRED(pcp));
					ASSERT(pheadp->p_lnext != pheadp);
					ASSERT(pheadp->p_lprev != pheadp);
					plinkp->p_lprev->p_lnext =
					    plinkp->p_lnext;
					plinkp->p_lnext->p_lprev =
					    plinkp->p_lprev;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				if (!IS_PCP_WIRED(pcp) &&
				    hp->p_hnext == (struct seg_pcache *)hp) {
					/*
					 * We removed the last entry from this
					 * bucket. Now remove the bucket from
					 * its active list.
					 */
					seg_premove_abuck(hp, 0);
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				len = pcp->p_len;
				npages = btop(len);
				if (rw != S_WRITE && pcp->p_write) {
					rw = S_WRITE;
				}
				kmem_cache_free(seg_pkmcache, pcp);
				goto out;
			} else {
				/*
				 * We found a matching pcp entry but will not
				 * free it right away even if it's no longer
				 * active.
				 */
				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
					/*
					 * Set the reference bit and mark the
					 * time of last access to this pcp
					 * so that asynchronous thread doesn't
					 * free it immediately since
					 * it may be reactivated very soon.
					 */
					pcp->p_lbolt = ddi_get_lbolt();
					pcp->p_ref = 1;
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				return;
			}
		} else if (!IS_PFLAGS_WIRED(flags) &&
		    P_MATCH(pcp, htag0, addr, len)) {
			/*
			 * This is a duplicate pcp entry. This situation may
			 * happen if a bigger shadow list that covers our
			 * range was added while our entry was still active.
			 * Now we can free our pcp entry if it becomes
			 * inactive.
			 */
			if (!pcp->p_active) {
				/*
				 * Mark this entry as referenced just in case
				 * we'll free our own pcp entry soon.
				 */
				pcp->p_lbolt = ddi_get_lbolt();
				pcp->p_ref = 1;
			}
			if (pmtx != NULL) {
				/*
				 * We are already holding pmtx and found a
				 * duplicate. Don't keep our own pcp entry.
				 */
				keep = 0;
				continue;
			}
			/*
			 * We have to use mutex_tryenter to attempt to lock
			 * seg/amp list lock since we already hold hash lock
			 * and seg/amp list lock is above hash lock in lock
			 * order. If mutex_tryenter fails drop hash lock and
			 * retake both locks in correct order and re-search
			 * this hash chain.
			 */
			ASSERT(keep == 0);
			if (amp == NULL) {
				pheadp = &seg->s_phead;
				pmtx = &seg->s_pmtx;
			} else {
				pheadp = &amp->a_phead;
				pmtx = &amp->a_pmtx;
			}
			if (!mutex_tryenter(pmtx)) {
				mutex_exit(&hp->p_hmutex);
				mutex_enter(pmtx);
				mutex_enter(&hp->p_hmutex);
				/*
				 * If we don't find bigger shadow list on
				 * second search (it may happen since we
				 * dropped bucket lock) keep the entry that
				 * matches our own shadow list.
				 */
				keep = 1;
				goto again;
			}
		}
	}
	mutex_exit(&hp->p_hmutex);
	if (pmtx != NULL) {
		mutex_exit(pmtx);
	}
out:
	(*callback)(htag0, addr, len, pp, rw, 0);
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		seg_plocked -= npages;
		if (!IS_PFLAGS_WIRED(flags)) {
			ASSERT(seg_plocked_window >= npages);
			seg_plocked_window -= npages;
		}
		mutex_exit(&seg_pmem_mtx);
	}
}

#ifdef DEBUG
static uint32_t p_insert_chk_mtbf = 0;
#endif

/*
 * seg_pinsert_check() is used by segment drivers to predict whether a call
 * to seg_pinsert() will fail and thereby avoid wasteful pre-processing.
 */
/*ARGSUSED*/
int
seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, uint_t flags)
{
	ASSERT(seg != NULL);

#ifdef DEBUG
	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
		return (SEGP_FAIL);
	}
#endif

	if (seg_pdisabled) {
		return (SEGP_FAIL);
	}
	ASSERT(seg_phashsize_win != 0);

	if (IS_PFLAGS_WIRED(flags)) {
		return (SEGP_SUCCESS);
	}

	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
		return (SEGP_FAIL);
	}

	if (freemem < desfree) {
		return (SEGP_FAIL);
	}

	return (SEGP_SUCCESS);
}

#ifdef DEBUG
static uint32_t p_insert_mtbf = 0;
#endif

/*
 * Insert address range with shadow list into pagelock cache if there's no
 * shadow list already cached for this address range. If the cache is off or
 * caching is temporarily disabled or the allowed 'window' is exceeded return
 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
 *
 * For non wired shadow lists (segvn case) include address in the hashing
 * function to avoid linking all the entries from the same segment or amp on
 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
 * pcache entries are also linked on a per segment/amp list so that all
 * entries can be found quickly during seg/amp purge without walking the
 * entire pcache hash table. For wired shadow lists (segspt case) we
 * don't use address hashing and per segment linking because the caller
 * currently inserts only one entry per segment that covers the entire
 * segment. If we used per segment linking even for segspt it would complicate
 * seg_ppurge_wiredpp() locking.
 *
 * Both hash bucket and per seg/amp locks need to be held before adding a non
 * wired entry to the hash and per seg/amp lists. The per seg/amp lock should
 * be taken first.
 *
 * This function will also remove from pcache old inactive shadow lists that
 * overlap with this request but cover a smaller range for the same start
 * address.
 */
int
seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages;
	pcache_link_t *pheadp;
	kmutex_t *pmtx;
	struct seg_pcache *delcallb_list = NULL;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);
	ASSERT(rw == S_READ || wlen == len);
	ASSERT(rw == S_WRITE || wlen <= len);
	ASSERT(amp == NULL || wlen == len);

#ifdef DEBUG
	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
		return (SEGP_FAIL);
	}
#endif

	if (seg_pdisabled) {
		return (SEGP_FAIL);
	}
	ASSERT(seg_phashsize_win != 0);

	ASSERT((len & PAGEOFFSET) == 0);
	npages = btop(len);
	mutex_enter(&seg_pmem_mtx);
	if (!IS_PFLAGS_WIRED(flags)) {
		if (seg_plocked_window + npages > seg_pmaxwindow) {
			mutex_exit(&seg_pmem_mtx);
			return (SEGP_FAIL);
		}
		seg_plocked_window += npages;
	}
	seg_plocked += npages;
	mutex_exit(&seg_pmem_mtx);

	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
	/*
	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
	 */
	if (amp == NULL) {
		pcp->p_htag0 = (void *)seg;
		pcp->p_flags = flags & 0xffff;
	} else {
		pcp->p_htag0 = (void *)amp;
		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
	}
	pcp->p_addr = addr;
	pcp->p_len = len;
	pcp->p_wlen = wlen;
	pcp->p_pp = pp;
	pcp->p_write = (rw == S_WRITE);
	pcp->p_callback = callback;
	pcp->p_active = 1;

	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
	if (!IS_PFLAGS_WIRED(flags)) {
		int found;
		void *htag0;
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
			htag0 = (void *)seg;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
			htag0 = (void *)amp;
		}
		mutex_enter(pmtx);
		mutex_enter(&hp->p_hmutex);
		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
		    len, &found);
		if (found) {
			mutex_exit(&hp->p_hmutex);
			mutex_exit(pmtx);
			mutex_enter(&seg_pmem_mtx);
			seg_plocked -= npages;
			seg_plocked_window -= npages;
			mutex_exit(&seg_pmem_mtx);
			kmem_cache_free(seg_pkmcache, pcp);
			goto out;
		}
		pcp->p_plink.p_lnext = pheadp->p_lnext;
		pcp->p_plink.p_lprev = pheadp;
		pheadp->p_lnext->p_lprev = &pcp->p_plink;
		pheadp->p_lnext = &pcp->p_plink;
	} else {
		mutex_enter(&hp->p_hmutex);
	}
	pcp->p_hashp = hp;
	pcp->p_hnext = hp->p_hnext;
	pcp->p_hprev = (struct seg_pcache *)hp;
	hp->p_hnext->p_hprev = pcp;
	hp->p_hnext = pcp;
	if (!IS_PFLAGS_WIRED(flags) &&
	    hp->p_hprev == pcp) {
		seg_padd_abuck(hp);
	}
	mutex_exit(&hp->p_hmutex);
	if (!IS_PFLAGS_WIRED(flags)) {
		mutex_exit(pmtx);
	}

out:
	npages = 0;
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
		npages += btop(pcp->p_len);
		kmem_cache_free(seg_pkmcache, pcp);
	}
	if (npages) {
		ASSERT(!IS_PFLAGS_WIRED(flags));
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		ASSERT(seg_plocked_window >= npages);
		seg_plocked -= npages;
		seg_plocked_window -= npages;
		mutex_exit(&seg_pmem_mtx);
	}

	return (SEGP_SUCCESS);
}
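
/*
 * A rough sketch of how a segment driver's pagelock entry point might use
 * the interfaces above (illustrative only, not lifted verbatim from any
 * driver):
 *
 *	L_PAGELOCK:
 *		if ((pplist = seg_plookup(seg, amp, addr, len, rw, flags))
 *		    != NULL)
 *			return (pplist);		(cache hit)
 *		if (seg_pinsert_check(seg, amp, addr, len, flags) == SEGP_FAIL)
 *			fall back to the driver's uncached/slow path;
 *		build the shadow list and lock the pages;
 *		(void) seg_pinsert(seg, amp, addr, len, wlen, pplist, rw,
 *		    flags, callback);
 *	L_PAGEUNLOCK:
 *		seg_pinactive(seg, amp, addr, len, pplist, rw, flags,
 *		    callback);
 *
 * The reclaim callback runs only when pcache releases an entry (from
 * seg_pinactive() or one of the purge routines), so the pages remain locked
 * for as long as the shadow list stays cached.
 */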

/*
 * purge entries from the pagelock cache if not active
 * and not recently used.
 */
static void
seg_ppurge_async(int force)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;
	pgcnt_t npgs_to_purge;
	pgcnt_t npgs_purged = 0;
	int hlinks = 0;
	int hlix;
	pcache_link_t *hlinkp;
	pcache_link_t *hlnextp = NULL;
	int lowmem;
	int trim;

	ASSERT(seg_phashsize_win != 0);

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
		return;
	}

	if (!force) {
		lowmem = 0;
		trim = 0;
		if (freemem < lotsfree + needfree) {
			spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
			if (fmem <= 5 * (desfree >> 2)) {
				lowmem = 1;
			} else if (fmem <= 7 * (lotsfree >> 3)) {
				if (seg_plocked_window >=
				    (availrmem_initial >> 1)) {
					lowmem = 1;
				}
			} else if (fmem < lotsfree) {
				if (seg_plocked_window >=
				    3 * (availrmem_initial >> 2)) {
					lowmem = 1;
				}
			}
		}
		if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
			trim = 1;
		}
		if (!lowmem && !trim) {
			return;
		}
		npgs_to_purge = seg_plocked_window >>
		    seg_pshrink_shift;
		if (lowmem) {
			npgs_to_purge = MIN(npgs_to_purge,
			    MAX(seg_pmaxapurge_npages, desfree));
		} else {
			npgs_to_purge = MIN(npgs_to_purge,
			    seg_pmaxapurge_npages);
		}
		if (npgs_to_purge == 0) {
			return;
		}
	} else {
		struct seg_phash_wired *hpw;

		ASSERT(seg_phashsize_wired != 0);

		for (hpw = seg_phashtab_wired;
		    hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {

			if (hpw->p_hnext == (struct seg_pcache *)hpw) {
				continue;
			}

			mutex_enter(&hpw->p_hmutex);

			for (pcp = hpw->p_hnext;
			    pcp != (struct seg_pcache *)hpw;
			    pcp = pcp->p_hnext) {

				ASSERT(IS_PCP_WIRED(pcp));
				ASSERT(pcp->p_hashp ==
				    (struct seg_phash *)hpw);

				if (pcp->p_active) {
					continue;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			mutex_exit(&hpw->p_hmutex);
		}
	}

	mutex_enter(&seg_pmem_mtx);
	if (seg_pathr_on) {
		mutex_exit(&seg_pmem_mtx);
		goto runcb;
	}
	seg_pathr_on = 1;
	mutex_exit(&seg_pmem_mtx);
	ASSERT(seg_pahcur <= 1);
	hlix = !seg_pahcur;

again:
	for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
	    hlinkp = hlnextp) {

		hlnextp = hlinkp->p_lnext;
		ASSERT(hlnextp != NULL);

		hp = hlink2phash(hlinkp, hlix);
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_pathr_empty_ahb++;
			continue;
		}
		seg_pathr_full_ahb++;
		mutex_enter(&hp->p_hmutex);

		for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
		    pcp = pcp->p_hnext) {
			pcache_link_t *pheadp;
			pcache_link_t *plinkp;
			void *htag0;
			kmutex_t *pmtx;

			ASSERT(!IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_hashp == hp);

			if (pcp->p_active) {
				continue;
			}
			if (!force && pcp->p_ref &&
			    PCP_AGE(pcp) < seg_pmax_pcpage) {
				pcp->p_ref = 0;
				continue;
			}
			plinkp = &pcp->p_plink;
			htag0 = pcp->p_htag0;
			if (pcp->p_flags & SEGP_AMP) {
				pheadp = &((amp_t *)htag0)->a_phead;
				pmtx = &((amp_t *)htag0)->a_pmtx;
			} else {
				pheadp = &((seg_t *)htag0)->s_phead;
				pmtx = &((seg_t *)htag0)->s_pmtx;
			}
			if (!mutex_tryenter(pmtx)) {
				continue;
			}
			ASSERT(pheadp->p_lnext != pheadp);
			ASSERT(pheadp->p_lprev != pheadp);
			plinkp->p_lprev->p_lnext =
			    plinkp->p_lnext;
			plinkp->p_lnext->p_lprev =
			    plinkp->p_lprev;
			pcp->p_hprev->p_hnext = pcp->p_hnext;
			pcp->p_hnext->p_hprev = pcp->p_hprev;
			mutex_exit(pmtx);
			pcp->p_hprev = delcallb_list;
			delcallb_list = pcp;
			npgs_purged += btop(pcp->p_len);
		}
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_premove_abuck(hp, 1);
		}
		mutex_exit(&hp->p_hmutex);
		if (npgs_purged >= seg_plocked_window) {
			break;
		}
		if (!force) {
			if (npgs_purged >= npgs_to_purge) {
				break;
			}
			if (!trim && !(seg_pathr_full_ahb & 15)) {
				ASSERT(lowmem);
				if (freemem >= lotsfree + needfree) {
					break;
				}
			}
		}
	}

	if (hlinkp == &seg_pahhead[hlix]) {
		/*
		 * We processed the entire hlix active bucket list
		 * but didn't find enough pages to reclaim.
		 * Switch the lists and walk the other list
		 * if we haven't done it yet.
		 */
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur == !hlix);
		seg_pahcur = hlix;
		mutex_exit(&seg_pmem_mtx);
		if (++hlinks < 2) {
			hlix = !hlix;
			goto again;
		}
	} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
	    seg_pahhead[hlix].p_lnext != hlinkp) {
		ASSERT(hlinkp != NULL);
		ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);

		/*
		 * Reinsert the header to point to hlinkp
		 * so that we start from hlinkp bucket next time around.
		 */
		seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
		seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
		seg_pahhead[hlix].p_lnext = hlinkp;
		seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
		hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
		hlinkp->p_lprev = &seg_pahhead[hlix];
	}

	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_pathr_on);
	seg_pathr_on = 0;
	mutex_exit(&seg_pmem_mtx);

runcb:
	/*
	 * Run the delayed callback list. segments/amps can't go away until
	 * callback is executed since they must have non 0 softlockcnt. That's
	 * why we don't need to hold as/seg/amp locks to execute the callback.
	 */
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
		npages += btop(pcp->p_len);
		if (!IS_PCP_WIRED(pcp)) {
			npages_window += btop(pcp->p_len);
		}
		kmem_cache_free(seg_pkmcache, pcp);
	}
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		ASSERT(seg_plocked_window >= npages_window);
		seg_plocked -= npages;
		seg_plocked_window -= npages_window;
		mutex_exit(&seg_pmem_mtx);
	}
}

/*
 * Remove the cached entries for the segment(s) identified by the pp array
 * from the hash table. This is useful when multiple segs are cached on
 * behalf of a dummy segment (ISM/DISM) with a common pp array.
 */
void
seg_ppurge_wiredpp(struct page **pp)
{
	struct seg_pcache *pcp;
	struct seg_phash_wired *hp;
	pgcnt_t npages = 0;
	struct seg_pcache *delcallb_list = NULL;

	/*
	 * if the cache is empty, return
	 */
	if (seg_plocked == 0) {
		return;
	}
	ASSERT(seg_phashsize_wired != 0);

	for (hp = seg_phashtab_wired;
	    hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			continue;
		}
		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;
		while (pcp != (struct seg_pcache *)hp) {
			ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
			ASSERT(IS_PCP_WIRED(pcp));
			/*
			 * purge entries which are not active
			 */
			if (!pcp->p_active && pcp->p_pp == pp) {
				ASSERT(pcp->p_htag0 != NULL);
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			pcp = pcp->p_hnext;
		}
		mutex_exit(&hp->p_hmutex);
		/*
		 * segments can't go away until callback is executed since
		 * they must have non 0 softlockcnt. That's why we don't
		 * need to hold as/seg locks to execute the callback.
		 */
		while (delcallb_list != NULL) {
			int done;
			pcp = delcallb_list;
			delcallb_list = pcp->p_hprev;
			ASSERT(!pcp->p_active);
			done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
			    pcp->p_len, pcp->p_pp,
			    pcp->p_write ? S_WRITE : S_READ, 1);
			npages += btop(pcp->p_len);
			ASSERT(IS_PCP_WIRED(pcp));
			kmem_cache_free(seg_pkmcache, pcp);
			if (done) {
				ASSERT(delcallb_list == NULL);
				goto out;
			}
		}
	}

out:
	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_plocked >= npages);
	seg_plocked -= npages;
	mutex_exit(&seg_pmem_mtx);
}

/*
 * purge all entries for a given segment. Since we
 * callback into the segment driver directly for page
 * reclaim the caller needs to hold the right locks.
 */
void
seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	void *htag0;

	if (seg_plocked == 0) {
		return;
	}
	ASSERT(seg_phashsize_win != 0);

	/*
	 * If amp is not NULL use amp as a lookup tag otherwise use seg
	 * as a lookup tag.
	 */
	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
	ASSERT(htag0 != NULL);
	if (IS_PFLAGS_WIRED(flags)) {
		hp = P_HASHBP(seg, htag0, 0, flags);
		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;
		while (pcp != (struct seg_pcache *)hp) {
			ASSERT(pcp->p_hashp == hp);
			ASSERT(IS_PCP_WIRED(pcp));
			if (pcp->p_htag0 == htag0) {
				if (pcp->p_active) {
					break;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			pcp = pcp->p_hnext;
		}
		mutex_exit(&hp->p_hmutex);
	} else {
		pcache_link_t *plinkp;
		pcache_link_t *pheadp;
		kmutex_t *pmtx;

		if (amp == NULL) {
			ASSERT(seg != NULL);
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
		}
		mutex_enter(pmtx);
		while ((plinkp = pheadp->p_lnext) != pheadp) {
			pcp = plink2pcache(plinkp);
			ASSERT(!IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_htag0 == htag0);
			hp = pcp->p_hashp;
			mutex_enter(&hp->p_hmutex);
			if (pcp->p_active) {
				mutex_exit(&hp->p_hmutex);
				break;
			}
			ASSERT(plinkp->p_lprev == pheadp);
			pheadp->p_lnext = plinkp->p_lnext;
			plinkp->p_lnext->p_lprev = pheadp;
			pcp->p_hprev->p_hnext = pcp->p_hnext;
			pcp->p_hnext->p_hprev = pcp->p_hprev;
			pcp->p_hprev = delcallb_list;
			delcallb_list = pcp;
			if (hp->p_hnext == (struct seg_pcache *)hp) {
				seg_premove_abuck(hp, 0);
			}
			mutex_exit(&hp->p_hmutex);
		}
		mutex_exit(pmtx);
	}
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
		    pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
		npages += btop(pcp->p_len);
		kmem_cache_free(seg_pkmcache, pcp);
	}
	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_plocked >= npages);
	seg_plocked -= npages;
	if (!IS_PFLAGS_WIRED(flags)) {
		ASSERT(seg_plocked_window >= npages);
		seg_plocked_window -= npages;
	}
	mutex_exit(&seg_pmem_mtx);
}

static void seg_pinit_mem_config(void);

/*
 * setup the pagelock cache
 */
static void
seg_pinit(void)
{
	struct seg_phash *hp;
	ulong_t i;
	pgcnt_t physmegs;

	seg_plocked = 0;
	seg_plocked_window = 0;

	if (segpcache_enabled == 0) {
		seg_phashsize_win = 0;
		seg_phashsize_wired = 0;
		seg_pdisabled = 1;
		return;
	}

	seg_pdisabled = 0;
	seg_pkmcache = kmem_cache_create("seg_pcache",
	    sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (segpcache_pcp_maxage_ticks <= 0) {
		segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
	}
	seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
	seg_pathr_empty_ahb = 0;
	seg_pathr_full_ahb = 0;
	seg_pshrink_shift = segpcache_shrink_shift;
	seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);

	mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);

	physmegs = physmem >> (20 - PAGESHIFT);

	/*
	 * If segpcache_hashsize_win was not set in /etc/system or it has
	 * an absurd value, set it to a default.
	 */
	if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
		/*
		 * Create one bucket per 32K (or at least per 8 pages) of
		 * available memory.
		 */
		pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
		segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
	}
	if (!ISP2(segpcache_hashsize_win)) {
		ulong_t rndfac = ~(1UL <<
		    (highbit(segpcache_hashsize_win) - 1));
		rndfac &= segpcache_hashsize_win;
		segpcache_hashsize_win += rndfac;
		segpcache_hashsize_win = 1 <<
		    (highbit(segpcache_hashsize_win) - 1);
	}
	seg_phashsize_win = segpcache_hashsize_win;
	seg_phashtab_win = kmem_zalloc(
	    seg_phashsize_win * sizeof (struct seg_phash),
	    KM_SLEEP);
	for (i = 0; i < seg_phashsize_win; i++) {
		hp = &seg_phashtab_win[i];
		hp->p_hnext = (struct seg_pcache *)hp;
		hp->p_hprev = (struct seg_pcache *)hp;
		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
	}

	seg_pahcur = 0;
	seg_pathr_on = 0;
	seg_pahhead[0].p_lnext = &seg_pahhead[0];
	seg_pahhead[0].p_lprev = &seg_pahhead[0];
	seg_pahhead[1].p_lnext = &seg_pahhead[1];
	seg_pahhead[1].p_lprev = &seg_pahhead[1];

	/*
	 * If segpcache_hashsize_wired was not set in /etc/system or it has
	 * an absurd value, set it to a default.
	 */
	if (segpcache_hashsize_wired == 0 ||
	    segpcache_hashsize_wired > physmem / 4) {
		/*
		 * Choose segpcache_hashsize_wired based on physmem.
		 * Create a bucket per 128K bytes up to 256K buckets.
		 */
		if (physmegs < 20 * 1024) {
			segpcache_hashsize_wired = MAX(1024, physmegs << 3);
		} else {
			segpcache_hashsize_wired = 256 * 1024;
		}
	}
	if (!ISP2(segpcache_hashsize_wired)) {
		segpcache_hashsize_wired = 1 <<
		    highbit(segpcache_hashsize_wired);
	}
	seg_phashsize_wired = segpcache_hashsize_wired;
	seg_phashtab_wired = kmem_zalloc(
	    seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
	for (i = 0; i < seg_phashsize_wired; i++) {
		hp = (struct seg_phash *)&seg_phashtab_wired[i];
		hp->p_hnext = (struct seg_pcache *)hp;
		hp->p_hprev = (struct seg_pcache *)hp;
		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
	}

	if (segpcache_maxwindow == 0) {
		if (physmegs < 64) {
			/* 3% of memory */
			segpcache_maxwindow = availrmem >> 5;
		} else if (physmegs < 512) {
			/* 12% of memory */
			segpcache_maxwindow = availrmem >> 3;
		} else if (physmegs < 1024) {
			/* 25% of memory */
			segpcache_maxwindow = availrmem >> 2;
		} else if (physmegs < 2048) {
			/* 50% of memory */
			segpcache_maxwindow = availrmem >> 1;
		} else {
			/* no limit */
			segpcache_maxwindow = (pgcnt_t)-1;
		}
	}
	seg_pmaxwindow = segpcache_maxwindow;
	seg_pinit_mem_config();
}

/*
 * called by pageout if memory is low
 */
void
seg_preap(void)
{
	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plocked_window == 0) {
		return;
	}
	ASSERT(seg_phashsize_win != 0);

	/*
	 * If somebody is already purging pcache
	 * just return.
	 */
	if (seg_pdisabled) {
		return;
	}

	cv_signal(&seg_pasync_cv);
}

/*
 * run as a background thread and reclaim pagelock
 * pages which have not been used recently
 */
void
seg_pasync_thread(void)
{
	callb_cpr_t cpr_info;

	if (seg_phashsize_win == 0) {
		thread_exit();
		/*NOTREACHED*/
	}

	seg_pasync_thr = curthread;

	CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
	    callb_generic_cpr, "seg_pasync");

	if (segpcache_reap_ticks <= 0) {
		segpcache_reap_ticks = segpcache_reap_sec * hz;
	}

	mutex_enter(&seg_pasync_mtx);
	for (;;) {
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		(void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
		    segpcache_reap_ticks, TR_CLOCK_TICK);
		CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
		if (seg_pdisabled == 0) {
			seg_ppurge_async(0);
		}
	}
}

static struct kmem_cache *seg_cache;

/*
 * Initialize segment management data structures.
 */
void
seg_init(void)
{
	kstat_t *ksp;

	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
	    segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
	if (ksp) {
		ksp->ks_data = (void *)segadvstat_ptr;
		kstat_install(ksp);
	}

	seg_pinit();
}

/*
 * Allocate a segment to cover [base, base+size]
 * and attach it to the specified address space.
 */
struct seg *
seg_alloc(struct as *as, caddr_t base, size_t size)
{
	struct seg *new;
	caddr_t segbase;
	size_t segsize;

	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
	    (uintptr_t)segbase;

	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
		return ((struct seg *)NULL);	/* bad virtual addr range */

	if (as != &kas &&
	    valid_usr_range(segbase, segsize, 0, as,
	    as->a_userlimit) != RANGE_OKAY)
		return ((struct seg *)NULL);	/* bad virtual addr range */

	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
	new->s_ops = NULL;
	new->s_data = NULL;
	new->s_szc = 0;
	new->s_flags = 0;
	mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
	new->s_phead.p_lnext = &new->s_phead;
	new->s_phead.p_lprev = &new->s_phead;
	if (seg_attach(as, segbase, segsize, new) < 0) {
		kmem_cache_free(seg_cache, new);
		return ((struct seg *)NULL);
	}
	/* caller must fill in ops, data */
	return (new);
}

/*
 * Attach a segment to the address space. Used by seg_alloc()
 * and for kernel startup to attach to static segments.
 */
int
seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
{
	seg->s_as = as;
	seg->s_base = base;
	seg->s_size = size;

	/*
	 * as_addseg() will add the segment at the appropriate point
	 * in the list. It will return -1 if there is overlap with
	 * an already existing segment.
	 */
	return (as_addseg(as, seg));
}

/*
 * Unmap a segment and free it from its associated address space.
 * This should be called by anybody who's finished with a whole segment's
 * mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the
 * responsibility of the segment driver to unlink the segment
 * from the address space, and to free public and private data structures
 * associated with the segment. (This is typically done by a call to
 * seg_free()).
 */
void
seg_unmap(struct seg *seg)
{
#ifdef DEBUG
	int ret;
#endif /* DEBUG */

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));

	/* Shouldn't have called seg_unmap if mapping isn't yet established */
	ASSERT(seg->s_data != NULL);

	/* Unmap the whole mapping */
#ifdef DEBUG
	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
	ASSERT(ret == 0);
#else
	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
#endif /* DEBUG */
}

/*
 * Free the segment from its associated as. This should only be called
 * if a mapping to the segment has not yet been established (e.g., if
 * an error occurs in the middle of doing an as_map when the segment
 * has already been partially set up) or if it has already been deleted
 * (e.g., from a segment driver unmap routine if the unmap applies to the
 * entire segment). If the mapping is currently set up then seg_unmap() should
 * be called instead.
 */
void
seg_free(struct seg *seg)
{
	register struct as *as = seg->s_as;
	struct seg *tseg = as_removeseg(as, seg);

	ASSERT(tseg == seg);

	/*
	 * If the segment private data field is NULL,
	 * then segment driver is not attached yet.
	 */
	if (seg->s_data != NULL)
		SEGOP_FREE(seg);

	mutex_destroy(&seg->s_pmtx);
	ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
	ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
	kmem_cache_free(seg_cache, seg);
}

/*ARGSUSED*/
static void
seg_p_mem_config_post_add(
	void *arg,
	pgcnt_t delta_pages)
{
	/* Nothing to do. */
}

void
seg_p_enable(void)
{
	mutex_enter(&seg_pcache_mtx);
	ASSERT(seg_pdisabled != 0);
	seg_pdisabled--;
	mutex_exit(&seg_pcache_mtx);
}

/*
 * seg_p_disable - disables seg_pcache, and then attempts to empty the
 * cache.
 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
 * SEGP_FAIL if the cache could not be emptied.
 */
int
seg_p_disable(void)
{
	pgcnt_t old_plocked;
	int stall_count = 0;

	mutex_enter(&seg_pcache_mtx);
	seg_pdisabled++;
	ASSERT(seg_pdisabled != 0);
	mutex_exit(&seg_pcache_mtx);

	/*
	 * Attempt to empty the cache. Terminate if seg_plocked does not
	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
	 */
	while (seg_plocked != 0) {
		ASSERT(seg_phashsize_win != 0);
		old_plocked = seg_plocked;
		seg_ppurge_async(1);
		if (seg_plocked == old_plocked) {
			if (stall_count++ > SEGP_STALL_THRESHOLD) {
				return (SEGP_FAIL);
			}
		} else
			stall_count = 0;
		if (seg_plocked != 0)
			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
	}
	return (SEGP_SUCCESS);
}
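
/*
 * seg_p_disable()/seg_p_enable() nest: seg_pdisabled is a counter, so
 * overlapping callers each get their own disable/enable pairing. Note that
 * seg_p_disable() leaves the counter bumped even when it returns SEGP_FAIL;
 * the memory delete callbacks below rely on this, warning on failure in the
 * pre-delete hook and dropping the count again in the post-delete hook via
 * seg_p_enable().
 */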

/*
 * Attempt to purge seg_pcache. May need to return before this has
 * completed to allow other pre_del callbacks to unlock pages. This is
 * ok because:
 *	1) The seg_pdisabled flag has been set so at least we won't
 *	cache anymore locks and the locks we couldn't purge
 *	will not be held if they do get released by a subsequent
 *	pre-delete callback.
 *
 *	2) The rest of the memory delete thread processing does not
 *	depend on the changes made in this pre-delete callback. No
 *	panics will result, the worst that will happen is that the
 *	DR code will timeout and cancel the delete.
 */
/*ARGSUSED*/
static int
seg_p_mem_config_pre_del(
	void *arg,
	pgcnt_t delta_pages)
{
	if (seg_phashsize_win == 0) {
		return (0);
	}
	if (seg_p_disable() != SEGP_SUCCESS)
		cmn_err(CE_NOTE, "!Pre-delete couldn't purge"
		    " pagelock cache - continuing");
	return (0);
}

/*ARGSUSED*/
static void
seg_p_mem_config_post_del(
	void *arg,
	pgcnt_t delta_pages,
	int cancelled)
{
	if (seg_phashsize_win == 0) {
		return;
	}
	seg_p_enable();
}

static kphysm_setup_vector_t seg_p_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	seg_p_mem_config_post_add,
	seg_p_mem_config_pre_del,
	seg_p_mem_config_post_del,
};

static void
seg_pinit_mem_config(void)
{
	int ret;

	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
	/*
	 * Want to catch this in the debug kernel. At run time, if the
	 * callbacks don't get run all will be OK as the disable just makes
	 * it more likely that the pages can be collected.
	 */
	ASSERT(ret == 0);
}

/*
 * Verify that segment is not a shared anonymous segment which reserves
 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
 * from one zone to another if any segments are shared. This is because the
 * last process to exit will credit the swap reservation. This could lead
 * to the swap being reserved by one zone, and credited to another.
 */
boolean_t
seg_can_change_zones(struct seg *seg)
{
	struct segvn_data *svd;

	if (seg->s_ops == &segspt_shmops)
		return (B_FALSE);

	if (seg->s_ops == &segvn_ops) {
		svd = (struct segvn_data *)seg->s_data;
		if (svd->type == MAP_SHARED &&
		    svd->amp != NULL &&
		    svd->amp->swresv > 0)
			return (B_FALSE);
	}
	return (B_TRUE);
}

/*
 * Return swap reserved by a segment backing a private mapping.
 */
size_t
seg_swresv(struct seg *seg)
{
	struct segvn_data *svd;
	size_t swap = 0;

	if (seg->s_ops == &segvn_ops) {
		svd = (struct segvn_data *)seg->s_data;
		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
			swap = svd->swresv;
	}
	return (swap);
}

/*
 * General not supported function for SEGOP_INHERIT
 */
/* ARGSUSED */
int
seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
{
	return (ENOTSUP);
}