1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 /* 40 * VM - segment management. 41 */ 42 43 #include <sys/types.h> 44 #include <sys/inttypes.h> 45 #include <sys/t_lock.h> 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/kmem.h> 49 #include <sys/sysmacros.h> 50 #include <sys/vmsystm.h> 51 #include <sys/tuneable.h> 52 #include <sys/debug.h> 53 #include <sys/fs/swapnode.h> 54 #include <sys/cmn_err.h> 55 #include <sys/callb.h> 56 #include <sys/mem_config.h> 57 #include <sys/mman.h> 58 59 #include <vm/hat.h> 60 #include <vm/as.h> 61 #include <vm/seg.h> 62 #include <vm/seg_kmem.h> 63 #include <vm/seg_spt.h> 64 #include <vm/seg_vn.h> 65 #include <vm/anon.h> 66 67 /* 68 * kstats for segment advise 69 */ 70 segadvstat_t segadvstat = { 71 { "MADV_FREE_hit", KSTAT_DATA_ULONG }, 72 { "MADV_FREE_miss", KSTAT_DATA_ULONG }, 73 }; 74 75 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat; 76 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t); 77 78 /* 79 * entry in the segment page cache 80 */ 81 struct seg_pcache { 82 struct seg_pcache *p_hnext; /* list for hashed blocks */ 83 struct seg_pcache *p_hprev; 84 pcache_link_t p_plink; /* per segment/amp list */ 85 void *p_htag0; /* segment/amp pointer */ 86 caddr_t p_addr; /* base address/anon_idx */ 87 size_t p_len; /* total bytes */ 88 size_t p_wlen; /* writtable bytes at p_addr */ 89 struct page **p_pp; /* pp shadow list */ 90 seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */ 91 clock_t p_lbolt; /* lbolt from last use */ 92 struct seg_phash *p_hashp; /* our pcache hash bucket */ 93 uint_t p_active; /* active count */ 94 uchar_t p_write; /* true if S_WRITE */ 95 uchar_t p_ref; /* reference byte */ 96 ushort_t p_flags; /* bit flags */ 97 }; 98 99 struct seg_phash { 100 struct seg_pcache *p_hnext; /* list for hashed blocks */ 101 struct seg_pcache *p_hprev; 102 kmutex_t p_hmutex; /* protects hash bucket */ 103 pcache_link_t p_halink[2]; /* active bucket linkages */ 104 }; 105 106 struct seg_phash_wired { 107 struct seg_pcache *p_hnext; /* list for hashed blocks */ 108 struct seg_pcache *p_hprev; 109 kmutex_t p_hmutex; /* protects hash bucket */ 110 }; 111 112 /* 113 * A parameter to 
control a maximum number of bytes that can be 114 * purged from pcache at a time. 115 */ 116 #define P_MAX_APURGE_BYTES (1024 * 1024 * 1024) 117 118 /* 119 * log2(fraction of pcache to reclaim at a time). 120 */ 121 #define P_SHRINK_SHFT (5) 122 123 /* 124 * The following variables can be tuned via /etc/system. 125 */ 126 127 int segpcache_enabled = 1; /* if 1, shadow lists are cached */ 128 pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */ 129 ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */ 130 ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */ 131 int segpcache_reap_sec = 1; /* reap check rate in secs */ 132 clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */ 133 int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */ 134 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */ 135 int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */ 136 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */ 137 138 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */ 139 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */ 140 static kcondvar_t seg_pasync_cv; 141 142 #pragma align 64(pctrl1) 143 #pragma align 64(pctrl2) 144 #pragma align 64(pctrl3) 145 146 /* 147 * Keep frequently used variables together in one cache line. 148 */ 149 static struct p_ctrl1 { 150 uint_t p_disabled; /* if not 0, caching temporarily off */ 151 pgcnt_t p_maxwin; /* max # of pages that can be cached */ 152 size_t p_hashwin_sz; /* # of non wired buckets */ 153 struct seg_phash *p_htabwin; /* hash table for non wired entries */ 154 size_t p_hashwired_sz; /* # of wired buckets */ 155 struct seg_phash_wired *p_htabwired; /* hash table for wired entries */ 156 kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */ 157 #ifdef _LP64 158 ulong_t pad[1]; 159 #endif /* _LP64 */ 160 } pctrl1; 161 162 static struct p_ctrl2 { 163 kmutex_t p_mem_mtx; /* protects window counter and p_halinks */ 164 pgcnt_t p_locked_win; /* # pages from window */ 165 pgcnt_t p_locked; /* # of pages cached by pagelock */ 166 uchar_t p_ahcur; /* current active links for insert/delete */ 167 uchar_t p_athr_on; /* async reclaim thread is running. 
*/ 168 pcache_link_t p_ahhead[2]; /* active buckets linkages */ 169 } pctrl2; 170 171 static struct p_ctrl3 { 172 clock_t p_pcp_maxage; /* max pcp age in ticks */ 173 ulong_t p_athr_empty_ahb; /* athread walk stats */ 174 ulong_t p_athr_full_ahb; /* athread walk stats */ 175 pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */ 176 int p_shrink_shft; /* reap shift factor */ 177 #ifdef _LP64 178 ulong_t pad[3]; 179 #endif /* _LP64 */ 180 } pctrl3; 181 182 #define seg_pdisabled pctrl1.p_disabled 183 #define seg_pmaxwindow pctrl1.p_maxwin 184 #define seg_phashsize_win pctrl1.p_hashwin_sz 185 #define seg_phashtab_win pctrl1.p_htabwin 186 #define seg_phashsize_wired pctrl1.p_hashwired_sz 187 #define seg_phashtab_wired pctrl1.p_htabwired 188 #define seg_pkmcache pctrl1.p_kmcache 189 #define seg_pmem_mtx pctrl2.p_mem_mtx 190 #define seg_plocked_window pctrl2.p_locked_win 191 #define seg_plocked pctrl2.p_locked 192 #define seg_pahcur pctrl2.p_ahcur 193 #define seg_pathr_on pctrl2.p_athr_on 194 #define seg_pahhead pctrl2.p_ahhead 195 #define seg_pmax_pcpage pctrl3.p_pcp_maxage 196 #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb 197 #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb 198 #define seg_pshrink_shift pctrl3.p_shrink_shft 199 #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages 200 201 #define P_HASHWIN_MASK (seg_phashsize_win - 1) 202 #define P_HASHWIRED_MASK (seg_phashsize_wired - 1) 203 #define P_BASESHIFT (6) 204 205 kthread_t *seg_pasync_thr; 206 207 extern struct seg_ops segvn_ops; 208 extern struct seg_ops segspt_shmops; 209 210 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED) 211 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags) 212 213 #define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t))) 214 215 #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt) 216 217 /* 218 * htag0 argument can be a seg or amp pointer. 219 */ 220 #define P_HASHBP(seg, htag0, addr, flags) \ 221 (IS_PFLAGS_WIRED((flags)) ? \ 222 ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \ 223 ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \ 224 (&seg_phashtab_win[P_HASHWIN_MASK & \ 225 (((uintptr_t)(htag0) >> 3) ^ \ 226 ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \ 227 (flags >> 16) : page_get_shift((seg)->s_szc))))])) 228 229 /* 230 * htag0 argument can be a seg or amp pointer. 231 */ 232 #define P_MATCH(pcp, htag0, addr, len) \ 233 ((pcp)->p_htag0 == (htag0) && \ 234 (pcp)->p_addr == (addr) && \ 235 (pcp)->p_len >= (len)) 236 237 #define P_MATCH_PP(pcp, htag0, addr, len, pp) \ 238 ((pcp)->p_pp == (pp) && \ 239 (pcp)->p_htag0 == (htag0) && \ 240 (pcp)->p_addr == (addr) && \ 241 (pcp)->p_len >= (len)) 242 243 #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \ 244 offsetof(struct seg_pcache, p_plink))) 245 246 #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \ 247 offsetof(struct seg_phash, p_halink[l]))) 248 249 /* 250 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from 251 * active hash bucket lists. We maintain active bucket lists to reduce the 252 * overhead of finding active buckets during asynchronous purging since there 253 * can be 10s of millions of buckets on a large system but only a small subset 254 * of them in actual use. 255 * 256 * There're 2 active bucket lists. Current active list (as per seg_pahcur) is 257 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete 258 * buckets. The other list is used by asynchronous purge thread. 
This allows 259 * the purge thread to walk its active list without holding seg_pmem_mtx for a 260 * long time. When asynchronous thread is done with its list it switches to 261 * current active list and makes the list it just finished processing as 262 * current active list. 263 * 264 * seg_padd_abuck() only adds the bucket to current list if the bucket is not 265 * yet on any list. seg_premove_abuck() may remove the bucket from either 266 * list. If the bucket is on current list it will be always removed. Otherwise 267 * the bucket is only removed if asynchronous purge thread is not currently 268 * running or seg_premove_abuck() is called by asynchronous purge thread 269 * itself. A given bucket can only be on one of active lists at a time. These 270 * routines should be called with per bucket lock held. The routines use 271 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after 272 * the first entry is added to the bucket chain and seg_premove_abuck() must 273 * be called after the last pcp entry is deleted from its chain. Per bucket 274 * lock should be held by the callers. This avoids a potential race condition 275 * when seg_premove_abuck() removes a bucket after pcp entries are added to 276 * its list after the caller checked that the bucket has no entries. (this 277 * race would cause a loss of an active bucket from the active lists). 278 * 279 * Both lists are circular doubly linked lists anchored at seg_pahhead heads. 280 * New entries are added to the end of the list since LRU is used as the 281 * purging policy. 282 */ 283 static void 284 seg_padd_abuck(struct seg_phash *hp) 285 { 286 int lix; 287 288 ASSERT(MUTEX_HELD(&hp->p_hmutex)); 289 ASSERT((struct seg_phash *)hp->p_hnext != hp); 290 ASSERT((struct seg_phash *)hp->p_hprev != hp); 291 ASSERT(hp->p_hnext == hp->p_hprev); 292 ASSERT(!IS_PCP_WIRED(hp->p_hnext)); 293 ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp); 294 ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp); 295 ASSERT(hp >= seg_phashtab_win && 296 hp < &seg_phashtab_win[seg_phashsize_win]); 297 298 /* 299 * This bucket can already be on one of active lists 300 * since seg_premove_abuck() may have failed to remove it 301 * before. 302 */ 303 mutex_enter(&seg_pmem_mtx); 304 lix = seg_pahcur; 305 ASSERT(lix >= 0 && lix <= 1); 306 if (hp->p_halink[lix].p_lnext != NULL) { 307 ASSERT(hp->p_halink[lix].p_lprev != NULL); 308 ASSERT(hp->p_halink[!lix].p_lnext == NULL); 309 ASSERT(hp->p_halink[!lix].p_lprev == NULL); 310 mutex_exit(&seg_pmem_mtx); 311 return; 312 } 313 ASSERT(hp->p_halink[lix].p_lprev == NULL); 314 315 /* 316 * If this bucket is still on list !lix async thread can't yet remove 317 * it since we hold here per bucket lock. In this case just return 318 * since async thread will eventually find and process this bucket. 319 */ 320 if (hp->p_halink[!lix].p_lnext != NULL) { 321 ASSERT(hp->p_halink[!lix].p_lprev != NULL); 322 mutex_exit(&seg_pmem_mtx); 323 return; 324 } 325 ASSERT(hp->p_halink[!lix].p_lprev == NULL); 326 /* 327 * This bucket is not on any active bucket list yet. 328 * Add the bucket to the tail of current active list. 
329 */ 330 hp->p_halink[lix].p_lnext = &seg_pahhead[lix]; 331 hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev; 332 seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix]; 333 seg_pahhead[lix].p_lprev = &hp->p_halink[lix]; 334 mutex_exit(&seg_pmem_mtx); 335 } 336 337 static void 338 seg_premove_abuck(struct seg_phash *hp, int athr) 339 { 340 int lix; 341 342 ASSERT(MUTEX_HELD(&hp->p_hmutex)); 343 ASSERT((struct seg_phash *)hp->p_hnext == hp); 344 ASSERT((struct seg_phash *)hp->p_hprev == hp); 345 ASSERT(hp >= seg_phashtab_win && 346 hp < &seg_phashtab_win[seg_phashsize_win]); 347 348 if (athr) { 349 ASSERT(seg_pathr_on); 350 ASSERT(seg_pahcur <= 1); 351 /* 352 * We are called by asynchronous thread that found this bucket 353 * on not currently active (i.e. !seg_pahcur) list. Remove it 354 * from there. Per bucket lock we are holding makes sure 355 * seg_pinsert() can't sneak in and add pcp entries to this 356 * bucket right before we remove the bucket from its list. 357 */ 358 lix = !seg_pahcur; 359 ASSERT(hp->p_halink[lix].p_lnext != NULL); 360 ASSERT(hp->p_halink[lix].p_lprev != NULL); 361 ASSERT(hp->p_halink[!lix].p_lnext == NULL); 362 ASSERT(hp->p_halink[!lix].p_lprev == NULL); 363 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; 364 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; 365 hp->p_halink[lix].p_lnext = NULL; 366 hp->p_halink[lix].p_lprev = NULL; 367 return; 368 } 369 370 mutex_enter(&seg_pmem_mtx); 371 lix = seg_pahcur; 372 ASSERT(lix >= 0 && lix <= 1); 373 374 /* 375 * If the bucket is on currently active list just remove it from 376 * there. 377 */ 378 if (hp->p_halink[lix].p_lnext != NULL) { 379 ASSERT(hp->p_halink[lix].p_lprev != NULL); 380 ASSERT(hp->p_halink[!lix].p_lnext == NULL); 381 ASSERT(hp->p_halink[!lix].p_lprev == NULL); 382 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; 383 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; 384 hp->p_halink[lix].p_lnext = NULL; 385 hp->p_halink[lix].p_lprev = NULL; 386 mutex_exit(&seg_pmem_mtx); 387 return; 388 } 389 ASSERT(hp->p_halink[lix].p_lprev == NULL); 390 391 /* 392 * If asynchronous thread is not running we can remove the bucket from 393 * not currently active list. The bucket must be on this list since we 394 * already checked that it's not on the other list and the bucket from 395 * which we just deleted the last pcp entry must be still on one of the 396 * active bucket lists. 397 */ 398 lix = !lix; 399 ASSERT(hp->p_halink[lix].p_lnext != NULL); 400 ASSERT(hp->p_halink[lix].p_lprev != NULL); 401 402 if (!seg_pathr_on) { 403 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; 404 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; 405 hp->p_halink[lix].p_lnext = NULL; 406 hp->p_halink[lix].p_lprev = NULL; 407 } 408 mutex_exit(&seg_pmem_mtx); 409 } 410 411 /* 412 * Check if bucket pointed by hp already has a pcp entry that matches request 413 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise. 414 * Also delete matching entries that cover smaller address range but start 415 * at the same address as addr argument. Return the list of deleted entries if 416 * any. This is an internal helper function called from seg_pinsert() only 417 * for non wired shadow lists. The caller already holds a per seg/amp list 418 * lock. 
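 * Entries deleted here are returned to the caller chained through their
 * p_hprev pointers; the caller is expected to run the reclaim callback on
 * each of them and to drop the corresponding locked page counts.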
419 */ 420 static struct seg_pcache * 421 seg_plookup_checkdup(struct seg_phash *hp, void *htag0, 422 caddr_t addr, size_t len, int *found) 423 { 424 struct seg_pcache *pcp; 425 struct seg_pcache *delcallb_list = NULL; 426 427 ASSERT(MUTEX_HELD(&hp->p_hmutex)); 428 429 *found = 0; 430 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 431 pcp = pcp->p_hnext) { 432 ASSERT(pcp->p_hashp == hp); 433 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) { 434 ASSERT(!IS_PCP_WIRED(pcp)); 435 if (pcp->p_len < len) { 436 pcache_link_t *plinkp; 437 if (pcp->p_active) { 438 continue; 439 } 440 plinkp = &pcp->p_plink; 441 plinkp->p_lprev->p_lnext = plinkp->p_lnext; 442 plinkp->p_lnext->p_lprev = plinkp->p_lprev; 443 pcp->p_hprev->p_hnext = pcp->p_hnext; 444 pcp->p_hnext->p_hprev = pcp->p_hprev; 445 pcp->p_hprev = delcallb_list; 446 delcallb_list = pcp; 447 } else { 448 *found = 1; 449 break; 450 } 451 } 452 } 453 return (delcallb_list); 454 } 455 456 /* 457 * lookup an address range in pagelock cache. Return shadow list and bump up 458 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg 459 * as a lookup tag. 460 */ 461 struct page ** 462 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, 463 enum seg_rw rw, uint_t flags) 464 { 465 struct seg_pcache *pcp; 466 struct seg_phash *hp; 467 void *htag0; 468 469 ASSERT(seg != NULL); 470 ASSERT(rw == S_READ || rw == S_WRITE); 471 472 /* 473 * Skip pagelock cache, while DR is in progress or 474 * seg_pcache is off. 475 */ 476 if (seg_pdisabled) { 477 return (NULL); 478 } 479 ASSERT(seg_phashsize_win != 0); 480 481 htag0 = (amp == NULL ? (void *)seg : (void *)amp); 482 hp = P_HASHBP(seg, htag0, addr, flags); 483 mutex_enter(&hp->p_hmutex); 484 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 485 pcp = pcp->p_hnext) { 486 ASSERT(pcp->p_hashp == hp); 487 if (P_MATCH(pcp, htag0, addr, len)) { 488 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); 489 /* 490 * If this request wants to write pages 491 * but write permissions starting from 492 * addr don't cover the entire length len 493 * return lookup failure back to the caller. 494 * It will check protections and fail this 495 * pagelock operation with EACCESS error. 496 */ 497 if (rw == S_WRITE && pcp->p_wlen < len) { 498 break; 499 } 500 if (pcp->p_active == UINT_MAX) { 501 break; 502 } 503 pcp->p_active++; 504 if (rw == S_WRITE && !pcp->p_write) { 505 pcp->p_write = 1; 506 } 507 mutex_exit(&hp->p_hmutex); 508 return (pcp->p_pp); 509 } 510 } 511 mutex_exit(&hp->p_hmutex); 512 return (NULL); 513 } 514 515 /* 516 * mark address range inactive. If the cache is off or the address range is 517 * not in the cache or another shadow list that covers bigger range is found 518 * we call the segment driver to reclaim the pages. Otherwise just decrement 519 * active count and set ref bit. If amp is not NULL use amp as a lookup tag 520 * otherwise use seg as a lookup tag. 521 */ 522 void 523 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr, 524 size_t len, struct page **pp, enum seg_rw rw, uint_t flags, 525 seg_preclaim_cbfunc_t callback) 526 { 527 struct seg_pcache *pcp; 528 struct seg_phash *hp; 529 kmutex_t *pmtx = NULL; 530 pcache_link_t *pheadp; 531 void *htag0; 532 pgcnt_t npages = 0; 533 int keep = 0; 534 535 ASSERT(seg != NULL); 536 ASSERT(rw == S_READ || rw == S_WRITE); 537 538 htag0 = (amp == NULL ? (void *)seg : (void *)amp); 539 540 /* 541 * Skip lookup if pcache is not configured. 
	 */
	if (seg_phashsize_win == 0) {
		goto out;
	}

	/*
	 * Grab per seg/amp lock before hash lock if we are going to remove
	 * inactive entry from pcache.
	 */
	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
		}
		mutex_enter(pmtx);
	}

	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
again:
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_active);
			if (keep) {
				/*
				 * Don't remove this pcp entry
				 * if we didn't find duplicate
				 * shadow lists on second search.
				 * Somebody removed those duplicates
				 * since we dropped hash lock after first
				 * search.
				 */
				ASSERT(pmtx != NULL);
				ASSERT(!IS_PFLAGS_WIRED(flags));
				mutex_exit(pmtx);
				pmtx = NULL;
			}
			pcp->p_active--;
			if (pcp->p_active == 0 && (pmtx != NULL ||
			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {

				/*
				 * This entry is no longer active. Remove it
				 * now either because pcaching is temporarily
				 * disabled or there're other pcp entries that
				 * can match this pagelock request (i.e. this
				 * entry is a duplicate).
				 */

				ASSERT(callback == pcp->p_callback);
				if (pmtx != NULL) {
					pcache_link_t *plinkp = &pcp->p_plink;
					ASSERT(!IS_PCP_WIRED(pcp));
					ASSERT(pheadp->p_lnext != pheadp);
					ASSERT(pheadp->p_lprev != pheadp);
					plinkp->p_lprev->p_lnext =
					    plinkp->p_lnext;
					plinkp->p_lnext->p_lprev =
					    plinkp->p_lprev;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				if (!IS_PCP_WIRED(pcp) &&
				    hp->p_hnext == (struct seg_pcache *)hp) {
					/*
					 * We removed the last entry from this
					 * bucket. Now remove the bucket from
					 * its active list.
					 */
					seg_premove_abuck(hp, 0);
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				len = pcp->p_len;
				npages = btop(len);
				if (rw != S_WRITE && pcp->p_write) {
					rw = S_WRITE;
				}
				kmem_cache_free(seg_pkmcache, pcp);
				goto out;
			} else {
				/*
				 * We found a matching pcp entry but will not
				 * free it right away even if it's no longer
				 * active.
				 */
				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
					/*
					 * Set the reference bit and mark the
					 * time of last access to this pcp
					 * so that asynchronous thread doesn't
					 * free it immediately since
					 * it may be reactivated very soon.
					 */
					pcp->p_lbolt = ddi_get_lbolt();
					pcp->p_ref = 1;
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				return;
			}
		} else if (!IS_PFLAGS_WIRED(flags) &&
		    P_MATCH(pcp, htag0, addr, len)) {
			/*
			 * This is a duplicate pcp entry. This situation may
			 * happen if a bigger shadow list that covers our
			 * range was added while our entry was still active.
			 * Now we can free our pcp entry if it becomes
			 * inactive.
			 */
			if (!pcp->p_active) {
				/*
				 * Mark this entry as referenced just in case
				 * we'll free our own pcp entry soon.
				 */
				pcp->p_lbolt = ddi_get_lbolt();
				pcp->p_ref = 1;
			}
			if (pmtx != NULL) {
				/*
				 * we are already holding pmtx and found a
				 * duplicate. Don't keep our own pcp entry.
				 */
				keep = 0;
				continue;
			}
			/*
			 * We have to use mutex_tryenter to attempt to lock
			 * seg/amp list lock since we already hold hash lock
			 * and seg/amp list lock is above hash lock in lock
			 * order. If mutex_tryenter fails drop hash lock and
			 * retake both locks in correct order and re-search
			 * this hash chain.
			 */
			ASSERT(keep == 0);
			if (amp == NULL) {
				pheadp = &seg->s_phead;
				pmtx = &seg->s_pmtx;
			} else {
				pheadp = &amp->a_phead;
				pmtx = &amp->a_pmtx;
			}
			if (!mutex_tryenter(pmtx)) {
				mutex_exit(&hp->p_hmutex);
				mutex_enter(pmtx);
				mutex_enter(&hp->p_hmutex);
				/*
				 * If we don't find bigger shadow list on
				 * second search (it may happen since we
				 * dropped bucket lock) keep the entry that
				 * matches our own shadow list.
				 */
				keep = 1;
				goto again;
			}
		}
	}
	mutex_exit(&hp->p_hmutex);
	if (pmtx != NULL) {
		mutex_exit(pmtx);
	}
out:
	(*callback)(htag0, addr, len, pp, rw, 0);
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		seg_plocked -= npages;
		if (!IS_PFLAGS_WIRED(flags)) {
			ASSERT(seg_plocked_window >= npages);
			seg_plocked_window -= npages;
		}
		mutex_exit(&seg_pmem_mtx);
	}

}

#ifdef DEBUG
static uint32_t p_insert_chk_mtbf = 0;
#endif

/*
 * The seg_pinsert_check() is used by segment drivers to predict whether
 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
 */
/*ARGSUSED*/
int
seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, uint_t flags)
{
	ASSERT(seg != NULL);

#ifdef DEBUG
	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
		return (SEGP_FAIL);
	}
#endif

	if (seg_pdisabled) {
		return (SEGP_FAIL);
	}
	ASSERT(seg_phashsize_win != 0);

	if (IS_PFLAGS_WIRED(flags)) {
		return (SEGP_SUCCESS);
	}

	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
		return (SEGP_FAIL);
	}

	if (freemem < desfree) {
		return (SEGP_FAIL);
	}

	return (SEGP_SUCCESS);
}

#ifdef DEBUG
static uint32_t p_insert_mtbf = 0;
#endif

/*
 * Insert address range with shadow list into pagelock cache if there's no
 * shadow list already cached for this address range. If the cache is off or
 * caching is temporarily disabled or the allowed 'window' is exceeded return
 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
 *
 * For non wired shadow lists (segvn case) include address in the hashing
 * function to avoid linking all the entries from the same segment or amp on
 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
 * pcache entries are also linked on a per segment/amp list so that all
 * entries can be found quickly during seg/amp purge without walking the
 * entire pcache hash table. For wired shadow lists (segspt case) we
 * don't use address hashing and per segment linking because the caller
 * currently inserts only one entry per segment that covers the entire
 * segment. If we used per segment linking even for segspt it would complicate
 * seg_ppurge_wiredpp() locking.
 *
 * Both hash bucket and per seg/amp locks need to be held before adding a non
 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
 * first.
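 *
 * A simplified sketch of the expected calling sequence (modeled on the segvn
 * pagelock path; reclaim_cb and the parenthesized steps are placeholders, not
 * code from this file):
 *
 *	pplist = seg_plookup(seg, amp, addr, len, rw, 0);
 *	if (pplist != NULL)
 *		return (0);			(cache hit, reuse shadow list)
 *	if (seg_pinsert_check(seg, amp, addr, len, 0) == SEGP_FAIL)
 *		(lock the pages without caching and return)
 *	(build pplist and lock the pages it points to)
 *	if (seg_pinsert(seg, amp, addr, len, wlen, pplist, rw, 0,
 *	    reclaim_cb) == SEGP_FAIL)
 *		(fall back to the uncached path)
 *	...
 *	seg_pinactive(seg, amp, addr, len, pplist, rw, 0, reclaim_cb);
 *						(later, on unlock)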
 *
 * This function will also remove from pcache old inactive shadow lists that
 * overlap with this request but cover smaller range for the same start
 * address.
 */
int
seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages;
	pcache_link_t *pheadp;
	kmutex_t *pmtx;
	struct seg_pcache *delcallb_list = NULL;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);
	ASSERT(rw == S_READ || wlen == len);
	ASSERT(rw == S_WRITE || wlen <= len);
	ASSERT(amp == NULL || wlen == len);

#ifdef DEBUG
	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
		return (SEGP_FAIL);
	}
#endif

	if (seg_pdisabled) {
		return (SEGP_FAIL);
	}
	ASSERT(seg_phashsize_win != 0);

	ASSERT((len & PAGEOFFSET) == 0);
	npages = btop(len);
	mutex_enter(&seg_pmem_mtx);
	if (!IS_PFLAGS_WIRED(flags)) {
		if (seg_plocked_window + npages > seg_pmaxwindow) {
			mutex_exit(&seg_pmem_mtx);
			return (SEGP_FAIL);
		}
		seg_plocked_window += npages;
	}
	seg_plocked += npages;
	mutex_exit(&seg_pmem_mtx);

	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
	/*
	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
	 */
	if (amp == NULL) {
		pcp->p_htag0 = (void *)seg;
		pcp->p_flags = flags & 0xffff;
	} else {
		pcp->p_htag0 = (void *)amp;
		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
	}
	pcp->p_addr = addr;
	pcp->p_len = len;
	pcp->p_wlen = wlen;
	pcp->p_pp = pp;
	pcp->p_write = (rw == S_WRITE);
	pcp->p_callback = callback;
	pcp->p_active = 1;

	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
	if (!IS_PFLAGS_WIRED(flags)) {
		int found;
		void *htag0;
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
			htag0 = (void *)seg;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
			htag0 = (void *)amp;
		}
		mutex_enter(pmtx);
		mutex_enter(&hp->p_hmutex);
		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
		    len, &found);
		if (found) {
			mutex_exit(&hp->p_hmutex);
			mutex_exit(pmtx);
			mutex_enter(&seg_pmem_mtx);
			seg_plocked -= npages;
			seg_plocked_window -= npages;
			mutex_exit(&seg_pmem_mtx);
			kmem_cache_free(seg_pkmcache, pcp);
			goto out;
		}
		pcp->p_plink.p_lnext = pheadp->p_lnext;
		pcp->p_plink.p_lprev = pheadp;
		pheadp->p_lnext->p_lprev = &pcp->p_plink;
		pheadp->p_lnext = &pcp->p_plink;
	} else {
		mutex_enter(&hp->p_hmutex);
	}
	pcp->p_hashp = hp;
	pcp->p_hnext = hp->p_hnext;
	pcp->p_hprev = (struct seg_pcache *)hp;
	hp->p_hnext->p_hprev = pcp;
	hp->p_hnext = pcp;
	if (!IS_PFLAGS_WIRED(flags) &&
	    hp->p_hprev == pcp) {
		seg_padd_abuck(hp);
	}
	mutex_exit(&hp->p_hmutex);
	if (!IS_PFLAGS_WIRED(flags)) {
		mutex_exit(pmtx);
	}

out:
	npages = 0;
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_write ?
S_WRITE : S_READ, 0); 915 npages += btop(pcp->p_len); 916 kmem_cache_free(seg_pkmcache, pcp); 917 } 918 if (npages) { 919 ASSERT(!IS_PFLAGS_WIRED(flags)); 920 mutex_enter(&seg_pmem_mtx); 921 ASSERT(seg_plocked >= npages); 922 ASSERT(seg_plocked_window >= npages); 923 seg_plocked -= npages; 924 seg_plocked_window -= npages; 925 mutex_exit(&seg_pmem_mtx); 926 } 927 928 return (SEGP_SUCCESS); 929 } 930 931 /* 932 * purge entries from the pagelock cache if not active 933 * and not recently used. 934 */ 935 static void 936 seg_ppurge_async(int force) 937 { 938 struct seg_pcache *delcallb_list = NULL; 939 struct seg_pcache *pcp; 940 struct seg_phash *hp; 941 pgcnt_t npages = 0; 942 pgcnt_t npages_window = 0; 943 pgcnt_t npgs_to_purge; 944 pgcnt_t npgs_purged = 0; 945 int hlinks = 0; 946 int hlix; 947 pcache_link_t *hlinkp; 948 pcache_link_t *hlnextp = NULL; 949 int lowmem; 950 int trim; 951 952 ASSERT(seg_phashsize_win != 0); 953 954 /* 955 * if the cache is off or empty, return 956 */ 957 if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) { 958 return; 959 } 960 961 if (!force) { 962 lowmem = 0; 963 trim = 0; 964 if (freemem < lotsfree + needfree) { 965 spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0); 966 if (fmem <= 5 * (desfree >> 2)) { 967 lowmem = 1; 968 } else if (fmem <= 7 * (lotsfree >> 3)) { 969 if (seg_plocked_window >= 970 (availrmem_initial >> 1)) { 971 lowmem = 1; 972 } 973 } else if (fmem < lotsfree) { 974 if (seg_plocked_window >= 975 3 * (availrmem_initial >> 2)) { 976 lowmem = 1; 977 } 978 } 979 } 980 if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) { 981 trim = 1; 982 } 983 if (!lowmem && !trim) { 984 return; 985 } 986 npgs_to_purge = seg_plocked_window >> 987 seg_pshrink_shift; 988 if (lowmem) { 989 npgs_to_purge = MIN(npgs_to_purge, 990 MAX(seg_pmaxapurge_npages, desfree)); 991 } else { 992 npgs_to_purge = MIN(npgs_to_purge, 993 seg_pmaxapurge_npages); 994 } 995 if (npgs_to_purge == 0) { 996 return; 997 } 998 } else { 999 struct seg_phash_wired *hpw; 1000 1001 ASSERT(seg_phashsize_wired != 0); 1002 1003 for (hpw = seg_phashtab_wired; 1004 hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) { 1005 1006 if (hpw->p_hnext == (struct seg_pcache *)hpw) { 1007 continue; 1008 } 1009 1010 mutex_enter(&hpw->p_hmutex); 1011 1012 for (pcp = hpw->p_hnext; 1013 pcp != (struct seg_pcache *)hpw; 1014 pcp = pcp->p_hnext) { 1015 1016 ASSERT(IS_PCP_WIRED(pcp)); 1017 ASSERT(pcp->p_hashp == 1018 (struct seg_phash *)hpw); 1019 1020 if (pcp->p_active) { 1021 continue; 1022 } 1023 pcp->p_hprev->p_hnext = pcp->p_hnext; 1024 pcp->p_hnext->p_hprev = pcp->p_hprev; 1025 pcp->p_hprev = delcallb_list; 1026 delcallb_list = pcp; 1027 } 1028 mutex_exit(&hpw->p_hmutex); 1029 } 1030 } 1031 1032 mutex_enter(&seg_pmem_mtx); 1033 if (seg_pathr_on) { 1034 mutex_exit(&seg_pmem_mtx); 1035 goto runcb; 1036 } 1037 seg_pathr_on = 1; 1038 mutex_exit(&seg_pmem_mtx); 1039 ASSERT(seg_pahcur <= 1); 1040 hlix = !seg_pahcur; 1041 1042 again: 1043 for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix]; 1044 hlinkp = hlnextp) { 1045 1046 hlnextp = hlinkp->p_lnext; 1047 ASSERT(hlnextp != NULL); 1048 1049 hp = hlink2phash(hlinkp, hlix); 1050 if (hp->p_hnext == (struct seg_pcache *)hp) { 1051 seg_pathr_empty_ahb++; 1052 continue; 1053 } 1054 seg_pathr_full_ahb++; 1055 mutex_enter(&hp->p_hmutex); 1056 1057 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 1058 pcp = pcp->p_hnext) { 1059 pcache_link_t *pheadp; 1060 pcache_link_t *plinkp; 1061 void *htag0; 1062 kmutex_t *pmtx; 1063 1064 
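			/*
			 * Purge entries that are no longer active and, unless
			 * this is a forced purge, haven't been referenced
			 * since the last pass or have exceeded their maximum
			 * age. Entries whose per seg/amp list lock can't be
			 * taken without blocking are skipped and revisited on
			 * a later pass.
			 */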
ASSERT(!IS_PCP_WIRED(pcp)); 1065 ASSERT(pcp->p_hashp == hp); 1066 1067 if (pcp->p_active) { 1068 continue; 1069 } 1070 if (!force && pcp->p_ref && 1071 PCP_AGE(pcp) < seg_pmax_pcpage) { 1072 pcp->p_ref = 0; 1073 continue; 1074 } 1075 plinkp = &pcp->p_plink; 1076 htag0 = pcp->p_htag0; 1077 if (pcp->p_flags & SEGP_AMP) { 1078 pheadp = &((amp_t *)htag0)->a_phead; 1079 pmtx = &((amp_t *)htag0)->a_pmtx; 1080 } else { 1081 pheadp = &((seg_t *)htag0)->s_phead; 1082 pmtx = &((seg_t *)htag0)->s_pmtx; 1083 } 1084 if (!mutex_tryenter(pmtx)) { 1085 continue; 1086 } 1087 ASSERT(pheadp->p_lnext != pheadp); 1088 ASSERT(pheadp->p_lprev != pheadp); 1089 plinkp->p_lprev->p_lnext = 1090 plinkp->p_lnext; 1091 plinkp->p_lnext->p_lprev = 1092 plinkp->p_lprev; 1093 pcp->p_hprev->p_hnext = pcp->p_hnext; 1094 pcp->p_hnext->p_hprev = pcp->p_hprev; 1095 mutex_exit(pmtx); 1096 pcp->p_hprev = delcallb_list; 1097 delcallb_list = pcp; 1098 npgs_purged += btop(pcp->p_len); 1099 } 1100 if (hp->p_hnext == (struct seg_pcache *)hp) { 1101 seg_premove_abuck(hp, 1); 1102 } 1103 mutex_exit(&hp->p_hmutex); 1104 if (npgs_purged >= seg_plocked_window) { 1105 break; 1106 } 1107 if (!force) { 1108 if (npgs_purged >= npgs_to_purge) { 1109 break; 1110 } 1111 if (!trim && !(seg_pathr_full_ahb & 15)) { 1112 ASSERT(lowmem); 1113 if (freemem >= lotsfree + needfree) { 1114 break; 1115 } 1116 } 1117 } 1118 } 1119 1120 if (hlinkp == &seg_pahhead[hlix]) { 1121 /* 1122 * We processed the entire hlix active bucket list 1123 * but didn't find enough pages to reclaim. 1124 * Switch the lists and walk the other list 1125 * if we haven't done it yet. 1126 */ 1127 mutex_enter(&seg_pmem_mtx); 1128 ASSERT(seg_pathr_on); 1129 ASSERT(seg_pahcur == !hlix); 1130 seg_pahcur = hlix; 1131 mutex_exit(&seg_pmem_mtx); 1132 if (++hlinks < 2) { 1133 hlix = !hlix; 1134 goto again; 1135 } 1136 } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] && 1137 seg_pahhead[hlix].p_lnext != hlinkp) { 1138 ASSERT(hlinkp != NULL); 1139 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]); 1140 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]); 1141 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]); 1142 1143 /* 1144 * Reinsert the header to point to hlinkp 1145 * so that we start from hlinkp bucket next time around. 1146 */ 1147 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev; 1148 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext; 1149 seg_pahhead[hlix].p_lnext = hlinkp; 1150 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev; 1151 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix]; 1152 hlinkp->p_lprev = &seg_pahhead[hlix]; 1153 } 1154 1155 mutex_enter(&seg_pmem_mtx); 1156 ASSERT(seg_pathr_on); 1157 seg_pathr_on = 0; 1158 mutex_exit(&seg_pmem_mtx); 1159 1160 runcb: 1161 /* 1162 * Run the delayed callback list. segments/amps can't go away until 1163 * callback is executed since they must have non 0 softlockcnt. That's 1164 * why we don't need to hold as/seg/amp locks to execute the callback. 1165 */ 1166 while (delcallb_list != NULL) { 1167 pcp = delcallb_list; 1168 delcallb_list = pcp->p_hprev; 1169 ASSERT(!pcp->p_active); 1170 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 1171 pcp->p_len, pcp->p_pp, pcp->p_write ? 
S_WRITE : S_READ, 1); 1172 npages += btop(pcp->p_len); 1173 if (!IS_PCP_WIRED(pcp)) { 1174 npages_window += btop(pcp->p_len); 1175 } 1176 kmem_cache_free(seg_pkmcache, pcp); 1177 } 1178 if (npages) { 1179 mutex_enter(&seg_pmem_mtx); 1180 ASSERT(seg_plocked >= npages); 1181 ASSERT(seg_plocked_window >= npages_window); 1182 seg_plocked -= npages; 1183 seg_plocked_window -= npages_window; 1184 mutex_exit(&seg_pmem_mtx); 1185 } 1186 } 1187 1188 /* 1189 * Remove cached pages for segment(s) entries from hashtable. The segments 1190 * are identified by pp array. This is useful for multiple seg's cached on 1191 * behalf of dummy segment (ISM/DISM) with common pp array. 1192 */ 1193 void 1194 seg_ppurge_wiredpp(struct page **pp) 1195 { 1196 struct seg_pcache *pcp; 1197 struct seg_phash_wired *hp; 1198 pgcnt_t npages = 0; 1199 struct seg_pcache *delcallb_list = NULL; 1200 1201 /* 1202 * if the cache is empty, return 1203 */ 1204 if (seg_plocked == 0) { 1205 return; 1206 } 1207 ASSERT(seg_phashsize_wired != 0); 1208 1209 for (hp = seg_phashtab_wired; 1210 hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) { 1211 if (hp->p_hnext == (struct seg_pcache *)hp) { 1212 continue; 1213 } 1214 mutex_enter(&hp->p_hmutex); 1215 pcp = hp->p_hnext; 1216 while (pcp != (struct seg_pcache *)hp) { 1217 ASSERT(pcp->p_hashp == (struct seg_phash *)hp); 1218 ASSERT(IS_PCP_WIRED(pcp)); 1219 /* 1220 * purge entries which are not active 1221 */ 1222 if (!pcp->p_active && pcp->p_pp == pp) { 1223 ASSERT(pcp->p_htag0 != NULL); 1224 pcp->p_hprev->p_hnext = pcp->p_hnext; 1225 pcp->p_hnext->p_hprev = pcp->p_hprev; 1226 pcp->p_hprev = delcallb_list; 1227 delcallb_list = pcp; 1228 } 1229 pcp = pcp->p_hnext; 1230 } 1231 mutex_exit(&hp->p_hmutex); 1232 /* 1233 * segments can't go away until callback is executed since 1234 * they must have non 0 softlockcnt. That's why we don't 1235 * need to hold as/seg locks to execute the callback. 1236 */ 1237 while (delcallb_list != NULL) { 1238 int done; 1239 pcp = delcallb_list; 1240 delcallb_list = pcp->p_hprev; 1241 ASSERT(!pcp->p_active); 1242 done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 1243 pcp->p_len, pcp->p_pp, 1244 pcp->p_write ? S_WRITE : S_READ, 1); 1245 npages += btop(pcp->p_len); 1246 ASSERT(IS_PCP_WIRED(pcp)); 1247 kmem_cache_free(seg_pkmcache, pcp); 1248 if (done) { 1249 ASSERT(delcallb_list == NULL); 1250 goto out; 1251 } 1252 } 1253 } 1254 1255 out: 1256 mutex_enter(&seg_pmem_mtx); 1257 ASSERT(seg_plocked >= npages); 1258 seg_plocked -= npages; 1259 mutex_exit(&seg_pmem_mtx); 1260 } 1261 1262 /* 1263 * purge all entries for a given segment. Since we 1264 * callback into the segment driver directly for page 1265 * reclaim the caller needs to hold the right locks. 1266 */ 1267 void 1268 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags) 1269 { 1270 struct seg_pcache *delcallb_list = NULL; 1271 struct seg_pcache *pcp; 1272 struct seg_phash *hp; 1273 pgcnt_t npages = 0; 1274 void *htag0; 1275 1276 if (seg_plocked == 0) { 1277 return; 1278 } 1279 ASSERT(seg_phashsize_win != 0); 1280 1281 /* 1282 * If amp is not NULL use amp as a lookup tag otherwise use seg 1283 * as a lookup tag. 1284 */ 1285 htag0 = (amp == NULL ? 
	    (void *)seg : (void *)amp);
	ASSERT(htag0 != NULL);
	if (IS_PFLAGS_WIRED(flags)) {
		hp = P_HASHBP(seg, htag0, 0, flags);
		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;
		while (pcp != (struct seg_pcache *)hp) {
			ASSERT(pcp->p_hashp == hp);
			ASSERT(IS_PCP_WIRED(pcp));
			if (pcp->p_htag0 == htag0) {
				if (pcp->p_active) {
					break;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			pcp = pcp->p_hnext;
		}
		mutex_exit(&hp->p_hmutex);
	} else {
		pcache_link_t *plinkp;
		pcache_link_t *pheadp;
		kmutex_t *pmtx;

		if (amp == NULL) {
			ASSERT(seg != NULL);
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
		}
		mutex_enter(pmtx);
		while ((plinkp = pheadp->p_lnext) != pheadp) {
			pcp = plink2pcache(plinkp);
			ASSERT(!IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_htag0 == htag0);
			hp = pcp->p_hashp;
			mutex_enter(&hp->p_hmutex);
			if (pcp->p_active) {
				mutex_exit(&hp->p_hmutex);
				break;
			}
			ASSERT(plinkp->p_lprev == pheadp);
			pheadp->p_lnext = plinkp->p_lnext;
			plinkp->p_lnext->p_lprev = pheadp;
			pcp->p_hprev->p_hnext = pcp->p_hnext;
			pcp->p_hnext->p_hprev = pcp->p_hprev;
			pcp->p_hprev = delcallb_list;
			delcallb_list = pcp;
			if (hp->p_hnext == (struct seg_pcache *)hp) {
				seg_premove_abuck(hp, 0);
			}
			mutex_exit(&hp->p_hmutex);
		}
		mutex_exit(pmtx);
	}
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
		    pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
		npages += btop(pcp->p_len);
		kmem_cache_free(seg_pkmcache, pcp);
	}
	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_plocked >= npages);
	seg_plocked -= npages;
	if (!IS_PFLAGS_WIRED(flags)) {
		ASSERT(seg_plocked_window >= npages);
		seg_plocked_window -= npages;
	}
	mutex_exit(&seg_pmem_mtx);
}

static void seg_pinit_mem_config(void);

/*
 * setup the pagelock cache
 */
static void
seg_pinit(void)
{
	struct seg_phash *hp;
	ulong_t i;
	pgcnt_t physmegs;

	seg_plocked = 0;
	seg_plocked_window = 0;

	if (segpcache_enabled == 0) {
		seg_phashsize_win = 0;
		seg_phashsize_wired = 0;
		seg_pdisabled = 1;
		return;
	}

	seg_pdisabled = 0;
	seg_pkmcache = kmem_cache_create("seg_pcache",
	    sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (segpcache_pcp_maxage_ticks <= 0) {
		segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
	}
	seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
	seg_pathr_empty_ahb = 0;
	seg_pathr_full_ahb = 0;
	seg_pshrink_shift = segpcache_shrink_shift;
	seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);

	mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);

	physmegs = physmem >> (20 - PAGESHIFT);

	/*
	 * If segpcache_hashsize_win was not set in /etc/system or it has
	 * absurd value set it to a default.
1407 */ 1408 if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) { 1409 /* 1410 * Create one bucket per 32K (or at least per 8 pages) of 1411 * available memory. 1412 */ 1413 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8); 1414 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket); 1415 } 1416 if (!ISP2(segpcache_hashsize_win)) { 1417 ulong_t rndfac = ~(1UL << 1418 (highbit(segpcache_hashsize_win) - 1)); 1419 rndfac &= segpcache_hashsize_win; 1420 segpcache_hashsize_win += rndfac; 1421 segpcache_hashsize_win = 1 << 1422 (highbit(segpcache_hashsize_win) - 1); 1423 } 1424 seg_phashsize_win = segpcache_hashsize_win; 1425 seg_phashtab_win = kmem_zalloc( 1426 seg_phashsize_win * sizeof (struct seg_phash), 1427 KM_SLEEP); 1428 for (i = 0; i < seg_phashsize_win; i++) { 1429 hp = &seg_phashtab_win[i]; 1430 hp->p_hnext = (struct seg_pcache *)hp; 1431 hp->p_hprev = (struct seg_pcache *)hp; 1432 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 1433 } 1434 1435 seg_pahcur = 0; 1436 seg_pathr_on = 0; 1437 seg_pahhead[0].p_lnext = &seg_pahhead[0]; 1438 seg_pahhead[0].p_lprev = &seg_pahhead[0]; 1439 seg_pahhead[1].p_lnext = &seg_pahhead[1]; 1440 seg_pahhead[1].p_lprev = &seg_pahhead[1]; 1441 1442 /* 1443 * If segpcache_hashsize_wired was not set in /etc/system or it has 1444 * absurd value set it to a default. 1445 */ 1446 if (segpcache_hashsize_wired == 0 || 1447 segpcache_hashsize_wired > physmem / 4) { 1448 /* 1449 * Choose segpcache_hashsize_wired based on physmem. 1450 * Create a bucket per 128K bytes upto 256K buckets. 1451 */ 1452 if (physmegs < 20 * 1024) { 1453 segpcache_hashsize_wired = MAX(1024, physmegs << 3); 1454 } else { 1455 segpcache_hashsize_wired = 256 * 1024; 1456 } 1457 } 1458 if (!ISP2(segpcache_hashsize_wired)) { 1459 segpcache_hashsize_wired = 1 << 1460 highbit(segpcache_hashsize_wired); 1461 } 1462 seg_phashsize_wired = segpcache_hashsize_wired; 1463 seg_phashtab_wired = kmem_zalloc( 1464 seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP); 1465 for (i = 0; i < seg_phashsize_wired; i++) { 1466 hp = (struct seg_phash *)&seg_phashtab_wired[i]; 1467 hp->p_hnext = (struct seg_pcache *)hp; 1468 hp->p_hprev = (struct seg_pcache *)hp; 1469 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 1470 } 1471 1472 if (segpcache_maxwindow == 0) { 1473 if (physmegs < 64) { 1474 /* 3% of memory */ 1475 segpcache_maxwindow = availrmem >> 5; 1476 } else if (physmegs < 512) { 1477 /* 12% of memory */ 1478 segpcache_maxwindow = availrmem >> 3; 1479 } else if (physmegs < 1024) { 1480 /* 25% of memory */ 1481 segpcache_maxwindow = availrmem >> 2; 1482 } else if (physmegs < 2048) { 1483 /* 50% of memory */ 1484 segpcache_maxwindow = availrmem >> 1; 1485 } else { 1486 /* no limit */ 1487 segpcache_maxwindow = (pgcnt_t)-1; 1488 } 1489 } 1490 seg_pmaxwindow = segpcache_maxwindow; 1491 seg_pinit_mem_config(); 1492 } 1493 1494 /* 1495 * called by pageout if memory is low 1496 */ 1497 void 1498 seg_preap(void) 1499 { 1500 /* 1501 * if the cache is off or empty, return 1502 */ 1503 if (seg_plocked_window == 0) { 1504 return; 1505 } 1506 ASSERT(seg_phashsize_win != 0); 1507 1508 /* 1509 * If somebody is already purging pcache 1510 * just return. 
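	 * (seg_pdisabled is also raised by seg_p_disable() while a DR
	 * memory delete is in progress.)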
1511 */ 1512 if (seg_pdisabled) { 1513 return; 1514 } 1515 1516 cv_signal(&seg_pasync_cv); 1517 } 1518 1519 /* 1520 * run as a backgroud thread and reclaim pagelock 1521 * pages which have not been used recently 1522 */ 1523 void 1524 seg_pasync_thread(void) 1525 { 1526 callb_cpr_t cpr_info; 1527 1528 if (seg_phashsize_win == 0) { 1529 thread_exit(); 1530 /*NOTREACHED*/ 1531 } 1532 1533 seg_pasync_thr = curthread; 1534 1535 CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx, 1536 callb_generic_cpr, "seg_pasync"); 1537 1538 if (segpcache_reap_ticks <= 0) { 1539 segpcache_reap_ticks = segpcache_reap_sec * hz; 1540 } 1541 1542 mutex_enter(&seg_pasync_mtx); 1543 for (;;) { 1544 CALLB_CPR_SAFE_BEGIN(&cpr_info); 1545 (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx, 1546 segpcache_reap_ticks, TR_CLOCK_TICK); 1547 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx); 1548 if (seg_pdisabled == 0) { 1549 seg_ppurge_async(0); 1550 } 1551 } 1552 } 1553 1554 static struct kmem_cache *seg_cache; 1555 1556 /* 1557 * Initialize segment management data structures. 1558 */ 1559 void 1560 seg_init(void) 1561 { 1562 kstat_t *ksp; 1563 1564 seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), 1565 0, NULL, NULL, NULL, NULL, NULL, 0); 1566 1567 ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED, 1568 segadvstat_ndata, KSTAT_FLAG_VIRTUAL); 1569 if (ksp) { 1570 ksp->ks_data = (void *)segadvstat_ptr; 1571 kstat_install(ksp); 1572 } 1573 1574 seg_pinit(); 1575 } 1576 1577 /* 1578 * Allocate a segment to cover [base, base+size] 1579 * and attach it to the specified address space. 1580 */ 1581 struct seg * 1582 seg_alloc(struct as *as, caddr_t base, size_t size) 1583 { 1584 struct seg *new; 1585 caddr_t segbase; 1586 size_t segsize; 1587 1588 segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK); 1589 segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) - 1590 (uintptr_t)segbase; 1591 1592 if (!valid_va_range(&segbase, &segsize, segsize, AH_LO)) 1593 return ((struct seg *)NULL); /* bad virtual addr range */ 1594 1595 if (as != &kas && 1596 valid_usr_range(segbase, segsize, 0, as, 1597 as->a_userlimit) != RANGE_OKAY) 1598 return ((struct seg *)NULL); /* bad virtual addr range */ 1599 1600 new = kmem_cache_alloc(seg_cache, KM_SLEEP); 1601 new->s_ops = NULL; 1602 new->s_data = NULL; 1603 new->s_szc = 0; 1604 new->s_flags = 0; 1605 mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL); 1606 new->s_phead.p_lnext = &new->s_phead; 1607 new->s_phead.p_lprev = &new->s_phead; 1608 if (seg_attach(as, segbase, segsize, new) < 0) { 1609 kmem_cache_free(seg_cache, new); 1610 return ((struct seg *)NULL); 1611 } 1612 /* caller must fill in ops, data */ 1613 return (new); 1614 } 1615 1616 /* 1617 * Attach a segment to the address space. Used by seg_alloc() 1618 * and for kernel startup to attach to static segments. 1619 */ 1620 int 1621 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg) 1622 { 1623 seg->s_as = as; 1624 seg->s_base = base; 1625 seg->s_size = size; 1626 1627 /* 1628 * as_addseg() will add the segment at the appropraite point 1629 * in the list. It will return -1 if there is overlap with 1630 * an already existing segment. 1631 */ 1632 return (as_addseg(as, seg)); 1633 } 1634 1635 /* 1636 * Unmap a segment and free it from its associated address space. 1637 * This should be called by anybody who's finished with a whole segment's 1638 * mapping. Just calls SEGOP_UNMAP() on the whole mapping . 
It is the 1639 * responsibility of the segment driver to unlink the the segment 1640 * from the address space, and to free public and private data structures 1641 * associated with the segment. (This is typically done by a call to 1642 * seg_free()). 1643 */ 1644 void 1645 seg_unmap(struct seg *seg) 1646 { 1647 #ifdef DEBUG 1648 int ret; 1649 #endif /* DEBUG */ 1650 1651 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1652 1653 /* Shouldn't have called seg_unmap if mapping isn't yet established */ 1654 ASSERT(seg->s_data != NULL); 1655 1656 /* Unmap the whole mapping */ 1657 #ifdef DEBUG 1658 ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 1659 ASSERT(ret == 0); 1660 #else 1661 SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 1662 #endif /* DEBUG */ 1663 } 1664 1665 /* 1666 * Free the segment from its associated as. This should only be called 1667 * if a mapping to the segment has not yet been established (e.g., if 1668 * an error occurs in the middle of doing an as_map when the segment 1669 * has already been partially set up) or if it has already been deleted 1670 * (e.g., from a segment driver unmap routine if the unmap applies to the 1671 * entire segment). If the mapping is currently set up then seg_unmap() should 1672 * be called instead. 1673 */ 1674 void 1675 seg_free(struct seg *seg) 1676 { 1677 register struct as *as = seg->s_as; 1678 struct seg *tseg = as_removeseg(as, seg); 1679 1680 ASSERT(tseg == seg); 1681 1682 /* 1683 * If the segment private data field is NULL, 1684 * then segment driver is not attached yet. 1685 */ 1686 if (seg->s_data != NULL) 1687 SEGOP_FREE(seg); 1688 1689 mutex_destroy(&seg->s_pmtx); 1690 ASSERT(seg->s_phead.p_lnext == &seg->s_phead); 1691 ASSERT(seg->s_phead.p_lprev == &seg->s_phead); 1692 kmem_cache_free(seg_cache, seg); 1693 } 1694 1695 /*ARGSUSED*/ 1696 static void 1697 seg_p_mem_config_post_add( 1698 void *arg, 1699 pgcnt_t delta_pages) 1700 { 1701 /* Nothing to do. */ 1702 } 1703 1704 void 1705 seg_p_enable(void) 1706 { 1707 mutex_enter(&seg_pcache_mtx); 1708 ASSERT(seg_pdisabled != 0); 1709 seg_pdisabled--; 1710 mutex_exit(&seg_pcache_mtx); 1711 } 1712 1713 /* 1714 * seg_p_disable - disables seg_pcache, and then attempts to empty the 1715 * cache. 1716 * Returns SEGP_SUCCESS if the cache was successfully emptied, or 1717 * SEGP_FAIL if the cache could not be emptied. 1718 */ 1719 int 1720 seg_p_disable(void) 1721 { 1722 pgcnt_t old_plocked; 1723 int stall_count = 0; 1724 1725 mutex_enter(&seg_pcache_mtx); 1726 seg_pdisabled++; 1727 ASSERT(seg_pdisabled != 0); 1728 mutex_exit(&seg_pcache_mtx); 1729 1730 /* 1731 * Attempt to empty the cache. Terminate if seg_plocked does not 1732 * diminish with SEGP_STALL_THRESHOLD consecutive attempts. 1733 */ 1734 while (seg_plocked != 0) { 1735 ASSERT(seg_phashsize_win != 0); 1736 old_plocked = seg_plocked; 1737 seg_ppurge_async(1); 1738 if (seg_plocked == old_plocked) { 1739 if (stall_count++ > SEGP_STALL_THRESHOLD) { 1740 return (SEGP_FAIL); 1741 } 1742 } else 1743 stall_count = 0; 1744 if (seg_plocked != 0) 1745 delay(hz/SEGP_PREDEL_DELAY_FACTOR); 1746 } 1747 return (SEGP_SUCCESS); 1748 } 1749 1750 /* 1751 * Attempt to purge seg_pcache. May need to return before this has 1752 * completed to allow other pre_del callbacks to unlock pages. This is 1753 * ok because: 1754 * 1) The seg_pdisabled flag has been set so at least we won't 1755 * cache anymore locks and the locks we couldn't purge 1756 * will not be held if they do get released by a subsequent 1757 * pre-delete callback. 
1758 * 1759 * 2) The rest of the memory delete thread processing does not 1760 * depend on the changes made in this pre-delete callback. No 1761 * panics will result, the worst that will happen is that the 1762 * DR code will timeout and cancel the delete. 1763 */ 1764 /*ARGSUSED*/ 1765 static int 1766 seg_p_mem_config_pre_del( 1767 void *arg, 1768 pgcnt_t delta_pages) 1769 { 1770 if (seg_phashsize_win == 0) { 1771 return (0); 1772 } 1773 if (seg_p_disable() != SEGP_SUCCESS) 1774 cmn_err(CE_NOTE, 1775 "!Pre-delete couldn't purge"" pagelock cache - continuing"); 1776 return (0); 1777 } 1778 1779 /*ARGSUSED*/ 1780 static void 1781 seg_p_mem_config_post_del( 1782 void *arg, 1783 pgcnt_t delta_pages, 1784 int cancelled) 1785 { 1786 if (seg_phashsize_win == 0) { 1787 return; 1788 } 1789 seg_p_enable(); 1790 } 1791 1792 static kphysm_setup_vector_t seg_p_mem_config_vec = { 1793 KPHYSM_SETUP_VECTOR_VERSION, 1794 seg_p_mem_config_post_add, 1795 seg_p_mem_config_pre_del, 1796 seg_p_mem_config_post_del, 1797 }; 1798 1799 static void 1800 seg_pinit_mem_config(void) 1801 { 1802 int ret; 1803 1804 ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL); 1805 /* 1806 * Want to catch this in the debug kernel. At run time, if the 1807 * callbacks don't get run all will be OK as the disable just makes 1808 * it more likely that the pages can be collected. 1809 */ 1810 ASSERT(ret == 0); 1811 } 1812 1813 /* 1814 * Verify that segment is not a shared anonymous segment which reserves 1815 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered 1816 * from one zone to another if any segments are shared. This is because the 1817 * last process to exit will credit the swap reservation. This could lead 1818 * to the swap being reserved by one zone, and credited to another. 1819 */ 1820 boolean_t 1821 seg_can_change_zones(struct seg *seg) 1822 { 1823 struct segvn_data *svd; 1824 1825 if (seg->s_ops == &segspt_shmops) 1826 return (B_FALSE); 1827 1828 if (seg->s_ops == &segvn_ops) { 1829 svd = (struct segvn_data *)seg->s_data; 1830 if (svd->type == MAP_SHARED && 1831 svd->amp != NULL && 1832 svd->amp->swresv > 0) 1833 return (B_FALSE); 1834 } 1835 return (B_TRUE); 1836 } 1837 1838 /* 1839 * Return swap reserved by a segment backing a private mapping. 1840 */ 1841 size_t 1842 seg_swresv(struct seg *seg) 1843 { 1844 struct segvn_data *svd; 1845 size_t swap = 0; 1846 1847 if (seg->s_ops == &segvn_ops) { 1848 svd = (struct segvn_data *)seg->s_data; 1849 if (svd->type == MAP_PRIVATE && svd->swresv > 0) 1850 swap = svd->swresv; 1851 } 1852 return (swap); 1853 } 1854