1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - segment management. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/inttypes.h> 47 #include <sys/t_lock.h> 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/kmem.h> 51 #include <sys/sysmacros.h> 52 #include <sys/vmsystm.h> 53 #include <sys/tuneable.h> 54 #include <sys/debug.h> 55 #include <sys/fs/swapnode.h> 56 #include <sys/cmn_err.h> 57 #include <sys/callb.h> 58 #include <sys/mem_config.h> 59 #include <sys/mman.h> 60 61 #include <vm/hat.h> 62 #include <vm/as.h> 63 #include <vm/seg.h> 64 #include <vm/seg_kmem.h> 65 #include <vm/seg_spt.h> 66 #include <vm/seg_vn.h> 67 #include <vm/anon.h> 68 69 /* 70 * kstats for segment advise 71 */ 72 segadvstat_t segadvstat = { 73 { "MADV_FREE_hit", KSTAT_DATA_ULONG }, 74 { "MADV_FREE_miss", KSTAT_DATA_ULONG }, 75 }; 76 77 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat; 78 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t); 79 80 /* 81 * entry in the segment page cache 82 */ 83 struct seg_pcache { 84 struct seg_pcache *p_hnext; /* list for hashed blocks */ 85 struct seg_pcache *p_hprev; 86 pcache_link_t p_plink; /* per segment/amp list */ 87 void *p_htag0; /* segment/amp pointer */ 88 caddr_t p_addr; /* base address/anon_idx */ 89 size_t p_len; /* total bytes */ 90 size_t p_wlen; /* writtable bytes at p_addr */ 91 struct page **p_pp; /* pp shadow list */ 92 seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */ 93 clock_t p_lbolt; /* lbolt from last use */ 94 struct seg_phash *p_hashp; /* our pcache hash bucket */ 95 uint_t p_active; /* active count */ 96 uchar_t p_write; /* true if S_WRITE */ 97 uchar_t p_ref; /* reference byte */ 98 ushort_t p_flags; /* bit flags */ 99 }; 100 101 struct seg_phash { 102 struct seg_pcache *p_hnext; /* list for hashed blocks */ 103 struct seg_pcache *p_hprev; 104 kmutex_t p_hmutex; /* protects hash bucket */ 105 pcache_link_t p_halink[2]; /* active bucket linkages */ 106 }; 107 108 struct seg_phash_wired { 109 struct seg_pcache *p_hnext; /* list for hashed blocks */ 110 struct seg_pcache *p_hprev; 111 kmutex_t p_hmutex; /* protects hash bucket */ 
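/* no p_halink[] here: wired buckets are never placed on the active bucket lists */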
112 }; 113 114 /* 115 * A parameter to control a maximum number of bytes that can be 116 * purged from pcache at a time. 117 */ 118 #define P_MAX_APURGE_BYTES (1024 * 1024 * 1024) 119 120 /* 121 * log2(fraction of pcache to reclaim at a time). 122 */ 123 #define P_SHRINK_SHFT (5) 124 125 /* 126 * The following variables can be tuned via /etc/system. 127 */ 128 129 int segpcache_enabled = 1; /* if 1, shadow lists are cached */ 130 pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */ 131 ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */ 132 ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */ 133 int segpcache_reap_sec = 1; /* reap check rate in secs */ 134 clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */ 135 int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */ 136 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */ 137 int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */ 138 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */ 139 140 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */ 141 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */ 142 static kcondvar_t seg_pasync_cv; 143 144 #pragma align 64(pctrl1) 145 #pragma align 64(pctrl2) 146 #pragma align 64(pctrl3) 147 148 /* 149 * Keep frequently used variables together in one cache line. 150 */ 151 static struct p_ctrl1 { 152 uint_t p_disabled; /* if not 0, caching temporarily off */ 153 pgcnt_t p_maxwin; /* max # of pages that can be cached */ 154 size_t p_hashwin_sz; /* # of non wired buckets */ 155 struct seg_phash *p_htabwin; /* hash table for non wired entries */ 156 size_t p_hashwired_sz; /* # of wired buckets */ 157 struct seg_phash_wired *p_htabwired; /* hash table for wired entries */ 158 kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */ 159 #ifdef _LP64 160 ulong_t pad[1]; 161 #endif /* _LP64 */ 162 } pctrl1; 163 164 static struct p_ctrl2 { 165 kmutex_t p_mem_mtx; /* protects window counter and p_halinks */ 166 pgcnt_t p_locked_win; /* # pages from window */ 167 pgcnt_t p_locked; /* # of pages cached by pagelock */ 168 uchar_t p_ahcur; /* current active links for insert/delete */ 169 uchar_t p_athr_on; /* async reclaim thread is running. 
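(set and cleared under seg_pmem_mtx)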
*/ 170 pcache_link_t p_ahhead[2]; /* active buckets linkages */ 171 } pctrl2; 172 173 static struct p_ctrl3 { 174 clock_t p_pcp_maxage; /* max pcp age in ticks */ 175 ulong_t p_athr_empty_ahb; /* athread walk stats */ 176 ulong_t p_athr_full_ahb; /* athread walk stats */ 177 pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */ 178 int p_shrink_shft; /* reap shift factor */ 179 #ifdef _LP64 180 ulong_t pad[3]; 181 #endif /* _LP64 */ 182 } pctrl3; 183 184 #define seg_pdisabled pctrl1.p_disabled 185 #define seg_pmaxwindow pctrl1.p_maxwin 186 #define seg_phashsize_win pctrl1.p_hashwin_sz 187 #define seg_phashtab_win pctrl1.p_htabwin 188 #define seg_phashsize_wired pctrl1.p_hashwired_sz 189 #define seg_phashtab_wired pctrl1.p_htabwired 190 #define seg_pkmcache pctrl1.p_kmcache 191 #define seg_pmem_mtx pctrl2.p_mem_mtx 192 #define seg_plocked_window pctrl2.p_locked_win 193 #define seg_plocked pctrl2.p_locked 194 #define seg_pahcur pctrl2.p_ahcur 195 #define seg_pathr_on pctrl2.p_athr_on 196 #define seg_pahhead pctrl2.p_ahhead 197 #define seg_pmax_pcpage pctrl3.p_pcp_maxage 198 #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb 199 #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb 200 #define seg_pshrink_shift pctrl3.p_shrink_shft 201 #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages 202 203 #define P_HASHWIN_MASK (seg_phashsize_win - 1) 204 #define P_HASHWIRED_MASK (seg_phashsize_wired - 1) 205 #define P_BASESHIFT (6) 206 207 kthread_t *seg_pasync_thr; 208 209 extern struct seg_ops segvn_ops; 210 extern struct seg_ops segspt_shmops; 211 212 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED) 213 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags) 214 215 #define LBOLT_DELTA(t) ((ulong_t)(lbolt - (t))) 216 217 #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt) 218 219 /* 220 * htag0 argument can be a seg or amp pointer. 221 */ 222 #define P_HASHBP(seg, htag0, addr, flags) \ 223 (IS_PFLAGS_WIRED((flags)) ? \ 224 ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \ 225 ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \ 226 (&seg_phashtab_win[P_HASHWIN_MASK & \ 227 (((uintptr_t)(htag0) >> 3) ^ \ 228 ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \ 229 (flags >> 16) : page_get_shift((seg)->s_szc))))])) 230 231 /* 232 * htag0 argument can be a seg or amp pointer. 233 */ 234 #define P_MATCH(pcp, htag0, addr, len) \ 235 ((pcp)->p_htag0 == (htag0) && \ 236 (pcp)->p_addr == (addr) && \ 237 (pcp)->p_len >= (len)) 238 239 #define P_MATCH_PP(pcp, htag0, addr, len, pp) \ 240 ((pcp)->p_pp == (pp) && \ 241 (pcp)->p_htag0 == (htag0) && \ 242 (pcp)->p_addr == (addr) && \ 243 (pcp)->p_len >= (len)) 244 245 #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \ 246 offsetof(struct seg_pcache, p_plink))) 247 248 #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \ 249 offsetof(struct seg_phash, p_halink[l]))) 250 251 /* 252 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from 253 * active hash bucket lists. We maintain active bucket lists to reduce the 254 * overhead of finding active buckets during asynchronous purging since there 255 * can be 10s of millions of buckets on a large system but only a small subset 256 * of them in actual use. 257 * 258 * There're 2 active bucket lists. Current active list (as per seg_pahcur) is 259 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete 260 * buckets. The other list is used by asynchronous purge thread. 
This allows 261 * the purge thread to walk its active list without holding seg_pmem_mtx for a 262 * long time. When asynchronous thread is done with its list it switches to 263 * current active list and makes the list it just finished processing as 264 * current active list. 265 * 266 * seg_padd_abuck() only adds the bucket to current list if the bucket is not 267 * yet on any list. seg_premove_abuck() may remove the bucket from either 268 * list. If the bucket is on current list it will be always removed. Otherwise 269 * the bucket is only removed if asynchronous purge thread is not currently 270 * running or seg_premove_abuck() is called by asynchronous purge thread 271 * itself. A given bucket can only be on one of active lists at a time. These 272 * routines should be called with per bucket lock held. The routines use 273 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after 274 * the first entry is added to the bucket chain and seg_premove_abuck() must 275 * be called after the last pcp entry is deleted from its chain. Per bucket 276 * lock should be held by the callers. This avoids a potential race condition 277 * when seg_premove_abuck() removes a bucket after pcp entries are added to 278 * its list after the caller checked that the bucket has no entries. (this 279 * race would cause a loss of an active bucket from the active lists). 280 * 281 * Both lists are circular doubly linked lists anchored at seg_pahhead heads. 282 * New entries are added to the end of the list since LRU is used as the 283 * purging policy. 284 */ 285 static void 286 seg_padd_abuck(struct seg_phash *hp) 287 { 288 int lix; 289 290 ASSERT(MUTEX_HELD(&hp->p_hmutex)); 291 ASSERT((struct seg_phash *)hp->p_hnext != hp); 292 ASSERT((struct seg_phash *)hp->p_hprev != hp); 293 ASSERT(hp->p_hnext == hp->p_hprev); 294 ASSERT(!IS_PCP_WIRED(hp->p_hnext)); 295 ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp); 296 ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp); 297 ASSERT(hp >= seg_phashtab_win && 298 hp < &seg_phashtab_win[seg_phashsize_win]); 299 300 /* 301 * This bucket can already be on one of active lists 302 * since seg_premove_abuck() may have failed to remove it 303 * before. 304 */ 305 mutex_enter(&seg_pmem_mtx); 306 lix = seg_pahcur; 307 ASSERT(lix >= 0 && lix <= 1); 308 if (hp->p_halink[lix].p_lnext != NULL) { 309 ASSERT(hp->p_halink[lix].p_lprev != NULL); 310 ASSERT(hp->p_halink[!lix].p_lnext == NULL); 311 ASSERT(hp->p_halink[!lix].p_lprev == NULL); 312 mutex_exit(&seg_pmem_mtx); 313 return; 314 } 315 ASSERT(hp->p_halink[lix].p_lprev == NULL); 316 317 /* 318 * If this bucket is still on list !lix async thread can't yet remove 319 * it since we hold here per bucket lock. In this case just return 320 * since async thread will eventually find and process this bucket. 321 */ 322 if (hp->p_halink[!lix].p_lnext != NULL) { 323 ASSERT(hp->p_halink[!lix].p_lprev != NULL); 324 mutex_exit(&seg_pmem_mtx); 325 return; 326 } 327 ASSERT(hp->p_halink[!lix].p_lprev == NULL); 328 /* 329 * This bucket is not on any active bucket list yet. 330 * Add the bucket to the tail of current active list. 
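* The four pointer assignments below splice p_halink[lix] in just before the list head, i.e. at the tail of the circular list, preserving the LRU ordering described above.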
331 */ 332 hp->p_halink[lix].p_lnext = &seg_pahhead[lix]; 333 hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev; 334 seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix]; 335 seg_pahhead[lix].p_lprev = &hp->p_halink[lix]; 336 mutex_exit(&seg_pmem_mtx); 337 } 338 339 static void 340 seg_premove_abuck(struct seg_phash *hp, int athr) 341 { 342 int lix; 343 344 ASSERT(MUTEX_HELD(&hp->p_hmutex)); 345 ASSERT((struct seg_phash *)hp->p_hnext == hp); 346 ASSERT((struct seg_phash *)hp->p_hprev == hp); 347 ASSERT(hp >= seg_phashtab_win && 348 hp < &seg_phashtab_win[seg_phashsize_win]); 349 350 if (athr) { 351 ASSERT(seg_pathr_on); 352 ASSERT(seg_pahcur <= 1); 353 /* 354 * We are called by asynchronous thread that found this bucket 355 * on not currently active (i.e. !seg_pahcur) list. Remove it 356 * from there. Per bucket lock we are holding makes sure 357 * seg_pinsert() can't sneak in and add pcp entries to this 358 * bucket right before we remove the bucket from its list. 359 */ 360 lix = !seg_pahcur; 361 ASSERT(hp->p_halink[lix].p_lnext != NULL); 362 ASSERT(hp->p_halink[lix].p_lprev != NULL); 363 ASSERT(hp->p_halink[!lix].p_lnext == NULL); 364 ASSERT(hp->p_halink[!lix].p_lprev == NULL); 365 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; 366 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; 367 hp->p_halink[lix].p_lnext = NULL; 368 hp->p_halink[lix].p_lprev = NULL; 369 return; 370 } 371 372 mutex_enter(&seg_pmem_mtx); 373 lix = seg_pahcur; 374 ASSERT(lix >= 0 && lix <= 1); 375 376 /* 377 * If the bucket is on currently active list just remove it from 378 * there. 379 */ 380 if (hp->p_halink[lix].p_lnext != NULL) { 381 ASSERT(hp->p_halink[lix].p_lprev != NULL); 382 ASSERT(hp->p_halink[!lix].p_lnext == NULL); 383 ASSERT(hp->p_halink[!lix].p_lprev == NULL); 384 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; 385 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; 386 hp->p_halink[lix].p_lnext = NULL; 387 hp->p_halink[lix].p_lprev = NULL; 388 mutex_exit(&seg_pmem_mtx); 389 return; 390 } 391 ASSERT(hp->p_halink[lix].p_lprev == NULL); 392 393 /* 394 * If asynchronous thread is not running we can remove the bucket from 395 * not currently active list. The bucket must be on this list since we 396 * already checked that it's not on the other list and the bucket from 397 * which we just deleted the last pcp entry must be still on one of the 398 * active bucket lists. 399 */ 400 lix = !lix; 401 ASSERT(hp->p_halink[lix].p_lnext != NULL); 402 ASSERT(hp->p_halink[lix].p_lprev != NULL); 403 404 if (!seg_pathr_on) { 405 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; 406 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; 407 hp->p_halink[lix].p_lnext = NULL; 408 hp->p_halink[lix].p_lprev = NULL; 409 } 410 mutex_exit(&seg_pmem_mtx); 411 } 412 413 /* 414 * Check if bucket pointed by hp already has a pcp entry that matches request 415 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise. 416 * Also delete matching entries that cover smaller address range but start 417 * at the same address as addr argument. Return the list of deleted entries if 418 * any. This is an internal helper function called from seg_pinsert() only 419 * for non wired shadow lists. The caller already holds a per seg/amp list 420 * lock. 
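* Smaller inactive entries are unlinked here from both their hash chain and their per seg/amp list; smaller entries that are still active are skipped and left in place.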
421 */ 422 static struct seg_pcache * 423 seg_plookup_checkdup(struct seg_phash *hp, void *htag0, 424 caddr_t addr, size_t len, int *found) 425 { 426 struct seg_pcache *pcp; 427 struct seg_pcache *delcallb_list = NULL; 428 429 ASSERT(MUTEX_HELD(&hp->p_hmutex)); 430 431 *found = 0; 432 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 433 pcp = pcp->p_hnext) { 434 ASSERT(pcp->p_hashp == hp); 435 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) { 436 ASSERT(!IS_PCP_WIRED(pcp)); 437 if (pcp->p_len < len) { 438 pcache_link_t *plinkp; 439 if (pcp->p_active) { 440 continue; 441 } 442 plinkp = &pcp->p_plink; 443 plinkp->p_lprev->p_lnext = plinkp->p_lnext; 444 plinkp->p_lnext->p_lprev = plinkp->p_lprev; 445 pcp->p_hprev->p_hnext = pcp->p_hnext; 446 pcp->p_hnext->p_hprev = pcp->p_hprev; 447 pcp->p_hprev = delcallb_list; 448 delcallb_list = pcp; 449 } else { 450 *found = 1; 451 break; 452 } 453 } 454 } 455 return (delcallb_list); 456 } 457 458 /* 459 * lookup an address range in pagelock cache. Return shadow list and bump up 460 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg 461 * as a lookup tag. 462 */ 463 struct page ** 464 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, 465 enum seg_rw rw, uint_t flags) 466 { 467 struct seg_pcache *pcp; 468 struct seg_phash *hp; 469 void *htag0; 470 471 ASSERT(seg != NULL); 472 ASSERT(rw == S_READ || rw == S_WRITE); 473 474 /* 475 * Skip pagelock cache, while DR is in progress or 476 * seg_pcache is off. 477 */ 478 if (seg_pdisabled) { 479 return (NULL); 480 } 481 ASSERT(seg_phashsize_win != 0); 482 483 htag0 = (amp == NULL ? (void *)seg : (void *)amp); 484 hp = P_HASHBP(seg, htag0, addr, flags); 485 mutex_enter(&hp->p_hmutex); 486 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 487 pcp = pcp->p_hnext) { 488 ASSERT(pcp->p_hashp == hp); 489 if (P_MATCH(pcp, htag0, addr, len)) { 490 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); 491 /* 492 * If this request wants to write pages 493 * but write permissions starting from 494 * addr don't cover the entire length len 495 * return lookup failure back to the caller. 496 * It will check protections and fail this 497 * pagelock operation with EACCESS error. 498 */ 499 if (rw == S_WRITE && pcp->p_wlen < len) { 500 break; 501 } 502 if (pcp->p_active == UINT_MAX) { 503 break; 504 } 505 pcp->p_active++; 506 if (rw == S_WRITE && !pcp->p_write) { 507 pcp->p_write = 1; 508 } 509 mutex_exit(&hp->p_hmutex); 510 return (pcp->p_pp); 511 } 512 } 513 mutex_exit(&hp->p_hmutex); 514 return (NULL); 515 } 516 517 /* 518 * mark address range inactive. If the cache is off or the address range is 519 * not in the cache or another shadow list that covers bigger range is found 520 * we call the segment driver to reclaim the pages. Otherwise just decrement 521 * active count and set ref bit. If amp is not NULL use amp as a lookup tag 522 * otherwise use seg as a lookup tag. 523 */ 524 void 525 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr, 526 size_t len, struct page **pp, enum seg_rw rw, uint_t flags, 527 seg_preclaim_cbfunc_t callback) 528 { 529 struct seg_pcache *pcp; 530 struct seg_phash *hp; 531 kmutex_t *pmtx = NULL; 532 pcache_link_t *pheadp; 533 void *htag0; 534 pgcnt_t npages = 0; 535 int keep = 0; 536 537 ASSERT(seg != NULL); 538 ASSERT(rw == S_READ || rw == S_WRITE); 539 540 htag0 = (amp == NULL ? (void *)seg : (void *)amp); 541 542 /* 543 * Skip lookup if pcache is not configured. 
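* (seg_phashsize_win stays 0 when segpcache_enabled is cleared in /etc/system; see seg_pinit().)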
544 */ 545 if (seg_phashsize_win == 0) { 546 goto out; 547 } 548 549 /* 550 * Grab per seg/amp lock before hash lock if we are going to remove 551 * inactive entry from pcache. 552 */ 553 if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) { 554 if (amp == NULL) { 555 pheadp = &seg->s_phead; 556 pmtx = &seg->s_pmtx; 557 } else { 558 pheadp = &amp->a_phead; 559 pmtx = &amp->a_pmtx; 560 } 561 mutex_enter(pmtx); 562 } 563 564 hp = P_HASHBP(seg, htag0, addr, flags); 565 mutex_enter(&hp->p_hmutex); 566 again: 567 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 568 pcp = pcp->p_hnext) { 569 ASSERT(pcp->p_hashp == hp); 570 if (P_MATCH_PP(pcp, htag0, addr, len, pp)) { 571 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); 572 ASSERT(pcp->p_active); 573 if (keep) { 574 /* 575 * Don't remove this pcp entry 576 * if we didn't find duplicate 577 * shadow lists on second search. 578 * Somebody removed those duplicates 579 * since we dropped hash lock after first 580 * search. 581 */ 582 ASSERT(pmtx != NULL); 583 ASSERT(!IS_PFLAGS_WIRED(flags)); 584 mutex_exit(pmtx); 585 pmtx = NULL; 586 } 587 pcp->p_active--; 588 if (pcp->p_active == 0 && (pmtx != NULL || 589 (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) { 590 591 /* 592 * This entry is no longer active. Remove it 593 * now either because pcaching is temporarily 594 * disabled or there're other pcp entries that 595 * can match this pagelock request (i.e. this 596 * entry is a duplicate). 597 */ 598 599 ASSERT(callback == pcp->p_callback); 600 if (pmtx != NULL) { 601 pcache_link_t *plinkp = &pcp->p_plink; 602 ASSERT(!IS_PCP_WIRED(pcp)); 603 ASSERT(pheadp->p_lnext != pheadp); 604 ASSERT(pheadp->p_lprev != pheadp); 605 plinkp->p_lprev->p_lnext = 606 plinkp->p_lnext; 607 plinkp->p_lnext->p_lprev = 608 plinkp->p_lprev; 609 } 610 pcp->p_hprev->p_hnext = pcp->p_hnext; 611 pcp->p_hnext->p_hprev = pcp->p_hprev; 612 if (!IS_PCP_WIRED(pcp) && 613 hp->p_hnext == (struct seg_pcache *)hp) { 614 /* 615 * We removed the last entry from this 616 * bucket. Now remove the bucket from 617 * its active list. 618 */ 619 seg_premove_abuck(hp, 0); 620 } 621 mutex_exit(&hp->p_hmutex); 622 if (pmtx != NULL) { 623 mutex_exit(pmtx); 624 } 625 len = pcp->p_len; 626 npages = btop(len); 627 if (rw != S_WRITE && pcp->p_write) { 628 rw = S_WRITE; 629 } 630 kmem_cache_free(seg_pkmcache, pcp); 631 goto out; 632 } else { 633 /* 634 * We found a matching pcp entry but will not 635 * free it right away even if it's no longer 636 * active. 637 */ 638 if (!pcp->p_active && !IS_PCP_WIRED(pcp)) { 639 /* 640 * Set the reference bit and mark the 641 * time of last access to this pcp 642 * so that asynchronous thread doesn't 643 * free it immediately since 644 * it may be reactivated very soon. 645 */ 646 pcp->p_lbolt = lbolt; 647 pcp->p_ref = 1; 648 } 649 mutex_exit(&hp->p_hmutex); 650 if (pmtx != NULL) { 651 mutex_exit(pmtx); 652 } 653 return; 654 } 655 } else if (!IS_PFLAGS_WIRED(flags) && 656 P_MATCH(pcp, htag0, addr, len)) { 657 /* 658 * This is a duplicate pcp entry. This situation may 659 * happen if a bigger shadow list that covers our 660 * range was added while our entry was still active. 661 * Now we can free our pcp entry if it becomes 662 * inactive. 663 */ 664 if (!pcp->p_active) { 665 /* 666 * Mark this entry as referenced just in case 667 * we'll free our own pcp entry soon. 668 */ 669 pcp->p_lbolt = lbolt; 670 pcp->p_ref = 1; 671 } 672 if (pmtx != NULL) { 673 /* 674 * we are already holding pmtx and found a 675 * duplicate. Don't keep our own pcp entry.
676 */ 677 keep = 0; 678 continue; 679 } 680 /* 681 * We have to use mutex_tryenter to attempt to lock 682 * seg/amp list lock since we already hold hash lock 683 * and seg/amp list lock is above hash lock in lock 684 * order. If mutex_tryenter fails drop hash lock and 685 * retake both locks in correct order and re-search 686 * this hash chain. 687 */ 688 ASSERT(keep == 0); 689 if (amp == NULL) { 690 pheadp = &seg->s_phead; 691 pmtx = &seg->s_pmtx; 692 } else { 693 pheadp = &amp->a_phead; 694 pmtx = &amp->a_pmtx; 695 } 696 if (!mutex_tryenter(pmtx)) { 697 mutex_exit(&hp->p_hmutex); 698 mutex_enter(pmtx); 699 mutex_enter(&hp->p_hmutex); 700 /* 701 * If we don't find bigger shadow list on 702 * second search (it may happen since we 703 * dropped bucket lock) keep the entry that 704 * matches our own shadow list. 705 */ 706 keep = 1; 707 goto again; 708 } 709 } 710 } 711 mutex_exit(&hp->p_hmutex); 712 if (pmtx != NULL) { 713 mutex_exit(pmtx); 714 } 715 out: 716 (*callback)(htag0, addr, len, pp, rw, 0); 717 if (npages) { 718 mutex_enter(&seg_pmem_mtx); 719 ASSERT(seg_plocked >= npages); 720 seg_plocked -= npages; 721 if (!IS_PFLAGS_WIRED(flags)) { 722 ASSERT(seg_plocked_window >= npages); 723 seg_plocked_window -= npages; 724 } 725 mutex_exit(&seg_pmem_mtx); 726 } 727 728 } 729 730 #ifdef DEBUG 731 static uint32_t p_insert_chk_mtbf = 0; 732 #endif 733 734 /* 735 * The seg_pinsert_check() is used by segment drivers to predict whether 736 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing. 737 */ 738 /*ARGSUSED*/ 739 int 740 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr, 741 size_t len, uint_t flags) 742 { 743 ASSERT(seg != NULL); 744 745 #ifdef DEBUG 746 if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) { 747 return (SEGP_FAIL); 748 } 749 #endif 750 751 if (seg_pdisabled) { 752 return (SEGP_FAIL); 753 } 754 ASSERT(seg_phashsize_win != 0); 755 756 if (IS_PFLAGS_WIRED(flags)) { 757 return (SEGP_SUCCESS); 758 } 759 760 if (seg_plocked_window + btop(len) > seg_pmaxwindow) { 761 return (SEGP_FAIL); 762 } 763 764 if (freemem < desfree) { 765 return (SEGP_FAIL); 766 } 767 768 return (SEGP_SUCCESS); 769 } 770 771 #ifdef DEBUG 772 static uint32_t p_insert_mtbf = 0; 773 #endif 774 775 /* 776 * Insert address range with shadow list into pagelock cache if there's no 777 * shadow list already cached for this address range. If the cache is off or 778 * caching is temporarily disabled or the allowed 'window' is exceeded return 779 * SEGP_FAIL. Otherwise return SEGP_SUCCESS. 780 * 781 * For non wired shadow lists (segvn case) include address in the hashing 782 * function to avoid linking all the entries from the same segment or amp on 783 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired 784 * pcache entries are also linked on a per segment/amp list so that all 785 * entries can be found quickly during seg/amp purge without walking the 786 * entire pcache hash table. For wired shadow lists (segspt case) we 787 * don't use address hashing and per segment linking because the caller 788 * currently inserts only one entry per segment that covers the entire 789 * segment. If we used per segment linking even for segspt it would complicate 790 * seg_ppurge_wiredpp() locking. 791 * 792 * Both hash bucket and per seg/amp locks need to be held before adding a non 793 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken 794 * first. 795 * 796 * This function will also remove from pcache old inactive shadow lists that 797 * overlap with this request but cover smaller range for the same start 798 * address. 799 */ 800 int 801 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, 802 size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags, 803 seg_preclaim_cbfunc_t callback) 804 { 805 struct seg_pcache *pcp; 806 struct seg_phash *hp; 807 pgcnt_t npages; 808 pcache_link_t *pheadp; 809 kmutex_t *pmtx; 810 struct seg_pcache *delcallb_list = NULL; 811 812 ASSERT(seg != NULL); 813 ASSERT(rw == S_READ || rw == S_WRITE); 814 ASSERT(rw == S_READ || wlen == len); 815 ASSERT(rw == S_WRITE || wlen <= len); 816 ASSERT(amp == NULL || wlen == len); 817 818 #ifdef DEBUG 819 if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) { 820 return (SEGP_FAIL); 821 } 822 #endif 823 824 if (seg_pdisabled) { 825 return (SEGP_FAIL); 826 } 827 ASSERT(seg_phashsize_win != 0); 828 829 ASSERT((len & PAGEOFFSET) == 0); 830 npages = btop(len); 831 mutex_enter(&seg_pmem_mtx); 832 if (!IS_PFLAGS_WIRED(flags)) { 833 if (seg_plocked_window + npages > seg_pmaxwindow) { 834 mutex_exit(&seg_pmem_mtx); 835 return (SEGP_FAIL); 836 } 837 seg_plocked_window += npages; 838 } 839 seg_plocked += npages; 840 mutex_exit(&seg_pmem_mtx); 841 842 pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP); 843 /* 844 * If amp is not NULL set htag0 to amp otherwise set it to seg. 845 */ 846 if (amp == NULL) { 847 pcp->p_htag0 = (void *)seg; 848 pcp->p_flags = flags & 0xffff; 849 } else { 850 pcp->p_htag0 = (void *)amp; 851 pcp->p_flags = (flags & 0xffff) | SEGP_AMP; 852 } 853 pcp->p_addr = addr; 854 pcp->p_len = len; 855 pcp->p_wlen = wlen; 856 pcp->p_pp = pp; 857 pcp->p_write = (rw == S_WRITE); 858 pcp->p_callback = callback; 859 pcp->p_active = 1; 860 861 hp = P_HASHBP(seg, pcp->p_htag0, addr, flags); 862 if (!IS_PFLAGS_WIRED(flags)) { 863 int found; 864 void *htag0; 865 if (amp == NULL) { 866 pheadp = &seg->s_phead; 867 pmtx = &seg->s_pmtx; 868 htag0 = (void *)seg; 869 } else { 870 pheadp = &amp->a_phead; 871 pmtx = &amp->a_pmtx; 872 htag0 = (void *)amp; 873 } 874 mutex_enter(pmtx); 875 mutex_enter(&hp->p_hmutex); 876 delcallb_list = seg_plookup_checkdup(hp, htag0, addr, 877 len, &found); 878 if (found) { 879 mutex_exit(&hp->p_hmutex); 880 mutex_exit(pmtx); 881 mutex_enter(&seg_pmem_mtx); 882 seg_plocked -= npages; 883 seg_plocked_window -= npages; 884 mutex_exit(&seg_pmem_mtx); 885 kmem_cache_free(seg_pkmcache, pcp); 886 goto out; 887 } 888 pcp->p_plink.p_lnext = pheadp->p_lnext; 889 pcp->p_plink.p_lprev = pheadp; 890 pheadp->p_lnext->p_lprev = &pcp->p_plink; 891 pheadp->p_lnext = &pcp->p_plink; 892 } else { 893 mutex_enter(&hp->p_hmutex); 894 } 895 pcp->p_hashp = hp; 896 pcp->p_hnext = hp->p_hnext; 897 pcp->p_hprev = (struct seg_pcache *)hp; 898 hp->p_hnext->p_hprev = pcp; 899 hp->p_hnext = pcp; 900 if (!IS_PFLAGS_WIRED(flags) && 901 hp->p_hprev == pcp) { 902 seg_padd_abuck(hp); 903 } 904 mutex_exit(&hp->p_hmutex); 905 if (!IS_PFLAGS_WIRED(flags)) { 906 mutex_exit(pmtx); 907 } 908 909 out: 910 npages = 0; 911 while (delcallb_list != NULL) { 912 pcp = delcallb_list; 913 delcallb_list = pcp->p_hprev; 914 ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active); 915 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 916 pcp->p_len, pcp->p_pp, pcp->p_write ?
S_WRITE : S_READ, 0); 917 npages += btop(pcp->p_len); 918 kmem_cache_free(seg_pkmcache, pcp); 919 } 920 if (npages) { 921 ASSERT(!IS_PFLAGS_WIRED(flags)); 922 mutex_enter(&seg_pmem_mtx); 923 ASSERT(seg_plocked >= npages); 924 ASSERT(seg_plocked_window >= npages); 925 seg_plocked -= npages; 926 seg_plocked_window -= npages; 927 mutex_exit(&seg_pmem_mtx); 928 } 929 930 return (SEGP_SUCCESS); 931 } 932 933 /* 934 * purge entries from the pagelock cache if not active 935 * and not recently used. 936 */ 937 static void 938 seg_ppurge_async(int force) 939 { 940 struct seg_pcache *delcallb_list = NULL; 941 struct seg_pcache *pcp; 942 struct seg_phash *hp; 943 pgcnt_t npages = 0; 944 pgcnt_t npages_window = 0; 945 pgcnt_t npgs_to_purge; 946 pgcnt_t npgs_purged = 0; 947 int hlinks = 0; 948 int hlix; 949 pcache_link_t *hlinkp; 950 pcache_link_t *hlnextp = NULL; 951 int lowmem; 952 int trim; 953 954 ASSERT(seg_phashsize_win != 0); 955 956 /* 957 * if the cache is off or empty, return 958 */ 959 if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) { 960 return; 961 } 962 963 if (!force) { 964 lowmem = 0; 965 trim = 0; 966 if (freemem < lotsfree + needfree) { 967 spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0); 968 if (fmem <= 5 * (desfree >> 2)) { 969 lowmem = 1; 970 } else if (fmem <= 7 * (lotsfree >> 3)) { 971 if (seg_plocked_window >= 972 (availrmem_initial >> 1)) { 973 lowmem = 1; 974 } 975 } else if (fmem < lotsfree) { 976 if (seg_plocked_window >= 977 3 * (availrmem_initial >> 2)) { 978 lowmem = 1; 979 } 980 } 981 } 982 if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) { 983 trim = 1; 984 } 985 if (!lowmem && !trim) { 986 return; 987 } 988 npgs_to_purge = seg_plocked_window >> 989 seg_pshrink_shift; 990 if (lowmem) { 991 npgs_to_purge = MIN(npgs_to_purge, 992 MAX(seg_pmaxapurge_npages, desfree)); 993 } else { 994 npgs_to_purge = MIN(npgs_to_purge, 995 seg_pmaxapurge_npages); 996 } 997 if (npgs_to_purge == 0) { 998 return; 999 } 1000 } else { 1001 struct seg_phash_wired *hpw; 1002 1003 ASSERT(seg_phashsize_wired != 0); 1004 1005 for (hpw = seg_phashtab_wired; 1006 hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) { 1007 1008 if (hpw->p_hnext == (struct seg_pcache *)hpw) { 1009 continue; 1010 } 1011 1012 mutex_enter(&hpw->p_hmutex); 1013 1014 for (pcp = hpw->p_hnext; 1015 pcp != (struct seg_pcache *)hpw; 1016 pcp = pcp->p_hnext) { 1017 1018 ASSERT(IS_PCP_WIRED(pcp)); 1019 ASSERT(pcp->p_hashp == 1020 (struct seg_phash *)hpw); 1021 1022 if (pcp->p_active) { 1023 continue; 1024 } 1025 pcp->p_hprev->p_hnext = pcp->p_hnext; 1026 pcp->p_hnext->p_hprev = pcp->p_hprev; 1027 pcp->p_hprev = delcallb_list; 1028 delcallb_list = pcp; 1029 } 1030 mutex_exit(&hpw->p_hmutex); 1031 } 1032 } 1033 1034 mutex_enter(&seg_pmem_mtx); 1035 if (seg_pathr_on) { 1036 mutex_exit(&seg_pmem_mtx); 1037 goto runcb; 1038 } 1039 seg_pathr_on = 1; 1040 mutex_exit(&seg_pmem_mtx); 1041 ASSERT(seg_pahcur <= 1); 1042 hlix = !seg_pahcur; 1043 1044 again: 1045 for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix]; 1046 hlinkp = hlnextp) { 1047 1048 hlnextp = hlinkp->p_lnext; 1049 ASSERT(hlnextp != NULL); 1050 1051 hp = hlink2phash(hlinkp, hlix); 1052 if (hp->p_hnext == (struct seg_pcache *)hp) { 1053 seg_pathr_empty_ahb++; 1054 continue; 1055 } 1056 seg_pathr_full_ahb++; 1057 mutex_enter(&hp->p_hmutex); 1058 1059 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 1060 pcp = pcp->p_hnext) { 1061 pcache_link_t *pheadp; 1062 pcache_link_t *plinkp; 1063 void *htag0; 1064 kmutex_t *pmtx; 1065 1066 
ASSERT(!IS_PCP_WIRED(pcp)); 1067 ASSERT(pcp->p_hashp == hp); 1068 1069 if (pcp->p_active) { 1070 continue; 1071 } 1072 if (!force && pcp->p_ref && 1073 PCP_AGE(pcp) < seg_pmax_pcpage) { 1074 pcp->p_ref = 0; 1075 continue; 1076 } 1077 plinkp = &pcp->p_plink; 1078 htag0 = pcp->p_htag0; 1079 if (pcp->p_flags & SEGP_AMP) { 1080 pheadp = &((amp_t *)htag0)->a_phead; 1081 pmtx = &((amp_t *)htag0)->a_pmtx; 1082 } else { 1083 pheadp = &((seg_t *)htag0)->s_phead; 1084 pmtx = &((seg_t *)htag0)->s_pmtx; 1085 } 1086 if (!mutex_tryenter(pmtx)) { 1087 continue; 1088 } 1089 ASSERT(pheadp->p_lnext != pheadp); 1090 ASSERT(pheadp->p_lprev != pheadp); 1091 plinkp->p_lprev->p_lnext = 1092 plinkp->p_lnext; 1093 plinkp->p_lnext->p_lprev = 1094 plinkp->p_lprev; 1095 pcp->p_hprev->p_hnext = pcp->p_hnext; 1096 pcp->p_hnext->p_hprev = pcp->p_hprev; 1097 mutex_exit(pmtx); 1098 pcp->p_hprev = delcallb_list; 1099 delcallb_list = pcp; 1100 npgs_purged += btop(pcp->p_len); 1101 } 1102 if (hp->p_hnext == (struct seg_pcache *)hp) { 1103 seg_premove_abuck(hp, 1); 1104 } 1105 mutex_exit(&hp->p_hmutex); 1106 if (npgs_purged >= seg_plocked_window) { 1107 break; 1108 } 1109 if (!force) { 1110 if (npgs_purged >= npgs_to_purge) { 1111 break; 1112 } 1113 if (!trim && !(seg_pathr_full_ahb & 15)) { 1114 ASSERT(lowmem); 1115 if (freemem >= lotsfree + needfree) { 1116 break; 1117 } 1118 } 1119 } 1120 } 1121 1122 if (hlinkp == &seg_pahhead[hlix]) { 1123 /* 1124 * We processed the entire hlix active bucket list 1125 * but didn't find enough pages to reclaim. 1126 * Switch the lists and walk the other list 1127 * if we haven't done it yet. 1128 */ 1129 mutex_enter(&seg_pmem_mtx); 1130 ASSERT(seg_pathr_on); 1131 ASSERT(seg_pahcur == !hlix); 1132 seg_pahcur = hlix; 1133 mutex_exit(&seg_pmem_mtx); 1134 if (++hlinks < 2) { 1135 hlix = !hlix; 1136 goto again; 1137 } 1138 } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] && 1139 seg_pahhead[hlix].p_lnext != hlinkp) { 1140 ASSERT(hlinkp != NULL); 1141 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]); 1142 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]); 1143 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]); 1144 1145 /* 1146 * Reinsert the header to point to hlinkp 1147 * so that we start from hlinkp bucket next time around. 1148 */ 1149 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev; 1150 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext; 1151 seg_pahhead[hlix].p_lnext = hlinkp; 1152 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev; 1153 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix]; 1154 hlinkp->p_lprev = &seg_pahhead[hlix]; 1155 } 1156 1157 mutex_enter(&seg_pmem_mtx); 1158 ASSERT(seg_pathr_on); 1159 seg_pathr_on = 0; 1160 mutex_exit(&seg_pmem_mtx); 1161 1162 runcb: 1163 /* 1164 * Run the delayed callback list. segments/amps can't go away until 1165 * callback is executed since they must have non 0 softlockcnt. That's 1166 * why we don't need to hold as/seg/amp locks to execute the callback. 1167 */ 1168 while (delcallb_list != NULL) { 1169 pcp = delcallb_list; 1170 delcallb_list = pcp->p_hprev; 1171 ASSERT(!pcp->p_active); 1172 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 1173 pcp->p_len, pcp->p_pp, pcp->p_write ? 
S_WRITE : S_READ, 1); 1174 npages += btop(pcp->p_len); 1175 if (!IS_PCP_WIRED(pcp)) { 1176 npages_window += btop(pcp->p_len); 1177 } 1178 kmem_cache_free(seg_pkmcache, pcp); 1179 } 1180 if (npages) { 1181 mutex_enter(&seg_pmem_mtx); 1182 ASSERT(seg_plocked >= npages); 1183 ASSERT(seg_plocked_window >= npages_window); 1184 seg_plocked -= npages; 1185 seg_plocked_window -= npages_window; 1186 mutex_exit(&seg_pmem_mtx); 1187 } 1188 } 1189 1190 /* 1191 * Remove cached pages for segment(s) entries from hashtable. The segments 1192 * are identified by pp array. This is useful for multiple seg's cached on 1193 * behalf of dummy segment (ISM/DISM) with common pp array. 1194 */ 1195 void 1196 seg_ppurge_wiredpp(struct page **pp) 1197 { 1198 struct seg_pcache *pcp; 1199 struct seg_phash_wired *hp; 1200 pgcnt_t npages = 0; 1201 struct seg_pcache *delcallb_list = NULL; 1202 1203 /* 1204 * if the cache is empty, return 1205 */ 1206 if (seg_plocked == 0) { 1207 return; 1208 } 1209 ASSERT(seg_phashsize_wired != 0); 1210 1211 for (hp = seg_phashtab_wired; 1212 hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) { 1213 if (hp->p_hnext == (struct seg_pcache *)hp) { 1214 continue; 1215 } 1216 mutex_enter(&hp->p_hmutex); 1217 pcp = hp->p_hnext; 1218 while (pcp != (struct seg_pcache *)hp) { 1219 ASSERT(pcp->p_hashp == (struct seg_phash *)hp); 1220 ASSERT(IS_PCP_WIRED(pcp)); 1221 /* 1222 * purge entries which are not active 1223 */ 1224 if (!pcp->p_active && pcp->p_pp == pp) { 1225 ASSERT(pcp->p_htag0 != NULL); 1226 pcp->p_hprev->p_hnext = pcp->p_hnext; 1227 pcp->p_hnext->p_hprev = pcp->p_hprev; 1228 pcp->p_hprev = delcallb_list; 1229 delcallb_list = pcp; 1230 } 1231 pcp = pcp->p_hnext; 1232 } 1233 mutex_exit(&hp->p_hmutex); 1234 /* 1235 * segments can't go away until callback is executed since 1236 * they must have non 0 softlockcnt. That's why we don't 1237 * need to hold as/seg locks to execute the callback. 1238 */ 1239 while (delcallb_list != NULL) { 1240 int done; 1241 pcp = delcallb_list; 1242 delcallb_list = pcp->p_hprev; 1243 ASSERT(!pcp->p_active); 1244 done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 1245 pcp->p_len, pcp->p_pp, 1246 pcp->p_write ? S_WRITE : S_READ, 1); 1247 npages += btop(pcp->p_len); 1248 ASSERT(IS_PCP_WIRED(pcp)); 1249 kmem_cache_free(seg_pkmcache, pcp); 1250 if (done) { 1251 ASSERT(delcallb_list == NULL); 1252 goto out; 1253 } 1254 } 1255 } 1256 1257 out: 1258 mutex_enter(&seg_pmem_mtx); 1259 ASSERT(seg_plocked >= npages); 1260 seg_plocked -= npages; 1261 mutex_exit(&seg_pmem_mtx); 1262 } 1263 1264 /* 1265 * purge all entries for a given segment. Since we 1266 * callback into the segment driver directly for page 1267 * reclaim the caller needs to hold the right locks. 1268 */ 1269 void 1270 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags) 1271 { 1272 struct seg_pcache *delcallb_list = NULL; 1273 struct seg_pcache *pcp; 1274 struct seg_phash *hp; 1275 pgcnt_t npages = 0; 1276 void *htag0; 1277 1278 if (seg_plocked == 0) { 1279 return; 1280 } 1281 ASSERT(seg_phashsize_win != 0); 1282 1283 /* 1284 * If amp is not NULL use amp as a lookup tag otherwise use seg 1285 * as a lookup tag. 1286 */ 1287 htag0 = (amp == NULL ? 
(void *)seg : (void *)amp); 1288 ASSERT(htag0 != NULL); 1289 if (IS_PFLAGS_WIRED(flags)) { 1290 hp = P_HASHBP(seg, htag0, 0, flags); 1291 mutex_enter(&hp->p_hmutex); 1292 pcp = hp->p_hnext; 1293 while (pcp != (struct seg_pcache *)hp) { 1294 ASSERT(pcp->p_hashp == hp); 1295 ASSERT(IS_PCP_WIRED(pcp)); 1296 if (pcp->p_htag0 == htag0) { 1297 if (pcp->p_active) { 1298 break; 1299 } 1300 pcp->p_hprev->p_hnext = pcp->p_hnext; 1301 pcp->p_hnext->p_hprev = pcp->p_hprev; 1302 pcp->p_hprev = delcallb_list; 1303 delcallb_list = pcp; 1304 } 1305 pcp = pcp->p_hnext; 1306 } 1307 mutex_exit(&hp->p_hmutex); 1308 } else { 1309 pcache_link_t *plinkp; 1310 pcache_link_t *pheadp; 1311 kmutex_t *pmtx; 1312 1313 if (amp == NULL) { 1314 ASSERT(seg != NULL); 1315 pheadp = &seg->s_phead; 1316 pmtx = &seg->s_pmtx; 1317 } else { 1318 pheadp = &amp->a_phead; 1319 pmtx = &amp->a_pmtx; 1320 } 1321 mutex_enter(pmtx); 1322 while ((plinkp = pheadp->p_lnext) != pheadp) { 1323 pcp = plink2pcache(plinkp); 1324 ASSERT(!IS_PCP_WIRED(pcp)); 1325 ASSERT(pcp->p_htag0 == htag0); 1326 hp = pcp->p_hashp; 1327 mutex_enter(&hp->p_hmutex); 1328 if (pcp->p_active) { 1329 mutex_exit(&hp->p_hmutex); 1330 break; 1331 } 1332 ASSERT(plinkp->p_lprev == pheadp); 1333 pheadp->p_lnext = plinkp->p_lnext; 1334 plinkp->p_lnext->p_lprev = pheadp; 1335 pcp->p_hprev->p_hnext = pcp->p_hnext; 1336 pcp->p_hnext->p_hprev = pcp->p_hprev; 1337 pcp->p_hprev = delcallb_list; 1338 delcallb_list = pcp; 1339 if (hp->p_hnext == (struct seg_pcache *)hp) { 1340 seg_premove_abuck(hp, 0); 1341 } 1342 mutex_exit(&hp->p_hmutex); 1343 } 1344 mutex_exit(pmtx); 1345 } 1346 while (delcallb_list != NULL) { 1347 pcp = delcallb_list; 1348 delcallb_list = pcp->p_hprev; 1349 ASSERT(!pcp->p_active); 1350 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len, 1351 pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0); 1352 npages += btop(pcp->p_len); 1353 kmem_cache_free(seg_pkmcache, pcp); 1354 } 1355 mutex_enter(&seg_pmem_mtx); 1356 ASSERT(seg_plocked >= npages); 1357 seg_plocked -= npages; 1358 if (!IS_PFLAGS_WIRED(flags)) { 1359 ASSERT(seg_plocked_window >= npages); 1360 seg_plocked_window -= npages; 1361 } 1362 mutex_exit(&seg_pmem_mtx); 1363 } 1364 1365 static void seg_pinit_mem_config(void); 1366 1367 /* 1368 * setup the pagelock cache 1369 */ 1370 static void 1371 seg_pinit(void) 1372 { 1373 struct seg_phash *hp; 1374 ulong_t i; 1375 pgcnt_t physmegs; 1376 1377 seg_plocked = 0; 1378 seg_plocked_window = 0; 1379 1380 if (segpcache_enabled == 0) { 1381 seg_phashsize_win = 0; 1382 seg_phashsize_wired = 0; 1383 seg_pdisabled = 1; 1384 return; 1385 } 1386 1387 seg_pdisabled = 0; 1388 seg_pkmcache = kmem_cache_create("seg_pcache", 1389 sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0); 1390 if (segpcache_pcp_maxage_ticks <= 0) { 1391 segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz; 1392 } 1393 seg_pmax_pcpage = segpcache_pcp_maxage_ticks; 1394 seg_pathr_empty_ahb = 0; 1395 seg_pathr_full_ahb = 0; 1396 seg_pshrink_shift = segpcache_shrink_shift; 1397 seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes); 1398 1399 mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL); 1400 mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL); 1401 mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL); 1402 cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL); 1403 1404 physmegs = physmem >> (20 - PAGESHIFT); 1405 1406 /* 1407 * If segpcache_hashsize_win was not set in /etc/system or it has 1408 * absurd value set it to a default.
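* The size is then rounded to a power of two so that the bucket index in P_HASHBP() can be computed with the P_HASHWIN_MASK mask.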
1409 */ 1410 if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) { 1411 /* 1412 * Create one bucket per 32K (or at least per 8 pages) of 1413 * available memory. 1414 */ 1415 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8); 1416 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket); 1417 } 1418 if (!ISP2(segpcache_hashsize_win)) { 1419 ulong_t rndfac = ~(1UL << 1420 (highbit(segpcache_hashsize_win) - 1)); 1421 rndfac &= segpcache_hashsize_win; 1422 segpcache_hashsize_win += rndfac; 1423 segpcache_hashsize_win = 1 << 1424 (highbit(segpcache_hashsize_win) - 1); 1425 } 1426 seg_phashsize_win = segpcache_hashsize_win; 1427 seg_phashtab_win = kmem_zalloc( 1428 seg_phashsize_win * sizeof (struct seg_phash), 1429 KM_SLEEP); 1430 for (i = 0; i < seg_phashsize_win; i++) { 1431 hp = &seg_phashtab_win[i]; 1432 hp->p_hnext = (struct seg_pcache *)hp; 1433 hp->p_hprev = (struct seg_pcache *)hp; 1434 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 1435 } 1436 1437 seg_pahcur = 0; 1438 seg_pathr_on = 0; 1439 seg_pahhead[0].p_lnext = &seg_pahhead[0]; 1440 seg_pahhead[0].p_lprev = &seg_pahhead[0]; 1441 seg_pahhead[1].p_lnext = &seg_pahhead[1]; 1442 seg_pahhead[1].p_lprev = &seg_pahhead[1]; 1443 1444 /* 1445 * If segpcache_hashsize_wired was not set in /etc/system or it has 1446 * absurd value set it to a default. 1447 */ 1448 if (segpcache_hashsize_wired == 0 || 1449 segpcache_hashsize_wired > physmem / 4) { 1450 /* 1451 * Choose segpcache_hashsize_wired based on physmem. 1452 * Create a bucket per 128K bytes upto 256K buckets. 1453 */ 1454 if (physmegs < 20 * 1024) { 1455 segpcache_hashsize_wired = MAX(1024, physmegs << 3); 1456 } else { 1457 segpcache_hashsize_wired = 256 * 1024; 1458 } 1459 } 1460 if (!ISP2(segpcache_hashsize_wired)) { 1461 segpcache_hashsize_wired = 1 << 1462 highbit(segpcache_hashsize_wired); 1463 } 1464 seg_phashsize_wired = segpcache_hashsize_wired; 1465 seg_phashtab_wired = kmem_zalloc( 1466 seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP); 1467 for (i = 0; i < seg_phashsize_wired; i++) { 1468 hp = (struct seg_phash *)&seg_phashtab_wired[i]; 1469 hp->p_hnext = (struct seg_pcache *)hp; 1470 hp->p_hprev = (struct seg_pcache *)hp; 1471 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 1472 } 1473 1474 if (segpcache_maxwindow == 0) { 1475 if (physmegs < 64) { 1476 /* 3% of memory */ 1477 segpcache_maxwindow = availrmem >> 5; 1478 } else if (physmegs < 512) { 1479 /* 12% of memory */ 1480 segpcache_maxwindow = availrmem >> 3; 1481 } else if (physmegs < 1024) { 1482 /* 25% of memory */ 1483 segpcache_maxwindow = availrmem >> 2; 1484 } else if (physmegs < 2048) { 1485 /* 50% of memory */ 1486 segpcache_maxwindow = availrmem >> 1; 1487 } else { 1488 /* no limit */ 1489 segpcache_maxwindow = (pgcnt_t)-1; 1490 } 1491 } 1492 seg_pmaxwindow = segpcache_maxwindow; 1493 seg_pinit_mem_config(); 1494 } 1495 1496 /* 1497 * called by pageout if memory is low 1498 */ 1499 void 1500 seg_preap(void) 1501 { 1502 /* 1503 * if the cache is off or empty, return 1504 */ 1505 if (seg_plocked_window == 0) { 1506 return; 1507 } 1508 ASSERT(seg_phashsize_win != 0); 1509 1510 /* 1511 * If somebody is already purging pcache 1512 * just return. 
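* (seg_pdisabled is raised by seg_p_disable() while DR memory delete has temporarily disabled pcaching and is purging the cache itself.)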
1513 */ 1514 if (seg_pdisabled) { 1515 return; 1516 } 1517 1518 cv_signal(&seg_pasync_cv); 1519 } 1520 1521 /* 1522 * run as a backgroud thread and reclaim pagelock 1523 * pages which have not been used recently 1524 */ 1525 void 1526 seg_pasync_thread(void) 1527 { 1528 callb_cpr_t cpr_info; 1529 1530 if (seg_phashsize_win == 0) { 1531 thread_exit(); 1532 /*NOTREACHED*/ 1533 } 1534 1535 seg_pasync_thr = curthread; 1536 1537 CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx, 1538 callb_generic_cpr, "seg_pasync"); 1539 1540 if (segpcache_reap_ticks <= 0) { 1541 segpcache_reap_ticks = segpcache_reap_sec * hz; 1542 } 1543 1544 mutex_enter(&seg_pasync_mtx); 1545 for (;;) { 1546 CALLB_CPR_SAFE_BEGIN(&cpr_info); 1547 (void) cv_timedwait(&seg_pasync_cv, &seg_pasync_mtx, 1548 lbolt + segpcache_reap_ticks); 1549 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx); 1550 if (seg_pdisabled == 0) { 1551 seg_ppurge_async(0); 1552 } 1553 } 1554 } 1555 1556 static struct kmem_cache *seg_cache; 1557 1558 /* 1559 * Initialize segment management data structures. 1560 */ 1561 void 1562 seg_init(void) 1563 { 1564 kstat_t *ksp; 1565 1566 seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), 1567 0, NULL, NULL, NULL, NULL, NULL, 0); 1568 1569 ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED, 1570 segadvstat_ndata, KSTAT_FLAG_VIRTUAL); 1571 if (ksp) { 1572 ksp->ks_data = (void *)segadvstat_ptr; 1573 kstat_install(ksp); 1574 } 1575 1576 seg_pinit(); 1577 } 1578 1579 /* 1580 * Allocate a segment to cover [base, base+size] 1581 * and attach it to the specified address space. 1582 */ 1583 struct seg * 1584 seg_alloc(struct as *as, caddr_t base, size_t size) 1585 { 1586 struct seg *new; 1587 caddr_t segbase; 1588 size_t segsize; 1589 1590 segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK); 1591 segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) - 1592 (uintptr_t)segbase; 1593 1594 if (!valid_va_range(&segbase, &segsize, segsize, AH_LO)) 1595 return ((struct seg *)NULL); /* bad virtual addr range */ 1596 1597 if (as != &kas && 1598 valid_usr_range(segbase, segsize, 0, as, 1599 as->a_userlimit) != RANGE_OKAY) 1600 return ((struct seg *)NULL); /* bad virtual addr range */ 1601 1602 new = kmem_cache_alloc(seg_cache, KM_SLEEP); 1603 new->s_ops = NULL; 1604 new->s_data = NULL; 1605 new->s_szc = 0; 1606 new->s_flags = 0; 1607 mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL); 1608 new->s_phead.p_lnext = &new->s_phead; 1609 new->s_phead.p_lprev = &new->s_phead; 1610 if (seg_attach(as, segbase, segsize, new) < 0) { 1611 kmem_cache_free(seg_cache, new); 1612 return ((struct seg *)NULL); 1613 } 1614 /* caller must fill in ops, data */ 1615 return (new); 1616 } 1617 1618 /* 1619 * Attach a segment to the address space. Used by seg_alloc() 1620 * and for kernel startup to attach to static segments. 1621 */ 1622 int 1623 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg) 1624 { 1625 seg->s_as = as; 1626 seg->s_base = base; 1627 seg->s_size = size; 1628 1629 /* 1630 * as_addseg() will add the segment at the appropraite point 1631 * in the list. It will return -1 if there is overlap with 1632 * an already existing segment. 1633 */ 1634 return (as_addseg(as, seg)); 1635 } 1636 1637 /* 1638 * Unmap a segment and free it from its associated address space. 1639 * This should be called by anybody who's finished with a whole segment's 1640 * mapping. Just calls SEGOP_UNMAP() on the whole mapping . 
It is the 1641 * responsibility of the segment driver to unlink the the segment 1642 * from the address space, and to free public and private data structures 1643 * associated with the segment. (This is typically done by a call to 1644 * seg_free()). 1645 */ 1646 void 1647 seg_unmap(struct seg *seg) 1648 { 1649 #ifdef DEBUG 1650 int ret; 1651 #endif /* DEBUG */ 1652 1653 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1654 1655 /* Shouldn't have called seg_unmap if mapping isn't yet established */ 1656 ASSERT(seg->s_data != NULL); 1657 1658 /* Unmap the whole mapping */ 1659 #ifdef DEBUG 1660 ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 1661 ASSERT(ret == 0); 1662 #else 1663 SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 1664 #endif /* DEBUG */ 1665 } 1666 1667 /* 1668 * Free the segment from its associated as. This should only be called 1669 * if a mapping to the segment has not yet been established (e.g., if 1670 * an error occurs in the middle of doing an as_map when the segment 1671 * has already been partially set up) or if it has already been deleted 1672 * (e.g., from a segment driver unmap routine if the unmap applies to the 1673 * entire segment). If the mapping is currently set up then seg_unmap() should 1674 * be called instead. 1675 */ 1676 void 1677 seg_free(struct seg *seg) 1678 { 1679 register struct as *as = seg->s_as; 1680 struct seg *tseg = as_removeseg(as, seg); 1681 1682 ASSERT(tseg == seg); 1683 1684 /* 1685 * If the segment private data field is NULL, 1686 * then segment driver is not attached yet. 1687 */ 1688 if (seg->s_data != NULL) 1689 SEGOP_FREE(seg); 1690 1691 mutex_destroy(&seg->s_pmtx); 1692 ASSERT(seg->s_phead.p_lnext == &seg->s_phead); 1693 ASSERT(seg->s_phead.p_lprev == &seg->s_phead); 1694 kmem_cache_free(seg_cache, seg); 1695 } 1696 1697 /*ARGSUSED*/ 1698 static void 1699 seg_p_mem_config_post_add( 1700 void *arg, 1701 pgcnt_t delta_pages) 1702 { 1703 /* Nothing to do. */ 1704 } 1705 1706 void 1707 seg_p_enable(void) 1708 { 1709 mutex_enter(&seg_pcache_mtx); 1710 ASSERT(seg_pdisabled != 0); 1711 seg_pdisabled--; 1712 mutex_exit(&seg_pcache_mtx); 1713 } 1714 1715 /* 1716 * seg_p_disable - disables seg_pcache, and then attempts to empty the 1717 * cache. 1718 * Returns SEGP_SUCCESS if the cache was successfully emptied, or 1719 * SEGP_FAIL if the cache could not be emptied. 1720 */ 1721 int 1722 seg_p_disable(void) 1723 { 1724 pgcnt_t old_plocked; 1725 int stall_count = 0; 1726 1727 mutex_enter(&seg_pcache_mtx); 1728 seg_pdisabled++; 1729 ASSERT(seg_pdisabled != 0); 1730 mutex_exit(&seg_pcache_mtx); 1731 1732 /* 1733 * Attempt to empty the cache. Terminate if seg_plocked does not 1734 * diminish with SEGP_STALL_THRESHOLD consecutive attempts. 1735 */ 1736 while (seg_plocked != 0) { 1737 ASSERT(seg_phashsize_win != 0); 1738 old_plocked = seg_plocked; 1739 seg_ppurge_async(1); 1740 if (seg_plocked == old_plocked) { 1741 if (stall_count++ > SEGP_STALL_THRESHOLD) { 1742 return (SEGP_FAIL); 1743 } 1744 } else 1745 stall_count = 0; 1746 if (seg_plocked != 0) 1747 delay(hz/SEGP_PREDEL_DELAY_FACTOR); 1748 } 1749 return (SEGP_SUCCESS); 1750 } 1751 1752 /* 1753 * Attempt to purge seg_pcache. May need to return before this has 1754 * completed to allow other pre_del callbacks to unlock pages. This is 1755 * ok because: 1756 * 1) The seg_pdisabled flag has been set so at least we won't 1757 * cache anymore locks and the locks we couldn't purge 1758 * will not be held if they do get released by a subsequent 1759 * pre-delete callback. 
1760 * 1761 * 2) The rest of the memory delete thread processing does not 1762 * depend on the changes made in this pre-delete callback. No 1763 * panics will result, the worst that will happen is that the 1764 * DR code will timeout and cancel the delete. 1765 */ 1766 /*ARGSUSED*/ 1767 static int 1768 seg_p_mem_config_pre_del( 1769 void *arg, 1770 pgcnt_t delta_pages) 1771 { 1772 if (seg_phashsize_win == 0) { 1773 return (0); 1774 } 1775 if (seg_p_disable() != SEGP_SUCCESS) 1776 cmn_err(CE_NOTE, 1777 "!Pre-delete couldn't purge"" pagelock cache - continuing"); 1778 return (0); 1779 } 1780 1781 /*ARGSUSED*/ 1782 static void 1783 seg_p_mem_config_post_del( 1784 void *arg, 1785 pgcnt_t delta_pages, 1786 int cancelled) 1787 { 1788 if (seg_phashsize_win == 0) { 1789 return; 1790 } 1791 seg_p_enable(); 1792 } 1793 1794 static kphysm_setup_vector_t seg_p_mem_config_vec = { 1795 KPHYSM_SETUP_VECTOR_VERSION, 1796 seg_p_mem_config_post_add, 1797 seg_p_mem_config_pre_del, 1798 seg_p_mem_config_post_del, 1799 }; 1800 1801 static void 1802 seg_pinit_mem_config(void) 1803 { 1804 int ret; 1805 1806 ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL); 1807 /* 1808 * Want to catch this in the debug kernel. At run time, if the 1809 * callbacks don't get run all will be OK as the disable just makes 1810 * it more likely that the pages can be collected. 1811 */ 1812 ASSERT(ret == 0); 1813 } 1814 1815 /* 1816 * Verify that segment is not a shared anonymous segment which reserves 1817 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered 1818 * from one zone to another if any segments are shared. This is because the 1819 * last process to exit will credit the swap reservation. This could lead 1820 * to the swap being reserved by one zone, and credited to another. 1821 */ 1822 boolean_t 1823 seg_can_change_zones(struct seg *seg) 1824 { 1825 struct segvn_data *svd; 1826 1827 if (seg->s_ops == &segspt_shmops) 1828 return (B_FALSE); 1829 1830 if (seg->s_ops == &segvn_ops) { 1831 svd = (struct segvn_data *)seg->s_data; 1832 if (svd->type == MAP_SHARED && 1833 svd->amp != NULL && 1834 svd->amp->swresv > 0) 1835 return (B_FALSE); 1836 } 1837 return (B_TRUE); 1838 } 1839 1840 /* 1841 * Return swap reserved by a segment backing a private mapping. 1842 */ 1843 size_t 1844 seg_swresv(struct seg *seg) 1845 { 1846 struct segvn_data *svd; 1847 size_t swap = 0; 1848 1849 if (seg->s_ops == &segvn_ops) { 1850 svd = (struct segvn_data *)seg->s_data; 1851 if (svd->type == MAP_PRIVATE && svd->swresv > 0) 1852 swap = svd->swresv; 1853 } 1854 return (swap); 1855 } 1856