/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - segment management.
 */

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/vmsystm.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
#include <sys/mman.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>

/*
 * kstats for segment advise
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit", KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss", KSTAT_DATA_ULONG },
};

kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);

/* #define	PDEBUG */
#if defined(PDEBUG) || defined(lint) || defined(__lint)
int pdebug = 0;
#else
#define	pdebug	0
#endif	/* PDEBUG */

#define	PPRINTF				if (pdebug) printf
#define	PPRINT(x)			PPRINTF(x)
#define	PPRINT1(x, a)			PPRINTF(x, a)
#define	PPRINT2(x, a, b)		PPRINTF(x, a, b)
#define	PPRINT3(x, a, b, c)		PPRINTF(x, a, b, c)
#define	PPRINT4(x, a, b, c, d)		PPRINTF(x, a, b, c, d)
#define	PPRINT5(x, a, b, c, d, e)	PPRINTF(x, a, b, c, d, e)

#define	P_HASHMASK		(p_hashsize - 1)
#define	P_BASESHIFT		6

/*
 * entry in the segment page cache
 */
struct seg_pcache {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	int p_active;			/* active count */
	int p_ref;			/* ref bit */
	size_t p_len;			/* segment length */
	caddr_t p_addr;			/* base address */
	struct seg *p_seg;		/* segment */
	struct page **p_pp;		/* pp shadow list */
	enum seg_rw p_rw;		/* rw */
	uint_t p_flags;			/* bit flags */
	int (*p_callback)(struct seg *, caddr_t, size_t,
	    struct page **, enum seg_rw);
};

struct seg_phash {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	int p_qlen;			/* Q length */
	kmutex_t p_hmutex;		/* protects hash bucket */
};

static int seg_preap_time = 20;	/* reclaim every 20 secs */
static int seg_pmaxqlen = 5;	/* max Q length in hash list */
static int seg_ppcount = 5;	/* max # of purges per reclaim interval */
static int seg_plazy = 1;	/* if 1, pages are cached after pageunlock */
static pgcnt_t seg_pwindow;	/* max # of pages that can be cached */
static pgcnt_t seg_plocked;	/* # of pages which are cached by pagelock */
static pgcnt_t seg_plocked_window;	/* # pages from window */
int seg_preapahead;

static uint_t seg_pdisable = 0;	/* if not 0, caching temporarily disabled */

static int seg_pupdate_active = 1;	/* background reclaim thread */
static clock_t seg_preap_interval;	/* reap interval in ticks */

static kmutex_t seg_pcache;	/* protects the whole pagelock cache */
static kmutex_t seg_pmem;	/* protects window counter */
static ksema_t seg_psaync_sem;	/* sema for reclaim thread */
static struct seg_phash *p_hashtab;
static int p_hashsize = 0;

#define	p_hash(seg) \
	(P_HASHMASK & \
	((uintptr_t)(seg) >> P_BASESHIFT))

#define	p_match(pcp, seg, addr, len, rw) \
	(((pcp)->p_seg == (seg) && \
	(pcp)->p_addr == (addr) && \
	(pcp)->p_rw == (rw) && \
	(pcp)->p_len == (len)) ? 1 : 0)

#define	p_match_pp(pcp, seg, addr, len, pp, rw) \
	(((pcp)->p_seg == (seg) && \
	(pcp)->p_addr == (addr) && \
	(pcp)->p_pp == (pp) && \
	(pcp)->p_rw == (rw) && \
	(pcp)->p_len == (len)) ? 1 : 0)


/*
 * Look up an address range in the pagelock cache. Return the shadow
 * list and bump the active count.
 */
struct page **
seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;

	/*
	 * Skip the pagelock cache while DR is in progress or
	 * seg_pcache is off.
	 */
	if (seg_pdisable || seg_plazy == 0) {
		return (NULL);
	}

	hp = &p_hashtab[p_hash(seg)];
	mutex_enter(&hp->p_hmutex);
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		if (p_match(pcp, seg, addr, len, rw)) {
			pcp->p_active++;
			mutex_exit(&hp->p_hmutex);

			PPRINT5("seg_plookup hit: seg %p, addr %p, "
			    "len %lx, count %d, pplist %p \n",
			    (void *)seg, (void *)addr, len, pcp->p_active,
			    (void *)pcp->p_pp);

			return (pcp->p_pp);
		}
	}
	mutex_exit(&hp->p_hmutex);

	PPRINT("seg_plookup miss:\n");

	return (NULL);
}

/*
 * Mark an address range inactive. If the cache is off or the address
 * range is not in the cache, we call the segment driver to reclaim
 * the pages. Otherwise just decrement the active count and set the ref bit.
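 * The ref bit spares a recently used entry from being purged on the
 * async reclaim thread's next pass.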
 */
void
seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
    enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t,
    struct page **, enum seg_rw))
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;

	if (seg_plazy == 0) {
		(void) (*callback)(seg, addr, len, pp, rw);
		return;
	}
	hp = &p_hashtab[p_hash(seg)];
	mutex_enter(&hp->p_hmutex);
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		if (p_match_pp(pcp, seg, addr, len, pp, rw)) {
			pcp->p_active--;
			ASSERT(pcp->p_active >= 0);
			if (pcp->p_active == 0 && seg_pdisable) {
				int npages;

				ASSERT(callback == pcp->p_callback);
				/* free the entry */
				hp->p_qlen--;
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				mutex_exit(&hp->p_hmutex);
				npages = pcp->p_len >> PAGESHIFT;
				mutex_enter(&seg_pmem);
				seg_plocked -= npages;
				if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
					seg_plocked_window -= npages;
				}
				mutex_exit(&seg_pmem);
				kmem_free(pcp, sizeof (struct seg_pcache));
				goto out;
			}
			pcp->p_ref = 1;
			mutex_exit(&hp->p_hmutex);
			return;
		}
	}
	mutex_exit(&hp->p_hmutex);
out:
	(void) (*callback)(seg, addr, len, pp, rw);
}

/*
 * seg_pinsert_check() is used by segment drivers to predict whether
 * a call to seg_pinsert() will fail and thereby avoid wasteful pre-processing.
 */

int
seg_pinsert_check(struct seg *seg, size_t len, uint_t flags)
{
	struct seg_phash *hp;

	if (seg_plazy == 0) {
		return (SEGP_FAIL);
	}
	if (seg_pdisable != 0) {
		return (SEGP_FAIL);
	}
	ASSERT((len & PAGEOFFSET) == 0);
	hp = &p_hashtab[p_hash(seg)];
	if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
		return (SEGP_FAIL);
	}
	/*
	 * If the SEGP_FORCE_WIRED flag is set,
	 * we skip the check for seg_pwindow.
	 */
	if ((flags & SEGP_FORCE_WIRED) == 0) {
		pgcnt_t npages;

		npages = len >> PAGESHIFT;
		if ((seg_plocked_window + npages) > seg_pwindow) {
			return (SEGP_FAIL);
		}
	}
	return (SEGP_SUCCESS);
}


/*
 * Insert an address range with its shadow list into the pagelock cache.
 * If the cache is off, caching is temporarily disabled, or the allowed
 * 'window' is exceeded, return SEGP_FAIL. Otherwise return
 * SEGP_SUCCESS.
 */
int
seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
    enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t,
    size_t, struct page **, enum seg_rw))
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages;

	if (seg_plazy == 0) {
		return (SEGP_FAIL);
	}
	if (seg_pdisable != 0) {
		return (SEGP_FAIL);
	}
	ASSERT((len & PAGEOFFSET) == 0);
	hp = &p_hashtab[p_hash(seg)];
	if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
		return (SEGP_FAIL);
	}
	npages = len >> PAGESHIFT;
	mutex_enter(&seg_pmem);
	/*
	 * If the SEGP_FORCE_WIRED flag is set,
	 * we skip the check for seg_pwindow.
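	 * Such pages are counted in seg_plocked but not in
	 * seg_plocked_window.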
	 */
	if ((flags & SEGP_FORCE_WIRED) == 0) {
		seg_plocked_window += npages;
		if (seg_plocked_window > seg_pwindow) {
			seg_plocked_window -= npages;
			mutex_exit(&seg_pmem);
			return (SEGP_FAIL);
		}
	}
	seg_plocked += npages;
	mutex_exit(&seg_pmem);

	pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP);
	pcp->p_seg = seg;
	pcp->p_addr = addr;
	pcp->p_len = len;
	pcp->p_pp = pp;
	pcp->p_rw = rw;
	pcp->p_callback = callback;
	pcp->p_active = 1;
	pcp->p_flags = flags;

	PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n",
	    (void *)seg, (void *)addr, len, (void *)pp);

	hp = &p_hashtab[p_hash(seg)];
	mutex_enter(&hp->p_hmutex);
	hp->p_qlen++;
	pcp->p_hnext = hp->p_hnext;
	pcp->p_hprev = (struct seg_pcache *)hp;
	hp->p_hnext->p_hprev = pcp;
	hp->p_hnext = pcp;
	mutex_exit(&hp->p_hmutex);
	return (SEGP_SUCCESS);
}

/*
 * Purge all entries from the pagelock cache that are not active
 * and not recently used. Drop all locks and call through
 * the address space into the segment driver to reclaim
 * the pages. This makes sure we get the address space
 * and segment driver locking right.
 */
static void
seg_ppurge_all(int force)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	int purge_count = 0;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plazy == 0 || seg_plocked == 0) {
		return;
	}
	for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;

		/*
		 * While 'force' is set, seg_pasync_thread is not
		 * throttled. This is to speed up flushing of seg_pcache
		 * in preparation for DR.
		 *
		 * In the normal case, when 'force' is not set, we throttle
		 * seg_pasync_thread so that we don't spend all our
		 * time purging the cache.
		 */
		while ((pcp != (struct seg_pcache *)hp) &&
		    (force || (purge_count <= seg_ppcount))) {

			/*
			 * purge entries which are not active and
			 * have not been used recently and
			 * have the SEGP_ASYNC_FLUSH flag.
			 *
			 * In the 'force' case, we ignore the
			 * SEGP_ASYNC_FLUSH flag.
			 */
			if (!(pcp->p_flags & SEGP_ASYNC_FLUSH))
				pcp->p_ref = 1;
			if (force)
				pcp->p_ref = 0;
			if (!pcp->p_ref && !pcp->p_active) {
				struct as *as = pcp->p_seg->s_as;

				/*
				 * try to get the readers lock on the address
				 * space before taking out the cache element.
				 * This ensures as_pagereclaim() can actually
				 * call through the address space and free
				 * the pages. If we don't get the lock, just
				 * skip this entry. The pages will be reclaimed
				 * by the segment driver at unmap time.
				 */
				if (AS_LOCK_TRYENTER(as, &as->a_lock,
				    RW_READER)) {
					hp->p_qlen--;
					pcp->p_hprev->p_hnext = pcp->p_hnext;
					pcp->p_hnext->p_hprev = pcp->p_hprev;
					pcp->p_hprev = delcallb_list;
					delcallb_list = pcp;
					purge_count++;
				}
			} else {
				pcp->p_ref = 0;
			}
			pcp = pcp->p_hnext;
		}
		mutex_exit(&hp->p_hmutex);
		if (!force && purge_count > seg_ppcount)
			break;
	}

	/*
	 * Run the delayed callback list. We don't want to hold the
	 * cache lock during a call through the address space.
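	 * The readers lock taken on each address space above is
	 * dropped once as_pagereclaim() has returned.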
	 */
	while (delcallb_list != NULL) {
		struct as *as;

		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		as = pcp->p_seg->s_as;

		PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, "
		    "pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr,
		    pcp->p_len, (void *)pcp->p_pp);

		as_pagereclaim(as, pcp->p_pp, pcp->p_addr,
		    pcp->p_len, pcp->p_rw);
		AS_LOCK_EXIT(as, &as->a_lock);
		npages += pcp->p_len >> PAGESHIFT;
		if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
			npages_window += pcp->p_len >> PAGESHIFT;
		}
		kmem_free(pcp, sizeof (struct seg_pcache));
	}
	mutex_enter(&seg_pmem);
	seg_plocked -= npages;
	seg_plocked_window -= npages_window;
	mutex_exit(&seg_pmem);
}

/*
 * Remove cached pages for segments' entries from the hashtable.
 * The segments are identified by a given client's callback
 * function.
 * This is useful for multiple segments cached on behalf of a
 * dummy segment (ISM/DISM) with a common callback function.
 * The client's callback function may return a status indicating
 * that the last segment's entry has been purged. In such a case
 * seg_ppurge_seg() stops searching the hashtable and exits.
 * Otherwise all hashtable entries are scanned.
 */
void
seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t,
    struct page **, enum seg_rw))
{
	struct seg_pcache *pcp, *npcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;
	int done = 0;

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plazy == 0 || seg_plocked == 0) {
		return;
	}
	mutex_enter(&seg_pcache);
	seg_pdisable++;
	mutex_exit(&seg_pcache);

	for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {

		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;
		while (pcp != (struct seg_pcache *)hp) {

			/*
			 * purge entries which are not active
			 */
			npcp = pcp->p_hnext;
			if (!pcp->p_active && pcp->p_callback == callback) {
				hp->p_qlen--;
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;

				if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr,
				    pcp->p_len, pcp->p_pp, pcp->p_rw)) {
					done = 1;
				}

				npages += pcp->p_len >> PAGESHIFT;
				if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
					npages_window +=
					    pcp->p_len >> PAGESHIFT;
				}
				kmem_free(pcp, sizeof (struct seg_pcache));
			}
			pcp = npcp;
			if (done)
				break;
		}
		mutex_exit(&hp->p_hmutex);
		if (done)
			break;
	}

	mutex_enter(&seg_pcache);
	seg_pdisable--;
	mutex_exit(&seg_pcache);

	mutex_enter(&seg_pmem);
	seg_plocked -= npages;
	seg_plocked_window -= npages_window;
	mutex_exit(&seg_pmem);
}

/*
 * Purge all entries for a given segment. Since we
 * call back into the segment driver directly for page
 * reclaim, the caller needs to hold the right locks.
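 * The scan stops at the first entry for this segment that is
 * still active.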
 */
void
seg_ppurge(struct seg *seg)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;

	if (seg_plazy == 0) {
		return;
	}
	hp = &p_hashtab[p_hash(seg)];
	mutex_enter(&hp->p_hmutex);
	pcp = hp->p_hnext;
	while (pcp != (struct seg_pcache *)hp) {
		if (pcp->p_seg == seg) {
			if (pcp->p_active) {
				break;
			}
			hp->p_qlen--;
			pcp->p_hprev->p_hnext = pcp->p_hnext;
			pcp->p_hnext->p_hprev = pcp->p_hprev;
			pcp->p_hprev = delcallb_list;
			delcallb_list = pcp;
		}
		pcp = pcp->p_hnext;
	}
	mutex_exit(&hp->p_hmutex);
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;

		PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, "
		    "pplist %p\n", (void *)seg, (void *)pcp->p_addr,
		    pcp->p_len, (void *)pcp->p_pp);

		ASSERT(seg == pcp->p_seg);
		(void) (*pcp->p_callback)(seg, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_rw);
		npages += pcp->p_len >> PAGESHIFT;
		if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
			npages_window += pcp->p_len >> PAGESHIFT;
		}
		kmem_free(pcp, sizeof (struct seg_pcache));
	}
	mutex_enter(&seg_pmem);
	seg_plocked -= npages;
	seg_plocked_window -= npages_window;
	mutex_exit(&seg_pmem);
}

static void seg_pinit_mem_config(void);

/*
 * set up the pagelock cache
 */
static void
seg_pinit(void)
{
	struct seg_phash *hp;
	int i;
	uint_t physmegs;

	sema_init(&seg_psaync_sem, 0, NULL, SEMA_DEFAULT, NULL);

	mutex_enter(&seg_pcache);
	if (p_hashtab == NULL) {
		physmegs = physmem >> (20 - PAGESHIFT);

		/* If p_hashsize was not set in /etc/system ... */
		if (p_hashsize == 0) {
			/*
			 * Choose p_hashsize based on physmem.
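			 * The sizes below are powers of two so that
			 * P_HASHMASK yields a valid index.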
			 */
			if (physmegs < 64) {
				p_hashsize = 64;
			} else if (physmegs < 1024) {
				p_hashsize = 1024;
			} else if (physmegs < 10 * 1024) {
				p_hashsize = 8192;
			} else if (physmegs < 20 * 1024) {
				p_hashsize = 2 * 8192;
				seg_pmaxqlen = 16;
			} else {
				p_hashsize = 128 * 1024;
				seg_pmaxqlen = 128;
			}
		}

		p_hashtab = kmem_zalloc(
		    p_hashsize * sizeof (struct seg_phash), KM_SLEEP);
		for (i = 0; i < p_hashsize; i++) {
			hp = (struct seg_phash *)&p_hashtab[i];
			hp->p_hnext = (struct seg_pcache *)hp;
			hp->p_hprev = (struct seg_pcache *)hp;
			mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
		}
		if (seg_pwindow == 0) {
			if (physmegs < 24) {
				/* don't use cache */
				seg_plazy = 0;
			} else if (physmegs < 64) {
				seg_pwindow = physmem >> 5;	/* 3% of memory */
			} else if (physmegs < 10 * 1024) {
				seg_pwindow = physmem >> 3;	/* 12% of memory */
			} else {
				seg_pwindow = physmem >> 1;
			}
		}
	}
	mutex_exit(&seg_pcache);

	seg_pinit_mem_config();
}

/*
 * called by pageout if memory is low
 */
void
seg_preap(void)
{
	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plocked == 0 || seg_plazy == 0) {
		return;
	}
	sema_v(&seg_psaync_sem);
}

static void seg_pupdate(void *);

/*
 * run as a background thread and reclaim pagelock
 * pages which have not been used recently
 */
void
seg_pasync_thread(void)
{
	callb_cpr_t cpr_info;
	kmutex_t pasync_lock;	/* just for CPR stuff */

	mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL);

	CALLB_CPR_INIT(&cpr_info, &pasync_lock,
	    callb_generic_cpr, "seg_pasync");

	if (seg_preap_interval == 0) {
		seg_preap_interval = seg_preap_time * hz;
	} else {
		seg_preap_interval *= hz;
	}
	if (seg_plazy && seg_pupdate_active) {
		(void) timeout(seg_pupdate, NULL, seg_preap_interval);
	}

	for (;;) {
		mutex_enter(&pasync_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&pasync_lock);
		sema_p(&seg_psaync_sem);
		mutex_enter(&pasync_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock);
		mutex_exit(&pasync_lock);

		seg_ppurge_all(0);
	}
}

static void
seg_pupdate(void *dummy)
{
	sema_v(&seg_psaync_sem);

	if (seg_plazy && seg_pupdate_active) {
		(void) timeout(seg_pupdate, dummy, seg_preap_interval);
	}
}

static struct kmem_cache *seg_cache;

/*
 * Initialize segment management data structures.
 */
void
seg_init(void)
{
	kstat_t *ksp;

	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
	    segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
	if (ksp) {
		ksp->ks_data = (void *)segadvstat_ptr;
		kstat_install(ksp);
	}

	seg_pinit();
}

/*
 * Allocate a segment to cover [base, base+size]
 * and attach it to the specified address space.
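 * The requested range is rounded out to page boundaries before
 * the segment is attached.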
 */
struct seg *
seg_alloc(struct as *as, caddr_t base, size_t size)
{
	struct seg *new;
	caddr_t segbase;
	size_t segsize;

	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
	    (uintptr_t)segbase;

	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
		return ((struct seg *)NULL);	/* bad virtual addr range */

	if (as != &kas &&
	    valid_usr_range(segbase, segsize, 0, as,
	    as->a_userlimit) != RANGE_OKAY)
		return ((struct seg *)NULL);	/* bad virtual addr range */

	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
	new->s_ops = NULL;
	new->s_data = NULL;
	new->s_szc = 0;
	new->s_flags = 0;
	if (seg_attach(as, segbase, segsize, new) < 0) {
		kmem_cache_free(seg_cache, new);
		return ((struct seg *)NULL);
	}
	/* caller must fill in ops, data */
	return (new);
}

/*
 * Attach a segment to the address space. Used by seg_alloc()
 * and for kernel startup to attach to static segments.
 */
int
seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
{
	seg->s_as = as;
	seg->s_base = base;
	seg->s_size = size;

	/*
	 * as_addseg() will add the segment at the appropriate point
	 * in the list. It will return -1 if there is overlap with
	 * an already existing segment.
	 */
	return (as_addseg(as, seg));
}

/*
 * Unmap a segment and free it from its associated address space.
 * This should be called by anybody who's finished with a whole segment's
 * mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the
 * responsibility of the segment driver to unlink the segment
 * from the address space, and to free public and private data structures
 * associated with the segment. (This is typically done by a call to
 * seg_free()).
 */
void
seg_unmap(struct seg *seg)
{
#ifdef DEBUG
	int ret;
#endif /* DEBUG */

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/* Shouldn't have called seg_unmap if mapping isn't yet established */
	ASSERT(seg->s_data != NULL);

	/* Unmap the whole mapping */
#ifdef DEBUG
	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
	ASSERT(ret == 0);
#else
	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
#endif /* DEBUG */
}

/*
 * Free the segment from its associated as. This should only be called
 * if a mapping to the segment has not yet been established (e.g., if
 * an error occurs in the middle of doing an as_map when the segment
 * has already been partially set up) or if it has already been deleted
 * (e.g., from a segment driver unmap routine if the unmap applies to the
 * entire segment). If the mapping is currently set up then seg_unmap() should
 * be called instead.
 */
void
seg_free(struct seg *seg)
{
	register struct as *as = seg->s_as;
	struct seg *tseg = as_removeseg(as, seg);

	ASSERT(tseg == seg);

	/*
	 * If the segment private data field is NULL,
	 * then the segment driver is not attached yet.
	 */
	if (seg->s_data != NULL)
		SEGOP_FREE(seg);

	kmem_cache_free(seg_cache, seg);
}

/*ARGSUSED*/
static void
seg_p_mem_config_post_add(
	void *arg,
	pgcnt_t delta_pages)
{
	/* Nothing to do. */
}

void
seg_p_enable(void)
{
	mutex_enter(&seg_pcache);
	ASSERT(seg_pdisable != 0);
	seg_pdisable--;
	mutex_exit(&seg_pcache);
}

/*
 * seg_p_disable - disables seg_pcache, and then attempts to empty the
 * cache.
 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
 * SEGP_FAIL if the cache could not be emptied.
 */
int
seg_p_disable(void)
{
	pgcnt_t old_plocked;
	int stall_count = 0;

	mutex_enter(&seg_pcache);
	seg_pdisable++;
	ASSERT(seg_pdisable != 0);
	mutex_exit(&seg_pcache);

	/*
	 * Attempt to empty the cache. Terminate if seg_plocked does not
	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
	 */
	while (seg_plocked != 0) {
		old_plocked = seg_plocked;
		seg_ppurge_all(1);
		if (seg_plocked == old_plocked) {
			if (stall_count++ > SEGP_STALL_THRESHOLD) {
				return (SEGP_FAIL);
			}
		} else
			stall_count = 0;
		if (seg_plocked != 0)
			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
	}
	return (SEGP_SUCCESS);
}

/*
 * Attempt to purge seg_pcache. May need to return before this has
 * completed to allow other pre_del callbacks to unlock pages. This is
 * ok because:
 *	1) The seg_pdisable flag has been set so at least we won't
 *	cache any more locks and the locks we couldn't purge
 *	will not be held if they do get released by a subsequent
 *	pre-delete callback.
 *
 *	2) The rest of the memory delete thread processing does not
 *	depend on the changes made in this pre-delete callback. No
 *	panics will result, the worst that will happen is that the
 *	DR code will time out and cancel the delete.
 */
/*ARGSUSED*/
static int
seg_p_mem_config_pre_del(
	void *arg,
	pgcnt_t delta_pages)
{
	if (seg_p_disable() != SEGP_SUCCESS)
		cmn_err(CE_NOTE,
		    "!Pre-delete couldn't purge"" pagelock cache - continuing");
	return (0);
}

/*ARGSUSED*/
static void
seg_p_mem_config_post_del(
	void *arg,
	pgcnt_t delta_pages,
	int cancelled)
{
	seg_p_enable();
}

static kphysm_setup_vector_t seg_p_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	seg_p_mem_config_post_add,
	seg_p_mem_config_pre_del,
	seg_p_mem_config_post_del,
};

static void
seg_pinit_mem_config(void)
{
	int ret;

	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
	/*
	 * Want to catch this in the debug kernel. At run time, if the
	 * callbacks don't get run all will be OK as the disable just makes
	 * it more likely that the pages can be collected.
	 */
	ASSERT(ret == 0);
}

extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

/*
 * Verify that the segment is not a shared anonymous segment which reserves
 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
 * from one zone to another if any segments are shared. This is because the
 * last process to exit will credit the swap reservation. This could lead
 * to the swap being reserved by one zone, and credited to another.
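 * Shared memory (segspt) segments are always treated as shared.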
 */
boolean_t
seg_can_change_zones(struct seg *seg)
{
	struct segvn_data *svd;

	if (seg->s_ops == &segspt_shmops)
		return (B_FALSE);

	if (seg->s_ops == &segvn_ops) {
		svd = (struct segvn_data *)seg->s_data;
		if (svd->type == MAP_SHARED &&
		    svd->amp != NULL &&
		    svd->amp->swresv > 0)
			return (B_FALSE);
	}
	return (B_TRUE);
}

/*
 * Return swap reserved by a segment backing a private mapping.
 */
size_t
seg_swresv(struct seg *seg)
{
	struct segvn_data *svd;
	size_t swap = 0;

	if (seg->s_ops == &segvn_ops) {
		svd = (struct segvn_data *)seg->s_data;
		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
			swap = svd->swresv;
	}
	return (swap);
}