1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/thread.h> 32 #include <sys/proc.h> 33 #include <sys/callb.h> 34 #include <sys/vnode.h> 35 #include <sys/debug.h> 36 #include <sys/systm.h> /* for bzero */ 37 #include <sys/memlist.h> 38 #include <sys/cmn_err.h> 39 #include <sys/sysmacros.h> 40 #include <sys/vmsystm.h> /* for NOMEMWAIT() */ 41 #include <sys/atomic.h> /* used to update kcage_freemem */ 42 #include <sys/kmem.h> /* for kmem_reap */ 43 #include <sys/errno.h> 44 #include <sys/mem_cage.h> 45 #include <vm/seg_kmem.h> 46 #include <vm/page.h> 47 #include <vm/hat.h> 48 #include <sys/mem_config.h> 49 #include <sys/lgrp.h> 50 51 extern pri_t maxclsyspri; 52 53 #ifdef DEBUG 54 #define KCAGE_STATS 55 #endif 56 57 #ifdef KCAGE_STATS 58 59 #define KCAGE_STATS_VERSION 9 /* can help report generators */ 60 #define KCAGE_STATS_NSCANS 256 /* depth of scan statistics buffer */ 61 62 struct kcage_stats_scan { 63 /* managed by KCAGE_STAT_* macros */ 64 clock_t scan_lbolt; 65 uint_t scan_id; 66 67 /* set in kcage_cageout() */ 68 uint_t kt_passes; 69 clock_t kt_ticks; 70 pgcnt_t kt_kcage_freemem_start; 71 pgcnt_t kt_kcage_freemem_end; 72 pgcnt_t kt_freemem_start; 73 pgcnt_t kt_freemem_end; 74 uint_t kt_examined; 75 uint_t kt_cantlock; 76 uint_t kt_gotone; 77 uint_t kt_gotonefree; 78 uint_t kt_skiplevel; 79 uint_t kt_skipshared; 80 uint_t kt_skiprefd; 81 uint_t kt_destroy; 82 83 /* set in kcage_invalidate_page() */ 84 uint_t kip_reloclocked; 85 uint_t kip_relocmod; 86 uint_t kip_destroy; 87 uint_t kip_nomem; 88 uint_t kip_demotefailed; 89 90 /* set in kcage_expand() */ 91 uint_t ke_wanted; 92 uint_t ke_examined; 93 uint_t ke_lefthole; 94 uint_t ke_gotone; 95 uint_t ke_gotonefree; 96 }; 97 98 struct kcage_stats { 99 /* managed by KCAGE_STAT_* macros */ 100 uint_t version; 101 uint_t size; 102 103 /* set in kcage_cageout */ 104 uint_t kt_wakeups; 105 uint_t kt_scans; 106 uint_t kt_cageout_break; 107 108 /* set in kcage_expand */ 109 uint_t ke_calls; 110 uint_t ke_nopfn; 111 uint_t ke_nopaget; 112 uint_t ke_isnoreloc; 113 uint_t ke_deleting; 114 uint_t ke_lowfreemem; 115 uint_t ke_terminate; 116 117 /* set in kcage_freemem_add() */ 118 uint_t kfa_trottlewake; 119 120 /* set in kcage_freemem_sub() */ 121 uint_t kfs_cagewake; 122 123 /* set in kcage_create_throttle */ 124 uint_t kct_calls; 125 uint_t kct_cageout; 126 uint_t kct_critical; 127 uint_t kct_exempt; 128 uint_t kct_cagewake; 129 uint_t kct_wait; 130 uint_t kct_progress; 131 uint_t kct_noprogress; 132 uint_t kct_timeout; 133 134 /* set in kcage_cageout_wakeup */ 135 uint_t kcw_expandearly; 136 137 /* managed by KCAGE_STAT_* macros */ 138 uint_t scan_array_size; 139 uint_t scan_index; 140 struct kcage_stats_scan scans[KCAGE_STATS_NSCANS]; 141 }; 142 143 static struct kcage_stats kcage_stats; 144 static struct kcage_stats_scan kcage_stats_scan_zero; 145 146 /* 147 * No real need for atomics here. For the most part the incs and sets are 148 * done by the kernel cage thread. There are a few that are done by any 149 * number of other threads. Those cases are noted by comments. 150 */ 151 #define KCAGE_STAT_INCR(m) kcage_stats.m++ 152 153 #define KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v) 154 155 #define KCAGE_STAT_INCR_SCAN(m) \ 156 KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m) 157 158 #define KCAGE_STAT_NINCR_SCAN(m, v) \ 159 KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v) 160 161 #define KCAGE_STAT_SET(m, v) kcage_stats.m = (v) 162 163 #define KCAGE_STAT_SETZ(m, v) \ 164 if (kcage_stats.m == 0) kcage_stats.m = (v) 165 166 #define KCAGE_STAT_SET_SCAN(m, v) \ 167 KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v) 168 169 #define KCAGE_STAT_SETZ_SCAN(m, v) \ 170 KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v) 171 172 #define KCAGE_STAT_INC_SCAN_INDEX \ 173 KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \ 174 KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \ 175 kcage_stats.scan_index = \ 176 (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \ 177 kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero 178 179 #define KCAGE_STAT_INIT_SCAN_INDEX \ 180 kcage_stats.version = KCAGE_STATS_VERSION; \ 181 kcage_stats.size = sizeof (kcage_stats); \ 182 kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \ 183 kcage_stats.scan_index = 0 184 185 #else /* KCAGE_STATS */ 186 187 #define KCAGE_STAT_INCR(v) 188 #define KCAGE_STAT_NINCR(m, v) 189 #define KCAGE_STAT_INCR_SCAN(v) 190 #define KCAGE_STAT_NINCR_SCAN(m, v) 191 #define KCAGE_STAT_SET(m, v) 192 #define KCAGE_STAT_SETZ(m, v) 193 #define KCAGE_STAT_SET_SCAN(m, v) 194 #define KCAGE_STAT_SETZ_SCAN(m, v) 195 #define KCAGE_STAT_INC_SCAN_INDEX 196 #define KCAGE_STAT_INIT_SCAN_INDEX 197 198 #endif /* KCAGE_STATS */ 199 200 static kmutex_t kcage_throttle_mutex; /* protects kcage_throttle_cv */ 201 static kcondvar_t kcage_throttle_cv; 202 203 static kmutex_t kcage_cageout_mutex; /* protects cv and ready flag */ 204 static kcondvar_t kcage_cageout_cv; /* cageout thread naps here */ 205 static int kcage_cageout_ready; /* nonzero when cageout thread ready */ 206 kthread_id_t kcage_cageout_thread; /* to aid debugging */ 207 208 static kmutex_t kcage_range_mutex; /* proctects kcage_glist elements */ 209 210 /* 211 * Cage expansion happens within a range. 212 */ 213 struct kcage_glist { 214 struct kcage_glist *next; 215 pfn_t base; 216 pfn_t lim; 217 pfn_t curr; 218 int decr; 219 }; 220 221 static struct kcage_glist *kcage_glist; 222 static struct kcage_glist *kcage_current_glist; 223 224 /* 225 * The firstfree element is provided so that kmem_alloc can be avoided 226 * until that cage has somewhere to go. This is not currently a problem 227 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va. 228 */ 229 static struct kcage_glist kcage_glist_firstfree; 230 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree; 231 232 /* 233 * Miscellaneous forward references 234 */ 235 static struct kcage_glist *kcage_glist_alloc(void); 236 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **); 237 static void kcage_cageout(void); 238 static int kcage_invalidate_page(page_t *, pgcnt_t *); 239 static int kcage_setnoreloc_pages(page_t *, se_t); 240 241 /* 242 * Kernel Memory Cage counters and thresholds. 243 */ 244 int kcage_on = 0; 245 pgcnt_t kcage_freemem; 246 pgcnt_t kcage_needfree; 247 pgcnt_t kcage_lotsfree; 248 pgcnt_t kcage_desfree; 249 pgcnt_t kcage_minfree; 250 pgcnt_t kcage_throttlefree; 251 int kcage_maxwait = 10; /* in seconds */ 252 253 /* when we use lp for kmem we start the cage at a higher initial value */ 254 pgcnt_t kcage_kmemlp_mincage; 255 256 #ifdef DEBUG 257 pgcnt_t kcage_pagets; 258 #define KCAGEPAGETS_INC() kcage_pagets++ 259 #else 260 #define KCAGEPAGETS_INC() 261 #endif 262 263 /* 264 * Startup and Dynamic Reconfiguration interfaces. 265 * kcage_range_lock() 266 * kcage_range_unlock() 267 * kcage_range_islocked() 268 * kcage_range_add() 269 * kcage_range_del() 270 * kcage_init() 271 * kcage_set_thresholds() 272 */ 273 274 int 275 kcage_range_trylock(void) 276 { 277 return (mutex_tryenter(&kcage_range_mutex)); 278 } 279 280 void 281 kcage_range_lock(void) 282 { 283 mutex_enter(&kcage_range_mutex); 284 } 285 286 void 287 kcage_range_unlock(void) 288 { 289 mutex_exit(&kcage_range_mutex); 290 } 291 292 int 293 kcage_range_islocked(void) 294 { 295 return (MUTEX_HELD(&kcage_range_mutex)); 296 } 297 298 /* 299 * Called from page_get_contig_pages to get the approximate kcage pfn range 300 * for exclusion from search for contiguous pages. This routine is called 301 * without kcage_range lock (kcage routines can call page_get_contig_pages 302 * through page_relocate) and with the assumption, based on kcage_range_add, 303 * that kcage_current_glist always contain a valid pointer. 304 */ 305 306 int 307 kcage_current_pfn(pfn_t *pfncur) 308 { 309 struct kcage_glist *lp = kcage_current_glist; 310 311 ASSERT(kcage_on); 312 313 ASSERT(lp != NULL); 314 315 *pfncur = lp->curr; 316 317 return (lp->decr); 318 } 319 320 int 321 kcage_range_init(struct memlist *ml, int decr) 322 { 323 int ret = 0; 324 325 ASSERT(kcage_range_islocked()); 326 327 if (decr) { 328 while (ml->next != NULL) 329 ml = ml->next; 330 } 331 332 while (ml != NULL) { 333 ret = kcage_range_add(btop(ml->address), btop(ml->size), decr); 334 if (ret) 335 break; 336 337 ml = (decr ? ml->prev : ml->next); 338 } 339 340 return (ret); 341 } 342 343 /* 344 * Third arg controls direction of growth: 0: increasing pfns, 345 * 1: decreasing. 346 * Calls to add and delete must be protected by calls to 347 * kcage_range_lock() and kcage_range_unlock(). 348 */ 349 int 350 kcage_range_add(pfn_t base, pgcnt_t npgs, int decr) 351 { 352 struct kcage_glist *new, **lpp; 353 pfn_t lim; 354 355 ASSERT(kcage_range_islocked()); 356 357 ASSERT(npgs != 0); 358 if (npgs == 0) 359 return (EINVAL); 360 361 lim = base + npgs; 362 363 ASSERT(lim > base); 364 if (lim <= base) 365 return (EINVAL); 366 367 new = kcage_glist_alloc(); 368 if (new == NULL) { 369 return (ENOMEM); 370 } 371 372 new->base = base; 373 new->lim = lim; 374 new->decr = decr; 375 if (new->decr != 0) 376 new->curr = new->lim; 377 else 378 new->curr = new->base; 379 /* 380 * Any overlapping existing ranges are removed by deleting 381 * from the new list as we search for the tail. 382 */ 383 lpp = &kcage_glist; 384 while (*lpp != NULL) { 385 int ret; 386 ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new); 387 if (ret != 0) 388 return (ret); 389 lpp = &(*lpp)->next; 390 } 391 392 *lpp = new; 393 394 if (kcage_current_glist == NULL) { 395 kcage_current_glist = kcage_glist; 396 } 397 398 return (0); 399 } 400 401 /* 402 * Calls to add and delete must be protected by calls to 403 * kcage_range_lock() and kcage_range_unlock(). 404 */ 405 int 406 kcage_range_delete(pfn_t base, pgcnt_t npgs) 407 { 408 struct kcage_glist *lp; 409 pfn_t lim; 410 411 ASSERT(kcage_range_islocked()); 412 413 ASSERT(npgs != 0); 414 if (npgs == 0) 415 return (EINVAL); 416 417 lim = base + npgs; 418 419 ASSERT(lim > base); 420 if (lim <= base) 421 return (EINVAL); 422 423 /* 424 * Check if the delete is OK first as a number of elements 425 * might be involved and it will be difficult to go 426 * back and undo (can't just add the range back in). 427 */ 428 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 429 /* 430 * If there have been no pages allocated from this 431 * element, we don't need to check it. 432 */ 433 if ((lp->decr == 0 && lp->curr == lp->base) || 434 (lp->decr != 0 && lp->curr == lp->lim)) 435 continue; 436 /* 437 * If the element does not overlap, its OK. 438 */ 439 if (base >= lp->lim || lim <= lp->base) 440 continue; 441 /* 442 * Overlapping element: Does the range to be deleted 443 * overlap the area already used? If so fail. 444 */ 445 if (lp->decr == 0 && base < lp->curr && lim >= lp->base) { 446 return (EBUSY); 447 } 448 if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) { 449 return (EBUSY); 450 } 451 } 452 return (kcage_glist_delete(base, lim, &kcage_glist)); 453 } 454 455 /* 456 * Calls to add and delete must be protected by calls to 457 * kcage_range_lock() and kcage_range_unlock(). 458 * This routine gets called after successful Solaris memory 459 * delete operation from DR post memory delete routines. 460 */ 461 int 462 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs) 463 { 464 pfn_t lim; 465 466 ASSERT(kcage_range_islocked()); 467 468 ASSERT(npgs != 0); 469 if (npgs == 0) 470 return (EINVAL); 471 472 lim = base + npgs; 473 474 ASSERT(lim > base); 475 if (lim <= base) 476 return (EINVAL); 477 478 return (kcage_glist_delete(base, lim, &kcage_glist)); 479 } 480 481 /* 482 * No locking is required here as the whole operation is covered 483 * by the kcage_range_lock(). 484 */ 485 static struct kcage_glist * 486 kcage_glist_alloc(void) 487 { 488 struct kcage_glist *new; 489 490 if ((new = kcage_glist_freelist) != NULL) { 491 kcage_glist_freelist = new->next; 492 bzero(new, sizeof (*new)); 493 } else { 494 new = kmem_zalloc(sizeof (struct kcage_glist), KM_NOSLEEP); 495 } 496 return (new); 497 } 498 499 static void 500 kcage_glist_free(struct kcage_glist *lp) 501 { 502 lp->next = kcage_glist_freelist; 503 kcage_glist_freelist = lp; 504 } 505 506 static int 507 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp) 508 { 509 struct kcage_glist *lp, *prev = *lpp; 510 511 while ((lp = *lpp) != NULL) { 512 if (lim > lp->base && base < lp->lim) { 513 /* The delete range overlaps this element. */ 514 if (base <= lp->base && lim >= lp->lim) { 515 /* Delete whole element. */ 516 *lpp = lp->next; 517 if (lp == kcage_current_glist) { 518 /* This can never happen. */ 519 ASSERT(kcage_current_glist != prev); 520 kcage_current_glist = prev; 521 } 522 kcage_glist_free(lp); 523 continue; 524 } 525 526 /* Partial delete. */ 527 if (base > lp->base && lim < lp->lim) { 528 struct kcage_glist *new; 529 530 /* 531 * Remove a section from the middle, 532 * need to allocate a new element. 533 */ 534 new = kcage_glist_alloc(); 535 if (new == NULL) { 536 return (ENOMEM); 537 } 538 539 /* 540 * Tranfser unused range to new. 541 * Edit lp in place to preserve 542 * kcage_current_glist. 543 */ 544 new->decr = lp->decr; 545 if (new->decr != 0) { 546 new->base = lp->base; 547 new->lim = base; 548 new->curr = base; 549 550 lp->base = lim; 551 } else { 552 new->base = lim; 553 new->lim = lp->lim; 554 new->curr = new->base; 555 556 lp->lim = base; 557 } 558 559 /* Insert new. */ 560 new->next = lp->next; 561 lp->next = new; 562 lpp = &lp->next; 563 } else { 564 /* Delete part of current block. */ 565 if (base > lp->base) { 566 ASSERT(lim >= lp->lim); 567 ASSERT(base < lp->lim); 568 if (lp->decr != 0 && 569 lp->curr == lp->lim) 570 lp->curr = base; 571 lp->lim = base; 572 } else { 573 ASSERT(base <= lp->base); 574 ASSERT(lim > lp->base); 575 if (lp->decr == 0 && 576 lp->curr == lp->base) 577 lp->curr = lim; 578 lp->base = lim; 579 } 580 } 581 } 582 prev = *lpp; 583 lpp = &(*lpp)->next; 584 } 585 586 return (0); 587 } 588 589 /* 590 * The caller of kcage_get_pfn must hold the kcage_range_lock to make 591 * sure that there are no concurrent calls. The same lock 592 * must be obtained for range add and delete by calling 593 * kcage_range_lock() and kcage_range_unlock(). 594 */ 595 static pfn_t 596 kcage_get_pfn(void) 597 { 598 struct kcage_glist *lp; 599 pfn_t pfn; 600 601 ASSERT(kcage_range_islocked()); 602 603 lp = kcage_current_glist; 604 while (lp != NULL) { 605 if (lp->decr != 0) { 606 if (lp->curr != lp->base) { 607 pfn = --lp->curr; 608 return (pfn); 609 } 610 } else { 611 if (lp->curr != lp->lim) { 612 pfn = lp->curr++; 613 return (pfn); 614 } 615 } 616 617 lp = lp->next; 618 if (lp) 619 kcage_current_glist = lp; 620 } 621 622 return (PFN_INVALID); 623 } 624 625 /* 626 * Walk the physical address space of the cage. 627 * This routine does not guarantee to return PFNs in the order 628 * in which they were allocated to the cage. Instead, it walks 629 * each range as they appear on the growth list returning the PFNs 630 * range in ascending order. 631 * 632 * To begin scanning at lower edge of cage, reset should be nonzero. 633 * To step through cage, reset should be zero. 634 * 635 * PFN_INVALID will be returned when the upper end of the cage is 636 * reached -- indicating a full scan of the cage has been completed since 637 * previous reset. PFN_INVALID will continue to be returned until 638 * kcage_walk_cage is reset. 639 * 640 * It is possible to receive a PFN_INVALID result on reset if a growth 641 * list is not installed or if none of the PFNs in the installed list have 642 * been allocated to the cage. In otherwords, there is no cage. 643 * 644 * Caller need not hold kcage_range_lock while calling this function 645 * as the front part of the list is static - pages never come out of 646 * the cage. 647 * 648 * The caller is expected to only be kcage_cageout(). 649 */ 650 static pfn_t 651 kcage_walk_cage(int reset) 652 { 653 static struct kcage_glist *lp = NULL; 654 static pfn_t pfn; 655 656 if (reset) 657 lp = NULL; 658 if (lp == NULL) { 659 lp = kcage_glist; 660 pfn = PFN_INVALID; 661 } 662 again: 663 if (pfn == PFN_INVALID) { 664 if (lp == NULL) 665 return (PFN_INVALID); 666 667 if (lp->decr != 0) { 668 /* 669 * In this range the cage grows from the highest 670 * address towards the lowest. 671 * Arrange to return pfns from curr to lim-1, 672 * inclusive, in ascending order. 673 */ 674 675 pfn = lp->curr; 676 } else { 677 /* 678 * In this range the cage grows from the lowest 679 * address towards the highest. 680 * Arrange to return pfns from base to curr, 681 * inclusive, in ascending order. 682 */ 683 684 pfn = lp->base; 685 } 686 } 687 688 if (lp->decr != 0) { /* decrementing pfn */ 689 if (pfn == lp->lim) { 690 /* Don't go beyond the static part of the glist. */ 691 if (lp == kcage_current_glist) 692 lp = NULL; 693 else 694 lp = lp->next; 695 pfn = PFN_INVALID; 696 goto again; 697 } 698 699 ASSERT(pfn >= lp->curr && pfn < lp->lim); 700 } else { /* incrementing pfn */ 701 if (pfn == lp->curr) { 702 /* Don't go beyond the static part of the glist. */ 703 if (lp == kcage_current_glist) 704 lp = NULL; 705 else 706 lp = lp->next; 707 pfn = PFN_INVALID; 708 goto again; 709 } 710 711 ASSERT(pfn >= lp->base && pfn < lp->curr); 712 } 713 714 return (pfn++); 715 } 716 717 /* 718 * Callback functions for to recalc cage thresholds after 719 * Kphysm memory add/delete operations. 720 */ 721 /*ARGSUSED*/ 722 static void 723 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages) 724 { 725 kcage_recalc_thresholds(); 726 } 727 728 /*ARGSUSED*/ 729 static int 730 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages) 731 { 732 /* TODO: when should cage refuse memory delete requests? */ 733 return (0); 734 } 735 736 /*ARGSUSED*/ 737 static void 738 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled) 739 { 740 kcage_recalc_thresholds(); 741 } 742 743 static kphysm_setup_vector_t kcage_kphysm_vectors = { 744 KPHYSM_SETUP_VECTOR_VERSION, 745 kcage_kphysm_postadd_cb, 746 kcage_kphysm_predel_cb, 747 kcage_kphysm_postdel_cb 748 }; 749 750 /* 751 * This is called before a CPR suspend and after a CPR resume. We have to 752 * turn off kcage_cageout_ready before a suspend, and turn it back on after a 753 * restart. 754 */ 755 /*ARGSUSED*/ 756 static boolean_t 757 kcage_cageout_cpr(void *arg, int code) 758 { 759 if (code == CB_CODE_CPR_CHKPT) { 760 ASSERT(kcage_cageout_ready); 761 kcage_cageout_ready = 0; 762 return (B_TRUE); 763 } else if (code == CB_CODE_CPR_RESUME) { 764 ASSERT(kcage_cageout_ready == 0); 765 kcage_cageout_ready = 1; 766 return (B_TRUE); 767 } 768 return (B_FALSE); 769 } 770 771 /* 772 * kcage_recalc_preferred_size() increases initial cage size to improve large 773 * page availability when lp for kmem is enabled and kpr is disabled 774 */ 775 static pgcnt_t 776 kcage_recalc_preferred_size(pgcnt_t preferred_size) 777 { 778 if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) { 779 pgcnt_t lpmincage = kcage_kmemlp_mincage; 780 if (lpmincage == 0) { 781 lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8), 782 segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE; 783 } 784 kcage_kmemlp_mincage = MIN(lpmincage, 785 (segkmem_kmemlp_max / PAGESIZE)); 786 preferred_size = MAX(kcage_kmemlp_mincage, preferred_size); 787 } 788 return (preferred_size); 789 } 790 791 /* 792 * Kcage_init() builds the cage and initializes the cage thresholds. 793 * The size of the cage is determined by the argument preferred_size. 794 * or the actual amount of memory, whichever is smaller. 795 */ 796 void 797 kcage_init(pgcnt_t preferred_size) 798 { 799 pgcnt_t wanted; 800 pfn_t pfn; 801 page_t *pp; 802 extern struct vnode kvp; 803 extern void page_list_noreloc_startup(page_t *); 804 805 ASSERT(!kcage_on); 806 ASSERT(kcage_range_islocked()); 807 808 /* increase preferred cage size for lp for kmem */ 809 preferred_size = kcage_recalc_preferred_size(preferred_size); 810 811 /* Debug note: initialize this now so early expansions can stat */ 812 KCAGE_STAT_INIT_SCAN_INDEX; 813 814 /* 815 * Initialize cage thresholds and install kphysm callback. 816 * If we can't arrange to have the thresholds track with 817 * available physical memory, then the cage thresholds may 818 * end up over time at levels that adversly effect system 819 * performance; so, bail out. 820 */ 821 kcage_recalc_thresholds(); 822 if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) { 823 ASSERT(0); /* Catch this in DEBUG kernels. */ 824 return; 825 } 826 827 /* 828 * Limit startup cage size within the range of kcage_minfree 829 * and availrmem, inclusively. 830 */ 831 wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem); 832 833 /* 834 * Construct the cage. PFNs are allocated from the glist. It 835 * is assumed that the list has been properly ordered for the 836 * platform by the platform code. Typically, this is as simple 837 * as calling kcage_range_init(phys_avail, decr), where decr is 838 * 1 if the kernel has been loaded into upper end of physical 839 * memory, or 0 if the kernel has been loaded at the low end. 840 * 841 * Note: it is assumed that we are in the startup flow, so there 842 * is no reason to grab the page lock. 843 */ 844 kcage_freemem = 0; 845 pfn = PFN_INVALID; /* prime for alignment test */ 846 while (wanted != 0) { 847 if ((pfn = kcage_get_pfn()) == PFN_INVALID) 848 break; 849 850 if ((pp = page_numtopp_nolock(pfn)) != NULL) { 851 KCAGEPAGETS_INC(); 852 /* 853 * Set the noreloc state on the page. 854 * If the page is free and not already 855 * on the noreloc list then move it. 856 */ 857 if (PP_ISFREE(pp)) { 858 if (PP_ISNORELOC(pp) == 0) 859 page_list_noreloc_startup(pp); 860 } else { 861 ASSERT(pp->p_szc == 0); 862 PP_SETNORELOC(pp); 863 } 864 } 865 866 wanted -= 1; 867 } 868 869 /* 870 * Need to go through and find kernel allocated pages 871 * and capture them into the Cage. These will primarily 872 * be pages gotten through boot_alloc(). 873 */ 874 if (kvp.v_pages) { 875 876 pp = kvp.v_pages; 877 do { 878 ASSERT(!PP_ISFREE(pp)); 879 ASSERT(pp->p_szc == 0); 880 PP_SETNORELOC(pp); 881 } while ((pp = pp->p_vpnext) != kvp.v_pages); 882 883 } 884 885 kcage_on = 1; 886 887 /* 888 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend() 889 * after the cageout thread is blocked, and executes from cpr_resume() 890 * before the cageout thread is restarted. By executing in this class, 891 * we are assured that the kernel cage thread won't miss wakeup calls 892 * and also CPR's larger kmem_alloc requests will not fail after 893 * CPR shuts down the cageout kernel thread. 894 */ 895 (void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL, 896 "cageout"); 897 898 /* 899 * Coalesce pages to improve large page availability. A better fix 900 * would to coalesce pages as they are included in the cage 901 */ 902 if (SEGKMEM_USE_LARGEPAGES) { 903 extern void page_freelist_coalesce_all(int mnode); 904 extern int max_mem_nodes; 905 int mnode, max_mnodes = max_mem_nodes; 906 for (mnode = 0; mnode < max_mnodes; mnode++) { 907 page_freelist_coalesce_all(mnode); 908 } 909 } 910 } 911 912 void 913 kcage_recalc_thresholds() 914 { 915 static int first = 1; 916 static pgcnt_t init_lotsfree; 917 static pgcnt_t init_desfree; 918 static pgcnt_t init_minfree; 919 static pgcnt_t init_throttlefree; 920 921 /* TODO: any reason to take more care than this with live editing? */ 922 mutex_enter(&kcage_cageout_mutex); 923 mutex_enter(&freemem_lock); 924 925 if (first) { 926 first = 0; 927 init_lotsfree = kcage_lotsfree; 928 init_desfree = kcage_desfree; 929 init_minfree = kcage_minfree; 930 init_throttlefree = kcage_throttlefree; 931 } else { 932 kcage_lotsfree = init_lotsfree; 933 kcage_desfree = init_desfree; 934 kcage_minfree = init_minfree; 935 kcage_throttlefree = init_throttlefree; 936 } 937 938 if (kcage_lotsfree == 0) 939 kcage_lotsfree = MAX(32, total_pages / 256); 940 941 if (kcage_minfree == 0) 942 kcage_minfree = MAX(32, kcage_lotsfree / 2); 943 944 if (kcage_desfree == 0) 945 kcage_desfree = MAX(32, kcage_minfree); 946 947 if (kcage_throttlefree == 0) 948 kcage_throttlefree = MAX(32, kcage_minfree / 2); 949 950 mutex_exit(&freemem_lock); 951 mutex_exit(&kcage_cageout_mutex); 952 953 if (kcage_cageout_ready) { 954 if (kcage_freemem < kcage_desfree) 955 kcage_cageout_wakeup(); 956 957 if (kcage_needfree) { 958 mutex_enter(&kcage_throttle_mutex); 959 cv_broadcast(&kcage_throttle_cv); 960 mutex_exit(&kcage_throttle_mutex); 961 } 962 } 963 } 964 965 /* 966 * Pageout interface: 967 * kcage_cageout_init() 968 */ 969 void 970 kcage_cageout_init() 971 { 972 if (kcage_on) { 973 mutex_enter(&kcage_cageout_mutex); 974 975 kcage_cageout_thread = thread_create(NULL, 0, kcage_cageout, 976 NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1); 977 978 mutex_exit(&kcage_cageout_mutex); 979 } 980 } 981 982 983 /* 984 * VM Interfaces: 985 * kcage_create_throttle() 986 * kcage_freemem_add() 987 * kcage_freemem_sub() 988 */ 989 990 /* 991 * Wakeup cageout thread and throttle waiting for the number of pages 992 * requested to become available. For non-critical requests, a 993 * timeout is added, since freemem accounting is separate from cage 994 * freemem accounting: it's possible for us to get stuck and not make 995 * forward progress even though there was sufficient freemem before 996 * arriving here. 997 */ 998 int 999 kcage_create_throttle(pgcnt_t npages, int flags) 1000 { 1001 int niter = 0; 1002 pgcnt_t lastfree; 1003 int enough = kcage_freemem > kcage_throttlefree + npages; 1004 1005 KCAGE_STAT_INCR(kct_calls); /* unprotected incr. */ 1006 1007 kcage_cageout_wakeup(); /* just to be sure */ 1008 KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. */ 1009 1010 /* 1011 * Obviously, we can't throttle the cageout thread since 1012 * we depend on it. We also can't throttle the panic thread. 1013 */ 1014 if (curthread == kcage_cageout_thread || panicstr) { 1015 KCAGE_STAT_INCR(kct_cageout); /* unprotected incr. */ 1016 return (KCT_CRIT); 1017 } 1018 1019 /* 1020 * Don't throttle threads which are critical for proper 1021 * vm management if we're above kcage_throttlefree or 1022 * if freemem is very low. 1023 */ 1024 if (NOMEMWAIT()) { 1025 if (enough) { 1026 KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */ 1027 return (KCT_CRIT); 1028 } else if (freemem < minfree) { 1029 KCAGE_STAT_INCR(kct_critical); /* unprotected incr. */ 1030 return (KCT_CRIT); 1031 } 1032 } 1033 1034 /* 1035 * Don't throttle real-time threads. 1036 */ 1037 if (DISP_PRIO(curthread) > maxclsyspri) { 1038 KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */ 1039 return (KCT_CRIT); 1040 } 1041 1042 /* 1043 * Cause all other threads (which are assumed to not be 1044 * critical to cageout) to wait here until their request 1045 * can be satisfied. Be a little paranoid and wake the 1046 * kernel cage on each loop through this logic. 1047 */ 1048 while (kcage_freemem < kcage_throttlefree + npages) { 1049 ASSERT(kcage_on); 1050 1051 lastfree = kcage_freemem; 1052 1053 if (kcage_cageout_ready) { 1054 mutex_enter(&kcage_throttle_mutex); 1055 1056 kcage_needfree += npages; 1057 KCAGE_STAT_INCR(kct_wait); 1058 1059 kcage_cageout_wakeup(); 1060 KCAGE_STAT_INCR(kct_cagewake); 1061 1062 cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex); 1063 1064 kcage_needfree -= npages; 1065 1066 mutex_exit(&kcage_throttle_mutex); 1067 } else { 1068 /* 1069 * NOTE: atomics are used just in case we enter 1070 * mp operation before the cageout thread is ready. 1071 */ 1072 atomic_add_long(&kcage_needfree, npages); 1073 1074 kcage_cageout_wakeup(); 1075 KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. */ 1076 1077 atomic_add_long(&kcage_needfree, -npages); 1078 } 1079 1080 if ((flags & PG_WAIT) == 0) { 1081 if (kcage_freemem > lastfree) { 1082 KCAGE_STAT_INCR(kct_progress); 1083 niter = 0; 1084 } else { 1085 KCAGE_STAT_INCR(kct_noprogress); 1086 if (++niter >= kcage_maxwait) { 1087 KCAGE_STAT_INCR(kct_timeout); 1088 return (KCT_FAILURE); 1089 } 1090 } 1091 } 1092 } 1093 return (KCT_NONCRIT); 1094 } 1095 1096 void 1097 kcage_freemem_add(pgcnt_t npages) 1098 { 1099 extern void wakeup_pcgs(void); 1100 1101 atomic_add_long(&kcage_freemem, npages); 1102 1103 wakeup_pcgs(); /* wakeup threads in pcgs() */ 1104 1105 if (kcage_needfree != 0 && 1106 kcage_freemem >= (kcage_throttlefree + kcage_needfree)) { 1107 1108 mutex_enter(&kcage_throttle_mutex); 1109 cv_broadcast(&kcage_throttle_cv); 1110 KCAGE_STAT_INCR(kfa_trottlewake); 1111 mutex_exit(&kcage_throttle_mutex); 1112 } 1113 } 1114 1115 void 1116 kcage_freemem_sub(pgcnt_t npages) 1117 { 1118 atomic_add_long(&kcage_freemem, -npages); 1119 1120 if (kcage_freemem < kcage_desfree) { 1121 kcage_cageout_wakeup(); 1122 KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */ 1123 } 1124 } 1125 1126 /* 1127 * return 0 on failure and 1 on success. 1128 */ 1129 static int 1130 kcage_setnoreloc_pages(page_t *rootpp, se_t se) 1131 { 1132 pgcnt_t npgs, i; 1133 page_t *pp; 1134 pfn_t rootpfn = page_pptonum(rootpp); 1135 uint_t szc; 1136 1137 ASSERT(!PP_ISFREE(rootpp)); 1138 ASSERT(PAGE_LOCKED_SE(rootpp, se)); 1139 if (!group_page_trylock(rootpp, se)) { 1140 return (0); 1141 } 1142 szc = rootpp->p_szc; 1143 if (szc == 0) { 1144 /* 1145 * The szc of a locked page can only change for pages that are 1146 * non-swapfs (i.e. anonymous memory) file system pages. 1147 */ 1148 ASSERT(rootpp->p_vnode != NULL && 1149 rootpp->p_vnode != &kvp && 1150 !IS_SWAPFSVP(rootpp->p_vnode)); 1151 PP_SETNORELOC(rootpp); 1152 return (1); 1153 } 1154 npgs = page_get_pagecnt(szc); 1155 ASSERT(IS_P2ALIGNED(rootpfn, npgs)); 1156 pp = rootpp; 1157 for (i = 0; i < npgs; i++, pp = page_next(pp)) { 1158 ASSERT(PAGE_LOCKED_SE(pp, se)); 1159 ASSERT(!PP_ISFREE(pp)); 1160 ASSERT(pp->p_szc == szc); 1161 PP_SETNORELOC(pp); 1162 } 1163 group_page_unlock(rootpp); 1164 return (1); 1165 } 1166 1167 /* 1168 * Attempt to convert page to a caged page (set the P_NORELOC flag). 1169 * If successful and pages is free, move page to the tail of whichever 1170 * list it is on. 1171 * Returns: 1172 * EBUSY page already locked, assimilated but not free. 1173 * ENOMEM page assimilated, but memory too low to relocate. Page not free. 1174 * EAGAIN page not assimilated. Page not free. 1175 * ERANGE page assimilated. Page not root. 1176 * 0 page assimilated. Page free. 1177 * *nfreedp number of pages freed. 1178 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way 1179 * to distinguish between a page that was already a NORELOC page from 1180 * those newly converted to NORELOC pages by this invocation of 1181 * kcage_assimilate_page. 1182 */ 1183 static int 1184 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp) 1185 { 1186 if (page_trylock(pp, SE_EXCL)) { 1187 if (PP_ISNORELOC(pp)) { 1188 check_free_and_return: 1189 if (PP_ISFREE(pp)) { 1190 page_unlock(pp); 1191 *nfreedp = 0; 1192 return (0); 1193 } else { 1194 page_unlock(pp); 1195 return (EBUSY); 1196 } 1197 /*NOTREACHED*/ 1198 } 1199 } else { 1200 if (page_trylock(pp, SE_SHARED)) { 1201 if (PP_ISNORELOC(pp)) 1202 goto check_free_and_return; 1203 } else 1204 return (EAGAIN); 1205 1206 if (!PP_ISFREE(pp)) { 1207 page_unlock(pp); 1208 return (EAGAIN); 1209 } 1210 1211 /* 1212 * Need to upgrade the lock on it and set the NORELOC 1213 * bit. If it is free then remove it from the free 1214 * list so that the platform free list code can keep 1215 * NORELOC pages where they should be. 1216 */ 1217 /* 1218 * Before doing anything, get the exclusive lock. 1219 * This may fail (eg ISM pages are left shared locked). 1220 * If the page is free this will leave a hole in the 1221 * cage. There is no solution yet to this. 1222 */ 1223 if (!page_tryupgrade(pp)) { 1224 page_unlock(pp); 1225 return (EAGAIN); 1226 } 1227 } 1228 1229 ASSERT(PAGE_EXCL(pp)); 1230 1231 if (PP_ISFREE(pp)) { 1232 int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST; 1233 1234 page_list_sub(pp, which | PG_LIST_ISCAGE); 1235 ASSERT(pp->p_szc == 0); 1236 PP_SETNORELOC(pp); 1237 page_list_add(pp, which | PG_LIST_TAIL | PG_LIST_ISCAGE); 1238 1239 page_unlock(pp); 1240 *nfreedp = 1; 1241 return (0); 1242 } else { 1243 if (pp->p_szc != 0) { 1244 if (!kcage_setnoreloc_pages(pp, SE_EXCL)) { 1245 page_unlock(pp); 1246 return (EAGAIN); 1247 } 1248 ASSERT(PP_ISNORELOC(pp)); 1249 } else { 1250 PP_SETNORELOC(pp); 1251 } 1252 return (kcage_invalidate_page(pp, nfreedp)); 1253 } 1254 /*NOTREACHED*/ 1255 } 1256 1257 static int 1258 kcage_expand() 1259 { 1260 int did_something = 0; 1261 1262 spgcnt_t wanted; 1263 pfn_t pfn; 1264 page_t *pp; 1265 /* TODO: we don't really need n any more? */ 1266 pgcnt_t n; 1267 pgcnt_t nf, nfreed; 1268 1269 /* 1270 * Expand the cage if available cage memory is really low. Calculate 1271 * the amount required to return kcage_freemem to the level of 1272 * kcage_lotsfree, or to satisfy throttled requests, whichever is 1273 * more. It is rare for their sum to create an artificial threshold 1274 * above kcage_lotsfree, but it is possible. 1275 * 1276 * Exit early if expansion amount is equal to or less than zero. 1277 * (<0 is possible if kcage_freemem rises suddenly.) 1278 * 1279 * Exit early when the global page pool (apparently) does not 1280 * have enough free pages to page_relocate() even a single page. 1281 */ 1282 wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree) 1283 - kcage_freemem; 1284 if (wanted <= 0) 1285 return (0); 1286 else if (freemem < pageout_reserve + 1) { 1287 KCAGE_STAT_INCR(ke_lowfreemem); 1288 return (0); 1289 } 1290 1291 /* 1292 * Try to get the range list lock. If the lock is already 1293 * held, then don't get stuck here waiting for it. 1294 */ 1295 if (!kcage_range_trylock()) 1296 return (0); 1297 1298 KCAGE_STAT_INCR(ke_calls); 1299 KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted); 1300 1301 /* 1302 * Assimilate more pages from the global page pool into the cage. 1303 */ 1304 n = 0; /* number of pages PP_SETNORELOC'd */ 1305 nf = 0; /* number of those actually free */ 1306 while (kcage_on && nf < wanted) { 1307 pfn = kcage_get_pfn(); 1308 if (pfn == PFN_INVALID) { /* eek! no where to grow */ 1309 KCAGE_STAT_INCR(ke_nopfn); 1310 goto terminate; 1311 } 1312 1313 KCAGE_STAT_INCR_SCAN(ke_examined); 1314 1315 if ((pp = page_numtopp_nolock(pfn)) == NULL) { 1316 KCAGE_STAT_INCR(ke_nopaget); 1317 continue; 1318 } 1319 KCAGEPAGETS_INC(); 1320 /* 1321 * Sanity check. Skip this pfn if it is 1322 * being deleted. 1323 */ 1324 if (pfn_is_being_deleted(pfn)) { 1325 KCAGE_STAT_INCR(ke_deleting); 1326 continue; 1327 } 1328 1329 /* 1330 * NORELOC is only set at boot-time or by this routine 1331 * under the kcage_range_mutex lock which is currently 1332 * held. This means we can do a fast check here before 1333 * locking the page in kcage_assimilate_page. 1334 */ 1335 if (PP_ISNORELOC(pp)) { 1336 KCAGE_STAT_INCR(ke_isnoreloc); 1337 continue; 1338 } 1339 1340 switch (kcage_assimilate_page(pp, &nfreed)) { 1341 case 0: /* assimilated, page is free */ 1342 KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed); 1343 did_something = 1; 1344 nf += nfreed; 1345 n++; 1346 break; 1347 1348 case EBUSY: /* assimilated, page not free */ 1349 case ERANGE: /* assimilated, page not root */ 1350 KCAGE_STAT_INCR_SCAN(ke_gotone); 1351 did_something = 1; 1352 n++; 1353 break; 1354 1355 case ENOMEM: /* assimilated, but no mem */ 1356 KCAGE_STAT_INCR(ke_terminate); 1357 did_something = 1; 1358 n++; 1359 goto terminate; 1360 1361 case EAGAIN: /* can't assimilate */ 1362 KCAGE_STAT_INCR_SCAN(ke_lefthole); 1363 break; 1364 1365 default: /* catch this with debug kernels */ 1366 ASSERT(0); 1367 break; 1368 } 1369 } 1370 1371 /* 1372 * Realign cage edge with the nearest physical address 1373 * boundry for big pages. This is done to give us a 1374 * better chance of actually getting usable big pages 1375 * in the cage. 1376 */ 1377 1378 terminate: 1379 kcage_range_unlock(); 1380 1381 return (did_something); 1382 } 1383 1384 /* 1385 * Relocate page opp (Original Page Pointer) from cage pool to page rpp 1386 * (Replacement Page Pointer) in the global pool. Page opp will be freed 1387 * if relocation is successful, otherwise it is only unlocked. 1388 * On entry, page opp must be exclusively locked and not free. 1389 * *nfreedp: number of pages freed. 1390 */ 1391 static int 1392 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp) 1393 { 1394 page_t *opp = pp; 1395 page_t *rpp = NULL; 1396 spgcnt_t npgs; 1397 int result; 1398 1399 ASSERT(!PP_ISFREE(opp)); 1400 ASSERT(PAGE_EXCL(opp)); 1401 1402 result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL); 1403 *nfreedp = npgs; 1404 if (result == 0) { 1405 while (npgs-- > 0) { 1406 page_t *tpp; 1407 1408 ASSERT(rpp != NULL); 1409 tpp = rpp; 1410 page_sub(&rpp, tpp); 1411 page_unlock(tpp); 1412 } 1413 1414 ASSERT(rpp == NULL); 1415 1416 return (0); /* success */ 1417 } 1418 1419 page_unlock(opp); 1420 return (result); 1421 } 1422 1423 /* 1424 * Based on page_invalidate_pages() 1425 * 1426 * Kcage_invalidate_page() uses page_relocate() twice. Both instances 1427 * of use must be updated to match the new page_relocate() when it 1428 * becomes available. 1429 * 1430 * Return result of kcage_relocate_page or zero if page was directly freed. 1431 * *nfreedp: number of pages freed. 1432 */ 1433 static int 1434 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp) 1435 { 1436 int result; 1437 1438 #if defined(__sparc) 1439 extern struct vnode prom_ppages; 1440 ASSERT(pp->p_vnode != &prom_ppages); 1441 #endif /* __sparc */ 1442 1443 ASSERT(!PP_ISFREE(pp)); 1444 ASSERT(PAGE_EXCL(pp)); 1445 1446 /* 1447 * Is this page involved in some I/O? shared? 1448 * The page_struct_lock need not be acquired to 1449 * examine these fields since the page has an 1450 * "exclusive" lock. 1451 */ 1452 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1453 result = kcage_relocate_page(pp, nfreedp); 1454 #ifdef KCAGE_STATS 1455 if (result == 0) 1456 KCAGE_STAT_INCR_SCAN(kip_reloclocked); 1457 else if (result == ENOMEM) 1458 KCAGE_STAT_INCR_SCAN(kip_nomem); 1459 #endif 1460 return (result); 1461 } 1462 1463 ASSERT(pp->p_vnode->v_type != VCHR); 1464 1465 /* 1466 * Unload the mappings and check if mod bit is set. 1467 */ 1468 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1469 1470 if (hat_ismod(pp)) { 1471 result = kcage_relocate_page(pp, nfreedp); 1472 #ifdef KCAGE_STATS 1473 if (result == 0) 1474 KCAGE_STAT_INCR_SCAN(kip_relocmod); 1475 else if (result == ENOMEM) 1476 KCAGE_STAT_INCR_SCAN(kip_nomem); 1477 #endif 1478 return (result); 1479 } 1480 1481 if (!page_try_demote_pages(pp)) { 1482 KCAGE_STAT_INCR_SCAN(kip_demotefailed); 1483 page_unlock(pp); 1484 return (EAGAIN); 1485 } 1486 1487 page_destroy(pp, 0); 1488 KCAGE_STAT_INCR_SCAN(kip_destroy); 1489 *nfreedp = 1; 1490 return (0); 1491 } 1492 1493 static void 1494 kcage_cageout() 1495 { 1496 pfn_t pfn; 1497 page_t *pp; 1498 callb_cpr_t cprinfo; 1499 int did_something; 1500 int scan_again; 1501 pfn_t start_pfn; 1502 int pass; 1503 int last_pass; 1504 int pages_skipped; 1505 int shared_skipped; 1506 uint_t shared_level = 8; 1507 pgcnt_t nfreed; 1508 #ifdef KCAGE_STATS 1509 clock_t scan_start; 1510 #endif 1511 1512 CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex, 1513 callb_generic_cpr, "cageout"); 1514 1515 mutex_enter(&kcage_cageout_mutex); 1516 1517 pfn = PFN_INVALID; /* force scan reset */ 1518 start_pfn = PFN_INVALID; /* force init with 1st cage pfn */ 1519 kcage_cageout_ready = 1; /* switch kcage_cageout_wakeup mode */ 1520 1521 loop: 1522 /* 1523 * Wait here. Sooner or later, kcage_freemem_sub() will notice 1524 * that kcage_freemem is less than kcage_desfree. When it does 1525 * notice, kcage_freemem_sub() will wake us up via call to 1526 * kcage_cageout_wakeup(). 1527 */ 1528 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1529 cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex); 1530 CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex); 1531 1532 KCAGE_STAT_INCR(kt_wakeups); 1533 KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem); 1534 KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem); 1535 pass = 0; 1536 last_pass = 0; 1537 1538 #ifdef KCAGE_STATS 1539 scan_start = lbolt; 1540 #endif 1541 1542 again: 1543 if (!kcage_on) 1544 goto loop; 1545 1546 KCAGE_STAT_INCR(kt_scans); 1547 KCAGE_STAT_INCR_SCAN(kt_passes); 1548 1549 did_something = 0; 1550 pages_skipped = 0; 1551 shared_skipped = 0; 1552 while ((kcage_freemem < kcage_lotsfree || kcage_needfree) && 1553 (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) { 1554 1555 if (start_pfn == PFN_INVALID) 1556 start_pfn = pfn; 1557 else if (start_pfn == pfn) { 1558 last_pass = pass; 1559 pass += 1; 1560 /* 1561 * Did a complete walk of kernel cage, but didn't free 1562 * any pages. If only one cpu is online then 1563 * stop kernel cage walk and try expanding. 1564 */ 1565 if (ncpus_online == 1 && did_something == 0) { 1566 KCAGE_STAT_INCR(kt_cageout_break); 1567 break; 1568 } 1569 } 1570 1571 pp = page_numtopp_nolock(pfn); 1572 if (pp == NULL) { 1573 continue; 1574 } 1575 1576 KCAGE_STAT_INCR_SCAN(kt_examined); 1577 1578 /* 1579 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside 1580 * of the lock. If one is missed it will be seen next 1581 * time through. 1582 * 1583 * Skip non-caged-pages. These pages can exist in the cage 1584 * because, if during cage expansion, a page is 1585 * encountered that is long-term locked the lock prevents the 1586 * expansion logic from setting the P_NORELOC flag. Hence, 1587 * non-caged-pages surrounded by caged-pages. 1588 */ 1589 if (!PP_ISNORELOC(pp)) { 1590 switch (kcage_assimilate_page(pp, &nfreed)) { 1591 case 0: 1592 did_something = 1; 1593 KCAGE_STAT_NINCR_SCAN(kt_gotonefree, 1594 nfreed); 1595 break; 1596 1597 case EBUSY: 1598 case ERANGE: 1599 did_something = 1; 1600 KCAGE_STAT_INCR_SCAN(kt_gotone); 1601 break; 1602 1603 case EAGAIN: 1604 case ENOMEM: 1605 break; 1606 1607 default: 1608 /* catch this with debug kernels */ 1609 ASSERT(0); 1610 break; 1611 } 1612 1613 continue; 1614 } else { 1615 int prm; 1616 1617 if (PP_ISFREE(pp)) { 1618 continue; 1619 } 1620 1621 if ((pp->p_vnode == &kvp && pp->p_lckcnt > 0) || 1622 !page_trylock(pp, SE_EXCL)) { 1623 KCAGE_STAT_INCR_SCAN(kt_cantlock); 1624 continue; 1625 } 1626 1627 /* P_NORELOC bit should not have gone away. */ 1628 ASSERT(PP_ISNORELOC(pp)); 1629 if (PP_ISFREE(pp) || (pp->p_vnode == &kvp && 1630 pp->p_lckcnt > 0)) { 1631 page_unlock(pp); 1632 continue; 1633 } 1634 1635 KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level); 1636 if (hat_page_getshare(pp) > shared_level) { 1637 page_unlock(pp); 1638 pages_skipped = 1; 1639 shared_skipped = 1; 1640 KCAGE_STAT_INCR_SCAN(kt_skipshared); 1641 continue; 1642 } 1643 1644 /* 1645 * In pass {0, 1}, skip page if ref bit is set. 1646 * In pass {0, 1, 2}, skip page if mod bit is set. 1647 */ 1648 prm = hat_pagesync(pp, 1649 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 1650 1651 /* On first pass ignore ref'd pages */ 1652 if (pass <= 1 && (prm & P_REF)) { 1653 KCAGE_STAT_INCR_SCAN(kt_skiprefd); 1654 pages_skipped = 1; 1655 page_unlock(pp); 1656 continue; 1657 } 1658 1659 /* On pass 2, page_destroy if mod bit is not set */ 1660 if (pass <= 2) { 1661 if (pp->p_szc != 0 || (prm & P_MOD) || 1662 pp->p_lckcnt || pp->p_cowcnt) { 1663 pages_skipped = 1; 1664 page_unlock(pp); 1665 } else { 1666 1667 /* 1668 * unload the mappings before 1669 * checking if mod bit is set 1670 */ 1671 (void) hat_pageunload(pp, 1672 HAT_FORCE_PGUNLOAD); 1673 1674 /* 1675 * skip this page if modified 1676 */ 1677 if (hat_ismod(pp)) { 1678 pages_skipped = 1; 1679 page_unlock(pp); 1680 continue; 1681 } 1682 1683 KCAGE_STAT_INCR_SCAN(kt_destroy); 1684 page_destroy(pp, 0); 1685 did_something = 1; 1686 } 1687 continue; 1688 } 1689 1690 if (kcage_invalidate_page(pp, &nfreed) == 0) { 1691 did_something = 1; 1692 KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed); 1693 } 1694 1695 /* 1696 * No need to drop the page lock here. 1697 * Kcage_invalidate_page has done that for us 1698 * either explicitly or through a page_free. 1699 */ 1700 } 1701 } 1702 1703 /* 1704 * Expand the cage only if available cage memory is really low. 1705 * This test is done only after a complete scan of the cage. 1706 * The reason for not checking and expanding more often is to 1707 * avoid rapid expansion of the cage. Naturally, scanning the 1708 * cage takes time. So by scanning first, we use that work as a 1709 * delay loop in between expand decisions. 1710 */ 1711 1712 scan_again = 0; 1713 if (kcage_freemem < kcage_minfree || kcage_needfree) { 1714 /* 1715 * Kcage_expand() will return a non-zero value if it was 1716 * able to expand the cage -- whether or not the new 1717 * pages are free and immediately usable. If non-zero, 1718 * we do another scan of the cage. The pages might be 1719 * freed during that scan or by time we get back here. 1720 * If not, we will attempt another expansion. 1721 * However, if kcage_expand() returns zero, then it was 1722 * unable to expand the cage. This is the case when the 1723 * the growth list is exausted, therefore no work was done 1724 * and there is no reason to scan the cage again. 1725 * Note: Kernel cage scan is not repeated on single-cpu 1726 * system to avoid kernel cage thread hogging cpu. 1727 */ 1728 if (pass <= 3 && pages_skipped && ncpus_online > 1) 1729 scan_again = 1; 1730 else 1731 (void) kcage_expand(); /* don't scan again */ 1732 } else if (kcage_freemem < kcage_lotsfree) { 1733 /* 1734 * If available cage memory is less than abundant 1735 * and a full scan of the cage has not yet been completed, 1736 * or a scan has completed and some work was performed, 1737 * or pages were skipped because of sharing, 1738 * or we simply have not yet completed two passes, 1739 * then do another scan. 1740 */ 1741 if (pass <= 2 && pages_skipped) 1742 scan_again = 1; 1743 if (pass == last_pass || did_something) 1744 scan_again = 1; 1745 else if (shared_skipped && shared_level < (8<<24)) { 1746 shared_level <<= 1; 1747 scan_again = 1; 1748 } 1749 } 1750 1751 if (scan_again && ncpus_online > 1) 1752 goto again; 1753 else { 1754 if (shared_level > 8) 1755 shared_level >>= 1; 1756 1757 KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem); 1758 KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem); 1759 KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start); 1760 KCAGE_STAT_INC_SCAN_INDEX; 1761 goto loop; 1762 } 1763 1764 /*NOTREACHED*/ 1765 } 1766 1767 void 1768 kcage_cageout_wakeup() 1769 { 1770 if (mutex_tryenter(&kcage_cageout_mutex)) { 1771 if (kcage_cageout_ready) { 1772 cv_signal(&kcage_cageout_cv); 1773 } else if (kcage_freemem < kcage_minfree || kcage_needfree) { 1774 /* 1775 * Available cage memory is really low. Time to 1776 * start expanding the cage. However, the 1777 * kernel cage thread is not yet ready to 1778 * do the work. Use *this* thread, which is 1779 * most likely to be t0, to do the work. 1780 */ 1781 KCAGE_STAT_INCR(kcw_expandearly); 1782 (void) kcage_expand(); 1783 KCAGE_STAT_INC_SCAN_INDEX; 1784 } 1785 1786 mutex_exit(&kcage_cageout_mutex); 1787 } 1788 /* else, kernel cage thread is already running */ 1789 } 1790 1791 void 1792 kcage_tick() 1793 { 1794 /* 1795 * Once per second we wake up all the threads throttled 1796 * waiting for cage memory, in case we've become stuck 1797 * and haven't made forward progress expanding the cage. 1798 */ 1799 if (kcage_on && kcage_cageout_ready) 1800 cv_broadcast(&kcage_throttle_cv); 1801 } 1802