/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/callb.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/systm.h>		/* for bzero */
#include <sys/memlist.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>	/* for NOMEMWAIT() */
#include <sys/atomic.h>		/* used to update kcage_freemem */
#include <sys/kmem.h>		/* for kmem_reap */
#include <sys/errno.h>
#include <sys/mem_cage.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/hat.h>
#include <vm/vm_dep.h>
#include <sys/mem_config.h>
#include <sys/lgrp.h>

extern pri_t maxclsyspri;

#ifdef DEBUG
#define	KCAGE_STATS
#endif

#ifdef KCAGE_STATS

#define	KCAGE_STATS_VERSION	9	/* can help report generators */
#define	KCAGE_STATS_NSCANS	256	/* depth of scan statistics buffer */

struct kcage_stats_scan {
	/* managed by KCAGE_STAT_* macros */
	clock_t	scan_lbolt;
	uint_t	scan_id;

	/* set in kcage_cageout() */
	uint_t	kt_passes;
	clock_t	kt_ticks;
	pgcnt_t	kt_kcage_freemem_start;
	pgcnt_t	kt_kcage_freemem_end;
	pgcnt_t	kt_freemem_start;
	pgcnt_t	kt_freemem_end;
	uint_t	kt_examined;
	uint_t	kt_cantlock;
	uint_t	kt_gotone;
	uint_t	kt_gotonefree;
	uint_t	kt_skiplevel;
	uint_t	kt_skipshared;
	uint_t	kt_skiprefd;
	uint_t	kt_destroy;

	/* set in kcage_invalidate_page() */
	uint_t	kip_reloclocked;
	uint_t	kip_relocmod;
	uint_t	kip_destroy;
	uint_t	kip_nomem;
	uint_t	kip_demotefailed;

	/* set in kcage_expand() */
	uint_t	ke_wanted;
	uint_t	ke_examined;
	uint_t	ke_lefthole;
	uint_t	ke_gotone;
	uint_t	ke_gotonefree;
};

struct kcage_stats {
	/* managed by KCAGE_STAT_* macros */
	uint_t	version;
	uint_t	size;

	/* set in kcage_cageout */
	uint_t	kt_wakeups;
	uint_t	kt_scans;
	uint_t	kt_cageout_break;

	/* set in kcage_expand */
	uint_t	ke_calls;
	uint_t	ke_nopfn;
	uint_t	ke_nopaget;
	uint_t	ke_isnoreloc;
	uint_t	ke_deleting;
	uint_t	ke_lowfreemem;
	uint_t	ke_terminate;

	/* set in kcage_freemem_add() */
	uint_t	kfa_trottlewake;

	/* set in kcage_freemem_sub() */
	uint_t	kfs_cagewake;

	/* set in kcage_create_throttle */
	uint_t	kct_calls;
	uint_t	kct_cageout;
	uint_t	kct_critical;
	uint_t	kct_exempt;
	uint_t	kct_cagewake;
	uint_t	kct_wait;
	uint_t	kct_progress;
	uint_t	kct_noprogress;
	uint_t	kct_timeout;

	/* set in kcage_cageout_wakeup */
	uint_t	kcw_expandearly;

	/* managed by KCAGE_STAT_* macros */
	uint_t	scan_array_size;
	uint_t	scan_index;
	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
};

static struct kcage_stats kcage_stats;
static struct kcage_stats_scan kcage_stats_scan_zero;

/*
 * No real need for atomics here. For the most part the incs and sets are
 * done by the kernel cage thread. There are a few that are done by any
 * number of other threads. Those cases are noted by comments.
 */
#define	KCAGE_STAT_INCR(m)	kcage_stats.m++

#define	KCAGE_STAT_NINCR(m, v)	kcage_stats.m += (v)

#define	KCAGE_STAT_INCR_SCAN(m)	\
	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)

#define	KCAGE_STAT_NINCR_SCAN(m, v) \
	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SET(m, v)	kcage_stats.m = (v)

#define	KCAGE_STAT_SETZ(m, v)	\
	if (kcage_stats.m == 0) kcage_stats.m = (v)

#define	KCAGE_STAT_SET_SCAN(m, v)	\
	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SETZ_SCAN(m, v) \
	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_INC_SCAN_INDEX \
	KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \
	KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
	kcage_stats.scan_index = \
	(kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
	kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero

#define	KCAGE_STAT_INIT_SCAN_INDEX \
	kcage_stats.version = KCAGE_STATS_VERSION; \
	kcage_stats.size = sizeof (kcage_stats); \
	kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
	kcage_stats.scan_index = 0

#else /* KCAGE_STATS */

#define	KCAGE_STAT_INCR(v)
#define	KCAGE_STAT_NINCR(m, v)
#define	KCAGE_STAT_INCR_SCAN(v)
#define	KCAGE_STAT_NINCR_SCAN(m, v)
#define	KCAGE_STAT_SET(m, v)
#define	KCAGE_STAT_SETZ(m, v)
#define	KCAGE_STAT_SET_SCAN(m, v)
#define	KCAGE_STAT_SETZ_SCAN(m, v)
#define	KCAGE_STAT_INC_SCAN_INDEX
#define	KCAGE_STAT_INIT_SCAN_INDEX

#endif /* KCAGE_STATS */

static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
static kcondvar_t kcage_throttle_cv;

static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
kthread_id_t kcage_cageout_thread;	/* to aid debugging */

static kmutex_t kcage_range_mutex;	/* protects kcage_glist elements */

/*
 * Cage expansion happens within a range.
 */
struct kcage_glist {
	struct kcage_glist *next;
	pfn_t	base;
	pfn_t	lim;
	pfn_t	curr;
	int	decr;
};

static struct kcage_glist *kcage_glist;
static struct kcage_glist *kcage_current_glist;

/*
 * The firstfree element is provided so that kmem_alloc can be avoided
 * until that cage has somewhere to go. This is not currently a problem
 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
 */
static struct kcage_glist kcage_glist_firstfree;
static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;

/*
 * Miscellaneous forward references
 */
static struct kcage_glist *kcage_glist_alloc(void);
static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
static void kcage_cageout(void);
static int kcage_invalidate_page(page_t *, pgcnt_t *);
static int kcage_setnoreloc_pages(page_t *, se_t);

/*
 * Kernel Memory Cage counters and thresholds.
 */
int kcage_on = 0;
pgcnt_t kcage_freemem;
pgcnt_t kcage_needfree;
pgcnt_t kcage_lotsfree;
pgcnt_t kcage_desfree;
pgcnt_t kcage_minfree;
pgcnt_t kcage_throttlefree;
int kcage_maxwait = 10;	/* in seconds */

/* when we use lp for kmem we start the cage at a higher initial value */
pgcnt_t kcage_kmemlp_mincage;

#ifdef DEBUG
pgcnt_t	kcage_pagets;
#define	KCAGEPAGETS_INC()	kcage_pagets++
#else
#define	KCAGEPAGETS_INC()
#endif

/*
 * Startup and Dynamic Reconfiguration interfaces.
 * kcage_range_lock()
 * kcage_range_unlock()
 * kcage_range_islocked()
 * kcage_range_add()
 * kcage_range_del()
 * kcage_init()
 * kcage_set_thresholds()
 */

int
kcage_range_trylock(void)
{
	return (mutex_tryenter(&kcage_range_mutex));
}

void
kcage_range_lock(void)
{
	mutex_enter(&kcage_range_mutex);
}

void
kcage_range_unlock(void)
{
	mutex_exit(&kcage_range_mutex);
}

int
kcage_range_islocked(void)
{
	return (MUTEX_HELD(&kcage_range_mutex));
}

/*
 * Called from page_get_contig_pages to get the approximate kcage pfn range
 * for exclusion from search for contiguous pages. This routine is called
 * without kcage_range lock (kcage routines can call page_get_contig_pages
 * through page_relocate) and with the assumption, based on kcage_range_add,
 * that kcage_current_glist always contains a valid pointer.
 */

int
kcage_current_pfn(pfn_t *pfncur)
{
	struct kcage_glist *lp = kcage_current_glist;

	ASSERT(kcage_on);

	ASSERT(lp != NULL);

	*pfncur = lp->curr;

	return (lp->decr);
}

int
kcage_range_init(struct memlist *ml, int decr)
{
	int ret = 0;

	ASSERT(kcage_range_islocked());

	if (decr) {
		while (ml->next != NULL)
			ml = ml->next;
	}

	while (ml != NULL) {
		ret = kcage_range_add(btop(ml->address), btop(ml->size), decr);
		if (ret)
			break;

		ml = (decr ? ml->prev : ml->next);
	}

	return (ret);
}

/*
 * Third arg controls direction of growth: 0: increasing pfns,
 * 1: decreasing.
 * Calls to add and delete must be protected by calls to
 * kcage_range_lock() and kcage_range_unlock().
 */
int
kcage_range_add(pfn_t base, pgcnt_t npgs, int decr)
{
	struct kcage_glist *new, **lpp;
	pfn_t lim;

	ASSERT(kcage_range_islocked());

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);
	if (lim <= base)
		return (EINVAL);

	new = kcage_glist_alloc();
	if (new == NULL) {
		return (ENOMEM);
	}

	new->base = base;
	new->lim = lim;
	new->decr = decr;
	if (new->decr != 0)
		new->curr = new->lim;
	else
		new->curr = new->base;
	/*
	 * Any overlapping existing ranges are removed by deleting
	 * from the new list as we search for the tail.
	 */
	lpp = &kcage_glist;
	while (*lpp != NULL) {
		int ret;
		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
		if (ret != 0)
			return (ret);
		lpp = &(*lpp)->next;
	}

	*lpp = new;

	if (kcage_current_glist == NULL) {
		kcage_current_glist = kcage_glist;
	}

	return (0);
}

/*
 * Calls to add and delete must be protected by calls to
 * kcage_range_lock() and kcage_range_unlock().
 */
int
kcage_range_delete(pfn_t base, pgcnt_t npgs)
{
	struct kcage_glist *lp;
	pfn_t lim;

	ASSERT(kcage_range_islocked());

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);
	if (lim <= base)
		return (EINVAL);

	/*
	 * Check if the delete is OK first as a number of elements
	 * might be involved and it will be difficult to go
	 * back and undo (can't just add the range back in).
	 */
	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
		/*
		 * If there have been no pages allocated from this
		 * element, we don't need to check it.
		 */
		if ((lp->decr == 0 && lp->curr == lp->base) ||
		    (lp->decr != 0 && lp->curr == lp->lim))
			continue;
		/*
		 * If the element does not overlap, it's OK.
		 */
		if (base >= lp->lim || lim <= lp->base)
			continue;
		/*
		 * Overlapping element: Does the range to be deleted
		 * overlap the area already used? If so, fail.
		 */
		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
			return (EBUSY);
		}
		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
			return (EBUSY);
		}
	}
	return (kcage_glist_delete(base, lim, &kcage_glist));
}

/*
 * Calls to add and delete must be protected by calls to
 * kcage_range_lock() and kcage_range_unlock().
 * This routine gets called after a successful Solaris memory
 * delete operation from DR post memory delete routines.
 */
int
kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
{
	pfn_t lim;

	ASSERT(kcage_range_islocked());

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);
	if (lim <= base)
		return (EINVAL);

	return (kcage_glist_delete(base, lim, &kcage_glist));
}

/*
 * No locking is required here as the whole operation is covered
 * by the kcage_range_lock().
 */
static struct kcage_glist *
kcage_glist_alloc(void)
{
	struct kcage_glist *new;

	if ((new = kcage_glist_freelist) != NULL) {
		kcage_glist_freelist = new->next;
		bzero(new, sizeof (*new));
	} else {
		new = kmem_zalloc(sizeof (struct kcage_glist), KM_NOSLEEP);
	}
	return (new);
}

static void
kcage_glist_free(struct kcage_glist *lp)
{
	lp->next = kcage_glist_freelist;
	kcage_glist_freelist = lp;
}

static int
kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
{
	struct kcage_glist *lp, *prev = *lpp;

	while ((lp = *lpp) != NULL) {
		if (lim > lp->base && base < lp->lim) {
			/* The delete range overlaps this element. */
			if (base <= lp->base && lim >= lp->lim) {
				/* Delete whole element. */
				*lpp = lp->next;
				if (lp == kcage_current_glist) {
					/* This can never happen. */
					ASSERT(kcage_current_glist != prev);
					kcage_current_glist = prev;
				}
				kcage_glist_free(lp);
				continue;
			}

			/* Partial delete. */
			if (base > lp->base && lim < lp->lim) {
				struct kcage_glist *new;

				/*
				 * Remove a section from the middle,
				 * need to allocate a new element.
				 */
				new = kcage_glist_alloc();
				if (new == NULL) {
					return (ENOMEM);
				}

				/*
				 * Transfer unused range to new.
				 * Edit lp in place to preserve
				 * kcage_current_glist.
				 */
				new->decr = lp->decr;
				if (new->decr != 0) {
					new->base = lp->base;
					new->lim = base;
					new->curr = base;

					lp->base = lim;
				} else {
					new->base = lim;
					new->lim = lp->lim;
					new->curr = new->base;

					lp->lim = base;
				}

				/* Insert new. */
				new->next = lp->next;
				lp->next = new;
				lpp = &lp->next;
			} else {
				/* Delete part of current block. */
				if (base > lp->base) {
					ASSERT(lim >= lp->lim);
					ASSERT(base < lp->lim);
					if (lp->decr != 0 &&
					    lp->curr == lp->lim)
						lp->curr = base;
					lp->lim = base;
				} else {
					ASSERT(base <= lp->base);
					ASSERT(lim > lp->base);
					if (lp->decr == 0 &&
					    lp->curr == lp->base)
						lp->curr = lim;
					lp->base = lim;
				}
			}
		}
		prev = *lpp;
		lpp = &(*lpp)->next;
	}

	return (0);
}

/*
 * The caller of kcage_get_pfn must hold the kcage_range_lock to make
 * sure that there are no concurrent calls. The same lock
 * must be obtained for range add and delete by calling
 * kcage_range_lock() and kcage_range_unlock().
 */
static pfn_t
kcage_get_pfn(void)
{
	struct kcage_glist *lp;
	pfn_t pfn;

	ASSERT(kcage_range_islocked());

	lp = kcage_current_glist;
	while (lp != NULL) {
		if (lp->decr != 0) {
			if (lp->curr != lp->base) {
				pfn = --lp->curr;
				return (pfn);
			}
		} else {
			if (lp->curr != lp->lim) {
				pfn = lp->curr++;
				return (pfn);
			}
		}

		lp = lp->next;
		if (lp)
			kcage_current_glist = lp;
	}

	return (PFN_INVALID);
}

/*
 * Walk the physical address space of the cage.
 * This routine does not guarantee to return PFNs in the order
 * in which they were allocated to the cage. Instead, it walks
 * each range as it appears on the growth list, returning the PFNs
 * of each range in ascending order.
 *
 * To begin scanning at lower edge of cage, reset should be nonzero.
 * To step through cage, reset should be zero.
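 * (Typical use: call once with reset nonzero, then repeatedly with
 * reset zero until PFN_INVALID is returned.)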
 *
 * PFN_INVALID will be returned when the upper end of the cage is
 * reached -- indicating a full scan of the cage has been completed since
 * previous reset. PFN_INVALID will continue to be returned until
 * kcage_walk_cage is reset.
 *
 * It is possible to receive a PFN_INVALID result on reset if a growth
 * list is not installed or if none of the PFNs in the installed list have
 * been allocated to the cage. In other words, there is no cage.
 *
 * Caller need not hold kcage_range_lock while calling this function
 * as the front part of the list is static - pages never come out of
 * the cage.
 *
 * The caller is expected to only be kcage_cageout().
 */
static pfn_t
kcage_walk_cage(int reset)
{
	static struct kcage_glist *lp = NULL;
	static pfn_t pfn;

	if (reset)
		lp = NULL;
	if (lp == NULL) {
		lp = kcage_glist;
		pfn = PFN_INVALID;
	}
again:
	if (pfn == PFN_INVALID) {
		if (lp == NULL)
			return (PFN_INVALID);

		if (lp->decr != 0) {
			/*
			 * In this range the cage grows from the highest
			 * address towards the lowest.
			 * Arrange to return pfns from curr to lim-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->curr;
		} else {
			/*
			 * In this range the cage grows from the lowest
			 * address towards the highest.
			 * Arrange to return pfns from base to curr,
			 * inclusive, in ascending order.
			 */

			pfn = lp->base;
		}
	}

	if (lp->decr != 0) {		/* decrementing pfn */
		if (pfn == lp->lim) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->curr && pfn < lp->lim);
	} else {			/* incrementing pfn */
		if (pfn == lp->curr) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->base && pfn < lp->curr);
	}

	return (pfn++);
}

/*
 * Callback functions to recalc cage thresholds after
 * Kphysm memory add/delete operations.
 */
/*ARGSUSED*/
static void
kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
{
	kcage_recalc_thresholds();
}

/*ARGSUSED*/
static int
kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
{
	/* TODO: when should cage refuse memory delete requests? */
	return (0);
}

/*ARGSUSED*/
static void
kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
{
	kcage_recalc_thresholds();
}

static kphysm_setup_vector_t kcage_kphysm_vectors = {
	KPHYSM_SETUP_VECTOR_VERSION,
	kcage_kphysm_postadd_cb,
	kcage_kphysm_predel_cb,
	kcage_kphysm_postdel_cb
};

/*
 * This is called before a CPR suspend and after a CPR resume. We have to
 * turn off kcage_cageout_ready before a suspend, and turn it back on after a
 * restart.
 */
/*ARGSUSED*/
static boolean_t
kcage_cageout_cpr(void *arg, int code)
{
	if (code == CB_CODE_CPR_CHKPT) {
		ASSERT(kcage_cageout_ready);
		kcage_cageout_ready = 0;
		return (B_TRUE);
	} else if (code == CB_CODE_CPR_RESUME) {
		ASSERT(kcage_cageout_ready == 0);
		kcage_cageout_ready = 1;
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * kcage_recalc_preferred_size() increases initial cage size to improve large
 * page availability when lp for kmem is enabled and kpr is disabled
 */
static pgcnt_t
kcage_recalc_preferred_size(pgcnt_t preferred_size)
{
	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
		pgcnt_t lpmincage = kcage_kmemlp_mincage;
		if (lpmincage == 0) {
			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
		}
		kcage_kmemlp_mincage = MIN(lpmincage,
		    (segkmem_kmemlp_max / PAGESIZE));
		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
	}
	return (preferred_size);
}

/*
 * Kcage_init() builds the cage and initializes the cage thresholds.
 * The size of the cage is determined by the argument preferred_size
 * or the actual amount of memory, whichever is smaller.
 */
void
kcage_init(pgcnt_t preferred_size)
{
	pgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	extern struct vnode kvp;
	extern void page_list_noreloc_startup(page_t *);

	ASSERT(!kcage_on);
	ASSERT(kcage_range_islocked());

	/* increase preferred cage size for lp for kmem */
	preferred_size = kcage_recalc_preferred_size(preferred_size);

	/* Debug note: initialize this now so early expansions can stat */
	KCAGE_STAT_INIT_SCAN_INDEX;

	/*
	 * Initialize cage thresholds and install kphysm callback.
	 * If we can't arrange to have the thresholds track with
	 * available physical memory, then the cage thresholds may
	 * end up over time at levels that adversely affect system
	 * performance; so, bail out.
	 */
	kcage_recalc_thresholds();
	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
		ASSERT(0);		/* Catch this in DEBUG kernels. */
		return;
	}

	/*
	 * Limit startup cage size within the range of kcage_minfree
	 * and availrmem, inclusively.
	 */
	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);

	/*
	 * Construct the cage. PFNs are allocated from the glist. It
	 * is assumed that the list has been properly ordered for the
	 * platform by the platform code. Typically, this is as simple
	 * as calling kcage_range_init(phys_avail, decr), where decr is
	 * 1 if the kernel has been loaded into upper end of physical
	 * memory, or 0 if the kernel has been loaded at the low end.
	 *
	 * Note: it is assumed that we are in the startup flow, so there
	 * is no reason to grab the page lock.
	 */
	kcage_freemem = 0;
	pfn = PFN_INVALID;		/* prime for alignment test */
	while (wanted != 0) {
		if ((pfn = kcage_get_pfn()) == PFN_INVALID)
			break;

		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
			KCAGEPAGETS_INC();
			/*
			 * Set the noreloc state on the page.
			 * If the page is free and not already
			 * on the noreloc list then move it.
			 */
			if (PP_ISFREE(pp)) {
				if (PP_ISNORELOC(pp) == 0)
					page_list_noreloc_startup(pp);
			} else {
				ASSERT(pp->p_szc == 0);
				PP_SETNORELOC(pp);
			}
		}

		wanted -= 1;
	}

	/*
	 * Need to go through and find kernel allocated pages
	 * and capture them into the Cage. These will primarily
	 * be pages gotten through boot_alloc().
	 */
	if (kvp.v_pages) {

		pp = kvp.v_pages;
		do {
			ASSERT(!PP_ISFREE(pp));
			ASSERT(pp->p_szc == 0);
			PP_SETNORELOC(pp);
		} while ((pp = pp->p_vpnext) != kvp.v_pages);

	}

	kcage_on = 1;

	/*
	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
	 * after the cageout thread is blocked, and executes from cpr_resume()
	 * before the cageout thread is restarted. By executing in this class,
	 * we are assured that the kernel cage thread won't miss wakeup calls
	 * and also CPR's larger kmem_alloc requests will not fail after
	 * CPR shuts down the cageout kernel thread.
	 */
	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
	    "cageout");

	/*
	 * Coalesce pages to improve large page availability. A better fix
	 * would be to coalesce pages as they are included in the cage
	 */
	if (SEGKMEM_USE_LARGEPAGES) {
		extern void page_freelist_coalesce_all(int mnode);
		extern int max_mem_nodes;
		int mnode, max_mnodes = max_mem_nodes;
		for (mnode = 0; mnode < max_mnodes; mnode++) {
			page_freelist_coalesce_all(mnode);
		}
	}
}

void
kcage_recalc_thresholds()
{
	static int first = 1;
	static pgcnt_t init_lotsfree;
	static pgcnt_t init_desfree;
	static pgcnt_t init_minfree;
	static pgcnt_t init_throttlefree;

	/* TODO: any reason to take more care than this with live editing? */
	mutex_enter(&kcage_cageout_mutex);
	mutex_enter(&freemem_lock);

	if (first) {
		first = 0;
		init_lotsfree = kcage_lotsfree;
		init_desfree = kcage_desfree;
		init_minfree = kcage_minfree;
		init_throttlefree = kcage_throttlefree;
	} else {
		kcage_lotsfree = init_lotsfree;
		kcage_desfree = init_desfree;
		kcage_minfree = init_minfree;
		kcage_throttlefree = init_throttlefree;
	}

	if (kcage_lotsfree == 0)
		kcage_lotsfree = MAX(32, total_pages / 256);

	if (kcage_minfree == 0)
		kcage_minfree = MAX(32, kcage_lotsfree / 2);

	if (kcage_desfree == 0)
		kcage_desfree = MAX(32, kcage_minfree);

	if (kcage_throttlefree == 0)
		kcage_throttlefree = MAX(32, kcage_minfree / 2);

	mutex_exit(&freemem_lock);
	mutex_exit(&kcage_cageout_mutex);

	if (kcage_cageout_ready) {
		if (kcage_freemem < kcage_desfree)
			kcage_cageout_wakeup();

		if (kcage_needfree) {
			mutex_enter(&kcage_throttle_mutex);
			cv_broadcast(&kcage_throttle_cv);
			mutex_exit(&kcage_throttle_mutex);
		}
	}
}

/*
 * Pageout interface:
 * kcage_cageout_init()
 */
void
kcage_cageout_init()
{
	if (kcage_on) {
		mutex_enter(&kcage_cageout_mutex);

		kcage_cageout_thread = thread_create(NULL, 0, kcage_cageout,
		    NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);

		mutex_exit(&kcage_cageout_mutex);
	}
}


/*
 * VM Interfaces:
 * kcage_create_throttle()
 * kcage_freemem_add()
 * kcage_freemem_sub()
 */

/*
 * Wakeup cageout thread and throttle waiting for the number of pages
 * requested to become available.
 * For non-critical requests, a timeout is added, since freemem
 * accounting is separate from cage freemem accounting: it's possible
 * for us to get stuck and not make forward progress even though
 * there was sufficient freemem before arriving here.
 */
int
kcage_create_throttle(pgcnt_t npages, int flags)
{
	int niter = 0;
	pgcnt_t lastfree;
	int enough = kcage_freemem > kcage_throttlefree + npages;

	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */

	kcage_cageout_wakeup();			/* just to be sure */
	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */

	/*
	 * Obviously, we can't throttle the cageout thread since
	 * we depend on it.  We also can't throttle the panic thread.
	 */
	if (curthread == kcage_cageout_thread || panicstr) {
		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Don't throttle threads which are critical for proper
	 * vm management if we're above kcage_throttlefree or
	 * if freemem is very low.
	 */
	if (NOMEMWAIT()) {
		if (enough) {
			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
			return (KCT_CRIT);
		} else if (freemem < minfree) {
			KCAGE_STAT_INCR(kct_critical);	/* unprotected incr. */
			return (KCT_CRIT);
		}
	}

	/*
	 * Don't throttle real-time threads.
	 */
	if (DISP_PRIO(curthread) > maxclsyspri) {
		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Cause all other threads (which are assumed to not be
	 * critical to cageout) to wait here until their request
	 * can be satisfied. Be a little paranoid and wake the
	 * kernel cage on each loop through this logic.
	 */
	while (kcage_freemem < kcage_throttlefree + npages) {
		ASSERT(kcage_on);

		lastfree = kcage_freemem;

		if (kcage_cageout_ready) {
			mutex_enter(&kcage_throttle_mutex);

			kcage_needfree += npages;
			KCAGE_STAT_INCR(kct_wait);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);

			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);

			kcage_needfree -= npages;

			mutex_exit(&kcage_throttle_mutex);
		} else {
			/*
			 * NOTE: atomics are used just in case we enter
			 * mp operation before the cageout thread is ready.
			 */
			atomic_add_long(&kcage_needfree, npages);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */
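
			/*
			 * The transient bump to kcage_needfree above lets
			 * kcage_cageout_wakeup()'s early-expansion path size
			 * its request; it is backed out below since there is
			 * no cv handshake yet to clear it for us.
			 */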

			atomic_add_long(&kcage_needfree, -npages);
		}

		if ((flags & PG_WAIT) == 0) {
			if (kcage_freemem > lastfree) {
				KCAGE_STAT_INCR(kct_progress);
				niter = 0;
			} else {
				KCAGE_STAT_INCR(kct_noprogress);
				if (++niter >= kcage_maxwait) {
					KCAGE_STAT_INCR(kct_timeout);
					return (KCT_FAILURE);
				}
			}
		}
	}
	return (KCT_NONCRIT);
}

void
kcage_freemem_add(pgcnt_t npages)
{
	extern void wakeup_pcgs(void);

	atomic_add_long(&kcage_freemem, npages);

	wakeup_pcgs();	/* wakeup threads in pcgs() */

	if (kcage_needfree != 0 &&
	    kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {

		mutex_enter(&kcage_throttle_mutex);
		cv_broadcast(&kcage_throttle_cv);
		KCAGE_STAT_INCR(kfa_trottlewake);
		mutex_exit(&kcage_throttle_mutex);
	}
}

void
kcage_freemem_sub(pgcnt_t npages)
{
	atomic_add_long(&kcage_freemem, -npages);

	if (kcage_freemem < kcage_desfree) {
		kcage_cageout_wakeup();
		KCAGE_STAT_INCR(kfs_cagewake);	/* unprotected incr. */
	}
}

/*
 * return 0 on failure and 1 on success.
 */
static int
kcage_setnoreloc_pages(page_t *rootpp, se_t se)
{
	pgcnt_t npgs, i;
	page_t *pp;
	pfn_t rootpfn = page_pptonum(rootpp);
	uint_t szc;

	ASSERT(!PP_ISFREE(rootpp));
	ASSERT(PAGE_LOCKED_SE(rootpp, se));
	if (!group_page_trylock(rootpp, se)) {
		return (0);
	}
	szc = rootpp->p_szc;
	if (szc == 0) {
		/*
		 * The szc of a locked page can only change for pages that are
		 * non-swapfs (i.e. anonymous memory) file system pages.
		 */
		ASSERT(rootpp->p_vnode != NULL &&
		    rootpp->p_vnode != &kvp &&
		    !IS_SWAPFSVP(rootpp->p_vnode));
		PP_SETNORELOC(rootpp);
		return (1);
	}
	npgs = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
	pp = rootpp;
	for (i = 0; i < npgs; i++, pp++) {
		ASSERT(PAGE_LOCKED_SE(pp, se));
		ASSERT(!PP_ISFREE(pp));
		ASSERT(pp->p_szc == szc);
		PP_SETNORELOC(pp);
	}
	group_page_unlock(rootpp);
	return (1);
}

/*
 * Attempt to convert page to a caged page (set the P_NORELOC flag).
 * If successful and the page is free, move page to the tail of whichever
 * list it is on.
 * Returns:
 *   EBUSY  page already locked, assimilated but not free.
 *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
 *   EAGAIN page not assimilated. Page not free.
 *   ERANGE page assimilated. Page not root.
 *   0      page assimilated. Page free.
 * *nfreedp number of pages freed.
 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
 * to distinguish between a page that was already a NORELOC page from
 * those newly converted to NORELOC pages by this invocation of
 * kcage_assimilate_page.
 */
static int
kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
{
	if (page_trylock(pp, SE_EXCL)) {
		if (PP_ISNORELOC(pp)) {
check_free_and_return:
			if (PP_ISFREE(pp)) {
				page_unlock(pp);
				*nfreedp = 0;
				return (0);
			} else {
				page_unlock(pp);
				return (EBUSY);
			}
			/*NOTREACHED*/
		}
	} else {
		if (page_trylock(pp, SE_SHARED)) {
			if (PP_ISNORELOC(pp))
				goto check_free_and_return;
		} else
			return (EAGAIN);

		if (!PP_ISFREE(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}

		/*
		 * Need to upgrade the lock on it and set the NORELOC
		 * bit. If it is free then remove it from the free
		 * list so that the platform free list code can keep
		 * NORELOC pages where they should be.
		 */
		/*
		 * Before doing anything, get the exclusive lock.
		 * This may fail (eg ISM pages are left shared locked).
		 * If the page is free this will leave a hole in the
		 * cage. There is no solution yet to this.
		 */
		if (!page_tryupgrade(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}
	}

	ASSERT(PAGE_EXCL(pp));

	if (PP_ISFREE(pp)) {
		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;

		page_list_sub(pp, which | PG_LIST_ISCAGE);
		ASSERT(pp->p_szc == 0);
		PP_SETNORELOC(pp);
		page_list_add(pp, which | PG_LIST_TAIL | PG_LIST_ISCAGE);

		page_unlock(pp);
		*nfreedp = 1;
		return (0);
	} else {
		if (pp->p_szc != 0) {
			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
				page_unlock(pp);
				return (EAGAIN);
			}
			ASSERT(PP_ISNORELOC(pp));
		} else {
			PP_SETNORELOC(pp);
		}
		page_list_xfer(pp, MTYPE_NORELOC, MTYPE_RELOC);
		return (kcage_invalidate_page(pp, nfreedp));
	}
	/*NOTREACHED*/
}

static int
kcage_expand()
{
	int did_something = 0;

	spgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	/* TODO: we don't really need n any more? */
	pgcnt_t n;
	pgcnt_t nf, nfreed;

	/*
	 * Expand the cage if available cage memory is really low. Calculate
	 * the amount required to return kcage_freemem to the level of
	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
	 * more. It is rare for their sum to create an artificial threshold
	 * above kcage_lotsfree, but it is possible.
	 *
	 * Exit early if expansion amount is equal to or less than zero.
	 * (<0 is possible if kcage_freemem rises suddenly.)
	 *
	 * Exit early when the global page pool (apparently) does not
	 * have enough free pages to page_relocate() even a single page.
	 */
	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
	    - kcage_freemem;
	if (wanted <= 0)
		return (0);
	else if (freemem < pageout_reserve + 1) {
		KCAGE_STAT_INCR(ke_lowfreemem);
		return (0);
	}

	/*
	 * Try to get the range list lock. If the lock is already
	 * held, then don't get stuck here waiting for it.
	 */
	if (!kcage_range_trylock())
		return (0);

	KCAGE_STAT_INCR(ke_calls);
	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);

	/*
	 * Assimilate more pages from the global page pool into the cage.
	 */
	n = 0;				/* number of pages PP_SETNORELOC'd */
	nf = 0;				/* number of those actually free */
	while (kcage_on && nf < wanted) {
		pfn = kcage_get_pfn();
		if (pfn == PFN_INVALID) {	/* eek! nowhere to grow */
			KCAGE_STAT_INCR(ke_nopfn);
			goto terminate;
		}

		KCAGE_STAT_INCR_SCAN(ke_examined);

		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
			KCAGE_STAT_INCR(ke_nopaget);
			continue;
		}
		KCAGEPAGETS_INC();
		/*
		 * Sanity check. Skip this pfn if it is
		 * being deleted.
		 */
		if (pfn_is_being_deleted(pfn)) {
			KCAGE_STAT_INCR(ke_deleting);
			continue;
		}

		/*
		 * NORELOC is only set at boot-time or by this routine
		 * under the kcage_range_mutex lock which is currently
		 * held. This means we can do a fast check here before
		 * locking the page in kcage_assimilate_page.
		 */
		if (PP_ISNORELOC(pp)) {
			KCAGE_STAT_INCR(ke_isnoreloc);
			continue;
		}

		switch (kcage_assimilate_page(pp, &nfreed)) {
		case 0:		/* assimilated, page is free */
			KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
			did_something = 1;
			nf += nfreed;
			n++;
			break;

		case EBUSY:	/* assimilated, page not free */
		case ERANGE:	/* assimilated, page not root */
			KCAGE_STAT_INCR_SCAN(ke_gotone);
			did_something = 1;
			n++;
			break;

		case ENOMEM:	/* assimilated, but no mem */
			KCAGE_STAT_INCR(ke_terminate);
			did_something = 1;
			n++;
			goto terminate;

		case EAGAIN:	/* can't assimilate */
			KCAGE_STAT_INCR_SCAN(ke_lefthole);
			break;

		default:	/* catch this with debug kernels */
			ASSERT(0);
			break;
		}
	}

	/*
	 * Realign cage edge with the nearest physical address
	 * boundary for big pages. This is done to give us a
	 * better chance of actually getting usable big pages
	 * in the cage.
	 */

terminate:
	kcage_range_unlock();

	return (did_something);
}

/*
 * Relocate page opp (Original Page Pointer) from cage pool to page rpp
 * (Replacement Page Pointer) in the global pool. Page opp will be freed
 * if relocation is successful, otherwise it is only unlocked.
 * On entry, page opp must be exclusively locked and not free.
 * *nfreedp: number of pages freed.
 */
static int
kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
{
	page_t *opp = pp;
	page_t *rpp = NULL;
	spgcnt_t npgs;
	int result;

	ASSERT(!PP_ISFREE(opp));
	ASSERT(PAGE_EXCL(opp));

	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
	*nfreedp = npgs;
	if (result == 0) {
		while (npgs-- > 0) {
			page_t *tpp;

			ASSERT(rpp != NULL);
			tpp = rpp;
			page_sub(&rpp, tpp);
			page_unlock(tpp);
		}

		ASSERT(rpp == NULL);

		return (0);		/* success */
	}

	page_unlock(opp);
	return (result);
}

/*
 * Based on page_invalidate_pages()
 *
 * Kcage_invalidate_page() uses page_relocate() twice. Both instances
 * of use must be updated to match the new page_relocate() when it
 * becomes available.
 *
 * Return result of kcage_relocate_page or zero if page was directly freed.
 * *nfreedp: number of pages freed.
 */
static int
kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
{
	int result;

#if defined(__sparc)
	extern struct vnode prom_ppages;
	ASSERT(pp->p_vnode != &prom_ppages);
#endif /* __sparc */

	ASSERT(!PP_ISFREE(pp));
	ASSERT(PAGE_EXCL(pp));

	/*
	 * Is this page involved in some I/O? shared?
	 * The page_struct_lock need not be acquired to
	 * examine these fields since the page has an
	 * "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
		if (result == 0)
			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
		else if (result == ENOMEM)
			KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
		return (result);
	}

	ASSERT(pp->p_vnode->v_type != VCHR);

	/*
	 * Unload the mappings and check if mod bit is set.
	 */
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);

	if (hat_ismod(pp)) {
		result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
		if (result == 0)
			KCAGE_STAT_INCR_SCAN(kip_relocmod);
		else if (result == ENOMEM)
			KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
		return (result);
	}

	if (!page_try_demote_pages(pp)) {
		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
		page_unlock(pp);
		return (EAGAIN);
	}

	page_destroy(pp, 0);
	KCAGE_STAT_INCR_SCAN(kip_destroy);
	*nfreedp = 1;
	return (0);
}

static void
kcage_cageout()
{
	pfn_t pfn;
	page_t *pp;
	callb_cpr_t cprinfo;
	int did_something;
	int scan_again;
	pfn_t start_pfn;
	int pass;
	int last_pass;
	int pages_skipped;
	int shared_skipped;
	uint_t shared_level = 8;
	pgcnt_t nfreed;
#ifdef KCAGE_STATS
	clock_t scan_start;
#endif

	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
	    callb_generic_cpr, "cageout");

	mutex_enter(&kcage_cageout_mutex);

	pfn = PFN_INVALID;		/* force scan reset */
	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */

loop:
	/*
	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
	 * that kcage_freemem is less than kcage_desfree. When it does
	 * notice, kcage_freemem_sub() will wake us up via call to
	 * kcage_cageout_wakeup().
	 */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);

	KCAGE_STAT_INCR(kt_wakeups);
	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
	pass = 0;
	last_pass = 0;

#ifdef KCAGE_STATS
	scan_start = lbolt;
#endif

again:
	if (!kcage_on)
		goto loop;

	KCAGE_STAT_INCR(kt_scans);
	KCAGE_STAT_INCR_SCAN(kt_passes);

	did_something = 0;
	pages_skipped = 0;
	shared_skipped = 0;
	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
	    (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {

		if (start_pfn == PFN_INVALID)
			start_pfn = pfn;
		else if (start_pfn == pfn) {
			last_pass = pass;
			pass += 1;
			/*
			 * Did a complete walk of kernel cage, but didn't free
			 * any pages.  If only one cpu is online then
			 * stop kernel cage walk and try expanding.
			 */
			if (ncpus_online == 1 && did_something == 0) {
				KCAGE_STAT_INCR(kt_cageout_break);
				break;
			}
		}

		pp = page_numtopp_nolock(pfn);
		if (pp == NULL) {
			continue;
		}

		KCAGE_STAT_INCR_SCAN(kt_examined);

		/*
		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
		 * of the lock. If one is missed it will be seen next
		 * time through.
		 *
		 * Skip non-caged-pages. These pages can exist in the cage
		 * because, if a page encountered during cage expansion is
		 * long-term locked, the lock prevents the expansion logic
		 * from setting the P_NORELOC flag. Hence, non-caged-pages
		 * surrounded by caged-pages.
		 */
		if (!PP_ISNORELOC(pp)) {
			switch (kcage_assimilate_page(pp, &nfreed)) {
			case 0:
				did_something = 1;
				KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
				    nfreed);
				break;

			case EBUSY:
			case ERANGE:
				did_something = 1;
				KCAGE_STAT_INCR_SCAN(kt_gotone);
				break;

			case EAGAIN:
			case ENOMEM:
				break;

			default:
				/* catch this with debug kernels */
				ASSERT(0);
				break;
			}

			continue;
		} else {
			int prm;

			if (PP_ISFREE(pp)) {
				continue;
			}

			if ((pp->p_vnode == &kvp && pp->p_lckcnt > 0) ||
			    !page_trylock(pp, SE_EXCL)) {
				KCAGE_STAT_INCR_SCAN(kt_cantlock);
				continue;
			}

			/* P_NORELOC bit should not have gone away. */
			ASSERT(PP_ISNORELOC(pp));
			if (PP_ISFREE(pp) || (pp->p_vnode == &kvp &&
			    pp->p_lckcnt > 0)) {
				page_unlock(pp);
				continue;
			}

			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
			if (hat_page_getshare(pp) > shared_level) {
				page_unlock(pp);
				pages_skipped = 1;
				shared_skipped = 1;
				KCAGE_STAT_INCR_SCAN(kt_skipshared);
				continue;
			}

			/*
			 * In pass {0, 1}, skip page if ref bit is set.
			 * In pass {0, 1, 2}, skip page if mod bit is set.
			 */
			prm = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);

			/* On first pass ignore ref'd pages */
			if (pass <= 1 && (prm & P_REF)) {
				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
				pages_skipped = 1;
				page_unlock(pp);
				continue;
			}

			/* On pass 2, page_destroy if mod bit is not set */
			if (pass <= 2) {
				if (pp->p_szc != 0 || (prm & P_MOD) ||
				    pp->p_lckcnt || pp->p_cowcnt) {
					pages_skipped = 1;
					page_unlock(pp);
				} else {

					/*
					 * unload the mappings before
					 * checking if mod bit is set
					 */
					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);

					/*
					 * skip this page if modified
					 */
					if (hat_ismod(pp)) {
						pages_skipped = 1;
						page_unlock(pp);
						continue;
					}

					KCAGE_STAT_INCR_SCAN(kt_destroy);
					page_destroy(pp, 0);
					did_something = 1;
				}
				continue;
			}

			if (kcage_invalidate_page(pp, &nfreed) == 0) {
				did_something = 1;
				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
			}

			/*
			 * No need to drop the page lock here.
			 * Kcage_invalidate_page has done that for us
			 * either explicitly or through a page_free.
			 */
		}
	}

	/*
	 * Expand the cage only if available cage memory is really low.
	 * This test is done only after a complete scan of the cage.
	 * The reason for not checking and expanding more often is to
	 * avoid rapid expansion of the cage. Naturally, scanning the
	 * cage takes time. So by scanning first, we use that work as a
	 * delay loop in between expand decisions.
	 */

	scan_again = 0;
	if (kcage_freemem < kcage_minfree || kcage_needfree) {
		/*
		 * Kcage_expand() will return a non-zero value if it was
		 * able to expand the cage -- whether or not the new
		 * pages are free and immediately usable.  If non-zero,
		 * we do another scan of the cage.
		 * The pages might be freed during that scan or by the
		 * time we get back here. If not, we will attempt another
		 * expansion.
		 * However, if kcage_expand() returns zero, then it was
		 * unable to expand the cage. This is the case when the
		 * growth list is exhausted, therefore no work was done
		 * and there is no reason to scan the cage again.
		 * Note: Kernel cage scan is not repeated on single-cpu
		 * systems to avoid the kernel cage thread hogging the cpu.
		 */
		if (pass <= 3 && pages_skipped && ncpus_online > 1)
			scan_again = 1;
		else
			(void) kcage_expand(); /* don't scan again */
	} else if (kcage_freemem < kcage_lotsfree) {
		/*
		 * If available cage memory is less than abundant
		 * and a full scan of the cage has not yet been completed,
		 * or a scan has completed and some work was performed,
		 * or pages were skipped because of sharing,
		 * or we simply have not yet completed two passes,
		 * then do another scan.
		 */
		if (pass <= 2 && pages_skipped)
			scan_again = 1;
		if (pass == last_pass || did_something)
			scan_again = 1;
		else if (shared_skipped && shared_level < (8<<24)) {
			shared_level <<= 1;
			scan_again = 1;
		}
	}

	if (scan_again && ncpus_online > 1)
		goto again;
	else {
		if (shared_level > 8)
			shared_level >>= 1;

		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
		KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start);
		KCAGE_STAT_INC_SCAN_INDEX;
		goto loop;
	}

	/*NOTREACHED*/
}

void
kcage_cageout_wakeup()
{
	if (mutex_tryenter(&kcage_cageout_mutex)) {
		if (kcage_cageout_ready) {
			cv_signal(&kcage_cageout_cv);
		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
			/*
			 * Available cage memory is really low. Time to
			 * start expanding the cage. However, the
			 * kernel cage thread is not yet ready to
			 * do the work. Use *this* thread, which is
			 * most likely to be t0, to do the work.
			 */
			KCAGE_STAT_INCR(kcw_expandearly);
			(void) kcage_expand();
			KCAGE_STAT_INC_SCAN_INDEX;
		}

		mutex_exit(&kcage_cageout_mutex);
	}
	/* else, kernel cage thread is already running */
}

void
kcage_tick()
{
	/*
	 * Once per second we wake up all the threads throttled
	 * waiting for cage memory, in case we've become stuck
	 * and haven't made forward progress expanding the cage.
	 */
	if (kcage_on && kcage_cageout_ready)
		cv_broadcast(&kcage_throttle_cv);
}