/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/callb.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/systm.h>		/* for bzero */
#include <sys/memlist.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>	/* for NOMEMWAIT() */
#include <sys/atomic.h>		/* used to update kcage_freemem */
#include <sys/kmem.h>		/* for kmem_reap */
#include <sys/errno.h>
#include <sys/mem_cage.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/hat.h>
#include <vm/vm_dep.h>
#include <sys/mem_config.h>
#include <sys/lgrp.h>

extern pri_t maxclsyspri;

#ifdef DEBUG
#define	KCAGE_STATS
#endif

#ifdef KCAGE_STATS

#define	KCAGE_STATS_VERSION	9	/* can help report generators */
#define	KCAGE_STATS_NSCANS	256	/* depth of scan statistics buffer */

struct kcage_stats_scan {
    /* managed by KCAGE_STAT_* macros */
    clock_t scan_lbolt;
    uint_t scan_id;

    /* set in kcage_cageout() */
    uint_t kt_passes;
    clock_t kt_ticks;
    pgcnt_t kt_kcage_freemem_start;
    pgcnt_t kt_kcage_freemem_end;
    pgcnt_t kt_freemem_start;
    pgcnt_t kt_freemem_end;
    uint_t kt_examined;
    uint_t kt_cantlock;
    uint_t kt_gotone;
    uint_t kt_gotonefree;
    uint_t kt_skiplevel;
    uint_t kt_skipshared;
    uint_t kt_skiprefd;
    uint_t kt_destroy;

    /* set in kcage_invalidate_page() */
    uint_t kip_reloclocked;
    uint_t kip_relocmod;
    uint_t kip_destroy;
    uint_t kip_nomem;
    uint_t kip_demotefailed;

    /* set in kcage_expand() */
    uint_t ke_wanted;
    uint_t ke_examined;
    uint_t ke_lefthole;
    uint_t ke_gotone;
    uint_t ke_gotonefree;
};

struct kcage_stats {
    /* managed by KCAGE_STAT_* macros */
    uint_t version;
    uint_t size;

    /* set in kcage_cageout */
    uint_t kt_wakeups;
    uint_t kt_scans;
    uint_t kt_cageout_break;

    /* set in kcage_expand */
    uint_t ke_calls;
    uint_t ke_nopfn;
    uint_t ke_nopaget;
    uint_t ke_isnoreloc;
    uint_t ke_deleting;
    uint_t ke_lowfreemem;
    uint_t ke_terminate;

    /* set in kcage_freemem_add() */
    uint_t kfa_trottlewake;

    /* set in kcage_freemem_sub() */
    uint_t kfs_cagewake;

    /* set in kcage_create_throttle */
    uint_t kct_calls;
    uint_t kct_cageout;
    uint_t kct_critical;
    uint_t kct_exempt;
    uint_t kct_cagewake;
    uint_t kct_wait;
    uint_t kct_progress;
    uint_t kct_noprogress;
    uint_t kct_timeout;

    /* set in kcage_cageout_wakeup */
    uint_t kcw_expandearly;

    /* managed by KCAGE_STAT_* macros */
    uint_t scan_array_size;
    uint_t scan_index;
    struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
};
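
/*
 * Illustrative note (not part of any interface): on DEBUG kernels the
 * counters above can be examined with mdb, for example
 *	> kcage_stats::print struct kcage_stats
 * The version and size fields exist so that external report generators
 * can sanity-check the layout they are reading.
 */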

static struct kcage_stats kcage_stats;
static struct kcage_stats_scan kcage_stats_scan_zero;

/*
 * No real need for atomics here. For the most part the incs and sets are
 * done by the kernel cage thread. There are a few that are done by any
 * number of other threads. Those cases are noted by comments.
 */
#define	KCAGE_STAT_INCR(m)	kcage_stats.m++

#define	KCAGE_STAT_NINCR(m, v)	kcage_stats.m += (v)

#define	KCAGE_STAT_INCR_SCAN(m)	\
	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)

#define	KCAGE_STAT_NINCR_SCAN(m, v) \
	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SET(m, v)	kcage_stats.m = (v)

#define	KCAGE_STAT_SETZ(m, v)	\
	if (kcage_stats.m == 0) kcage_stats.m = (v)

#define	KCAGE_STAT_SET_SCAN(m, v)	\
	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SETZ_SCAN(m, v) \
	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_INC_SCAN_INDEX \
	KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \
	KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
	kcage_stats.scan_index = \
	(kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
	kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero

#define	KCAGE_STAT_INIT_SCAN_INDEX \
	kcage_stats.version = KCAGE_STATS_VERSION; \
	kcage_stats.size = sizeof (kcage_stats); \
	kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
	kcage_stats.scan_index = 0

#else /* KCAGE_STATS */

#define	KCAGE_STAT_INCR(v)
#define	KCAGE_STAT_NINCR(m, v)
#define	KCAGE_STAT_INCR_SCAN(v)
#define	KCAGE_STAT_NINCR_SCAN(m, v)
#define	KCAGE_STAT_SET(m, v)
#define	KCAGE_STAT_SETZ(m, v)
#define	KCAGE_STAT_SET_SCAN(m, v)
#define	KCAGE_STAT_SETZ_SCAN(m, v)
#define	KCAGE_STAT_INC_SCAN_INDEX
#define	KCAGE_STAT_INIT_SCAN_INDEX

#endif /* KCAGE_STATS */

static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
static kcondvar_t kcage_throttle_cv;

static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
kthread_id_t kcage_cageout_thread;	/* to aid debugging */

static kmutex_t kcage_range_mutex;	/* protects kcage_glist elements */

/*
 * Cage expansion happens within a range.
 */
struct kcage_glist {
    struct kcage_glist *next;
    pfn_t base;
    pfn_t lim;
    pfn_t curr;
    int decr;
};
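
/*
 * kcage_glist is the head of the growth list built at startup (and
 * updated by DR); kcage_current_glist tracks the element from which
 * kcage_get_pfn() is currently handing out pfns.  Within an element,
 * curr advances from base toward lim when decr is zero, and from lim
 * toward base otherwise.
 */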

static struct kcage_glist *kcage_glist;
static struct kcage_glist *kcage_current_glist;

/*
 * The firstfree element is provided so that kmem_alloc can be avoided
 * until that cage has somewhere to go. This is not currently a problem
 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
 */
static struct kcage_glist kcage_glist_firstfree;
static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;

/*
 * Miscellaneous forward references
 */
static struct kcage_glist *kcage_glist_alloc(void);
static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
static void kcage_cageout(void);
static int kcage_invalidate_page(page_t *, pgcnt_t *);
static int kcage_setnoreloc_pages(page_t *, se_t);

/*
 * Kernel Memory Cage counters and thresholds.
 */
int kcage_on = 0;
pgcnt_t kcage_freemem;
pgcnt_t kcage_needfree;
pgcnt_t kcage_lotsfree;
pgcnt_t kcage_desfree;
pgcnt_t kcage_minfree;
pgcnt_t kcage_throttlefree;
pgcnt_t kcage_reserve;
int kcage_maxwait = 10;	/* in seconds */

/* when we use lp for kmem we start the cage at a higher initial value */
pgcnt_t kcage_kmemlp_mincage;

#ifdef DEBUG
pgcnt_t kcage_pagets;
#define	KCAGEPAGETS_INC()	kcage_pagets++
#else
#define	KCAGEPAGETS_INC()
#endif

/*
 * Startup and Dynamic Reconfiguration interfaces.
 * kcage_range_lock()
 * kcage_range_unlock()
 * kcage_range_islocked()
 * kcage_range_add()
 * kcage_range_del()
 * kcage_init()
 * kcage_set_thresholds()
 */

int
kcage_range_trylock(void)
{
    return (mutex_tryenter(&kcage_range_mutex));
}

void
kcage_range_lock(void)
{
    mutex_enter(&kcage_range_mutex);
}

void
kcage_range_unlock(void)
{
    mutex_exit(&kcage_range_mutex);
}

int
kcage_range_islocked(void)
{
    return (MUTEX_HELD(&kcage_range_mutex));
}

/*
 * Called from page_get_contig_pages to get the approximate kcage pfn range
 * for exclusion from search for contiguous pages. This routine is called
 * without kcage_range lock (kcage routines can call page_get_contig_pages
 * through page_relocate) and with the assumption, based on kcage_range_add,
 * that kcage_current_glist always contains a valid pointer.
 */

int
kcage_current_pfn(pfn_t *pfncur)
{
    struct kcage_glist *lp = kcage_current_glist;

    ASSERT(kcage_on);

    ASSERT(lp != NULL);

    *pfncur = lp->curr;

    return (lp->decr);
}

int
kcage_range_init(struct memlist *ml, int decr)
{
    int ret = 0;

    ASSERT(kcage_range_islocked());

    if (decr) {
        while (ml->next != NULL)
            ml = ml->next;
    }

    while (ml != NULL) {
        ret = kcage_range_add(btop(ml->address), btop(ml->size), decr);
        if (ret)
            break;

        ml = (decr ? ml->prev : ml->next);
    }

    return (ret);
}

/*
 * Third arg controls direction of growth: 0: increasing pfns,
 * 1: decreasing.
 * Calls to add and delete must be protected by calls to
 * kcage_range_lock() and kcage_range_unlock().
 */
int
kcage_range_add(pfn_t base, pgcnt_t npgs, int decr)
{
    struct kcage_glist *new, **lpp;
    pfn_t lim;

    ASSERT(kcage_range_islocked());

    ASSERT(npgs != 0);
    if (npgs == 0)
        return (EINVAL);

    lim = base + npgs;

    ASSERT(lim > base);
    if (lim <= base)
        return (EINVAL);

    new = kcage_glist_alloc();
    if (new == NULL) {
        return (ENOMEM);
    }

    new->base = base;
    new->lim = lim;
    new->decr = decr;
    if (new->decr != 0)
        new->curr = new->lim;
    else
        new->curr = new->base;
    /*
     * Any overlapping existing ranges are removed by deleting
     * from the new list as we search for the tail.
     */
    lpp = &kcage_glist;
    while (*lpp != NULL) {
        int ret;
        ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
        if (ret != 0)
            return (ret);
        lpp = &(*lpp)->next;
    }

    *lpp = new;

    if (kcage_current_glist == NULL) {
        kcage_current_glist = kcage_glist;
    }

    return (0);
}

/*
 * Calls to add and delete must be protected by calls to
 * kcage_range_lock() and kcage_range_unlock().
 */
int
kcage_range_delete(pfn_t base, pgcnt_t npgs)
{
    struct kcage_glist *lp;
    pfn_t lim;

    ASSERT(kcage_range_islocked());

    ASSERT(npgs != 0);
    if (npgs == 0)
        return (EINVAL);

    lim = base + npgs;

    ASSERT(lim > base);
    if (lim <= base)
        return (EINVAL);

    /*
     * Check if the delete is OK first as a number of elements
     * might be involved and it will be difficult to go
     * back and undo (can't just add the range back in).
     */
    for (lp = kcage_glist; lp != NULL; lp = lp->next) {
        /*
         * If there have been no pages allocated from this
         * element, we don't need to check it.
         */
        if ((lp->decr == 0 && lp->curr == lp->base) ||
            (lp->decr != 0 && lp->curr == lp->lim))
            continue;
        /*
         * If the element does not overlap, it's OK.
         */
        if (base >= lp->lim || lim <= lp->base)
            continue;
        /*
         * Overlapping element: Does the range to be deleted
         * overlap the area already used? If so fail.
         */
        if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
            return (EBUSY);
        }
        if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
            return (EBUSY);
        }
    }
    return (kcage_glist_delete(base, lim, &kcage_glist));
}

/*
 * Calls to add and delete must be protected by calls to
 * kcage_range_lock() and kcage_range_unlock().
 * This routine gets called after successful Solaris memory
 * delete operation from DR post memory delete routines.
 */
int
kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
{
    pfn_t lim;

    ASSERT(kcage_range_islocked());

    ASSERT(npgs != 0);
    if (npgs == 0)
        return (EINVAL);

    lim = base + npgs;

    ASSERT(lim > base);
    if (lim <= base)
        return (EINVAL);

    return (kcage_glist_delete(base, lim, &kcage_glist));
}
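
/*
 * Sketch of the calling pattern expected by the add/delete interfaces
 * above; the actual callers are the startup code and the platform DR
 * modules:
 *
 *	kcage_range_lock();
 *	ret = kcage_range_add(base, npgs, decr);	(or kcage_range_delete)
 *	kcage_range_unlock();
 */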

/*
 * No locking is required here as the whole operation is covered
 * by the kcage_range_lock().
 */
static struct kcage_glist *
kcage_glist_alloc(void)
{
    struct kcage_glist *new;

    if ((new = kcage_glist_freelist) != NULL) {
        kcage_glist_freelist = new->next;
        bzero(new, sizeof (*new));
    } else {
        new = kmem_zalloc(sizeof (struct kcage_glist), KM_NOSLEEP);
    }
    return (new);
}

static void
kcage_glist_free(struct kcage_glist *lp)
{
    lp->next = kcage_glist_freelist;
    kcage_glist_freelist = lp;
}

static int
kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
{
    struct kcage_glist *lp, *prev = *lpp;

    while ((lp = *lpp) != NULL) {
        if (lim > lp->base && base < lp->lim) {
            /* The delete range overlaps this element. */
            if (base <= lp->base && lim >= lp->lim) {
                /* Delete whole element. */
                *lpp = lp->next;
                if (lp == kcage_current_glist) {
                    /* This can never happen. */
                    ASSERT(kcage_current_glist != prev);
                    kcage_current_glist = prev;
                }
                kcage_glist_free(lp);
                continue;
            }

            /* Partial delete. */
            if (base > lp->base && lim < lp->lim) {
                struct kcage_glist *new;

                /*
                 * Remove a section from the middle,
                 * need to allocate a new element.
                 */
                new = kcage_glist_alloc();
                if (new == NULL) {
                    return (ENOMEM);
                }

                /*
                 * Transfer unused range to new.
                 * Edit lp in place to preserve
                 * kcage_current_glist.
                 */
                new->decr = lp->decr;
                if (new->decr != 0) {
                    new->base = lp->base;
                    new->lim = base;
                    new->curr = base;

                    lp->base = lim;
                } else {
                    new->base = lim;
                    new->lim = lp->lim;
                    new->curr = new->base;

                    lp->lim = base;
                }

                /* Insert new. */
                new->next = lp->next;
                lp->next = new;
                lpp = &lp->next;
            } else {
                /* Delete part of current block. */
                if (base > lp->base) {
                    ASSERT(lim >= lp->lim);
                    ASSERT(base < lp->lim);
                    if (lp->decr != 0 &&
                        lp->curr == lp->lim)
                        lp->curr = base;
                    lp->lim = base;
                } else {
                    ASSERT(base <= lp->base);
                    ASSERT(lim > lp->base);
                    if (lp->decr == 0 &&
                        lp->curr == lp->base)
                        lp->curr = lim;
                    lp->base = lim;
                }
            }
        }
        prev = *lpp;
        lpp = &(*lpp)->next;
    }

    return (0);
}

/*
 * The caller of kcage_get_pfn must hold the kcage_range_lock to make
 * sure that there are no concurrent calls. The same lock
 * must be obtained for range add and delete by calling
 * kcage_range_lock() and kcage_range_unlock().
 */
static pfn_t
kcage_get_pfn(void)
{
    struct kcage_glist *lp;
    pfn_t pfn;

    ASSERT(kcage_range_islocked());

    lp = kcage_current_glist;
    while (lp != NULL) {
        if (lp->decr != 0) {
            if (lp->curr != lp->base) {
                pfn = --lp->curr;
                return (pfn);
            }
        } else {
            if (lp->curr != lp->lim) {
                pfn = lp->curr++;
                return (pfn);
            }
        }

        lp = lp->next;
        if (lp)
            kcage_current_glist = lp;
    }

    return (PFN_INVALID);
}

/*
 * Walk the physical address space of the cage.
 * This routine does not guarantee to return PFNs in the order
 * in which they were allocated to the cage. Instead, it walks
 * each range as they appear on the growth list, returning the PFNs
 * of each range in ascending order.
 *
 * To begin scanning at lower edge of cage, reset should be nonzero.
 * To step through cage, reset should be zero.
 *
 * PFN_INVALID will be returned when the upper end of the cage is
 * reached -- indicating a full scan of the cage has been completed since
 * previous reset. PFN_INVALID will continue to be returned until
 * kcage_walk_cage is reset.
 *
 * It is possible to receive a PFN_INVALID result on reset if a growth
 * list is not installed or if none of the PFNs in the installed list have
 * been allocated to the cage. In other words, there is no cage.
 *
 * Caller need not hold kcage_range_lock while calling this function
 * as the front part of the list is static - pages never come out of
 * the cage.
 *
 * The caller is expected to only be kcage_cageout().
 */
static pfn_t
kcage_walk_cage(int reset)
{
    static struct kcage_glist *lp = NULL;
    static pfn_t pfn;

    if (reset)
        lp = NULL;
    if (lp == NULL) {
        lp = kcage_glist;
        pfn = PFN_INVALID;
    }
again:
    if (pfn == PFN_INVALID) {
        if (lp == NULL)
            return (PFN_INVALID);

        if (lp->decr != 0) {
            /*
             * In this range the cage grows from the highest
             * address towards the lowest.
             * Arrange to return pfns from curr to lim-1,
             * inclusive, in ascending order.
             */

            pfn = lp->curr;
        } else {
            /*
             * In this range the cage grows from the lowest
             * address towards the highest.
             * Arrange to return pfns from base to curr,
             * inclusive, in ascending order.
             */

            pfn = lp->base;
        }
    }

    if (lp->decr != 0) {		/* decrementing pfn */
        if (pfn == lp->lim) {
            /* Don't go beyond the static part of the glist. */
            if (lp == kcage_current_glist)
                lp = NULL;
            else
                lp = lp->next;
            pfn = PFN_INVALID;
            goto again;
        }

        ASSERT(pfn >= lp->curr && pfn < lp->lim);
    } else {			/* incrementing pfn */
        if (pfn == lp->curr) {
            /* Don't go beyond the static part of the glist. */
            if (lp == kcage_current_glist)
                lp = NULL;
            else
                lp = lp->next;
            pfn = PFN_INVALID;
            goto again;
        }

        ASSERT(pfn >= lp->base && pfn < lp->curr);
    }

    return (pfn++);
}

/*
 * Callback functions to recalc cage thresholds after
 * Kphysm memory add/delete operations.
 */
/*ARGSUSED*/
static void
kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
{
    kcage_recalc_thresholds();
}

/*ARGSUSED*/
static int
kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
{
    /* TODO: when should cage refuse memory delete requests? */
    return (0);
}

/*ARGSUSED*/
static void
kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
{
    kcage_recalc_thresholds();
}

static kphysm_setup_vector_t kcage_kphysm_vectors = {
    KPHYSM_SETUP_VECTOR_VERSION,
    kcage_kphysm_postadd_cb,
    kcage_kphysm_predel_cb,
    kcage_kphysm_postdel_cb
};

/*
 * This is called before a CPR suspend and after a CPR resume. We have to
 * turn off kcage_cageout_ready before a suspend, and turn it back on after a
 * restart.
 */
/*ARGSUSED*/
static boolean_t
kcage_cageout_cpr(void *arg, int code)
{
    if (code == CB_CODE_CPR_CHKPT) {
        ASSERT(kcage_cageout_ready);
        kcage_cageout_ready = 0;
        return (B_TRUE);
    } else if (code == CB_CODE_CPR_RESUME) {
        ASSERT(kcage_cageout_ready == 0);
        kcage_cageout_ready = 1;
        return (B_TRUE);
    }
    return (B_FALSE);
}

/*
 * kcage_recalc_preferred_size() increases initial cage size to improve large
 * page availability when lp for kmem is enabled and kpr is disabled
 */
static pgcnt_t
kcage_recalc_preferred_size(pgcnt_t preferred_size)
{
    if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
        pgcnt_t lpmincage = kcage_kmemlp_mincage;
        if (lpmincage == 0) {
            lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
                segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
        }
        kcage_kmemlp_mincage = MIN(lpmincage,
            (segkmem_kmemlp_max / PAGESIZE));
        preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
    }
    return (preferred_size);
}

/*
 * Kcage_init() builds the cage and initializes the cage thresholds.
 * The size of the cage is determined by the argument preferred_size,
 * or the actual amount of memory, whichever is smaller.
 */
void
kcage_init(pgcnt_t preferred_size)
{
    pgcnt_t wanted;
    pfn_t pfn;
    page_t *pp;
    extern struct vnode kvp;
    extern void page_list_noreloc_startup(page_t *);

    ASSERT(!kcage_on);
    ASSERT(kcage_range_islocked());

    /* increase preferred cage size for lp for kmem */
    preferred_size = kcage_recalc_preferred_size(preferred_size);

    /* Debug note: initialize this now so early expansions can stat */
    KCAGE_STAT_INIT_SCAN_INDEX;

    /*
     * Initialize cage thresholds and install kphysm callback.
     * If we can't arrange to have the thresholds track with
     * available physical memory, then the cage thresholds may
     * end up over time at levels that adversely affect system
     * performance; so, bail out.
     */
    kcage_recalc_thresholds();
    if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
        ASSERT(0);		/* Catch this in DEBUG kernels. */
        return;
    }

    /*
     * Limit startup cage size within the range of kcage_minfree
     * and availrmem, inclusively.
     */
    wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);

    /*
     * Construct the cage. PFNs are allocated from the glist. It
     * is assumed that the list has been properly ordered for the
     * platform by the platform code. Typically, this is as simple
     * as calling kcage_range_init(phys_avail, decr), where decr is
     * 1 if the kernel has been loaded into upper end of physical
     * memory, or 0 if the kernel has been loaded at the low end.
     *
     * Note: it is assumed that we are in the startup flow, so there
     * is no reason to grab the page lock.
     */
    kcage_freemem = 0;
    pfn = PFN_INVALID;		/* prime for alignment test */
    while (wanted != 0) {
        if ((pfn = kcage_get_pfn()) == PFN_INVALID)
            break;

        if ((pp = page_numtopp_nolock(pfn)) != NULL) {
            KCAGEPAGETS_INC();
            /*
             * Set the noreloc state on the page.
             * If the page is free and not already
             * on the noreloc list then move it.
             */
            if (PP_ISFREE(pp)) {
                if (PP_ISNORELOC(pp) == 0)
                    page_list_noreloc_startup(pp);
            } else {
                ASSERT(pp->p_szc == 0);
                PP_SETNORELOC(pp);
            }
        }
        PLCNT_XFER_NORELOC(pp);
        wanted -= 1;
    }

    /*
     * Need to go through and find kernel allocated pages
     * and capture them into the Cage. These will primarily
     * be pages gotten through boot_alloc().
     */
    if (kvp.v_pages) {

        pp = kvp.v_pages;
        do {
            ASSERT(!PP_ISFREE(pp));
            ASSERT(pp->p_szc == 0);
            PP_SETNORELOC(pp);
        } while ((pp = pp->p_vpnext) != kvp.v_pages);

    }

    kcage_on = 1;

    /*
     * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
     * after the cageout thread is blocked, and executes from cpr_resume()
     * before the cageout thread is restarted. By executing in this class,
     * we are assured that the kernel cage thread won't miss wakeup calls
     * and also CPR's larger kmem_alloc requests will not fail after
     * CPR shuts down the cageout kernel thread.
     */
    (void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
        "cageout");

    /*
     * Coalesce pages to improve large page availability. A better fix
     * would be to coalesce pages as they are included in the cage
     */
    if (SEGKMEM_USE_LARGEPAGES) {
        extern void page_freelist_coalesce_all(int mnode);
        extern int max_mem_nodes;
        int mnode, max_mnodes = max_mem_nodes;
        for (mnode = 0; mnode < max_mnodes; mnode++) {
            page_freelist_coalesce_all(mnode);
        }
    }
}
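
/*
 * Recompute the cage watermarks from total_pages.  Called at cage
 * initialization and from the kphysm add/delete callbacks above.  Any
 * threshold that was already nonzero on the first call (e.g. patched
 * via /etc/system) is remembered and restored as the baseline on later
 * recalculations; only thresholds left at zero are derived here.
 */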
void
kcage_recalc_thresholds()
{
    static int first = 1;
    static pgcnt_t init_lotsfree;
    static pgcnt_t init_desfree;
    static pgcnt_t init_minfree;
    static pgcnt_t init_throttlefree;
    static pgcnt_t init_reserve;

    /* TODO: any reason to take more care than this with live editing? */
    mutex_enter(&kcage_cageout_mutex);
    mutex_enter(&freemem_lock);

    if (first) {
        first = 0;
        init_lotsfree = kcage_lotsfree;
        init_desfree = kcage_desfree;
        init_minfree = kcage_minfree;
        init_throttlefree = kcage_throttlefree;
        init_reserve = kcage_reserve;
    } else {
        kcage_lotsfree = init_lotsfree;
        kcage_desfree = init_desfree;
        kcage_minfree = init_minfree;
        kcage_throttlefree = init_throttlefree;
        kcage_reserve = init_reserve;
    }

    if (kcage_lotsfree == 0)
        kcage_lotsfree = MAX(32, total_pages / 256);

    if (kcage_minfree == 0)
        kcage_minfree = MAX(32, kcage_lotsfree / 2);

    if (kcage_desfree == 0)
        kcage_desfree = MAX(32, kcage_minfree);

    if (kcage_throttlefree == 0)
        kcage_throttlefree = MAX(32, kcage_minfree / 2);

    if (kcage_reserve == 0)
        kcage_reserve = MIN(32, kcage_throttlefree / 2);

    mutex_exit(&freemem_lock);
    mutex_exit(&kcage_cageout_mutex);

    if (kcage_cageout_ready) {
        if (kcage_freemem < kcage_desfree)
            kcage_cageout_wakeup();

        if (kcage_needfree) {
            mutex_enter(&kcage_throttle_mutex);
            cv_broadcast(&kcage_throttle_cv);
            mutex_exit(&kcage_throttle_mutex);
        }
    }
}

/*
 * Pageout interface:
 * kcage_cageout_init()
 */
void
kcage_cageout_init()
{
    if (kcage_on) {

        (void) thread_create(NULL, 0, kcage_cageout,
            NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);
    }
}


/*
 * VM Interfaces:
 * kcage_create_throttle()
 * kcage_freemem_add()
 * kcage_freemem_sub()
 */

/*
 * Wakeup cageout thread and throttle waiting for the number of pages
 * requested to become available.  For non-critical requests, a
 * timeout is added, since freemem accounting is separate from cage
 * freemem accounting: it's possible for us to get stuck and not make
 * forward progress even though there was sufficient freemem before
 * arriving here.
 */
int
kcage_create_throttle(pgcnt_t npages, int flags)
{
    int niter = 0;
    pgcnt_t lastfree;
    int enough = kcage_freemem > kcage_throttlefree + npages;

    KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */

    kcage_cageout_wakeup();		/* just to be sure */
    KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */

    /*
     * Obviously, we can't throttle the cageout thread since
     * we depend on it.  We also can't throttle the panic thread.
     */
    if (curthread == kcage_cageout_thread || panicstr) {
        KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
        return (KCT_CRIT);
    }

    /*
     * Don't throttle threads which are critical for proper
     * vm management if we're above kcage_throttlefree or
     * if freemem is very low.
     */
    if (NOMEMWAIT()) {
        if (enough) {
            KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
            return (KCT_CRIT);
        } else if (freemem < minfree) {
            KCAGE_STAT_INCR(kct_critical);	/* unprotected incr. */
            return (KCT_CRIT);
        }
    }

    /*
     * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
     */
    if (DISP_PRIO(curthread) > maxclsyspri &&
        kcage_freemem > kcage_reserve) {
        KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
        return (KCT_CRIT);
    }

    /*
     * Cause all other threads (which are assumed to not be
     * critical to cageout) to wait here until their request
     * can be satisfied. Be a little paranoid and wake the
     * kernel cage on each loop through this logic.
     */
    while (kcage_freemem < kcage_throttlefree + npages) {
        ASSERT(kcage_on);

        lastfree = kcage_freemem;

        if (kcage_cageout_ready) {
            mutex_enter(&kcage_throttle_mutex);

            kcage_needfree += npages;
            KCAGE_STAT_INCR(kct_wait);

            kcage_cageout_wakeup();
            KCAGE_STAT_INCR(kct_cagewake);

            cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);

            kcage_needfree -= npages;

            mutex_exit(&kcage_throttle_mutex);
        } else {
            /*
             * NOTE: atomics are used just in case we enter
             * mp operation before the cageout thread is ready.
             */
            atomic_add_long(&kcage_needfree, npages);

            kcage_cageout_wakeup();
            KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */

            atomic_add_long(&kcage_needfree, -npages);
        }

        if ((flags & PG_WAIT) == 0) {
            if (kcage_freemem > lastfree) {
                KCAGE_STAT_INCR(kct_progress);
                niter = 0;
            } else {
                KCAGE_STAT_INCR(kct_noprogress);
                if (++niter >= kcage_maxwait) {
                    KCAGE_STAT_INCR(kct_timeout);
                    return (KCT_FAILURE);
                }
            }
        }
    }
    return (KCT_NONCRIT);
}

void
kcage_freemem_add(pgcnt_t npages)
{
    extern void wakeup_pcgs(void);

    atomic_add_long(&kcage_freemem, npages);

    wakeup_pcgs();	/* wakeup threads in pcgs() */

    if (kcage_needfree != 0 &&
        kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {

        mutex_enter(&kcage_throttle_mutex);
        cv_broadcast(&kcage_throttle_cv);
        KCAGE_STAT_INCR(kfa_trottlewake);
        mutex_exit(&kcage_throttle_mutex);
    }
}

void
kcage_freemem_sub(pgcnt_t npages)
{
    atomic_add_long(&kcage_freemem, -npages);

    if (kcage_freemem < kcage_desfree) {
        kcage_cageout_wakeup();
        KCAGE_STAT_INCR(kfs_cagewake);	/* unprotected incr. */
    }
}

/*
 * return 0 on failure and 1 on success.
 */
static int
kcage_setnoreloc_pages(page_t *rootpp, se_t se)
{
    pgcnt_t npgs, i;
    page_t *pp;
    pfn_t rootpfn = page_pptonum(rootpp);
    uint_t szc;

    ASSERT(!PP_ISFREE(rootpp));
    ASSERT(PAGE_LOCKED_SE(rootpp, se));
    if (!group_page_trylock(rootpp, se)) {
        return (0);
    }
    szc = rootpp->p_szc;
    if (szc == 0) {
        /*
         * The szc of a locked page can only change for pages that are
         * non-swapfs (i.e. anonymous memory) file system pages.
         */
        ASSERT(rootpp->p_vnode != NULL &&
            rootpp->p_vnode != &kvp &&
            !IS_SWAPFSVP(rootpp->p_vnode));
        PP_SETNORELOC(rootpp);
        return (1);
    }
    npgs = page_get_pagecnt(szc);
    ASSERT(IS_P2ALIGNED(rootpfn, npgs));
    pp = rootpp;
    for (i = 0; i < npgs; i++, pp++) {
        ASSERT(PAGE_LOCKED_SE(pp, se));
        ASSERT(!PP_ISFREE(pp));
        ASSERT(pp->p_szc == szc);
        PP_SETNORELOC(pp);
    }
    group_page_unlock(rootpp);
    return (1);
}

/*
 * Attempt to convert page to a caged page (set the P_NORELOC flag).
 * If successful and the page is free, move page to the tail of whichever
 * list it is on.
 * Returns:
 *	EBUSY  page already locked, assimilated but not free.
 *	ENOMEM page assimilated, but memory too low to relocate. Page not free.
 *	EAGAIN page not assimilated. Page not free.
 *	ERANGE page assimilated. Page not root.
 *	0      page assimilated. Page free.
 *	*nfreedp number of pages freed.
 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
 * to distinguish between a page that was already a NORELOC page from
 * those newly converted to NORELOC pages by this invocation of
 * kcage_assimilate_page.
 */
static int
kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
{
    if (page_trylock(pp, SE_EXCL)) {
        if (PP_ISNORELOC(pp)) {
check_free_and_return:
            if (PP_ISFREE(pp)) {
                page_unlock(pp);
                *nfreedp = 0;
                return (0);
            } else {
                page_unlock(pp);
                return (EBUSY);
            }
            /*NOTREACHED*/
        }
    } else {
        if (page_trylock(pp, SE_SHARED)) {
            if (PP_ISNORELOC(pp))
                goto check_free_and_return;
        } else
            return (EAGAIN);

        if (!PP_ISFREE(pp)) {
            page_unlock(pp);
            return (EAGAIN);
        }

        /*
         * Need to upgrade the lock on it and set the NORELOC
         * bit. If it is free then remove it from the free
         * list so that the platform free list code can keep
         * NORELOC pages where they should be.
         */
        /*
         * Before doing anything, get the exclusive lock.
         * This may fail (eg ISM pages are left shared locked).
         * If the page is free this will leave a hole in the
         * cage. There is no solution yet to this.
         */
        if (!page_tryupgrade(pp)) {
            page_unlock(pp);
            return (EAGAIN);
        }
    }

    ASSERT(PAGE_EXCL(pp));

    if (PP_ISFREE(pp)) {
        int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;

        page_list_sub(pp, which);
        ASSERT(pp->p_szc == 0);
        PP_SETNORELOC(pp);
        page_list_add(pp, which | PG_LIST_TAIL);

        page_unlock(pp);
        *nfreedp = 1;
        PLCNT_XFER_NORELOC(pp);
        return (0);
    } else {
        if (pp->p_szc != 0) {
            if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
                page_unlock(pp);
                return (EAGAIN);
            }
            ASSERT(PP_ISNORELOC(pp));
        } else {
            PP_SETNORELOC(pp);
        }
        PLCNT_XFER_NORELOC(pp);
        return (kcage_invalidate_page(pp, nfreedp));
    }
    /*NOTREACHED*/
}

static int
kcage_expand()
{
    int did_something = 0;

    spgcnt_t wanted;
    pfn_t pfn;
    page_t *pp;
    /* TODO: we don't really need n any more? */
    pgcnt_t n;
    pgcnt_t nf, nfreed;

    /*
     * Expand the cage if available cage memory is really low. Calculate
     * the amount required to return kcage_freemem to the level of
     * kcage_lotsfree, or to satisfy throttled requests, whichever is
     * more.  It is rare for their sum to create an artificial threshold
     * above kcage_lotsfree, but it is possible.
     *
     * Exit early if expansion amount is equal to or less than zero.
     * (<0 is possible if kcage_freemem rises suddenly.)
     *
     * Exit early when the global page pool (apparently) does not
     * have enough free pages to page_relocate() even a single page.
     */
    wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
        - kcage_freemem;
    if (wanted <= 0)
        return (0);
    else if (freemem < pageout_reserve + 1) {
        KCAGE_STAT_INCR(ke_lowfreemem);
        return (0);
    }

    /*
     * Try to get the range list lock. If the lock is already
     * held, then don't get stuck here waiting for it.
     */
    if (!kcage_range_trylock())
        return (0);

    KCAGE_STAT_INCR(ke_calls);
    KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);

    /*
     * Assimilate more pages from the global page pool into the cage.
     */
    n = 0;				/* number of pages PP_SETNORELOC'd */
    nf = 0;				/* number of those actually free */
    while (kcage_on && nf < wanted) {
        pfn = kcage_get_pfn();
        if (pfn == PFN_INVALID) {	/* eek! nowhere to grow */
            KCAGE_STAT_INCR(ke_nopfn);
            goto terminate;
        }

        KCAGE_STAT_INCR_SCAN(ke_examined);

        if ((pp = page_numtopp_nolock(pfn)) == NULL) {
            KCAGE_STAT_INCR(ke_nopaget);
            continue;
        }
        KCAGEPAGETS_INC();
        /*
         * Sanity check. Skip this pfn if it is
         * being deleted.
         */
        if (pfn_is_being_deleted(pfn)) {
            KCAGE_STAT_INCR(ke_deleting);
            continue;
        }

        /*
         * NORELOC is only set at boot-time or by this routine
         * under the kcage_range_mutex lock which is currently
         * held. This means we can do a fast check here before
         * locking the page in kcage_assimilate_page.
         */
        if (PP_ISNORELOC(pp)) {
            KCAGE_STAT_INCR(ke_isnoreloc);
            continue;
        }

        switch (kcage_assimilate_page(pp, &nfreed)) {
            case 0:		/* assimilated, page is free */
                KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
                did_something = 1;
                nf += nfreed;
                n++;
                break;

            case EBUSY:	/* assimilated, page not free */
            case ERANGE:	/* assimilated, page not root */
                KCAGE_STAT_INCR_SCAN(ke_gotone);
                did_something = 1;
                n++;
                break;

            case ENOMEM:	/* assimilated, but no mem */
                KCAGE_STAT_INCR(ke_terminate);
                did_something = 1;
                n++;
                goto terminate;

            case EAGAIN:	/* can't assimilate */
                KCAGE_STAT_INCR_SCAN(ke_lefthole);
                break;

            default:	/* catch this with debug kernels */
                ASSERT(0);
                break;
        }
    }

    /*
     * Realign cage edge with the nearest physical address
     * boundary for big pages. This is done to give us a
     * better chance of actually getting usable big pages
     * in the cage.
     */

terminate:
    kcage_range_unlock();

    return (did_something);
}

/*
 * Relocate page opp (Original Page Pointer) from cage pool to page rpp
 * (Replacement Page Pointer) in the global pool. Page opp will be freed
 * if relocation is successful, otherwise it is only unlocked.
 * On entry, page opp must be exclusively locked and not free.
 * *nfreedp: number of pages freed.
 */
static int
kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
{
    page_t *opp = pp;
    page_t *rpp = NULL;
    spgcnt_t npgs;
    int result;

    ASSERT(!PP_ISFREE(opp));
    ASSERT(PAGE_EXCL(opp));

    result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
    *nfreedp = npgs;
    if (result == 0) {
        while (npgs-- > 0) {
            page_t *tpp;

            ASSERT(rpp != NULL);
            tpp = rpp;
            page_sub(&rpp, tpp);
            page_unlock(tpp);
        }

        ASSERT(rpp == NULL);

        return (0);		/* success */
    }

    page_unlock(opp);
    return (result);
}

/*
 * Based on page_invalidate_pages()
 *
 * Kcage_invalidate_page() uses page_relocate() twice. Both instances
 * of use must be updated to match the new page_relocate() when it
 * becomes available.
 *
 * Return result of kcage_relocate_page or zero if page was directly freed.
 * *nfreedp: number of pages freed.
 */
static int
kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
{
    int result;

#if defined(__sparc)
    extern struct vnode prom_ppages;
    ASSERT(pp->p_vnode != &prom_ppages);
#endif /* __sparc */

    ASSERT(!PP_ISFREE(pp));
    ASSERT(PAGE_EXCL(pp));

    /*
     * Is this page involved in some I/O? shared?
     * The page_struct_lock need not be acquired to
     * examine these fields since the page has an
     * "exclusive" lock.
     */
    if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
        result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
        if (result == 0)
            KCAGE_STAT_INCR_SCAN(kip_reloclocked);
        else if (result == ENOMEM)
            KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
        return (result);
    }

    ASSERT(pp->p_vnode->v_type != VCHR);

    /*
     * Unload the mappings and check if mod bit is set.
     */
    (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);

    if (hat_ismod(pp)) {
        result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
        if (result == 0)
            KCAGE_STAT_INCR_SCAN(kip_relocmod);
        else if (result == ENOMEM)
            KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
        return (result);
    }

    if (!page_try_demote_pages(pp)) {
        KCAGE_STAT_INCR_SCAN(kip_demotefailed);
        page_unlock(pp);
        return (EAGAIN);
    }

    page_destroy(pp, 0);
    KCAGE_STAT_INCR_SCAN(kip_destroy);
    *nfreedp = 1;
    return (0);
}

static void
kcage_cageout()
{
    pfn_t pfn;
    page_t *pp;
    callb_cpr_t cprinfo;
    int did_something;
    int scan_again;
    pfn_t start_pfn;
    int pass;
    int last_pass;
    int pages_skipped;
    int shared_skipped;
    uint_t shared_level = 8;
    pgcnt_t nfreed;
#ifdef KCAGE_STATS
    clock_t scan_start;
#endif

    CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
        callb_generic_cpr, "cageout");

    mutex_enter(&kcage_cageout_mutex);
    kcage_cageout_thread = curthread;

    pfn = PFN_INVALID;		/* force scan reset */
    start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
    kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */

loop:
    /*
     * Wait here. Sooner or later, kcage_freemem_sub() will notice
     * that kcage_freemem is less than kcage_desfree. When it does
     * notice, kcage_freemem_sub() will wake us up via call to
     * kcage_cageout_wakeup().
     */
    CALLB_CPR_SAFE_BEGIN(&cprinfo);
    cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
    CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);

    KCAGE_STAT_INCR(kt_wakeups);
    KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
    KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
    pass = 0;
    last_pass = 0;

#ifdef KCAGE_STATS
    scan_start = lbolt;
#endif

again:
    if (!kcage_on)
        goto loop;

    KCAGE_STAT_INCR(kt_scans);
    KCAGE_STAT_INCR_SCAN(kt_passes);

    did_something = 0;
    pages_skipped = 0;
    shared_skipped = 0;
    while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
        (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {

        if (start_pfn == PFN_INVALID)
            start_pfn = pfn;
        else if (start_pfn == pfn) {
            last_pass = pass;
            pass += 1;
            /*
             * Did a complete walk of kernel cage, but didn't free
             * any pages. If only one cpu is online then
             * stop kernel cage walk and try expanding.
             */
            if (ncpus_online == 1 && did_something == 0) {
                KCAGE_STAT_INCR(kt_cageout_break);
                break;
            }
        }

        pp = page_numtopp_nolock(pfn);
        if (pp == NULL) {
            continue;
        }

        KCAGE_STAT_INCR_SCAN(kt_examined);

        /*
         * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
         * of the lock. If one is missed it will be seen next
         * time through.
         *
         * Skip non-caged pages. These pages can exist in the cage
         * because, if a long-term locked page is encountered during
         * cage expansion, the lock prevents the expansion logic from
         * setting the P_NORELOC flag. Hence, non-caged pages can end
         * up surrounded by caged pages.
         */
        if (!PP_ISNORELOC(pp)) {
            switch (kcage_assimilate_page(pp, &nfreed)) {
                case 0:
                    did_something = 1;
                    KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
                        nfreed);
                    break;

                case EBUSY:
                case ERANGE:
                    did_something = 1;
                    KCAGE_STAT_INCR_SCAN(kt_gotone);
                    break;

                case EAGAIN:
                case ENOMEM:
                    break;

                default:
                    /* catch this with debug kernels */
                    ASSERT(0);
                    break;
            }

            continue;
        } else {
            int prm;

            if (PP_ISFREE(pp)) {
                continue;
            }

            if ((pp->p_vnode == &kvp && pp->p_lckcnt > 0) ||
                !page_trylock(pp, SE_EXCL)) {
                KCAGE_STAT_INCR_SCAN(kt_cantlock);
                continue;
            }

            /* P_NORELOC bit should not have gone away. */
            ASSERT(PP_ISNORELOC(pp));
            if (PP_ISFREE(pp) || (pp->p_vnode == &kvp &&
                pp->p_lckcnt > 0)) {
                page_unlock(pp);
                continue;
            }

            KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
            if (hat_page_getshare(pp) > shared_level) {
                page_unlock(pp);
                pages_skipped = 1;
                shared_skipped = 1;
                KCAGE_STAT_INCR_SCAN(kt_skipshared);
                continue;
            }

            /*
             * In pass {0, 1}, skip page if ref bit is set.
             * In pass {0, 1, 2}, skip page if mod bit is set.
             */
            prm = hat_pagesync(pp,
                HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);

            /* On first pass ignore ref'd pages */
            if (pass <= 1 && (prm & P_REF)) {
                KCAGE_STAT_INCR_SCAN(kt_skiprefd);
                pages_skipped = 1;
                page_unlock(pp);
                continue;
            }

            /* On pass 2, page_destroy if mod bit is not set */
            if (pass <= 2) {
                if (pp->p_szc != 0 || (prm & P_MOD) ||
                    pp->p_lckcnt || pp->p_cowcnt) {
                    pages_skipped = 1;
                    page_unlock(pp);
                } else {

                    /*
                     * unload the mappings before
                     * checking if mod bit is set
                     */
                    (void) hat_pageunload(pp,
                        HAT_FORCE_PGUNLOAD);

                    /*
                     * skip this page if modified
                     */
                    if (hat_ismod(pp)) {
                        pages_skipped = 1;
                        page_unlock(pp);
                        continue;
                    }

                    KCAGE_STAT_INCR_SCAN(kt_destroy);
                    page_destroy(pp, 0);
                    did_something = 1;
                }
                continue;
            }

            if (kcage_invalidate_page(pp, &nfreed) == 0) {
                did_something = 1;
                KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
            }

            /*
             * No need to drop the page lock here.
             * Kcage_invalidate_page has done that for us
             * either explicitly or through a page_free.
             */
        }
    }

    /*
     * Expand the cage only if available cage memory is really low.
     * This test is done only after a complete scan of the cage.
     * The reason for not checking and expanding more often is to
     * avoid rapid expansion of the cage. Naturally, scanning the
     * cage takes time. So by scanning first, we use that work as a
     * delay loop in between expand decisions.
     */

    scan_again = 0;
    if (kcage_freemem < kcage_minfree || kcage_needfree) {
        /*
         * Kcage_expand() will return a non-zero value if it was
         * able to expand the cage -- whether or not the new
         * pages are free and immediately usable. If non-zero,
         * we do another scan of the cage. The pages might be
         * freed during that scan or by time we get back here.
         * If not, we will attempt another expansion.
         * However, if kcage_expand() returns zero, then it was
         * unable to expand the cage. This is the case when the
         * growth list is exhausted, therefore no work was done
         * and there is no reason to scan the cage again.
         * Note: Kernel cage scan is not repeated on single-cpu
         * system to avoid kernel cage thread hogging cpu.
         */
        if (pass <= 3 && pages_skipped && ncpus_online > 1)
            scan_again = 1;
        else
            (void) kcage_expand();	/* don't scan again */
    } else if (kcage_freemem < kcage_lotsfree) {
        /*
         * If available cage memory is less than abundant
         * and a full scan of the cage has not yet been completed,
         * or a scan has completed and some work was performed,
         * or pages were skipped because of sharing,
         * or we simply have not yet completed two passes,
         * then do another scan.
         */
        if (pass <= 2 && pages_skipped)
            scan_again = 1;
        if (pass == last_pass || did_something)
            scan_again = 1;
        else if (shared_skipped && shared_level < (8<<24)) {
            shared_level <<= 1;
            scan_again = 1;
        }
    }

    if (scan_again && ncpus_online > 1)
        goto again;
    else {
        if (shared_level > 8)
            shared_level >>= 1;

        KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
        KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
        KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start);
        KCAGE_STAT_INC_SCAN_INDEX;
        goto loop;
    }

    /*NOTREACHED*/
}

void
kcage_cageout_wakeup()
{
    if (mutex_tryenter(&kcage_cageout_mutex)) {
        if (kcage_cageout_ready) {
            cv_signal(&kcage_cageout_cv);
        } else if (kcage_freemem < kcage_minfree || kcage_needfree) {
            /*
             * Available cage memory is really low. Time to
             * start expanding the cage. However, the
             * kernel cage thread is not yet ready to
             * do the work. Use *this* thread, which is
             * most likely to be t0, to do the work.
             */
            KCAGE_STAT_INCR(kcw_expandearly);
            (void) kcage_expand();
            KCAGE_STAT_INC_SCAN_INDEX;
        }

        mutex_exit(&kcage_cageout_mutex);
    }
    /* else, kernel cage thread is already running */
}

void
kcage_tick()
{
    /*
     * Once per second we wake up all the threads throttled
     * waiting for cage memory, in case we've become stuck
     * and haven't made forward progress expanding the cage.
     */
    if (kcage_on && kcage_cageout_ready)
        cv_broadcast(&kcage_throttle_cv);
}