/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/callb.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/systm.h>		/* for bzero */
#include <sys/memlist.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>	/* for NOMEMWAIT() */
#include <sys/atomic.h>		/* used to update kcage_freemem */
#include <sys/kmem.h>		/* for kmem_reap */
#include <sys/errno.h>
#include <sys/mem_cage.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/hat.h>
#include <vm/vm_dep.h>
#include <sys/mem_config.h>
#include <sys/lgrp.h>
#include <sys/rwlock.h>

extern pri_t maxclsyspri;

/* Cage statistics are compiled in only for DEBUG kernels. */
#ifdef DEBUG
#define	KCAGE_STATS
#endif

#ifdef KCAGE_STATS

#define	KCAGE_STATS_VERSION 9	/* can help report generators */
#define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */

/*
 * Statistics gathered for a single scan.  One of these occupies each slot
 * of the kcage_stats.scans[] array; the slot in use is selected by
 * kcage_stats.scan_index, which the KCAGE_STAT_* macros advance modulo
 * KCAGE_STATS_NSCANS.
 */
struct kcage_stats_scan {
	/* managed by KCAGE_STAT_* macros */
	clock_t	scan_lbolt;
	uint_t	scan_id;

	/* set in kcage_cageout() */
	uint_t	kt_passes;
	clock_t	kt_ticks;
	pgcnt_t	kt_kcage_freemem_start;
	pgcnt_t	kt_kcage_freemem_end;
	pgcnt_t	kt_freemem_start;
	pgcnt_t	kt_freemem_end;
	uint_t	kt_examined;
	uint_t	kt_cantlock;
	uint_t	kt_gotone;
	uint_t	kt_gotonefree;
	uint_t	kt_skiplevel;
	uint_t	kt_skipshared;
	uint_t	kt_skiprefd;
	uint_t	kt_destroy;

	/* set in kcage_invalidate_page() */
	uint_t	kip_reloclocked;
	uint_t	kip_relocmod;
	uint_t	kip_destroy;
	uint_t	kip_nomem;
	uint_t	kip_demotefailed;

	/* set in kcage_expand() */
	uint_t	ke_wanted;
	uint_t	ke_examined;
	uint_t	ke_lefthole;
	uint_t	ke_gotone;
	uint_t	ke_gotonefree;
};

/*
 * File-wide cage statistics: event counters grouped by the routine that
 * updates them, followed by the array of per-scan records.
 */
struct kcage_stats {
	/* managed by KCAGE_STAT_* macros */
	uint_t	version;
	uint_t	size;

	/* set in kcage_cageout */
	uint_t	kt_wakeups;
	uint_t	kt_scans;
	uint_t	kt_cageout_break;

	/* set in kcage_expand */
	uint_t	ke_calls;
	uint_t	ke_nopfn;
	uint_t	ke_nopaget;
	uint_t	ke_isnoreloc;
	uint_t	ke_deleting;
	uint_t	ke_lowfreemem;
	uint_t	ke_terminate;

	/* set in kcage_freemem_add() */
	uint_t	kfa_trottlewake;

	/* set in kcage_freemem_sub() */
	uint_t	kfs_cagewake;

	/* set in kcage_create_throttle */
	uint_t	kct_calls;
	uint_t	kct_cageout;
	uint_t	kct_critical;
	uint_t	kct_exempt;
	uint_t	kct_cagewake;
	uint_t	kct_wait;
	uint_t	kct_progress;
	uint_t	kct_noprogress;
	uint_t	kct_timeout;

	/* set in kcage_cageout_wakeup */
	uint_t	kcw_expandearly;

	/* managed by KCAGE_STAT_* macros */
	uint_t	scan_array_size;
	uint_t	scan_index;
	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
};

static struct kcage_stats kcage_stats;
/* Never written; assigned over a scans[] slot to clear it for reuse. */
static struct kcage_stats_scan kcage_stats_scan_zero;

/*
 * No real need for atomics here.
For the most part the incs and sets are 149 * done by the kernel cage thread. There are a few that are done by any 150 * number of other threads. Those cases are noted by comments. 151 */ 152 #define KCAGE_STAT_INCR(m) kcage_stats.m++ 153 154 #define KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v) 155 156 #define KCAGE_STAT_INCR_SCAN(m) \ 157 KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m) 158 159 #define KCAGE_STAT_NINCR_SCAN(m, v) \ 160 KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v) 161 162 #define KCAGE_STAT_SET(m, v) kcage_stats.m = (v) 163 164 #define KCAGE_STAT_SETZ(m, v) \ 165 if (kcage_stats.m == 0) kcage_stats.m = (v) 166 167 #define KCAGE_STAT_SET_SCAN(m, v) \ 168 KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v) 169 170 #define KCAGE_STAT_SETZ_SCAN(m, v) \ 171 KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v) 172 173 #define KCAGE_STAT_INC_SCAN_INDEX \ 174 KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \ 175 KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \ 176 kcage_stats.scan_index = \ 177 (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \ 178 kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero 179 180 #define KCAGE_STAT_INIT_SCAN_INDEX \ 181 kcage_stats.version = KCAGE_STATS_VERSION; \ 182 kcage_stats.size = sizeof (kcage_stats); \ 183 kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \ 184 kcage_stats.scan_index = 0 185 186 #else /* KCAGE_STATS */ 187 188 #define KCAGE_STAT_INCR(v) 189 #define KCAGE_STAT_NINCR(m, v) 190 #define KCAGE_STAT_INCR_SCAN(v) 191 #define KCAGE_STAT_NINCR_SCAN(m, v) 192 #define KCAGE_STAT_SET(m, v) 193 #define KCAGE_STAT_SETZ(m, v) 194 #define KCAGE_STAT_SET_SCAN(m, v) 195 #define KCAGE_STAT_SETZ_SCAN(m, v) 196 #define KCAGE_STAT_INC_SCAN_INDEX 197 #define KCAGE_STAT_INIT_SCAN_INDEX 198 199 #endif /* KCAGE_STATS */ 200 201 static kmutex_t kcage_throttle_mutex; /* protects kcage_throttle_cv */ 202 static kcondvar_t kcage_throttle_cv; 203 204 static kmutex_t kcage_cageout_mutex; /* protects cv and ready 
flag */ 205 static kcondvar_t kcage_cageout_cv; /* cageout thread naps here */ 206 static int kcage_cageout_ready; /* nonzero when cageout thread ready */ 207 kthread_id_t kcage_cageout_thread; /* to aid debugging */ 208 209 static krwlock_t kcage_range_rwlock; /* protects kcage_glist elements */ 210 211 /* 212 * Cage expansion happens within a range. 213 */ 214 struct kcage_glist { 215 struct kcage_glist *next; 216 pfn_t base; 217 pfn_t lim; 218 pfn_t curr; 219 int decr; 220 }; 221 222 static struct kcage_glist *kcage_glist; 223 static struct kcage_glist *kcage_current_glist; 224 225 /* 226 * The firstfree element is provided so that kmem_alloc can be avoided 227 * until that cage has somewhere to go. This is not currently a problem 228 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va. 229 */ 230 static struct kcage_glist kcage_glist_firstfree; 231 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree; 232 233 /* 234 * Miscellaneous forward references 235 */ 236 static struct kcage_glist *kcage_glist_alloc(void); 237 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **); 238 static void kcage_cageout(void); 239 static int kcage_invalidate_page(page_t *, pgcnt_t *); 240 static int kcage_setnoreloc_pages(page_t *, se_t); 241 242 /* 243 * Kernel Memory Cage counters and thresholds. 
*/
int kcage_on = 0;		/* set to 1 at the end of kcage_init() */
pgcnt_t kcage_freemem;		/* adjusted by kcage_freemem_add()/_sub() */
pgcnt_t kcage_needfree;		/* pages wanted by throttled requests */
/*
 * Thresholds, in pages.  A value of zero is replaced with a computed
 * default by kcage_recalc_thresholds().
 */
pgcnt_t kcage_lotsfree;
pgcnt_t kcage_desfree;
pgcnt_t kcage_minfree;
pgcnt_t kcage_throttlefree;
pgcnt_t kcage_reserve;
int kcage_maxwait = 10;	/* in seconds */

/* when we use lp for kmem we start the cage at a higher initial value */
pgcnt_t kcage_kmemlp_mincage;

/* DEBUG-only count of pages looked up while building the cage. */
#ifdef DEBUG
pgcnt_t kcage_pagets;
#define	KCAGEPAGETS_INC()	kcage_pagets++
#else
#define	KCAGEPAGETS_INC()
#endif

/* kstats to export what pages are currently caged */
kmutex_t kcage_kstat_lock;
static int kcage_kstat_update(kstat_t *ksp, int rw);
static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

/*
 * Startup and Dynamic Reconfiguration interfaces.
 *	kcage_range_lock()
 *	kcage_range_unlock()
 *	kcage_range_islocked()
 *	kcage_range_add()
 *	kcage_range_delete()
 *	kcage_init()
 *	kcage_set_thresholds()
 */

/*
 * Called outside of this file to add/remove from the list,
 * therefore, it takes a writer lock
 */
void
kcage_range_lock(void)
{
	rw_enter(&kcage_range_rwlock, RW_WRITER);
}

/*
 * Release kcage_range_rwlock.
 */
void
kcage_range_unlock(void)
{
	rw_exit(&kcage_range_rwlock);
}

/*
 * Return the result of rw_lock_held() on kcage_range_rwlock; used in the
 * ASSERTs of the routines that require the range lock.
 */
int
kcage_range_islocked(void)
{
	return (rw_lock_held(&kcage_range_rwlock));
}

/*
 * Called from page_get_contig_pages to get the approximate kcage pfn range
 * for exclusion from search for contiguous pages. This routine is called
 * without kcage_range lock (kcage routines can call page_get_contig_pages
 * through page_relocate) and with the assumption, based on kcage_range_add,
 * that kcage_current_glist always contains a valid pointer.
309 */ 310 311 int 312 kcage_current_pfn(pfn_t *pfncur) 313 { 314 struct kcage_glist *lp = kcage_current_glist; 315 316 ASSERT(kcage_on); 317 318 ASSERT(lp != NULL); 319 320 *pfncur = lp->curr; 321 322 return (lp->decr); 323 } 324 325 /* 326 * Called from vm_pagelist.c during coalesce to find kernel cage regions 327 * within an mnode. Looks for the lowest range between lo and hi. 328 * 329 * Kernel cage memory is defined between kcage_glist and kcage_current_glist. 330 * Non-cage memory is defined between kcage_current_glist and list end. 331 * 332 * If incage is set, returns the lowest kcage range. Otherwise returns lowest 333 * non-cage range. 334 * 335 * Returns zero on success and nlo, nhi: 336 * lo <= nlo < nhi <= hi 337 * Returns non-zero if no overlapping range is found. 338 */ 339 int 340 kcage_next_range(int incage, pfn_t lo, pfn_t hi, 341 pfn_t *nlo, pfn_t *nhi) 342 { 343 struct kcage_glist *lp; 344 pfn_t tlo = hi; 345 pfn_t thi = hi; 346 347 ASSERT(lo <= hi); 348 349 /* 350 * Reader lock protects the list, but kcage_get_pfn 351 * running concurrently may advance kcage_current_glist 352 * and also update kcage_current_glist->curr. Page 353 * coalesce can handle this race condition. 354 */ 355 rw_enter(&kcage_range_rwlock, RW_READER); 356 357 for (lp = incage ? 
kcage_glist : kcage_current_glist; 358 lp != NULL; lp = lp->next) { 359 360 pfn_t klo, khi; 361 362 /* find the range limits in this element */ 363 if ((incage && lp->decr) || (!incage && !lp->decr)) { 364 klo = lp->curr; 365 khi = lp->lim; 366 } else { 367 klo = lp->base; 368 khi = lp->curr; 369 } 370 371 /* handle overlap */ 372 if (klo < tlo && klo < khi && lo < khi && klo < hi) { 373 tlo = MAX(lo, klo); 374 thi = MIN(hi, khi); 375 if (tlo == lo) 376 break; 377 } 378 379 /* check end of kcage */ 380 if (incage && lp == kcage_current_glist) { 381 break; 382 } 383 } 384 385 rw_exit(&kcage_range_rwlock); 386 387 /* return non-zero if no overlapping range found */ 388 if (tlo == thi) 389 return (1); 390 391 ASSERT(lo <= tlo && tlo < thi && thi <= hi); 392 393 /* return overlapping range */ 394 *nlo = tlo; 395 *nhi = thi; 396 return (0); 397 } 398 399 int 400 kcage_range_init(struct memlist *ml, int decr) 401 { 402 int ret = 0; 403 404 ASSERT(kcage_range_islocked()); 405 406 if (decr) { 407 while (ml->next != NULL) 408 ml = ml->next; 409 } 410 411 while (ml != NULL) { 412 ret = kcage_range_add(btop(ml->address), btop(ml->size), decr); 413 if (ret) 414 break; 415 416 ml = (decr ? ml->prev : ml->next); 417 } 418 419 return (ret); 420 } 421 422 /* 423 * Third arg controls direction of growth: 0: increasing pfns, 424 * 1: decreasing. 425 * Calls to add and delete must be protected by calls to 426 * kcage_range_lock() and kcage_range_unlock(). 
427 */ 428 int 429 kcage_range_add(pfn_t base, pgcnt_t npgs, int decr) 430 { 431 struct kcage_glist *new, **lpp; 432 pfn_t lim; 433 434 ASSERT(kcage_range_islocked()); 435 436 ASSERT(npgs != 0); 437 if (npgs == 0) 438 return (EINVAL); 439 440 lim = base + npgs; 441 442 ASSERT(lim > base); 443 if (lim <= base) 444 return (EINVAL); 445 446 new = kcage_glist_alloc(); 447 if (new == NULL) { 448 return (ENOMEM); 449 } 450 451 new->base = base; 452 new->lim = lim; 453 new->decr = decr; 454 if (new->decr != 0) 455 new->curr = new->lim; 456 else 457 new->curr = new->base; 458 /* 459 * Any overlapping existing ranges are removed by deleting 460 * from the new list as we search for the tail. 461 */ 462 lpp = &kcage_glist; 463 while (*lpp != NULL) { 464 int ret; 465 ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new); 466 if (ret != 0) 467 return (ret); 468 lpp = &(*lpp)->next; 469 } 470 471 *lpp = new; 472 473 if (kcage_current_glist == NULL) { 474 kcage_current_glist = kcage_glist; 475 } 476 477 return (0); 478 } 479 480 /* 481 * Calls to add and delete must be protected by calls to 482 * kcage_range_lock() and kcage_range_unlock(). 483 */ 484 int 485 kcage_range_delete(pfn_t base, pgcnt_t npgs) 486 { 487 struct kcage_glist *lp; 488 pfn_t lim; 489 490 ASSERT(kcage_range_islocked()); 491 492 ASSERT(npgs != 0); 493 if (npgs == 0) 494 return (EINVAL); 495 496 lim = base + npgs; 497 498 ASSERT(lim > base); 499 if (lim <= base) 500 return (EINVAL); 501 502 /* 503 * Check if the delete is OK first as a number of elements 504 * might be involved and it will be difficult to go 505 * back and undo (can't just add the range back in). 506 */ 507 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 508 /* 509 * If there have been no pages allocated from this 510 * element, we don't need to check it. 511 */ 512 if ((lp->decr == 0 && lp->curr == lp->base) || 513 (lp->decr != 0 && lp->curr == lp->lim)) 514 continue; 515 /* 516 * If the element does not overlap, its OK. 
517 */ 518 if (base >= lp->lim || lim <= lp->base) 519 continue; 520 /* 521 * Overlapping element: Does the range to be deleted 522 * overlap the area already used? If so fail. 523 */ 524 if (lp->decr == 0 && base < lp->curr && lim >= lp->base) { 525 return (EBUSY); 526 } 527 if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) { 528 return (EBUSY); 529 } 530 } 531 return (kcage_glist_delete(base, lim, &kcage_glist)); 532 } 533 534 /* 535 * Calls to add and delete must be protected by calls to 536 * kcage_range_lock() and kcage_range_unlock(). 537 * This routine gets called after successful Solaris memory 538 * delete operation from DR post memory delete routines. 539 */ 540 int 541 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs) 542 { 543 pfn_t lim; 544 545 ASSERT(kcage_range_islocked()); 546 547 ASSERT(npgs != 0); 548 if (npgs == 0) 549 return (EINVAL); 550 551 lim = base + npgs; 552 553 ASSERT(lim > base); 554 if (lim <= base) 555 return (EINVAL); 556 557 return (kcage_glist_delete(base, lim, &kcage_glist)); 558 } 559 560 /* 561 * No locking is required here as the whole operation is covered 562 * by the kcage_range_lock(). 563 */ 564 static struct kcage_glist * 565 kcage_glist_alloc(void) 566 { 567 struct kcage_glist *new; 568 569 if ((new = kcage_glist_freelist) != NULL) { 570 kcage_glist_freelist = new->next; 571 bzero(new, sizeof (*new)); 572 } else { 573 new = kmem_zalloc(sizeof (struct kcage_glist), KM_NOSLEEP); 574 } 575 return (new); 576 } 577 578 static void 579 kcage_glist_free(struct kcage_glist *lp) 580 { 581 lp->next = kcage_glist_freelist; 582 kcage_glist_freelist = lp; 583 } 584 585 static int 586 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp) 587 { 588 struct kcage_glist *lp, *prev = *lpp; 589 590 while ((lp = *lpp) != NULL) { 591 if (lim > lp->base && base < lp->lim) { 592 /* The delete range overlaps this element. */ 593 if (base <= lp->base && lim >= lp->lim) { 594 /* Delete whole element. 
*/ 595 *lpp = lp->next; 596 if (lp == kcage_current_glist) { 597 /* This can never happen. */ 598 ASSERT(kcage_current_glist != prev); 599 kcage_current_glist = prev; 600 } 601 kcage_glist_free(lp); 602 continue; 603 } 604 605 /* Partial delete. */ 606 if (base > lp->base && lim < lp->lim) { 607 struct kcage_glist *new; 608 609 /* 610 * Remove a section from the middle, 611 * need to allocate a new element. 612 */ 613 new = kcage_glist_alloc(); 614 if (new == NULL) { 615 return (ENOMEM); 616 } 617 618 /* 619 * Tranfser unused range to new. 620 * Edit lp in place to preserve 621 * kcage_current_glist. 622 */ 623 new->decr = lp->decr; 624 if (new->decr != 0) { 625 new->base = lp->base; 626 new->lim = base; 627 new->curr = base; 628 629 lp->base = lim; 630 } else { 631 new->base = lim; 632 new->lim = lp->lim; 633 new->curr = new->base; 634 635 lp->lim = base; 636 } 637 638 /* Insert new. */ 639 new->next = lp->next; 640 lp->next = new; 641 lpp = &lp->next; 642 } else { 643 /* Delete part of current block. */ 644 if (base > lp->base) { 645 ASSERT(lim >= lp->lim); 646 ASSERT(base < lp->lim); 647 if (lp->decr != 0 && 648 lp->curr == lp->lim) 649 lp->curr = base; 650 lp->lim = base; 651 } else { 652 ASSERT(base <= lp->base); 653 ASSERT(lim > lp->base); 654 if (lp->decr == 0 && 655 lp->curr == lp->base) 656 lp->curr = lim; 657 lp->base = lim; 658 } 659 } 660 } 661 prev = *lpp; 662 lpp = &(*lpp)->next; 663 } 664 665 return (0); 666 } 667 668 /* 669 * The caller of kcage_get_pfn must hold the kcage_range_lock to make 670 * sure that there are no concurrent calls. The same lock 671 * must be obtained for range add and delete by calling 672 * kcage_range_lock() and kcage_range_unlock(). 
673 */ 674 static pfn_t 675 kcage_get_pfn(void) 676 { 677 struct kcage_glist *lp; 678 pfn_t pfn; 679 680 ASSERT(kcage_range_islocked()); 681 682 lp = kcage_current_glist; 683 while (lp != NULL) { 684 if (lp->decr != 0) { 685 if (lp->curr != lp->base) { 686 pfn = --lp->curr; 687 return (pfn); 688 } 689 } else { 690 if (lp->curr != lp->lim) { 691 pfn = lp->curr++; 692 return (pfn); 693 } 694 } 695 696 lp = lp->next; 697 if (lp) 698 kcage_current_glist = lp; 699 } 700 701 return (PFN_INVALID); 702 } 703 704 /* 705 * Walk the physical address space of the cage. 706 * This routine does not guarantee to return PFNs in the order 707 * in which they were allocated to the cage. Instead, it walks 708 * each range as they appear on the growth list returning the PFNs 709 * range in ascending order. 710 * 711 * To begin scanning at lower edge of cage, reset should be nonzero. 712 * To step through cage, reset should be zero. 713 * 714 * PFN_INVALID will be returned when the upper end of the cage is 715 * reached -- indicating a full scan of the cage has been completed since 716 * previous reset. PFN_INVALID will continue to be returned until 717 * kcage_walk_cage is reset. 718 * 719 * It is possible to receive a PFN_INVALID result on reset if a growth 720 * list is not installed or if none of the PFNs in the installed list have 721 * been allocated to the cage. In otherwords, there is no cage. 722 * 723 * Caller need not hold kcage_range_lock while calling this function 724 * as the front part of the list is static - pages never come out of 725 * the cage. 726 * 727 * The caller is expected to only be kcage_cageout(). 
*/
static pfn_t
kcage_walk_cage(int reset)
{
	/* Walk position, kept across calls. */
	static struct kcage_glist *lp = NULL;
	static pfn_t pfn;

	if (reset)
		lp = NULL;
	if (lp == NULL) {
		/*
		 * Start (or restart) at the head of the list.  This is also
		 * reached on the call after a walk has run off the end,
		 * because lp is left NULL at that point.
		 */
		lp = kcage_glist;
		pfn = PFN_INVALID;
	}
again:
	if (pfn == PFN_INVALID) {
		/* Entering a new element (or there is none left). */
		if (lp == NULL)
			return (PFN_INVALID);

		if (lp->decr != 0) {
			/*
			 * In this range the cage grows from the highest
			 * address towards the lowest.
			 * Arrange to return pfns from curr to lim-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->curr;
		} else {
			/*
			 * In this range the cage grows from the lowest
			 * address towards the highest.
			 * Arrange to return pfns from base to curr-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->base;
		}
	}

	if (lp->decr != 0) {		/* decrementing pfn */
		if (pfn == lp->lim) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->curr && pfn < lp->lim);
	} else {			/* incrementing pfn */
		if (pfn == lp->curr) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->base && pfn < lp->curr);
	}

	/* Return this pfn and remember the next one for the following call. */
	return (pfn++);
}

/*
 * Callback functions to recalculate cage thresholds after
 * Kphysm memory add/delete operations.
 */
/*ARGSUSED*/
static void
kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
{
	kcage_recalc_thresholds();
}

/*
 * Pre-delete callback: always returns 0.
 */
/*ARGSUSED*/
static int
kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
{
	/* TODO: when should cage refuse memory delete requests? */
	return (0);
}

/*
 * Post-delete callback: recalculates the thresholds whether or not the
 * delete was cancelled.
 */
/*ARGSUSED*/
static void
kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
{
	kcage_recalc_thresholds();
}

/* Callback vector handed to kphysm_setup_func_register() in kcage_init(). */
static kphysm_setup_vector_t kcage_kphysm_vectors = {
	KPHYSM_SETUP_VECTOR_VERSION,
	kcage_kphysm_postadd_cb,
	kcage_kphysm_predel_cb,
	kcage_kphysm_postdel_cb
};

/*
 * This is called before a CPR suspend and after a CPR resume.  We have to
 * turn off kcage_cageout_ready before a suspend, and turn it back on after a
 * restart.
 *
 * Returns B_TRUE for the checkpoint and resume codes and B_FALSE for any
 * other code.
 */
/*ARGSUSED*/
static boolean_t
kcage_cageout_cpr(void *arg, int code)
{
	if (code == CB_CODE_CPR_CHKPT) {
		ASSERT(kcage_cageout_ready);
		kcage_cageout_ready = 0;
		return (B_TRUE);
	} else if (code == CB_CODE_CPR_RESUME) {
		ASSERT(kcage_cageout_ready == 0);
		kcage_cageout_ready = 1;
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * kcage_recalc_preferred_size() increases initial cage size to improve large
 * page availability when lp for kmem is enabled and kpr is disabled
 *
 * When kcage_kmemlp_mincage has not been set (is zero) the minimum is
 * one eighth of physical memory in bytes, rounded up to
 * segkmem_heaplp_quantum and capped at 0x40000000 bytes, converted to
 * pages.  The minimum is then limited to segkmem_kmemlp_max (converted to
 * pages) and stored back in kcage_kmemlp_mincage.  Returns the larger of
 * that minimum and preferred_size, or preferred_size unchanged when the
 * condition above does not hold.
 */
static pgcnt_t
kcage_recalc_preferred_size(pgcnt_t preferred_size)
{
	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
		pgcnt_t lpmincage = kcage_kmemlp_mincage;
		if (lpmincage == 0) {
			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
		}
		kcage_kmemlp_mincage = MIN(lpmincage,
		    (segkmem_kmemlp_max / PAGESIZE));
		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
	}
	return (preferred_size);
}

/*
 * Kcage_init() builds the cage and initializes the cage thresholds.
 * The size of the cage is determined by the argument preferred_size,
 * or the actual amount of memory, whichever is smaller.
874 */ 875 void 876 kcage_init(pgcnt_t preferred_size) 877 { 878 pgcnt_t wanted; 879 pfn_t pfn; 880 page_t *pp; 881 kstat_t *ksp; 882 883 extern struct vnode kvp; 884 extern void page_list_noreloc_startup(page_t *); 885 886 ASSERT(!kcage_on); 887 ASSERT(kcage_range_islocked()); 888 889 /* increase preferred cage size for lp for kmem */ 890 preferred_size = kcage_recalc_preferred_size(preferred_size); 891 892 /* Debug note: initialize this now so early expansions can stat */ 893 KCAGE_STAT_INIT_SCAN_INDEX; 894 895 /* 896 * Initialize cage thresholds and install kphysm callback. 897 * If we can't arrange to have the thresholds track with 898 * available physical memory, then the cage thresholds may 899 * end up over time at levels that adversly effect system 900 * performance; so, bail out. 901 */ 902 kcage_recalc_thresholds(); 903 if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) { 904 ASSERT(0); /* Catch this in DEBUG kernels. */ 905 return; 906 } 907 908 /* 909 * Limit startup cage size within the range of kcage_minfree 910 * and availrmem, inclusively. 911 */ 912 wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem); 913 914 /* 915 * Construct the cage. PFNs are allocated from the glist. It 916 * is assumed that the list has been properly ordered for the 917 * platform by the platform code. Typically, this is as simple 918 * as calling kcage_range_init(phys_avail, decr), where decr is 919 * 1 if the kernel has been loaded into upper end of physical 920 * memory, or 0 if the kernel has been loaded at the low end. 921 * 922 * Note: it is assumed that we are in the startup flow, so there 923 * is no reason to grab the page lock. 924 */ 925 kcage_freemem = 0; 926 pfn = PFN_INVALID; /* prime for alignment test */ 927 while (wanted != 0) { 928 if ((pfn = kcage_get_pfn()) == PFN_INVALID) 929 break; 930 931 if ((pp = page_numtopp_nolock(pfn)) != NULL) { 932 KCAGEPAGETS_INC(); 933 /* 934 * Set the noreloc state on the page. 
935 * If the page is free and not already 936 * on the noreloc list then move it. 937 */ 938 if (PP_ISFREE(pp)) { 939 if (PP_ISNORELOC(pp) == 0) 940 page_list_noreloc_startup(pp); 941 } else { 942 ASSERT(pp->p_szc == 0); 943 PP_SETNORELOC(pp); 944 } 945 } 946 PLCNT_XFER_NORELOC(pp); 947 wanted -= 1; 948 } 949 950 /* 951 * Need to go through and find kernel allocated pages 952 * and capture them into the Cage. These will primarily 953 * be pages gotten through boot_alloc(). 954 */ 955 if (kvp.v_pages) { 956 957 pp = kvp.v_pages; 958 do { 959 ASSERT(!PP_ISFREE(pp)); 960 ASSERT(pp->p_szc == 0); 961 PP_SETNORELOC(pp); 962 } while ((pp = pp->p_vpnext) != kvp.v_pages); 963 964 } 965 966 kcage_on = 1; 967 968 /* 969 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend() 970 * after the cageout thread is blocked, and executes from cpr_resume() 971 * before the cageout thread is restarted. By executing in this class, 972 * we are assured that the kernel cage thread won't miss wakeup calls 973 * and also CPR's larger kmem_alloc requests will not fail after 974 * CPR shuts down the cageout kernel thread. 975 */ 976 (void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL, 977 "cageout"); 978 979 /* 980 * Coalesce pages to improve large page availability. 
A better fix 981 * would to coalesce pages as they are included in the cage 982 */ 983 if (SEGKMEM_USE_LARGEPAGES) { 984 extern void page_freelist_coalesce_all(int mnode); 985 extern int max_mem_nodes; 986 int mnode, max_mnodes = max_mem_nodes; 987 for (mnode = 0; mnode < max_mnodes; mnode++) { 988 page_freelist_coalesce_all(mnode); 989 } 990 } 991 992 ksp = kstat_create("kcage", 0, "kcage_page_list", "misc", 993 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL); 994 if (ksp != NULL) { 995 ksp->ks_update = kcage_kstat_update; 996 ksp->ks_snapshot = kcage_kstat_snapshot; 997 ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */ 998 kstat_install(ksp); 999 } 1000 1001 } 1002 1003 static int 1004 kcage_kstat_update(kstat_t *ksp, int rw) 1005 { 1006 struct kcage_glist *lp; 1007 uint_t count; 1008 1009 if (rw == KSTAT_WRITE) 1010 return (EACCES); 1011 1012 count = 0; 1013 kcage_range_lock(); 1014 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 1015 if (lp->decr) { 1016 if (lp->curr != lp->lim) { 1017 count++; 1018 } 1019 } else { 1020 if (lp->curr != lp->base) { 1021 count++; 1022 } 1023 } 1024 } 1025 kcage_range_unlock(); 1026 1027 ksp->ks_ndata = count; 1028 ksp->ks_data_size = count * 2 * sizeof (uint64_t); 1029 1030 return (0); 1031 } 1032 1033 static int 1034 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw) 1035 { 1036 struct kcage_glist *lp; 1037 struct memunit { 1038 uint64_t address; 1039 uint64_t size; 1040 } *kspmem; 1041 1042 if (rw == KSTAT_WRITE) 1043 return (EACCES); 1044 1045 ksp->ks_snaptime = gethrtime(); 1046 1047 kspmem = (struct memunit *)buf; 1048 kcage_range_lock(); 1049 for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) { 1050 if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) 1051 break; 1052 1053 if (lp->decr) { 1054 if (lp->curr != lp->lim) { 1055 kspmem->address = ptob(lp->curr); 1056 kspmem->size = ptob(lp->lim - lp->curr); 1057 } 1058 } else { 1059 if (lp->curr != lp->base) { 1060 kspmem->address 
= ptob(lp->base); 1061 kspmem->size = ptob(lp->curr - lp->base); 1062 } 1063 } 1064 } 1065 kcage_range_unlock(); 1066 1067 return (0); 1068 } 1069 1070 void 1071 kcage_recalc_thresholds() 1072 { 1073 static int first = 1; 1074 static pgcnt_t init_lotsfree; 1075 static pgcnt_t init_desfree; 1076 static pgcnt_t init_minfree; 1077 static pgcnt_t init_throttlefree; 1078 static pgcnt_t init_reserve; 1079 1080 /* TODO: any reason to take more care than this with live editing? */ 1081 mutex_enter(&kcage_cageout_mutex); 1082 mutex_enter(&freemem_lock); 1083 1084 if (first) { 1085 first = 0; 1086 init_lotsfree = kcage_lotsfree; 1087 init_desfree = kcage_desfree; 1088 init_minfree = kcage_minfree; 1089 init_throttlefree = kcage_throttlefree; 1090 init_reserve = kcage_reserve; 1091 } else { 1092 kcage_lotsfree = init_lotsfree; 1093 kcage_desfree = init_desfree; 1094 kcage_minfree = init_minfree; 1095 kcage_throttlefree = init_throttlefree; 1096 kcage_reserve = init_reserve; 1097 } 1098 1099 if (kcage_lotsfree == 0) 1100 kcage_lotsfree = MAX(32, total_pages / 256); 1101 1102 if (kcage_minfree == 0) 1103 kcage_minfree = MAX(32, kcage_lotsfree / 2); 1104 1105 if (kcage_desfree == 0) 1106 kcage_desfree = MAX(32, kcage_minfree); 1107 1108 if (kcage_throttlefree == 0) 1109 kcage_throttlefree = MAX(32, kcage_minfree / 2); 1110 1111 if (kcage_reserve == 0) 1112 kcage_reserve = MIN(32, kcage_throttlefree / 2); 1113 1114 mutex_exit(&freemem_lock); 1115 mutex_exit(&kcage_cageout_mutex); 1116 1117 if (kcage_cageout_ready) { 1118 if (kcage_freemem < kcage_desfree) 1119 kcage_cageout_wakeup(); 1120 1121 if (kcage_needfree) { 1122 mutex_enter(&kcage_throttle_mutex); 1123 cv_broadcast(&kcage_throttle_cv); 1124 mutex_exit(&kcage_throttle_mutex); 1125 } 1126 } 1127 } 1128 1129 /* 1130 * Pageout interface: 1131 * kcage_cageout_init() 1132 */ 1133 void 1134 kcage_cageout_init() 1135 { 1136 if (kcage_on) { 1137 1138 (void) thread_create(NULL, 0, kcage_cageout, 1139 NULL, 0, proc_pageout, TS_RUN, 
maxclsyspri - 1); 1140 } 1141 } 1142 1143 1144 /* 1145 * VM Interfaces: 1146 * kcage_create_throttle() 1147 * kcage_freemem_add() 1148 * kcage_freemem_sub() 1149 */ 1150 1151 /* 1152 * Wakeup cageout thread and throttle waiting for the number of pages 1153 * requested to become available. For non-critical requests, a 1154 * timeout is added, since freemem accounting is separate from cage 1155 * freemem accounting: it's possible for us to get stuck and not make 1156 * forward progress even though there was sufficient freemem before 1157 * arriving here. 1158 */ 1159 int 1160 kcage_create_throttle(pgcnt_t npages, int flags) 1161 { 1162 int niter = 0; 1163 pgcnt_t lastfree; 1164 int enough = kcage_freemem > kcage_throttlefree + npages; 1165 1166 KCAGE_STAT_INCR(kct_calls); /* unprotected incr. */ 1167 1168 kcage_cageout_wakeup(); /* just to be sure */ 1169 KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. */ 1170 1171 /* 1172 * Obviously, we can't throttle the cageout thread since 1173 * we depend on it. We also can't throttle the panic thread. 1174 */ 1175 if (curthread == kcage_cageout_thread || panicstr) { 1176 KCAGE_STAT_INCR(kct_cageout); /* unprotected incr. */ 1177 return (KCT_CRIT); 1178 } 1179 1180 /* 1181 * Don't throttle threads which are critical for proper 1182 * vm management if we're above kcage_throttlefree or 1183 * if freemem is very low. 1184 */ 1185 if (NOMEMWAIT()) { 1186 if (enough) { 1187 KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */ 1188 return (KCT_CRIT); 1189 } else if (freemem < minfree) { 1190 KCAGE_STAT_INCR(kct_critical); /* unprotected incr. */ 1191 return (KCT_CRIT); 1192 } 1193 } 1194 1195 /* 1196 * Don't throttle real-time threads if kcage_freemem > kcage_reserve. 1197 */ 1198 if (DISP_PRIO(curthread) > maxclsyspri && 1199 kcage_freemem > kcage_reserve) { 1200 KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. 
*/ 1201 return (KCT_CRIT); 1202 } 1203 1204 /* 1205 * Cause all other threads (which are assumed to not be 1206 * critical to cageout) to wait here until their request 1207 * can be satisfied. Be a little paranoid and wake the 1208 * kernel cage on each loop through this logic. 1209 */ 1210 while (kcage_freemem < kcage_throttlefree + npages) { 1211 ASSERT(kcage_on); 1212 1213 lastfree = kcage_freemem; 1214 1215 if (kcage_cageout_ready) { 1216 mutex_enter(&kcage_throttle_mutex); 1217 1218 kcage_needfree += npages; 1219 KCAGE_STAT_INCR(kct_wait); 1220 1221 kcage_cageout_wakeup(); 1222 KCAGE_STAT_INCR(kct_cagewake); 1223 1224 cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex); 1225 1226 kcage_needfree -= npages; 1227 1228 mutex_exit(&kcage_throttle_mutex); 1229 } else { 1230 /* 1231 * NOTE: atomics are used just in case we enter 1232 * mp operation before the cageout thread is ready. 1233 */ 1234 atomic_add_long(&kcage_needfree, npages); 1235 1236 kcage_cageout_wakeup(); 1237 KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. 
*/ 1238 1239 atomic_add_long(&kcage_needfree, -npages); 1240 } 1241 1242 if ((flags & PG_WAIT) == 0) { 1243 if (kcage_freemem > lastfree) { 1244 KCAGE_STAT_INCR(kct_progress); 1245 niter = 0; 1246 } else { 1247 KCAGE_STAT_INCR(kct_noprogress); 1248 if (++niter >= kcage_maxwait) { 1249 KCAGE_STAT_INCR(kct_timeout); 1250 return (KCT_FAILURE); 1251 } 1252 } 1253 } 1254 } 1255 return (KCT_NONCRIT); 1256 } 1257 1258 void 1259 kcage_freemem_add(pgcnt_t npages) 1260 { 1261 extern void wakeup_pcgs(void); 1262 1263 atomic_add_long(&kcage_freemem, npages); 1264 1265 wakeup_pcgs(); /* wakeup threads in pcgs() */ 1266 1267 if (kcage_needfree != 0 && 1268 kcage_freemem >= (kcage_throttlefree + kcage_needfree)) { 1269 1270 mutex_enter(&kcage_throttle_mutex); 1271 cv_broadcast(&kcage_throttle_cv); 1272 KCAGE_STAT_INCR(kfa_trottlewake); 1273 mutex_exit(&kcage_throttle_mutex); 1274 } 1275 } 1276 1277 void 1278 kcage_freemem_sub(pgcnt_t npages) 1279 { 1280 atomic_add_long(&kcage_freemem, -npages); 1281 1282 if (kcage_freemem < kcage_desfree) { 1283 kcage_cageout_wakeup(); 1284 KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */ 1285 } 1286 } 1287 1288 /* 1289 * return 0 on failure and 1 on success. 1290 */ 1291 static int 1292 kcage_setnoreloc_pages(page_t *rootpp, se_t se) 1293 { 1294 pgcnt_t npgs, i; 1295 page_t *pp; 1296 pfn_t rootpfn = page_pptonum(rootpp); 1297 uint_t szc; 1298 1299 ASSERT(!PP_ISFREE(rootpp)); 1300 ASSERT(PAGE_LOCKED_SE(rootpp, se)); 1301 if (!group_page_trylock(rootpp, se)) { 1302 return (0); 1303 } 1304 szc = rootpp->p_szc; 1305 if (szc == 0) { 1306 /* 1307 * The szc of a locked page can only change for pages that are 1308 * non-swapfs (i.e. anonymous memory) file system pages. 
1309 */ 1310 ASSERT(rootpp->p_vnode != NULL && 1311 !PP_ISKAS(rootpp) && 1312 !IS_SWAPFSVP(rootpp->p_vnode)); 1313 PP_SETNORELOC(rootpp); 1314 return (1); 1315 } 1316 npgs = page_get_pagecnt(szc); 1317 ASSERT(IS_P2ALIGNED(rootpfn, npgs)); 1318 pp = rootpp; 1319 for (i = 0; i < npgs; i++, pp++) { 1320 ASSERT(PAGE_LOCKED_SE(pp, se)); 1321 ASSERT(!PP_ISFREE(pp)); 1322 ASSERT(pp->p_szc == szc); 1323 PP_SETNORELOC(pp); 1324 } 1325 group_page_unlock(rootpp); 1326 return (1); 1327 } 1328 1329 /* 1330 * Attempt to convert page to a caged page (set the P_NORELOC flag). 1331 * If successful and pages is free, move page to the tail of whichever 1332 * list it is on. 1333 * Returns: 1334 * EBUSY page already locked, assimilated but not free. 1335 * ENOMEM page assimilated, but memory too low to relocate. Page not free. 1336 * EAGAIN page not assimilated. Page not free. 1337 * ERANGE page assimilated. Page not root. 1338 * 0 page assimilated. Page free. 1339 * *nfreedp number of pages freed. 1340 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way 1341 * to distinguish between a page that was already a NORELOC page from 1342 * those newly converted to NORELOC pages by this invocation of 1343 * kcage_assimilate_page. 1344 */ 1345 static int 1346 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp) 1347 { 1348 if (page_trylock(pp, SE_EXCL)) { 1349 if (PP_ISNORELOC(pp)) { 1350 check_free_and_return: 1351 if (PP_ISFREE(pp)) { 1352 page_unlock(pp); 1353 *nfreedp = 0; 1354 return (0); 1355 } else { 1356 page_unlock(pp); 1357 return (EBUSY); 1358 } 1359 /*NOTREACHED*/ 1360 } 1361 } else { 1362 if (page_trylock(pp, SE_SHARED)) { 1363 if (PP_ISNORELOC(pp)) 1364 goto check_free_and_return; 1365 } else 1366 return (EAGAIN); 1367 1368 if (!PP_ISFREE(pp)) { 1369 page_unlock(pp); 1370 return (EAGAIN); 1371 } 1372 1373 /* 1374 * Need to upgrade the lock on it and set the NORELOC 1375 * bit. 
If it is free then remove it from the free 1376 * list so that the platform free list code can keep 1377 * NORELOC pages where they should be. 1378 */ 1379 /* 1380 * Before doing anything, get the exclusive lock. 1381 * This may fail (eg ISM pages are left shared locked). 1382 * If the page is free this will leave a hole in the 1383 * cage. There is no solution yet to this. 1384 */ 1385 if (!page_tryupgrade(pp)) { 1386 page_unlock(pp); 1387 return (EAGAIN); 1388 } 1389 } 1390 1391 ASSERT(PAGE_EXCL(pp)); 1392 1393 if (PP_ISFREE(pp)) { 1394 int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST; 1395 1396 page_list_sub(pp, which); 1397 ASSERT(pp->p_szc == 0); 1398 PP_SETNORELOC(pp); 1399 PLCNT_XFER_NORELOC(pp); 1400 page_list_add(pp, which | PG_LIST_TAIL); 1401 1402 page_unlock(pp); 1403 *nfreedp = 1; 1404 return (0); 1405 } else { 1406 if (pp->p_szc != 0) { 1407 if (!kcage_setnoreloc_pages(pp, SE_EXCL)) { 1408 page_unlock(pp); 1409 return (EAGAIN); 1410 } 1411 ASSERT(PP_ISNORELOC(pp)); 1412 } else { 1413 PP_SETNORELOC(pp); 1414 } 1415 PLCNT_XFER_NORELOC(pp); 1416 return (kcage_invalidate_page(pp, nfreedp)); 1417 } 1418 /*NOTREACHED*/ 1419 } 1420 1421 static int 1422 kcage_expand() 1423 { 1424 int did_something = 0; 1425 1426 spgcnt_t wanted; 1427 pfn_t pfn; 1428 page_t *pp; 1429 /* TODO: we don't really need n any more? */ 1430 pgcnt_t n; 1431 pgcnt_t nf, nfreed; 1432 1433 /* 1434 * Expand the cage if available cage memory is really low. Calculate 1435 * the amount required to return kcage_freemem to the level of 1436 * kcage_lotsfree, or to satisfy throttled requests, whichever is 1437 * more. It is rare for their sum to create an artificial threshold 1438 * above kcage_lotsfree, but it is possible. 1439 * 1440 * Exit early if expansion amount is equal to or less than zero. 1441 * (<0 is possible if kcage_freemem rises suddenly.) 
1442 * 1443 * Exit early when the global page pool (apparently) does not 1444 * have enough free pages to page_relocate() even a single page. 1445 */ 1446 wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree) 1447 - kcage_freemem; 1448 if (wanted <= 0) 1449 return (0); 1450 else if (freemem < pageout_reserve + 1) { 1451 KCAGE_STAT_INCR(ke_lowfreemem); 1452 return (0); 1453 } 1454 1455 /* 1456 * Try to get the range list reader lock. If the lock is already 1457 * held, then don't get stuck here waiting for it. 1458 */ 1459 if (!rw_tryenter(&kcage_range_rwlock, RW_READER)) 1460 return (0); 1461 1462 KCAGE_STAT_INCR(ke_calls); 1463 KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted); 1464 1465 /* 1466 * Assimilate more pages from the global page pool into the cage. 1467 */ 1468 n = 0; /* number of pages PP_SETNORELOC'd */ 1469 nf = 0; /* number of those actually free */ 1470 while (kcage_on && nf < wanted) { 1471 pfn = kcage_get_pfn(); 1472 if (pfn == PFN_INVALID) { /* eek! no where to grow */ 1473 KCAGE_STAT_INCR(ke_nopfn); 1474 goto terminate; 1475 } 1476 1477 KCAGE_STAT_INCR_SCAN(ke_examined); 1478 1479 if ((pp = page_numtopp_nolock(pfn)) == NULL) { 1480 KCAGE_STAT_INCR(ke_nopaget); 1481 continue; 1482 } 1483 KCAGEPAGETS_INC(); 1484 /* 1485 * Sanity check. Skip this pfn if it is 1486 * being deleted. 1487 */ 1488 if (pfn_is_being_deleted(pfn)) { 1489 KCAGE_STAT_INCR(ke_deleting); 1490 continue; 1491 } 1492 1493 /* 1494 * NORELOC is only set at boot-time or by this routine 1495 * under the kcage_range_rwlock lock which is currently 1496 * held. This means we can do a fast check here before 1497 * locking the page in kcage_assimilate_page. 
1498 */ 1499 if (PP_ISNORELOC(pp)) { 1500 KCAGE_STAT_INCR(ke_isnoreloc); 1501 continue; 1502 } 1503 1504 switch (kcage_assimilate_page(pp, &nfreed)) { 1505 case 0: /* assimilated, page is free */ 1506 KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed); 1507 did_something = 1; 1508 nf += nfreed; 1509 n++; 1510 break; 1511 1512 case EBUSY: /* assimilated, page not free */ 1513 case ERANGE: /* assimilated, page not root */ 1514 KCAGE_STAT_INCR_SCAN(ke_gotone); 1515 did_something = 1; 1516 n++; 1517 break; 1518 1519 case ENOMEM: /* assimilated, but no mem */ 1520 KCAGE_STAT_INCR(ke_terminate); 1521 did_something = 1; 1522 n++; 1523 goto terminate; 1524 1525 case EAGAIN: /* can't assimilate */ 1526 KCAGE_STAT_INCR_SCAN(ke_lefthole); 1527 break; 1528 1529 default: /* catch this with debug kernels */ 1530 ASSERT(0); 1531 break; 1532 } 1533 } 1534 1535 /* 1536 * Realign cage edge with the nearest physical address 1537 * boundry for big pages. This is done to give us a 1538 * better chance of actually getting usable big pages 1539 * in the cage. 1540 */ 1541 1542 terminate: 1543 kcage_range_unlock(); 1544 1545 return (did_something); 1546 } 1547 1548 /* 1549 * Relocate page opp (Original Page Pointer) from cage pool to page rpp 1550 * (Replacement Page Pointer) in the global pool. Page opp will be freed 1551 * if relocation is successful, otherwise it is only unlocked. 1552 * On entry, page opp must be exclusively locked and not free. 1553 * *nfreedp: number of pages freed. 
1554 */ 1555 static int 1556 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp) 1557 { 1558 page_t *opp = pp; 1559 page_t *rpp = NULL; 1560 spgcnt_t npgs; 1561 int result; 1562 1563 ASSERT(!PP_ISFREE(opp)); 1564 ASSERT(PAGE_EXCL(opp)); 1565 1566 result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL); 1567 *nfreedp = npgs; 1568 if (result == 0) { 1569 while (npgs-- > 0) { 1570 page_t *tpp; 1571 1572 ASSERT(rpp != NULL); 1573 tpp = rpp; 1574 page_sub(&rpp, tpp); 1575 page_unlock(tpp); 1576 } 1577 1578 ASSERT(rpp == NULL); 1579 1580 return (0); /* success */ 1581 } 1582 1583 page_unlock(opp); 1584 return (result); 1585 } 1586 1587 /* 1588 * Based on page_invalidate_pages() 1589 * 1590 * Kcage_invalidate_page() uses page_relocate() twice. Both instances 1591 * of use must be updated to match the new page_relocate() when it 1592 * becomes available. 1593 * 1594 * Return result of kcage_relocate_page or zero if page was directly freed. 1595 * *nfreedp: number of pages freed. 1596 */ 1597 static int 1598 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp) 1599 { 1600 int result; 1601 1602 #if defined(__sparc) 1603 extern struct vnode prom_ppages; 1604 ASSERT(pp->p_vnode != &prom_ppages); 1605 #endif /* __sparc */ 1606 1607 ASSERT(!PP_ISFREE(pp)); 1608 ASSERT(PAGE_EXCL(pp)); 1609 1610 /* 1611 * Is this page involved in some I/O? shared? 1612 * The page_struct_lock need not be acquired to 1613 * examine these fields since the page has an 1614 * "exclusive" lock. 1615 */ 1616 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1617 result = kcage_relocate_page(pp, nfreedp); 1618 #ifdef KCAGE_STATS 1619 if (result == 0) 1620 KCAGE_STAT_INCR_SCAN(kip_reloclocked); 1621 else if (result == ENOMEM) 1622 KCAGE_STAT_INCR_SCAN(kip_nomem); 1623 #endif 1624 return (result); 1625 } 1626 1627 ASSERT(pp->p_vnode->v_type != VCHR); 1628 1629 /* 1630 * Unload the mappings and check if mod bit is set. 
1631 */ 1632 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1633 1634 if (hat_ismod(pp)) { 1635 result = kcage_relocate_page(pp, nfreedp); 1636 #ifdef KCAGE_STATS 1637 if (result == 0) 1638 KCAGE_STAT_INCR_SCAN(kip_relocmod); 1639 else if (result == ENOMEM) 1640 KCAGE_STAT_INCR_SCAN(kip_nomem); 1641 #endif 1642 return (result); 1643 } 1644 1645 if (!page_try_demote_pages(pp)) { 1646 KCAGE_STAT_INCR_SCAN(kip_demotefailed); 1647 page_unlock(pp); 1648 return (EAGAIN); 1649 } 1650 1651 page_destroy(pp, 0); 1652 KCAGE_STAT_INCR_SCAN(kip_destroy); 1653 *nfreedp = 1; 1654 return (0); 1655 } 1656 1657 static void 1658 kcage_cageout() 1659 { 1660 pfn_t pfn; 1661 page_t *pp; 1662 callb_cpr_t cprinfo; 1663 int did_something; 1664 int scan_again; 1665 pfn_t start_pfn; 1666 int pass; 1667 int last_pass; 1668 int pages_skipped; 1669 int shared_skipped; 1670 uint_t shared_level = 8; 1671 pgcnt_t nfreed; 1672 #ifdef KCAGE_STATS 1673 clock_t scan_start; 1674 #endif 1675 1676 CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex, 1677 callb_generic_cpr, "cageout"); 1678 1679 mutex_enter(&kcage_cageout_mutex); 1680 kcage_cageout_thread = curthread; 1681 1682 pfn = PFN_INVALID; /* force scan reset */ 1683 start_pfn = PFN_INVALID; /* force init with 1st cage pfn */ 1684 kcage_cageout_ready = 1; /* switch kcage_cageout_wakeup mode */ 1685 1686 loop: 1687 /* 1688 * Wait here. Sooner or later, kcage_freemem_sub() will notice 1689 * that kcage_freemem is less than kcage_desfree. When it does 1690 * notice, kcage_freemem_sub() will wake us up via call to 1691 * kcage_cageout_wakeup(). 
1692 */ 1693 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1694 cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex); 1695 CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex); 1696 1697 KCAGE_STAT_INCR(kt_wakeups); 1698 KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem); 1699 KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem); 1700 pass = 0; 1701 last_pass = 0; 1702 1703 #ifdef KCAGE_STATS 1704 scan_start = lbolt; 1705 #endif 1706 1707 again: 1708 if (!kcage_on) 1709 goto loop; 1710 1711 KCAGE_STAT_INCR(kt_scans); 1712 KCAGE_STAT_INCR_SCAN(kt_passes); 1713 1714 did_something = 0; 1715 pages_skipped = 0; 1716 shared_skipped = 0; 1717 while ((kcage_freemem < kcage_lotsfree || kcage_needfree) && 1718 (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) { 1719 1720 if (start_pfn == PFN_INVALID) 1721 start_pfn = pfn; 1722 else if (start_pfn == pfn) { 1723 last_pass = pass; 1724 pass += 1; 1725 /* 1726 * Did a complete walk of kernel cage, but didn't free 1727 * any pages. If only one cpu is online then 1728 * stop kernel cage walk and try expanding. 1729 */ 1730 if (ncpus_online == 1 && did_something == 0) { 1731 KCAGE_STAT_INCR(kt_cageout_break); 1732 break; 1733 } 1734 } 1735 1736 pp = page_numtopp_nolock(pfn); 1737 if (pp == NULL) { 1738 continue; 1739 } 1740 1741 KCAGE_STAT_INCR_SCAN(kt_examined); 1742 1743 /* 1744 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside 1745 * of the lock. If one is missed it will be seen next 1746 * time through. 1747 * 1748 * Skip non-caged-pages. These pages can exist in the cage 1749 * because, if during cage expansion, a page is 1750 * encountered that is long-term locked the lock prevents the 1751 * expansion logic from setting the P_NORELOC flag. Hence, 1752 * non-caged-pages surrounded by caged-pages. 
1753 */ 1754 if (!PP_ISNORELOC(pp)) { 1755 switch (kcage_assimilate_page(pp, &nfreed)) { 1756 case 0: 1757 did_something = 1; 1758 KCAGE_STAT_NINCR_SCAN(kt_gotonefree, 1759 nfreed); 1760 break; 1761 1762 case EBUSY: 1763 case ERANGE: 1764 did_something = 1; 1765 KCAGE_STAT_INCR_SCAN(kt_gotone); 1766 break; 1767 1768 case EAGAIN: 1769 case ENOMEM: 1770 break; 1771 1772 default: 1773 /* catch this with debug kernels */ 1774 ASSERT(0); 1775 break; 1776 } 1777 1778 continue; 1779 } else { 1780 int prm; 1781 1782 if (PP_ISFREE(pp)) { 1783 continue; 1784 } 1785 1786 if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) || 1787 !page_trylock(pp, SE_EXCL)) { 1788 KCAGE_STAT_INCR_SCAN(kt_cantlock); 1789 continue; 1790 } 1791 1792 /* P_NORELOC bit should not have gone away. */ 1793 ASSERT(PP_ISNORELOC(pp)); 1794 if (PP_ISFREE(pp) || (PP_ISKAS(pp) && 1795 pp->p_lckcnt > 0)) { 1796 page_unlock(pp); 1797 continue; 1798 } 1799 1800 KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level); 1801 if (hat_page_getshare(pp) > shared_level) { 1802 page_unlock(pp); 1803 pages_skipped = 1; 1804 shared_skipped = 1; 1805 KCAGE_STAT_INCR_SCAN(kt_skipshared); 1806 continue; 1807 } 1808 1809 /* 1810 * In pass {0, 1}, skip page if ref bit is set. 1811 * In pass {0, 1, 2}, skip page if mod bit is set. 
1812 */ 1813 prm = hat_pagesync(pp, 1814 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 1815 1816 /* On first pass ignore ref'd pages */ 1817 if (pass <= 1 && (prm & P_REF)) { 1818 KCAGE_STAT_INCR_SCAN(kt_skiprefd); 1819 pages_skipped = 1; 1820 page_unlock(pp); 1821 continue; 1822 } 1823 1824 /* On pass 2, page_destroy if mod bit is not set */ 1825 if (pass <= 2) { 1826 if (pp->p_szc != 0 || (prm & P_MOD) || 1827 pp->p_lckcnt || pp->p_cowcnt) { 1828 pages_skipped = 1; 1829 page_unlock(pp); 1830 } else { 1831 1832 /* 1833 * unload the mappings before 1834 * checking if mod bit is set 1835 */ 1836 (void) hat_pageunload(pp, 1837 HAT_FORCE_PGUNLOAD); 1838 1839 /* 1840 * skip this page if modified 1841 */ 1842 if (hat_ismod(pp)) { 1843 pages_skipped = 1; 1844 page_unlock(pp); 1845 continue; 1846 } 1847 1848 KCAGE_STAT_INCR_SCAN(kt_destroy); 1849 page_destroy(pp, 0); 1850 did_something = 1; 1851 } 1852 continue; 1853 } 1854 1855 if (kcage_invalidate_page(pp, &nfreed) == 0) { 1856 did_something = 1; 1857 KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed); 1858 } 1859 1860 /* 1861 * No need to drop the page lock here. 1862 * Kcage_invalidate_page has done that for us 1863 * either explicitly or through a page_free. 1864 */ 1865 } 1866 } 1867 1868 /* 1869 * Expand the cage only if available cage memory is really low. 1870 * This test is done only after a complete scan of the cage. 1871 * The reason for not checking and expanding more often is to 1872 * avoid rapid expansion of the cage. Naturally, scanning the 1873 * cage takes time. So by scanning first, we use that work as a 1874 * delay loop in between expand decisions. 1875 */ 1876 1877 scan_again = 0; 1878 if (kcage_freemem < kcage_minfree || kcage_needfree) { 1879 /* 1880 * Kcage_expand() will return a non-zero value if it was 1881 * able to expand the cage -- whether or not the new 1882 * pages are free and immediately usable. If non-zero, 1883 * we do another scan of the cage. 
The pages might be 1884 * freed during that scan or by time we get back here. 1885 * If not, we will attempt another expansion. 1886 * However, if kcage_expand() returns zero, then it was 1887 * unable to expand the cage. This is the case when the 1888 * the growth list is exausted, therefore no work was done 1889 * and there is no reason to scan the cage again. 1890 * Note: Kernel cage scan is not repeated on single-cpu 1891 * system to avoid kernel cage thread hogging cpu. 1892 */ 1893 if (pass <= 3 && pages_skipped && ncpus_online > 1) 1894 scan_again = 1; 1895 else 1896 (void) kcage_expand(); /* don't scan again */ 1897 } else if (kcage_freemem < kcage_lotsfree) { 1898 /* 1899 * If available cage memory is less than abundant 1900 * and a full scan of the cage has not yet been completed, 1901 * or a scan has completed and some work was performed, 1902 * or pages were skipped because of sharing, 1903 * or we simply have not yet completed two passes, 1904 * then do another scan. 1905 */ 1906 if (pass <= 2 && pages_skipped) 1907 scan_again = 1; 1908 if (pass == last_pass || did_something) 1909 scan_again = 1; 1910 else if (shared_skipped && shared_level < (8<<24)) { 1911 shared_level <<= 1; 1912 scan_again = 1; 1913 } 1914 } 1915 1916 if (scan_again && ncpus_online > 1) 1917 goto again; 1918 else { 1919 if (shared_level > 8) 1920 shared_level >>= 1; 1921 1922 KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem); 1923 KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem); 1924 KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start); 1925 KCAGE_STAT_INC_SCAN_INDEX; 1926 goto loop; 1927 } 1928 1929 /*NOTREACHED*/ 1930 } 1931 1932 void 1933 kcage_cageout_wakeup() 1934 { 1935 if (mutex_tryenter(&kcage_cageout_mutex)) { 1936 if (kcage_cageout_ready) { 1937 cv_signal(&kcage_cageout_cv); 1938 } else if (kcage_freemem < kcage_minfree || kcage_needfree) { 1939 /* 1940 * Available cage memory is really low. Time to 1941 * start expanding the cage. 
However, the 1942 * kernel cage thread is not yet ready to 1943 * do the work. Use *this* thread, which is 1944 * most likely to be t0, to do the work. 1945 */ 1946 KCAGE_STAT_INCR(kcw_expandearly); 1947 (void) kcage_expand(); 1948 KCAGE_STAT_INC_SCAN_INDEX; 1949 } 1950 1951 mutex_exit(&kcage_cageout_mutex); 1952 } 1953 /* else, kernel cage thread is already running */ 1954 } 1955 1956 void 1957 kcage_tick() 1958 { 1959 /* 1960 * Once per second we wake up all the threads throttled 1961 * waiting for cage memory, in case we've become stuck 1962 * and haven't made forward progress expanding the cage. 1963 */ 1964 if (kcage_on && kcage_cageout_ready) 1965 cv_broadcast(&kcage_throttle_cv); 1966 } 1967