1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 #include <sys/types.h> 26 #include <sys/param.h> 27 #include <sys/thread.h> 28 #include <sys/proc.h> 29 #include <sys/callb.h> 30 #include <sys/vnode.h> 31 #include <sys/debug.h> 32 #include <sys/systm.h> /* for bzero */ 33 #include <sys/memlist.h> 34 #include <sys/cmn_err.h> 35 #include <sys/sysmacros.h> 36 #include <sys/vmsystm.h> /* for NOMEMWAIT() */ 37 #include <sys/atomic.h> /* used to update kcage_freemem */ 38 #include <sys/kmem.h> /* for kmem_reap */ 39 #include <sys/errno.h> 40 #include <sys/mem_cage.h> 41 #include <vm/seg_kmem.h> 42 #include <vm/page.h> 43 #include <vm/hat.h> 44 #include <vm/vm_dep.h> 45 #include <sys/mem_config.h> 46 #include <sys/lgrp.h> 47 #include <sys/rwlock.h> 48 #include <sys/cpupart.h> 49 50 extern pri_t maxclsyspri; 51 52 #ifdef DEBUG 53 #define KCAGE_STATS 54 #endif 55 56 #ifdef KCAGE_STATS 57 58 #define KCAGE_STATS_VERSION 9 /* can help report generators */ 59 #define KCAGE_STATS_NSCANS 256 /* depth of scan statistics buffer */ 60 61 struct kcage_stats_scan { 62 /* managed by KCAGE_STAT_* macros */ 63 clock_t scan_lbolt; 64 uint_t scan_id; 65 66 /* set in kcage_cageout() */ 67 uint_t kt_passes; 68 clock_t kt_ticks; 69 pgcnt_t kt_kcage_freemem_start; 70 pgcnt_t kt_kcage_freemem_end; 71 pgcnt_t kt_freemem_start; 72 pgcnt_t kt_freemem_end; 73 uint_t kt_examined; 74 uint_t kt_cantlock; 75 uint_t kt_gotone; 76 uint_t kt_gotonefree; 77 uint_t kt_skipshared; 78 uint_t kt_skiprefd; 79 uint_t kt_destroy; 80 81 /* set in kcage_invalidate_page() */ 82 uint_t kip_reloclocked; 83 uint_t kip_relocmod; 84 uint_t kip_destroy; 85 uint_t kip_nomem; 86 uint_t kip_demotefailed; 87 88 /* set in kcage_expand() */ 89 uint_t ke_wanted; 90 uint_t ke_examined; 91 uint_t ke_lefthole; 92 uint_t ke_gotone; 93 uint_t ke_gotonefree; 94 }; 95 96 struct kcage_stats { 97 /* managed by KCAGE_STAT_* macros */ 98 uint_t version; 99 uint_t size; 100 101 /* set in kcage_cageout */ 102 uint_t kt_wakeups; 103 uint_t kt_scans; 104 uint_t kt_cageout_break; 105 106 /* set in kcage_expand */ 107 uint_t ke_calls; 108 uint_t ke_nopfn; 109 uint_t ke_nopaget; 110 uint_t ke_isnoreloc; 111 uint_t ke_deleting; 112 uint_t ke_lowfreemem; 113 uint_t ke_terminate; 114 115 /* set in kcage_freemem_add() */ 116 uint_t kfa_trottlewake; 117 118 /* set in kcage_freemem_sub() */ 119 uint_t kfs_cagewake; 120 121 /* set in kcage_create_throttle */ 122 uint_t kct_calls; 123 uint_t kct_cageout; 124 uint_t kct_critical; 125 uint_t kct_exempt; 126 uint_t kct_cagewake; 127 uint_t kct_wait; 128 uint_t kct_progress; 129 uint_t kct_noprogress; 130 uint_t kct_timeout; 131 132 /* set in kcage_cageout_wakeup */ 133 uint_t kcw_expandearly; 134 135 /* managed by KCAGE_STAT_* macros */ 136 uint_t scan_array_size; 137 uint_t scan_index; 138 struct kcage_stats_scan scans[KCAGE_STATS_NSCANS]; 139 }; 140 141 static struct kcage_stats kcage_stats; 142 static struct kcage_stats_scan kcage_stats_scan_zero; 143 144 /* 145 * No real need for atomics here. For the most part the incs and sets are 146 * done by the kernel cage thread. There are a few that are done by any 147 * number of other threads. Those cases are noted by comments. 148 */ 149 #define KCAGE_STAT_INCR(m) kcage_stats.m++ 150 151 #define KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v) 152 153 #define KCAGE_STAT_INCR_SCAN(m) \ 154 KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m) 155 156 #define KCAGE_STAT_NINCR_SCAN(m, v) \ 157 KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v) 158 159 #define KCAGE_STAT_SET(m, v) kcage_stats.m = (v) 160 161 #define KCAGE_STAT_SETZ(m, v) \ 162 if (kcage_stats.m == 0) kcage_stats.m = (v) 163 164 #define KCAGE_STAT_SET_SCAN(m, v) \ 165 KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v) 166 167 #define KCAGE_STAT_SETZ_SCAN(m, v) \ 168 KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v) 169 170 #define KCAGE_STAT_INC_SCAN_INDEX \ 171 KCAGE_STAT_SET_SCAN(scan_lbolt, ddi_get_lbolt()); \ 172 KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \ 173 kcage_stats.scan_index = \ 174 (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \ 175 kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero 176 177 #define KCAGE_STAT_INIT_SCAN_INDEX \ 178 kcage_stats.version = KCAGE_STATS_VERSION; \ 179 kcage_stats.size = sizeof (kcage_stats); \ 180 kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \ 181 kcage_stats.scan_index = 0 182 183 #else /* KCAGE_STATS */ 184 185 #define KCAGE_STAT_INCR(v) 186 #define KCAGE_STAT_NINCR(m, v) 187 #define KCAGE_STAT_INCR_SCAN(v) 188 #define KCAGE_STAT_NINCR_SCAN(m, v) 189 #define KCAGE_STAT_SET(m, v) 190 #define KCAGE_STAT_SETZ(m, v) 191 #define KCAGE_STAT_SET_SCAN(m, v) 192 #define KCAGE_STAT_SETZ_SCAN(m, v) 193 #define KCAGE_STAT_INC_SCAN_INDEX 194 #define KCAGE_STAT_INIT_SCAN_INDEX 195 196 #endif /* KCAGE_STATS */ 197 198 static kmutex_t kcage_throttle_mutex; /* protects kcage_throttle_cv */ 199 static kcondvar_t kcage_throttle_cv; 200 201 static kmutex_t kcage_cageout_mutex; /* protects cv and ready flag */ 202 static kcondvar_t kcage_cageout_cv; /* cageout thread naps here */ 203 static int kcage_cageout_ready; /* nonzero when cageout thread ready */ 204 kthread_id_t kcage_cageout_thread; /* to aid debugging */ 205 206 static krwlock_t kcage_range_rwlock; /* protects kcage_glist elements */ 207 208 /* 209 * Cage expansion happens within a range. 210 */ 211 struct kcage_glist { 212 struct kcage_glist *next; 213 pfn_t base; 214 pfn_t lim; 215 pfn_t curr; 216 int decr; 217 }; 218 219 static struct kcage_glist *kcage_glist; 220 static struct kcage_glist *kcage_current_glist; 221 222 /* 223 * The firstfree element is provided so that kmem_alloc can be avoided 224 * until that cage has somewhere to go. This is not currently a problem 225 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va. 226 */ 227 static vmem_t *kcage_arena; 228 static struct kcage_glist kcage_glist_firstfree; 229 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree; 230 231 /* 232 * Miscellaneous forward references 233 */ 234 static struct kcage_glist *kcage_glist_alloc(void); 235 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **); 236 static void kcage_cageout(void); 237 static int kcage_invalidate_page(page_t *, pgcnt_t *); 238 static int kcage_setnoreloc_pages(page_t *, se_t); 239 static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t); 240 static void kcage_init(pgcnt_t preferred_size); 241 static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs); 242 243 /* 244 * Kernel Memory Cage counters and thresholds. 245 */ 246 int kcage_on = 0; 247 pgcnt_t kcage_freemem; 248 pgcnt_t kcage_needfree; 249 pgcnt_t kcage_lotsfree; 250 pgcnt_t kcage_desfree; 251 pgcnt_t kcage_minfree; 252 pgcnt_t kcage_throttlefree; 253 pgcnt_t kcage_reserve; 254 int kcage_maxwait = 10; /* in seconds */ 255 256 /* when we use lp for kmem we start the cage at a higher initial value */ 257 pgcnt_t kcage_kmemlp_mincage; 258 259 #ifdef DEBUG 260 pgcnt_t kcage_pagets; 261 #define KCAGEPAGETS_INC() kcage_pagets++ 262 #else 263 #define KCAGEPAGETS_INC() 264 #endif 265 266 /* kstats to export what pages are currently caged */ 267 kmutex_t kcage_kstat_lock; 268 static int kcage_kstat_update(kstat_t *ksp, int rw); 269 static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw); 270 271 /* 272 * Startup and Dynamic Reconfiguration interfaces. 273 * kcage_range_add() 274 * kcage_range_del() 275 * kcage_range_delete_post_mem_del() 276 * kcage_range_init() 277 * kcage_set_thresholds() 278 */ 279 280 /* 281 * Called from page_get_contig_pages to get the approximate kcage pfn range 282 * for exclusion from search for contiguous pages. This routine is called 283 * without kcage_range lock (kcage routines can call page_get_contig_pages 284 * through page_relocate) and with the assumption, based on kcage_range_add, 285 * that kcage_current_glist always contain a valid pointer. 286 */ 287 288 int 289 kcage_current_pfn(pfn_t *pfncur) 290 { 291 struct kcage_glist *lp = kcage_current_glist; 292 293 ASSERT(kcage_on); 294 295 ASSERT(lp != NULL); 296 297 *pfncur = lp->curr; 298 299 return (lp->decr); 300 } 301 302 /* 303 * Called from vm_pagelist.c during coalesce to find kernel cage regions 304 * within an mnode. Looks for the lowest range between lo and hi. 305 * 306 * Kernel cage memory is defined between kcage_glist and kcage_current_glist. 307 * Non-cage memory is defined between kcage_current_glist and list end. 308 * 309 * If incage is set, returns the lowest kcage range. Otherwise returns lowest 310 * non-cage range. 311 * 312 * Returns zero on success and nlo, nhi: 313 * lo <= nlo < nhi <= hi 314 * Returns non-zero if no overlapping range is found. 315 */ 316 int 317 kcage_next_range(int incage, pfn_t lo, pfn_t hi, 318 pfn_t *nlo, pfn_t *nhi) 319 { 320 struct kcage_glist *lp; 321 pfn_t tlo = hi; 322 pfn_t thi = hi; 323 324 ASSERT(lo <= hi); 325 326 /* 327 * Reader lock protects the list, but kcage_get_pfn 328 * running concurrently may advance kcage_current_glist 329 * and also update kcage_current_glist->curr. Page 330 * coalesce can handle this race condition. 331 */ 332 rw_enter(&kcage_range_rwlock, RW_READER); 333 334 for (lp = incage ? kcage_glist : kcage_current_glist; 335 lp != NULL; lp = lp->next) { 336 337 pfn_t klo, khi; 338 339 /* find the range limits in this element */ 340 if ((incage && lp->decr) || (!incage && !lp->decr)) { 341 klo = lp->curr; 342 khi = lp->lim; 343 } else { 344 klo = lp->base; 345 khi = lp->curr; 346 } 347 348 /* handle overlap */ 349 if (klo < tlo && klo < khi && lo < khi && klo < hi) { 350 tlo = MAX(lo, klo); 351 thi = MIN(hi, khi); 352 if (tlo == lo) 353 break; 354 } 355 356 /* check end of kcage */ 357 if (incage && lp == kcage_current_glist) { 358 break; 359 } 360 } 361 362 rw_exit(&kcage_range_rwlock); 363 364 /* return non-zero if no overlapping range found */ 365 if (tlo == thi) 366 return (1); 367 368 ASSERT(lo <= tlo && tlo < thi && thi <= hi); 369 370 /* return overlapping range */ 371 *nlo = tlo; 372 *nhi = thi; 373 return (0); 374 } 375 376 void 377 kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size) 378 { 379 int ret = 0; 380 381 ASSERT(kcage_arena == NULL); 382 kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t), 383 segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP); 384 ASSERT(kcage_arena != NULL); 385 386 if (d == KCAGE_DOWN) { 387 while (ml->ml_next != NULL) 388 ml = ml->ml_next; 389 } 390 391 rw_enter(&kcage_range_rwlock, RW_WRITER); 392 393 while (ml != NULL) { 394 ret = kcage_range_add_internal(btop(ml->ml_address), 395 btop(ml->ml_size), d); 396 if (ret) 397 panic("kcage_range_add_internal failed: " 398 "ml=%p, ret=0x%x\n", (void *)ml, ret); 399 400 ml = (d == KCAGE_DOWN ? ml->ml_prev : ml->ml_next); 401 } 402 403 rw_exit(&kcage_range_rwlock); 404 405 if (ret == 0) 406 kcage_init(preferred_size); 407 } 408 409 /* 410 * Third arg controls direction of growth: 0: increasing pfns, 411 * 1: decreasing. 412 */ 413 static int 414 kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d) 415 { 416 struct kcage_glist *new, **lpp; 417 pfn_t lim; 418 419 ASSERT(rw_write_held(&kcage_range_rwlock)); 420 421 ASSERT(npgs != 0); 422 if (npgs == 0) 423 return (EINVAL); 424 425 lim = base + npgs; 426 427 ASSERT(lim > base); 428 if (lim <= base) 429 return (EINVAL); 430 431 new = kcage_glist_alloc(); 432 if (new == NULL) { 433 return (ENOMEM); 434 } 435 436 new->base = base; 437 new->lim = lim; 438 new->decr = (d == KCAGE_DOWN); 439 if (new->decr != 0) 440 new->curr = new->lim; 441 else 442 new->curr = new->base; 443 /* 444 * Any overlapping existing ranges are removed by deleting 445 * from the new list as we search for the tail. 446 */ 447 lpp = &kcage_glist; 448 while (*lpp != NULL) { 449 int ret; 450 ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new); 451 if (ret != 0) 452 return (ret); 453 lpp = &(*lpp)->next; 454 } 455 456 *lpp = new; 457 458 if (kcage_current_glist == NULL) { 459 kcage_current_glist = kcage_glist; 460 } 461 462 return (0); 463 } 464 465 int 466 kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d) 467 { 468 int ret; 469 470 rw_enter(&kcage_range_rwlock, RW_WRITER); 471 ret = kcage_range_add_internal(base, npgs, d); 472 rw_exit(&kcage_range_rwlock); 473 return (ret); 474 } 475 476 /* 477 * Calls to add and delete must be protected by kcage_range_rwlock 478 */ 479 static int 480 kcage_range_delete_internal(pfn_t base, pgcnt_t npgs) 481 { 482 struct kcage_glist *lp; 483 pfn_t lim; 484 485 ASSERT(rw_write_held(&kcage_range_rwlock)); 486 487 ASSERT(npgs != 0); 488 if (npgs == 0) 489 return (EINVAL); 490 491 lim = base + npgs; 492 493 ASSERT(lim > base); 494 if (lim <= base) 495 return (EINVAL); 496 497 /* 498 * Check if the delete is OK first as a number of elements 499 * might be involved and it will be difficult to go 500 * back and undo (can't just add the range back in). 501 */ 502 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 503 /* 504 * If there have been no pages allocated from this 505 * element, we don't need to check it. 506 */ 507 if ((lp->decr == 0 && lp->curr == lp->base) || 508 (lp->decr != 0 && lp->curr == lp->lim)) 509 continue; 510 /* 511 * If the element does not overlap, its OK. 512 */ 513 if (base >= lp->lim || lim <= lp->base) 514 continue; 515 /* 516 * Overlapping element: Does the range to be deleted 517 * overlap the area already used? If so fail. 518 */ 519 if (lp->decr == 0 && base < lp->curr && lim >= lp->base) { 520 return (EBUSY); 521 } 522 if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) { 523 return (EBUSY); 524 } 525 } 526 return (kcage_glist_delete(base, lim, &kcage_glist)); 527 } 528 529 int 530 kcage_range_delete(pfn_t base, pgcnt_t npgs) 531 { 532 int ret; 533 534 rw_enter(&kcage_range_rwlock, RW_WRITER); 535 ret = kcage_range_delete_internal(base, npgs); 536 rw_exit(&kcage_range_rwlock); 537 return (ret); 538 } 539 540 /* 541 * Calls to add and delete must be protected by kcage_range_rwlock. 542 * This routine gets called after successful Solaris memory 543 * delete operation from DR post memory delete routines. 544 */ 545 static int 546 kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs) 547 { 548 pfn_t lim; 549 550 ASSERT(rw_write_held(&kcage_range_rwlock)); 551 552 ASSERT(npgs != 0); 553 if (npgs == 0) 554 return (EINVAL); 555 556 lim = base + npgs; 557 558 ASSERT(lim > base); 559 if (lim <= base) 560 return (EINVAL); 561 562 return (kcage_glist_delete(base, lim, &kcage_glist)); 563 } 564 565 int 566 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs) 567 { 568 int ret; 569 570 rw_enter(&kcage_range_rwlock, RW_WRITER); 571 ret = kcage_range_delete_post_mem_del_internal(base, npgs); 572 rw_exit(&kcage_range_rwlock); 573 return (ret); 574 } 575 576 /* 577 * No locking is required here as the whole operation is covered 578 * by kcage_range_rwlock writer lock. 579 */ 580 static struct kcage_glist * 581 kcage_glist_alloc(void) 582 { 583 struct kcage_glist *new; 584 585 if ((new = kcage_glist_freelist) != NULL) { 586 kcage_glist_freelist = new->next; 587 } else if (kernel_cage_enable) { 588 new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP); 589 } else { 590 /* 591 * On DR supported platforms we allow memory add 592 * even when kernel cage is disabled. "kcage_arena" is 593 * created only when kernel cage is enabled. 594 */ 595 new = kmem_zalloc(sizeof (*new), KM_NOSLEEP); 596 } 597 598 if (new != NULL) 599 bzero(new, sizeof (*new)); 600 601 return (new); 602 } 603 604 static void 605 kcage_glist_free(struct kcage_glist *lp) 606 { 607 lp->next = kcage_glist_freelist; 608 kcage_glist_freelist = lp; 609 } 610 611 static int 612 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp) 613 { 614 struct kcage_glist *lp, *prev = *lpp; 615 616 while ((lp = *lpp) != NULL) { 617 if (lim > lp->base && base < lp->lim) { 618 /* The delete range overlaps this element. */ 619 if (base <= lp->base && lim >= lp->lim) { 620 /* Delete whole element. */ 621 *lpp = lp->next; 622 if (lp == kcage_current_glist) { 623 /* This can never happen. */ 624 ASSERT(kcage_current_glist != prev); 625 kcage_current_glist = prev; 626 } 627 kcage_glist_free(lp); 628 continue; 629 } 630 631 /* Partial delete. */ 632 if (base > lp->base && lim < lp->lim) { 633 struct kcage_glist *new; 634 635 /* 636 * Remove a section from the middle, 637 * need to allocate a new element. 638 */ 639 new = kcage_glist_alloc(); 640 if (new == NULL) { 641 return (ENOMEM); 642 } 643 644 /* 645 * Tranfser unused range to new. 646 * Edit lp in place to preserve 647 * kcage_current_glist. 648 */ 649 new->decr = lp->decr; 650 if (new->decr != 0) { 651 new->base = lp->base; 652 new->lim = base; 653 new->curr = base; 654 655 lp->base = lim; 656 } else { 657 new->base = lim; 658 new->lim = lp->lim; 659 new->curr = new->base; 660 661 lp->lim = base; 662 } 663 664 /* Insert new. */ 665 new->next = lp->next; 666 lp->next = new; 667 lpp = &lp->next; 668 } else { 669 /* Delete part of current block. */ 670 if (base > lp->base) { 671 ASSERT(lim >= lp->lim); 672 ASSERT(base < lp->lim); 673 if (lp->decr != 0 && 674 lp->curr == lp->lim) 675 lp->curr = base; 676 lp->lim = base; 677 } else { 678 ASSERT(base <= lp->base); 679 ASSERT(lim > lp->base); 680 if (lp->decr == 0 && 681 lp->curr == lp->base) 682 lp->curr = lim; 683 lp->base = lim; 684 } 685 } 686 } 687 prev = *lpp; 688 lpp = &(*lpp)->next; 689 } 690 691 return (0); 692 } 693 694 /* 695 * If lockit is 1, kcage_get_pfn holds the 696 * reader lock for kcage_range_rwlock. 697 * Changes to lp->curr can cause race conditions, but 698 * they are handled by higher level code (see kcage_next_range.) 699 */ 700 static pfn_t 701 kcage_get_pfn(int lockit) 702 { 703 struct kcage_glist *lp; 704 pfn_t pfn = PFN_INVALID; 705 706 if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER)) 707 return (pfn); 708 709 lp = kcage_current_glist; 710 while (lp != NULL) { 711 if (lp->decr != 0) { 712 if (lp->curr != lp->base) { 713 pfn = --lp->curr; 714 break; 715 } 716 } else { 717 if (lp->curr != lp->lim) { 718 pfn = lp->curr++; 719 break; 720 } 721 } 722 723 lp = lp->next; 724 if (lp) 725 kcage_current_glist = lp; 726 } 727 728 if (lockit) 729 rw_exit(&kcage_range_rwlock); 730 return (pfn); 731 } 732 733 /* 734 * Walk the physical address space of the cage. 735 * This routine does not guarantee to return PFNs in the order 736 * in which they were allocated to the cage. Instead, it walks 737 * each range as they appear on the growth list returning the PFNs 738 * range in ascending order. 739 * 740 * To begin scanning at lower edge of cage, reset should be nonzero. 741 * To step through cage, reset should be zero. 742 * 743 * PFN_INVALID will be returned when the upper end of the cage is 744 * reached -- indicating a full scan of the cage has been completed since 745 * previous reset. PFN_INVALID will continue to be returned until 746 * kcage_walk_cage is reset. 747 * 748 * It is possible to receive a PFN_INVALID result on reset if a growth 749 * list is not installed or if none of the PFNs in the installed list have 750 * been allocated to the cage. In otherwords, there is no cage. 751 * 752 * Caller need not hold kcage_range_rwlock while calling this function 753 * as the front part of the list is static - pages never come out of 754 * the cage. 755 * 756 * The caller is expected to only be kcage_cageout(). 757 */ 758 static pfn_t 759 kcage_walk_cage(int reset) 760 { 761 static struct kcage_glist *lp = NULL; 762 static pfn_t pfn; 763 764 if (reset) 765 lp = NULL; 766 if (lp == NULL) { 767 lp = kcage_glist; 768 pfn = PFN_INVALID; 769 } 770 again: 771 if (pfn == PFN_INVALID) { 772 if (lp == NULL) 773 return (PFN_INVALID); 774 775 if (lp->decr != 0) { 776 /* 777 * In this range the cage grows from the highest 778 * address towards the lowest. 779 * Arrange to return pfns from curr to lim-1, 780 * inclusive, in ascending order. 781 */ 782 783 pfn = lp->curr; 784 } else { 785 /* 786 * In this range the cage grows from the lowest 787 * address towards the highest. 788 * Arrange to return pfns from base to curr, 789 * inclusive, in ascending order. 790 */ 791 792 pfn = lp->base; 793 } 794 } 795 796 if (lp->decr != 0) { /* decrementing pfn */ 797 if (pfn == lp->lim) { 798 /* Don't go beyond the static part of the glist. */ 799 if (lp == kcage_current_glist) 800 lp = NULL; 801 else 802 lp = lp->next; 803 pfn = PFN_INVALID; 804 goto again; 805 } 806 807 ASSERT(pfn >= lp->curr && pfn < lp->lim); 808 } else { /* incrementing pfn */ 809 if (pfn == lp->curr) { 810 /* Don't go beyond the static part of the glist. */ 811 if (lp == kcage_current_glist) 812 lp = NULL; 813 else 814 lp = lp->next; 815 pfn = PFN_INVALID; 816 goto again; 817 } 818 819 ASSERT(pfn >= lp->base && pfn < lp->curr); 820 } 821 822 return (pfn++); 823 } 824 825 /* 826 * Callback functions for to recalc cage thresholds after 827 * Kphysm memory add/delete operations. 828 */ 829 /*ARGSUSED*/ 830 static void 831 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages) 832 { 833 kcage_recalc_thresholds(); 834 } 835 836 /*ARGSUSED*/ 837 static int 838 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages) 839 { 840 /* TODO: when should cage refuse memory delete requests? */ 841 return (0); 842 } 843 844 /*ARGSUSED*/ 845 static void 846 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled) 847 { 848 kcage_recalc_thresholds(); 849 } 850 851 static kphysm_setup_vector_t kcage_kphysm_vectors = { 852 KPHYSM_SETUP_VECTOR_VERSION, 853 kcage_kphysm_postadd_cb, 854 kcage_kphysm_predel_cb, 855 kcage_kphysm_postdel_cb 856 }; 857 858 /* 859 * This is called before a CPR suspend and after a CPR resume. We have to 860 * turn off kcage_cageout_ready before a suspend, and turn it back on after a 861 * restart. 862 */ 863 /*ARGSUSED*/ 864 static boolean_t 865 kcage_cageout_cpr(void *arg, int code) 866 { 867 if (code == CB_CODE_CPR_CHKPT) { 868 ASSERT(kcage_cageout_ready); 869 kcage_cageout_ready = 0; 870 return (B_TRUE); 871 } else if (code == CB_CODE_CPR_RESUME) { 872 ASSERT(kcage_cageout_ready == 0); 873 kcage_cageout_ready = 1; 874 return (B_TRUE); 875 } 876 return (B_FALSE); 877 } 878 879 /* 880 * kcage_recalc_preferred_size() increases initial cage size to improve large 881 * page availability when lp for kmem is enabled and kpr is disabled 882 */ 883 static pgcnt_t 884 kcage_recalc_preferred_size(pgcnt_t preferred_size) 885 { 886 if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) { 887 pgcnt_t lpmincage = kcage_kmemlp_mincage; 888 if (lpmincage == 0) { 889 lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8), 890 segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE; 891 } 892 kcage_kmemlp_mincage = MIN(lpmincage, 893 (segkmem_kmemlp_max / PAGESIZE)); 894 preferred_size = MAX(kcage_kmemlp_mincage, preferred_size); 895 } 896 return (preferred_size); 897 } 898 899 /* 900 * Kcage_init() builds the cage and initializes the cage thresholds. 901 * The size of the cage is determined by the argument preferred_size. 902 * or the actual amount of memory, whichever is smaller. 903 */ 904 static void 905 kcage_init(pgcnt_t preferred_size) 906 { 907 pgcnt_t wanted; 908 pfn_t pfn; 909 page_t *pp; 910 kstat_t *ksp; 911 912 extern void page_list_noreloc_startup(page_t *); 913 914 ASSERT(!kcage_on); 915 916 /* increase preferred cage size for lp for kmem */ 917 preferred_size = kcage_recalc_preferred_size(preferred_size); 918 919 /* Debug note: initialize this now so early expansions can stat */ 920 KCAGE_STAT_INIT_SCAN_INDEX; 921 922 /* 923 * Initialize cage thresholds and install kphysm callback. 924 * If we can't arrange to have the thresholds track with 925 * available physical memory, then the cage thresholds may 926 * end up over time at levels that adversly effect system 927 * performance; so, bail out. 928 */ 929 kcage_recalc_thresholds(); 930 if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) { 931 ASSERT(0); /* Catch this in DEBUG kernels. */ 932 return; 933 } 934 935 /* 936 * Limit startup cage size within the range of kcage_minfree 937 * and availrmem, inclusively. 938 */ 939 wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem); 940 941 /* 942 * Construct the cage. PFNs are allocated from the glist. It 943 * is assumed that the list has been properly ordered for the 944 * platform by the platform code. Typically, this is as simple 945 * as calling kcage_range_init(phys_avail, decr), where decr is 946 * 1 if the kernel has been loaded into upper end of physical 947 * memory, or 0 if the kernel has been loaded at the low end. 948 * 949 * Note: it is assumed that we are in the startup flow, so there 950 * is no reason to grab the page lock. 951 */ 952 kcage_freemem = 0; 953 pfn = PFN_INVALID; /* prime for alignment test */ 954 while (wanted != 0) { 955 if ((pfn = kcage_get_pfn(0)) == PFN_INVALID) 956 break; 957 958 if ((pp = page_numtopp_nolock(pfn)) != NULL) { 959 KCAGEPAGETS_INC(); 960 /* 961 * Set the noreloc state on the page. 962 * If the page is free and not already 963 * on the noreloc list then move it. 964 */ 965 if (PP_ISFREE(pp)) { 966 if (PP_ISNORELOC(pp) == 0) 967 page_list_noreloc_startup(pp); 968 } else { 969 ASSERT(pp->p_szc == 0); 970 PP_SETNORELOC(pp); 971 } 972 } 973 PLCNT_XFER_NORELOC(pp); 974 wanted -= 1; 975 } 976 977 /* 978 * Need to go through and find kernel allocated pages 979 * and capture them into the Cage. These will primarily 980 * be pages gotten through boot_alloc(). 981 */ 982 if (kvp.v_pages) { 983 984 pp = kvp.v_pages; 985 do { 986 ASSERT(!PP_ISFREE(pp)); 987 ASSERT(pp->p_szc == 0); 988 if (PP_ISNORELOC(pp) == 0) { 989 PP_SETNORELOC(pp); 990 PLCNT_XFER_NORELOC(pp); 991 } 992 } while ((pp = pp->p_vpnext) != kvp.v_pages); 993 994 } 995 996 kcage_on = 1; 997 998 /* 999 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend() 1000 * after the cageout thread is blocked, and executes from cpr_resume() 1001 * before the cageout thread is restarted. By executing in this class, 1002 * we are assured that the kernel cage thread won't miss wakeup calls 1003 * and also CPR's larger kmem_alloc requests will not fail after 1004 * CPR shuts down the cageout kernel thread. 1005 */ 1006 (void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL, 1007 "cageout"); 1008 1009 /* 1010 * Coalesce pages to improve large page availability. A better fix 1011 * would to coalesce pages as they are included in the cage 1012 */ 1013 if (SEGKMEM_USE_LARGEPAGES) { 1014 extern void page_freelist_coalesce_all(int mnode); 1015 page_freelist_coalesce_all(-1); /* do all mnodes */ 1016 } 1017 1018 ksp = kstat_create("kcage", 0, "kcage_page_list", "misc", 1019 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL); 1020 if (ksp != NULL) { 1021 ksp->ks_update = kcage_kstat_update; 1022 ksp->ks_snapshot = kcage_kstat_snapshot; 1023 ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */ 1024 kstat_install(ksp); 1025 } 1026 } 1027 1028 static int 1029 kcage_kstat_update(kstat_t *ksp, int rw) 1030 { 1031 struct kcage_glist *lp; 1032 uint_t count; 1033 1034 if (rw == KSTAT_WRITE) 1035 return (EACCES); 1036 1037 count = 0; 1038 rw_enter(&kcage_range_rwlock, RW_WRITER); 1039 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 1040 if (lp->decr) { 1041 if (lp->curr != lp->lim) { 1042 count++; 1043 } 1044 } else { 1045 if (lp->curr != lp->base) { 1046 count++; 1047 } 1048 } 1049 } 1050 rw_exit(&kcage_range_rwlock); 1051 1052 ksp->ks_ndata = count; 1053 ksp->ks_data_size = count * 2 * sizeof (uint64_t); 1054 1055 return (0); 1056 } 1057 1058 static int 1059 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw) 1060 { 1061 struct kcage_glist *lp; 1062 struct memunit { 1063 uint64_t address; 1064 uint64_t size; 1065 } *kspmem; 1066 1067 if (rw == KSTAT_WRITE) 1068 return (EACCES); 1069 1070 ksp->ks_snaptime = gethrtime(); 1071 1072 kspmem = (struct memunit *)buf; 1073 rw_enter(&kcage_range_rwlock, RW_WRITER); 1074 for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) { 1075 if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) 1076 break; 1077 1078 if (lp->decr) { 1079 if (lp->curr != lp->lim) { 1080 kspmem->address = ptob(lp->curr); 1081 kspmem->size = ptob(lp->lim - lp->curr); 1082 } 1083 } else { 1084 if (lp->curr != lp->base) { 1085 kspmem->address = ptob(lp->base); 1086 kspmem->size = ptob(lp->curr - lp->base); 1087 } 1088 } 1089 } 1090 rw_exit(&kcage_range_rwlock); 1091 1092 return (0); 1093 } 1094 1095 void 1096 kcage_recalc_thresholds() 1097 { 1098 static int first = 1; 1099 static pgcnt_t init_lotsfree; 1100 static pgcnt_t init_desfree; 1101 static pgcnt_t init_minfree; 1102 static pgcnt_t init_throttlefree; 1103 static pgcnt_t init_reserve; 1104 1105 /* TODO: any reason to take more care than this with live editing? */ 1106 mutex_enter(&kcage_cageout_mutex); 1107 mutex_enter(&freemem_lock); 1108 1109 if (first) { 1110 first = 0; 1111 init_lotsfree = kcage_lotsfree; 1112 init_desfree = kcage_desfree; 1113 init_minfree = kcage_minfree; 1114 init_throttlefree = kcage_throttlefree; 1115 init_reserve = kcage_reserve; 1116 } else { 1117 kcage_lotsfree = init_lotsfree; 1118 kcage_desfree = init_desfree; 1119 kcage_minfree = init_minfree; 1120 kcage_throttlefree = init_throttlefree; 1121 kcage_reserve = init_reserve; 1122 } 1123 1124 if (kcage_lotsfree == 0) 1125 kcage_lotsfree = MAX(32, total_pages / 256); 1126 1127 if (kcage_minfree == 0) 1128 kcage_minfree = MAX(32, kcage_lotsfree / 2); 1129 1130 if (kcage_desfree == 0) 1131 kcage_desfree = MAX(32, kcage_minfree); 1132 1133 if (kcage_throttlefree == 0) 1134 kcage_throttlefree = MAX(32, kcage_minfree / 2); 1135 1136 if (kcage_reserve == 0) 1137 kcage_reserve = MIN(32, kcage_throttlefree / 2); 1138 1139 mutex_exit(&freemem_lock); 1140 mutex_exit(&kcage_cageout_mutex); 1141 1142 if (kcage_cageout_ready) { 1143 if (kcage_freemem < kcage_desfree) 1144 kcage_cageout_wakeup(); 1145 1146 if (kcage_needfree) { 1147 mutex_enter(&kcage_throttle_mutex); 1148 cv_broadcast(&kcage_throttle_cv); 1149 mutex_exit(&kcage_throttle_mutex); 1150 } 1151 } 1152 } 1153 1154 /* 1155 * Pageout interface: 1156 * kcage_cageout_init() 1157 */ 1158 void 1159 kcage_cageout_init() 1160 { 1161 if (kcage_on) { 1162 (void) lwp_kernel_create(proc_pageout, kcage_cageout, NULL, 1163 TS_RUN, maxclsyspri - 1); 1164 } 1165 } 1166 1167 1168 /* 1169 * VM Interfaces: 1170 * kcage_create_throttle() 1171 * kcage_freemem_add() 1172 * kcage_freemem_sub() 1173 */ 1174 1175 /* 1176 * Wakeup cageout thread and throttle waiting for the number of pages 1177 * requested to become available. For non-critical requests, a 1178 * timeout is added, since freemem accounting is separate from cage 1179 * freemem accounting: it's possible for us to get stuck and not make 1180 * forward progress even though there was sufficient freemem before 1181 * arriving here. 1182 */ 1183 int 1184 kcage_create_throttle(pgcnt_t npages, int flags) 1185 { 1186 1187 KCAGE_STAT_INCR(kct_calls); /* unprotected incr. */ 1188 1189 /* 1190 * Obviously, we can't throttle the cageout thread since 1191 * we depend on it. We also can't throttle the panic thread. 1192 */ 1193 if (curthread == kcage_cageout_thread || panicstr) { 1194 KCAGE_STAT_INCR(kct_cageout); /* unprotected incr. */ 1195 return (KCT_CRIT); 1196 } 1197 1198 /* 1199 * Don't throttle threads which are critical for proper 1200 * vm management if we're above kcage_throttlefree or 1201 * if freemem is very low. 1202 */ 1203 if (NOMEMWAIT()) { 1204 if (kcage_freemem > kcage_throttlefree + npages) { 1205 KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */ 1206 return (KCT_CRIT); 1207 } else if (freemem < minfree) { 1208 KCAGE_STAT_INCR(kct_critical); /* unprotected incr. */ 1209 return (KCT_CRIT); 1210 } 1211 } 1212 1213 /* 1214 * Don't throttle real-time threads if kcage_freemem > kcage_reserve. 1215 */ 1216 if (DISP_PRIO(curthread) > maxclsyspri && 1217 kcage_freemem > kcage_reserve) { 1218 KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */ 1219 return (KCT_CRIT); 1220 } 1221 1222 /* 1223 * Cause all other threads (which are assumed to not be 1224 * critical to cageout) to wait here until their request 1225 * can be satisfied. Be a little paranoid and wake the 1226 * kernel cage on each loop through this logic. 1227 */ 1228 while (kcage_freemem < kcage_throttlefree + npages) { 1229 ASSERT(kcage_on); 1230 if (kcage_cageout_ready) { 1231 mutex_enter(&kcage_throttle_mutex); 1232 1233 kcage_needfree += npages; 1234 KCAGE_STAT_INCR(kct_wait); 1235 1236 kcage_cageout_wakeup(); 1237 KCAGE_STAT_INCR(kct_cagewake); 1238 1239 cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex); 1240 1241 kcage_needfree -= npages; 1242 1243 mutex_exit(&kcage_throttle_mutex); 1244 } else { 1245 /* 1246 * NOTE: atomics are used just in case we enter 1247 * mp operation before the cageout thread is ready. 1248 */ 1249 atomic_add_long(&kcage_needfree, npages); 1250 1251 kcage_cageout_wakeup(); 1252 KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. */ 1253 1254 atomic_add_long(&kcage_needfree, -npages); 1255 } 1256 1257 if (NOMEMWAIT() && freemem < minfree) { 1258 return (KCT_CRIT); 1259 } 1260 if ((flags & PG_WAIT) == 0) { 1261 pgcnt_t limit = (flags & PG_NORMALPRI) ? 1262 throttlefree : pageout_reserve; 1263 1264 if ((kcage_freemem < kcage_throttlefree + npages) && 1265 (freemem < limit + npages)) { 1266 return (KCT_FAILURE); 1267 } else { 1268 return (KCT_NONCRIT); 1269 } 1270 } 1271 } 1272 return (KCT_NONCRIT); 1273 } 1274 1275 void 1276 kcage_freemem_add(pgcnt_t npages) 1277 { 1278 extern void wakeup_pcgs(void); 1279 1280 atomic_add_long(&kcage_freemem, npages); 1281 1282 wakeup_pcgs(); /* wakeup threads in pcgs() */ 1283 1284 if (kcage_needfree != 0 && 1285 kcage_freemem >= (kcage_throttlefree + kcage_needfree)) { 1286 1287 mutex_enter(&kcage_throttle_mutex); 1288 cv_broadcast(&kcage_throttle_cv); 1289 KCAGE_STAT_INCR(kfa_trottlewake); 1290 mutex_exit(&kcage_throttle_mutex); 1291 } 1292 } 1293 1294 void 1295 kcage_freemem_sub(pgcnt_t npages) 1296 { 1297 atomic_add_long(&kcage_freemem, -npages); 1298 1299 if (kcage_freemem < kcage_desfree) { 1300 kcage_cageout_wakeup(); 1301 KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */ 1302 } 1303 } 1304 1305 /* 1306 * return 0 on failure and 1 on success. 1307 */ 1308 static int 1309 kcage_setnoreloc_pages(page_t *rootpp, se_t se) 1310 { 1311 pgcnt_t npgs, i; 1312 page_t *pp; 1313 pfn_t rootpfn = page_pptonum(rootpp); 1314 uint_t szc; 1315 1316 ASSERT(!PP_ISFREE(rootpp)); 1317 ASSERT(PAGE_LOCKED_SE(rootpp, se)); 1318 if (!group_page_trylock(rootpp, se)) { 1319 return (0); 1320 } 1321 szc = rootpp->p_szc; 1322 if (szc == 0) { 1323 /* 1324 * The szc of a locked page can only change for pages that are 1325 * non-swapfs (i.e. anonymous memory) file system pages. 1326 */ 1327 ASSERT(rootpp->p_vnode != NULL && 1328 !PP_ISKAS(rootpp) && 1329 !IS_SWAPFSVP(rootpp->p_vnode)); 1330 PP_SETNORELOC(rootpp); 1331 return (1); 1332 } 1333 npgs = page_get_pagecnt(szc); 1334 ASSERT(IS_P2ALIGNED(rootpfn, npgs)); 1335 pp = rootpp; 1336 for (i = 0; i < npgs; i++, pp++) { 1337 ASSERT(PAGE_LOCKED_SE(pp, se)); 1338 ASSERT(!PP_ISFREE(pp)); 1339 ASSERT(pp->p_szc == szc); 1340 PP_SETNORELOC(pp); 1341 } 1342 group_page_unlock(rootpp); 1343 return (1); 1344 } 1345 1346 /* 1347 * Attempt to convert page to a caged page (set the P_NORELOC flag). 1348 * If successful and pages is free, move page to the tail of whichever 1349 * list it is on. 1350 * Returns: 1351 * EBUSY page already locked, assimilated but not free. 1352 * ENOMEM page assimilated, but memory too low to relocate. Page not free. 1353 * EAGAIN page not assimilated. Page not free. 1354 * ERANGE page assimilated. Page not root. 1355 * 0 page assimilated. Page free. 1356 * *nfreedp number of pages freed. 1357 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way 1358 * to distinguish between a page that was already a NORELOC page from 1359 * those newly converted to NORELOC pages by this invocation of 1360 * kcage_assimilate_page. 1361 */ 1362 static int 1363 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp) 1364 { 1365 if (page_trylock(pp, SE_EXCL)) { 1366 if (PP_ISNORELOC(pp)) { 1367 check_free_and_return: 1368 if (PP_ISFREE(pp)) { 1369 page_unlock(pp); 1370 *nfreedp = 0; 1371 return (0); 1372 } else { 1373 page_unlock(pp); 1374 return (EBUSY); 1375 } 1376 /*NOTREACHED*/ 1377 } 1378 } else { 1379 if (page_trylock(pp, SE_SHARED)) { 1380 if (PP_ISNORELOC(pp)) 1381 goto check_free_and_return; 1382 } else { 1383 return (EAGAIN); 1384 } 1385 if (!PP_ISFREE(pp)) { 1386 page_unlock(pp); 1387 return (EAGAIN); 1388 } 1389 1390 /* 1391 * Need to upgrade the lock on it and set the NORELOC 1392 * bit. If it is free then remove it from the free 1393 * list so that the platform free list code can keep 1394 * NORELOC pages where they should be. 1395 */ 1396 /* 1397 * Before doing anything, get the exclusive lock. 1398 * This may fail (eg ISM pages are left shared locked). 1399 * If the page is free this will leave a hole in the 1400 * cage. There is no solution yet to this. 1401 */ 1402 if (!page_tryupgrade(pp)) { 1403 page_unlock(pp); 1404 return (EAGAIN); 1405 } 1406 } 1407 1408 ASSERT(PAGE_EXCL(pp)); 1409 1410 if (PP_ISFREE(pp)) { 1411 int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST; 1412 1413 page_list_sub(pp, which); 1414 ASSERT(pp->p_szc == 0); 1415 PP_SETNORELOC(pp); 1416 PLCNT_XFER_NORELOC(pp); 1417 page_list_add(pp, which | PG_LIST_TAIL); 1418 1419 page_unlock(pp); 1420 *nfreedp = 1; 1421 return (0); 1422 } else { 1423 if (pp->p_szc != 0) { 1424 if (!kcage_setnoreloc_pages(pp, SE_EXCL)) { 1425 page_unlock(pp); 1426 return (EAGAIN); 1427 } 1428 ASSERT(PP_ISNORELOC(pp)); 1429 } else { 1430 PP_SETNORELOC(pp); 1431 } 1432 PLCNT_XFER_NORELOC(pp); 1433 return (kcage_invalidate_page(pp, nfreedp)); 1434 } 1435 /*NOTREACHED*/ 1436 } 1437 1438 static int 1439 kcage_expand() 1440 { 1441 int did_something = 0; 1442 1443 spgcnt_t wanted; 1444 pfn_t pfn; 1445 page_t *pp; 1446 /* TODO: we don't really need n any more? */ 1447 pgcnt_t n; 1448 pgcnt_t nf, nfreed; 1449 1450 /* 1451 * Expand the cage if available cage memory is really low. Calculate 1452 * the amount required to return kcage_freemem to the level of 1453 * kcage_lotsfree, or to satisfy throttled requests, whichever is 1454 * more. It is rare for their sum to create an artificial threshold 1455 * above kcage_lotsfree, but it is possible. 1456 * 1457 * Exit early if expansion amount is equal to or less than zero. 1458 * (<0 is possible if kcage_freemem rises suddenly.) 1459 * 1460 * Exit early when freemem drops below pageout_reserve plus the request. 1461 */ 1462 wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree) 1463 - kcage_freemem; 1464 if (wanted <= 0) { 1465 return (0); 1466 } else if (freemem < pageout_reserve + wanted) { 1467 KCAGE_STAT_INCR(ke_lowfreemem); 1468 return (0); 1469 } 1470 1471 KCAGE_STAT_INCR(ke_calls); 1472 KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted); 1473 1474 /* 1475 * Assimilate more pages from the global page pool into the cage. 1476 */ 1477 n = 0; /* number of pages PP_SETNORELOC'd */ 1478 nf = 0; /* number of those actually free */ 1479 while (kcage_on && nf < wanted) { 1480 pfn = kcage_get_pfn(1); 1481 if (pfn == PFN_INVALID) { /* eek! no where to grow */ 1482 KCAGE_STAT_INCR(ke_nopfn); 1483 goto terminate; 1484 } 1485 1486 KCAGE_STAT_INCR_SCAN(ke_examined); 1487 1488 if ((pp = page_numtopp_nolock(pfn)) == NULL) { 1489 KCAGE_STAT_INCR(ke_nopaget); 1490 continue; 1491 } 1492 KCAGEPAGETS_INC(); 1493 /* 1494 * Sanity check. Skip this pfn if it is 1495 * being deleted. 1496 */ 1497 if (pfn_is_being_deleted(pfn)) { 1498 KCAGE_STAT_INCR(ke_deleting); 1499 continue; 1500 } 1501 1502 if (PP_ISNORELOC(pp)) { 1503 KCAGE_STAT_INCR(ke_isnoreloc); 1504 continue; 1505 } 1506 1507 switch (kcage_assimilate_page(pp, &nfreed)) { 1508 case 0: /* assimilated, page is free */ 1509 KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed); 1510 did_something = 1; 1511 nf += nfreed; 1512 n++; 1513 break; 1514 1515 case EBUSY: /* assimilated, page not free */ 1516 case ERANGE: /* assimilated, page not root */ 1517 KCAGE_STAT_INCR_SCAN(ke_gotone); 1518 did_something = 1; 1519 n++; 1520 break; 1521 1522 case ENOMEM: /* assimilated, but no mem */ 1523 KCAGE_STAT_INCR(ke_terminate); 1524 did_something = 1; 1525 n++; 1526 goto terminate; 1527 1528 case EAGAIN: /* can't assimilate */ 1529 KCAGE_STAT_INCR_SCAN(ke_lefthole); 1530 break; 1531 1532 default: /* catch this with debug kernels */ 1533 ASSERT(0); 1534 break; 1535 } 1536 } 1537 1538 /* 1539 * Realign cage edge with the nearest physical address 1540 * boundry for big pages. This is done to give us a 1541 * better chance of actually getting usable big pages 1542 * in the cage. 1543 */ 1544 1545 terminate: 1546 1547 return (did_something); 1548 } 1549 1550 /* 1551 * Relocate page opp (Original Page Pointer) from cage pool to page rpp 1552 * (Replacement Page Pointer) in the global pool. Page opp will be freed 1553 * if relocation is successful, otherwise it is only unlocked. 1554 * On entry, page opp must be exclusively locked and not free. 1555 * *nfreedp: number of pages freed. 1556 */ 1557 static int 1558 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp) 1559 { 1560 page_t *opp = pp; 1561 page_t *rpp = NULL; 1562 spgcnt_t npgs; 1563 int result; 1564 1565 ASSERT(!PP_ISFREE(opp)); 1566 ASSERT(PAGE_EXCL(opp)); 1567 1568 result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL); 1569 *nfreedp = npgs; 1570 if (result == 0) { 1571 while (npgs-- > 0) { 1572 page_t *tpp; 1573 1574 ASSERT(rpp != NULL); 1575 tpp = rpp; 1576 page_sub(&rpp, tpp); 1577 page_unlock(tpp); 1578 } 1579 1580 ASSERT(rpp == NULL); 1581 1582 return (0); /* success */ 1583 } 1584 1585 page_unlock(opp); 1586 return (result); 1587 } 1588 1589 /* 1590 * Based on page_invalidate_pages() 1591 * 1592 * Kcage_invalidate_page() uses page_relocate() twice. Both instances 1593 * of use must be updated to match the new page_relocate() when it 1594 * becomes available. 1595 * 1596 * Return result of kcage_relocate_page or zero if page was directly freed. 1597 * *nfreedp: number of pages freed. 1598 */ 1599 static int 1600 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp) 1601 { 1602 int result; 1603 1604 #if defined(__sparc) 1605 ASSERT(pp->p_vnode != &promvp); 1606 #endif /* __sparc */ 1607 ASSERT(!PP_ISFREE(pp)); 1608 ASSERT(PAGE_EXCL(pp)); 1609 1610 /* 1611 * Is this page involved in some I/O? shared? 1612 * The page_struct_lock need not be acquired to 1613 * examine these fields since the page has an 1614 * "exclusive" lock. 1615 */ 1616 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1617 result = kcage_relocate_page(pp, nfreedp); 1618 #ifdef KCAGE_STATS 1619 if (result == 0) 1620 KCAGE_STAT_INCR_SCAN(kip_reloclocked); 1621 else if (result == ENOMEM) 1622 KCAGE_STAT_INCR_SCAN(kip_nomem); 1623 #endif 1624 return (result); 1625 } 1626 1627 ASSERT(pp->p_vnode->v_type != VCHR); 1628 1629 /* 1630 * Unload the mappings and check if mod bit is set. 1631 */ 1632 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1633 1634 if (hat_ismod(pp)) { 1635 result = kcage_relocate_page(pp, nfreedp); 1636 #ifdef KCAGE_STATS 1637 if (result == 0) 1638 KCAGE_STAT_INCR_SCAN(kip_relocmod); 1639 else if (result == ENOMEM) 1640 KCAGE_STAT_INCR_SCAN(kip_nomem); 1641 #endif 1642 return (result); 1643 } 1644 1645 if (!page_try_demote_pages(pp)) { 1646 KCAGE_STAT_INCR_SCAN(kip_demotefailed); 1647 page_unlock(pp); 1648 return (EAGAIN); 1649 } 1650 1651 /* LINTED: constant in conditional context */ 1652 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1653 KCAGE_STAT_INCR_SCAN(kip_destroy); 1654 *nfreedp = 1; 1655 return (0); 1656 } 1657 1658 /* 1659 * Expand cage only if there is not enough memory to satisfy 1660 * current request. We only do one (complete) scan of the cage. 1661 * Dirty pages and pages with shared mappings are skipped; 1662 * Locked pages (p_lckcnt and p_cowcnt) are also skipped. 1663 * All other pages are freed (if they can be locked). 1664 * This may affect caching of user pages which are in cage by freeing/ 1665 * reclaiming them more often. However cage is mainly for kernel (heap) 1666 * pages and we want to keep user pages outside of cage. The above policy 1667 * should also reduce cage expansion plus it should speed up cage mem 1668 * allocations. 1669 */ 1670 static void 1671 kcage_cageout() 1672 { 1673 pfn_t pfn; 1674 page_t *pp; 1675 callb_cpr_t cprinfo; 1676 int did_something; 1677 pfn_t start_pfn; 1678 ulong_t shared_level = 8; 1679 pgcnt_t nfreed; 1680 #ifdef KCAGE_STATS 1681 clock_t scan_start; 1682 #endif 1683 1684 CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex, 1685 callb_generic_cpr, "cageout"); 1686 1687 mutex_enter(&kcage_cageout_mutex); 1688 kcage_cageout_thread = curthread; 1689 1690 pfn = PFN_INVALID; /* force scan reset */ 1691 start_pfn = PFN_INVALID; /* force init with 1st cage pfn */ 1692 kcage_cageout_ready = 1; /* switch kcage_cageout_wakeup mode */ 1693 1694 loop: 1695 /* 1696 * Wait here. Sooner or later, kcage_freemem_sub() will notice 1697 * that kcage_freemem is less than kcage_desfree. When it does 1698 * notice, kcage_freemem_sub() will wake us up via call to 1699 * kcage_cageout_wakeup(). 1700 */ 1701 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1702 cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex); 1703 CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex); 1704 1705 KCAGE_STAT_INCR(kt_wakeups); 1706 KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem); 1707 KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem); 1708 #ifdef KCAGE_STATS 1709 scan_start = ddi_get_lbolt(); 1710 #endif 1711 if (!kcage_on) 1712 goto loop; 1713 1714 KCAGE_STAT_INCR(kt_scans); 1715 KCAGE_STAT_INCR_SCAN(kt_passes); 1716 1717 did_something = 0; 1718 while (kcage_freemem < kcage_lotsfree + kcage_needfree) { 1719 1720 if ((pfn = kcage_walk_cage(pfn == PFN_INVALID)) == 1721 PFN_INVALID) { 1722 break; 1723 } 1724 1725 if (start_pfn == PFN_INVALID) 1726 start_pfn = pfn; 1727 else if (start_pfn == pfn) { 1728 /* 1729 * Did a complete walk of kernel cage, but didn't free 1730 * any pages. If only one cpu is active then 1731 * stop kernel cage walk and try expanding. 1732 */ 1733 if (cp_default.cp_ncpus == 1 && did_something == 0) { 1734 KCAGE_STAT_INCR(kt_cageout_break); 1735 break; 1736 } 1737 } 1738 1739 pp = page_numtopp_nolock(pfn); 1740 if (pp == NULL) { 1741 continue; 1742 } 1743 1744 KCAGE_STAT_INCR_SCAN(kt_examined); 1745 1746 /* 1747 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside 1748 * of the lock. If one is missed it will be seen next 1749 * time through. 1750 * 1751 * Skip non-caged-pages. These pages can exist in the cage 1752 * because, if during cage expansion, a page is 1753 * encountered that is long-term locked the lock prevents the 1754 * expansion logic from setting the P_NORELOC flag. Hence, 1755 * non-caged-pages surrounded by caged-pages. 1756 */ 1757 if (!PP_ISNORELOC(pp)) { 1758 switch (kcage_assimilate_page(pp, &nfreed)) { 1759 case 0: 1760 did_something = 1; 1761 KCAGE_STAT_NINCR_SCAN(kt_gotonefree, 1762 nfreed); 1763 break; 1764 1765 case EBUSY: 1766 case ERANGE: 1767 did_something = 1; 1768 KCAGE_STAT_INCR_SCAN(kt_gotone); 1769 break; 1770 1771 case EAGAIN: 1772 case ENOMEM: 1773 break; 1774 1775 default: 1776 /* catch this with debug kernels */ 1777 ASSERT(0); 1778 break; 1779 } 1780 1781 continue; 1782 } else { 1783 if (PP_ISFREE(pp)) { 1784 continue; 1785 } 1786 1787 if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) || 1788 !page_trylock(pp, SE_EXCL)) { 1789 KCAGE_STAT_INCR_SCAN(kt_cantlock); 1790 continue; 1791 } 1792 1793 /* P_NORELOC bit should not have gone away. */ 1794 ASSERT(PP_ISNORELOC(pp)); 1795 if (PP_ISFREE(pp) || (PP_ISKAS(pp) && 1796 pp->p_lckcnt > 0)) { 1797 page_unlock(pp); 1798 continue; 1799 } 1800 1801 if (hat_page_checkshare(pp, shared_level)) { 1802 page_unlock(pp); 1803 KCAGE_STAT_INCR_SCAN(kt_skipshared); 1804 continue; 1805 } 1806 1807 if (kcage_invalidate_page(pp, &nfreed) == 0) { 1808 did_something = 1; 1809 KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed); 1810 } 1811 1812 /* 1813 * No need to drop the page lock here. 1814 * Kcage_invalidate_page has done that for us 1815 * either explicitly or through a page_free. 1816 */ 1817 } 1818 } 1819 1820 if (kcage_freemem < kcage_throttlefree + kcage_needfree) 1821 (void) kcage_expand(); 1822 1823 if (kcage_on && kcage_cageout_ready) 1824 cv_broadcast(&kcage_throttle_cv); 1825 1826 KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem); 1827 KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem); 1828 KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start); 1829 KCAGE_STAT_INC_SCAN_INDEX; 1830 goto loop; 1831 1832 /*NOTREACHED*/ 1833 } 1834 1835 void 1836 kcage_cageout_wakeup() 1837 { 1838 if (mutex_tryenter(&kcage_cageout_mutex)) { 1839 if (kcage_cageout_ready) { 1840 cv_signal(&kcage_cageout_cv); 1841 } else if (kcage_freemem < kcage_minfree || kcage_needfree) { 1842 /* 1843 * Available cage memory is really low. Time to 1844 * start expanding the cage. However, the 1845 * kernel cage thread is not yet ready to 1846 * do the work. Use *this* thread, which is 1847 * most likely to be t0, to do the work. 1848 */ 1849 KCAGE_STAT_INCR(kcw_expandearly); 1850 (void) kcage_expand(); 1851 KCAGE_STAT_INC_SCAN_INDEX; 1852 } 1853 1854 mutex_exit(&kcage_cageout_mutex); 1855 } 1856 /* else, kernel cage thread is already running */ 1857 } 1858 1859 void 1860 kcage_tick() 1861 { 1862 /* 1863 * Once per second we wake up all the threads throttled 1864 * waiting for cage memory, in case we've become stuck 1865 * and haven't made forward progress expanding the cage. 1866 */ 1867 if (kcage_on && kcage_cageout_ready) 1868 cv_broadcast(&kcage_throttle_cv); 1869 } 1870