1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/param.h> 28 #include <sys/thread.h> 29 #include <sys/proc.h> 30 #include <sys/callb.h> 31 #include <sys/vnode.h> 32 #include <sys/debug.h> 33 #include <sys/systm.h> /* for bzero */ 34 #include <sys/memlist.h> 35 #include <sys/cmn_err.h> 36 #include <sys/sysmacros.h> 37 #include <sys/vmsystm.h> /* for NOMEMWAIT() */ 38 #include <sys/atomic.h> /* used to update kcage_freemem */ 39 #include <sys/kmem.h> /* for kmem_reap */ 40 #include <sys/errno.h> 41 #include <sys/mem_cage.h> 42 #include <vm/seg_kmem.h> 43 #include <vm/page.h> 44 #include <vm/hat.h> 45 #include <vm/vm_dep.h> 46 #include <sys/mem_config.h> 47 #include <sys/lgrp.h> 48 #include <sys/rwlock.h> 49 #include <sys/cpupart.h> 50 51 extern pri_t maxclsyspri; 52 53 #ifdef DEBUG 54 #define KCAGE_STATS 55 #endif 56 57 #ifdef KCAGE_STATS 58 59 #define KCAGE_STATS_VERSION 9 /* can help report generators */ 60 #define KCAGE_STATS_NSCANS 256 /* depth of scan statistics buffer */ 61 62 struct kcage_stats_scan { 63 /* managed by KCAGE_STAT_* macros */ 64 clock_t scan_lbolt; 65 uint_t scan_id; 66 67 /* set in kcage_cageout() */ 68 uint_t kt_passes; 69 clock_t kt_ticks; 70 pgcnt_t kt_kcage_freemem_start; 71 pgcnt_t kt_kcage_freemem_end; 72 pgcnt_t kt_freemem_start; 73 pgcnt_t kt_freemem_end; 74 uint_t kt_examined; 75 uint_t kt_cantlock; 76 uint_t kt_gotone; 77 uint_t kt_gotonefree; 78 uint_t kt_skiplevel; 79 uint_t kt_skipshared; 80 uint_t kt_skiprefd; 81 uint_t kt_destroy; 82 83 /* set in kcage_invalidate_page() */ 84 uint_t kip_reloclocked; 85 uint_t kip_relocmod; 86 uint_t kip_destroy; 87 uint_t kip_nomem; 88 uint_t kip_demotefailed; 89 90 /* set in kcage_expand() */ 91 uint_t ke_wanted; 92 uint_t ke_examined; 93 uint_t ke_lefthole; 94 uint_t ke_gotone; 95 uint_t ke_gotonefree; 96 }; 97 98 struct kcage_stats { 99 /* managed by KCAGE_STAT_* macros */ 100 uint_t version; 101 uint_t size; 102 103 /* set in kcage_cageout */ 104 uint_t kt_wakeups; 105 uint_t kt_scans; 106 uint_t kt_cageout_break; 107 108 /* set in kcage_expand */ 109 uint_t ke_calls; 110 uint_t ke_nopfn; 111 uint_t ke_nopaget; 112 uint_t ke_isnoreloc; 113 uint_t ke_deleting; 114 uint_t ke_lowfreemem; 115 uint_t ke_terminate; 116 117 /* set in kcage_freemem_add() */ 118 uint_t kfa_trottlewake; 119 120 /* set in kcage_freemem_sub() */ 121 uint_t kfs_cagewake; 122 123 /* set in kcage_create_throttle */ 124 uint_t kct_calls; 125 uint_t kct_cageout; 126 uint_t kct_critical; 127 uint_t kct_exempt; 128 uint_t kct_cagewake; 129 uint_t kct_wait; 130 uint_t kct_progress; 131 uint_t kct_noprogress; 
132 uint_t kct_timeout; 133 134 /* set in kcage_cageout_wakeup */ 135 uint_t kcw_expandearly; 136 137 /* managed by KCAGE_STAT_* macros */ 138 uint_t scan_array_size; 139 uint_t scan_index; 140 struct kcage_stats_scan scans[KCAGE_STATS_NSCANS]; 141 }; 142 143 static struct kcage_stats kcage_stats; 144 static struct kcage_stats_scan kcage_stats_scan_zero; 145 146 /* 147 * No real need for atomics here. For the most part the incs and sets are 148 * done by the kernel cage thread. There are a few that are done by any 149 * number of other threads. Those cases are noted by comments. 150 */ 151 #define KCAGE_STAT_INCR(m) kcage_stats.m++ 152 153 #define KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v) 154 155 #define KCAGE_STAT_INCR_SCAN(m) \ 156 KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m) 157 158 #define KCAGE_STAT_NINCR_SCAN(m, v) \ 159 KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v) 160 161 #define KCAGE_STAT_SET(m, v) kcage_stats.m = (v) 162 163 #define KCAGE_STAT_SETZ(m, v) \ 164 if (kcage_stats.m == 0) kcage_stats.m = (v) 165 166 #define KCAGE_STAT_SET_SCAN(m, v) \ 167 KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v) 168 169 #define KCAGE_STAT_SETZ_SCAN(m, v) \ 170 KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v) 171 172 #define KCAGE_STAT_INC_SCAN_INDEX \ 173 KCAGE_STAT_SET_SCAN(scan_lbolt, ddi_get_lbolt()); \ 174 KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \ 175 kcage_stats.scan_index = \ 176 (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \ 177 kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero 178 179 #define KCAGE_STAT_INIT_SCAN_INDEX \ 180 kcage_stats.version = KCAGE_STATS_VERSION; \ 181 kcage_stats.size = sizeof (kcage_stats); \ 182 kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \ 183 kcage_stats.scan_index = 0 184 185 #else /* KCAGE_STATS */ 186 187 #define KCAGE_STAT_INCR(v) 188 #define KCAGE_STAT_NINCR(m, v) 189 #define KCAGE_STAT_INCR_SCAN(v) 190 #define KCAGE_STAT_NINCR_SCAN(m, v) 191 #define KCAGE_STAT_SET(m, v) 192 #define KCAGE_STAT_SETZ(m, v) 193 #define KCAGE_STAT_SET_SCAN(m, v) 194 #define KCAGE_STAT_SETZ_SCAN(m, v) 195 #define KCAGE_STAT_INC_SCAN_INDEX 196 #define KCAGE_STAT_INIT_SCAN_INDEX 197 198 #endif /* KCAGE_STATS */ 199 200 static kmutex_t kcage_throttle_mutex; /* protects kcage_throttle_cv */ 201 static kcondvar_t kcage_throttle_cv; 202 203 static kmutex_t kcage_cageout_mutex; /* protects cv and ready flag */ 204 static kcondvar_t kcage_cageout_cv; /* cageout thread naps here */ 205 static int kcage_cageout_ready; /* nonzero when cageout thread ready */ 206 kthread_id_t kcage_cageout_thread; /* to aid debugging */ 207 208 static krwlock_t kcage_range_rwlock; /* protects kcage_glist elements */ 209 210 /* 211 * Cage expansion happens within a range. 212 */ 213 struct kcage_glist { 214 struct kcage_glist *next; 215 pfn_t base; 216 pfn_t lim; 217 pfn_t curr; 218 int decr; 219 }; 220 221 static struct kcage_glist *kcage_glist; 222 static struct kcage_glist *kcage_current_glist; 223 224 /* 225 * The firstfree element is provided so that kmem_alloc can be avoided 226 * until that cage has somewhere to go. This is not currently a problem 227 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va. 
228 */ 229 static vmem_t *kcage_arena; 230 static struct kcage_glist kcage_glist_firstfree; 231 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree; 232 233 /* 234 * Miscellaneous forward references 235 */ 236 static struct kcage_glist *kcage_glist_alloc(void); 237 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **); 238 static void kcage_cageout(void); 239 static int kcage_invalidate_page(page_t *, pgcnt_t *); 240 static int kcage_setnoreloc_pages(page_t *, se_t); 241 static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t); 242 static void kcage_init(pgcnt_t preferred_size); 243 static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs); 244 245 /* 246 * Kernel Memory Cage counters and thresholds. 247 */ 248 int kcage_on = 0; 249 pgcnt_t kcage_freemem; 250 pgcnt_t kcage_needfree; 251 pgcnt_t kcage_lotsfree; 252 pgcnt_t kcage_desfree; 253 pgcnt_t kcage_minfree; 254 pgcnt_t kcage_throttlefree; 255 pgcnt_t kcage_reserve; 256 int kcage_maxwait = 10; /* in seconds */ 257 258 /* when we use lp for kmem we start the cage at a higher initial value */ 259 pgcnt_t kcage_kmemlp_mincage; 260 261 #ifdef DEBUG 262 pgcnt_t kcage_pagets; 263 #define KCAGEPAGETS_INC() kcage_pagets++ 264 #else 265 #define KCAGEPAGETS_INC() 266 #endif 267 268 /* kstats to export what pages are currently caged */ 269 kmutex_t kcage_kstat_lock; 270 static int kcage_kstat_update(kstat_t *ksp, int rw); 271 static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw); 272 273 /* 274 * Startup and Dynamic Reconfiguration interfaces. 275 * kcage_range_add() 276 * kcage_range_del() 277 * kcage_range_delete_post_mem_del() 278 * kcage_range_init() 279 * kcage_set_thresholds() 280 */ 281 282 /* 283 * Called from page_get_contig_pages to get the approximate kcage pfn range 284 * for exclusion from search for contiguous pages. This routine is called 285 * without kcage_range lock (kcage routines can call page_get_contig_pages 286 * through page_relocate) and with the assumption, based on kcage_range_add, 287 * that kcage_current_glist always contain a valid pointer. 288 */ 289 290 int 291 kcage_current_pfn(pfn_t *pfncur) 292 { 293 struct kcage_glist *lp = kcage_current_glist; 294 295 ASSERT(kcage_on); 296 297 ASSERT(lp != NULL); 298 299 *pfncur = lp->curr; 300 301 return (lp->decr); 302 } 303 304 /* 305 * Called from vm_pagelist.c during coalesce to find kernel cage regions 306 * within an mnode. Looks for the lowest range between lo and hi. 307 * 308 * Kernel cage memory is defined between kcage_glist and kcage_current_glist. 309 * Non-cage memory is defined between kcage_current_glist and list end. 310 * 311 * If incage is set, returns the lowest kcage range. Otherwise returns lowest 312 * non-cage range. 313 * 314 * Returns zero on success and nlo, nhi: 315 * lo <= nlo < nhi <= hi 316 * Returns non-zero if no overlapping range is found. 317 */ 318 int 319 kcage_next_range(int incage, pfn_t lo, pfn_t hi, 320 pfn_t *nlo, pfn_t *nhi) 321 { 322 struct kcage_glist *lp; 323 pfn_t tlo = hi; 324 pfn_t thi = hi; 325 326 ASSERT(lo <= hi); 327 328 /* 329 * Reader lock protects the list, but kcage_get_pfn 330 * running concurrently may advance kcage_current_glist 331 * and also update kcage_current_glist->curr. Page 332 * coalesce can handle this race condition. 333 */ 334 rw_enter(&kcage_range_rwlock, RW_READER); 335 336 for (lp = incage ? 
kcage_glist : kcage_current_glist; 337 lp != NULL; lp = lp->next) { 338 339 pfn_t klo, khi; 340 341 /* find the range limits in this element */ 342 if ((incage && lp->decr) || (!incage && !lp->decr)) { 343 klo = lp->curr; 344 khi = lp->lim; 345 } else { 346 klo = lp->base; 347 khi = lp->curr; 348 } 349 350 /* handle overlap */ 351 if (klo < tlo && klo < khi && lo < khi && klo < hi) { 352 tlo = MAX(lo, klo); 353 thi = MIN(hi, khi); 354 if (tlo == lo) 355 break; 356 } 357 358 /* check end of kcage */ 359 if (incage && lp == kcage_current_glist) { 360 break; 361 } 362 } 363 364 rw_exit(&kcage_range_rwlock); 365 366 /* return non-zero if no overlapping range found */ 367 if (tlo == thi) 368 return (1); 369 370 ASSERT(lo <= tlo && tlo < thi && thi <= hi); 371 372 /* return overlapping range */ 373 *nlo = tlo; 374 *nhi = thi; 375 return (0); 376 } 377 378 void 379 kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size) 380 { 381 int ret = 0; 382 383 ASSERT(kcage_arena == NULL); 384 kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t), 385 segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP); 386 ASSERT(kcage_arena != NULL); 387 388 if (d == KCAGE_DOWN) { 389 while (ml->next != NULL) 390 ml = ml->next; 391 } 392 393 rw_enter(&kcage_range_rwlock, RW_WRITER); 394 395 while (ml != NULL) { 396 ret = kcage_range_add_internal(btop(ml->address), 397 btop(ml->size), d); 398 if (ret) 399 panic("kcage_range_add_internal failed: " 400 "ml=%p, ret=0x%x\n", (void *)ml, ret); 401 402 ml = (d == KCAGE_DOWN ? ml->prev : ml->next); 403 } 404 405 rw_exit(&kcage_range_rwlock); 406 407 if (ret == 0) 408 kcage_init(preferred_size); 409 } 410 411 /* 412 * Third arg controls direction of growth: 0: increasing pfns, 413 * 1: decreasing. 414 */ 415 static int 416 kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d) 417 { 418 struct kcage_glist *new, **lpp; 419 pfn_t lim; 420 421 ASSERT(rw_write_held(&kcage_range_rwlock)); 422 423 ASSERT(npgs != 0); 424 if (npgs == 0) 425 return (EINVAL); 426 427 lim = base + npgs; 428 429 ASSERT(lim > base); 430 if (lim <= base) 431 return (EINVAL); 432 433 new = kcage_glist_alloc(); 434 if (new == NULL) { 435 return (ENOMEM); 436 } 437 438 new->base = base; 439 new->lim = lim; 440 new->decr = (d == KCAGE_DOWN); 441 if (new->decr != 0) 442 new->curr = new->lim; 443 else 444 new->curr = new->base; 445 /* 446 * Any overlapping existing ranges are removed by deleting 447 * from the new list as we search for the tail. 
448 */ 449 lpp = &kcage_glist; 450 while (*lpp != NULL) { 451 int ret; 452 ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new); 453 if (ret != 0) 454 return (ret); 455 lpp = &(*lpp)->next; 456 } 457 458 *lpp = new; 459 460 if (kcage_current_glist == NULL) { 461 kcage_current_glist = kcage_glist; 462 } 463 464 return (0); 465 } 466 467 int 468 kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d) 469 { 470 int ret; 471 472 rw_enter(&kcage_range_rwlock, RW_WRITER); 473 ret = kcage_range_add_internal(base, npgs, d); 474 rw_exit(&kcage_range_rwlock); 475 return (ret); 476 } 477 478 /* 479 * Calls to add and delete must be protected by kcage_range_rwlock 480 */ 481 static int 482 kcage_range_delete_internal(pfn_t base, pgcnt_t npgs) 483 { 484 struct kcage_glist *lp; 485 pfn_t lim; 486 487 ASSERT(rw_write_held(&kcage_range_rwlock)); 488 489 ASSERT(npgs != 0); 490 if (npgs == 0) 491 return (EINVAL); 492 493 lim = base + npgs; 494 495 ASSERT(lim > base); 496 if (lim <= base) 497 return (EINVAL); 498 499 /* 500 * Check if the delete is OK first as a number of elements 501 * might be involved and it will be difficult to go 502 * back and undo (can't just add the range back in). 503 */ 504 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 505 /* 506 * If there have been no pages allocated from this 507 * element, we don't need to check it. 508 */ 509 if ((lp->decr == 0 && lp->curr == lp->base) || 510 (lp->decr != 0 && lp->curr == lp->lim)) 511 continue; 512 /* 513 * If the element does not overlap, its OK. 514 */ 515 if (base >= lp->lim || lim <= lp->base) 516 continue; 517 /* 518 * Overlapping element: Does the range to be deleted 519 * overlap the area already used? If so fail. 520 */ 521 if (lp->decr == 0 && base < lp->curr && lim >= lp->base) { 522 return (EBUSY); 523 } 524 if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) { 525 return (EBUSY); 526 } 527 } 528 return (kcage_glist_delete(base, lim, &kcage_glist)); 529 } 530 531 int 532 kcage_range_delete(pfn_t base, pgcnt_t npgs) 533 { 534 int ret; 535 536 rw_enter(&kcage_range_rwlock, RW_WRITER); 537 ret = kcage_range_delete_internal(base, npgs); 538 rw_exit(&kcage_range_rwlock); 539 return (ret); 540 } 541 542 /* 543 * Calls to add and delete must be protected by kcage_range_rwlock. 544 * This routine gets called after successful Solaris memory 545 * delete operation from DR post memory delete routines. 546 */ 547 static int 548 kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs) 549 { 550 pfn_t lim; 551 552 ASSERT(rw_write_held(&kcage_range_rwlock)); 553 554 ASSERT(npgs != 0); 555 if (npgs == 0) 556 return (EINVAL); 557 558 lim = base + npgs; 559 560 ASSERT(lim > base); 561 if (lim <= base) 562 return (EINVAL); 563 564 return (kcage_glist_delete(base, lim, &kcage_glist)); 565 } 566 567 int 568 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs) 569 { 570 int ret; 571 572 rw_enter(&kcage_range_rwlock, RW_WRITER); 573 ret = kcage_range_delete_post_mem_del_internal(base, npgs); 574 rw_exit(&kcage_range_rwlock); 575 return (ret); 576 } 577 578 /* 579 * No locking is required here as the whole operation is covered 580 * by kcage_range_rwlock writer lock. 
581 */ 582 static struct kcage_glist * 583 kcage_glist_alloc(void) 584 { 585 struct kcage_glist *new; 586 587 if ((new = kcage_glist_freelist) != NULL) { 588 kcage_glist_freelist = new->next; 589 } else if (kernel_cage_enable) { 590 new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP); 591 } else { 592 /* 593 * On DR supported platforms we allow memory add 594 * even when kernel cage is disabled. "kcage_arena" is 595 * created only when kernel cage is enabled. 596 */ 597 new = kmem_zalloc(sizeof (*new), KM_NOSLEEP); 598 } 599 600 if (new != NULL) 601 bzero(new, sizeof (*new)); 602 603 return (new); 604 } 605 606 static void 607 kcage_glist_free(struct kcage_glist *lp) 608 { 609 lp->next = kcage_glist_freelist; 610 kcage_glist_freelist = lp; 611 } 612 613 static int 614 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp) 615 { 616 struct kcage_glist *lp, *prev = *lpp; 617 618 while ((lp = *lpp) != NULL) { 619 if (lim > lp->base && base < lp->lim) { 620 /* The delete range overlaps this element. */ 621 if (base <= lp->base && lim >= lp->lim) { 622 /* Delete whole element. */ 623 *lpp = lp->next; 624 if (lp == kcage_current_glist) { 625 /* This can never happen. */ 626 ASSERT(kcage_current_glist != prev); 627 kcage_current_glist = prev; 628 } 629 kcage_glist_free(lp); 630 continue; 631 } 632 633 /* Partial delete. */ 634 if (base > lp->base && lim < lp->lim) { 635 struct kcage_glist *new; 636 637 /* 638 * Remove a section from the middle, 639 * need to allocate a new element. 640 */ 641 new = kcage_glist_alloc(); 642 if (new == NULL) { 643 return (ENOMEM); 644 } 645 646 /* 647 * Transfer unused range to new. 648 * Edit lp in place to preserve 649 * kcage_current_glist. 650 */ 651 new->decr = lp->decr; 652 if (new->decr != 0) { 653 new->base = lp->base; 654 new->lim = base; 655 new->curr = base; 656 657 lp->base = lim; 658 } else { 659 new->base = lim; 660 new->lim = lp->lim; 661 new->curr = new->base; 662 663 lp->lim = base; 664 } 665 666 /* Insert new. */ 667 new->next = lp->next; 668 lp->next = new; 669 lpp = &lp->next; 670 } else { 671 /* Delete part of current block. */ 672 if (base > lp->base) { 673 ASSERT(lim >= lp->lim); 674 ASSERT(base < lp->lim); 675 if (lp->decr != 0 && 676 lp->curr == lp->lim) 677 lp->curr = base; 678 lp->lim = base; 679 } else { 680 ASSERT(base <= lp->base); 681 ASSERT(lim > lp->base); 682 if (lp->decr == 0 && 683 lp->curr == lp->base) 684 lp->curr = lim; 685 lp->base = lim; 686 } 687 } 688 } 689 prev = *lpp; 690 lpp = &(*lpp)->next; 691 } 692 693 return (0); 694 } 695 696 /* 697 * If lockit is 1, kcage_get_pfn holds the 698 * reader lock for kcage_range_rwlock. 699 * Changes to lp->curr can cause race conditions, but 700 * they are handled by higher level code (see kcage_next_range.) 701 */ 702 static pfn_t 703 kcage_get_pfn(int lockit) 704 { 705 struct kcage_glist *lp; 706 pfn_t pfn = PFN_INVALID; 707 708 if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER)) 709 return (pfn); 710 711 lp = kcage_current_glist; 712 while (lp != NULL) { 713 if (lp->decr != 0) { 714 if (lp->curr != lp->base) { 715 pfn = --lp->curr; 716 break; 717 } 718 } else { 719 if (lp->curr != lp->lim) { 720 pfn = lp->curr++; 721 break; 722 } 723 } 724 725 lp = lp->next; 726 if (lp) 727 kcage_current_glist = lp; 728 } 729 730 if (lockit) 731 rw_exit(&kcage_range_rwlock); 732 return (pfn); 733 } 734 735 /* 736 * Walk the physical address space of the cage. 737 * This routine does not guarantee to return PFNs in the order 738 * in which they were allocated to the cage.
Instead, it walks 739 * each range as it appears on the growth list, returning the PFNs in each 740 * range in ascending order. 741 * 742 * To begin scanning at lower edge of cage, reset should be nonzero. 743 * To step through cage, reset should be zero. 744 * 745 * PFN_INVALID will be returned when the upper end of the cage is 746 * reached -- indicating a full scan of the cage has been completed since 747 * previous reset. PFN_INVALID will continue to be returned until 748 * kcage_walk_cage is reset. 749 * 750 * It is possible to receive a PFN_INVALID result on reset if a growth 751 * list is not installed or if none of the PFNs in the installed list have 752 * been allocated to the cage. In other words, there is no cage. 753 * 754 * Caller need not hold kcage_range_rwlock while calling this function 755 * as the front part of the list is static - pages never come out of 756 * the cage. 757 * 758 * The caller is expected to only be kcage_cageout(). 759 */ 760 static pfn_t 761 kcage_walk_cage(int reset) 762 { 763 static struct kcage_glist *lp = NULL; 764 static pfn_t pfn; 765 766 if (reset) 767 lp = NULL; 768 if (lp == NULL) { 769 lp = kcage_glist; 770 pfn = PFN_INVALID; 771 } 772 again: 773 if (pfn == PFN_INVALID) { 774 if (lp == NULL) 775 return (PFN_INVALID); 776 777 if (lp->decr != 0) { 778 /* 779 * In this range the cage grows from the highest 780 * address towards the lowest. 781 * Arrange to return pfns from curr to lim-1, 782 * inclusive, in ascending order. 783 */ 784 785 pfn = lp->curr; 786 } else { 787 /* 788 * In this range the cage grows from the lowest 789 * address towards the highest. 790 * Arrange to return pfns from base to curr, 791 * inclusive, in ascending order. 792 */ 793 794 pfn = lp->base; 795 } 796 } 797 798 if (lp->decr != 0) { /* decrementing pfn */ 799 if (pfn == lp->lim) { 800 /* Don't go beyond the static part of the glist. */ 801 if (lp == kcage_current_glist) 802 lp = NULL; 803 else 804 lp = lp->next; 805 pfn = PFN_INVALID; 806 goto again; 807 } 808 809 ASSERT(pfn >= lp->curr && pfn < lp->lim); 810 } else { /* incrementing pfn */ 811 if (pfn == lp->curr) { 812 /* Don't go beyond the static part of the glist. */ 813 if (lp == kcage_current_glist) 814 lp = NULL; 815 else 816 lp = lp->next; 817 pfn = PFN_INVALID; 818 goto again; 819 } 820 821 ASSERT(pfn >= lp->base && pfn < lp->curr); 822 } 823 824 return (pfn++); 825 } 826 827 /* 828 * Callback functions to recalc cage thresholds after 829 * Kphysm memory add/delete operations. 830 */ 831 /*ARGSUSED*/ 832 static void 833 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages) 834 { 835 kcage_recalc_thresholds(); 836 } 837 838 /*ARGSUSED*/ 839 static int 840 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages) 841 { 842 /* TODO: when should cage refuse memory delete requests? */ 843 return (0); 844 } 845 846 /*ARGSUSED*/ 847 static void 848 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled) 849 { 850 kcage_recalc_thresholds(); 851 } 852 853 static kphysm_setup_vector_t kcage_kphysm_vectors = { 854 KPHYSM_SETUP_VECTOR_VERSION, 855 kcage_kphysm_postadd_cb, 856 kcage_kphysm_predel_cb, 857 kcage_kphysm_postdel_cb 858 }; 859 860 /* 861 * This is called before a CPR suspend and after a CPR resume. We have to 862 * turn off kcage_cageout_ready before a suspend, and turn it back on after a 863 * restart.
864 */ 865 /*ARGSUSED*/ 866 static boolean_t 867 kcage_cageout_cpr(void *arg, int code) 868 { 869 if (code == CB_CODE_CPR_CHKPT) { 870 ASSERT(kcage_cageout_ready); 871 kcage_cageout_ready = 0; 872 return (B_TRUE); 873 } else if (code == CB_CODE_CPR_RESUME) { 874 ASSERT(kcage_cageout_ready == 0); 875 kcage_cageout_ready = 1; 876 return (B_TRUE); 877 } 878 return (B_FALSE); 879 } 880 881 /* 882 * kcage_recalc_preferred_size() increases initial cage size to improve large 883 * page availability when lp for kmem is enabled and kpr is disabled 884 */ 885 static pgcnt_t 886 kcage_recalc_preferred_size(pgcnt_t preferred_size) 887 { 888 if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) { 889 pgcnt_t lpmincage = kcage_kmemlp_mincage; 890 if (lpmincage == 0) { 891 lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8), 892 segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE; 893 } 894 kcage_kmemlp_mincage = MIN(lpmincage, 895 (segkmem_kmemlp_max / PAGESIZE)); 896 preferred_size = MAX(kcage_kmemlp_mincage, preferred_size); 897 } 898 return (preferred_size); 899 } 900 901 /* 902 * Kcage_init() builds the cage and initializes the cage thresholds. 903 * The size of the cage is determined by the argument preferred_size, 904 * or the actual amount of memory, whichever is smaller. 905 */ 906 static void 907 kcage_init(pgcnt_t preferred_size) 908 { 909 pgcnt_t wanted; 910 pfn_t pfn; 911 page_t *pp; 912 kstat_t *ksp; 913 914 extern struct vnode kvp; 915 extern void page_list_noreloc_startup(page_t *); 916 917 ASSERT(!kcage_on); 918 919 /* increase preferred cage size for lp for kmem */ 920 preferred_size = kcage_recalc_preferred_size(preferred_size); 921 922 /* Debug note: initialize this now so early expansions can stat */ 923 KCAGE_STAT_INIT_SCAN_INDEX; 924 925 /* 926 * Initialize cage thresholds and install kphysm callback. 927 * If we can't arrange to have the thresholds track with 928 * available physical memory, then the cage thresholds may 929 * end up over time at levels that adversely affect system 930 * performance; so, bail out. 931 */ 932 kcage_recalc_thresholds(); 933 if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) { 934 ASSERT(0); /* Catch this in DEBUG kernels. */ 935 return; 936 } 937 938 /* 939 * Limit startup cage size within the range of kcage_minfree 940 * and availrmem, inclusively. 941 */ 942 wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem); 943 944 /* 945 * Construct the cage. PFNs are allocated from the glist. It 946 * is assumed that the list has been properly ordered for the 947 * platform by the platform code. Typically, this is as simple 948 * as calling kcage_range_init(phys_avail, decr), where decr is 949 * 1 if the kernel has been loaded into upper end of physical 950 * memory, or 0 if the kernel has been loaded at the low end. 951 * 952 * Note: it is assumed that we are in the startup flow, so there 953 * is no reason to grab the page lock. 954 */ 955 kcage_freemem = 0; 956 pfn = PFN_INVALID; /* prime for alignment test */ 957 while (wanted != 0) { 958 if ((pfn = kcage_get_pfn(0)) == PFN_INVALID) 959 break; 960 961 if ((pp = page_numtopp_nolock(pfn)) != NULL) { 962 KCAGEPAGETS_INC(); 963 /* 964 * Set the noreloc state on the page. 965 * If the page is free and not already 966 * on the noreloc list then move it.
967 */ 968 if (PP_ISFREE(pp)) { 969 if (PP_ISNORELOC(pp) == 0) 970 page_list_noreloc_startup(pp); 971 } else { 972 ASSERT(pp->p_szc == 0); 973 PP_SETNORELOC(pp); 974 } 975 } 976 PLCNT_XFER_NORELOC(pp); 977 wanted -= 1; 978 } 979 980 /* 981 * Need to go through and find kernel allocated pages 982 * and capture them into the Cage. These will primarily 983 * be pages gotten through boot_alloc(). 984 */ 985 if (kvp.v_pages) { 986 987 pp = kvp.v_pages; 988 do { 989 ASSERT(!PP_ISFREE(pp)); 990 ASSERT(pp->p_szc == 0); 991 if (PP_ISNORELOC(pp) == 0) { 992 PP_SETNORELOC(pp); 993 PLCNT_XFER_NORELOC(pp); 994 } 995 } while ((pp = pp->p_vpnext) != kvp.v_pages); 996 997 } 998 999 kcage_on = 1; 1000 1001 /* 1002 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend() 1003 * after the cageout thread is blocked, and executes from cpr_resume() 1004 * before the cageout thread is restarted. By executing in this class, 1005 * we are assured that the kernel cage thread won't miss wakeup calls 1006 * and also CPR's larger kmem_alloc requests will not fail after 1007 * CPR shuts down the cageout kernel thread. 1008 */ 1009 (void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL, 1010 "cageout"); 1011 1012 /* 1013 * Coalesce pages to improve large page availability. A better fix 1014 * would to coalesce pages as they are included in the cage 1015 */ 1016 if (SEGKMEM_USE_LARGEPAGES) { 1017 extern void page_freelist_coalesce_all(int mnode); 1018 page_freelist_coalesce_all(-1); /* do all mnodes */ 1019 } 1020 1021 ksp = kstat_create("kcage", 0, "kcage_page_list", "misc", 1022 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL); 1023 if (ksp != NULL) { 1024 ksp->ks_update = kcage_kstat_update; 1025 ksp->ks_snapshot = kcage_kstat_snapshot; 1026 ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */ 1027 kstat_install(ksp); 1028 } 1029 } 1030 1031 static int 1032 kcage_kstat_update(kstat_t *ksp, int rw) 1033 { 1034 struct kcage_glist *lp; 1035 uint_t count; 1036 1037 if (rw == KSTAT_WRITE) 1038 return (EACCES); 1039 1040 count = 0; 1041 rw_enter(&kcage_range_rwlock, RW_WRITER); 1042 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 1043 if (lp->decr) { 1044 if (lp->curr != lp->lim) { 1045 count++; 1046 } 1047 } else { 1048 if (lp->curr != lp->base) { 1049 count++; 1050 } 1051 } 1052 } 1053 rw_exit(&kcage_range_rwlock); 1054 1055 ksp->ks_ndata = count; 1056 ksp->ks_data_size = count * 2 * sizeof (uint64_t); 1057 1058 return (0); 1059 } 1060 1061 static int 1062 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw) 1063 { 1064 struct kcage_glist *lp; 1065 struct memunit { 1066 uint64_t address; 1067 uint64_t size; 1068 } *kspmem; 1069 1070 if (rw == KSTAT_WRITE) 1071 return (EACCES); 1072 1073 ksp->ks_snaptime = gethrtime(); 1074 1075 kspmem = (struct memunit *)buf; 1076 rw_enter(&kcage_range_rwlock, RW_WRITER); 1077 for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) { 1078 if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) 1079 break; 1080 1081 if (lp->decr) { 1082 if (lp->curr != lp->lim) { 1083 kspmem->address = ptob(lp->curr); 1084 kspmem->size = ptob(lp->lim - lp->curr); 1085 } 1086 } else { 1087 if (lp->curr != lp->base) { 1088 kspmem->address = ptob(lp->base); 1089 kspmem->size = ptob(lp->curr - lp->base); 1090 } 1091 } 1092 } 1093 rw_exit(&kcage_range_rwlock); 1094 1095 return (0); 1096 } 1097 1098 void 1099 kcage_recalc_thresholds() 1100 { 1101 static int first = 1; 1102 static pgcnt_t init_lotsfree; 1103 static pgcnt_t init_desfree; 1104 static pgcnt_t 
init_minfree; 1105 static pgcnt_t init_throttlefree; 1106 static pgcnt_t init_reserve; 1107 1108 /* TODO: any reason to take more care than this with live editing? */ 1109 mutex_enter(&kcage_cageout_mutex); 1110 mutex_enter(&freemem_lock); 1111 1112 if (first) { 1113 first = 0; 1114 init_lotsfree = kcage_lotsfree; 1115 init_desfree = kcage_desfree; 1116 init_minfree = kcage_minfree; 1117 init_throttlefree = kcage_throttlefree; 1118 init_reserve = kcage_reserve; 1119 } else { 1120 kcage_lotsfree = init_lotsfree; 1121 kcage_desfree = init_desfree; 1122 kcage_minfree = init_minfree; 1123 kcage_throttlefree = init_throttlefree; 1124 kcage_reserve = init_reserve; 1125 } 1126 1127 if (kcage_lotsfree == 0) 1128 kcage_lotsfree = MAX(32, total_pages / 256); 1129 1130 if (kcage_minfree == 0) 1131 kcage_minfree = MAX(32, kcage_lotsfree / 2); 1132 1133 if (kcage_desfree == 0) 1134 kcage_desfree = MAX(32, kcage_minfree); 1135 1136 if (kcage_throttlefree == 0) 1137 kcage_throttlefree = MAX(32, kcage_minfree / 2); 1138 1139 if (kcage_reserve == 0) 1140 kcage_reserve = MIN(32, kcage_throttlefree / 2); 1141 1142 mutex_exit(&freemem_lock); 1143 mutex_exit(&kcage_cageout_mutex); 1144 1145 if (kcage_cageout_ready) { 1146 if (kcage_freemem < kcage_desfree) 1147 kcage_cageout_wakeup(); 1148 1149 if (kcage_needfree) { 1150 mutex_enter(&kcage_throttle_mutex); 1151 cv_broadcast(&kcage_throttle_cv); 1152 mutex_exit(&kcage_throttle_mutex); 1153 } 1154 } 1155 } 1156 1157 /* 1158 * Pageout interface: 1159 * kcage_cageout_init() 1160 */ 1161 void 1162 kcage_cageout_init() 1163 { 1164 if (kcage_on) { 1165 1166 (void) thread_create(NULL, 0, kcage_cageout, 1167 NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1); 1168 } 1169 } 1170 1171 1172 /* 1173 * VM Interfaces: 1174 * kcage_create_throttle() 1175 * kcage_freemem_add() 1176 * kcage_freemem_sub() 1177 */ 1178 1179 /* 1180 * Wakeup cageout thread and throttle waiting for the number of pages 1181 * requested to become available. For non-critical requests, a 1182 * timeout is added, since freemem accounting is separate from cage 1183 * freemem accounting: it's possible for us to get stuck and not make 1184 * forward progress even though there was sufficient freemem before 1185 * arriving here. 1186 */ 1187 int 1188 kcage_create_throttle(pgcnt_t npages, int flags) 1189 { 1190 int niter = 0; 1191 pgcnt_t lastfree; 1192 int enough = kcage_freemem > kcage_throttlefree + npages; 1193 1194 KCAGE_STAT_INCR(kct_calls); /* unprotected incr. */ 1195 1196 kcage_cageout_wakeup(); /* just to be sure */ 1197 KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. */ 1198 1199 /* 1200 * Obviously, we can't throttle the cageout thread since 1201 * we depend on it. We also can't throttle the panic thread. 1202 */ 1203 if (curthread == kcage_cageout_thread || panicstr) { 1204 KCAGE_STAT_INCR(kct_cageout); /* unprotected incr. */ 1205 return (KCT_CRIT); 1206 } 1207 1208 /* 1209 * Don't throttle threads which are critical for proper 1210 * vm management if we're above kcage_throttlefree or 1211 * if freemem is very low. 1212 */ 1213 if (NOMEMWAIT()) { 1214 if (enough) { 1215 KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */ 1216 return (KCT_CRIT); 1217 } else if (freemem < minfree) { 1218 KCAGE_STAT_INCR(kct_critical); /* unprotected incr. */ 1219 return (KCT_CRIT); 1220 } 1221 } 1222 1223 /* 1224 * Don't throttle real-time threads if kcage_freemem > kcage_reserve. 
1225 */ 1226 if (DISP_PRIO(curthread) > maxclsyspri && 1227 kcage_freemem > kcage_reserve) { 1228 KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */ 1229 return (KCT_CRIT); 1230 } 1231 1232 /* 1233 * Cause all other threads (which are assumed to not be 1234 * critical to cageout) to wait here until their request 1235 * can be satisfied. Be a little paranoid and wake the 1236 * kernel cage on each loop through this logic. 1237 */ 1238 while (kcage_freemem < kcage_throttlefree + npages) { 1239 ASSERT(kcage_on); 1240 1241 lastfree = kcage_freemem; 1242 1243 if (kcage_cageout_ready) { 1244 mutex_enter(&kcage_throttle_mutex); 1245 1246 kcage_needfree += npages; 1247 KCAGE_STAT_INCR(kct_wait); 1248 1249 kcage_cageout_wakeup(); 1250 KCAGE_STAT_INCR(kct_cagewake); 1251 1252 cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex); 1253 1254 kcage_needfree -= npages; 1255 1256 mutex_exit(&kcage_throttle_mutex); 1257 } else { 1258 /* 1259 * NOTE: atomics are used just in case we enter 1260 * mp operation before the cageout thread is ready. 1261 */ 1262 atomic_add_long(&kcage_needfree, npages); 1263 1264 kcage_cageout_wakeup(); 1265 KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. */ 1266 1267 atomic_add_long(&kcage_needfree, -npages); 1268 } 1269 1270 if ((flags & PG_WAIT) == 0) { 1271 if (kcage_freemem > lastfree) { 1272 KCAGE_STAT_INCR(kct_progress); 1273 niter = 0; 1274 } else { 1275 KCAGE_STAT_INCR(kct_noprogress); 1276 if (++niter >= kcage_maxwait) { 1277 KCAGE_STAT_INCR(kct_timeout); 1278 return (KCT_FAILURE); 1279 } 1280 } 1281 } 1282 1283 if (NOMEMWAIT() && freemem < minfree) { 1284 return (KCT_CRIT); 1285 } 1286 1287 } 1288 return (KCT_NONCRIT); 1289 } 1290 1291 void 1292 kcage_freemem_add(pgcnt_t npages) 1293 { 1294 extern void wakeup_pcgs(void); 1295 1296 atomic_add_long(&kcage_freemem, npages); 1297 1298 wakeup_pcgs(); /* wakeup threads in pcgs() */ 1299 1300 if (kcage_needfree != 0 && 1301 kcage_freemem >= (kcage_throttlefree + kcage_needfree)) { 1302 1303 mutex_enter(&kcage_throttle_mutex); 1304 cv_broadcast(&kcage_throttle_cv); 1305 KCAGE_STAT_INCR(kfa_trottlewake); 1306 mutex_exit(&kcage_throttle_mutex); 1307 } 1308 } 1309 1310 void 1311 kcage_freemem_sub(pgcnt_t npages) 1312 { 1313 atomic_add_long(&kcage_freemem, -npages); 1314 1315 if (kcage_freemem < kcage_desfree) { 1316 kcage_cageout_wakeup(); 1317 KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */ 1318 } 1319 } 1320 1321 /* 1322 * return 0 on failure and 1 on success. 1323 */ 1324 static int 1325 kcage_setnoreloc_pages(page_t *rootpp, se_t se) 1326 { 1327 pgcnt_t npgs, i; 1328 page_t *pp; 1329 pfn_t rootpfn = page_pptonum(rootpp); 1330 uint_t szc; 1331 1332 ASSERT(!PP_ISFREE(rootpp)); 1333 ASSERT(PAGE_LOCKED_SE(rootpp, se)); 1334 if (!group_page_trylock(rootpp, se)) { 1335 return (0); 1336 } 1337 szc = rootpp->p_szc; 1338 if (szc == 0) { 1339 /* 1340 * The szc of a locked page can only change for pages that are 1341 * non-swapfs (i.e. anonymous memory) file system pages. 
1342 */ 1343 ASSERT(rootpp->p_vnode != NULL && 1344 !PP_ISKAS(rootpp) && 1345 !IS_SWAPFSVP(rootpp->p_vnode)); 1346 PP_SETNORELOC(rootpp); 1347 return (1); 1348 } 1349 npgs = page_get_pagecnt(szc); 1350 ASSERT(IS_P2ALIGNED(rootpfn, npgs)); 1351 pp = rootpp; 1352 for (i = 0; i < npgs; i++, pp++) { 1353 ASSERT(PAGE_LOCKED_SE(pp, se)); 1354 ASSERT(!PP_ISFREE(pp)); 1355 ASSERT(pp->p_szc == szc); 1356 PP_SETNORELOC(pp); 1357 } 1358 group_page_unlock(rootpp); 1359 return (1); 1360 } 1361 1362 /* 1363 * Attempt to convert page to a caged page (set the P_NORELOC flag). 1364 * If successful and pages is free, move page to the tail of whichever 1365 * list it is on. 1366 * Returns: 1367 * EBUSY page already locked, assimilated but not free. 1368 * ENOMEM page assimilated, but memory too low to relocate. Page not free. 1369 * EAGAIN page not assimilated. Page not free. 1370 * ERANGE page assimilated. Page not root. 1371 * 0 page assimilated. Page free. 1372 * *nfreedp number of pages freed. 1373 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way 1374 * to distinguish between a page that was already a NORELOC page from 1375 * those newly converted to NORELOC pages by this invocation of 1376 * kcage_assimilate_page. 1377 */ 1378 static int 1379 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp) 1380 { 1381 if (page_trylock(pp, SE_EXCL)) { 1382 if (PP_ISNORELOC(pp)) { 1383 check_free_and_return: 1384 if (PP_ISFREE(pp)) { 1385 page_unlock(pp); 1386 *nfreedp = 0; 1387 return (0); 1388 } else { 1389 page_unlock(pp); 1390 return (EBUSY); 1391 } 1392 /*NOTREACHED*/ 1393 } 1394 } else { 1395 if (page_trylock(pp, SE_SHARED)) { 1396 if (PP_ISNORELOC(pp)) 1397 goto check_free_and_return; 1398 } else 1399 return (EAGAIN); 1400 1401 if (!PP_ISFREE(pp)) { 1402 page_unlock(pp); 1403 return (EAGAIN); 1404 } 1405 1406 /* 1407 * Need to upgrade the lock on it and set the NORELOC 1408 * bit. If it is free then remove it from the free 1409 * list so that the platform free list code can keep 1410 * NORELOC pages where they should be. 1411 */ 1412 /* 1413 * Before doing anything, get the exclusive lock. 1414 * This may fail (eg ISM pages are left shared locked). 1415 * If the page is free this will leave a hole in the 1416 * cage. There is no solution yet to this. 1417 */ 1418 if (!page_tryupgrade(pp)) { 1419 page_unlock(pp); 1420 return (EAGAIN); 1421 } 1422 } 1423 1424 ASSERT(PAGE_EXCL(pp)); 1425 1426 if (PP_ISFREE(pp)) { 1427 int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST; 1428 1429 page_list_sub(pp, which); 1430 ASSERT(pp->p_szc == 0); 1431 PP_SETNORELOC(pp); 1432 PLCNT_XFER_NORELOC(pp); 1433 page_list_add(pp, which | PG_LIST_TAIL); 1434 1435 page_unlock(pp); 1436 *nfreedp = 1; 1437 return (0); 1438 } else { 1439 if (pp->p_szc != 0) { 1440 if (!kcage_setnoreloc_pages(pp, SE_EXCL)) { 1441 page_unlock(pp); 1442 return (EAGAIN); 1443 } 1444 ASSERT(PP_ISNORELOC(pp)); 1445 } else { 1446 PP_SETNORELOC(pp); 1447 } 1448 PLCNT_XFER_NORELOC(pp); 1449 return (kcage_invalidate_page(pp, nfreedp)); 1450 } 1451 /*NOTREACHED*/ 1452 } 1453 1454 static int 1455 kcage_expand() 1456 { 1457 int did_something = 0; 1458 1459 spgcnt_t wanted; 1460 pfn_t pfn; 1461 page_t *pp; 1462 /* TODO: we don't really need n any more? */ 1463 pgcnt_t n; 1464 pgcnt_t nf, nfreed; 1465 1466 /* 1467 * Expand the cage if available cage memory is really low. Calculate 1468 * the amount required to return kcage_freemem to the level of 1469 * kcage_lotsfree, or to satisfy throttled requests, whichever is 1470 * more. 
It is rare for their sum to create an artificial threshold 1471 * above kcage_lotsfree, but it is possible. 1472 * 1473 * Exit early if expansion amount is equal to or less than zero. 1474 * (<0 is possible if kcage_freemem rises suddenly.) 1475 * 1476 * Exit early when the global page pool (apparently) does not 1477 * have enough free pages to page_relocate() even a single page. 1478 */ 1479 wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree) 1480 - kcage_freemem; 1481 if (wanted <= 0) 1482 return (0); 1483 else if (freemem < pageout_reserve + 1) { 1484 KCAGE_STAT_INCR(ke_lowfreemem); 1485 return (0); 1486 } 1487 1488 KCAGE_STAT_INCR(ke_calls); 1489 KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted); 1490 1491 /* 1492 * Assimilate more pages from the global page pool into the cage. 1493 */ 1494 n = 0; /* number of pages PP_SETNORELOC'd */ 1495 nf = 0; /* number of those actually free */ 1496 while (kcage_on && nf < wanted) { 1497 pfn = kcage_get_pfn(1); 1498 if (pfn == PFN_INVALID) { /* eek! nowhere to grow */ 1499 KCAGE_STAT_INCR(ke_nopfn); 1500 goto terminate; 1501 } 1502 1503 KCAGE_STAT_INCR_SCAN(ke_examined); 1504 1505 if ((pp = page_numtopp_nolock(pfn)) == NULL) { 1506 KCAGE_STAT_INCR(ke_nopaget); 1507 continue; 1508 } 1509 KCAGEPAGETS_INC(); 1510 /* 1511 * Sanity check. Skip this pfn if it is 1512 * being deleted. 1513 */ 1514 if (pfn_is_being_deleted(pfn)) { 1515 KCAGE_STAT_INCR(ke_deleting); 1516 continue; 1517 } 1518 1519 if (PP_ISNORELOC(pp)) { 1520 KCAGE_STAT_INCR(ke_isnoreloc); 1521 continue; 1522 } 1523 1524 switch (kcage_assimilate_page(pp, &nfreed)) { 1525 case 0: /* assimilated, page is free */ 1526 KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed); 1527 did_something = 1; 1528 nf += nfreed; 1529 n++; 1530 break; 1531 1532 case EBUSY: /* assimilated, page not free */ 1533 case ERANGE: /* assimilated, page not root */ 1534 KCAGE_STAT_INCR_SCAN(ke_gotone); 1535 did_something = 1; 1536 n++; 1537 break; 1538 1539 case ENOMEM: /* assimilated, but no mem */ 1540 KCAGE_STAT_INCR(ke_terminate); 1541 did_something = 1; 1542 n++; 1543 goto terminate; 1544 1545 case EAGAIN: /* can't assimilate */ 1546 KCAGE_STAT_INCR_SCAN(ke_lefthole); 1547 break; 1548 1549 default: /* catch this with debug kernels */ 1550 ASSERT(0); 1551 break; 1552 } 1553 } 1554 1555 /* 1556 * Realign cage edge with the nearest physical address 1557 * boundary for big pages. This is done to give us a 1558 * better chance of actually getting usable big pages 1559 * in the cage. 1560 */ 1561 1562 terminate: 1563 1564 return (did_something); 1565 } 1566 1567 /* 1568 * Relocate page opp (Original Page Pointer) from cage pool to page rpp 1569 * (Replacement Page Pointer) in the global pool. Page opp will be freed 1570 * if relocation is successful, otherwise it is only unlocked. 1571 * On entry, page opp must be exclusively locked and not free. 1572 * *nfreedp: number of pages freed.
1573 */ 1574 static int 1575 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp) 1576 { 1577 page_t *opp = pp; 1578 page_t *rpp = NULL; 1579 spgcnt_t npgs; 1580 int result; 1581 1582 ASSERT(!PP_ISFREE(opp)); 1583 ASSERT(PAGE_EXCL(opp)); 1584 1585 result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL); 1586 *nfreedp = npgs; 1587 if (result == 0) { 1588 while (npgs-- > 0) { 1589 page_t *tpp; 1590 1591 ASSERT(rpp != NULL); 1592 tpp = rpp; 1593 page_sub(&rpp, tpp); 1594 page_unlock(tpp); 1595 } 1596 1597 ASSERT(rpp == NULL); 1598 1599 return (0); /* success */ 1600 } 1601 1602 page_unlock(opp); 1603 return (result); 1604 } 1605 1606 /* 1607 * Based on page_invalidate_pages() 1608 * 1609 * Kcage_invalidate_page() uses page_relocate() twice. Both instances 1610 * of use must be updated to match the new page_relocate() when it 1611 * becomes available. 1612 * 1613 * Return result of kcage_relocate_page or zero if page was directly freed. 1614 * *nfreedp: number of pages freed. 1615 */ 1616 static int 1617 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp) 1618 { 1619 int result; 1620 1621 #if defined(__sparc) 1622 extern struct vnode prom_ppages; 1623 ASSERT(pp->p_vnode != &prom_ppages); 1624 #endif /* __sparc */ 1625 1626 ASSERT(!PP_ISFREE(pp)); 1627 ASSERT(PAGE_EXCL(pp)); 1628 1629 /* 1630 * Is this page involved in some I/O? shared? 1631 * The page_struct_lock need not be acquired to 1632 * examine these fields since the page has an 1633 * "exclusive" lock. 1634 */ 1635 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1636 result = kcage_relocate_page(pp, nfreedp); 1637 #ifdef KCAGE_STATS 1638 if (result == 0) 1639 KCAGE_STAT_INCR_SCAN(kip_reloclocked); 1640 else if (result == ENOMEM) 1641 KCAGE_STAT_INCR_SCAN(kip_nomem); 1642 #endif 1643 return (result); 1644 } 1645 1646 ASSERT(pp->p_vnode->v_type != VCHR); 1647 1648 /* 1649 * Unload the mappings and check if mod bit is set. 1650 */ 1651 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1652 1653 if (hat_ismod(pp)) { 1654 result = kcage_relocate_page(pp, nfreedp); 1655 #ifdef KCAGE_STATS 1656 if (result == 0) 1657 KCAGE_STAT_INCR_SCAN(kip_relocmod); 1658 else if (result == ENOMEM) 1659 KCAGE_STAT_INCR_SCAN(kip_nomem); 1660 #endif 1661 return (result); 1662 } 1663 1664 if (!page_try_demote_pages(pp)) { 1665 KCAGE_STAT_INCR_SCAN(kip_demotefailed); 1666 page_unlock(pp); 1667 return (EAGAIN); 1668 } 1669 1670 /* LINTED: constant in conditional context */ 1671 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1672 KCAGE_STAT_INCR_SCAN(kip_destroy); 1673 *nfreedp = 1; 1674 return (0); 1675 } 1676 1677 static void 1678 kcage_cageout() 1679 { 1680 pfn_t pfn; 1681 page_t *pp; 1682 callb_cpr_t cprinfo; 1683 int did_something; 1684 int scan_again; 1685 pfn_t start_pfn; 1686 int pass; 1687 int last_pass; 1688 int pages_skipped; 1689 int shared_skipped; 1690 ulong_t shared_level = 8; 1691 pgcnt_t nfreed; 1692 #ifdef KCAGE_STATS 1693 clock_t scan_start; 1694 #endif 1695 1696 CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex, 1697 callb_generic_cpr, "cageout"); 1698 1699 mutex_enter(&kcage_cageout_mutex); 1700 kcage_cageout_thread = curthread; 1701 1702 pfn = PFN_INVALID; /* force scan reset */ 1703 start_pfn = PFN_INVALID; /* force init with 1st cage pfn */ 1704 kcage_cageout_ready = 1; /* switch kcage_cageout_wakeup mode */ 1705 1706 loop: 1707 /* 1708 * Wait here. Sooner or later, kcage_freemem_sub() will notice 1709 * that kcage_freemem is less than kcage_desfree. When it does 1710 * notice, kcage_freemem_sub() will wake us up via call to 1711 * kcage_cageout_wakeup(). 
1712 */ 1713 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1714 cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex); 1715 CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex); 1716 1717 KCAGE_STAT_INCR(kt_wakeups); 1718 KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem); 1719 KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem); 1720 pass = 0; 1721 last_pass = 0; 1722 1723 #ifdef KCAGE_STATS 1724 scan_start = ddi_get_lbolt(); 1725 #endif 1726 1727 again: 1728 if (!kcage_on) 1729 goto loop; 1730 1731 KCAGE_STAT_INCR(kt_scans); 1732 KCAGE_STAT_INCR_SCAN(kt_passes); 1733 1734 did_something = 0; 1735 pages_skipped = 0; 1736 shared_skipped = 0; 1737 while ((kcage_freemem < kcage_lotsfree || kcage_needfree) && 1738 (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) { 1739 1740 if (start_pfn == PFN_INVALID) 1741 start_pfn = pfn; 1742 else if (start_pfn == pfn) { 1743 last_pass = pass; 1744 pass += 1; 1745 /* 1746 * Did a complete walk of kernel cage, but didn't free 1747 * any pages. If only one cpu is active then 1748 * stop kernel cage walk and try expanding. 1749 */ 1750 if (cp_default.cp_ncpus == 1 && did_something == 0) { 1751 KCAGE_STAT_INCR(kt_cageout_break); 1752 break; 1753 } 1754 } 1755 1756 pp = page_numtopp_nolock(pfn); 1757 if (pp == NULL) { 1758 continue; 1759 } 1760 1761 KCAGE_STAT_INCR_SCAN(kt_examined); 1762 1763 /* 1764 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside 1765 * of the lock. If one is missed it will be seen next 1766 * time through. 1767 * 1768 * Skip non-caged-pages. These pages can exist in the cage 1769 * because, if during cage expansion, a page is 1770 * encountered that is long-term locked the lock prevents the 1771 * expansion logic from setting the P_NORELOC flag. Hence, 1772 * non-caged-pages surrounded by caged-pages. 1773 */ 1774 if (!PP_ISNORELOC(pp)) { 1775 switch (kcage_assimilate_page(pp, &nfreed)) { 1776 case 0: 1777 did_something = 1; 1778 KCAGE_STAT_NINCR_SCAN(kt_gotonefree, 1779 nfreed); 1780 break; 1781 1782 case EBUSY: 1783 case ERANGE: 1784 did_something = 1; 1785 KCAGE_STAT_INCR_SCAN(kt_gotone); 1786 break; 1787 1788 case EAGAIN: 1789 case ENOMEM: 1790 break; 1791 1792 default: 1793 /* catch this with debug kernels */ 1794 ASSERT(0); 1795 break; 1796 } 1797 1798 continue; 1799 } else { 1800 int prm; 1801 1802 if (PP_ISFREE(pp)) { 1803 continue; 1804 } 1805 1806 if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) || 1807 !page_trylock(pp, SE_EXCL)) { 1808 KCAGE_STAT_INCR_SCAN(kt_cantlock); 1809 continue; 1810 } 1811 1812 /* P_NORELOC bit should not have gone away. */ 1813 ASSERT(PP_ISNORELOC(pp)); 1814 if (PP_ISFREE(pp) || (PP_ISKAS(pp) && 1815 pp->p_lckcnt > 0)) { 1816 page_unlock(pp); 1817 continue; 1818 } 1819 1820 KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level); 1821 if (hat_page_checkshare(pp, shared_level)) { 1822 page_unlock(pp); 1823 pages_skipped = 1; 1824 shared_skipped = 1; 1825 KCAGE_STAT_INCR_SCAN(kt_skipshared); 1826 continue; 1827 } 1828 1829 /* 1830 * In pass {0, 1}, skip page if ref bit is set. 1831 * In pass {0, 1, 2}, skip page if mod bit is set. 
1832 */ 1833 prm = hat_pagesync(pp, 1834 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 1835 1836 /* On first pass ignore ref'd pages */ 1837 if (pass <= 1 && (prm & P_REF)) { 1838 KCAGE_STAT_INCR_SCAN(kt_skiprefd); 1839 pages_skipped = 1; 1840 page_unlock(pp); 1841 continue; 1842 } 1843 1844 /* On pass 2, VN_DISPOSE if mod bit is not set */ 1845 if (pass <= 2) { 1846 if (pp->p_szc != 0 || (prm & P_MOD) || 1847 pp->p_lckcnt || pp->p_cowcnt) { 1848 pages_skipped = 1; 1849 page_unlock(pp); 1850 } else { 1851 1852 /* 1853 * unload the mappings before 1854 * checking if mod bit is set 1855 */ 1856 (void) hat_pageunload(pp, 1857 HAT_FORCE_PGUNLOAD); 1858 1859 /* 1860 * skip this page if modified 1861 */ 1862 if (hat_ismod(pp)) { 1863 pages_skipped = 1; 1864 page_unlock(pp); 1865 continue; 1866 } 1867 1868 KCAGE_STAT_INCR_SCAN(kt_destroy); 1869 /* constant in conditional context */ 1870 /* LINTED */ 1871 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1872 did_something = 1; 1873 } 1874 continue; 1875 } 1876 1877 if (kcage_invalidate_page(pp, &nfreed) == 0) { 1878 did_something = 1; 1879 KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed); 1880 } 1881 1882 /* 1883 * No need to drop the page lock here. 1884 * Kcage_invalidate_page has done that for us 1885 * either explicitly or through a page_free. 1886 */ 1887 } 1888 } 1889 1890 /* 1891 * Expand the cage only if available cage memory is really low. 1892 * This test is done only after a complete scan of the cage. 1893 * The reason for not checking and expanding more often is to 1894 * avoid rapid expansion of the cage. Naturally, scanning the 1895 * cage takes time. So by scanning first, we use that work as a 1896 * delay loop in between expand decisions. 1897 */ 1898 1899 scan_again = 0; 1900 if (kcage_freemem < kcage_minfree || kcage_needfree) { 1901 /* 1902 * Kcage_expand() will return a non-zero value if it was 1903 * able to expand the cage -- whether or not the new 1904 * pages are free and immediately usable. If non-zero, 1905 * we do another scan of the cage. The pages might be 1906 * freed during that scan or by the time we get back here. 1907 * If not, we will attempt another expansion. 1908 * However, if kcage_expand() returns zero, then it was 1909 * unable to expand the cage. This is the case when the 1910 * growth list is exhausted, therefore no work was done 1911 * and there is no reason to scan the cage again. 1912 * Note: Kernel cage scan is not repeated when only one 1913 * cpu is active to avoid kernel cage thread hogging cpu. 1914 */ 1915 if (pass <= 3 && pages_skipped && cp_default.cp_ncpus > 1) 1916 scan_again = 1; 1917 else 1918 (void) kcage_expand(); /* don't scan again */ 1919 } else if (kcage_freemem < kcage_lotsfree) { 1920 /* 1921 * If available cage memory is less than abundant 1922 * and a full scan of the cage has not yet been completed, 1923 * or a scan has completed and some work was performed, 1924 * or pages were skipped because of sharing, 1925 * or we simply have not yet completed two passes, 1926 * then do another scan.
1927 */ 1928 if (pass <= 2 && pages_skipped) 1929 scan_again = 1; 1930 if (pass == last_pass || did_something) 1931 scan_again = 1; 1932 else if (shared_skipped && shared_level < (8<<24)) { 1933 shared_level <<= 1; 1934 scan_again = 1; 1935 } 1936 } 1937 1938 if (scan_again && cp_default.cp_ncpus > 1) 1939 goto again; 1940 else { 1941 if (shared_level > 8) 1942 shared_level >>= 1; 1943 1944 KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem); 1945 KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem); 1946 KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start); 1947 KCAGE_STAT_INC_SCAN_INDEX; 1948 goto loop; 1949 } 1950 1951 /*NOTREACHED*/ 1952 } 1953 1954 void 1955 kcage_cageout_wakeup() 1956 { 1957 if (mutex_tryenter(&kcage_cageout_mutex)) { 1958 if (kcage_cageout_ready) { 1959 cv_signal(&kcage_cageout_cv); 1960 } else if (kcage_freemem < kcage_minfree || kcage_needfree) { 1961 /* 1962 * Available cage memory is really low. Time to 1963 * start expanding the cage. However, the 1964 * kernel cage thread is not yet ready to 1965 * do the work. Use *this* thread, which is 1966 * most likely to be t0, to do the work. 1967 */ 1968 KCAGE_STAT_INCR(kcw_expandearly); 1969 (void) kcage_expand(); 1970 KCAGE_STAT_INC_SCAN_INDEX; 1971 } 1972 1973 mutex_exit(&kcage_cageout_mutex); 1974 } 1975 /* else, kernel cage thread is already running */ 1976 } 1977 1978 void 1979 kcage_tick() 1980 { 1981 /* 1982 * Once per second we wake up all the threads throttled 1983 * waiting for cage memory, in case we've become stuck 1984 * and haven't made forward progress expanding the cage. 1985 */ 1986 if (kcage_on && kcage_cageout_ready) 1987 cv_broadcast(&kcage_throttle_cv); 1988 } 1989