1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/thread.h> 31 #include <sys/proc.h> 32 #include <sys/callb.h> 33 #include <sys/vnode.h> 34 #include <sys/debug.h> 35 #include <sys/systm.h> /* for bzero */ 36 #include <sys/memlist.h> 37 #include <sys/cmn_err.h> 38 #include <sys/sysmacros.h> 39 #include <sys/vmsystm.h> /* for NOMEMWAIT() */ 40 #include <sys/atomic.h> /* used to update kcage_freemem */ 41 #include <sys/kmem.h> /* for kmem_reap */ 42 #include <sys/errno.h> 43 #include <sys/mem_cage.h> 44 #include <vm/seg_kmem.h> 45 #include <vm/page.h> 46 #include <vm/hat.h> 47 #include <vm/vm_dep.h> 48 #include <sys/mem_config.h> 49 #include <sys/lgrp.h> 50 #include <sys/rwlock.h> 51 #include <sys/cpupart.h> 52 53 extern pri_t maxclsyspri; 54 55 #ifdef DEBUG 56 #define KCAGE_STATS 57 #endif 58 59 #ifdef KCAGE_STATS 60 61 #define KCAGE_STATS_VERSION 9 /* can help report generators */ 62 #define KCAGE_STATS_NSCANS 256 /* depth of scan statistics buffer */ 63 64 
struct kcage_stats_scan { 65 /* managed by KCAGE_STAT_* macros */ 66 clock_t scan_lbolt; 67 uint_t scan_id; 68 69 /* set in kcage_cageout() */ 70 uint_t kt_passes; 71 clock_t kt_ticks; 72 pgcnt_t kt_kcage_freemem_start; 73 pgcnt_t kt_kcage_freemem_end; 74 pgcnt_t kt_freemem_start; 75 pgcnt_t kt_freemem_end; 76 uint_t kt_examined; 77 uint_t kt_cantlock; 78 uint_t kt_gotone; 79 uint_t kt_gotonefree; 80 uint_t kt_skiplevel; 81 uint_t kt_skipshared; 82 uint_t kt_skiprefd; 83 uint_t kt_destroy; 84 85 /* set in kcage_invalidate_page() */ 86 uint_t kip_reloclocked; 87 uint_t kip_relocmod; 88 uint_t kip_destroy; 89 uint_t kip_nomem; 90 uint_t kip_demotefailed; 91 92 /* set in kcage_expand() */ 93 uint_t ke_wanted; 94 uint_t ke_examined; 95 uint_t ke_lefthole; 96 uint_t ke_gotone; 97 uint_t ke_gotonefree; 98 }; 99 100 struct kcage_stats { 101 /* managed by KCAGE_STAT_* macros */ 102 uint_t version; 103 uint_t size; 104 105 /* set in kcage_cageout */ 106 uint_t kt_wakeups; 107 uint_t kt_scans; 108 uint_t kt_cageout_break; 109 110 /* set in kcage_expand */ 111 uint_t ke_calls; 112 uint_t ke_nopfn; 113 uint_t ke_nopaget; 114 uint_t ke_isnoreloc; 115 uint_t ke_deleting; 116 uint_t ke_lowfreemem; 117 uint_t ke_terminate; 118 119 /* set in kcage_freemem_add() */ 120 uint_t kfa_trottlewake; 121 122 /* set in kcage_freemem_sub() */ 123 uint_t kfs_cagewake; 124 125 /* set in kcage_create_throttle */ 126 uint_t kct_calls; 127 uint_t kct_cageout; 128 uint_t kct_critical; 129 uint_t kct_exempt; 130 uint_t kct_cagewake; 131 uint_t kct_wait; 132 uint_t kct_progress; 133 uint_t kct_noprogress; 134 uint_t kct_timeout; 135 136 /* set in kcage_cageout_wakeup */ 137 uint_t kcw_expandearly; 138 139 /* managed by KCAGE_STAT_* macros */ 140 uint_t scan_array_size; 141 uint_t scan_index; 142 struct kcage_stats_scan scans[KCAGE_STATS_NSCANS]; 143 }; 144 145 static struct kcage_stats kcage_stats; 146 static struct kcage_stats_scan kcage_stats_scan_zero; 147 148 /* 149 * No real need for atomics 
here. For the most part the incs and sets are 150 * done by the kernel cage thread. There are a few that are done by any 151 * number of other threads. Those cases are noted by comments. 152 */ 153 #define KCAGE_STAT_INCR(m) kcage_stats.m++ 154 155 #define KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v) 156 157 #define KCAGE_STAT_INCR_SCAN(m) \ 158 KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m) 159 160 #define KCAGE_STAT_NINCR_SCAN(m, v) \ 161 KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v) 162 163 #define KCAGE_STAT_SET(m, v) kcage_stats.m = (v) 164 165 #define KCAGE_STAT_SETZ(m, v) \ 166 if (kcage_stats.m == 0) kcage_stats.m = (v) 167 168 #define KCAGE_STAT_SET_SCAN(m, v) \ 169 KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v) 170 171 #define KCAGE_STAT_SETZ_SCAN(m, v) \ 172 KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v) 173 174 #define KCAGE_STAT_INC_SCAN_INDEX \ 175 KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \ 176 KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \ 177 kcage_stats.scan_index = \ 178 (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \ 179 kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero 180 181 #define KCAGE_STAT_INIT_SCAN_INDEX \ 182 kcage_stats.version = KCAGE_STATS_VERSION; \ 183 kcage_stats.size = sizeof (kcage_stats); \ 184 kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \ 185 kcage_stats.scan_index = 0 186 187 #else /* KCAGE_STATS */ 188 189 #define KCAGE_STAT_INCR(v) 190 #define KCAGE_STAT_NINCR(m, v) 191 #define KCAGE_STAT_INCR_SCAN(v) 192 #define KCAGE_STAT_NINCR_SCAN(m, v) 193 #define KCAGE_STAT_SET(m, v) 194 #define KCAGE_STAT_SETZ(m, v) 195 #define KCAGE_STAT_SET_SCAN(m, v) 196 #define KCAGE_STAT_SETZ_SCAN(m, v) 197 #define KCAGE_STAT_INC_SCAN_INDEX 198 #define KCAGE_STAT_INIT_SCAN_INDEX 199 200 #endif /* KCAGE_STATS */ 201 202 static kmutex_t kcage_throttle_mutex; /* protects kcage_throttle_cv */ 203 static kcondvar_t kcage_throttle_cv; 204 205 static kmutex_t kcage_cageout_mutex; /* protects cv and 
ready flag */ 206 static kcondvar_t kcage_cageout_cv; /* cageout thread naps here */ 207 static int kcage_cageout_ready; /* nonzero when cageout thread ready */ 208 kthread_id_t kcage_cageout_thread; /* to aid debugging */ 209 210 static krwlock_t kcage_range_rwlock; /* protects kcage_glist elements */ 211 212 /* 213 * Cage expansion happens within a range. 214 */ 215 struct kcage_glist { 216 struct kcage_glist *next; 217 pfn_t base; 218 pfn_t lim; 219 pfn_t curr; 220 int decr; 221 }; 222 223 static struct kcage_glist *kcage_glist; 224 static struct kcage_glist *kcage_current_glist; 225 226 /* 227 * The firstfree element is provided so that kmem_alloc can be avoided 228 * until that cage has somewhere to go. This is not currently a problem 229 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va. 230 */ 231 static vmem_t *kcage_arena; 232 static struct kcage_glist kcage_glist_firstfree; 233 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree; 234 235 /* 236 * Miscellaneous forward references 237 */ 238 static struct kcage_glist *kcage_glist_alloc(void); 239 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **); 240 static void kcage_cageout(void); 241 static int kcage_invalidate_page(page_t *, pgcnt_t *); 242 static int kcage_setnoreloc_pages(page_t *, se_t); 243 static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t); 244 static void kcage_init(pgcnt_t preferred_size); 245 static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs); 246 247 /* 248 * Kernel Memory Cage counters and thresholds. 
249 */ 250 int kcage_on = 0; 251 pgcnt_t kcage_freemem; 252 pgcnt_t kcage_needfree; 253 pgcnt_t kcage_lotsfree; 254 pgcnt_t kcage_desfree; 255 pgcnt_t kcage_minfree; 256 pgcnt_t kcage_throttlefree; 257 pgcnt_t kcage_reserve; 258 int kcage_maxwait = 10; /* in seconds */ 259 260 /* when we use lp for kmem we start the cage at a higher initial value */ 261 pgcnt_t kcage_kmemlp_mincage; 262 263 #ifdef DEBUG 264 pgcnt_t kcage_pagets; 265 #define KCAGEPAGETS_INC() kcage_pagets++ 266 #else 267 #define KCAGEPAGETS_INC() 268 #endif 269 270 /* kstats to export what pages are currently caged */ 271 kmutex_t kcage_kstat_lock; 272 static int kcage_kstat_update(kstat_t *ksp, int rw); 273 static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw); 274 275 /* 276 * Startup and Dynamic Reconfiguration interfaces. 277 * kcage_range_add() 278 * kcage_range_del() 279 * kcage_range_delete_post_mem_del() 280 * kcage_range_init() 281 * kcage_set_thresholds() 282 */ 283 284 /* 285 * Called from page_get_contig_pages to get the approximate kcage pfn range 286 * for exclusion from search for contiguous pages. This routine is called 287 * without kcage_range lock (kcage routines can call page_get_contig_pages 288 * through page_relocate) and with the assumption, based on kcage_range_add, 289 * that kcage_current_glist always contain a valid pointer. 290 */ 291 292 int 293 kcage_current_pfn(pfn_t *pfncur) 294 { 295 struct kcage_glist *lp = kcage_current_glist; 296 297 ASSERT(kcage_on); 298 299 ASSERT(lp != NULL); 300 301 *pfncur = lp->curr; 302 303 return (lp->decr); 304 } 305 306 /* 307 * Called from vm_pagelist.c during coalesce to find kernel cage regions 308 * within an mnode. Looks for the lowest range between lo and hi. 309 * 310 * Kernel cage memory is defined between kcage_glist and kcage_current_glist. 311 * Non-cage memory is defined between kcage_current_glist and list end. 312 * 313 * If incage is set, returns the lowest kcage range. 
Otherwise returns lowest 314 * non-cage range. 315 * 316 * Returns zero on success and nlo, nhi: 317 * lo <= nlo < nhi <= hi 318 * Returns non-zero if no overlapping range is found. 319 */ 320 int 321 kcage_next_range(int incage, pfn_t lo, pfn_t hi, 322 pfn_t *nlo, pfn_t *nhi) 323 { 324 struct kcage_glist *lp; 325 pfn_t tlo = hi; 326 pfn_t thi = hi; 327 328 ASSERT(lo <= hi); 329 330 /* 331 * Reader lock protects the list, but kcage_get_pfn 332 * running concurrently may advance kcage_current_glist 333 * and also update kcage_current_glist->curr. Page 334 * coalesce can handle this race condition. 335 */ 336 rw_enter(&kcage_range_rwlock, RW_READER); 337 338 for (lp = incage ? kcage_glist : kcage_current_glist; 339 lp != NULL; lp = lp->next) { 340 341 pfn_t klo, khi; 342 343 /* find the range limits in this element */ 344 if ((incage && lp->decr) || (!incage && !lp->decr)) { 345 klo = lp->curr; 346 khi = lp->lim; 347 } else { 348 klo = lp->base; 349 khi = lp->curr; 350 } 351 352 /* handle overlap */ 353 if (klo < tlo && klo < khi && lo < khi && klo < hi) { 354 tlo = MAX(lo, klo); 355 thi = MIN(hi, khi); 356 if (tlo == lo) 357 break; 358 } 359 360 /* check end of kcage */ 361 if (incage && lp == kcage_current_glist) { 362 break; 363 } 364 } 365 366 rw_exit(&kcage_range_rwlock); 367 368 /* return non-zero if no overlapping range found */ 369 if (tlo == thi) 370 return (1); 371 372 ASSERT(lo <= tlo && tlo < thi && thi <= hi); 373 374 /* return overlapping range */ 375 *nlo = tlo; 376 *nhi = thi; 377 return (0); 378 } 379 380 void 381 kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size) 382 { 383 int ret = 0; 384 385 ASSERT(kcage_arena == NULL); 386 kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t), 387 segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP); 388 ASSERT(kcage_arena != NULL); 389 390 if (d == KCAGE_DOWN) { 391 while (ml->next != NULL) 392 ml = ml->next; 393 } 394 395 rw_enter(&kcage_range_rwlock, RW_WRITER); 396 397 
while (ml != NULL) { 398 ret = kcage_range_add_internal(btop(ml->address), 399 btop(ml->size), d); 400 if (ret) 401 panic("kcage_range_add_internal failed: " 402 "ml=%p, ret=0x%x\n", ml, ret); 403 404 ml = (d == KCAGE_DOWN ? ml->prev : ml->next); 405 } 406 407 rw_exit(&kcage_range_rwlock); 408 409 if (ret == 0) 410 kcage_init(preferred_size); 411 } 412 413 /* 414 * Third arg controls direction of growth: 0: increasing pfns, 415 * 1: decreasing. 416 */ 417 static int 418 kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d) 419 { 420 struct kcage_glist *new, **lpp; 421 pfn_t lim; 422 423 ASSERT(rw_write_held(&kcage_range_rwlock)); 424 425 ASSERT(npgs != 0); 426 if (npgs == 0) 427 return (EINVAL); 428 429 lim = base + npgs; 430 431 ASSERT(lim > base); 432 if (lim <= base) 433 return (EINVAL); 434 435 new = kcage_glist_alloc(); 436 if (new == NULL) { 437 return (ENOMEM); 438 } 439 440 new->base = base; 441 new->lim = lim; 442 new->decr = (d == KCAGE_DOWN); 443 if (new->decr != 0) 444 new->curr = new->lim; 445 else 446 new->curr = new->base; 447 /* 448 * Any overlapping existing ranges are removed by deleting 449 * from the new list as we search for the tail. 
450 */ 451 lpp = &kcage_glist; 452 while (*lpp != NULL) { 453 int ret; 454 ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new); 455 if (ret != 0) 456 return (ret); 457 lpp = &(*lpp)->next; 458 } 459 460 *lpp = new; 461 462 if (kcage_current_glist == NULL) { 463 kcage_current_glist = kcage_glist; 464 } 465 466 return (0); 467 } 468 469 int 470 kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d) 471 { 472 int ret; 473 474 rw_enter(&kcage_range_rwlock, RW_WRITER); 475 ret = kcage_range_add_internal(base, npgs, d); 476 rw_exit(&kcage_range_rwlock); 477 return (ret); 478 } 479 480 /* 481 * Calls to add and delete must be protected by kcage_range_rwlock 482 */ 483 static int 484 kcage_range_delete_internal(pfn_t base, pgcnt_t npgs) 485 { 486 struct kcage_glist *lp; 487 pfn_t lim; 488 489 ASSERT(rw_write_held(&kcage_range_rwlock)); 490 491 ASSERT(npgs != 0); 492 if (npgs == 0) 493 return (EINVAL); 494 495 lim = base + npgs; 496 497 ASSERT(lim > base); 498 if (lim <= base) 499 return (EINVAL); 500 501 /* 502 * Check if the delete is OK first as a number of elements 503 * might be involved and it will be difficult to go 504 * back and undo (can't just add the range back in). 505 */ 506 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 507 /* 508 * If there have been no pages allocated from this 509 * element, we don't need to check it. 510 */ 511 if ((lp->decr == 0 && lp->curr == lp->base) || 512 (lp->decr != 0 && lp->curr == lp->lim)) 513 continue; 514 /* 515 * If the element does not overlap, its OK. 516 */ 517 if (base >= lp->lim || lim <= lp->base) 518 continue; 519 /* 520 * Overlapping element: Does the range to be deleted 521 * overlap the area already used? If so fail. 
522 */ 523 if (lp->decr == 0 && base < lp->curr && lim >= lp->base) { 524 return (EBUSY); 525 } 526 if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) { 527 return (EBUSY); 528 } 529 } 530 return (kcage_glist_delete(base, lim, &kcage_glist)); 531 } 532 533 int 534 kcage_range_delete(pfn_t base, pgcnt_t npgs) 535 { 536 int ret; 537 538 rw_enter(&kcage_range_rwlock, RW_WRITER); 539 ret = kcage_range_delete_internal(base, npgs); 540 rw_exit(&kcage_range_rwlock); 541 return (ret); 542 } 543 544 /* 545 * Calls to add and delete must be protected by kcage_range_rwlock. 546 * This routine gets called after successful Solaris memory 547 * delete operation from DR post memory delete routines. 548 */ 549 static int 550 kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs) 551 { 552 pfn_t lim; 553 554 ASSERT(rw_write_held(&kcage_range_rwlock)); 555 556 ASSERT(npgs != 0); 557 if (npgs == 0) 558 return (EINVAL); 559 560 lim = base + npgs; 561 562 ASSERT(lim > base); 563 if (lim <= base) 564 return (EINVAL); 565 566 return (kcage_glist_delete(base, lim, &kcage_glist)); 567 } 568 569 int 570 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs) 571 { 572 int ret; 573 574 rw_enter(&kcage_range_rwlock, RW_WRITER); 575 ret = kcage_range_delete_post_mem_del_internal(base, npgs); 576 rw_exit(&kcage_range_rwlock); 577 return (ret); 578 } 579 580 /* 581 * No locking is required here as the whole operation is covered 582 * by kcage_range_rwlock writer lock. 
583 */ 584 static struct kcage_glist * 585 kcage_glist_alloc(void) 586 { 587 struct kcage_glist *new; 588 589 if ((new = kcage_glist_freelist) != NULL) { 590 kcage_glist_freelist = new->next; 591 } else { 592 new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP); 593 } 594 595 if (new != NULL) 596 bzero(new, sizeof (*new)); 597 598 return (new); 599 } 600 601 static void 602 kcage_glist_free(struct kcage_glist *lp) 603 { 604 lp->next = kcage_glist_freelist; 605 kcage_glist_freelist = lp; 606 } 607 608 static int 609 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp) 610 { 611 struct kcage_glist *lp, *prev = *lpp; 612 613 while ((lp = *lpp) != NULL) { 614 if (lim > lp->base && base < lp->lim) { 615 /* The delete range overlaps this element. */ 616 if (base <= lp->base && lim >= lp->lim) { 617 /* Delete whole element. */ 618 *lpp = lp->next; 619 if (lp == kcage_current_glist) { 620 /* This can never happen. */ 621 ASSERT(kcage_current_glist != prev); 622 kcage_current_glist = prev; 623 } 624 kcage_glist_free(lp); 625 continue; 626 } 627 628 /* Partial delete. */ 629 if (base > lp->base && lim < lp->lim) { 630 struct kcage_glist *new; 631 632 /* 633 * Remove a section from the middle, 634 * need to allocate a new element. 635 */ 636 new = kcage_glist_alloc(); 637 if (new == NULL) { 638 return (ENOMEM); 639 } 640 641 /* 642 * Tranfser unused range to new. 643 * Edit lp in place to preserve 644 * kcage_current_glist. 645 */ 646 new->decr = lp->decr; 647 if (new->decr != 0) { 648 new->base = lp->base; 649 new->lim = base; 650 new->curr = base; 651 652 lp->base = lim; 653 } else { 654 new->base = lim; 655 new->lim = lp->lim; 656 new->curr = new->base; 657 658 lp->lim = base; 659 } 660 661 /* Insert new. */ 662 new->next = lp->next; 663 lp->next = new; 664 lpp = &lp->next; 665 } else { 666 /* Delete part of current block. 
*/ 667 if (base > lp->base) { 668 ASSERT(lim >= lp->lim); 669 ASSERT(base < lp->lim); 670 if (lp->decr != 0 && 671 lp->curr == lp->lim) 672 lp->curr = base; 673 lp->lim = base; 674 } else { 675 ASSERT(base <= lp->base); 676 ASSERT(lim > lp->base); 677 if (lp->decr == 0 && 678 lp->curr == lp->base) 679 lp->curr = lim; 680 lp->base = lim; 681 } 682 } 683 } 684 prev = *lpp; 685 lpp = &(*lpp)->next; 686 } 687 688 return (0); 689 } 690 691 /* 692 * If lockit is 1, kcage_get_pfn holds the 693 * reader lock for kcage_range_rwlock. 694 * Changes to lp->curr can cause race conditions, but 695 * they are handled by higher level code (see kcage_next_range.) 696 */ 697 static pfn_t 698 kcage_get_pfn(int lockit) 699 { 700 struct kcage_glist *lp; 701 pfn_t pfn = PFN_INVALID; 702 703 if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER)) 704 return (pfn); 705 706 lp = kcage_current_glist; 707 while (lp != NULL) { 708 if (lp->decr != 0) { 709 if (lp->curr != lp->base) { 710 pfn = --lp->curr; 711 break; 712 } 713 } else { 714 if (lp->curr != lp->lim) { 715 pfn = lp->curr++; 716 break; 717 } 718 } 719 720 lp = lp->next; 721 if (lp) 722 kcage_current_glist = lp; 723 } 724 725 if (lockit) 726 rw_exit(&kcage_range_rwlock); 727 return (pfn); 728 } 729 730 /* 731 * Walk the physical address space of the cage. 732 * This routine does not guarantee to return PFNs in the order 733 * in which they were allocated to the cage. Instead, it walks 734 * each range as they appear on the growth list returning the PFNs 735 * range in ascending order. 736 * 737 * To begin scanning at lower edge of cage, reset should be nonzero. 738 * To step through cage, reset should be zero. 739 * 740 * PFN_INVALID will be returned when the upper end of the cage is 741 * reached -- indicating a full scan of the cage has been completed since 742 * previous reset. PFN_INVALID will continue to be returned until 743 * kcage_walk_cage is reset. 
744 * 745 * It is possible to receive a PFN_INVALID result on reset if a growth 746 * list is not installed or if none of the PFNs in the installed list have 747 * been allocated to the cage. In otherwords, there is no cage. 748 * 749 * Caller need not hold kcage_range_rwlock while calling this function 750 * as the front part of the list is static - pages never come out of 751 * the cage. 752 * 753 * The caller is expected to only be kcage_cageout(). 754 */ 755 static pfn_t 756 kcage_walk_cage(int reset) 757 { 758 static struct kcage_glist *lp = NULL; 759 static pfn_t pfn; 760 761 if (reset) 762 lp = NULL; 763 if (lp == NULL) { 764 lp = kcage_glist; 765 pfn = PFN_INVALID; 766 } 767 again: 768 if (pfn == PFN_INVALID) { 769 if (lp == NULL) 770 return (PFN_INVALID); 771 772 if (lp->decr != 0) { 773 /* 774 * In this range the cage grows from the highest 775 * address towards the lowest. 776 * Arrange to return pfns from curr to lim-1, 777 * inclusive, in ascending order. 778 */ 779 780 pfn = lp->curr; 781 } else { 782 /* 783 * In this range the cage grows from the lowest 784 * address towards the highest. 785 * Arrange to return pfns from base to curr, 786 * inclusive, in ascending order. 787 */ 788 789 pfn = lp->base; 790 } 791 } 792 793 if (lp->decr != 0) { /* decrementing pfn */ 794 if (pfn == lp->lim) { 795 /* Don't go beyond the static part of the glist. */ 796 if (lp == kcage_current_glist) 797 lp = NULL; 798 else 799 lp = lp->next; 800 pfn = PFN_INVALID; 801 goto again; 802 } 803 804 ASSERT(pfn >= lp->curr && pfn < lp->lim); 805 } else { /* incrementing pfn */ 806 if (pfn == lp->curr) { 807 /* Don't go beyond the static part of the glist. */ 808 if (lp == kcage_current_glist) 809 lp = NULL; 810 else 811 lp = lp->next; 812 pfn = PFN_INVALID; 813 goto again; 814 } 815 816 ASSERT(pfn >= lp->base && pfn < lp->curr); 817 } 818 819 return (pfn++); 820 } 821 822 /* 823 * Callback functions for to recalc cage thresholds after 824 * Kphysm memory add/delete operations. 
825 */ 826 /*ARGSUSED*/ 827 static void 828 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages) 829 { 830 kcage_recalc_thresholds(); 831 } 832 833 /*ARGSUSED*/ 834 static int 835 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages) 836 { 837 /* TODO: when should cage refuse memory delete requests? */ 838 return (0); 839 } 840 841 /*ARGSUSED*/ 842 static void 843 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled) 844 { 845 kcage_recalc_thresholds(); 846 } 847 848 static kphysm_setup_vector_t kcage_kphysm_vectors = { 849 KPHYSM_SETUP_VECTOR_VERSION, 850 kcage_kphysm_postadd_cb, 851 kcage_kphysm_predel_cb, 852 kcage_kphysm_postdel_cb 853 }; 854 855 /* 856 * This is called before a CPR suspend and after a CPR resume. We have to 857 * turn off kcage_cageout_ready before a suspend, and turn it back on after a 858 * restart. 859 */ 860 /*ARGSUSED*/ 861 static boolean_t 862 kcage_cageout_cpr(void *arg, int code) 863 { 864 if (code == CB_CODE_CPR_CHKPT) { 865 ASSERT(kcage_cageout_ready); 866 kcage_cageout_ready = 0; 867 return (B_TRUE); 868 } else if (code == CB_CODE_CPR_RESUME) { 869 ASSERT(kcage_cageout_ready == 0); 870 kcage_cageout_ready = 1; 871 return (B_TRUE); 872 } 873 return (B_FALSE); 874 } 875 876 /* 877 * kcage_recalc_preferred_size() increases initial cage size to improve large 878 * page availability when lp for kmem is enabled and kpr is disabled 879 */ 880 static pgcnt_t 881 kcage_recalc_preferred_size(pgcnt_t preferred_size) 882 { 883 if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) { 884 pgcnt_t lpmincage = kcage_kmemlp_mincage; 885 if (lpmincage == 0) { 886 lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8), 887 segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE; 888 } 889 kcage_kmemlp_mincage = MIN(lpmincage, 890 (segkmem_kmemlp_max / PAGESIZE)); 891 preferred_size = MAX(kcage_kmemlp_mincage, preferred_size); 892 } 893 return (preferred_size); 894 } 895 896 /* 897 * Kcage_init() builds the cage and initializes the cage 
thresholds. 898 * The size of the cage is determined by the argument preferred_size. 899 * or the actual amount of memory, whichever is smaller. 900 */ 901 static void 902 kcage_init(pgcnt_t preferred_size) 903 { 904 pgcnt_t wanted; 905 pfn_t pfn; 906 page_t *pp; 907 kstat_t *ksp; 908 909 extern struct vnode kvp; 910 extern void page_list_noreloc_startup(page_t *); 911 912 ASSERT(!kcage_on); 913 914 /* increase preferred cage size for lp for kmem */ 915 preferred_size = kcage_recalc_preferred_size(preferred_size); 916 917 /* Debug note: initialize this now so early expansions can stat */ 918 KCAGE_STAT_INIT_SCAN_INDEX; 919 920 /* 921 * Initialize cage thresholds and install kphysm callback. 922 * If we can't arrange to have the thresholds track with 923 * available physical memory, then the cage thresholds may 924 * end up over time at levels that adversly effect system 925 * performance; so, bail out. 926 */ 927 kcage_recalc_thresholds(); 928 if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) { 929 ASSERT(0); /* Catch this in DEBUG kernels. */ 930 return; 931 } 932 933 /* 934 * Limit startup cage size within the range of kcage_minfree 935 * and availrmem, inclusively. 936 */ 937 wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem); 938 939 /* 940 * Construct the cage. PFNs are allocated from the glist. It 941 * is assumed that the list has been properly ordered for the 942 * platform by the platform code. Typically, this is as simple 943 * as calling kcage_range_init(phys_avail, decr), where decr is 944 * 1 if the kernel has been loaded into upper end of physical 945 * memory, or 0 if the kernel has been loaded at the low end. 946 * 947 * Note: it is assumed that we are in the startup flow, so there 948 * is no reason to grab the page lock. 
949 */ 950 kcage_freemem = 0; 951 pfn = PFN_INVALID; /* prime for alignment test */ 952 while (wanted != 0) { 953 if ((pfn = kcage_get_pfn(0)) == PFN_INVALID) 954 break; 955 956 if ((pp = page_numtopp_nolock(pfn)) != NULL) { 957 KCAGEPAGETS_INC(); 958 /* 959 * Set the noreloc state on the page. 960 * If the page is free and not already 961 * on the noreloc list then move it. 962 */ 963 if (PP_ISFREE(pp)) { 964 if (PP_ISNORELOC(pp) == 0) 965 page_list_noreloc_startup(pp); 966 } else { 967 ASSERT(pp->p_szc == 0); 968 PP_SETNORELOC(pp); 969 } 970 } 971 PLCNT_XFER_NORELOC(pp); 972 wanted -= 1; 973 } 974 975 /* 976 * Need to go through and find kernel allocated pages 977 * and capture them into the Cage. These will primarily 978 * be pages gotten through boot_alloc(). 979 */ 980 if (kvp.v_pages) { 981 982 pp = kvp.v_pages; 983 do { 984 ASSERT(!PP_ISFREE(pp)); 985 ASSERT(pp->p_szc == 0); 986 if (PP_ISNORELOC(pp) == 0) { 987 PP_SETNORELOC(pp); 988 PLCNT_XFER_NORELOC(pp); 989 } 990 } while ((pp = pp->p_vpnext) != kvp.v_pages); 991 992 } 993 994 kcage_on = 1; 995 996 /* 997 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend() 998 * after the cageout thread is blocked, and executes from cpr_resume() 999 * before the cageout thread is restarted. By executing in this class, 1000 * we are assured that the kernel cage thread won't miss wakeup calls 1001 * and also CPR's larger kmem_alloc requests will not fail after 1002 * CPR shuts down the cageout kernel thread. 1003 */ 1004 (void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL, 1005 "cageout"); 1006 1007 /* 1008 * Coalesce pages to improve large page availability. 
A better fix 1009 * would to coalesce pages as they are included in the cage 1010 */ 1011 if (SEGKMEM_USE_LARGEPAGES) { 1012 extern void page_freelist_coalesce_all(int mnode); 1013 page_freelist_coalesce_all(-1); /* do all mnodes */ 1014 } 1015 1016 ksp = kstat_create("kcage", 0, "kcage_page_list", "misc", 1017 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL); 1018 if (ksp != NULL) { 1019 ksp->ks_update = kcage_kstat_update; 1020 ksp->ks_snapshot = kcage_kstat_snapshot; 1021 ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */ 1022 kstat_install(ksp); 1023 } 1024 } 1025 1026 static int 1027 kcage_kstat_update(kstat_t *ksp, int rw) 1028 { 1029 struct kcage_glist *lp; 1030 uint_t count; 1031 1032 if (rw == KSTAT_WRITE) 1033 return (EACCES); 1034 1035 count = 0; 1036 rw_enter(&kcage_range_rwlock, RW_WRITER); 1037 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 1038 if (lp->decr) { 1039 if (lp->curr != lp->lim) { 1040 count++; 1041 } 1042 } else { 1043 if (lp->curr != lp->base) { 1044 count++; 1045 } 1046 } 1047 } 1048 rw_exit(&kcage_range_rwlock); 1049 1050 ksp->ks_ndata = count; 1051 ksp->ks_data_size = count * 2 * sizeof (uint64_t); 1052 1053 return (0); 1054 } 1055 1056 static int 1057 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw) 1058 { 1059 struct kcage_glist *lp; 1060 struct memunit { 1061 uint64_t address; 1062 uint64_t size; 1063 } *kspmem; 1064 1065 if (rw == KSTAT_WRITE) 1066 return (EACCES); 1067 1068 ksp->ks_snaptime = gethrtime(); 1069 1070 kspmem = (struct memunit *)buf; 1071 rw_enter(&kcage_range_rwlock, RW_WRITER); 1072 for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) { 1073 if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) 1074 break; 1075 1076 if (lp->decr) { 1077 if (lp->curr != lp->lim) { 1078 kspmem->address = ptob(lp->curr); 1079 kspmem->size = ptob(lp->lim - lp->curr); 1080 } 1081 } else { 1082 if (lp->curr != lp->base) { 1083 kspmem->address = ptob(lp->base); 1084 kspmem->size = ptob(lp->curr 
- lp->base); 1085 } 1086 } 1087 } 1088 rw_exit(&kcage_range_rwlock); 1089 1090 return (0); 1091 } 1092 1093 void 1094 kcage_recalc_thresholds() 1095 { 1096 static int first = 1; 1097 static pgcnt_t init_lotsfree; 1098 static pgcnt_t init_desfree; 1099 static pgcnt_t init_minfree; 1100 static pgcnt_t init_throttlefree; 1101 static pgcnt_t init_reserve; 1102 1103 /* TODO: any reason to take more care than this with live editing? */ 1104 mutex_enter(&kcage_cageout_mutex); 1105 mutex_enter(&freemem_lock); 1106 1107 if (first) { 1108 first = 0; 1109 init_lotsfree = kcage_lotsfree; 1110 init_desfree = kcage_desfree; 1111 init_minfree = kcage_minfree; 1112 init_throttlefree = kcage_throttlefree; 1113 init_reserve = kcage_reserve; 1114 } else { 1115 kcage_lotsfree = init_lotsfree; 1116 kcage_desfree = init_desfree; 1117 kcage_minfree = init_minfree; 1118 kcage_throttlefree = init_throttlefree; 1119 kcage_reserve = init_reserve; 1120 } 1121 1122 if (kcage_lotsfree == 0) 1123 kcage_lotsfree = MAX(32, total_pages / 256); 1124 1125 if (kcage_minfree == 0) 1126 kcage_minfree = MAX(32, kcage_lotsfree / 2); 1127 1128 if (kcage_desfree == 0) 1129 kcage_desfree = MAX(32, kcage_minfree); 1130 1131 if (kcage_throttlefree == 0) 1132 kcage_throttlefree = MAX(32, kcage_minfree / 2); 1133 1134 if (kcage_reserve == 0) 1135 kcage_reserve = MIN(32, kcage_throttlefree / 2); 1136 1137 mutex_exit(&freemem_lock); 1138 mutex_exit(&kcage_cageout_mutex); 1139 1140 if (kcage_cageout_ready) { 1141 if (kcage_freemem < kcage_desfree) 1142 kcage_cageout_wakeup(); 1143 1144 if (kcage_needfree) { 1145 mutex_enter(&kcage_throttle_mutex); 1146 cv_broadcast(&kcage_throttle_cv); 1147 mutex_exit(&kcage_throttle_mutex); 1148 } 1149 } 1150 } 1151 1152 /* 1153 * Pageout interface: 1154 * kcage_cageout_init() 1155 */ 1156 void 1157 kcage_cageout_init() 1158 { 1159 if (kcage_on) { 1160 1161 (void) thread_create(NULL, 0, kcage_cageout, 1162 NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1); 1163 } 1164 } 1165 1166 
/*
 * VM Interfaces:
 *	kcage_create_throttle()
 *	kcage_freemem_add()
 *	kcage_freemem_sub()
 */

/*
 * Wakeup cageout thread and throttle waiting for the number of pages
 * requested to become available.  For non-critical requests, a
 * timeout is added, since freemem accounting is separate from cage
 * freemem accounting: it's possible for us to get stuck and not make
 * forward progress even though there was sufficient freemem before
 * arriving here.
 *
 * Returns:
 *	KCT_CRIT	the caller is not throttled: it is the cageout
 *			thread, panicstr is set, NOMEMWAIT() is true and
 *			either the cage had room at entry or freemem is
 *			below minfree, or the caller's priority is above
 *			maxclsyspri and kcage_freemem exceeds kcage_reserve.
 *	KCT_FAILURE	PG_WAIT was not set in flags and kcage_maxwait
 *			consecutive iterations saw no rise in kcage_freemem.
 *	KCT_NONCRIT	the wait loop ended because kcage_freemem reached
 *			kcage_throttlefree + npages (this includes the case
 *			where the loop body never ran).
 */
int
kcage_create_throttle(pgcnt_t npages, int flags)
{
	int niter = 0;		/* consecutive iterations without progress */
	pgcnt_t lastfree;	/* kcage_freemem sampled at top of iteration */
	/* Sampled once at entry; not re-evaluated below. */
	int enough = kcage_freemem > kcage_throttlefree + npages;

	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */

	kcage_cageout_wakeup();			/* just to be sure */
	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */

	/*
	 * Obviously, we can't throttle the cageout thread since
	 * we depend on it.  We also can't throttle the panic thread.
	 */
	if (curthread == kcage_cageout_thread || panicstr) {
		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Don't throttle threads which are critical for proper
	 * vm management if we're above kcage_throttlefree or
	 * if freemem is very low.
	 */
	if (NOMEMWAIT()) {
		if (enough) {
			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
			return (KCT_CRIT);
		} else if (freemem < minfree) {
			KCAGE_STAT_INCR(kct_critical);	/* unprotected incr. */
			return (KCT_CRIT);
		}
	}

	/*
	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
	 */
	if (DISP_PRIO(curthread) > maxclsyspri &&
	    kcage_freemem > kcage_reserve) {
		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Cause all other threads (which are assumed to not be
	 * critical to cageout) to wait here until their request
	 * can be satisfied. Be a little paranoid and wake the
	 * kernel cage on each loop through this logic.
	 */
	while (kcage_freemem < kcage_throttlefree + npages) {
		ASSERT(kcage_on);

		lastfree = kcage_freemem;

		if (kcage_cageout_ready) {
			/*
			 * Advertise the demand in kcage_needfree for the
			 * duration of the wait, then withdraw it.
			 */
			mutex_enter(&kcage_throttle_mutex);

			kcage_needfree += npages;
			KCAGE_STAT_INCR(kct_wait);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);

			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);

			kcage_needfree -= npages;

			mutex_exit(&kcage_throttle_mutex);
		} else {
			/*
			 * NOTE: atomics are used just in case we enter
			 * mp operation before the cageout thread is ready.
			 */
			atomic_add_long(&kcage_needfree, npages);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */

			atomic_add_long(&kcage_needfree, -npages);
		}

		/*
		 * Only callers that did not pass PG_WAIT can time out.
		 * Any rise in kcage_freemem resets the counter.
		 */
		if ((flags & PG_WAIT) == 0) {
			if (kcage_freemem > lastfree) {
				KCAGE_STAT_INCR(kct_progress);
				niter = 0;
			} else {
				KCAGE_STAT_INCR(kct_noprogress);
				if (++niter >= kcage_maxwait) {
					KCAGE_STAT_INCR(kct_timeout);
					return (KCT_FAILURE);
				}
			}
		}

		/* Re-check the low-freemem exemption on every iteration. */
		if (NOMEMWAIT() && freemem < minfree) {
			return (KCT_CRIT);
		}

	}
	return (KCT_NONCRIT);
}

/*
 * Atomically add npages to kcage_freemem, call wakeup_pcgs(), and, if
 * there is outstanding demand (kcage_needfree != 0) that the cage can
 * now cover, broadcast on kcage_throttle_cv.
 *
 * The test of kcage_needfree and kcage_freemem is made before
 * kcage_throttle_mutex is taken.
 */
void
kcage_freemem_add(pgcnt_t npages)
{
	extern void wakeup_pcgs(void);

	atomic_add_long(&kcage_freemem, npages);

	wakeup_pcgs();  /* wakeup threads in pcgs() */

	if (kcage_needfree != 0 &&
	    kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {

		mutex_enter(&kcage_throttle_mutex);
		cv_broadcast(&kcage_throttle_cv);
		KCAGE_STAT_INCR(kfa_trottlewake);
		mutex_exit(&kcage_throttle_mutex);
	}
}

/*
 * Atomically subtract npages from kcage_freemem (by adding -npages) and
 * wake the cageout thread if the result is below kcage_desfree.
 */
void
kcage_freemem_sub(pgcnt_t npages)
{
	atomic_add_long(&kcage_freemem, -npages);

	if (kcage_freemem < kcage_desfree) {
		kcage_cageout_wakeup();
		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
	}
}

/*
 * Set P_NORELOC on rootpp and, when rootpp->p_szc is non-zero, on every
 * constituent page of the large page rooted at rootpp.
 *
 * On entry rootpp must not be free and must be locked in mode se (both
 * are asserted).
 *
 * return 0 on failure and 1 on success.  The only failure is
 * group_page_trylock() failing.
 */
static int
kcage_setnoreloc_pages(page_t *rootpp, se_t se)
{
	pgcnt_t npgs, i;
	page_t *pp;
	pfn_t rootpfn = page_pptonum(rootpp);
	uint_t szc;

	ASSERT(!PP_ISFREE(rootpp));
	ASSERT(PAGE_LOCKED_SE(rootpp, se));
	if (!group_page_trylock(rootpp, se)) {
		return (0);
	}
	szc = rootpp->p_szc;
	if (szc == 0) {
		/*
		 * The szc of a locked page can only change for pages that are
		 * non-swapfs (i.e. anonymous memory) file system pages.
		 */
		ASSERT(rootpp->p_vnode != NULL &&
		    !PP_ISKAS(rootpp) &&
		    !IS_SWAPFSVP(rootpp->p_vnode));
		PP_SETNORELOC(rootpp);
		/*
		 * NOTE(review): this path returns without calling
		 * group_page_unlock(); presumably group_page_trylock()
		 * takes no additional locks when p_szc is 0 -- confirm
		 * against its implementation.
		 */
		return (1);
	}
	npgs = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
	pp = rootpp;
	for (i = 0; i < npgs; i++, pp++) {
		ASSERT(PAGE_LOCKED_SE(pp, se));
		ASSERT(!PP_ISFREE(pp));
		ASSERT(pp->p_szc == szc);
		PP_SETNORELOC(pp);
	}
	group_page_unlock(rootpp);
	return (1);
}

/*
 * Attempt to convert page to a caged page (set the P_NORELOC flag).
 * If successful and the page is free, move page to the tail of whichever
 * list it is on.
 * Returns:
 *	EBUSY		page already locked, assimilated but not free.
 *	ENOMEM		page assimilated, but memory too low to relocate.
 *			Page not free.
 *	EAGAIN		page not assimilated. Page not free.
 *	ERANGE		page assimilated. Page not root.
 *	0		page assimilated. Page free.
 *	*nfreedp	number of pages freed.
 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
 * to distinguish between a page that was already a NORELOC page from
 * those newly converted to NORELOC pages by this invocation of
 * kcage_assimilate_page.
 *
 * NOTE(review): the statements in this function return only 0, EBUSY,
 * EAGAIN, or whatever kcage_invalidate_page() returns; ENOMEM and ERANGE
 * can therefore only originate below kcage_invalidate_page().  *nfreedp
 * is not written on the EBUSY and EAGAIN returns made directly here.
 *
 * The page lock taken here is always released before returning, either
 * directly or by kcage_invalidate_page().
 */
static int
kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
{
	if (page_trylock(pp, SE_EXCL)) {
		if (PP_ISNORELOC(pp)) {
			/*
			 * Already caged: nothing to convert.  Also the
			 * target of the goto from the shared-lock path.
			 */
check_free_and_return:
			if (PP_ISFREE(pp)) {
				page_unlock(pp);
				*nfreedp = 0;
				return (0);
			} else {
				page_unlock(pp);
				return (EBUSY);
			}
			/*NOTREACHED*/
		}
	} else {
		/* Exclusive lock unavailable; fall back to a shared lock. */
		if (page_trylock(pp, SE_SHARED)) {
			if (PP_ISNORELOC(pp))
				goto check_free_and_return;
		} else
			return (EAGAIN);

		/* Under a shared lock only free pages are pursued. */
		if (!PP_ISFREE(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}

		/*
		 * Need to upgrade the lock on it and set the NORELOC
		 * bit.  If it is free then remove it from the free
		 * list so that the platform free list code can keep
		 * NORELOC pages where they should be.
		 */
		/*
		 * Before doing anything, get the exclusive lock.
		 * This may fail (eg ISM pages are left shared locked).
		 * If the page is free this will leave a hole in the
		 * cage.  There is no solution yet to this.
		 */
		if (!page_tryupgrade(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}
	}

	/* Both paths that reach here hold the page exclusively. */
	ASSERT(PAGE_EXCL(pp));

	if (PP_ISFREE(pp)) {
		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;

		/*
		 * Take the page off its list, mark it, and put it back at
		 * the tail of the same kind of list.
		 */
		page_list_sub(pp, which);
		ASSERT(pp->p_szc == 0);
		PP_SETNORELOC(pp);
		PLCNT_XFER_NORELOC(pp);
		page_list_add(pp, which | PG_LIST_TAIL);

		page_unlock(pp);
		*nfreedp = 1;
		return (0);
	} else {
		if (pp->p_szc != 0) {
			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
				page_unlock(pp);
				return (EAGAIN);
			}
			ASSERT(PP_ISNORELOC(pp));
		} else {
			PP_SETNORELOC(pp);
		}
		PLCNT_XFER_NORELOC(pp);
		/* kcage_invalidate_page() releases the page lock. */
		return (kcage_invalidate_page(pp, nfreedp));
	}
	/*NOTREACHED*/
}

/*
 * Grow the cage by assimilating pages obtained from kcage_get_pfn().
 *
 * Returns non-zero if at least one page was assimilated (free or not),
 * zero otherwise.
 */
static int
kcage_expand()
{
	int did_something = 0;

	spgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	/* TODO: we don't really need n any more? */
	pgcnt_t n;
	pgcnt_t nf, nfreed;

	/*
	 * Expand the cage if available cage memory is really low. Calculate
	 * the amount required to return kcage_freemem to the level of
	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
	 * more.  It is rare for their sum to create an artificial threshold
	 * above kcage_lotsfree, but it is possible.
	 *
	 * Exit early if expansion amount is equal to or less than zero.
	 * (<0 is possible if kcage_freemem rises suddenly.)
	 *
	 * Exit early when the global page pool (apparently) does not
	 * have enough free pages to page_relocate() even a single page.
	 */
	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
	    - kcage_freemem;
	if (wanted <= 0)
		return (0);
	else if (freemem < pageout_reserve + 1) {
		KCAGE_STAT_INCR(ke_lowfreemem);
		return (0);
	}

	KCAGE_STAT_INCR(ke_calls);
	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);

	/*
	 * Assimilate more pages from the global page pool into the cage.
	 */
	n = 0;				/* number of pages PP_SETNORELOC'd */
	nf = 0;				/* number of those actually free */
	while (kcage_on && nf < wanted) {
		pfn = kcage_get_pfn(1);
		if (pfn == PFN_INVALID) {	/* eek! nowhere to grow */
			KCAGE_STAT_INCR(ke_nopfn);
			goto terminate;
		}

		KCAGE_STAT_INCR_SCAN(ke_examined);

		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
			KCAGE_STAT_INCR(ke_nopaget);
			continue;
		}
		KCAGEPAGETS_INC();
		/*
		 * Sanity check. Skip this pfn if it is
		 * being deleted.
		 */
		if (pfn_is_being_deleted(pfn)) {
			KCAGE_STAT_INCR(ke_deleting);
			continue;
		}

		/* Unlocked peek; already-caged pages need no work. */
		if (PP_ISNORELOC(pp)) {
			KCAGE_STAT_INCR(ke_isnoreloc);
			continue;
		}

		switch (kcage_assimilate_page(pp, &nfreed)) {
			case 0:		/* assimilated, page is free */
				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
				did_something = 1;
				nf += nfreed;
				n++;
				break;

			case EBUSY:	/* assimilated, page not free */
			case ERANGE:	/* assimilated, page not root */
				KCAGE_STAT_INCR_SCAN(ke_gotone);
				did_something = 1;
				n++;
				break;

			case ENOMEM:	/* assimilated, but no mem */
				KCAGE_STAT_INCR(ke_terminate);
				did_something = 1;
				n++;
				goto terminate;

			case EAGAIN:	/* can't assimilate */
				KCAGE_STAT_INCR_SCAN(ke_lefthole);
				break;

			default:	/* catch this with debug kernels */
				ASSERT(0);
				break;
		}
	}

	/*
	 * Realign cage edge with the nearest physical address
	 * boundary for big pages.  This is done to give us a
	 * better chance of actually getting usable big pages
	 * in the cage.
	 *
	 * NOTE(review): no realignment code follows this comment; control
	 * simply falls through to the terminate label.
	 */

terminate:

	return (did_something);
}

/*
 * Relocate page opp (Original Page Pointer) from cage pool to page rpp
 * (Replacement Page Pointer) in the global pool. Page opp will be freed
 * if relocation is successful, otherwise it is only unlocked.
 * On entry, page opp must be exclusively locked and not free.
 * *nfreedp: number of pages freed.
 *
 * Returns 0 on success, otherwise the result of page_relocate().
 *
 * NOTE(review): *nfreedp is assigned from npgs before result is
 * checked; whether page_relocate() always stores to npgs on failure is
 * not visible here -- confirm.
 */
static int
kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
{
	page_t *opp = pp;
	page_t *rpp = NULL;
	spgcnt_t npgs;
	int result;

	ASSERT(!PP_ISFREE(opp));
	ASSERT(PAGE_EXCL(opp));

	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
	*nfreedp = npgs;
	if (result == 0) {
		/* Unlink and unlock each replacement page in turn. */
		while (npgs-- > 0) {
			page_t *tpp;

			ASSERT(rpp != NULL);
			tpp = rpp;
			page_sub(&rpp, tpp);
			page_unlock(tpp);
		}

		ASSERT(rpp == NULL);

		return (0);		/* success */
	}

	page_unlock(opp);
	return (result);
}

/*
 * Based on page_invalidate_pages()
 *
 * Kcage_invalidate_page() uses page_relocate() twice (via
 * kcage_relocate_page()). Both instances of use must be updated to
 * match the new page_relocate() when it becomes available.
 *
 * Return result of kcage_relocate_page or zero if page was directly freed.
 * EAGAIN is returned when page_try_demote_pages() fails.
 * *nfreedp: number of pages freed.
 *
 * On entry the page must be exclusively locked and not free (asserted).
 * Every path either unlocks the page explicitly, destroys it, or hands
 * it to kcage_relocate_page().
 */
static int
kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
{
	int result;

#if defined(__sparc)
	extern struct vnode prom_ppages;
	ASSERT(pp->p_vnode != &prom_ppages);
#endif /* __sparc */

	ASSERT(!PP_ISFREE(pp));
	ASSERT(PAGE_EXCL(pp));

	/*
	 * Is this page involved in some I/O? shared?
	 * The page_struct_lock need not be acquired to
	 * examine these fields since the page has an
	 * "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
		if (result == 0)
			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
		else if (result == ENOMEM)
			KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
		return (result);
	}

	ASSERT(pp->p_vnode->v_type != VCHR);

	/*
	 * Unload the mappings and check if mod bit is set.
	 */
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);

	/* A modified page is relocated rather than destroyed. */
	if (hat_ismod(pp)) {
		result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
		if (result == 0)
			KCAGE_STAT_INCR_SCAN(kip_relocmod);
		else if (result == ENOMEM)
			KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
		return (result);
	}

	if (!page_try_demote_pages(pp)) {
		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
		page_unlock(pp);
		return (EAGAIN);
	}

	page_destroy(pp, 0);
	KCAGE_STAT_INCR_SCAN(kip_destroy);
	*nfreedp = 1;
	return (0);
}

/*
 * Body of the cageout thread.  Never returns: it waits on
 * kcage_cageout_cv, walks the cage trying to free or relocate pages,
 * optionally expands the cage, and goes back to waiting.
 *
 * kcage_cageout_mutex is taken once at the top; nothing in this function
 * releases it explicitly, and it is the mutex passed to cv_wait().
 */
static void
kcage_cageout()
{
	pfn_t pfn;
	page_t *pp;
	callb_cpr_t cprinfo;
	int did_something;	/* this scan freed/assimilated something */
	int scan_again;
	pfn_t start_pfn;	/* first pfn seen; marks a full lap */
	int pass;		/* completed laps since the last wakeup */
	int last_pass;
	int pages_skipped;
	int shared_skipped;
	ulong_t shared_level = 8;	/* passed to hat_page_checkshare() */
	pgcnt_t nfreed;
#ifdef KCAGE_STATS
	clock_t scan_start;
#endif

	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
	    callb_generic_cpr, "cageout");

	mutex_enter(&kcage_cageout_mutex);
	kcage_cageout_thread = curthread;

	pfn = PFN_INVALID;		/* force scan reset */
	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */

loop:
	/*
	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
	 * that kcage_freemem is less than kcage_desfree. When it does
	 * notice, kcage_freemem_sub() will wake us up via call to
	 * kcage_cageout_wakeup().
	 */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);

	KCAGE_STAT_INCR(kt_wakeups);
	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
	pass = 0;
	last_pass = 0;

#ifdef KCAGE_STATS
	scan_start = lbolt;
#endif

again:
	if (!kcage_on)
		goto loop;

	KCAGE_STAT_INCR(kt_scans);
	KCAGE_STAT_INCR_SCAN(kt_passes);

	did_something = 0;
	pages_skipped = 0;
	shared_skipped = 0;
	/*
	 * Walk the cage while memory is still wanted.  The walk is
	 * restarted whenever pfn is PFN_INVALID.
	 */
	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
	    (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {

		if (start_pfn == PFN_INVALID)
			start_pfn = pfn;
		else if (start_pfn == pfn) {
			last_pass = pass;
			pass += 1;
			/*
			 * Did a complete walk of kernel cage, but didn't free
			 * any pages.  If only one cpu is active then
			 * stop kernel cage walk and try expanding.
			 */
			if (cp_default.cp_ncpus == 1 && did_something == 0) {
				KCAGE_STAT_INCR(kt_cageout_break);
				break;
			}
		}

		pp = page_numtopp_nolock(pfn);
		if (pp == NULL) {
			continue;
		}

		KCAGE_STAT_INCR_SCAN(kt_examined);

		/*
		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
		 * of the lock. If one is missed it will be seen next
		 * time through.
		 *
		 * Skip non-caged-pages. These pages can exist in the cage
		 * because, if during cage expansion, a page is
		 * encountered that is long-term locked the lock prevents the
		 * expansion logic from setting the P_NORELOC flag. Hence,
		 * non-caged-pages surrounded by caged-pages.
		 */
		if (!PP_ISNORELOC(pp)) {
			/* Not yet caged: try to pull it into the cage. */
			switch (kcage_assimilate_page(pp, &nfreed)) {
				case 0:
					did_something = 1;
					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
					    nfreed);
					break;

				case EBUSY:
				case ERANGE:
					did_something = 1;
					KCAGE_STAT_INCR_SCAN(kt_gotone);
					break;

				case EAGAIN:
				case ENOMEM:
					break;

				default:
					/* catch this with debug kernels */
					ASSERT(0);
					break;
			}

			continue;
		} else {
			int prm;

			if (PP_ISFREE(pp)) {
				continue;
			}

			if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
			    !page_trylock(pp, SE_EXCL)) {
				KCAGE_STAT_INCR_SCAN(kt_cantlock);
				continue;
			}

			/* P_NORELOC bit should not have gone away. */
			ASSERT(PP_ISNORELOC(pp));
			/* Repeat the unlocked checks now the lock is held. */
			if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
			    pp->p_lckcnt > 0)) {
				page_unlock(pp);
				continue;
			}

			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
			if (hat_page_checkshare(pp, shared_level)) {
				page_unlock(pp);
				pages_skipped = 1;
				shared_skipped = 1;
				KCAGE_STAT_INCR_SCAN(kt_skipshared);
				continue;
			}

			/*
			 * In pass {0, 1}, skip page if ref bit is set.
			 * In pass {0, 1, 2}, skip page if mod bit is set.
			 */
			prm = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);

			/* On first pass ignore ref'd pages */
			if (pass <= 1 && (prm & P_REF)) {
				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
				pages_skipped = 1;
				page_unlock(pp);
				continue;
			}

			/* On pass 2, page_destroy if mod bit is not set */
			if (pass <= 2) {
				if (pp->p_szc != 0 || (prm & P_MOD) ||
				    pp->p_lckcnt || pp->p_cowcnt) {
					pages_skipped = 1;
					page_unlock(pp);
				} else {

					/*
					 * unload the mappings before
					 * checking if mod bit is set
					 */
					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);

					/*
					 * skip this page if modified
					 */
					if (hat_ismod(pp)) {
						pages_skipped = 1;
						page_unlock(pp);
						continue;
					}

					KCAGE_STAT_INCR_SCAN(kt_destroy);
					page_destroy(pp, 0);
					did_something = 1;
				}
				continue;
			}

			/* Later passes: relocate or free. */
			if (kcage_invalidate_page(pp, &nfreed) == 0) {
				did_something = 1;
				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
			}

			/*
			 * No need to drop the page lock here.
			 * Kcage_invalidate_page has done that for us
			 * either explicitly or through a page_free.
			 */
		}
	}

	/*
	 * Expand the cage only if available cage memory is really low.
	 * This test is done only after a complete scan of the cage.
	 * The reason for not checking and expanding more often is to
	 * avoid rapid expansion of the cage. Naturally, scanning the
	 * cage takes time. So by scanning first, we use that work as a
	 * delay loop in between expand decisions.
	 */

	scan_again = 0;
	if (kcage_freemem < kcage_minfree || kcage_needfree) {
		/*
		 * Kcage_expand() will return a non-zero value if it was
		 * able to expand the cage -- whether or not the new
		 * pages are free and immediately usable. If non-zero,
		 * we do another scan of the cage. The pages might be
		 * freed during that scan or by the time we get back here.
		 * If not, we will attempt another expansion.
		 * However, if kcage_expand() returns zero, then it was
		 * unable to expand the cage. This is the case when the
		 * growth list is exhausted, therefore no work was done
		 * and there is no reason to scan the cage again.
		 * Note: Kernel cage scan is not repeated when only one
		 * cpu is active to avoid kernel cage thread hogging cpu.
		 *
		 * NOTE(review): the return value of kcage_expand() is
		 * discarded below and scan_again is left at zero on that
		 * branch, so the rescan described above does not follow
		 * from this code as written.
		 */
		if (pass <= 3 && pages_skipped && cp_default.cp_ncpus > 1)
			scan_again = 1;
		else
			(void) kcage_expand(); /* don't scan again */
	} else if (kcage_freemem < kcage_lotsfree) {
		/*
		 * If available cage memory is less than abundant
		 * and a full scan of the cage has not yet been completed,
		 * or a scan has completed and some work was performed,
		 * or pages were skipped because of sharing,
		 * or we simply have not yet completed two passes,
		 * then do another scan.
		 */
		if (pass <= 2 && pages_skipped)
			scan_again = 1;
		if (pass == last_pass || did_something)
			scan_again = 1;
		else if (shared_skipped && shared_level < (8<<24)) {
			/* Tolerate more sharing on the next scan. */
			shared_level <<= 1;
			scan_again = 1;
		}
	}

	if (scan_again && cp_default.cp_ncpus > 1)
		goto again;
	else {
		/* Back off the sharing level, but not below its start. */
		if (shared_level > 8)
			shared_level >>= 1;

		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
		/*
		 * scan_start exists only under KCAGE_STATS; presumably
		 * this macro expands to nothing otherwise.
		 */
		KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start);
		KCAGE_STAT_INC_SCAN_INDEX;
		goto loop;
	}

	/*NOTREACHED*/
}

/*
 * Prod the cageout machinery.  Uses mutex_tryenter() and does nothing
 * if kcage_cageout_mutex cannot be taken at once.
 *
 * If the cageout thread is ready it is signalled.  If it is not ready
 * and cage memory is below kcage_minfree or there is outstanding demand,
 * the calling thread runs kcage_expand() itself.
 */
void
kcage_cageout_wakeup()
{
	if (mutex_tryenter(&kcage_cageout_mutex)) {
		if (kcage_cageout_ready) {
			cv_signal(&kcage_cageout_cv);
		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
			/*
			 * Available cage memory is really low. Time to
			 * start expanding the cage. However, the
			 * kernel cage thread is not yet ready to
			 * do the work. Use *this* thread, which is
			 * most likely to be t0, to do the work.
			 */
			KCAGE_STAT_INCR(kcw_expandearly);
			(void) kcage_expand();
			KCAGE_STAT_INC_SCAN_INDEX;
		}

		mutex_exit(&kcage_cageout_mutex);
	}
	/* else, kernel cage thread is already running */
}

/*
 * Periodic hook: broadcast on kcage_throttle_cv when the cage is on and
 * the cageout thread is ready.  The broadcast is issued without taking
 * kcage_throttle_mutex.
 */
void
kcage_tick()
{
	/*
	 * Once per second we wake up all the threads throttled
	 * waiting for cage memory, in case we've become stuck
	 * and haven't made forward progress expanding the cage.
	 */
	if (kcage_on && kcage_cageout_ready)
		cv_broadcast(&kcage_throttle_cv);
}