/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/systm.h>
#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/vm_dep.h>
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>
#include <sys/avl.h>


/*
 * When pages are shared by more than one mapping, a list of these
 * structs hangs off of the page_t connected by the hm_next and hm_prev
 * fields.  Every hment is also indexed by a system-wide hash table, using
 * hm_hashlink to connect the hments within each hash bucket.
 */
struct hment {
	avl_node_t	hm_hashlink;	/* links for hash table */
	struct hment	*hm_next;	/* next mapping of same page */
	struct hment	*hm_prev;	/* previous mapping of same page */
	htable_t	*hm_htable;	/* corresponding htable_t */
	pfn_t		hm_pfn;		/* mapping page frame number */
	uint16_t	hm_entry;	/* index of pte in htable */
	uint16_t	hm_pad;		/* explicitly expose compiler padding */
	uint32_t	hm_pad2;	/* explicitly expose compiler padding */
};

/*
 * Value returned by hment_walk() when dealing with a single mapping
 * embedded in the page_t.
 */
#define	HMENT_EMBEDDED ((hment_t *)(uintptr_t)1)

kmem_cache_t *hment_cache;

/*
 * The hment reserve is similar to the htable reserve, with the following
 * exception: hments are never needed for HAT kmem allocs.
 *
 * The hment_reserve_amount variable is used, so that you can change its
 * value to zero via a kernel debugger to force stealing to get tested.
 */
#define	HMENT_RESERVE_AMOUNT	(200)	/* currently a guess at the right value */
uint_t hment_reserve_amount = HMENT_RESERVE_AMOUNT;
kmutex_t hment_reserve_mutex;
uint_t	hment_reserve_count;
hment_t	*hment_reserve_pool;

/*
 * All hments are stored in a system wide hash of AVL trees.
 */
#define	HMENT_HASH_SIZE (64 * 1024)
static uint_t hment_hash_entries = HMENT_HASH_SIZE;
static avl_tree_t *hment_table;

/*
 * Lots of highly shared pages will have the same value for "entry" (consider
 * the starting address of "xterm" or "sh").  So we'll distinguish them by
 * adding the pfn of the page table into both the high and low bits of the
 * hash.  The shift by 9 corresponds to the range of values for entry (0..511).
 */
#define	HMENT_HASH(pfn, entry)	(uint32_t)	\
	((((pfn) << 9) + entry + pfn) & (hment_hash_entries - 1))
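
/*
 * Worked example of the hash above (illustrative values only, not taken
 * from the source): for a page table at pfn 0x1234 and entry 25 (0x19),
 * with hment_hash_entries at its default of 64K, the index is
 *
 *	((0x1234 << 9) + 0x19 + 0x1234) & 0xffff
 *	  = (0x246800 + 0x19 + 0x1234) & 0xffff
 *	  = 0x247a4d & 0xffff
 *	  = 0x7a4d
 */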

/*
 * "mlist_lock" is a hashed mutex lock for protecting per-page mapping
 * lists and "hash_lock" is a similar lock protecting the hment hash
 * table.  The hashed approach is taken to avoid the spatial overhead of
 * maintaining a separate lock for each page, while still achieving better
 * scalability than a single lock would allow.
 */
#define	MLIST_NUM_LOCK	2048		/* must be power of two */
static kmutex_t *mlist_lock;

/*
 * the shift by 9 is so that all large pages don't use the same hash bucket
 */
#define	MLIST_MUTEX(pp) \
	&mlist_lock[((pp)->p_pagenum + ((pp)->p_pagenum >> 9)) & \
	(MLIST_NUM_LOCK - 1)]

#define	HASH_NUM_LOCK	2048		/* must be power of two */
static kmutex_t *hash_lock;

#define	HASH_MUTEX(idx) &hash_lock[(idx) & (HASH_NUM_LOCK-1)]

static avl_node_t null_avl_link;	/* always zero */
static hment_t *hment_steal(void);

/*
 * Utility to compare hment_t's for use in AVL tree. The ordering
 * is entirely arbitrary and is just so that the AVL algorithm works.
 */
static int
hment_compare(const void *hm1, const void *hm2)
{
	hment_t *h1 = (hment_t *)hm1;
	hment_t *h2 = (hment_t *)hm2;
	long diff;

	diff = (uintptr_t)h1->hm_htable - (uintptr_t)h2->hm_htable;
	if (diff == 0) {
		diff = h1->hm_entry - h2->hm_entry;
		if (diff == 0)
			diff = h1->hm_pfn - h2->hm_pfn;
	}
	if (diff < 0)
		diff = -1;
	else if (diff > 0)
		diff = 1;
	return (diff);
}

/*
 * put one hment onto the reserves list
 */
static void
hment_put_reserve(hment_t *hm)
{
	HATSTAT_INC(hs_hm_put_reserve);
	mutex_enter(&hment_reserve_mutex);
	hm->hm_next = hment_reserve_pool;
	hment_reserve_pool = hm;
	++hment_reserve_count;
	mutex_exit(&hment_reserve_mutex);
}

/*
 * Take one hment from the reserve.
 */
static hment_t *
hment_get_reserve(void)
{
	hment_t *hm = NULL;

	/*
	 * We rely on a "donation system" to refill the hment reserve
	 * list, which only takes place when we are allocating hments for
	 * user mappings.  It is theoretically possible that an incredibly
	 * long string of kernel hment_alloc()s with no intervening user
	 * hment_alloc()s could exhaust that pool.
	 */
	HATSTAT_INC(hs_hm_get_reserve);
	mutex_enter(&hment_reserve_mutex);
	if (hment_reserve_count != 0) {
		hm = hment_reserve_pool;
		hment_reserve_pool = hm->hm_next;
		--hment_reserve_count;
	}
	mutex_exit(&hment_reserve_mutex);
	return (hm);
}

/*
 * Allocate an hment
 */
static hment_t *
hment_alloc()
{
	int km_flag = can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP;
	hment_t	*hm = NULL;

	/*
	 * If we aren't using the reserves, try using kmem to get an hment.
	 * Donate any successful allocations to reserves if low.
	 *
	 * If we're in panic, resort to using the reserves.
	 */
	HATSTAT_INC(hs_hm_alloc);
	if (!USE_HAT_RESERVES()) {
		for (;;) {
			hm = kmem_cache_alloc(hment_cache, km_flag);
			if (hm == NULL ||
			    USE_HAT_RESERVES() ||
			    hment_reserve_count >= hment_reserve_amount)
				break;
			hment_put_reserve(hm);
		}
	}

	/*
	 * If allocation failed, we need to tap the reserves or steal
	 */
	if (hm == NULL) {
		if (USE_HAT_RESERVES())
			hm = hment_get_reserve();

		/*
		 * If we still haven't gotten an hment, attempt to steal one by
		 * victimizing a mapping in a user htable.
		 */
		if (hm == NULL && can_steal_post_boot)
			hm = hment_steal();

		/*
		 * we're in dire straits, try the reserve
		 */
		if (hm == NULL)
			hm = hment_get_reserve();

		/*
		 * still no hment is a serious problem.
		 */
		if (hm == NULL)
			panic("hment_alloc(): no reserve, couldn't steal");
	}

	hm->hm_entry = 0;
	hm->hm_htable = NULL;
	hm->hm_hashlink = null_avl_link;
	hm->hm_next = NULL;
	hm->hm_prev = NULL;
	hm->hm_pfn = PFN_INVALID;
	return (hm);
}

/*
 * Free an hment, possibly to the reserves list when called from the
 * thread using the reserves. For example, when freeing an hment during an
 * htable_steal(), we can't recurse into the kmem allocator, so we just
 * push the hment onto the reserve list.
 */
void
hment_free(hment_t *hm)
{
#ifdef DEBUG
	/*
	 * zero out all fields to try and force any race conditions to segfault
	 */
	bzero(hm, sizeof (*hm));
#endif
	HATSTAT_INC(hs_hm_free);
	if (USE_HAT_RESERVES() ||
	    hment_reserve_count < hment_reserve_amount) {
		hment_put_reserve(hm);
	} else {
		kmem_cache_free(hment_cache, hm);
		hment_adjust_reserve();
	}
}

/*
 * These must test for mlist_lock not having been allocated yet.
 * We just ignore locking in that case, as it means we're in early
 * single-threaded startup.
 */
int
x86_hm_held(page_t *pp)
{
	ASSERT(pp != NULL);
	if (mlist_lock == NULL)
		return (1);
	return (MUTEX_HELD(MLIST_MUTEX(pp)));
}

void
x86_hm_enter(page_t *pp)
{
	ASSERT(pp != NULL);
	if (mlist_lock != NULL)
		mutex_enter(MLIST_MUTEX(pp));
}

void
x86_hm_exit(page_t *pp)
{
	ASSERT(pp != NULL);
	if (mlist_lock != NULL)
		mutex_exit(MLIST_MUTEX(pp));
}

/*
 * Internal routine to add a full hment to a page_t mapping list
 */
static void
hment_insert(hment_t *hm, page_t *pp)
{
	uint_t		idx;

	ASSERT(x86_hm_held(pp));
	ASSERT(!pp->p_embed);

	/*
	 * Add the hment to the page's mapping list.
	 */
	++pp->p_share;
	hm->hm_next = pp->p_mapping;
	if (pp->p_mapping != NULL)
		((hment_t *)pp->p_mapping)->hm_prev = hm;
	pp->p_mapping = hm;

	/*
	 * Add the hment to the system-wide hash table.
	 */
	idx = HMENT_HASH(hm->hm_htable->ht_pfn, hm->hm_entry);

	mutex_enter(HASH_MUTEX(idx));
	avl_add(&hment_table[idx], hm);
	mutex_exit(HASH_MUTEX(idx));
}
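
/*
 * Typical caller pattern for the two routines below, shown as an
 * illustrative sketch rather than code taken from the HAT layer: the
 * caller prepares any needed hment (hment_prepare() may drop and retake
 * the mapping list lock to allocate), installs the PTE, and only then
 * records the mapping with hment_assign().
 *
 *	x86_hm_enter(pp);
 *	hm = hment_prepare(ht, entry, pp);
 *	... install the PTE in the page table ...
 *	hment_assign(ht, entry, pp, hm);
 *	x86_hm_exit(pp);
 */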

/*
 * Prepare a mapping list entry for the given page.
 *
 * There are 4 different situations to deal with:
 *
 * - Adding the first mapping to a page_t as an embedded hment
 * - Refaulting on an existing embedded mapping
 * - Upgrading an embedded mapping when adding a 2nd mapping
 * - Adding another mapping to a page_t that already has multiple mappings
 *	note we don't optimize for the refaulting case here.
 *
 * Due to competition with other threads that may be mapping/unmapping the
 * same page and the need to drop all locks while allocating hments, any or
 * all of these situations can occur (and in almost any order) in any given
 * call. Isn't this fun!
 */
hment_t *
hment_prepare(htable_t *htable, uint_t entry, page_t *pp)
{
	hment_t		*hm = NULL;

	ASSERT(x86_hm_held(pp));

	for (;;) {

		/*
		 * The most common case is establishing the first mapping to a
		 * page, so check that first. This doesn't need any allocated
		 * hment.
		 */
		if (pp->p_mapping == NULL) {
			ASSERT(!pp->p_embed);
			ASSERT(pp->p_share == 0);
			if (hm == NULL)
				break;

			/*
			 * we had an hment already, so free it and retry
			 */
			goto free_and_continue;
		}

		/*
		 * If there is an embedded mapping, we may need to
		 * convert it to an hment.
		 */
		if (pp->p_embed) {

			/* should point to htable */
			ASSERT(pp->p_mapping != NULL);

			/*
			 * If we are faulting on a pre-existing mapping
			 * there is no need to promote/allocate a new hment.
			 * This happens a lot due to segmap.
			 */
			if (pp->p_mapping == htable && pp->p_mlentry == entry) {
				if (hm == NULL)
					break;
				goto free_and_continue;
			}

			/*
			 * If we have an hment allocated, use it to promote the
			 * existing embedded mapping.
			 */
			if (hm != NULL) {
				hm->hm_htable = pp->p_mapping;
				hm->hm_entry = pp->p_mlentry;
				hm->hm_pfn = pp->p_pagenum;
				pp->p_mapping = NULL;
				pp->p_share = 0;
				pp->p_embed = 0;
				hment_insert(hm, pp);
			}

			/*
			 * We either didn't have an hment allocated or we just
			 * used it for the embedded mapping. In either case,
			 * allocate another hment and restart.
			 */
			goto allocate_and_continue;
		}

		/*
		 * Last possibility is that we're adding an hment to a list
		 * of hments.
		 */
		if (hm != NULL)
			break;
allocate_and_continue:
		x86_hm_exit(pp);
		hm = hment_alloc();
		x86_hm_enter(pp);
		continue;

free_and_continue:
		/*
		 * we allocated an hment already, free it and retry
		 */
		x86_hm_exit(pp);
		hment_free(hm);
		hm = NULL;
		x86_hm_enter(pp);
	}
	ASSERT(x86_hm_held(pp));
	return (hm);
}

/*
 * Record a mapping list entry for the htable/entry to the given page.
 *
 * hment_prepare() should have properly set up the situation.
 */
void
hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm)
{
	ASSERT(x86_hm_held(pp));

	/*
	 * The most common case is establishing the first mapping to a
	 * page, so check that first. This doesn't need any allocated
	 * hment.
	 */
	if (pp->p_mapping == NULL) {
		ASSERT(hm == NULL);
		ASSERT(!pp->p_embed);
		ASSERT(pp->p_share == 0);
		pp->p_embed = 1;
		pp->p_mapping = htable;
		pp->p_mlentry = entry;
		return;
	}

	/*
	 * We should never get here with a pre-existing embedded mapping
	 */
	ASSERT(!pp->p_embed);

	/*
	 * add the new hment to the mapping list
	 */
	ASSERT(hm != NULL);
	hm->hm_htable = htable;
	hm->hm_entry = entry;
	hm->hm_pfn = pp->p_pagenum;
	hment_insert(hm, pp);
}
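
/*
 * Illustrative iteration sketch for hment_walk() below (not code taken from
 * the HAT layer): a caller visits every mapping of a page by feeding the
 * previous return value back in until the walk returns NULL.  The embedded
 * single-mapping case is handled transparently via HMENT_EMBEDDED.
 *
 *	htable_t *ht;
 *	uint_t entry;
 *	hment_t *hm = NULL;
 *
 *	x86_hm_enter(pp);
 *	while ((hm = hment_walk(pp, &ht, &entry, hm)) != NULL) {
 *		... examine the mapping described by ht/entry ...
 *	}
 *	x86_hm_exit(pp);
 */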

/*
 * Walk through the mappings for a page.
 *
 * must already have done an x86_hm_enter()
 */
hment_t *
hment_walk(page_t *pp, htable_t **ht, uint_t *entry, hment_t *prev)
{
	hment_t		*hm;

	ASSERT(x86_hm_held(pp));

	if (pp->p_embed) {
		if (prev == NULL) {
			*ht = (htable_t *)pp->p_mapping;
			*entry = pp->p_mlentry;
			hm = HMENT_EMBEDDED;
		} else {
			ASSERT(prev == HMENT_EMBEDDED);
			hm = NULL;
		}
	} else {
		if (prev == NULL) {
			ASSERT(prev != HMENT_EMBEDDED);
			hm = (hment_t *)pp->p_mapping;
		} else {
			hm = prev->hm_next;
		}

		if (hm != NULL) {
			*ht = hm->hm_htable;
			*entry = hm->hm_entry;
		}
	}
	return (hm);
}

/*
 * Remove a mapping to a page from its mapping list. Must have
 * the corresponding mapping list locked.
 * Finds the hment for the given htable/entry and
 * unlinks it from the mapping list.
 */
hment_t *
hment_remove(page_t *pp, htable_t *ht, uint_t entry)
{
	hment_t		dummy;
	avl_index_t	where;
	hment_t		*hm;
	uint_t		idx;

	ASSERT(x86_hm_held(pp));

	/*
	 * Check if we have only one mapping embedded in the page_t.
	 */
	if (pp->p_embed) {
		ASSERT(ht == (htable_t *)pp->p_mapping);
		ASSERT(entry == pp->p_mlentry);
		ASSERT(pp->p_share == 0);
		pp->p_mapping = NULL;
		pp->p_mlentry = 0;
		pp->p_embed = 0;
		return (NULL);
	}

	/*
	 * Otherwise it must be in the list of hments.
	 * Find the hment in the system-wide hash table and remove it.
	 */
	ASSERT(pp->p_share != 0);
	dummy.hm_htable = ht;
	dummy.hm_entry = entry;
	dummy.hm_pfn = pp->p_pagenum;
	idx = HMENT_HASH(ht->ht_pfn, entry);
	mutex_enter(HASH_MUTEX(idx));
	hm = avl_find(&hment_table[idx], &dummy, &where);
	if (hm == NULL)
		panic("hment_remove() missing in hash table pp=%lx, ht=%lx,"
		    " entry=0x%x hash index=0x%x", (uintptr_t)pp, (uintptr_t)ht,
		    entry, idx);
	avl_remove(&hment_table[idx], hm);
	mutex_exit(HASH_MUTEX(idx));

	/*
	 * Remove the hment from the page's mapping list
	 */
	if (hm->hm_next)
		hm->hm_next->hm_prev = hm->hm_prev;
	if (hm->hm_prev)
		hm->hm_prev->hm_next = hm->hm_next;
	else
		pp->p_mapping = hm->hm_next;

	--pp->p_share;
	hm->hm_hashlink = null_avl_link;
	hm->hm_next = NULL;
	hm->hm_prev = NULL;

	return (hm);
}

/*
 * Put initial hments in the reserve pool.
 */
void
hment_reserve(uint_t count)
{
	hment_t	*hm;

	count += hment_reserve_amount;

	while (hment_reserve_count < count) {
		hm = kmem_cache_alloc(hment_cache, KM_NOSLEEP);
		if (hm == NULL)
			return;
		hment_put_reserve(hm);
	}
}

/*
 * Readjust the hment reserves after they may have been used.
 */
void
hment_adjust_reserve()
{
	hment_t	*hm;

	/*
	 * Free up any excess reserves
	 */
	while (hment_reserve_count > hment_reserve_amount &&
	    !USE_HAT_RESERVES()) {
		hm = hment_get_reserve();
		if (hm == NULL)
			return;
		kmem_cache_free(hment_cache, hm);
	}
}

/*
 * initialize hment data structures
 */
void
hment_init(void)
{
	int i;
	int flags = KMC_NOHASH | KMC_NODEBUG;

	/*
	 * Initialize kmem caches. On 32-bit kernels we shut off
	 * debug information to save on precious kernel VA usage.
	 */
	hment_cache = kmem_cache_create("hment_t",
	    sizeof (hment_t), 0, NULL, NULL, NULL,
	    NULL, hat_memload_arena, flags);

	hment_table = kmem_zalloc(hment_hash_entries * sizeof (*hment_table),
	    KM_SLEEP);

	mlist_lock = kmem_zalloc(MLIST_NUM_LOCK * sizeof (kmutex_t), KM_SLEEP);

	hash_lock = kmem_zalloc(HASH_NUM_LOCK * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0; i < hment_hash_entries; ++i)
		avl_create(&hment_table[i], hment_compare, sizeof (hment_t),
		    offsetof(hment_t, hm_hashlink));

	for (i = 0; i < MLIST_NUM_LOCK; i++)
		mutex_init(&mlist_lock[i], NULL, MUTEX_DEFAULT, NULL);

	for (i = 0; i < HASH_NUM_LOCK; i++)
		mutex_init(&hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
}

/*
 * return the number of mappings to a page
 *
 * Note there is no ASSERT() that the MUTEX is held for this.
 * Hence the return value might be inaccurate if this is called without
 * doing an x86_hm_enter().
 */
uint_t
hment_mapcnt(page_t *pp)
{
	uint_t cnt;
	uint_t szc;
	page_t *larger;
	hment_t	*hm;

	x86_hm_enter(pp);
	if (pp->p_mapping == NULL)
		cnt = 0;
	else if (pp->p_embed)
		cnt = 1;
	else
		cnt = pp->p_share;
	x86_hm_exit(pp);

	/*
	 * walk through all larger mapping sizes counting mappings
	 */
	for (szc = 1; szc <= pp->p_szc; ++szc) {
		larger = PP_GROUPLEADER(pp, szc);
		if (larger == pp)	/* don't double count large mappings */
			continue;

		x86_hm_enter(larger);
		if (larger->p_mapping != NULL) {
			if (larger->p_embed &&
			    ((htable_t *)larger->p_mapping)->ht_level == szc) {
				++cnt;
			} else if (!larger->p_embed) {
				for (hm = larger->p_mapping; hm;
				    hm = hm->hm_next) {
					if (hm->hm_htable->ht_level == szc)
						++cnt;
				}
			}
		}
		x86_hm_exit(larger);
	}
	return (cnt);
}

/*
 * We need to steal an hment. Walk through all the page_t's until we
 * find one that has multiple mappings. Unload one of the mappings
 * and reclaim that hment. Note that we'll save/restart the starting
 * page to try and spread the pain.
 */
static page_t *last_page = NULL;

static hment_t *
hment_steal(void)
{
	page_t *last = last_page;
	page_t *pp = last;
	hment_t *hm = NULL;
	hment_t *hm2;
	htable_t *ht;
	uint_t found_one = 0;

	HATSTAT_INC(hs_hm_steals);
	if (pp == NULL)
		last = pp = page_first();

	while (!found_one) {
		HATSTAT_INC(hs_hm_steal_exam);
		pp = page_next(pp);
		if (pp == NULL)
			pp = page_first();

		/*
		 * The loop and function exit here if nothing found to steal.
		 */
		if (pp == last)
			return (NULL);

		/*
		 * Only lock the page_t if it has hments.
		 */
		if (pp->p_mapping == NULL || pp->p_embed)
			continue;

		/*
		 * Search the mapping list for a usable mapping.
		 */
		x86_hm_enter(pp);
		if (!pp->p_embed) {
			for (hm = pp->p_mapping; hm; hm = hm->hm_next) {
				ht = hm->hm_htable;
				if (ht->ht_hat != kas.a_hat &&
				    ht->ht_busy == 0 &&
				    ht->ht_lock_cnt == 0) {
					found_one = 1;
					break;
				}
			}
		}
		if (!found_one)
			x86_hm_exit(pp);
	}

	/*
	 * Steal the mapping we found. Note that hati_page_unmap() will
	 * do the x86_hm_exit().
	 */
	hm2 = hati_page_unmap(pp, ht, hm->hm_entry);
	ASSERT(hm2 == hm);
	last_page = pp;
	return (hm);
}