/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/systm.h>
#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/vm_dep.h>
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>


/*
 * When pages are shared by more than one mapping, a list of these
 * structs hangs off of the page_t, connected by the hm_next and hm_prev
 * fields.  Every hment is also indexed by a system-wide hash table, using
 * hm_hashnext to connect it to the chain of hments in a single hash
 * bucket.
 */
struct hment {
	struct hment	*hm_hashnext;	/* next mapping on hash chain */
	struct hment	*hm_next;	/* next mapping of same page */
	struct hment	*hm_prev;	/* previous mapping of same page */
	htable_t	*hm_htable;	/* corresponding htable_t */
	pfn_t		hm_pfn;		/* mapping page frame number */
	uint16_t	hm_entry;	/* index of pte in htable */
	uint16_t	hm_pad;		/* explicitly expose compiler padding */
#ifdef __amd64
	uint32_t	hm_pad2;	/* explicitly expose compiler padding */
#endif
};

/*
 * Value returned by hment_walk() when dealing with a single mapping
 * embedded in the page_t.
 */
#define	HMENT_EMBEDDED ((hment_t *)(uintptr_t)1)

kmem_cache_t *hment_cache;

/*
 * The hment reserve is similar to the htable reserve, with one
 * exception: hments are never needed for HAT kmem allocs.
 *
 * hment_reserve_amount is a variable (rather than a constant) so that its
 * value can be changed via a kernel debugger, e.g. set to zero to force
 * the stealing code paths to get exercised.
 */
#define	HMENT_RESERVE_AMOUNT	(200)	/* currently a guess at right value. */
uint_t hment_reserve_amount = HMENT_RESERVE_AMOUNT;
kmutex_t hment_reserve_mutex;
uint_t	hment_reserve_count;
hment_t	*hment_reserve_pool;

/*
 * Possible performance RFE: we might need to make this dynamic, perhaps
 * based on the number of pages in the system.
 */
#define	HMENT_HASH_SIZE (64 * 1024)
static uint_t hment_hash_entries = HMENT_HASH_SIZE;
static hment_t **hment_hash;

/*
 * Lots of highly shared pages will have the same value for "entry" (consider
 * the starting address of "xterm" or "sh").  So we distinguish them by also
 * folding the pfn of the page table into both the high and the low bits of
 * the hash.  The shift by 9 corresponds to the range of values for entry
 * (0..511).
 */
#define	HMENT_HASH(pfn, entry)	(uint32_t)	\
	((((pfn) << 9) + entry + pfn) & (hment_hash_entries - 1))
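
/*
 * Illustrative example (the values are made up): with the default
 * hment_hash_entries of 64K the mask is 0xffff, so a page table pfn of
 * 0x1234 and entry 7 hash to
 *
 *	((0x1234 << 9) + 7 + 0x1234) & 0xffff = 0x247a3b & 0xffff = 0x7a3b
 *
 * so the same entry under different page tables is spread across buckets.
 */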

/*
 * "mlist_lock" is a hashed mutex lock for protecting per-page mapping
 * lists and "hash_lock" is a similar lock protecting the hment hash
 * table.  The hashed approach is taken to avoid the spatial overhead of
 * maintaining a separate lock for each page, while still achieving better
 * scalability than a single lock would allow.
 */
#define	MLIST_NUM_LOCK	256		/* must be power of two */
static kmutex_t mlist_lock[MLIST_NUM_LOCK];

/*
 * the shift by 9 is so that all large pages don't use the same hash bucket
 */
#define	MLIST_MUTEX(pp) \
	&mlist_lock[((pp)->p_pagenum + ((pp)->p_pagenum >> 9)) & \
	(MLIST_NUM_LOCK - 1)]

#define	HASH_NUM_LOCK	256		/* must be power of two */
static kmutex_t hash_lock[HASH_NUM_LOCK];

#define	HASH_MUTEX(idx)	&hash_lock[(idx) & (HASH_NUM_LOCK - 1)]

static hment_t *hment_steal(void);

/*
 * put one hment onto the reserves list
 */
static void
hment_put_reserve(hment_t *hm)
{
	HATSTAT_INC(hs_hm_put_reserve);
	mutex_enter(&hment_reserve_mutex);
	hm->hm_next = hment_reserve_pool;
	hment_reserve_pool = hm;
	++hment_reserve_count;
	mutex_exit(&hment_reserve_mutex);
}

/*
 * Take one hment from the reserve.
 */
static hment_t *
hment_get_reserve(void)
{
	hment_t *hm = NULL;

	/*
	 * We rely on a "donation system" to refill the hment reserve
	 * list, which only takes place when we are allocating hments for
	 * user mappings.  It is theoretically possible that an incredibly
	 * long string of kernel hment_alloc()s with no intervening user
	 * hment_alloc()s could exhaust that pool.
	 */
	HATSTAT_INC(hs_hm_get_reserve);
	mutex_enter(&hment_reserve_mutex);
	if (hment_reserve_count != 0) {
		hm = hment_reserve_pool;
		hment_reserve_pool = hm->hm_next;
		--hment_reserve_count;
	}
	mutex_exit(&hment_reserve_mutex);
	return (hm);
}

/*
 * Allocate an hment
 */
static hment_t *
hment_alloc()
{
	int km_flag = can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP;
	hment_t	*hm = NULL;

	/*
	 * If we aren't using the reserves, try using kmem to get an hment.
	 * Donate any successful allocations to reserves if low.
	 *
	 * If we're in panic, resort to using the reserves.
	 */
	HATSTAT_INC(hs_hm_alloc);
	if (!USE_HAT_RESERVES()) {
		for (;;) {
			hm = kmem_cache_alloc(hment_cache, km_flag);
			if (hm == NULL)
				break;	/* KM_NOSLEEP alloc can fail */
			if (USE_HAT_RESERVES() ||
			    hment_reserve_count >= hment_reserve_amount)
				break;
			hment_put_reserve(hm);
		}
	}

	/*
	 * If allocation failed, we need to tap the reserves or steal
	 */
	if (hm == NULL) {
		if (USE_HAT_RESERVES())
			hm = hment_get_reserve();

		/*
		 * If we still haven't gotten an hment, attempt to steal one by
		 * victimizing a mapping in a user htable.
		 */
		if (hm == NULL && can_steal_post_boot)
			hm = hment_steal();

		/*
		 * we're in dire straits, try the reserve
		 */
		if (hm == NULL)
			hm = hment_get_reserve();

		/*
		 * still no hment is a serious problem.
		 */
		if (hm == NULL)
			panic("hment_alloc(): no reserve, couldn't steal");
	}

	hm->hm_entry = 0;
	hm->hm_htable = NULL;
	hm->hm_hashnext = NULL;
	hm->hm_next = NULL;
	hm->hm_prev = NULL;
	hm->hm_pfn = PFN_INVALID;
	return (hm);
}

/*
 * Free an hment, possibly to the reserves list when called from the
 * thread using the reserves.  For example, when freeing an hment during an
 * htable_steal(), we can't recurse into the kmem allocator, so we just
 * push the hment onto the reserve list.
 */
void
hment_free(hment_t *hm)
{
#ifdef DEBUG
	/*
	 * zero out all fields to try and force any race conditions to segfault
	 */
	bzero(hm, sizeof (*hm));
#endif
	HATSTAT_INC(hs_hm_free);
	if (USE_HAT_RESERVES() ||
	    hment_reserve_count < hment_reserve_amount) {
		hment_put_reserve(hm);
	} else {
		kmem_cache_free(hment_cache, hm);
		hment_adjust_reserve();
	}
}

int
x86_hm_held(page_t *pp)
{
	ASSERT(pp != NULL);
	return (MUTEX_HELD(MLIST_MUTEX(pp)));
}

void
x86_hm_enter(page_t *pp)
{
	ASSERT(pp != NULL);
	mutex_enter(MLIST_MUTEX(pp));
}

void
x86_hm_exit(page_t *pp)
{
	ASSERT(pp != NULL);
	mutex_exit(MLIST_MUTEX(pp));
}

/*
 * Internal routine to add a full hment to a page_t mapping list
 */
static void
hment_insert(hment_t *hm, page_t *pp)
{
	uint_t		idx;

	ASSERT(x86_hm_held(pp));
	ASSERT(!pp->p_embed);

	/*
	 * Add the hment to the page's mapping list.
	 */
	++pp->p_share;
	hm->hm_next = pp->p_mapping;
	if (pp->p_mapping != NULL)
		((hment_t *)pp->p_mapping)->hm_prev = hm;
	pp->p_mapping = hm;

	/*
	 * Add the hment to the system-wide hash table.
	 */
	idx = HMENT_HASH(hm->hm_htable->ht_pfn, hm->hm_entry);

	mutex_enter(HASH_MUTEX(idx));
	hm->hm_hashnext = hment_hash[idx];
	hment_hash[idx] = hm;
	mutex_exit(HASH_MUTEX(idx));
}

/*
 * Prepare a mapping list entry for the given page.
 *
 * There are 4 different situations to deal with:
 *
 * - Adding the first mapping to a page_t as an embedded hment
 * - Refaulting on an existing embedded mapping
 * - Upgrading an embedded mapping when adding a 2nd mapping
 * - Adding another mapping to a page_t that already has multiple mappings
 *
 * Note we don't optimize for the refaulting case here.
 *
 * Due to competition with other threads that may be mapping/unmapping the
 * same page and the need to drop all locks while allocating hments, any or
 * all of the 4 situations can occur (and in almost any order) in any given
 * call.  Isn't this fun!
 */
hment_t *
hment_prepare(htable_t *htable, uint_t entry, page_t *pp)
{
	hment_t		*hm = NULL;

	ASSERT(x86_hm_held(pp));

	for (;;) {

		/*
		 * The most common case is establishing the first mapping to a
		 * page, so check that first.  This doesn't need any allocated
		 * hment.
		 */
		if (pp->p_mapping == NULL) {
			ASSERT(!pp->p_embed);
			ASSERT(pp->p_share == 0);
			if (hm == NULL)
				break;

			/*
			 * we had an hment already, so free it and retry
			 */
			goto free_and_continue;
		}

		/*
		 * If there is an embedded mapping, we may need to
		 * convert it to an hment.
		 */
		if (pp->p_embed) {

			/* should point to htable */
			ASSERT(pp->p_mapping != NULL);

			/*
			 * If we are faulting on a pre-existing mapping
			 * there is no need to promote/allocate a new hment.
			 * This happens a lot due to segmap.
			 */
			if (pp->p_mapping == htable && pp->p_mlentry == entry) {
				if (hm == NULL)
					break;
				goto free_and_continue;
			}

			/*
			 * If we have an hment allocated, use it to promote the
			 * existing embedded mapping.
			 */
			if (hm != NULL) {
				hm->hm_htable = pp->p_mapping;
				hm->hm_entry = pp->p_mlentry;
				hm->hm_pfn = pp->p_pagenum;
				pp->p_mapping = NULL;
				pp->p_share = 0;
				pp->p_embed = 0;
				hment_insert(hm, pp);
			}

			/*
			 * We either didn't have an hment allocated or we just
			 * used it for the embedded mapping.  In either case,
			 * allocate another hment and restart.
			 */
			goto allocate_and_continue;
		}

		/*
		 * Last possibility is that we're adding an hment to a list
		 * of hments.
		 */
		if (hm != NULL)
			break;
allocate_and_continue:
		x86_hm_exit(pp);
		hm = hment_alloc();
		x86_hm_enter(pp);
		continue;

free_and_continue:
		/*
		 * we allocated an hment already, free it and retry
		 */
		x86_hm_exit(pp);
		hment_free(hm);
		hm = NULL;
		x86_hm_enter(pp);
	}
	ASSERT(x86_hm_held(pp));
	return (hm);
}
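
/*
 * Illustrative sketch (not a verbatim caller): hment_prepare() above and
 * hment_assign() below are intended to be used as a pair by code that is
 * installing a brand-new mapping, with the page's mapping list lock held
 * across both calls, roughly:
 *
 *	x86_hm_enter(pp);
 *	hm = hment_prepare(ht, entry, pp);
 *	... install the new PTE at (ht, entry) ...
 *	hment_assign(ht, entry, pp, hm);
 *	x86_hm_exit(pp);
 *
 * Note that hment_prepare() may temporarily drop and re-take the mapping
 * list lock while allocating, and that it returns NULL when no separate
 * hment is needed, e.g. for the first (embedded) mapping of a page.
 */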

/*
 * Record a mapping list entry for the htable/entry to the given page.
 *
 * hment_prepare() should have properly set up the situation.
 */
void
hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm)
{
	ASSERT(x86_hm_held(pp));

	/*
	 * The most common case is establishing the first mapping to a
	 * page, so check that first.  This doesn't need any allocated
	 * hment.
	 */
	if (pp->p_mapping == NULL) {
		ASSERT(hm == NULL);
		ASSERT(!pp->p_embed);
		ASSERT(pp->p_share == 0);
		pp->p_embed = 1;
		pp->p_mapping = htable;
		pp->p_mlentry = entry;
		return;
	}

	/*
	 * We should never get here with a pre-existing embedded mapping
	 */
	ASSERT(!pp->p_embed);

	/*
	 * add the new hment to the mapping list
	 */
	ASSERT(hm != NULL);
	hm->hm_htable = htable;
	hm->hm_entry = entry;
	hm->hm_pfn = pp->p_pagenum;
	hment_insert(hm, pp);
}

/*
 * Walk through the mappings for a page.
 *
 * must already have done an x86_hm_enter()
 */
hment_t *
hment_walk(page_t *pp, htable_t **ht, uint_t *entry, hment_t *prev)
{
	hment_t		*hm;

	ASSERT(x86_hm_held(pp));

	if (pp->p_embed) {
		if (prev == NULL) {
			*ht = (htable_t *)pp->p_mapping;
			*entry = pp->p_mlentry;
			hm = HMENT_EMBEDDED;
		} else {
			ASSERT(prev == HMENT_EMBEDDED);
			hm = NULL;
		}
	} else {
		if (prev == NULL) {
			ASSERT(prev != HMENT_EMBEDDED);
			hm = (hment_t *)pp->p_mapping;
		} else {
			hm = prev->hm_next;
		}

		if (hm != NULL) {
			*ht = hm->hm_htable;
			*entry = hm->hm_entry;
		}
	}
	return (hm);
}
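
/*
 * Illustrative sketch (not a verbatim caller): to visit every mapping of a
 * page, one iterates with hment_walk() along these lines:
 *
 *	x86_hm_enter(pp);
 *	for (hm = hment_walk(pp, &ht, &entry, NULL); hm != NULL;
 *	    hm = hment_walk(pp, &ht, &entry, hm)) {
 *		... examine the mapping described by (ht, entry) ...
 *	}
 *	x86_hm_exit(pp);
 *
 * An embedded mapping is reported as the sentinel value HMENT_EMBEDDED
 * rather than as a pointer to a real hment, so callers must not
 * dereference the returned value without checking for HMENT_EMBEDDED.
 */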

/*
 * Remove a mapping to a page from its mapping list.  Must have
 * the corresponding mapping list locked.
 * Finds the mapping list entry for the given htable/entry and
 * unlinks it from the mapping list.
 */
hment_t *
hment_remove(page_t *pp, htable_t *ht, uint_t entry)
{
	hment_t		*prev = NULL;
	hment_t		*hm;
	uint_t		idx;
	pfn_t		pfn;

	ASSERT(x86_hm_held(pp));

	/*
	 * Check if we have only one mapping embedded in the page_t.
	 */
	if (pp->p_embed) {
		ASSERT(ht == (htable_t *)pp->p_mapping);
		ASSERT(entry == pp->p_mlentry);
		ASSERT(pp->p_share == 0);
		pp->p_mapping = NULL;
		pp->p_mlentry = 0;
		pp->p_embed = 0;
		return (NULL);
	}

	/*
	 * Otherwise it must be in the list of hments.
	 * Find the hment in the system-wide hash table and remove it.
	 */
	ASSERT(pp->p_share != 0);
	pfn = pp->p_pagenum;
	idx = HMENT_HASH(ht->ht_pfn, entry);
	mutex_enter(HASH_MUTEX(idx));
	hm = hment_hash[idx];
	while (hm && (hm->hm_htable != ht || hm->hm_entry != entry ||
	    hm->hm_pfn != pfn)) {
		prev = hm;
		hm = hm->hm_hashnext;
	}
	if (hm == NULL) {
		panic("hment_remove() missing in hash table pp=%lx, ht=%lx, "
		    "entry=0x%x hash index=0x%x", (uintptr_t)pp, (uintptr_t)ht,
		    entry, idx);
	}

	if (prev)
		prev->hm_hashnext = hm->hm_hashnext;
	else
		hment_hash[idx] = hm->hm_hashnext;
	mutex_exit(HASH_MUTEX(idx));

	/*
	 * Remove the hment from the page's mapping list
	 */
	if (hm->hm_next)
		hm->hm_next->hm_prev = hm->hm_prev;
	if (hm->hm_prev)
		hm->hm_prev->hm_next = hm->hm_next;
	else
		pp->p_mapping = hm->hm_next;

	--pp->p_share;
	hm->hm_hashnext = NULL;
	hm->hm_next = NULL;
	hm->hm_prev = NULL;

	return (hm);
}

/*
 * Put initial hments in the reserve pool.
 */
void
hment_reserve(uint_t count)
{
	hment_t	*hm;

	count += hment_reserve_amount;

	while (hment_reserve_count < count) {
		hm = kmem_cache_alloc(hment_cache, KM_NOSLEEP);
		if (hm == NULL)
			return;
		hment_put_reserve(hm);
	}
}

/*
 * Readjust the hment reserves after they may have been used.
 */
void
hment_adjust_reserve()
{
	hment_t	*hm;

	/*
	 * Free up any excess reserves
	 */
	while (hment_reserve_count > hment_reserve_amount &&
	    !USE_HAT_RESERVES()) {
		hm = hment_get_reserve();
		if (hm == NULL)
			return;
		kmem_cache_free(hment_cache, hm);
	}
}

/*
 * initialize hment data structures
 */
void
hment_init(void)
{
	int i;
	int flags = KMC_NOHASH | KMC_NODEBUG;

	/*
	 * Initialize the hment kmem cache.  On 32-bit kernels we shut off
	 * debug information to save on precious kernel VA usage.
	 */
	hment_cache = kmem_cache_create("hment_t",
	    sizeof (hment_t), 0, NULL, NULL, NULL,
	    NULL, hat_memload_arena, flags);

	hment_hash = kmem_zalloc(hment_hash_entries * sizeof (hment_t *),
	    KM_SLEEP);

	for (i = 0; i < MLIST_NUM_LOCK; i++)
		mutex_init(&mlist_lock[i], NULL, MUTEX_DEFAULT, NULL);

	for (i = 0; i < HASH_NUM_LOCK; i++)
		mutex_init(&hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
}

/*
 * return the number of mappings to a page
 *
 * Note there is no ASSERT() that the MUTEX is held for this.
 * Hence the return value might be inaccurate if this is called without
 * doing an x86_hm_enter().
 */
uint_t
hment_mapcnt(page_t *pp)
{
	uint_t cnt;
	uint_t szc;
	page_t *larger;
	hment_t	*hm;

	x86_hm_enter(pp);
	if (pp->p_mapping == NULL)
		cnt = 0;
	else if (pp->p_embed)
		cnt = 1;
	else
		cnt = pp->p_share;
	x86_hm_exit(pp);

	/*
	 * walk through all larger mapping sizes counting mappings
	 */
	for (szc = 1; szc <= pp->p_szc; ++szc) {
		larger = PP_GROUPLEADER(pp, szc);
		if (larger == pp)	/* don't double count large mappings */
			continue;

		x86_hm_enter(larger);
		if (larger->p_mapping != NULL) {
			if (larger->p_embed &&
			    ((htable_t *)larger->p_mapping)->ht_level == szc) {
				++cnt;
			} else if (!larger->p_embed) {
				for (hm = larger->p_mapping; hm;
				    hm = hm->hm_next) {
					if (hm->hm_htable->ht_level == szc)
						++cnt;
				}
			}
		}
		x86_hm_exit(larger);
	}
	return (cnt);
}

/*
 * We need to steal an hment.  Walk through all the page_t's until we
 * find one that has multiple mappings.  Unload one of the mappings
 * and reclaim that hment.  Note that we remember the page we stopped at
 * (last_page) and resume from there next time, to spread the pain around.
 */
static page_t *last_page = NULL;

static hment_t *
hment_steal(void)
{
	page_t *last = last_page;
	page_t *pp = last;
	hment_t *hm = NULL;
	hment_t *hm2;
	htable_t *ht;
	uint_t found_one = 0;

	HATSTAT_INC(hs_hm_steals);
	if (pp == NULL)
		last = pp = page_first();

	while (!found_one) {
		HATSTAT_INC(hs_hm_steal_exam);
		pp = page_next(pp);
		if (pp == NULL)
			pp = page_first();

		/*
		 * The loop and function exit here if nothing found to steal.
		 */
		if (pp == last)
			return (NULL);

		/*
		 * Only lock the page_t if it has hments.
		 */
		if (pp->p_mapping == NULL || pp->p_embed)
			continue;

		/*
		 * Search the mapping list for a usable mapping.
		 */
		x86_hm_enter(pp);
		if (!pp->p_embed) {
			for (hm = pp->p_mapping; hm; hm = hm->hm_next) {
				ht = hm->hm_htable;
				if (ht->ht_hat != kas.a_hat &&
				    ht->ht_busy == 0 &&
				    ht->ht_lock_cnt == 0) {
					found_one = 1;
					break;
				}
			}
		}
		if (!found_one)
			x86_hm_exit(pp);
	}

	/*
	 * Steal the mapping we found.  Note that hati_page_unmap() will
	 * do the x86_hm_exit().
	 */
	hm2 = hati_page_unmap(pp, ht, hm->hm_entry);
	ASSERT(hm2 == hm);
	last_page = pp;
	return (hm);
}