/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/mman.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/disp.h>
#include <sys/vmem.h>
#include <sys/vmsystm.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/x86_archext.h>
#include <sys/bootconf.h>
#include <sys/dumphdr.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>

kmem_cache_t *htable_cache;
extern cpuset_t khat_cpuset;

/*
 * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
 * is used in order to facilitate testing of the htable_steal() code.
 * By resetting htable_reserve_amount to a lower value, we can force
 * stealing to occur. The reserve amount is a guess to get us through boot.
 */
#define	HTABLE_RESERVE_AMOUNT	(200)
uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
kmutex_t htable_reserve_mutex;
uint_t htable_reserve_cnt;
htable_t *htable_reserve_pool;

/*
 * Used to hand test htable_steal().
 */
#ifdef DEBUG
ulong_t force_steal = 0;
ulong_t ptable_cnt = 0;
#endif

/*
 * This variable exists so that it can be tuned via /etc/system.
 * Any value works, but a power of two <= mmu.ptes_per_table is best.
 */
uint_t htable_steal_passes = 8;

/*
 * mutex stuff for access to htable hash
 */
#define	NUM_HTABLE_MUTEX 128
kmutex_t htable_mutex[NUM_HTABLE_MUTEX];
#define	HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1))

#define	HTABLE_ENTER(h)	mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
#define	HTABLE_EXIT(h)	mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]);

/*
 * forward declarations
 */
static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr);
static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr);
static void htable_free(htable_t *ht);
static x86pte_t *x86pte_access_pagetable(htable_t *ht);
static void x86pte_release_pagetable(htable_t *ht);
static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old,
	x86pte_t new);

/*
 * Address used for kernel page tables. See ptable_alloc() below.
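 * This is a two page boot-time window; x86pte_access_pagetable() also uses
 * it to map pagetables until khat_running is set.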
 */
uintptr_t ptable_va = 0;
size_t ptable_sz = 2 * MMU_PAGESIZE;

/*
 * A counter to track if we are stealing or reaping htables. When non-zero
 * htable_free() will directly free htables (either to the reserve or kmem)
 * instead of putting them in a hat's htable cache.
 */
uint32_t htable_dont_cache = 0;

/*
 * Track the number of active pagetables, so we can know how many to reap
 */
static uint32_t active_ptables = 0;

/*
 * Allocate a memory page for a hardware page table.
 *
 * The pages allocated for page tables are currently gotten in a hacked up
 * way. It works for now, but really needs to be fixed up a bit.
 *
 * During boot: The boot loader controls physical memory allocation via
 * boot_alloc(). To avoid conflict with vmem, we just do boot_alloc()s with
 * addresses less than kernelbase. These addresses are ignored when we take
 * over mappings from the boot loader.
 *
 * Post-boot: we currently use page_create_va() on the kvp with fake offsets,
 * segments and virt address. This is pretty bogus, but was copied from the
 * old hat_i86.c code. A better approach would be to have a custom
 * page_get_physical() interface that can specify either mnode random or
 * mnode local and takes a page from whatever color has the MOST available -
 * this would have a minimal impact on page coloring.
 *
 * For now the htable pointer in ht is only used to compute a unique vnode
 * offset for the page.
 */
static void
ptable_alloc(htable_t *ht)
{
	pfn_t pfn;
	page_t *pp;
	u_offset_t offset;
	static struct seg tmpseg;
	static int first_time = 1;

	/*
	 * Allocating the associated hardware page table is very different
	 * before boot has finished. We get a physical page from boot
	 * w/o eating up any kernel address space.
	 */
	ht->ht_pfn = PFN_INVALID;
	atomic_add_32(&active_ptables, 1);

	if (use_boot_reserve) {
		ASSERT(ptable_va != 0);

		/*
		 * Allocate, then demap the ptable_va, so that we're
		 * sure there exist page table entries for the addresses
		 */
		if (first_time) {
			first_time = 0;
			if ((uintptr_t)BOP_ALLOC(bootops, (caddr_t)ptable_va,
			    ptable_sz, BO_NO_ALIGN) != ptable_va)
				panic("BOP_ALLOC failed");

			hat_boot_demap(ptable_va);
			hat_boot_demap(ptable_va + MMU_PAGESIZE);
		}

		pfn = ((uintptr_t)BOP_EALLOC(bootops, 0, MMU_PAGESIZE,
		    BO_NO_ALIGN, BOPF_X86_ALLOC_PHYS)) >> MMU_PAGESHIFT;
		if (page_resv(1, KM_NOSLEEP) == 0)
			panic("page_resv() failed in ptable alloc");

		pp = page_numtopp_nolock(pfn);
		ASSERT(pp != NULL);
		if (pp->p_szc != 0)
			page_boot_demote(pp);
		pp = page_numtopp(pfn, SE_EXCL);
		ASSERT(pp != NULL);

	} else {
		/*
		 * Post boot get a page for the table.
		 *
		 * The first check is to see if there is memory in
		 * the system. If we drop to throttlefree, then fail
		 * the ptable_alloc() and let the stealing code kick in.
		 * Note that we have to do this test here, since the test in
		 * page_create_throttle() would let the NOSLEEP allocation
		 * go through and deplete the page reserves.
		 *
		 * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
		 */
		if (!NOMEMWAIT() && freemem <= throttlefree + 1)
			return;

#ifdef DEBUG
		/*
		 * This code makes htable_steal() easier to test. By setting
		 * force_steal we force pagetable allocations to fall
		 * into the stealing code. Roughly 1 in every "force_steal"
		 * page table allocations will fail.
		 */
		if (ht->ht_hat != kas.a_hat && force_steal > 1 &&
		    ++ptable_cnt > force_steal) {
			ptable_cnt = 0;
			return;
		}
#endif /* DEBUG */

		/*
		 * This code is temporary, so don't review too critically.
		 * I'm awaiting a new phys page allocator from Kit -- Joe
		 *
		 * We need to assign an offset for the page to call
		 * page_create_va. To avoid conflicts with other pages,
		 * we get creative with the offset.
		 * for 32 bits, we pick an offset > 4Gig
		 * for 64 bits, pick an offset somewhere in the VA hole.
		 */
		offset = (uintptr_t)ht - kernelbase;
		offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
		offset += mmu.hole_start;	/* something in VA hole */
#else
		offset += 1ULL << 40;		/* something > 4 Gig */
#endif

		if (page_resv(1, KM_NOSLEEP) == 0)
			return;

#ifdef DEBUG
		pp = page_exists(&kvp, offset);
		if (pp != NULL)
			panic("ptable already exists %p", pp);
#endif
		pp = page_create_va(&kvp, offset, MMU_PAGESIZE,
		    PG_EXCL | PG_NORELOC, &tmpseg,
		    (void *)((uintptr_t)ht << MMU_PAGESHIFT));
		if (pp == NULL)
			return;
		page_io_unlock(pp);
		page_hashout(pp, NULL);
		pfn = pp->p_pagenum;
	}
	page_downgrade(pp);
	ASSERT(PAGE_SHARED(pp));

	if (pfn == PFN_INVALID)
		panic("ptable_alloc(): Invalid PFN!!");
	ht->ht_pfn = pfn;
	HATSTAT_INC(hs_ptable_allocs);
}

/*
 * Free an htable's associated page table page. See the comments
 * for ptable_alloc().
 */
static void
ptable_free(htable_t *ht)
{
	pfn_t pfn = ht->ht_pfn;
	page_t *pp;

	/*
	 * need to destroy the page used for the pagetable
	 */
	ASSERT(pfn != PFN_INVALID);
	HATSTAT_INC(hs_ptable_frees);
	atomic_add_32(&active_ptables, -1);
	pp = page_numtopp_nolock(pfn);
	if (pp == NULL)
		panic("ptable_free(): no page for pfn!");
	ASSERT(PAGE_SHARED(pp));
	ASSERT(pfn == pp->p_pagenum);

	/*
	 * Get an exclusive lock, might have to wait for a kmem reader.
	 */
	if (!page_tryupgrade(pp)) {
		page_unlock(pp);
		/*
		 * RFE: we could change this to not loop forever
		 * George Cameron had some idea on how to do that.
		 * For now looping works - it's just like sfmmu.
		 */
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
	}
	page_free(pp, 1);
	page_unresv(1);
	ht->ht_pfn = PFN_INVALID;
}

/*
 * Put one htable on the reserve list.
 */
static void
htable_put_reserve(htable_t *ht)
{
	ht->ht_hat = NULL;		/* no longer tied to a hat */
	ASSERT(ht->ht_pfn == PFN_INVALID);
	HATSTAT_INC(hs_htable_rputs);
	mutex_enter(&htable_reserve_mutex);
	ht->ht_next = htable_reserve_pool;
	htable_reserve_pool = ht;
	++htable_reserve_cnt;
	mutex_exit(&htable_reserve_mutex);
}

/*
 * Take one htable from the reserve.
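 * Returns NULL if the reserve pool is empty.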
 */
static htable_t *
htable_get_reserve(void)
{
	htable_t *ht = NULL;

	mutex_enter(&htable_reserve_mutex);
	if (htable_reserve_cnt != 0) {
		ht = htable_reserve_pool;
		ASSERT(ht != NULL);
		ASSERT(ht->ht_pfn == PFN_INVALID);
		htable_reserve_pool = ht->ht_next;
		--htable_reserve_cnt;
		HATSTAT_INC(hs_htable_rgets);
	}
	mutex_exit(&htable_reserve_mutex);
	return (ht);
}

/*
 * Allocate initial htables with page tables and put them on the kernel hat's
 * cache list.
 */
void
htable_initial_reserve(uint_t count)
{
	htable_t *ht;
	hat_t *hat = kas.a_hat;

	count += HTABLE_RESERVE_AMOUNT;
	while (count > 0) {
		ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP);
		ASSERT(ht != NULL);

		ASSERT(use_boot_reserve);
		ht->ht_hat = kas.a_hat;		/* so htable_free() works */
		ht->ht_flags = 0;		/* so x86pte_zero works */
		ptable_alloc(ht);
		if (ht->ht_pfn == PFN_INVALID)
			panic("ptable_alloc() failed");

		x86pte_zero(ht, 0, mmu.ptes_per_table);

		ht->ht_next = hat->hat_ht_cached;
		hat->hat_ht_cached = ht;
		--count;
	}
}

/*
 * Readjust the reserves after a thread finishes using them.
 *
 * The first time this is called post boot, we'll also clear out the
 * extra boot htables that were put in the kernel hat's cache list.
 */
void
htable_adjust_reserve()
{
	static int first_time = 1;
	htable_t *ht;

	ASSERT(curthread != hat_reserves_thread);

	/*
	 * The first time this is called after we can steal, we free up the
	 * kernel's cache htable list. It has lots of extra htable/page
	 * tables that were allocated for boot up.
	 */
	if (first_time) {
		first_time = 0;
		while ((ht = kas.a_hat->hat_ht_cached) != NULL) {
			kas.a_hat->hat_ht_cached = ht->ht_next;
			ASSERT(ht->ht_hat == kas.a_hat);
			ptable_free(ht);
			htable_put_reserve(ht);
		}
		return;
	}

	/*
	 * Free any excess htables in the reserve list
	 */
	while (htable_reserve_cnt > htable_reserve_amount) {
		ht = htable_get_reserve();
		if (ht == NULL)
			return;
		ASSERT(ht->ht_pfn == PFN_INVALID);
		kmem_cache_free(htable_cache, ht);
	}
}


/*
 * This routine steals htables from user processes for htable_alloc() or
 * for htable_reap().
 */
static htable_t *
htable_steal(uint_t cnt)
{
	hat_t *hat = kas.a_hat;		/* list starts with khat */
	htable_t *list = NULL;
	htable_t *ht;
	htable_t *higher;
	uint_t h;
	uint_t h_start;
	static uint_t h_seed = 0;
	uint_t e;
	uintptr_t va;
	x86pte_t pte;
	uint_t stolen = 0;
	uint_t pass;
	uint_t threshold;

	/*
	 * Limit htable_steal_passes to something reasonable
	 */
	if (htable_steal_passes == 0)
		htable_steal_passes = 1;
	if (htable_steal_passes > mmu.ptes_per_table)
		htable_steal_passes = mmu.ptes_per_table;

	/*
	 * Loop through all user hats. The 1st pass takes cached htables that
	 * aren't in use. The later passes steal by removing mappings, too.
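	 * The stolen htables are returned as a list linked through ht_next;
	 * the caller either reuses them (htable_alloc) or frees them
	 * (htable_reap).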
	 */
	atomic_add_32(&htable_dont_cache, 1);
	for (pass = 0; pass <= htable_steal_passes && stolen < cnt; ++pass) {
		threshold = pass * mmu.ptes_per_table / htable_steal_passes;
		hat = kas.a_hat;
		for (;;) {

			/*
			 * Clear the victim flag and move to next hat
			 */
			mutex_enter(&hat_list_lock);
			if (hat != kas.a_hat) {
				hat->hat_flags &= ~HAT_VICTIM;
				cv_broadcast(&hat_list_cv);
			}
			hat = hat->hat_next;

			/*
			 * Skip any hat that is already being stolen from.
			 *
			 * We skip SHARED hats, as these are dummy
			 * hats that host ISM shared page tables.
			 *
			 * We also skip if HAT_FREEING because hat_pte_unmap()
			 * won't zero out the PTE's. That would lead to hitting
			 * stale PTEs either here or under hat_unload() when we
			 * steal and unload the same page table in competing
			 * threads.
			 */
			while (hat != NULL &&
			    (hat->hat_flags &
			    (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
				hat = hat->hat_next;

			if (hat == NULL) {
				mutex_exit(&hat_list_lock);
				break;
			}

			/*
			 * Are we finished?
			 */
			if (stolen == cnt) {
				/*
				 * Try to spread the pain of stealing,
				 * move victim HAT to the end of the HAT list.
				 */
				if (pass >= 1 && cnt == 1 &&
				    kas.a_hat->hat_prev != hat) {

					/* unlink victim hat */
					if (hat->hat_prev)
						hat->hat_prev->hat_next =
						    hat->hat_next;
					else
						kas.a_hat->hat_next =
						    hat->hat_next;
					if (hat->hat_next)
						hat->hat_next->hat_prev =
						    hat->hat_prev;
					else
						kas.a_hat->hat_prev =
						    hat->hat_prev;


					/* relink at end of hat list */
					hat->hat_next = NULL;
					hat->hat_prev = kas.a_hat->hat_prev;
					if (hat->hat_prev)
						hat->hat_prev->hat_next = hat;
					else
						kas.a_hat->hat_next = hat;
					kas.a_hat->hat_prev = hat;

				}

				mutex_exit(&hat_list_lock);
				break;
			}

			/*
			 * Mark the HAT as a stealing victim.
			 */
			hat->hat_flags |= HAT_VICTIM;
			mutex_exit(&hat_list_lock);

			/*
			 * Take any htables from the hat's cached "free" list.
			 */
			hat_enter(hat);
			while ((ht = hat->hat_ht_cached) != NULL &&
			    stolen < cnt) {
				hat->hat_ht_cached = ht->ht_next;
				ht->ht_next = list;
				list = ht;
				++stolen;
			}
			hat_exit(hat);

			/*
			 * Don't steal on first pass.
			 */
			if (pass == 0 || stolen == cnt)
				continue;

			/*
			 * Search the active htables for one to steal.
			 * Start at a different hash bucket every time to
			 * help spread the pain of stealing.
			 */
			h = h_start = h_seed++ % hat->hat_num_hash;
			do {
				higher = NULL;
				HTABLE_ENTER(h);
				for (ht = hat->hat_ht_hash[h]; ht;
				    ht = ht->ht_next) {

					/*
					 * Can we rule out reaping?
					 */
					if (ht->ht_busy != 0 ||
					    (ht->ht_flags & HTABLE_SHARED_PFN) ||
					    ht->ht_level > 0 ||
					    ht->ht_valid_cnt > threshold ||
					    ht->ht_lock_cnt != 0)
						continue;

					/*
					 * Increment busy so the htable can't
					 * disappear. We drop the htable mutex
					 * to avoid deadlocks with
					 * hat_pageunload() and the hment mutex
					 * while we call hat_pte_unmap()
					 */
					++ht->ht_busy;
					HTABLE_EXIT(h);

					/*
					 * Try stealing.
					 * - unload and invalidate all PTEs
					 */
					for (e = 0, va = ht->ht_vaddr;
					    e < ht->ht_num_ptes &&
					    ht->ht_valid_cnt > 0 &&
					    ht->ht_busy == 1 &&
					    ht->ht_lock_cnt == 0;
					    ++e, va += MMU_PAGESIZE) {
						pte = x86pte_get(ht, e);
						if (!PTE_ISVALID(pte))
							continue;
						hat_pte_unmap(ht, e,
						    HAT_UNLOAD, pte, NULL);
					}

					/*
					 * Reacquire htable lock. If we didn't
					 * remove all mappings in the table,
					 * or another thread added a new mapping
					 * behind us, give up on this table.
					 */
					HTABLE_ENTER(h);
					if (ht->ht_busy != 1 ||
					    ht->ht_valid_cnt != 0 ||
					    ht->ht_lock_cnt != 0) {
						--ht->ht_busy;
						continue;
					}

					/*
					 * Steal it and unlink the page table.
					 */
					higher = ht->ht_parent;
					unlink_ptp(higher, ht, ht->ht_vaddr);

					/*
					 * remove from the hash list
					 */
					if (ht->ht_next)
						ht->ht_next->ht_prev =
						    ht->ht_prev;

					if (ht->ht_prev) {
						ht->ht_prev->ht_next =
						    ht->ht_next;
					} else {
						ASSERT(hat->hat_ht_hash[h] ==
						    ht);
						hat->hat_ht_hash[h] =
						    ht->ht_next;
					}

					/*
					 * Break to outer loop to release the
					 * higher (ht_parent) pagetable. This
					 * spreads out the pain caused by
					 * pagefaults.
					 */
					ht->ht_next = list;
					list = ht;
					++stolen;
					break;
				}
				HTABLE_EXIT(h);
				if (higher != NULL)
					htable_release(higher);
				if (++h == hat->hat_num_hash)
					h = 0;
			} while (stolen < cnt && h != h_start);
		}
	}
	atomic_add_32(&htable_dont_cache, -1);
	return (list);
}


/*
 * This is invoked from kmem when the system is low on memory. We try
 * to free hments, htables, and ptables to improve the memory situation.
 */
/*ARGSUSED*/
static void
htable_reap(void *handle)
{
	uint_t reap_cnt;
	htable_t *list;
	htable_t *ht;

	HATSTAT_INC(hs_reap_attempts);
	if (!can_steal_post_boot)
		return;

	/*
	 * Try to reap 5% of the page tables bounded by a maximum of
	 * 5% of physmem and a minimum of 10.
	 */
	reap_cnt = MIN(MAX(physmem / 20, active_ptables / 20), 10);

	/*
	 * Let htable_steal() do the work, we just call htable_free()
	 */
	list = htable_steal(reap_cnt);
	while ((ht = list) != NULL) {
		list = ht->ht_next;
		HATSTAT_INC(hs_reaped);
		htable_free(ht);
	}

	/*
	 * Free up excess reserves
	 */
	htable_adjust_reserve();
	hment_adjust_reserve();
}

/*
 * allocate an htable, stealing one or using the reserve if necessary
 */
static htable_t *
htable_alloc(
	hat_t		*hat,
	uintptr_t	vaddr,
	level_t		level,
	htable_t	*shared)
{
	htable_t *ht = NULL;
	uint_t is_vlp;
	uint_t is_bare = 0;
	uint_t need_to_zero = 1;
	int kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);

	if (level < 0 || level > TOP_LEVEL(hat))
		panic("htable_alloc(): level %d out of range\n", level);

	is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
	if (is_vlp || shared != NULL)
		is_bare = 1;

	/*
	 * First reuse a cached htable from the hat_ht_cached field, this
	 * avoids unnecessary trips through kmem/page allocators. This is also
	 * what happens during use_boot_reserve.
	 */
	if (hat->hat_ht_cached != NULL && !is_bare) {
		hat_enter(hat);
		ht = hat->hat_ht_cached;
		if (ht != NULL) {
			hat->hat_ht_cached = ht->ht_next;
			need_to_zero = 0;
			/* XX64 ASSERT() they're all zero somehow */
			ASSERT(ht->ht_pfn != PFN_INVALID);
		}
		hat_exit(hat);
	}

	if (ht == NULL) {
		ASSERT(!use_boot_reserve);
		/*
		 * When allocating for hat_memload_arena, we use the reserve.
		 * Also use reserves if we are in a panic().
		 */
		if (curthread == hat_reserves_thread || panicstr != NULL) {
			ASSERT(panicstr != NULL || !is_bare);
			ASSERT(panicstr != NULL ||
			    curthread == hat_reserves_thread);
			ht = htable_get_reserve();
		} else {
			/*
			 * Donate successful htable allocations to the reserve.
			 */
			for (;;) {
				ASSERT(curthread != hat_reserves_thread);
				ht = kmem_cache_alloc(htable_cache, kmflags);
				if (ht == NULL)
					break;
				ht->ht_pfn = PFN_INVALID;
				if (curthread == hat_reserves_thread ||
				    panicstr != NULL ||
				    htable_reserve_cnt >= htable_reserve_amount)
					break;
				htable_put_reserve(ht);
			}
		}

		/*
		 * allocate a page for the hardware page table if needed
		 */
		if (ht != NULL && !is_bare) {
			ht->ht_hat = hat;
			ptable_alloc(ht);
			if (ht->ht_pfn == PFN_INVALID) {
				kmem_cache_free(htable_cache, ht);
				ht = NULL;
			}
		}
	}

	/*
	 * If allocations failed, kick off a kmem_reap() and resort to
	 * htable_steal(). We may spin here if the system is very low on
	 * memory. If the kernel itself has consumed all memory and kmem_reap()
	 * can't free up anything, then we'll really get stuck here.
	 * That should only happen in a system where the administrator has
	 * misconfigured VM parameters via /etc/system.
	 */
	while (ht == NULL && can_steal_post_boot) {
		kmem_reap();
		ht = htable_steal(1);
		HATSTAT_INC(hs_steals);

		/*
		 * If we stole for a bare htable, release the pagetable page.
		 */
		if (ht != NULL && is_bare)
			ptable_free(ht);
	}

	/*
	 * All attempts to allocate or steal failed. This should only happen
	 * if we run out of memory during boot, due perhaps to a huge
	 * boot_archive. At this point there's no way to continue.
	 */
	if (ht == NULL)
		panic("htable_alloc(): couldn't steal\n");

	/*
	 * Shared page tables have all entries locked and entries may not
	 * be added or deleted.
	 */
	ht->ht_flags = 0;
	if (shared != NULL) {
		ASSERT(level == 0);
		ASSERT(shared->ht_valid_cnt > 0);
		ht->ht_flags |= HTABLE_SHARED_PFN;
		ht->ht_pfn = shared->ht_pfn;
		ht->ht_lock_cnt = 0;
		ht->ht_valid_cnt = 0;		/* updated in hat_share() */
		ht->ht_shares = shared;
		need_to_zero = 0;
	} else {
		ht->ht_shares = NULL;
		ht->ht_lock_cnt = 0;
		ht->ht_valid_cnt = 0;
	}

	/*
	 * setup flags, etc. for VLP htables
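	 * A VLP htable has no pagetable page of its own; its PTEs live in
	 * the hat_t's hat_vlp_ptes array.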
	 */
	if (is_vlp) {
		ht->ht_flags |= HTABLE_VLP;
		ht->ht_num_ptes = VLP_NUM_PTES;
		ASSERT(ht->ht_pfn == PFN_INVALID);
		need_to_zero = 0;
	} else if (level == mmu.max_level) {
		ht->ht_num_ptes = mmu.top_level_count;
	} else {
		ht->ht_num_ptes = mmu.ptes_per_table;
	}

	/*
	 * fill in the htable
	 */
	ht->ht_hat = hat;
	ht->ht_parent = NULL;
	ht->ht_vaddr = vaddr;
	ht->ht_level = level;
	ht->ht_busy = 1;
	ht->ht_next = NULL;
	ht->ht_prev = NULL;

	/*
	 * Zero out any freshly allocated page table
	 */
	if (need_to_zero)
		x86pte_zero(ht, 0, mmu.ptes_per_table);
	return (ht);
}

/*
 * Free up an htable, either to a hat's cached list, the reserves or
 * back to kmem.
 */
static void
htable_free(htable_t *ht)
{
	hat_t *hat = ht->ht_hat;

	/*
	 * If the process isn't exiting, cache the free htable in the hat
	 * structure. We always do this for the boot reserve. We don't
	 * do this if the hat is exiting or we are stealing/reaping htables.
	 */
	if (hat != NULL &&
	    !(ht->ht_flags & HTABLE_SHARED_PFN) &&
	    (use_boot_reserve ||
	    (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
		ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
		ASSERT(ht->ht_pfn != PFN_INVALID);
		hat_enter(hat);
		ht->ht_next = hat->hat_ht_cached;
		hat->hat_ht_cached = ht;
		hat_exit(hat);
		return;
	}

	/*
	 * If we have a hardware page table, free it.
	 * We don't free page tables that share their page with another htable.
	 */
	if (ht->ht_flags & HTABLE_SHARED_PFN) {
		ASSERT(ht->ht_pfn != PFN_INVALID);
		ht->ht_pfn = PFN_INVALID;
	} else if (!(ht->ht_flags & HTABLE_VLP)) {
		ptable_free(ht);
	}

	/*
	 * If we are the thread using the reserves, put free htables
	 * into reserves.
	 */
	if (curthread == hat_reserves_thread ||
	    htable_reserve_cnt < htable_reserve_amount)
		htable_put_reserve(ht);
	else
		kmem_cache_free(htable_cache, ht);
}


/*
 * This is called when a hat is being destroyed or swapped out. We reap all
 * the remaining htables in the hat cache. If destroying, all leftover
 * htables are also destroyed.
 *
 * We don't need to invalidate any of the PTPs nor do any demapping.
 */
void
htable_purge_hat(hat_t *hat)
{
	htable_t *ht;
	int h;

	/*
	 * Purge the htable cache if just reaping.
	 */
	if (!(hat->hat_flags & HAT_FREEING)) {
		atomic_add_32(&htable_dont_cache, 1);
		for (;;) {
			hat_enter(hat);
			ht = hat->hat_ht_cached;
			if (ht == NULL) {
				hat_exit(hat);
				break;
			}
			hat->hat_ht_cached = ht->ht_next;
			hat_exit(hat);
			htable_free(ht);
		}
		atomic_add_32(&htable_dont_cache, -1);
		return;
	}

	/*
	 * if freeing, no locking is needed
	 */
	while ((ht = hat->hat_ht_cached) != NULL) {
		hat->hat_ht_cached = ht->ht_next;
		htable_free(ht);
	}

	/*
	 * walk thru the htable hash table and free all the htables in it.
	 */
	for (h = 0; h < hat->hat_num_hash; ++h) {
		while ((ht = hat->hat_ht_hash[h]) != NULL) {
			if (ht->ht_next)
				ht->ht_next->ht_prev = ht->ht_prev;

			if (ht->ht_prev) {
				ht->ht_prev->ht_next = ht->ht_next;
			} else {
				ASSERT(hat->hat_ht_hash[h] == ht);
				hat->hat_ht_hash[h] = ht->ht_next;
			}
			htable_free(ht);
		}
	}
}

/*
 * Unlink an entry for a table at vaddr and level out of the existing table
 * one level higher. We are always holding the HASH_ENTER() when doing this.
 */
static void
unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
{
	uint_t entry = htable_va2entry(vaddr, higher);
	x86pte_t expect = MAKEPTP(old->ht_pfn, old->ht_level);
	x86pte_t found;

	ASSERT(higher->ht_busy > 0);
	ASSERT(higher->ht_valid_cnt > 0);
	ASSERT(old->ht_valid_cnt == 0);
	found = x86pte_cas(higher, entry, expect, 0);
	if (found != expect)
		panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
		    found, expect);
	HTABLE_DEC(higher->ht_valid_cnt);
}

/*
 * Link an entry for a new table at vaddr and level into the existing table
 * one level higher. We are always holding the HASH_ENTER() when doing this.
 */
static void
link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
{
	uint_t entry = htable_va2entry(vaddr, higher);
	x86pte_t newptp = MAKEPTP(new->ht_pfn, new->ht_level);
	x86pte_t found;

	ASSERT(higher->ht_busy > 0);

	ASSERT(new->ht_level != mmu.max_level);

	HTABLE_INC(higher->ht_valid_cnt);

	found = x86pte_cas(higher, entry, 0, newptp);
	if ((found & ~PT_REF) != 0)
		panic("HAT: ptp not 0, found=" FMT_PTE, found);
}

/*
 * Release of an htable.
 *
 * During process exit, some empty page tables are not unlinked - hat_free_end()
 * cleans them up. Upper level pagetables (mmu.max_page_level and higher) are
 * only released during hat_free_end() or by htable_steal(). We always
 * release SHARED page tables.
 */
void
htable_release(htable_t *ht)
{
	uint_t hashval;
	htable_t *shared;
	htable_t *higher;
	hat_t *hat;
	uintptr_t va;
	level_t level;

	while (ht != NULL) {
		shared = NULL;
		for (;;) {
			hat = ht->ht_hat;
			va = ht->ht_vaddr;
			level = ht->ht_level;
			hashval = HTABLE_HASH(hat, va, level);

			/*
			 * The common case is that this isn't the last use of
			 * an htable so we don't want to free the htable.
			 */
			HTABLE_ENTER(hashval);
			ASSERT(ht->ht_lock_cnt == 0 || ht->ht_valid_cnt > 0);
			ASSERT(ht->ht_valid_cnt >= 0);
			ASSERT(ht->ht_busy > 0);
			if (ht->ht_valid_cnt > 0)
				break;
			if (ht->ht_busy > 1)
				break;

			/*
			 * we always release empty shared htables
			 */
			if (!(ht->ht_flags & HTABLE_SHARED_PFN)) {

				/*
				 * don't release if in address space tear down
				 */
				if (hat->hat_flags & HAT_FREEING)
					break;

				/*
				 * At and above max_page_level, free if it's for
				 * a boot-time kernel mapping below kernelbase.
				 */
				if (level >= mmu.max_page_level &&
				    (hat != kas.a_hat || va >= kernelbase))
					break;
			}

			/*
			 * remember if we destroy an htable that shares its PFN
			 * from elsewhere
			 */
			if (ht->ht_flags & HTABLE_SHARED_PFN) {
				ASSERT(ht->ht_level == 0);
				ASSERT(shared == NULL);
				shared = ht->ht_shares;
				HATSTAT_INC(hs_htable_unshared);
			}

			/*
			 * Handle release of a table and freeing the htable_t.
			 * Unlink it from the table higher (ie. ht_parent).
			 */
			ASSERT(ht->ht_lock_cnt == 0);
			higher = ht->ht_parent;
			ASSERT(higher != NULL);

			/*
			 * Unlink the pagetable.
			 */
			unlink_ptp(higher, ht, va);

			/*
			 * When any top level VLP page table entry changes, we
			 * must issue a reload of cr3 on all processors.
			 */
			if ((hat->hat_flags & HAT_VLP) &&
			    level == VLP_LEVEL - 1)
				hat_demap(hat, DEMAP_ALL_ADDR);

			/*
			 * remove this htable from its hash list
			 */
			if (ht->ht_next)
				ht->ht_next->ht_prev = ht->ht_prev;

			if (ht->ht_prev) {
				ht->ht_prev->ht_next = ht->ht_next;
			} else {
				ASSERT(hat->hat_ht_hash[hashval] == ht);
				hat->hat_ht_hash[hashval] = ht->ht_next;
			}
			HTABLE_EXIT(hashval);
			htable_free(ht);
			ht = higher;
		}

		ASSERT(ht->ht_busy >= 1);
		--ht->ht_busy;
		HTABLE_EXIT(hashval);

		/*
		 * If we released a shared htable, do a release on the htable
		 * from which it shared
		 */
		ht = shared;
	}
}

/*
 * Find the htable for the pagetable at the given level for the given address.
 * If found, acquires a hold that eventually needs to be htable_release()d
 */
htable_t *
htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
{
	uintptr_t base;
	uint_t hashval;
	htable_t *ht = NULL;

	ASSERT(level >= 0);
	ASSERT(level <= TOP_LEVEL(hat));

	if (level == TOP_LEVEL(hat))
		base = 0;
	else
		base = vaddr & LEVEL_MASK(level + 1);

	hashval = HTABLE_HASH(hat, base, level);
	HTABLE_ENTER(hashval);
	for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
		if (ht->ht_hat == hat &&
		    ht->ht_vaddr == base &&
		    ht->ht_level == level)
			break;
	}
	if (ht)
		++ht->ht_busy;

	HTABLE_EXIT(hashval);
	return (ht);
}

/*
 * Acquires a hold on a known htable (from a locked hment entry).
 */
void
htable_acquire(htable_t *ht)
{
	hat_t *hat = ht->ht_hat;
	level_t level = ht->ht_level;
	uintptr_t base = ht->ht_vaddr;
	uint_t hashval = HTABLE_HASH(hat, base, level);

	HTABLE_ENTER(hashval);
#ifdef DEBUG
	/*
	 * make sure the htable is there
	 */
	{
		htable_t *h;

		for (h = hat->hat_ht_hash[hashval];
		    h && h != ht;
		    h = h->ht_next)
			;
		ASSERT(h == ht);
	}
#endif /* DEBUG */
	++ht->ht_busy;
	HTABLE_EXIT(hashval);
}

/*
 * Find the htable for the pagetable at the given level for the given address.
 * If found, acquires a hold that eventually needs to be htable_release()d
 * If not found the table is created.
 *
 * Since we can't hold a hash table mutex during allocation, we have to
 * drop it and redo the search on a create. Then we may have to free the newly
 * allocated htable if another thread raced in and created it ahead of us.
 */
htable_t *
htable_create(
	hat_t		*hat,
	uintptr_t	vaddr,
	level_t		level,
	htable_t	*shared)
{
	uint_t h;
	level_t l;
	uintptr_t base;
	htable_t *ht;
	htable_t *higher = NULL;
	htable_t *new = NULL;

	if (level < 0 || level > TOP_LEVEL(hat))
		panic("htable_create(): level %d out of range\n", level);

	/*
	 * Create the page tables in top down order.
	 */
	for (l = TOP_LEVEL(hat); l >= level; --l) {
		new = NULL;
		if (l == TOP_LEVEL(hat))
			base = 0;
		else
			base = vaddr & LEVEL_MASK(l + 1);

		h = HTABLE_HASH(hat, base, l);
try_again:
		/*
		 * look up the htable at this level
		 */
		HTABLE_ENTER(h);
		if (l == TOP_LEVEL(hat)) {
			ht = hat->hat_htable;
		} else {
			for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
				ASSERT(ht->ht_hat == hat);
				if (ht->ht_vaddr == base &&
				    ht->ht_level == l)
					break;
			}
		}

		/*
		 * if we found the htable, increment its busy cnt
		 * and if we had allocated a new htable, free it.
		 */
		if (ht != NULL) {
			/*
			 * If we find a pre-existing shared table, it must
			 * share from the same place.
			 */
			if (l == level && shared && ht->ht_shares &&
			    ht->ht_shares != shared) {
				panic("htable shared from wrong place "
				    "found htable=%p shared=%p", ht, shared);
			}
			++ht->ht_busy;
			HTABLE_EXIT(h);
			if (new)
				htable_free(new);
			if (higher != NULL)
				htable_release(higher);
			higher = ht;

		/*
		 * if we didn't find it on the first search
		 * allocate a new one and search again
		 */
		} else if (new == NULL) {
			HTABLE_EXIT(h);
			new = htable_alloc(hat, base, l,
			    l == level ? shared : NULL);
			goto try_again;

		/*
		 * 2nd search and still not there, use "new" table
		 * Link new table into higher, when not at top level.
		 */
		} else {
			ht = new;
			if (higher != NULL) {
				link_ptp(higher, ht, base);
				ht->ht_parent = higher;

				/*
				 * When any top level VLP page table changes,
				 * we must reload cr3 on all processors.
				 */
#ifdef __i386
				if (mmu.pae_hat &&
#else /* !__i386 */
				if ((hat->hat_flags & HAT_VLP) &&
#endif /* __i386 */
				    l == VLP_LEVEL - 1)
					hat_demap(hat, DEMAP_ALL_ADDR);
			}
			ht->ht_next = hat->hat_ht_hash[h];
			ASSERT(ht->ht_prev == NULL);
			if (hat->hat_ht_hash[h])
				hat->hat_ht_hash[h]->ht_prev = ht;
			hat->hat_ht_hash[h] = ht;
			HTABLE_EXIT(h);

			/*
			 * Note we don't do htable_release(higher).
			 * That happens recursively when "new" is removed by
			 * htable_release() or htable_steal().
			 */
			higher = ht;

			/*
			 * If we just created a new shared page table we
			 * increment the shared htable's busy count, so that
			 * it can't be the victim of a steal even if it's empty.
			 */
			if (l == level && shared) {
				(void) htable_lookup(shared->ht_hat,
				    shared->ht_vaddr, shared->ht_level);
				HATSTAT_INC(hs_htable_shared);
			}
		}
	}

	return (ht);
}

/*
 * Walk through a given htable looking for the first valid entry. This
 * routine takes both a starting and ending address. The starting address
 * is required to be within the htable provided by the caller, but there is
 * no such restriction on the ending address.
 *
 * If the routine finds a valid entry in the htable (at or beyond the
 * starting address), the PTE (and its address) will be returned.
 * This PTE may correspond to either a page or a pagetable - it is the
 * caller's responsibility to determine which. If no valid entry is
 * found, 0 (an invalid PTE) and the next unexamined address will be
 * returned.
 *
 * The loop has been carefully coded for optimization.
 */
static x86pte_t
htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr)
{
	uint_t e;
	x86pte_t found_pte = (x86pte_t)0;
	char *pte_ptr;
	char *end_pte_ptr;
	int l = ht->ht_level;
	uintptr_t va = *vap & LEVEL_MASK(l);
	size_t pgsize = LEVEL_SIZE(l);

	ASSERT(va >= ht->ht_vaddr);
	ASSERT(va <= HTABLE_LAST_PAGE(ht));

	/*
	 * Compute the starting index and ending virtual address
	 */
	e = htable_va2entry(va, ht);

	/*
	 * The following page table scan code knows that the valid
	 * bit of a PTE is in the lowest byte AND that x86 is little endian!!
	 */
	pte_ptr = (char *)x86pte_access_pagetable(ht);
	end_pte_ptr = pte_ptr + (ht->ht_num_ptes << mmu.pte_size_shift);
	pte_ptr += e << mmu.pte_size_shift;
	while (!PTE_ISVALID(*pte_ptr)) {
		va += pgsize;
		if (va >= eaddr)
			break;
		pte_ptr += mmu.pte_size;
		ASSERT(pte_ptr <= end_pte_ptr);
		if (pte_ptr == end_pte_ptr)
			break;
	}

	/*
	 * if we found a valid PTE, load the entire PTE
	 */
	if (va < eaddr && pte_ptr != end_pte_ptr) {
		if (mmu.pae_hat) {
			ATOMIC_LOAD64((x86pte_t *)pte_ptr, found_pte);
		} else {
			found_pte = *(x86pte32_t *)pte_ptr;
		}
	}
	x86pte_release_pagetable(ht);

#if defined(__amd64)
	/*
	 * deal with VA hole on amd64
	 */
	if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end)
		va = mmu.hole_end + va - mmu.hole_start;
#endif /* __amd64 */

	*vap = va;
	return (found_pte);
}

/*
 * Find the address and htable for the first populated translation at or
 * above the given virtual address. The caller may also specify an upper
 * limit to the address range to search. Uses level information to quickly
 * skip unpopulated sections of virtual address spaces.
 *
 * If not found returns NULL. When found, returns the htable and virt addr
 * and has a hold on the htable.
 */
x86pte_t
htable_walk(
	struct hat *hat,
	htable_t **htp,
	uintptr_t *vaddr,
	uintptr_t eaddr)
{
	uintptr_t va = *vaddr;
	htable_t *ht;
	htable_t *prev = *htp;
	level_t l;
	level_t max_mapped_level;
	x86pte_t pte;

	ASSERT(eaddr > va);

	/*
	 * If this is a user address, then we know we need not look beyond
	 * kernelbase.
	 */
	ASSERT(hat == kas.a_hat || eaddr <= kernelbase ||
	    eaddr == HTABLE_WALK_TO_END);
	if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END)
		eaddr = kernelbase;

	/*
	 * If we're coming in with a previous page table, search it first
	 * without doing an htable_lookup(), this should be frequent.
	 */
	if (prev) {
		ASSERT(prev->ht_busy > 0);
		ASSERT(prev->ht_vaddr <= va);
		l = prev->ht_level;
		if (va <= HTABLE_LAST_PAGE(prev)) {
			pte = htable_scan(prev, &va, eaddr);

			if (PTE_ISPAGE(pte, l)) {
				*vaddr = va;
				*htp = prev;
				return (pte);
			}
		}

		/*
		 * We found nothing in the htable provided by the caller,
		 * so fall through and do the full search
		 */
		htable_release(prev);
	}

	/*
	 * Find the level of the largest pagesize used by this HAT.
	 */
	max_mapped_level = 0;
	for (l = 1; l <= mmu.max_page_level; ++l)
		if (hat->hat_pages_mapped[l] != 0)
			max_mapped_level = l;

	while (va < eaddr && va >= *vaddr) {
		ASSERT(!IN_VA_HOLE(va));

		/*
		 * Find lowest table with any entry for given address.
		 */
		for (l = 0; l <= TOP_LEVEL(hat); ++l) {
			ht = htable_lookup(hat, va, l);
			if (ht != NULL) {
				pte = htable_scan(ht, &va, eaddr);
				if (PTE_ISPAGE(pte, l)) {
					*vaddr = va;
					*htp = ht;
					return (pte);
				}
				htable_release(ht);
				break;
			}

			/*
			 * The ht is never NULL at the top level since
			 * the top level htable is created in hat_alloc().
			 */
			ASSERT(l < TOP_LEVEL(hat));

			/*
			 * No htable covers the address. If there is no
			 * larger page size that could cover it, we
			 * skip to the start of the next page table.
			 */
			if (l >= max_mapped_level) {
				va = NEXT_ENTRY_VA(va, l + 1);
				break;
			}
		}
	}

	*vaddr = 0;
	*htp = NULL;
	return (0);
}

/*
 * Find the htable and page table entry index of the given virtual address
 * with pagesize at or below given level.
 * If not found returns NULL. When found, returns the htable, sets
 * entry, and has a hold on the htable.
 */
htable_t *
htable_getpte(
	struct hat *hat,
	uintptr_t vaddr,
	uint_t *entry,
	x86pte_t *pte,
	level_t level)
{
	htable_t *ht;
	level_t l;
	uint_t e;

	ASSERT(level <= mmu.max_page_level);

	for (l = 0; l <= level; ++l) {
		ht = htable_lookup(hat, vaddr, l);
		if (ht == NULL)
			continue;
		e = htable_va2entry(vaddr, ht);
		if (entry != NULL)
			*entry = e;
		if (pte != NULL)
			*pte = x86pte_get(ht, e);
		return (ht);
	}
	return (NULL);
}

/*
 * Find the htable and page table entry index of the given virtual address.
 * There must be a valid page mapped at the given address.
 * If not found returns NULL. When found, returns the htable, sets
 * entry, and has a hold on the htable.
 */
htable_t *
htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry)
{
	htable_t *ht;
	uint_t e;
	x86pte_t pte;

	ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level);
	if (ht == NULL)
		return (NULL);

	if (entry)
		*entry = e;

	if (PTE_ISPAGE(pte, ht->ht_level))
		return (ht);
	htable_release(ht);
	return (NULL);
}


void
htable_init()
{
	/*
	 * To save on kernel VA usage, we avoid debug information in 32 bit
	 * kernels.
	 */
#if defined(__amd64)
	int kmem_flags = KMC_NOHASH;
#elif defined(__i386)
	int kmem_flags = KMC_NOHASH | KMC_NODEBUG;
#endif

	/*
	 * initialize kmem caches
	 */
	htable_cache = kmem_cache_create("htable_t",
	    sizeof (htable_t), 0, NULL, NULL,
	    htable_reap, NULL, hat_memload_arena, kmem_flags);
}

/*
 * get the pte index for the virtual address in the given htable's pagetable
 */
uint_t
htable_va2entry(uintptr_t va, htable_t *ht)
{
	level_t l = ht->ht_level;

	ASSERT(va >= ht->ht_vaddr);
	ASSERT(va <= HTABLE_LAST_PAGE(ht));
	return ((va >> LEVEL_SHIFT(l)) & (ht->ht_num_ptes - 1));
}

/*
 * Given an htable and the index of a pte in it, return the virtual address
 * of the page.
 */
uintptr_t
htable_e2va(htable_t *ht, uint_t entry)
{
	level_t l = ht->ht_level;
	uintptr_t va;

	ASSERT(entry < ht->ht_num_ptes);
	va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l));

	/*
	 * Need to skip over any VA hole in top level table
	 */
#if defined(__amd64)
	if (ht->ht_level == mmu.max_level && va >= mmu.hole_start)
		va += ((mmu.hole_end - mmu.hole_start) + 1);
#endif

	return (va);
}

/*
 * The code uses compare and swap instructions to read/write PTE's to
 * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems.
 * Again this can be optimized on 64 bit systems, since aligned load/store
 * will naturally be atomic.
 *
 * The combination of using kpreempt_disable()/_enable() and the hci_mutex
 * is used to ensure that an interrupt won't overwrite a temporary mapping
 * while it's in use. If an interrupt thread tries to access a PTE, it will
 * yield briefly back to the pinned thread which holds the cpu's hci_mutex.
 */

static struct hat_cpu_info init_hci;	/* used for cpu 0 */

/*
 * Initialize a CPU private window for mapping page tables.
 * There will be 3 total pages of addressing needed:
 *
 *	1 for r/w access to pagetables
 *	1 for r access when copying pagetables (hat_alloc)
 *	1 that will map the PTEs for the 1st 2, so we can access them quickly
 *
 * We use vmem_xalloc() to get a correct alignment so that only one
 * hat_mempte_setup() is needed.
 */
void
x86pte_cpu_init(cpu_t *cpu, void *pages)
{
	struct hat_cpu_info *hci;
	caddr_t va;

	/*
	 * We can't use kmem_alloc/vmem_alloc for the 1st CPU, as this is
	 * called before we've activated our own HAT
	 */
	if (pages != NULL) {
		hci = &init_hci;
		va = pages;
	} else {
		hci = kmem_alloc(sizeof (struct hat_cpu_info), KM_SLEEP);
		va = vmem_xalloc(heap_arena, 3 * MMU_PAGESIZE, MMU_PAGESIZE, 0,
		    LEVEL_SIZE(1), NULL, NULL, VM_SLEEP);
	}
	mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * If we are using segkpm, then there is no need for any of the
	 * mempte support. We can access the desired memory through a kpm
	 * mapping rather than setting up a temporary mempte mapping.
	 */
	if (kpm_enable == 0) {
		hci->hci_mapped_pfn = PFN_INVALID;

		hci->hci_kernel_pte =
		    hat_mempte_kern_setup(va, va + (2 * MMU_PAGESIZE));
		hci->hci_pagetable_va = (void *)va;
	}

	cpu->cpu_hat_info = hci;
}

/*
 * Macro to establish temporary mappings for x86pte_XXX routines.
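 * It rewrites the PTE backing the CPU private mapping window to point at
 * the given pfn and then flushes the stale TLB entry for that address.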
 */
#define	X86PTE_REMAP(addr, pte, index, perm, pfn)	{		\
	x86pte_t t;							\
									\
	t = MAKEPTE((pfn), 0) | (perm) | mmu.pt_global | mmu.pt_nx;	\
	if (mmu.pae_hat)						\
		pte[index] = t;						\
	else								\
		((x86pte32_t *)(pte))[index] = t;			\
	mmu_tlbflush_entry((caddr_t)(addr));				\
}

/*
 * Disable preemption and establish a mapping to the pagetable with the
 * given pfn. This is optimized for the case where it's the same
 * pfn as we last referenced from this CPU.
 */
static x86pte_t *
x86pte_access_pagetable(htable_t *ht)
{
	pfn_t pfn;
	struct hat_cpu_info *hci;

	/*
	 * VLP pagetables are contained in the hat_t
	 */
	if (ht->ht_flags & HTABLE_VLP)
		return (ht->ht_hat->hat_vlp_ptes);

	/*
	 * During early boot, use hat_boot_remap() of a page table address.
	 */
	pfn = ht->ht_pfn;
	ASSERT(pfn != PFN_INVALID);
	if (kpm_enable)
		return ((x86pte_t *)hat_kpm_pfn2va(pfn));

	if (!khat_running) {
		(void) hat_boot_remap(ptable_va, pfn);
		return ((x86pte_t *)ptable_va);
	}

	/*
	 * Normally, disable preemption and grab the CPU's hci_mutex
	 */
	kpreempt_disable();
	hci = CPU->cpu_hat_info;
	ASSERT(hci != NULL);
	mutex_enter(&hci->hci_mutex);
	if (hci->hci_mapped_pfn != pfn) {
		/*
		 * The current mapping doesn't already point to this page.
		 * Update the CPU specific pagetable mapping to map the pfn.
		 */
		X86PTE_REMAP(hci->hci_pagetable_va, hci->hci_kernel_pte, 0,
		    PT_WRITABLE, pfn);
		hci->hci_mapped_pfn = pfn;
	}
	return (hci->hci_pagetable_va);
}

/*
 * Release access to a page table.
 */
static void
x86pte_release_pagetable(htable_t *ht)
{
	struct hat_cpu_info *hci;

	if (kpm_enable)
		return;

	/*
	 * nothing to do for VLP htables
	 */
	if (ht->ht_flags & HTABLE_VLP)
		return;

	/*
	 * During boot-up hat_kern_setup(), erase the boot loader remapping.
	 */
	if (!khat_running) {
		hat_boot_demap(ptable_va);
		return;
	}

	/*
	 * Normal Operation: drop the CPU's hci_mutex and restore preemption
	 */
	hci = CPU->cpu_hat_info;
	ASSERT(hci != NULL);
	mutex_exit(&hci->hci_mutex);
	kpreempt_enable();
}

/*
 * Atomic retrieval of a pagetable entry
 */
x86pte_t
x86pte_get(htable_t *ht, uint_t entry)
{
	x86pte_t pte;
	x86pte32_t *pte32p;
	x86pte_t *ptep;

	/*
	 * Be careful that loading PAE entries in 32 bit kernel is atomic.
	 */
	ptep = x86pte_access_pagetable(ht);
	if (mmu.pae_hat) {
		ATOMIC_LOAD64(ptep + entry, pte);
	} else {
		pte32p = (x86pte32_t *)ptep;
		pte = pte32p[entry];
	}
	x86pte_release_pagetable(ht);
	return (pte);
}

/*
 * Atomic unconditional set of a page table entry; it returns the previous
 * value.
 */
x86pte_t
x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr)
{
	x86pte_t old;
	x86pte_t prev, n;
	x86pte_t *ptep;
	x86pte32_t *pte32p;
	x86pte32_t n32, p32;

	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
	if (ptr == NULL) {
		ptep = x86pte_access_pagetable(ht);
		ptep = (void *)((caddr_t)ptep + (entry << mmu.pte_size_shift));
	} else {
		ptep = ptr;
	}

	if (mmu.pae_hat) {
		for (;;) {
			prev = *ptep;
			n = new;
			/*
			 * prevent potential data loss by preserving the MOD
			 * bit if set in the current PTE and the pfns are the
			 * same. For example, segmap can reissue a read-only
			 * hat_memload on top of a dirty page.
			 */
			if (PTE_ISVALID(prev) && PTE2PFN(prev, ht->ht_level) ==
			    PTE2PFN(n, ht->ht_level)) {
				n |= prev & (PT_REF | PT_MOD);
			}
			if (prev == n) {
				old = new;
				break;
			}
			old = cas64(ptep, prev, n);
			if (old == prev)
				break;
		}
	} else {
		pte32p = (x86pte32_t *)ptep;
		for (;;) {
			p32 = *pte32p;
			n32 = new;
			if (PTE_ISVALID(p32) && PTE2PFN(p32, ht->ht_level) ==
			    PTE2PFN(n32, ht->ht_level)) {
				n32 |= p32 & (PT_REF | PT_MOD);
			}
			if (p32 == n32) {
				old = new;
				break;
			}
			old = cas32(pte32p, p32, n32);
			if (old == p32)
				break;
		}
	}
	if (ptr == NULL)
		x86pte_release_pagetable(ht);
	return (old);
}

/*
 * Atomic compare and swap of a page table entry.
 */
static x86pte_t
x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
{
	x86pte_t pte;
	x86pte_t *ptep;
	x86pte32_t pte32, o32, n32;
	x86pte32_t *pte32p;

	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
	ptep = x86pte_access_pagetable(ht);
	if (mmu.pae_hat) {
		pte = cas64(&ptep[entry], old, new);
	} else {
		o32 = old;
		n32 = new;
		pte32p = (x86pte32_t *)ptep;
		pte32 = cas32(&pte32p[entry], o32, n32);
		pte = pte32;
	}
	x86pte_release_pagetable(ht);

	return (pte);
}

/*
 * data structure for cross call information
 */
typedef struct xcall_info {
	x86pte_t	xi_pte;
	x86pte_t	xi_old;
	x86pte_t	*xi_pteptr;
	pfn_t		xi_pfn;
	processorid_t	xi_cpuid;
	level_t		xi_level;
	xc_func_t	xi_func;
} xcall_info_t;

/*
 * Cross call service function to atomically invalidate a PTE and flush TLBs
 */
/*ARGSUSED*/
static int
x86pte_inval_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
{
	xcall_info_t *xi = (xcall_info_t *)a1;
	caddr_t addr = (caddr_t)a2;

	/*
	 * Only the initiating cpu invalidates the page table entry.
	 * It returns the previous PTE value to the caller.
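	 * The other cpus in the cross call set only flush their TLBs below.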
	 */
	if (CPU->cpu_id == xi->xi_cpuid) {
		x86pte_t *ptep = xi->xi_pteptr;
		pfn_t pfn = xi->xi_pfn;
		level_t level = xi->xi_level;
		x86pte_t old;
		x86pte_t prev;
		x86pte32_t *pte32p;
		x86pte32_t p32;

		if (mmu.pae_hat) {
			for (;;) {
				prev = *ptep;
				if (PTE2PFN(prev, level) != pfn)
					break;
				old = cas64(ptep, prev, 0);
				if (old == prev)
					break;
			}
		} else {
			pte32p = (x86pte32_t *)ptep;
			for (;;) {
				p32 = *pte32p;
				if (PTE2PFN(p32, level) != pfn)
					break;
				old = cas32(pte32p, p32, 0);
				if (old == p32)
					break;
			}
			prev = p32;
		}
		xi->xi_pte = prev;
	}

	/*
	 * For a normal address, we just flush one page mapping
	 * Otherwise reload cr3 to effect a complete TLB flush.
	 *
	 * Note we don't reload VLP pte's -- this assumes we never have a
	 * large page size at VLP_LEVEL for VLP processes.
	 */
	if ((uintptr_t)addr != DEMAP_ALL_ADDR) {
		mmu_tlbflush_entry(addr);
	} else {
		reload_cr3();
	}
	return (0);
}

/*
 * Cross call service function to atomically change a PTE and flush TLBs
 */
/*ARGSUSED*/
static int
x86pte_update_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
{
	xcall_info_t *xi = (xcall_info_t *)a1;
	caddr_t addr = (caddr_t)a2;

	/*
	 * Only the initiating cpu changes the page table entry.
	 * It returns the previous PTE value to the caller.
	 */
	if (CPU->cpu_id == xi->xi_cpuid) {
		x86pte_t *ptep = xi->xi_pteptr;
		x86pte_t new = xi->xi_pte;
		x86pte_t old = xi->xi_old;
		x86pte_t prev;

		if (mmu.pae_hat) {
			prev = cas64(ptep, old, new);
		} else {
			x86pte32_t o32 = old;
			x86pte32_t n32 = new;
			x86pte32_t *pte32p = (x86pte32_t *)ptep;
			prev = cas32(pte32p, o32, n32);
		}

		xi->xi_pte = prev;
	}

	/*
	 * Flush the TLB entry
	 */
	if ((uintptr_t)addr != DEMAP_ALL_ADDR)
		mmu_tlbflush_entry(addr);
	else
		reload_cr3();
	return (0);
}

/*
 * Use cross calls to change a page table entry and invalidate TLBs.
 */
void
x86pte_xcall(hat_t *hat, xcall_info_t *xi, uintptr_t addr)
{
	cpuset_t cpus;

	/*
	 * Given the current implementation of hat_share(), doing a
	 * hat_pageunload() on a shared page table requires invalidating
	 * all user TLB entries on all CPUs.
	 */
	if (hat->hat_flags & HAT_SHARED) {
		hat = kas.a_hat;
		addr = DEMAP_ALL_ADDR;
	}

	/*
	 * Use a cross call to do the invalidations.
	 * Note the current CPU always has to be in the cross call CPU set.
	 */
	kpreempt_disable();
	xi->xi_cpuid = CPU->cpu_id;
	CPUSET_ZERO(cpus);
	if (hat == kas.a_hat) {
		CPUSET_OR(cpus, khat_cpuset);
	} else {
		mutex_enter(&hat->hat_switch_mutex);
		CPUSET_OR(cpus, hat->hat_cpus);
		CPUSET_ADD(cpus, CPU->cpu_id);
	}

	/*
	 * Use a cross call to modify the page table entry and invalidate TLBs.
	 * If we're panic'ing, don't bother with the cross call.
	 * Note the panicstr check isn't bullet proof and the panic system
	 * ought to be made tighter.
	 */
	if (panicstr == NULL)
		xc_wait_sync((xc_arg_t)xi, addr, NULL, X_CALL_HIPRI,
		    cpus, xi->xi_func);
	else
		(void) xi->xi_func((xc_arg_t)xi, (xc_arg_t)addr, NULL);
	if (hat != kas.a_hat)
		mutex_exit(&hat->hat_switch_mutex);
	kpreempt_enable();
}

/*
 * Invalidate a page table entry if it currently maps the given pfn.
 * This returns the previous value of the PTE.
 */
x86pte_t
x86pte_invalidate_pfn(htable_t *ht, uint_t entry, pfn_t pfn, void *pte_ptr)
{
	xcall_info_t xi;
	x86pte_t *ptep;
	hat_t *hat;
	uintptr_t addr;

	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
	if (pte_ptr != NULL) {
		ptep = pte_ptr;
	} else {
		ptep = x86pte_access_pagetable(ht);
		ptep = (void *)((caddr_t)ptep + (entry << mmu.pte_size_shift));
	}

	/*
	 * Fill in the structure used by the cross call function to do the
	 * invalidation.
	 */
	xi.xi_pte = 0;
	xi.xi_pteptr = ptep;
	xi.xi_pfn = pfn;
	xi.xi_level = ht->ht_level;
	xi.xi_func = x86pte_inval_func;
	ASSERT(xi.xi_level != VLP_LEVEL);

	hat = ht->ht_hat;
	addr = htable_e2va(ht, entry);

	x86pte_xcall(hat, &xi, addr);

	if (pte_ptr == NULL)
		x86pte_release_pagetable(ht);
	return (xi.xi_pte);
}

/*
 * update a PTE and invalidate any stale TLB entries.
 */
x86pte_t
x86pte_update(htable_t *ht, uint_t entry, x86pte_t expected, x86pte_t new)
{
	xcall_info_t xi;
	x86pte_t *ptep;
	hat_t *hat;
	uintptr_t addr;

	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
	ptep = x86pte_access_pagetable(ht);
	ptep = (void *)((caddr_t)ptep + (entry << mmu.pte_size_shift));

	/*
	 * Fill in the structure used by the cross call function to do the
	 * invalidation.
	 */
	xi.xi_pte = new;
	xi.xi_old = expected;
	xi.xi_pteptr = ptep;
	xi.xi_func = x86pte_update_func;

	hat = ht->ht_hat;
	addr = htable_e2va(ht, entry);

	x86pte_xcall(hat, &xi, addr);

	x86pte_release_pagetable(ht);
	return (xi.xi_pte);
}

/*
 * Copy page tables - this is just a little more complicated than the
 * previous routines. Note that it's also not atomic! It also is never
 * used for VLP pagetables.
 */
void
x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
{
	struct hat_cpu_info *hci;
	caddr_t src_va;
	caddr_t dst_va;
	size_t size;

	ASSERT(khat_running);
	ASSERT(!(dest->ht_flags & HTABLE_VLP));
	ASSERT(!(src->ht_flags & HTABLE_VLP));
	ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));

	/*
	 * Acquire access to the CPU pagetable window for the destination.
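	 * Without kpm, the source pagetable is mapped at the second page of
	 * that window via X86PTE_REMAP below.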
	 */
	dst_va = (caddr_t)x86pte_access_pagetable(dest);
	if (kpm_enable) {
		src_va = (caddr_t)x86pte_access_pagetable(src);
	} else {
		hci = CPU->cpu_hat_info;

		/*
		 * Finish defining the src pagetable mapping
		 */
		src_va = dst_va + MMU_PAGESIZE;
		X86PTE_REMAP(src_va, hci->hci_kernel_pte, 1, 0, src->ht_pfn);
	}

	/*
	 * now do the copy
	 */

	dst_va += entry << mmu.pte_size_shift;
	src_va += entry << mmu.pte_size_shift;
	size = count << mmu.pte_size_shift;
	bcopy(src_va, dst_va, size);

	x86pte_release_pagetable(dest);
}

/*
 * Zero page table entries - Note this doesn't use atomic stores!
 */
void
x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
{
	caddr_t dst_va;
	x86pte_t *p;
	x86pte32_t *p32;
	size_t size;
	extern void hat_pte_zero(void *, size_t);

	/*
	 * Map in the page table to be zeroed.
	 */
	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
	ASSERT(!(dest->ht_flags & HTABLE_VLP));
	dst_va = (caddr_t)x86pte_access_pagetable(dest);
	dst_va += entry << mmu.pte_size_shift;
	size = count << mmu.pte_size_shift;
	if (x86_feature & X86_SSE2) {
		hat_pte_zero(dst_va, size);
	} else if (khat_running) {
		bzero(dst_va, size);
	} else {
		/*
		 * Can't just use bzero during boot because it checks the
		 * address against kernelbase. Instead just use a zero loop.
		 */
		if (mmu.pae_hat) {
			p = (x86pte_t *)dst_va;
			while (count-- > 0)
				*p++ = 0;
		} else {
			p32 = (x86pte32_t *)dst_va;
			while (count-- > 0)
				*p32++ = 0;
		}
	}
	x86pte_release_pagetable(dest);
}

/*
 * Called to ensure that all pagetables are in the system dump
 */
void
hat_dump(void)
{
	hat_t *hat;
	uint_t h;
	htable_t *ht;

	/*
	 * Dump all page tables
	 */
	for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
		for (h = 0; h < hat->hat_num_hash; ++h) {
			for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
				if ((ht->ht_flags & HTABLE_VLP) == 0)
					dump_page(ht->ht_pfn);
			}
		}
	}
}