1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <vm/hat.h> 30 #include <vm/hat_sfmmu.h> 31 #include <vm/page.h> 32 #include <sys/pte.h> 33 #include <sys/systm.h> 34 #include <sys/mman.h> 35 #include <sys/sysmacros.h> 36 #include <sys/machparam.h> 37 #include <sys/vtrace.h> 38 #include <sys/kmem.h> 39 #include <sys/mmu.h> 40 #include <sys/cmn_err.h> 41 #include <sys/cpu.h> 42 #include <sys/cpuvar.h> 43 #include <sys/debug.h> 44 #include <sys/lgrp.h> 45 #include <sys/archsystm.h> 46 #include <sys/machsystm.h> 47 #include <sys/vmsystm.h> 48 #include <sys/bitmap.h> 49 #include <vm/as.h> 50 #include <vm/seg.h> 51 #include <vm/seg_kmem.h> 52 #include <vm/seg_kp.h> 53 #include <vm/seg_kpm.h> 54 #include <vm/rm.h> 55 #include <vm/vm_dep.h> 56 #include <sys/t_lock.h> 57 #include <sys/vm_machparam.h> 58 #include <sys/promif.h> 59 #include <sys/prom_isa.h> 60 #include <sys/prom_plat.h> 61 #include <sys/prom_debug.h> 62 #include <sys/privregs.h> 63 #include <sys/bootconf.h> 64 #include <sys/memlist.h> 65 #include <sys/memlist_plat.h> 66 #include <sys/cpu_module.h> 67 #include <sys/reboot.h> 68 #include <sys/kdi.h> 69 70 /* 71 * Static routines 72 */ 73 static void sfmmu_map_prom_mappings(struct translation *, size_t); 74 static struct translation *read_prom_mappings(size_t *); 75 static void sfmmu_reloc_trap_handler(void *, void *, size_t); 76 77 /* 78 * External routines 79 */ 80 extern void sfmmu_remap_kernel(void); 81 extern void sfmmu_patch_utsb(void); 82 83 /* 84 * Global Data: 85 */ 86 extern caddr_t textva, datava; 87 extern tte_t ktext_tte, kdata_tte; /* ttes for kernel text and data */ 88 extern int enable_bigktsb; 89 90 uint64_t memsegspa = (uintptr_t)MSEG_NULLPTR_PA; /* memsegs physical linkage */ 91 uint64_t memseg_phash[N_MEM_SLOTS]; /* use physical memseg addresses */ 92 93 int sfmmu_kern_mapped = 0; 94 95 /* 96 * DMMU primary context register for the kernel context. Machine specific code 97 * inserts correct page size codes when necessary 98 */ 99 uint64_t kcontextreg = KCONTEXT; 100 101 /* Extern Global Data */ 102 103 extern int page_relocate_ready; 104 105 /* 106 * Controls the logic which enables the use of the 107 * QUAD_LDD_PHYS ASI for TSB accesses. 108 */ 109 extern int ktsb_phys; 110 111 /* 112 * Global Routines called from within: 113 * usr/src/uts/sun4u 114 * usr/src/uts/sfmmu 115 * usr/src/uts/sun 116 */ 117 118 pfn_t 119 va_to_pfn(void *vaddr) 120 { 121 u_longlong_t physaddr; 122 int mode, valid; 123 124 if (tba_taken_over) 125 return (hat_getpfnum(kas.a_hat, (caddr_t)vaddr)); 126 127 if ((prom_translate_virt(vaddr, &valid, &physaddr, &mode) != -1) && 128 (valid == -1)) { 129 return ((pfn_t)(physaddr >> MMU_PAGESHIFT)); 130 } 131 return (PFN_INVALID); 132 } 133 134 uint64_t 135 va_to_pa(void *vaddr) 136 { 137 pfn_t pfn; 138 139 if ((pfn = va_to_pfn(vaddr)) == PFN_INVALID) 140 return ((uint64_t)-1); 141 return (((uint64_t)pfn << MMU_PAGESHIFT) | 142 ((uint64_t)vaddr & MMU_PAGEOFFSET)); 143 } 144 145 void 146 hat_kern_setup(void) 147 { 148 struct translation *trans_root; 149 size_t ntrans_root; 150 extern void startup_fixup_physavail(void); 151 152 /* 153 * These are the steps we take to take over the mmu from the prom. 154 * 155 * (1) Read the prom's mappings through the translation property. 156 * (2) Remap the kernel text and kernel data with 2 locked 4MB ttes. 157 * Create the the hmeblks for these 2 ttes at this time. 158 * (3) Create hat structures for all other prom mappings. Since the 159 * kernel text and data hme_blks have already been created we 160 * skip the equivalent prom's mappings. 161 * (4) Initialize the tsb and its corresponding hardware regs. 162 * (5) Take over the trap table (currently in startup). 163 * (6) Up to this point it is possible the prom required some of its 164 * locked tte's. Now that we own the trap table we remove them. 165 */ 166 167 ktsb_pbase = va_to_pa(ktsb_base); 168 ktsb4m_pbase = va_to_pa(ktsb4m_base); 169 PRM_DEBUG(ktsb_pbase); 170 PRM_DEBUG(ktsb4m_pbase); 171 172 sfmmu_setup_4lp(); 173 sfmmu_patch_ktsb(); 174 sfmmu_patch_utsb(); 175 sfmmu_patch_mmu_asi(ktsb_phys); 176 177 sfmmu_init_tsbs(); 178 179 if (kpm_enable) { 180 sfmmu_kpm_patch_tlbm(); 181 if (kpm_smallpages == 0) { 182 sfmmu_kpm_patch_tsbm(); 183 } 184 } 185 186 /* 187 * The 8K-indexed kernel TSB space is used to hold 188 * translations below... 189 */ 190 trans_root = read_prom_mappings(&ntrans_root); 191 sfmmu_remap_kernel(); 192 startup_fixup_physavail(); 193 mmu_init_kernel_pgsz(kas.a_hat); 194 sfmmu_map_prom_mappings(trans_root, ntrans_root); 195 196 /* 197 * We invalidate 8K kernel TSB because we used it in 198 * sfmmu_map_prom_mappings() 199 */ 200 sfmmu_inv_tsb(ktsb_base, ktsb_sz); 201 sfmmu_inv_tsb(ktsb4m_base, ktsb4m_sz); 202 203 sfmmu_init_ktsbinfo(); 204 205 206 sfmmu_kern_mapped = 1; 207 208 /* 209 * hments have been created for mapped pages, and thus we're ready 210 * for kmdb to start using its own trap table. It walks the hments 211 * to resolve TLB misses, and can't be used until they're ready. 212 */ 213 if (boothowto & RB_DEBUG) 214 kdi_dvec_vmready(); 215 } 216 217 /* 218 * Macro used below to convert the prom's 32-bit high and low fields into 219 * a value appropriate for the 64-bit kernel. 220 */ 221 222 #define COMBINE(hi, lo) (((uint64_t)(uint32_t)(hi) << 32) | (uint32_t)(lo)) 223 224 /* 225 * This function traverses the prom mapping list and creates equivalent 226 * mappings in the sfmmu mapping hash. 227 */ 228 static void 229 sfmmu_map_prom_mappings(struct translation *trans_root, size_t ntrans_root) 230 { 231 struct translation *promt; 232 tte_t tte, oldtte, *ttep; 233 pfn_t pfn, oldpfn, basepfn; 234 caddr_t vaddr; 235 size_t size, offset; 236 unsigned long i; 237 uint_t attr; 238 page_t *pp; 239 extern struct memlist *virt_avail; 240 241 ttep = &tte; 242 for (i = 0, promt = trans_root; i < ntrans_root; i++, promt++) { 243 ASSERT(promt->tte_hi != 0); 244 ASSERT32(promt->virt_hi == 0 && promt->size_hi == 0); 245 246 /* 247 * hack until we get rid of map-for-unix 248 */ 249 if (COMBINE(promt->virt_hi, promt->virt_lo) < KERNELBASE) 250 continue; 251 252 ttep->tte_inthi = promt->tte_hi; 253 ttep->tte_intlo = promt->tte_lo; 254 attr = PROC_DATA | HAT_NOSYNC; 255 #if defined(TTE_IS_GLOBAL) 256 if (TTE_IS_GLOBAL(ttep)) { 257 /* 258 * The prom better not use global translations 259 * because a user process might use the same 260 * virtual addresses 261 */ 262 cmn_err(CE_PANIC, "map_prom: global translation"); 263 TTE_SET_LOFLAGS(ttep, TTE_GLB_INT, 0); 264 } 265 #endif 266 if (TTE_IS_LOCKED(ttep)) { 267 /* clear the lock bits */ 268 TTE_CLR_LOCKED(ttep); 269 } 270 attr |= (TTE_IS_VCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEVTTE; 271 attr |= (TTE_IS_PCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEPTTE; 272 attr |= (TTE_IS_SIDEFFECT(ttep)) ? SFMMU_SIDEFFECT : 0; 273 attr |= (TTE_IS_IE(ttep)) ? HAT_STRUCTURE_LE : 0; 274 275 size = COMBINE(promt->size_hi, promt->size_lo); 276 offset = 0; 277 basepfn = TTE_TO_PFN((caddr_t)COMBINE(promt->virt_hi, 278 promt->virt_lo), ttep); 279 while (size) { 280 vaddr = (caddr_t)(COMBINE(promt->virt_hi, 281 promt->virt_lo) + offset); 282 283 /* 284 * make sure address is not in virt-avail list 285 */ 286 if (address_in_memlist(virt_avail, (uint64_t)vaddr, 287 size)) { 288 cmn_err(CE_PANIC, "map_prom: inconsistent " 289 "translation/avail lists"); 290 } 291 292 pfn = basepfn + mmu_btop(offset); 293 if (pf_is_memory(pfn)) { 294 if (attr & SFMMU_UNCACHEPTTE) { 295 cmn_err(CE_PANIC, "map_prom: " 296 "uncached prom memory page"); 297 } 298 } else { 299 if (!(attr & SFMMU_SIDEFFECT)) { 300 cmn_err(CE_PANIC, "map_prom: prom " 301 "i/o page without side-effect"); 302 } 303 } 304 oldpfn = sfmmu_vatopfn(vaddr, KHATID, &oldtte); 305 ASSERT(oldpfn != PFN_SUSPENDED); 306 ASSERT(page_relocate_ready == 0); 307 308 if (oldpfn != PFN_INVALID) { 309 /* 310 * mapping already exists. 311 * Verify they are equal 312 */ 313 if (pfn != oldpfn) { 314 cmn_err(CE_PANIC, "map_prom: mapping " 315 "conflict (va=0x%p pfn=%p, " 316 "oldpfn=%p)", 317 (void *)vaddr, (void *)pfn, 318 (void *)oldpfn); 319 } 320 size -= MMU_PAGESIZE; 321 offset += MMU_PAGESIZE; 322 continue; 323 } 324 325 pp = page_numtopp_nolock(pfn); 326 if ((pp != NULL) && PP_ISFREE((page_t *)pp)) { 327 cmn_err(CE_PANIC, "map_prom: " 328 "prom-mapped page (va 0x%p, pfn 0x%p) " 329 "on free list", (void *)vaddr, (void *)pfn); 330 } 331 332 sfmmu_memtte(ttep, pfn, attr, TTE8K); 333 sfmmu_tteload(kas.a_hat, ttep, vaddr, pp, 334 HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD); 335 size -= MMU_PAGESIZE; 336 offset += MMU_PAGESIZE; 337 } 338 } 339 } 340 341 #undef COMBINE /* local to previous routine */ 342 343 /* 344 * This routine reads in the "translations" property in to a buffer and 345 * returns a pointer to this buffer and the number of translations. 346 */ 347 static struct translation * 348 read_prom_mappings(size_t *ntransrootp) 349 { 350 char *prop = "translations"; 351 size_t translen; 352 pnode_t node; 353 struct translation *transroot; 354 355 /* 356 * the "translations" property is associated with the mmu node 357 */ 358 node = (pnode_t)prom_getphandle(prom_mmu_ihandle()); 359 360 /* 361 * We use the TSB space to read in the prom mappings. This space 362 * is currently not being used because we haven't taken over the 363 * trap table yet. It should be big enough to hold the mappings. 364 */ 365 if ((translen = prom_getproplen(node, prop)) == -1) 366 cmn_err(CE_PANIC, "no translations property"); 367 *ntransrootp = translen / sizeof (*transroot); 368 translen = roundup(translen, MMU_PAGESIZE); 369 PRM_DEBUG(translen); 370 if (translen > TSB_BYTES(ktsb_szcode)) 371 cmn_err(CE_PANIC, "not enough space for translations"); 372 373 transroot = (struct translation *)ktsb_base; 374 ASSERT(transroot); 375 if (prom_getprop(node, prop, (caddr_t)transroot) == -1) { 376 cmn_err(CE_PANIC, "translations getprop failed"); 377 } 378 return (transroot); 379 } 380 381 /* 382 * Init routine of the nucleus data memory allocator. 383 * 384 * The nucleus data memory allocator is organized in ecache_alignsize'd 385 * memory chunks. Memory allocated by ndata_alloc() will never be freed. 386 * 387 * The ndata argument is used as header of the ndata freelist. 388 * Other freelist nodes are placed in the nucleus memory itself 389 * at the beginning of a free memory chunk. Therefore a freelist 390 * node (struct memlist) must fit into the smallest allocatable 391 * memory chunk (ecache_alignsize bytes). 392 * 393 * The memory interval [base, end] passed to ndata_alloc_init() must be 394 * bzero'd to allow the allocator to return bzero'd memory easily. 395 */ 396 void 397 ndata_alloc_init(struct memlist *ndata, uintptr_t base, uintptr_t end) 398 { 399 ASSERT(sizeof (struct memlist) <= ecache_alignsize); 400 401 base = roundup(base, ecache_alignsize); 402 end = end - end % ecache_alignsize; 403 404 ASSERT(base < end); 405 406 ndata->address = base; 407 ndata->size = end - base; 408 ndata->next = NULL; 409 ndata->prev = NULL; 410 } 411 412 /* 413 * Deliver the size of the largest free memory chunk. 414 */ 415 size_t 416 ndata_maxsize(struct memlist *ndata) 417 { 418 size_t chunksize = ndata->size; 419 420 while ((ndata = ndata->next) != NULL) { 421 if (chunksize < ndata->size) 422 chunksize = ndata->size; 423 } 424 425 return (chunksize); 426 } 427 428 /* 429 * This is a special function to figure out if the memory chunk needed 430 * for the page structs can fit in the nucleus or not. If it fits the 431 * function calculates and returns the possible remaining ndata size 432 * in the last element if the size needed for page structs would be 433 * allocated from the nucleus. 434 */ 435 size_t 436 ndata_spare(struct memlist *ndata, size_t wanted, size_t alignment) 437 { 438 struct memlist *frlist; 439 uintptr_t base; 440 uintptr_t end; 441 442 for (frlist = ndata; frlist != NULL; frlist = frlist->next) { 443 base = roundup(frlist->address, alignment); 444 end = roundup(base + wanted, ecache_alignsize); 445 446 if (end <= frlist->address + frlist->size) { 447 if (frlist->next == NULL) 448 return (frlist->address + frlist->size - end); 449 450 while (frlist->next != NULL) 451 frlist = frlist->next; 452 453 return (frlist->size); 454 } 455 } 456 457 return (0); 458 } 459 460 /* 461 * Allocate the last properly aligned memory chunk. 462 * This function is called when no more large nucleus memory chunks 463 * will be allocated. The remaining free nucleus memory at the end 464 * of the nucleus can be added to the phys_avail list. 465 */ 466 void * 467 ndata_extra_base(struct memlist *ndata, size_t alignment) 468 { 469 uintptr_t base; 470 size_t wasteage = 0; 471 #ifdef DEBUG 472 static int called = 0; 473 474 if (called++ > 0) 475 cmn_err(CE_PANIC, "ndata_extra_base() called more than once"); 476 #endif /* DEBUG */ 477 478 /* 479 * The alignment needs to be a multiple of ecache_alignsize. 480 */ 481 ASSERT((alignment % ecache_alignsize) == 0); 482 483 while (ndata->next != NULL) { 484 wasteage += ndata->size; 485 ndata = ndata->next; 486 } 487 488 base = roundup(ndata->address, alignment); 489 490 if (base >= ndata->address + ndata->size) 491 return (NULL); 492 493 if (base == ndata->address) { 494 if (ndata->prev != NULL) 495 ndata->prev->next = NULL; 496 else 497 ndata->size = 0; 498 499 bzero((void *)base, sizeof (struct memlist)); 500 501 } else { 502 ndata->size = base - ndata->address; 503 wasteage += ndata->size; 504 } 505 PRM_DEBUG(wasteage); 506 507 return ((void *)base); 508 } 509 510 /* 511 * Select the best matching buffer, avoid memory fragmentation. 512 */ 513 static struct memlist * 514 ndata_select_chunk(struct memlist *ndata, size_t wanted, size_t alignment) 515 { 516 struct memlist *fnd_below = NULL; 517 struct memlist *fnd_above = NULL; 518 struct memlist *fnd_unused = NULL; 519 struct memlist *frlist; 520 uintptr_t base; 521 uintptr_t end; 522 size_t below; 523 size_t above; 524 size_t unused; 525 size_t best_below = ULONG_MAX; 526 size_t best_above = ULONG_MAX; 527 size_t best_unused = ULONG_MAX; 528 529 ASSERT(ndata != NULL); 530 531 /* 532 * Look for the best matching buffer, avoid memory fragmentation. 533 * The following strategy is used, try to find 534 * 1. an exact fitting buffer 535 * 2. avoid wasting any space below the buffer, take first 536 * fitting buffer 537 * 3. avoid wasting any space above the buffer, take first 538 * fitting buffer 539 * 4. avoid wasting space, take first fitting buffer 540 * 5. take the last buffer in chain 541 */ 542 for (frlist = ndata; frlist != NULL; frlist = frlist->next) { 543 base = roundup(frlist->address, alignment); 544 end = roundup(base + wanted, ecache_alignsize); 545 546 if (end > frlist->address + frlist->size) 547 continue; 548 549 below = (base - frlist->address) / ecache_alignsize; 550 above = (frlist->address + frlist->size - end) / 551 ecache_alignsize; 552 unused = below + above; 553 554 if (unused == 0) 555 return (frlist); 556 557 if (frlist->next == NULL) 558 break; 559 560 if (below < best_below) { 561 best_below = below; 562 fnd_below = frlist; 563 } 564 565 if (above < best_above) { 566 best_above = above; 567 fnd_above = frlist; 568 } 569 570 if (unused < best_unused) { 571 best_unused = unused; 572 fnd_unused = frlist; 573 } 574 } 575 576 if (best_below == 0) 577 return (fnd_below); 578 if (best_above == 0) 579 return (fnd_above); 580 if (best_unused < ULONG_MAX) 581 return (fnd_unused); 582 583 return (frlist); 584 } 585 586 /* 587 * Nucleus data memory allocator. 588 * The granularity of the allocator is ecache_alignsize. 589 * See also comment for ndata_alloc_init(). 590 */ 591 void * 592 ndata_alloc(struct memlist *ndata, size_t wanted, size_t alignment) 593 { 594 struct memlist *found; 595 struct memlist *fnd_above; 596 uintptr_t base; 597 uintptr_t end; 598 size_t below; 599 size_t above; 600 601 /* 602 * Look for the best matching buffer, avoid memory fragmentation. 603 */ 604 if ((found = ndata_select_chunk(ndata, wanted, alignment)) == NULL) 605 return (NULL); 606 607 /* 608 * Allocate the nucleus data buffer. 609 */ 610 base = roundup(found->address, alignment); 611 end = roundup(base + wanted, ecache_alignsize); 612 ASSERT(end <= found->address + found->size); 613 614 below = base - found->address; 615 above = found->address + found->size - end; 616 ASSERT(above == 0 || (above % ecache_alignsize) == 0); 617 618 if (below >= ecache_alignsize) { 619 /* 620 * There is free memory below the allocated memory chunk. 621 */ 622 found->size = below - below % ecache_alignsize; 623 624 if (above) { 625 fnd_above = (struct memlist *)end; 626 fnd_above->address = end; 627 fnd_above->size = above; 628 629 if ((fnd_above->next = found->next) != NULL) 630 found->next->prev = fnd_above; 631 fnd_above->prev = found; 632 found->next = fnd_above; 633 } 634 635 return ((void *)base); 636 } 637 638 if (found->prev == NULL) { 639 /* 640 * The first chunk (ndata) is selected. 641 */ 642 ASSERT(found == ndata); 643 if (above) { 644 found->address = end; 645 found->size = above; 646 } else if (found->next != NULL) { 647 found->address = found->next->address; 648 found->size = found->next->size; 649 if ((found->next = found->next->next) != NULL) 650 found->next->prev = found; 651 652 bzero((void *)found->address, sizeof (struct memlist)); 653 } else { 654 found->address = end; 655 found->size = 0; 656 } 657 658 return ((void *)base); 659 } 660 661 /* 662 * Not the first chunk. 663 */ 664 if (above) { 665 fnd_above = (struct memlist *)end; 666 fnd_above->address = end; 667 fnd_above->size = above; 668 669 if ((fnd_above->next = found->next) != NULL) 670 fnd_above->next->prev = fnd_above; 671 fnd_above->prev = found->prev; 672 found->prev->next = fnd_above; 673 674 } else { 675 if ((found->prev->next = found->next) != NULL) 676 found->next->prev = found->prev; 677 } 678 679 bzero((void *)found->address, sizeof (struct memlist)); 680 681 return ((void *)base); 682 } 683 684 /* 685 * Size the kernel TSBs based upon the amount of physical 686 * memory in the system. 687 */ 688 static void 689 calc_tsb_sizes(pgcnt_t npages) 690 { 691 PRM_DEBUG(npages); 692 693 if (npages <= TSB_FREEMEM_MIN) { 694 ktsb_szcode = TSB_128K_SZCODE; 695 enable_bigktsb = 0; 696 } else if (npages <= TSB_FREEMEM_LARGE / 2) { 697 ktsb_szcode = TSB_256K_SZCODE; 698 enable_bigktsb = 0; 699 } else if (npages <= TSB_FREEMEM_LARGE) { 700 ktsb_szcode = TSB_512K_SZCODE; 701 enable_bigktsb = 0; 702 } else if (npages <= TSB_FREEMEM_LARGE * 2 || 703 enable_bigktsb == 0) { 704 ktsb_szcode = TSB_1M_SZCODE; 705 enable_bigktsb = 0; 706 } else { 707 ktsb_szcode = highbit(npages - 1); 708 ktsb_szcode -= TSB_START_SIZE; 709 ktsb_szcode = MAX(ktsb_szcode, MIN_BIGKTSB_SZCODE); 710 ktsb_szcode = MIN(ktsb_szcode, MAX_BIGKTSB_SZCODE); 711 } 712 713 /* 714 * We choose the TSB to hold kernel 4M mappings to have twice 715 * the reach as the primary kernel TSB since this TSB will 716 * potentially (currently) be shared by both mappings to all of 717 * physical memory plus user TSBs. Since the current 718 * limit on primary kernel TSB size is 16MB this will top out 719 * at 64K which we can certainly afford. 720 */ 721 ktsb4m_szcode = ktsb_szcode - (MMU_PAGESHIFT4M - MMU_PAGESHIFT) + 1; 722 if (ktsb4m_szcode < TSB_MIN_SZCODE) 723 ktsb4m_szcode = TSB_MIN_SZCODE; 724 725 ktsb_sz = TSB_BYTES(ktsb_szcode); /* kernel 8K tsb size */ 726 ktsb4m_sz = TSB_BYTES(ktsb4m_szcode); /* kernel 4M tsb size */ 727 } 728 729 /* 730 * Allocate kernel TSBs from nucleus data memory. 731 * The function return 0 on success and -1 on failure. 732 */ 733 int 734 ndata_alloc_tsbs(struct memlist *ndata, pgcnt_t npages) 735 { 736 /* 737 * Size the kernel TSBs based upon the amount of physical 738 * memory in the system. 739 */ 740 calc_tsb_sizes(npages); 741 742 /* 743 * Allocate the 8K kernel TSB if it belongs inside the nucleus. 744 */ 745 if (enable_bigktsb == 0) { 746 if ((ktsb_base = ndata_alloc(ndata, ktsb_sz, ktsb_sz)) == NULL) 747 return (-1); 748 ASSERT(!((uintptr_t)ktsb_base & (ktsb_sz - 1))); 749 750 PRM_DEBUG(ktsb_base); 751 PRM_DEBUG(ktsb_sz); 752 PRM_DEBUG(ktsb_szcode); 753 } 754 755 /* 756 * Next, allocate 4M kernel TSB from the nucleus since it's small. 757 */ 758 if ((ktsb4m_base = ndata_alloc(ndata, ktsb4m_sz, ktsb4m_sz)) == NULL) 759 return (-1); 760 ASSERT(!((uintptr_t)ktsb4m_base & (ktsb4m_sz - 1))); 761 762 PRM_DEBUG(ktsb4m_base); 763 PRM_DEBUG(ktsb4m_sz); 764 PRM_DEBUG(ktsb4m_szcode); 765 766 return (0); 767 } 768 769 /* 770 * Allocate hat structs from the nucleus data memory. 771 */ 772 int 773 ndata_alloc_hat(struct memlist *ndata, pgcnt_t npages, pgcnt_t kpm_npages) 774 { 775 size_t mml_alloc_sz; 776 size_t cb_alloc_sz; 777 int max_nucuhme_buckets = MAX_NUCUHME_BUCKETS; 778 int max_nuckhme_buckets = MAX_NUCKHME_BUCKETS; 779 ulong_t hme_buckets; 780 781 if (enable_bigktsb) { 782 ASSERT((max_nucuhme_buckets + max_nuckhme_buckets) * 783 sizeof (struct hmehash_bucket) <= 784 TSB_BYTES(TSB_1M_SZCODE)); 785 786 max_nucuhme_buckets *= 2; 787 max_nuckhme_buckets *= 2; 788 } 789 790 /* 791 * The number of buckets in the hme hash tables 792 * is a power of 2 such that the average hash chain length is 793 * HMENT_HASHAVELEN. The number of buckets for the user hash is 794 * a function of physical memory and a predefined overmapping factor. 795 * The number of buckets for the kernel hash is a function of 796 * physical memory only. 797 */ 798 hme_buckets = (npages * HMEHASH_FACTOR) / 799 (HMENT_HASHAVELEN * (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT)); 800 801 uhmehash_num = (int)MIN(hme_buckets, MAX_UHME_BUCKETS); 802 803 if (uhmehash_num > USER_BUCKETS_THRESHOLD) { 804 /* 805 * if uhmehash_num is not power of 2 round it down to the 806 * next power of 2. 807 */ 808 uint_t align = 1 << (highbit(uhmehash_num - 1) - 1); 809 uhmehash_num = P2ALIGN(uhmehash_num, align); 810 } else 811 uhmehash_num = 1 << highbit(uhmehash_num - 1); 812 813 hme_buckets = npages / (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT); 814 khmehash_num = (int)MIN(hme_buckets, MAX_KHME_BUCKETS); 815 khmehash_num = 1 << highbit(khmehash_num - 1); 816 khmehash_num = MAX(khmehash_num, MIN_KHME_BUCKETS); 817 818 if ((khmehash_num > max_nuckhme_buckets) || 819 (uhmehash_num > max_nucuhme_buckets)) { 820 khme_hash = NULL; 821 uhme_hash = NULL; 822 } else { 823 size_t hmehash_sz = (uhmehash_num + khmehash_num) * 824 sizeof (struct hmehash_bucket); 825 826 if ((khme_hash = ndata_alloc(ndata, hmehash_sz, 827 ecache_alignsize)) != NULL) 828 uhme_hash = &khme_hash[khmehash_num]; 829 else 830 uhme_hash = NULL; 831 832 PRM_DEBUG(hmehash_sz); 833 } 834 835 PRM_DEBUG(khme_hash); 836 PRM_DEBUG(khmehash_num); 837 PRM_DEBUG(uhme_hash); 838 PRM_DEBUG(uhmehash_num); 839 840 /* 841 * For the page mapping list mutex array we allocate one mutex 842 * for every 128 pages (1 MB) with a minimum of 64 entries and 843 * a maximum of 8K entries. For the initial computation npages 844 * is rounded up (ie. 1 << highbit(npages * 1.5 / 128)) 845 * 846 * mml_shift is roughly log2(mml_table_sz) + 3 for MLIST_HASH 847 * 848 * It is not required that this be allocated from the nucleus, 849 * but it is desirable. So we first allocate from the nucleus 850 * everything that must be there. Having done so, if mml_table 851 * will fit within what remains of the nucleus then it will be 852 * allocated here. If not, set mml_table to NULL, which will cause 853 * startup_memlist() to BOP_ALLOC() space for it after our return... 854 */ 855 mml_table_sz = 1 << highbit((npages * 3) / 256); 856 if (mml_table_sz < 64) 857 mml_table_sz = 64; 858 else if (mml_table_sz > 8192) 859 mml_table_sz = 8192; 860 mml_shift = highbit(mml_table_sz) + 3; 861 862 PRM_DEBUG(mml_table_sz); 863 PRM_DEBUG(mml_shift); 864 865 mml_alloc_sz = mml_table_sz * sizeof (kmutex_t); 866 867 mml_table = ndata_alloc(ndata, mml_alloc_sz, ecache_alignsize); 868 869 PRM_DEBUG(mml_table); 870 871 cb_alloc_sz = sfmmu_max_cb_id * sizeof (struct sfmmu_callback); 872 PRM_DEBUG(cb_alloc_sz); 873 sfmmu_cb_table = ndata_alloc(ndata, cb_alloc_sz, ecache_alignsize); 874 PRM_DEBUG(sfmmu_cb_table); 875 876 /* 877 * For the kpm_page mutex array we allocate one mutex every 16 878 * kpm pages (64MB). In smallpage mode we allocate one mutex 879 * every 8K pages. The minimum is set to 64 entries and the 880 * maximum to 8K entries. 881 * 882 * It is not required that this be allocated from the nucleus, 883 * but it is desirable. So we first allocate from the nucleus 884 * everything that must be there. Having done so, if kpmp_table 885 * or kpmp_stable will fit within what remains of the nucleus 886 * then it will be allocated here. If not, startup_memlist() 887 * will use BOP_ALLOC() space for it after our return... 888 */ 889 if (kpm_enable) { 890 size_t kpmp_alloc_sz; 891 892 if (kpm_smallpages == 0) { 893 kpmp_shift = highbit(sizeof (kpm_page_t)) - 1; 894 kpmp_table_sz = 1 << highbit(kpm_npages / 16); 895 kpmp_table_sz = (kpmp_table_sz < 64) ? 64 : 896 ((kpmp_table_sz > 8192) ? 8192 : kpmp_table_sz); 897 kpmp_alloc_sz = kpmp_table_sz * sizeof (kpm_hlk_t); 898 899 kpmp_table = ndata_alloc(ndata, kpmp_alloc_sz, 900 ecache_alignsize); 901 902 PRM_DEBUG(kpmp_table); 903 PRM_DEBUG(kpmp_table_sz); 904 905 kpmp_stable_sz = 0; 906 kpmp_stable = NULL; 907 } else { 908 ASSERT(kpm_pgsz == PAGESIZE); 909 kpmp_shift = highbit(sizeof (kpm_shlk_t)) + 1; 910 kpmp_stable_sz = 1 << highbit(kpm_npages / 8192); 911 kpmp_stable_sz = (kpmp_stable_sz < 64) ? 64 : 912 ((kpmp_stable_sz > 8192) ? 8192 : kpmp_stable_sz); 913 kpmp_alloc_sz = kpmp_stable_sz * sizeof (kpm_shlk_t); 914 915 kpmp_stable = ndata_alloc(ndata, kpmp_alloc_sz, 916 ecache_alignsize); 917 918 PRM_DEBUG(kpmp_stable); 919 PRM_DEBUG(kpmp_stable_sz); 920 921 kpmp_table_sz = 0; 922 kpmp_table = NULL; 923 } 924 PRM_DEBUG(kpmp_shift); 925 } 926 927 return (0); 928 } 929 930 caddr_t 931 alloc_hme_buckets(caddr_t base, int pagesize) 932 { 933 size_t hmehash_sz = (uhmehash_num + khmehash_num) * 934 sizeof (struct hmehash_bucket); 935 936 ASSERT(khme_hash == NULL); 937 ASSERT(uhme_hash == NULL); 938 939 /* If no pagesize specified, use default MMU pagesize */ 940 if (!pagesize) 941 pagesize = MMU_PAGESIZE; 942 943 /* 944 * If we start aligned and ask for a multiple of a pagesize, and OBP 945 * supports large pages, we will then use mappings of the largest size 946 * possible for the BOP_ALLOC, possibly saving us tens of thousands of 947 * TLB miss-induced traversals of the TSBs and/or the HME hashes... 948 */ 949 base = (caddr_t)roundup((uintptr_t)base, pagesize); 950 hmehash_sz = roundup(hmehash_sz, pagesize); 951 952 khme_hash = (struct hmehash_bucket *)BOP_ALLOC(bootops, base, 953 hmehash_sz, pagesize); 954 955 if ((caddr_t)khme_hash != base) 956 cmn_err(CE_PANIC, "Cannot bop_alloc hme hash buckets."); 957 958 uhme_hash = (struct hmehash_bucket *)((caddr_t)khme_hash + 959 khmehash_num * sizeof (struct hmehash_bucket)); 960 base += hmehash_sz; 961 return (base); 962 } 963 964 /* 965 * This function bop allocs the kernel TSB. 966 */ 967 caddr_t 968 sfmmu_ktsb_alloc(caddr_t tsbbase) 969 { 970 caddr_t vaddr; 971 972 if (enable_bigktsb) { 973 ktsb_base = (caddr_t)roundup((uintptr_t)tsbbase, ktsb_sz); 974 vaddr = (caddr_t)BOP_ALLOC(bootops, ktsb_base, ktsb_sz, 975 ktsb_sz); 976 if (vaddr != ktsb_base) 977 cmn_err(CE_PANIC, "sfmmu_ktsb_alloc: can't alloc" 978 " bigktsb"); 979 ktsb_base = vaddr; 980 tsbbase = ktsb_base + ktsb_sz; 981 PRM_DEBUG(ktsb_base); 982 PRM_DEBUG(tsbbase); 983 } 984 return (tsbbase); 985 } 986 987 /* 988 * Moves code assembled outside of the trap table into the trap 989 * table taking care to relocate relative branches to code outside 990 * of the trap handler. 991 */ 992 static void 993 sfmmu_reloc_trap_handler(void *tablep, void *start, size_t count) 994 { 995 size_t i; 996 uint32_t *src; 997 uint32_t *dst; 998 uint32_t inst; 999 int op, op2; 1000 int32_t offset; 1001 int disp; 1002 1003 src = start; 1004 dst = tablep; 1005 offset = src - dst; 1006 for (src = start, i = 0; i < count; i++, src++, dst++) { 1007 inst = *dst = *src; 1008 op = (inst >> 30) & 0x2; 1009 if (op == 1) { 1010 /* call */ 1011 disp = ((int32_t)inst << 2) >> 2; /* sign-extend */ 1012 if (disp + i >= 0 && disp + i < count) 1013 continue; 1014 disp += offset; 1015 inst = 0x40000000u | (disp & 0x3fffffffu); 1016 *dst = inst; 1017 } else if (op == 0) { 1018 /* branch or sethi */ 1019 op2 = (inst >> 22) & 0x7; 1020 1021 switch (op2) { 1022 case 0x3: /* BPr */ 1023 disp = (((inst >> 20) & 0x3) << 14) | 1024 (inst & 0x3fff); 1025 disp = (disp << 16) >> 16; /* sign-extend */ 1026 if (disp + i >= 0 && disp + i < count) 1027 continue; 1028 disp += offset; 1029 if (((disp << 16) >> 16) != disp) 1030 cmn_err(CE_PANIC, "bad reloc"); 1031 inst &= ~0x303fff; 1032 inst |= (disp & 0x3fff); 1033 inst |= (disp & 0xc000) << 6; 1034 break; 1035 1036 case 0x2: /* Bicc */ 1037 disp = ((int32_t)inst << 10) >> 10; 1038 if (disp + i >= 0 && disp + i < count) 1039 continue; 1040 disp += offset; 1041 if (((disp << 10) >> 10) != disp) 1042 cmn_err(CE_PANIC, "bad reloc"); 1043 inst &= ~0x3fffff; 1044 inst |= (disp & 0x3fffff); 1045 break; 1046 1047 case 0x1: /* Bpcc */ 1048 disp = ((int32_t)inst << 13) >> 13; 1049 if (disp + i >= 0 && disp + i < count) 1050 continue; 1051 disp += offset; 1052 if (((disp << 13) >> 13) != disp) 1053 cmn_err(CE_PANIC, "bad reloc"); 1054 inst &= ~0x7ffff; 1055 inst |= (disp & 0x7ffffu); 1056 break; 1057 } 1058 *dst = inst; 1059 } 1060 } 1061 flush_instr_mem(tablep, count * sizeof (uint32_t)); 1062 } 1063 1064 /* 1065 * Routine to allocate a large page to use in the TSB caches. 1066 */ 1067 /*ARGSUSED*/ 1068 static page_t * 1069 sfmmu_tsb_page_create(void *addr, size_t size, int vmflag, void *arg) 1070 { 1071 int pgflags; 1072 1073 pgflags = PG_EXCL; 1074 if ((vmflag & VM_NOSLEEP) == 0) 1075 pgflags |= PG_WAIT; 1076 if (vmflag & VM_PANIC) 1077 pgflags |= PG_PANIC; 1078 if (vmflag & VM_PUSHPAGE) 1079 pgflags |= PG_PUSHPAGE; 1080 1081 return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size, 1082 pgflags, &kvseg, addr, arg)); 1083 } 1084 1085 /* 1086 * Allocate a large page to back the virtual address range 1087 * [addr, addr + size). If addr is NULL, allocate the virtual address 1088 * space as well. 1089 */ 1090 static void * 1091 sfmmu_tsb_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, 1092 uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *), 1093 void *pcarg) 1094 { 1095 page_t *ppl; 1096 page_t *rootpp; 1097 caddr_t addr = inaddr; 1098 pgcnt_t npages = btopr(size); 1099 page_t **ppa; 1100 int i = 0; 1101 1102 /* 1103 * Assuming that only TSBs will call this with size > PAGESIZE 1104 * There is no reason why this couldn't be expanded to 8k pages as 1105 * well, or other page sizes in the future .... but for now, we 1106 * only support fixed sized page requests. 1107 */ 1108 if ((inaddr == NULL) && ((addr = vmem_xalloc(vmp, size, size, 0, 0, 1109 NULL, NULL, vmflag)) == NULL)) 1110 return (NULL); 1111 1112 /* If we ever don't want TSB slab-sized pages, this will panic */ 1113 ASSERT(((uintptr_t)addr & (tsb_slab_size - 1)) == 0); 1114 1115 if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) { 1116 if (inaddr == NULL) 1117 vmem_xfree(vmp, addr, size); 1118 return (NULL); 1119 } 1120 1121 ppl = page_create_func(addr, size, vmflag, pcarg); 1122 if (ppl == NULL) { 1123 if (inaddr == NULL) 1124 vmem_xfree(vmp, addr, size); 1125 page_unresv(npages); 1126 return (NULL); 1127 } 1128 1129 rootpp = ppl; 1130 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP); 1131 while (ppl != NULL) { 1132 page_t *pp = ppl; 1133 ppa[i++] = pp; 1134 page_sub(&ppl, pp); 1135 ASSERT(page_iolock_assert(pp)); 1136 page_io_unlock(pp); 1137 } 1138 1139 /* 1140 * Load the locked entry. It's OK to preload the entry into 1141 * the TSB since we now support large mappings in the kernel TSB. 1142 */ 1143 hat_memload_array(kas.a_hat, (caddr_t)rootpp->p_offset, size, 1144 ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, HAT_LOAD_LOCK); 1145 1146 for (--i; i >= 0; --i) { 1147 (void) page_pp_lock(ppa[i], 0, 1); 1148 page_unlock(ppa[i]); 1149 } 1150 1151 kmem_free(ppa, npages * sizeof (page_t *)); 1152 return (addr); 1153 } 1154 1155 /* Called to import new spans into the TSB vmem arenas */ 1156 void * 1157 sfmmu_tsb_segkmem_alloc(vmem_t *vmp, size_t size, int vmflag) 1158 { 1159 lgrp_id_t lgrpid = LGRP_NONE; 1160 1161 if (tsb_lgrp_affinity) { 1162 /* 1163 * Search for the vmp->lgrpid mapping by brute force; 1164 * some day vmp will have an lgrp, until then we have 1165 * to do this the hard way. 1166 */ 1167 for (lgrpid = 0; lgrpid < NLGRPS_MAX && 1168 vmp != kmem_tsb_default_arena[lgrpid]; lgrpid++); 1169 if (lgrpid == NLGRPS_MAX) 1170 lgrpid = LGRP_NONE; 1171 } 1172 1173 return (sfmmu_tsb_xalloc(vmp, NULL, size, vmflag, 0, 1174 sfmmu_tsb_page_create, lgrpid != LGRP_NONE? &lgrpid : NULL)); 1175 } 1176 1177 /* Called to free spans from the TSB vmem arenas */ 1178 void 1179 sfmmu_tsb_segkmem_free(vmem_t *vmp, void *inaddr, size_t size) 1180 { 1181 page_t *pp; 1182 caddr_t addr = inaddr; 1183 caddr_t eaddr; 1184 pgcnt_t npages = btopr(size); 1185 pgcnt_t pgs_left = npages; 1186 page_t *rootpp = NULL; 1187 1188 ASSERT(((uintptr_t)addr & (tsb_slab_size - 1)) == 0); 1189 1190 hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK); 1191 1192 for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) { 1193 pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL); 1194 if (pp == NULL) 1195 panic("sfmmu_tsb_segkmem_free: page not found"); 1196 1197 ASSERT(PAGE_EXCL(pp)); 1198 page_pp_unlock(pp, 0, 1); 1199 1200 if (rootpp == NULL) 1201 rootpp = pp; 1202 if (--pgs_left == 0) { 1203 /* 1204 * similar logic to segspt_free_pages, but we know we 1205 * have one large page. 1206 */ 1207 page_destroy_pages(rootpp); 1208 } 1209 } 1210 page_unresv(npages); 1211 1212 if (vmp != NULL) 1213 vmem_xfree(vmp, inaddr, size); 1214 } 1215