1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <vm/hat.h> 31 #include <vm/hat_sfmmu.h> 32 #include <vm/page.h> 33 #include <sys/pte.h> 34 #include <sys/systm.h> 35 #include <sys/mman.h> 36 #include <sys/sysmacros.h> 37 #include <sys/machparam.h> 38 #include <sys/vtrace.h> 39 #include <sys/kmem.h> 40 #include <sys/mmu.h> 41 #include <sys/cmn_err.h> 42 #include <sys/cpu.h> 43 #include <sys/cpuvar.h> 44 #include <sys/debug.h> 45 #include <sys/lgrp.h> 46 #include <sys/archsystm.h> 47 #include <sys/machsystm.h> 48 #include <sys/vmsystm.h> 49 #include <sys/bitmap.h> 50 #include <vm/as.h> 51 #include <vm/seg.h> 52 #include <vm/seg_kmem.h> 53 #include <vm/seg_kp.h> 54 #include <vm/seg_kpm.h> 55 #include <vm/rm.h> 56 #include <vm/vm_dep.h> 57 #include <sys/t_lock.h> 58 #include <sys/vm_machparam.h> 59 #include <sys/promif.h> 60 #include <sys/prom_isa.h> 61 #include <sys/prom_plat.h> 62 #include <sys/prom_debug.h> 63 #include <sys/privregs.h> 64 #include <sys/bootconf.h> 65 #include <sys/memlist.h> 66 #include <sys/memlist_plat.h> 67 #include <sys/cpu_module.h> 68 #include <sys/reboot.h> 69 #include <sys/kdi.h> 70 71 /* 72 * Static routines 73 */ 74 static void sfmmu_map_prom_mappings(struct translation *, size_t); 75 static struct translation *read_prom_mappings(size_t *); 76 static void sfmmu_reloc_trap_handler(void *, void *, size_t); 77 78 /* 79 * External routines 80 */ 81 extern void sfmmu_remap_kernel(void); 82 extern void sfmmu_patch_utsb(void); 83 84 /* 85 * Global Data: 86 */ 87 extern caddr_t textva, datava; 88 extern tte_t ktext_tte, kdata_tte; /* ttes for kernel text and data */ 89 extern int enable_bigktsb; 90 91 uint64_t memsegspa = (uintptr_t)MSEG_NULLPTR_PA; /* memsegs physical linkage */ 92 uint64_t memseg_phash[N_MEM_SLOTS]; /* use physical memseg addresses */ 93 94 int sfmmu_kern_mapped = 0; 95 96 /* 97 * DMMU primary context register for the kernel context. Machine specific code 98 * inserts correct page size codes when necessary 99 */ 100 uint64_t kcontextreg = KCONTEXT; 101 102 /* Extern Global Data */ 103 104 extern int page_relocate_ready; 105 106 /* 107 * Controls the logic which enables the use of the 108 * QUAD_LDD_PHYS ASI for TSB accesses. 109 */ 110 extern int ktsb_phys; 111 112 /* 113 * Global Routines called from within: 114 * usr/src/uts/sun4u 115 * usr/src/uts/sfmmu 116 * usr/src/uts/sun 117 */ 118 119 pfn_t 120 va_to_pfn(void *vaddr) 121 { 122 u_longlong_t physaddr; 123 int mode, valid; 124 125 if (tba_taken_over) 126 return (hat_getpfnum(kas.a_hat, (caddr_t)vaddr)); 127 128 if ((prom_translate_virt(vaddr, &valid, &physaddr, &mode) != -1) && 129 (valid == -1)) { 130 return ((pfn_t)(physaddr >> MMU_PAGESHIFT)); 131 } 132 return (PFN_INVALID); 133 } 134 135 uint64_t 136 va_to_pa(void *vaddr) 137 { 138 pfn_t pfn; 139 140 if ((pfn = va_to_pfn(vaddr)) == PFN_INVALID) 141 return ((uint64_t)-1); 142 return (((uint64_t)pfn << MMU_PAGESHIFT) | 143 ((uint64_t)vaddr & MMU_PAGEOFFSET)); 144 } 145 146 void 147 hat_kern_setup(void) 148 { 149 struct translation *trans_root; 150 size_t ntrans_root; 151 extern void startup_fixup_physavail(void); 152 153 /* 154 * These are the steps we take to take over the mmu from the prom. 155 * 156 * (1) Read the prom's mappings through the translation property. 157 * (2) Remap the kernel text and kernel data with 2 locked 4MB ttes. 158 * Create the the hmeblks for these 2 ttes at this time. 159 * (3) Create hat structures for all other prom mappings. Since the 160 * kernel text and data hme_blks have already been created we 161 * skip the equivalent prom's mappings. 162 * (4) Initialize the tsb and its corresponding hardware regs. 163 * (5) Take over the trap table (currently in startup). 164 * (6) Up to this point it is possible the prom required some of its 165 * locked tte's. Now that we own the trap table we remove them. 166 */ 167 168 ktsb_pbase = va_to_pa(ktsb_base); 169 ktsb4m_pbase = va_to_pa(ktsb4m_base); 170 PRM_DEBUG(ktsb_pbase); 171 PRM_DEBUG(ktsb4m_pbase); 172 173 sfmmu_setup_4lp(); 174 sfmmu_patch_ktsb(); 175 sfmmu_patch_utsb(); 176 sfmmu_patch_mmu_asi(ktsb_phys); 177 178 sfmmu_init_tsbs(); 179 180 if (kpm_enable) { 181 sfmmu_kpm_patch_tlbm(); 182 if (kpm_smallpages == 0) { 183 sfmmu_kpm_patch_tsbm(); 184 } 185 } 186 187 /* 188 * The 8K-indexed kernel TSB space is used to hold 189 * translations below... 190 */ 191 trans_root = read_prom_mappings(&ntrans_root); 192 sfmmu_remap_kernel(); 193 startup_fixup_physavail(); 194 mmu_init_kernel_pgsz(kas.a_hat); 195 sfmmu_map_prom_mappings(trans_root, ntrans_root); 196 197 /* 198 * We invalidate 8K kernel TSB because we used it in 199 * sfmmu_map_prom_mappings() 200 */ 201 sfmmu_inv_tsb(ktsb_base, ktsb_sz); 202 sfmmu_inv_tsb(ktsb4m_base, ktsb4m_sz); 203 204 sfmmu_init_ktsbinfo(); 205 206 207 sfmmu_kern_mapped = 1; 208 209 /* 210 * hments have been created for mapped pages, and thus we're ready 211 * for kmdb to start using its own trap table. It walks the hments 212 * to resolve TLB misses, and can't be used until they're ready. 213 */ 214 if (boothowto & RB_DEBUG) 215 kdi_dvec_vmready(); 216 } 217 218 /* 219 * Macro used below to convert the prom's 32-bit high and low fields into 220 * a value appropriate for the 64-bit kernel. 221 */ 222 223 #define COMBINE(hi, lo) (((uint64_t)(uint32_t)(hi) << 32) | (uint32_t)(lo)) 224 225 /* 226 * This function traverses the prom mapping list and creates equivalent 227 * mappings in the sfmmu mapping hash. 228 */ 229 static void 230 sfmmu_map_prom_mappings(struct translation *trans_root, size_t ntrans_root) 231 { 232 struct translation *promt; 233 tte_t tte, oldtte, *ttep; 234 pfn_t pfn, oldpfn, basepfn; 235 caddr_t vaddr; 236 size_t size, offset; 237 unsigned long i; 238 uint_t attr; 239 page_t *pp; 240 extern struct memlist *virt_avail; 241 242 ttep = &tte; 243 for (i = 0, promt = trans_root; i < ntrans_root; i++, promt++) { 244 ASSERT(promt->tte_hi != 0); 245 ASSERT32(promt->virt_hi == 0 && promt->size_hi == 0); 246 247 /* 248 * hack until we get rid of map-for-unix 249 */ 250 if (COMBINE(promt->virt_hi, promt->virt_lo) < KERNELBASE) 251 continue; 252 253 ttep->tte_inthi = promt->tte_hi; 254 ttep->tte_intlo = promt->tte_lo; 255 attr = PROC_DATA | HAT_NOSYNC; 256 #if defined(TTE_IS_GLOBAL) 257 if (TTE_IS_GLOBAL(ttep)) { 258 /* 259 * The prom better not use global translations 260 * because a user process might use the same 261 * virtual addresses 262 */ 263 cmn_err(CE_PANIC, "map_prom: global translation"); 264 TTE_SET_LOFLAGS(ttep, TTE_GLB_INT, 0); 265 } 266 #endif 267 if (TTE_IS_LOCKED(ttep)) { 268 /* clear the lock bits */ 269 TTE_CLR_LOCKED(ttep); 270 } 271 attr |= (TTE_IS_VCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEVTTE; 272 attr |= (TTE_IS_PCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEPTTE; 273 attr |= (TTE_IS_SIDEFFECT(ttep)) ? SFMMU_SIDEFFECT : 0; 274 attr |= (TTE_IS_IE(ttep)) ? HAT_STRUCTURE_LE : 0; 275 276 size = COMBINE(promt->size_hi, promt->size_lo); 277 offset = 0; 278 basepfn = TTE_TO_PFN((caddr_t)COMBINE(promt->virt_hi, 279 promt->virt_lo), ttep); 280 while (size) { 281 vaddr = (caddr_t)(COMBINE(promt->virt_hi, 282 promt->virt_lo) + offset); 283 284 /* 285 * make sure address is not in virt-avail list 286 */ 287 if (address_in_memlist(virt_avail, (uint64_t)vaddr, 288 size)) { 289 cmn_err(CE_PANIC, "map_prom: inconsistent " 290 "translation/avail lists"); 291 } 292 293 pfn = basepfn + mmu_btop(offset); 294 if (pf_is_memory(pfn)) { 295 if (attr & SFMMU_UNCACHEPTTE) { 296 cmn_err(CE_PANIC, "map_prom: " 297 "uncached prom memory page"); 298 } 299 } else { 300 if (!(attr & SFMMU_SIDEFFECT)) { 301 cmn_err(CE_PANIC, "map_prom: prom " 302 "i/o page without side-effect"); 303 } 304 } 305 oldpfn = sfmmu_vatopfn(vaddr, KHATID, &oldtte); 306 ASSERT(oldpfn != PFN_SUSPENDED); 307 ASSERT(page_relocate_ready == 0); 308 309 if (oldpfn != PFN_INVALID) { 310 /* 311 * mapping already exists. 312 * Verify they are equal 313 */ 314 if (pfn != oldpfn) { 315 cmn_err(CE_PANIC, "map_prom: mapping " 316 "conflict (va=0x%p pfn=%p, " 317 "oldpfn=%p)", 318 (void *)vaddr, (void *)pfn, 319 (void *)oldpfn); 320 } 321 size -= MMU_PAGESIZE; 322 offset += MMU_PAGESIZE; 323 continue; 324 } 325 326 pp = page_numtopp_nolock(pfn); 327 if ((pp != NULL) && PP_ISFREE((page_t *)pp)) { 328 cmn_err(CE_PANIC, "map_prom: " 329 "prom-mapped page (va 0x%p, pfn 0x%p) " 330 "on free list", (void *)vaddr, (void *)pfn); 331 } 332 333 sfmmu_memtte(ttep, pfn, attr, TTE8K); 334 sfmmu_tteload(kas.a_hat, ttep, vaddr, pp, 335 HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD); 336 size -= MMU_PAGESIZE; 337 offset += MMU_PAGESIZE; 338 } 339 } 340 } 341 342 #undef COMBINE /* local to previous routine */ 343 344 /* 345 * This routine reads in the "translations" property in to a buffer and 346 * returns a pointer to this buffer and the number of translations. 347 */ 348 static struct translation * 349 read_prom_mappings(size_t *ntransrootp) 350 { 351 char *prop = "translations"; 352 size_t translen; 353 dnode_t node; 354 struct translation *transroot; 355 356 /* 357 * the "translations" property is associated with the mmu node 358 */ 359 node = (dnode_t)prom_getphandle(prom_mmu_ihandle()); 360 361 /* 362 * We use the TSB space to read in the prom mappings. This space 363 * is currently not being used because we haven't taken over the 364 * trap table yet. It should be big enough to hold the mappings. 365 */ 366 if ((translen = prom_getproplen(node, prop)) == -1) 367 cmn_err(CE_PANIC, "no translations property"); 368 *ntransrootp = translen / sizeof (*transroot); 369 translen = roundup(translen, MMU_PAGESIZE); 370 PRM_DEBUG(translen); 371 if (translen > TSB_BYTES(ktsb_szcode)) 372 cmn_err(CE_PANIC, "not enough space for translations"); 373 374 transroot = (struct translation *)ktsb_base; 375 ASSERT(transroot); 376 if (prom_getprop(node, prop, (caddr_t)transroot) == -1) { 377 cmn_err(CE_PANIC, "translations getprop failed"); 378 } 379 return (transroot); 380 } 381 382 /* 383 * Init routine of the nucleus data memory allocator. 384 * 385 * The nucleus data memory allocator is organized in ecache_alignsize'd 386 * memory chunks. Memory allocated by ndata_alloc() will never be freed. 387 * 388 * The ndata argument is used as header of the ndata freelist. 389 * Other freelist nodes are placed in the nucleus memory itself 390 * at the beginning of a free memory chunk. Therefore a freelist 391 * node (struct memlist) must fit into the smallest allocatable 392 * memory chunk (ecache_alignsize bytes). 393 * 394 * The memory interval [base, end] passed to ndata_alloc_init() must be 395 * bzero'd to allow the allocator to return bzero'd memory easily. 396 */ 397 void 398 ndata_alloc_init(struct memlist *ndata, uintptr_t base, uintptr_t end) 399 { 400 ASSERT(sizeof (struct memlist) <= ecache_alignsize); 401 402 base = roundup(base, ecache_alignsize); 403 end = end - end % ecache_alignsize; 404 405 ASSERT(base < end); 406 407 ndata->address = base; 408 ndata->size = end - base; 409 ndata->next = NULL; 410 ndata->prev = NULL; 411 } 412 413 /* 414 * Deliver the size of the largest free memory chunk. 415 */ 416 size_t 417 ndata_maxsize(struct memlist *ndata) 418 { 419 size_t chunksize = ndata->size; 420 421 while ((ndata = ndata->next) != NULL) { 422 if (chunksize < ndata->size) 423 chunksize = ndata->size; 424 } 425 426 return (chunksize); 427 } 428 429 /* 430 * This is a special function to figure out if the memory chunk needed 431 * for the page structs can fit in the nucleus or not. If it fits the 432 * function calculates and returns the possible remaining ndata size 433 * in the last element if the size needed for page structs would be 434 * allocated from the nucleus. 435 */ 436 size_t 437 ndata_spare(struct memlist *ndata, size_t wanted, size_t alignment) 438 { 439 struct memlist *frlist; 440 uintptr_t base; 441 uintptr_t end; 442 443 for (frlist = ndata; frlist != NULL; frlist = frlist->next) { 444 base = roundup(frlist->address, alignment); 445 end = roundup(base + wanted, ecache_alignsize); 446 447 if (end <= frlist->address + frlist->size) { 448 if (frlist->next == NULL) 449 return (frlist->address + frlist->size - end); 450 451 while (frlist->next != NULL) 452 frlist = frlist->next; 453 454 return (frlist->size); 455 } 456 } 457 458 return (0); 459 } 460 461 /* 462 * Allocate the last properly aligned memory chunk. 463 * This function is called when no more large nucleus memory chunks 464 * will be allocated. The remaining free nucleus memory at the end 465 * of the nucleus can be added to the phys_avail list. 466 */ 467 void * 468 ndata_extra_base(struct memlist *ndata, size_t alignment) 469 { 470 uintptr_t base; 471 size_t wasteage = 0; 472 #ifdef DEBUG 473 static int called = 0; 474 475 if (called++ > 0) 476 cmn_err(CE_PANIC, "ndata_extra_base() called more than once"); 477 #endif /* DEBUG */ 478 479 /* 480 * The alignment needs to be a multiple of ecache_alignsize. 481 */ 482 ASSERT((alignment % ecache_alignsize) == 0); 483 484 while (ndata->next != NULL) { 485 wasteage += ndata->size; 486 ndata = ndata->next; 487 } 488 489 base = roundup(ndata->address, alignment); 490 491 if (base >= ndata->address + ndata->size) 492 return (NULL); 493 494 if (base == ndata->address) { 495 if (ndata->prev != NULL) 496 ndata->prev->next = NULL; 497 else 498 ndata->size = 0; 499 500 bzero((void *)base, sizeof (struct memlist)); 501 502 } else { 503 ndata->size = base - ndata->address; 504 wasteage += ndata->size; 505 } 506 PRM_DEBUG(wasteage); 507 508 return ((void *)base); 509 } 510 511 /* 512 * Select the best matching buffer, avoid memory fragmentation. 513 */ 514 static struct memlist * 515 ndata_select_chunk(struct memlist *ndata, size_t wanted, size_t alignment) 516 { 517 struct memlist *fnd_below = NULL; 518 struct memlist *fnd_above = NULL; 519 struct memlist *fnd_unused = NULL; 520 struct memlist *frlist; 521 uintptr_t base; 522 uintptr_t end; 523 size_t below; 524 size_t above; 525 size_t unused; 526 size_t best_below = ULONG_MAX; 527 size_t best_above = ULONG_MAX; 528 size_t best_unused = ULONG_MAX; 529 530 ASSERT(ndata != NULL); 531 532 /* 533 * Look for the best matching buffer, avoid memory fragmentation. 534 * The following strategy is used, try to find 535 * 1. an exact fitting buffer 536 * 2. avoid wasting any space below the buffer, take first 537 * fitting buffer 538 * 3. avoid wasting any space above the buffer, take first 539 * fitting buffer 540 * 4. avoid wasting space, take first fitting buffer 541 * 5. take the last buffer in chain 542 */ 543 for (frlist = ndata; frlist != NULL; frlist = frlist->next) { 544 base = roundup(frlist->address, alignment); 545 end = roundup(base + wanted, ecache_alignsize); 546 547 if (end > frlist->address + frlist->size) 548 continue; 549 550 below = (base - frlist->address) / ecache_alignsize; 551 above = (frlist->address + frlist->size - end) / 552 ecache_alignsize; 553 unused = below + above; 554 555 if (unused == 0) 556 return (frlist); 557 558 if (frlist->next == NULL) 559 break; 560 561 if (below < best_below) { 562 best_below = below; 563 fnd_below = frlist; 564 } 565 566 if (above < best_above) { 567 best_above = above; 568 fnd_above = frlist; 569 } 570 571 if (unused < best_unused) { 572 best_unused = unused; 573 fnd_unused = frlist; 574 } 575 } 576 577 if (best_below == 0) 578 return (fnd_below); 579 if (best_above == 0) 580 return (fnd_above); 581 if (best_unused < ULONG_MAX) 582 return (fnd_unused); 583 584 return (frlist); 585 } 586 587 /* 588 * Nucleus data memory allocator. 589 * The granularity of the allocator is ecache_alignsize. 590 * See also comment for ndata_alloc_init(). 591 */ 592 void * 593 ndata_alloc(struct memlist *ndata, size_t wanted, size_t alignment) 594 { 595 struct memlist *found; 596 struct memlist *fnd_above; 597 uintptr_t base; 598 uintptr_t end; 599 size_t below; 600 size_t above; 601 602 /* 603 * Look for the best matching buffer, avoid memory fragmentation. 604 */ 605 if ((found = ndata_select_chunk(ndata, wanted, alignment)) == NULL) 606 return (NULL); 607 608 /* 609 * Allocate the nucleus data buffer. 610 */ 611 base = roundup(found->address, alignment); 612 end = roundup(base + wanted, ecache_alignsize); 613 ASSERT(end <= found->address + found->size); 614 615 below = base - found->address; 616 above = found->address + found->size - end; 617 ASSERT(above == 0 || (above % ecache_alignsize) == 0); 618 619 if (below >= ecache_alignsize) { 620 /* 621 * There is free memory below the allocated memory chunk. 622 */ 623 found->size = below - below % ecache_alignsize; 624 625 if (above) { 626 fnd_above = (struct memlist *)end; 627 fnd_above->address = end; 628 fnd_above->size = above; 629 630 if ((fnd_above->next = found->next) != NULL) 631 found->next->prev = fnd_above; 632 fnd_above->prev = found; 633 found->next = fnd_above; 634 } 635 636 return ((void *)base); 637 } 638 639 if (found->prev == NULL) { 640 /* 641 * The first chunk (ndata) is selected. 642 */ 643 ASSERT(found == ndata); 644 if (above) { 645 found->address = end; 646 found->size = above; 647 } else if (found->next != NULL) { 648 found->address = found->next->address; 649 found->size = found->next->size; 650 if ((found->next = found->next->next) != NULL) 651 found->next->prev = found; 652 653 bzero((void *)found->address, sizeof (struct memlist)); 654 } else { 655 found->address = end; 656 found->size = 0; 657 } 658 659 return ((void *)base); 660 } 661 662 /* 663 * Not the first chunk. 664 */ 665 if (above) { 666 fnd_above = (struct memlist *)end; 667 fnd_above->address = end; 668 fnd_above->size = above; 669 670 if ((fnd_above->next = found->next) != NULL) 671 fnd_above->next->prev = fnd_above; 672 fnd_above->prev = found->prev; 673 found->prev->next = fnd_above; 674 675 } else { 676 if ((found->prev->next = found->next) != NULL) 677 found->next->prev = found->prev; 678 } 679 680 bzero((void *)found->address, sizeof (struct memlist)); 681 682 return ((void *)base); 683 } 684 685 /* 686 * Size the kernel TSBs based upon the amount of physical 687 * memory in the system. 688 */ 689 static void 690 calc_tsb_sizes(pgcnt_t npages) 691 { 692 PRM_DEBUG(npages); 693 694 if (npages <= TSB_FREEMEM_MIN) { 695 ktsb_szcode = TSB_128K_SZCODE; 696 enable_bigktsb = 0; 697 } else if (npages <= TSB_FREEMEM_LARGE / 2) { 698 ktsb_szcode = TSB_256K_SZCODE; 699 enable_bigktsb = 0; 700 } else if (npages <= TSB_FREEMEM_LARGE) { 701 ktsb_szcode = TSB_512K_SZCODE; 702 enable_bigktsb = 0; 703 } else if (npages <= TSB_FREEMEM_LARGE * 2 || 704 enable_bigktsb == 0) { 705 ktsb_szcode = TSB_1M_SZCODE; 706 enable_bigktsb = 0; 707 } else { 708 ktsb_szcode = highbit(npages - 1); 709 ktsb_szcode -= TSB_START_SIZE; 710 ktsb_szcode = MAX(ktsb_szcode, MIN_BIGKTSB_SZCODE); 711 ktsb_szcode = MIN(ktsb_szcode, MAX_BIGKTSB_SZCODE); 712 } 713 714 /* 715 * We choose the TSB to hold kernel 4M mappings to have twice 716 * the reach as the primary kernel TSB since this TSB will 717 * potentially (currently) be shared by both mappings to all of 718 * physical memory plus user TSBs. Since the current 719 * limit on primary kernel TSB size is 16MB this will top out 720 * at 64K which we can certainly afford. 721 */ 722 ktsb4m_szcode = ktsb_szcode - (MMU_PAGESHIFT4M - MMU_PAGESHIFT) + 1; 723 if (ktsb4m_szcode < TSB_MIN_SZCODE) 724 ktsb4m_szcode = TSB_MIN_SZCODE; 725 726 ktsb_sz = TSB_BYTES(ktsb_szcode); /* kernel 8K tsb size */ 727 ktsb4m_sz = TSB_BYTES(ktsb4m_szcode); /* kernel 4M tsb size */ 728 } 729 730 /* 731 * Allocate kernel TSBs from nucleus data memory. 732 * The function return 0 on success and -1 on failure. 733 */ 734 int 735 ndata_alloc_tsbs(struct memlist *ndata, pgcnt_t npages) 736 { 737 /* 738 * Size the kernel TSBs based upon the amount of physical 739 * memory in the system. 740 */ 741 calc_tsb_sizes(npages); 742 743 /* 744 * Allocate the 8K kernel TSB if it belongs inside the nucleus. 745 */ 746 if (enable_bigktsb == 0) { 747 if ((ktsb_base = ndata_alloc(ndata, ktsb_sz, ktsb_sz)) == NULL) 748 return (-1); 749 ASSERT(!((uintptr_t)ktsb_base & (ktsb_sz - 1))); 750 751 PRM_DEBUG(ktsb_base); 752 PRM_DEBUG(ktsb_sz); 753 PRM_DEBUG(ktsb_szcode); 754 } 755 756 /* 757 * Next, allocate 4M kernel TSB from the nucleus since it's small. 758 */ 759 if ((ktsb4m_base = ndata_alloc(ndata, ktsb4m_sz, ktsb4m_sz)) == NULL) 760 return (-1); 761 ASSERT(!((uintptr_t)ktsb4m_base & (ktsb4m_sz - 1))); 762 763 PRM_DEBUG(ktsb4m_base); 764 PRM_DEBUG(ktsb4m_sz); 765 PRM_DEBUG(ktsb4m_szcode); 766 767 return (0); 768 } 769 770 /* 771 * Allocate hat structs from the nucleus data memory. 772 */ 773 int 774 ndata_alloc_hat(struct memlist *ndata, pgcnt_t npages, pgcnt_t kpm_npages) 775 { 776 size_t ctx_sz; 777 size_t mml_alloc_sz; 778 size_t cb_alloc_sz; 779 int max_nucuhme_buckets = MAX_NUCUHME_BUCKETS; 780 int max_nuckhme_buckets = MAX_NUCKHME_BUCKETS; 781 ulong_t hme_buckets; 782 783 if (enable_bigktsb) { 784 ASSERT((max_nucuhme_buckets + max_nuckhme_buckets) * 785 sizeof (struct hmehash_bucket) <= 786 TSB_BYTES(TSB_1M_SZCODE)); 787 788 max_nucuhme_buckets *= 2; 789 max_nuckhme_buckets *= 2; 790 } 791 792 /* 793 * Allocate ctx structures 794 * 795 * based on v_proc to calculate how many ctx structures 796 * is not possible; 797 * use whatever module_setup() assigned to nctxs 798 */ 799 PRM_DEBUG(nctxs); 800 ctx_sz = nctxs * sizeof (struct ctx); 801 if ((ctxs = ndata_alloc(ndata, ctx_sz, sizeof (struct ctx))) == NULL) 802 return (-1); 803 804 PRM_DEBUG(ctxs); 805 806 /* 807 * The number of buckets in the hme hash tables 808 * is a power of 2 such that the average hash chain length is 809 * HMENT_HASHAVELEN. The number of buckets for the user hash is 810 * a function of physical memory and a predefined overmapping factor. 811 * The number of buckets for the kernel hash is a function of 812 * physical memory only. 813 */ 814 hme_buckets = (npages * HMEHASH_FACTOR) / 815 (HMENT_HASHAVELEN * (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT)); 816 817 uhmehash_num = (int)MIN(hme_buckets, MAX_UHME_BUCKETS); 818 819 if (uhmehash_num > USER_BUCKETS_THRESHOLD) { 820 /* 821 * if uhmehash_num is not power of 2 round it down to the 822 * next power of 2. 823 */ 824 uint_t align = 1 << (highbit(uhmehash_num - 1) - 1); 825 uhmehash_num = P2ALIGN(uhmehash_num, align); 826 } else 827 uhmehash_num = 1 << highbit(uhmehash_num - 1); 828 829 hme_buckets = npages / (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT); 830 khmehash_num = (int)MIN(hme_buckets, MAX_KHME_BUCKETS); 831 khmehash_num = 1 << highbit(khmehash_num - 1); 832 khmehash_num = MAX(khmehash_num, MIN_KHME_BUCKETS); 833 834 if ((khmehash_num > max_nuckhme_buckets) || 835 (uhmehash_num > max_nucuhme_buckets)) { 836 khme_hash = NULL; 837 uhme_hash = NULL; 838 } else { 839 size_t hmehash_sz = (uhmehash_num + khmehash_num) * 840 sizeof (struct hmehash_bucket); 841 842 if ((khme_hash = ndata_alloc(ndata, hmehash_sz, 843 ecache_alignsize)) != NULL) 844 uhme_hash = &khme_hash[khmehash_num]; 845 else 846 uhme_hash = NULL; 847 848 PRM_DEBUG(hmehash_sz); 849 } 850 851 PRM_DEBUG(khme_hash); 852 PRM_DEBUG(khmehash_num); 853 PRM_DEBUG(uhme_hash); 854 PRM_DEBUG(uhmehash_num); 855 856 /* 857 * For the page mapping list mutex array we allocate one mutex 858 * for every 128 pages (1 MB) with a minimum of 64 entries and 859 * a maximum of 8K entries. For the initial computation npages 860 * is rounded up (ie. 1 << highbit(npages * 1.5 / 128)) 861 * 862 * mml_shift is roughly log2(mml_table_sz) + 3 for MLIST_HASH 863 * 864 * It is not required that this be allocated from the nucleus, 865 * but it is desirable. So we first allocate from the nucleus 866 * everything that must be there. Having done so, if mml_table 867 * will fit within what remains of the nucleus then it will be 868 * allocated here. If not, set mml_table to NULL, which will cause 869 * startup_memlist() to BOP_ALLOC() space for it after our return... 870 */ 871 mml_table_sz = 1 << highbit((npages * 3) / 256); 872 if (mml_table_sz < 64) 873 mml_table_sz = 64; 874 else if (mml_table_sz > 8192) 875 mml_table_sz = 8192; 876 mml_shift = highbit(mml_table_sz) + 3; 877 878 PRM_DEBUG(mml_table_sz); 879 PRM_DEBUG(mml_shift); 880 881 mml_alloc_sz = mml_table_sz * sizeof (kmutex_t); 882 883 mml_table = ndata_alloc(ndata, mml_alloc_sz, ecache_alignsize); 884 885 PRM_DEBUG(mml_table); 886 887 cb_alloc_sz = sfmmu_max_cb_id * sizeof (struct sfmmu_callback); 888 PRM_DEBUG(cb_alloc_sz); 889 sfmmu_cb_table = ndata_alloc(ndata, cb_alloc_sz, ecache_alignsize); 890 PRM_DEBUG(sfmmu_cb_table); 891 892 /* 893 * For the kpm_page mutex array we allocate one mutex every 16 894 * kpm pages (64MB). In smallpage mode we allocate one mutex 895 * every 8K pages. The minimum is set to 64 entries and the 896 * maximum to 8K entries. 897 * 898 * It is not required that this be allocated from the nucleus, 899 * but it is desirable. So we first allocate from the nucleus 900 * everything that must be there. Having done so, if kpmp_table 901 * or kpmp_stable will fit within what remains of the nucleus 902 * then it will be allocated here. If not, startup_memlist() 903 * will use BOP_ALLOC() space for it after our return... 904 */ 905 if (kpm_enable) { 906 size_t kpmp_alloc_sz; 907 908 if (kpm_smallpages == 0) { 909 kpmp_shift = highbit(sizeof (kpm_page_t)) - 1; 910 kpmp_table_sz = 1 << highbit(kpm_npages / 16); 911 kpmp_table_sz = (kpmp_table_sz < 64) ? 64 : 912 ((kpmp_table_sz > 8192) ? 8192 : kpmp_table_sz); 913 kpmp_alloc_sz = kpmp_table_sz * sizeof (kpm_hlk_t); 914 915 kpmp_table = ndata_alloc(ndata, kpmp_alloc_sz, 916 ecache_alignsize); 917 918 PRM_DEBUG(kpmp_table); 919 PRM_DEBUG(kpmp_table_sz); 920 921 kpmp_stable_sz = 0; 922 kpmp_stable = NULL; 923 } else { 924 ASSERT(kpm_pgsz == PAGESIZE); 925 kpmp_shift = highbit(sizeof (kpm_shlk_t)) + 1; 926 kpmp_stable_sz = 1 << highbit(kpm_npages / 8192); 927 kpmp_stable_sz = (kpmp_stable_sz < 64) ? 64 : 928 ((kpmp_stable_sz > 8192) ? 8192 : kpmp_stable_sz); 929 kpmp_alloc_sz = kpmp_stable_sz * sizeof (kpm_shlk_t); 930 931 kpmp_stable = ndata_alloc(ndata, kpmp_alloc_sz, 932 ecache_alignsize); 933 934 PRM_DEBUG(kpmp_stable); 935 PRM_DEBUG(kpmp_stable_sz); 936 937 kpmp_table_sz = 0; 938 kpmp_table = NULL; 939 } 940 PRM_DEBUG(kpmp_shift); 941 } 942 943 return (0); 944 } 945 946 caddr_t 947 alloc_hme_buckets(caddr_t base, int pagesize) 948 { 949 size_t hmehash_sz = (uhmehash_num + khmehash_num) * 950 sizeof (struct hmehash_bucket); 951 952 ASSERT(khme_hash == NULL); 953 ASSERT(uhme_hash == NULL); 954 955 /* If no pagesize specified, use default MMU pagesize */ 956 if (!pagesize) 957 pagesize = MMU_PAGESIZE; 958 959 /* 960 * If we start aligned and ask for a multiple of a pagesize, and OBP 961 * supports large pages, we will then use mappings of the largest size 962 * possible for the BOP_ALLOC, possibly saving us tens of thousands of 963 * TLB miss-induced traversals of the TSBs and/or the HME hashes... 964 */ 965 base = (caddr_t)roundup((uintptr_t)base, pagesize); 966 hmehash_sz = roundup(hmehash_sz, pagesize); 967 968 khme_hash = (struct hmehash_bucket *)BOP_ALLOC(bootops, base, 969 hmehash_sz, pagesize); 970 971 if ((caddr_t)khme_hash != base) 972 cmn_err(CE_PANIC, "Cannot bop_alloc hme hash buckets."); 973 974 uhme_hash = (struct hmehash_bucket *)((caddr_t)khme_hash + 975 khmehash_num * sizeof (struct hmehash_bucket)); 976 base += hmehash_sz; 977 return (base); 978 } 979 980 /* 981 * This function bop allocs the kernel TSB. 982 */ 983 caddr_t 984 sfmmu_ktsb_alloc(caddr_t tsbbase) 985 { 986 caddr_t vaddr; 987 988 if (enable_bigktsb) { 989 ktsb_base = (caddr_t)roundup((uintptr_t)tsbbase, ktsb_sz); 990 vaddr = (caddr_t)BOP_ALLOC(bootops, ktsb_base, ktsb_sz, 991 ktsb_sz); 992 if (vaddr != ktsb_base) 993 cmn_err(CE_PANIC, "sfmmu_ktsb_alloc: can't alloc" 994 " bigktsb"); 995 ktsb_base = vaddr; 996 tsbbase = ktsb_base + ktsb_sz; 997 PRM_DEBUG(ktsb_base); 998 PRM_DEBUG(tsbbase); 999 } 1000 return (tsbbase); 1001 } 1002 1003 /* 1004 * Moves code assembled outside of the trap table into the trap 1005 * table taking care to relocate relative branches to code outside 1006 * of the trap handler. 1007 */ 1008 static void 1009 sfmmu_reloc_trap_handler(void *tablep, void *start, size_t count) 1010 { 1011 size_t i; 1012 uint32_t *src; 1013 uint32_t *dst; 1014 uint32_t inst; 1015 int op, op2; 1016 int32_t offset; 1017 int disp; 1018 1019 src = start; 1020 dst = tablep; 1021 offset = src - dst; 1022 for (src = start, i = 0; i < count; i++, src++, dst++) { 1023 inst = *dst = *src; 1024 op = (inst >> 30) & 0x2; 1025 if (op == 1) { 1026 /* call */ 1027 disp = ((int32_t)inst << 2) >> 2; /* sign-extend */ 1028 if (disp + i >= 0 && disp + i < count) 1029 continue; 1030 disp += offset; 1031 inst = 0x40000000u | (disp & 0x3fffffffu); 1032 *dst = inst; 1033 } else if (op == 0) { 1034 /* branch or sethi */ 1035 op2 = (inst >> 22) & 0x7; 1036 1037 switch (op2) { 1038 case 0x3: /* BPr */ 1039 disp = (((inst >> 20) & 0x3) << 14) | 1040 (inst & 0x3fff); 1041 disp = (disp << 16) >> 16; /* sign-extend */ 1042 if (disp + i >= 0 && disp + i < count) 1043 continue; 1044 disp += offset; 1045 if (((disp << 16) >> 16) != disp) 1046 cmn_err(CE_PANIC, "bad reloc"); 1047 inst &= ~0x303fff; 1048 inst |= (disp & 0x3fff); 1049 inst |= (disp & 0xc000) << 6; 1050 break; 1051 1052 case 0x2: /* Bicc */ 1053 disp = ((int32_t)inst << 10) >> 10; 1054 if (disp + i >= 0 && disp + i < count) 1055 continue; 1056 disp += offset; 1057 if (((disp << 10) >> 10) != disp) 1058 cmn_err(CE_PANIC, "bad reloc"); 1059 inst &= ~0x3fffff; 1060 inst |= (disp & 0x3fffff); 1061 break; 1062 1063 case 0x1: /* Bpcc */ 1064 disp = ((int32_t)inst << 13) >> 13; 1065 if (disp + i >= 0 && disp + i < count) 1066 continue; 1067 disp += offset; 1068 if (((disp << 13) >> 13) != disp) 1069 cmn_err(CE_PANIC, "bad reloc"); 1070 inst &= ~0x7ffff; 1071 inst |= (disp & 0x7ffffu); 1072 break; 1073 } 1074 *dst = inst; 1075 } 1076 } 1077 flush_instr_mem(tablep, count * sizeof (uint32_t)); 1078 } 1079 1080 /* 1081 * Routine to allocate a large page to use in the TSB caches. 1082 */ 1083 /*ARGSUSED*/ 1084 static page_t * 1085 sfmmu_tsb_page_create(void *addr, size_t size, int vmflag, void *arg) 1086 { 1087 int pgflags; 1088 1089 pgflags = PG_EXCL; 1090 if ((vmflag & VM_NOSLEEP) == 0) 1091 pgflags |= PG_WAIT; 1092 if (vmflag & VM_PANIC) 1093 pgflags |= PG_PANIC; 1094 if (vmflag & VM_PUSHPAGE) 1095 pgflags |= PG_PUSHPAGE; 1096 1097 return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size, 1098 pgflags, &kvseg, addr, arg)); 1099 } 1100 1101 /* 1102 * Allocate a large page to back the virtual address range 1103 * [addr, addr + size). If addr is NULL, allocate the virtual address 1104 * space as well. 1105 */ 1106 static void * 1107 sfmmu_tsb_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, 1108 uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *), 1109 void *pcarg) 1110 { 1111 page_t *ppl; 1112 page_t *rootpp; 1113 caddr_t addr = inaddr; 1114 pgcnt_t npages = btopr(size); 1115 page_t **ppa; 1116 int i = 0; 1117 1118 /* 1119 * Assuming that only TSBs will call this with size > PAGESIZE 1120 * There is no reason why this couldn't be expanded to 8k pages as 1121 * well, or other page sizes in the future .... but for now, we 1122 * only support fixed sized page requests. 1123 */ 1124 if ((inaddr == NULL) && ((addr = vmem_xalloc(vmp, size, size, 0, 0, 1125 NULL, NULL, vmflag)) == NULL)) 1126 return (NULL); 1127 1128 /* If we ever don't want TSB slab-sized pages, this will panic */ 1129 ASSERT(((uintptr_t)addr & (tsb_slab_size - 1)) == 0); 1130 1131 if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) { 1132 if (inaddr == NULL) 1133 vmem_xfree(vmp, addr, size); 1134 return (NULL); 1135 } 1136 1137 ppl = page_create_func(addr, size, vmflag, pcarg); 1138 if (ppl == NULL) { 1139 if (inaddr == NULL) 1140 vmem_xfree(vmp, addr, size); 1141 page_unresv(npages); 1142 return (NULL); 1143 } 1144 1145 rootpp = ppl; 1146 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP); 1147 while (ppl != NULL) { 1148 page_t *pp = ppl; 1149 ppa[i++] = pp; 1150 page_sub(&ppl, pp); 1151 ASSERT(page_iolock_assert(pp)); 1152 page_io_unlock(pp); 1153 } 1154 1155 /* 1156 * Load the locked entry. It's OK to preload the entry into 1157 * the TSB since we now support large mappings in the kernel TSB. 1158 */ 1159 hat_memload_array(kas.a_hat, (caddr_t)rootpp->p_offset, size, 1160 ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, HAT_LOAD_LOCK); 1161 1162 for (--i; i >= 0; --i) { 1163 (void) page_pp_lock(ppa[i], 0, 1); 1164 page_unlock(ppa[i]); 1165 } 1166 1167 kmem_free(ppa, npages * sizeof (page_t *)); 1168 return (addr); 1169 } 1170 1171 /* Called to import new spans into the TSB vmem arenas */ 1172 void * 1173 sfmmu_tsb_segkmem_alloc(vmem_t *vmp, size_t size, int vmflag) 1174 { 1175 lgrp_id_t lgrpid = LGRP_NONE; 1176 1177 if (tsb_lgrp_affinity) { 1178 /* 1179 * Search for the vmp->lgrpid mapping by brute force; 1180 * some day vmp will have an lgrp, until then we have 1181 * to do this the hard way. 1182 */ 1183 for (lgrpid = 0; lgrpid < NLGRPS_MAX && 1184 vmp != kmem_tsb_default_arena[lgrpid]; lgrpid++); 1185 if (lgrpid == NLGRPS_MAX) 1186 lgrpid = LGRP_NONE; 1187 } 1188 1189 return (sfmmu_tsb_xalloc(vmp, NULL, size, vmflag, 0, 1190 sfmmu_tsb_page_create, lgrpid != LGRP_NONE? &lgrpid : NULL)); 1191 } 1192 1193 /* Called to free spans from the TSB vmem arenas */ 1194 void 1195 sfmmu_tsb_segkmem_free(vmem_t *vmp, void *inaddr, size_t size) 1196 { 1197 page_t *pp; 1198 caddr_t addr = inaddr; 1199 caddr_t eaddr; 1200 pgcnt_t npages = btopr(size); 1201 pgcnt_t pgs_left = npages; 1202 page_t *rootpp = NULL; 1203 1204 ASSERT(((uintptr_t)addr & (tsb_slab_size - 1)) == 0); 1205 1206 hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK); 1207 1208 for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) { 1209 pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL); 1210 if (pp == NULL) 1211 panic("sfmmu_tsb_segkmem_free: page not found"); 1212 1213 ASSERT(PAGE_EXCL(pp)); 1214 page_pp_unlock(pp, 0, 1); 1215 1216 if (rootpp == NULL) 1217 rootpp = pp; 1218 if (--pgs_left == 0) { 1219 /* 1220 * similar logic to segspt_free_pages, but we know we 1221 * have one large page. 1222 */ 1223 page_destroy_pages(rootpp); 1224 } 1225 } 1226 page_unresv(npages); 1227 1228 if (vmp != NULL) 1229 vmem_xfree(vmp, inaddr, size); 1230 } 1231