xref: /titanic_41/usr/src/uts/sun4/vm/sfmmu.c (revision 60405de4d8688d96dd05157c28db3ade5c9bc234)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <vm/hat.h>
30 #include <vm/hat_sfmmu.h>
31 #include <vm/page.h>
32 #include <sys/pte.h>
33 #include <sys/systm.h>
34 #include <sys/mman.h>
35 #include <sys/sysmacros.h>
36 #include <sys/machparam.h>
37 #include <sys/vtrace.h>
38 #include <sys/kmem.h>
39 #include <sys/mmu.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/cpuvar.h>
43 #include <sys/debug.h>
44 #include <sys/lgrp.h>
45 #include <sys/archsystm.h>
46 #include <sys/machsystm.h>
47 #include <sys/vmsystm.h>
48 #include <sys/bitmap.h>
49 #include <vm/as.h>
50 #include <vm/seg.h>
51 #include <vm/seg_kmem.h>
52 #include <vm/seg_kp.h>
53 #include <vm/seg_kpm.h>
54 #include <vm/rm.h>
55 #include <vm/vm_dep.h>
56 #include <sys/t_lock.h>
57 #include <sys/vm_machparam.h>
58 #include <sys/promif.h>
59 #include <sys/prom_isa.h>
60 #include <sys/prom_plat.h>
61 #include <sys/prom_debug.h>
62 #include <sys/privregs.h>
63 #include <sys/bootconf.h>
64 #include <sys/memlist.h>
65 #include <sys/memlist_plat.h>
66 #include <sys/cpu_module.h>
67 #include <sys/reboot.h>
68 #include <sys/kdi.h>
69 
70 /*
71  * Static routines
72  */
73 static void	sfmmu_map_prom_mappings(struct translation *, size_t);
74 static struct translation *read_prom_mappings(size_t *);
75 static void	sfmmu_reloc_trap_handler(void *, void *, size_t);
76 
77 /*
78  * External routines
79  */
80 extern void sfmmu_remap_kernel(void);
81 extern void sfmmu_patch_utsb(void);
82 
83 /*
84  * Global Data:
85  */
86 extern caddr_t	textva, datava;
87 extern tte_t	ktext_tte, kdata_tte;	/* ttes for kernel text and data */
88 extern int	enable_bigktsb;
89 
90 uint64_t memsegspa = (uintptr_t)MSEG_NULLPTR_PA; /* memsegs physical linkage */
91 uint64_t memseg_phash[N_MEM_SLOTS];	/* use physical memseg addresses */
92 
93 int	sfmmu_kern_mapped = 0;
94 
95 /*
96  * DMMU primary context register for the kernel context. Machine specific code
97  * inserts correct page size codes when necessary
98  */
99 uint64_t kcontextreg = KCONTEXT;
100 
101 /* Extern Global Data */
102 
103 extern int page_relocate_ready;
104 
105 /*
106  * Controls the logic which enables the use of the
107  * QUAD_LDD_PHYS ASI for TSB accesses.
108  */
109 extern int	ktsb_phys;
110 
111 /*
112  * Global Routines called from within:
113  *	usr/src/uts/sun4u
114  *	usr/src/uts/sfmmu
115  *	usr/src/uts/sun
116  */
117 
118 pfn_t
119 va_to_pfn(void *vaddr)
120 {
121 	u_longlong_t physaddr;
122 	int mode, valid;
123 
124 	if (tba_taken_over)
125 		return (hat_getpfnum(kas.a_hat, (caddr_t)vaddr));
126 
127 	if ((prom_translate_virt(vaddr, &valid, &physaddr, &mode) != -1) &&
128 	    (valid == -1)) {
129 		return ((pfn_t)(physaddr >> MMU_PAGESHIFT));
130 	}
131 	return (PFN_INVALID);
132 }
133 
134 uint64_t
135 va_to_pa(void *vaddr)
136 {
137 	pfn_t pfn;
138 
139 	if ((pfn = va_to_pfn(vaddr)) == PFN_INVALID)
140 		return ((uint64_t)-1);
141 	return (((uint64_t)pfn << MMU_PAGESHIFT) |
142 		((uint64_t)vaddr & MMU_PAGEOFFSET));
143 }
144 
145 void
146 hat_kern_setup(void)
147 {
148 	struct translation *trans_root;
149 	size_t ntrans_root;
150 	extern void startup_fixup_physavail(void);
151 
152 	/*
153 	 * These are the steps we take to take over the mmu from the prom.
154 	 *
155 	 * (1)	Read the prom's mappings through the translation property.
156 	 * (2)	Remap the kernel text and kernel data with 2 locked 4MB ttes.
157 	 *	Create the the hmeblks for these 2 ttes at this time.
158 	 * (3)	Create hat structures for all other prom mappings.  Since the
159 	 *	kernel text and data hme_blks have already been created we
160 	 *	skip the equivalent prom's mappings.
161 	 * (4)	Initialize the tsb and its corresponding hardware regs.
162 	 * (5)	Take over the trap table (currently in startup).
163 	 * (6)	Up to this point it is possible the prom required some of its
164 	 *	locked tte's.  Now that we own the trap table we remove them.
165 	 */
166 
167 	ktsb_pbase = va_to_pa(ktsb_base);
168 	ktsb4m_pbase = va_to_pa(ktsb4m_base);
169 	PRM_DEBUG(ktsb_pbase);
170 	PRM_DEBUG(ktsb4m_pbase);
171 
172 	sfmmu_setup_4lp();
173 	sfmmu_patch_ktsb();
174 	sfmmu_patch_utsb();
175 	sfmmu_patch_mmu_asi(ktsb_phys);
176 
177 	sfmmu_init_tsbs();
178 
179 	if (kpm_enable) {
180 		sfmmu_kpm_patch_tlbm();
181 		if (kpm_smallpages == 0) {
182 			sfmmu_kpm_patch_tsbm();
183 		}
184 	}
185 
186 	/*
187 	 * The 8K-indexed kernel TSB space is used to hold
188 	 * translations below...
189 	 */
190 	trans_root = read_prom_mappings(&ntrans_root);
191 	sfmmu_remap_kernel();
192 	startup_fixup_physavail();
193 	mmu_init_kernel_pgsz(kas.a_hat);
194 	sfmmu_map_prom_mappings(trans_root, ntrans_root);
195 
196 	/*
197 	 * We invalidate 8K kernel TSB because we used it in
198 	 * sfmmu_map_prom_mappings()
199 	 */
200 	sfmmu_inv_tsb(ktsb_base, ktsb_sz);
201 	sfmmu_inv_tsb(ktsb4m_base, ktsb4m_sz);
202 
203 	sfmmu_init_ktsbinfo();
204 
205 
206 	sfmmu_kern_mapped = 1;
207 
208 	/*
209 	 * hments have been created for mapped pages, and thus we're ready
210 	 * for kmdb to start using its own trap table.  It walks the hments
211 	 * to resolve TLB misses, and can't be used until they're ready.
212 	 */
213 	if (boothowto & RB_DEBUG)
214 		kdi_dvec_vmready();
215 }
216 
/*
 * Helper for the routine below: join the 32-bit high and low halves
 * reported by the prom into a single 64-bit value for the 64-bit kernel.
 * Each half is truncated to 32 bits before being combined.
 */

#define	COMBINE(hi, lo) \
	((uint64_t)(uint32_t)(lo) | ((uint64_t)(uint32_t)(hi) << 32))
223 
224 /*
225  * This function traverses the prom mapping list and creates equivalent
226  * mappings in the sfmmu mapping hash.
227  */
228 static void
229 sfmmu_map_prom_mappings(struct translation *trans_root, size_t ntrans_root)
230 {
231 	struct translation *promt;
232 	tte_t	tte, oldtte, *ttep;
233 	pfn_t	pfn, oldpfn, basepfn;
234 	caddr_t vaddr;
235 	size_t	size, offset;
236 	unsigned long i;
237 	uint_t	attr;
238 	page_t *pp;
239 	extern struct memlist *virt_avail;
240 
241 	ttep = &tte;
242 	for (i = 0, promt = trans_root; i < ntrans_root; i++, promt++) {
243 		ASSERT(promt->tte_hi != 0);
244 		ASSERT32(promt->virt_hi == 0 && promt->size_hi == 0);
245 
246 		/*
247 		 * hack until we get rid of map-for-unix
248 		 */
249 		if (COMBINE(promt->virt_hi, promt->virt_lo) < KERNELBASE)
250 			continue;
251 
252 		ttep->tte_inthi = promt->tte_hi;
253 		ttep->tte_intlo = promt->tte_lo;
254 		attr = PROC_DATA | HAT_NOSYNC;
255 #if defined(TTE_IS_GLOBAL)
256 		if (TTE_IS_GLOBAL(ttep)) {
257 			/*
258 			 * The prom better not use global translations
259 			 * because a user process might use the same
260 			 * virtual addresses
261 			 */
262 			cmn_err(CE_PANIC, "map_prom: global translation");
263 			TTE_SET_LOFLAGS(ttep, TTE_GLB_INT, 0);
264 		}
265 #endif
266 		if (TTE_IS_LOCKED(ttep)) {
267 			/* clear the lock bits */
268 			TTE_CLR_LOCKED(ttep);
269 		}
270 		attr |= (TTE_IS_VCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEVTTE;
271 		attr |= (TTE_IS_PCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEPTTE;
272 		attr |= (TTE_IS_SIDEFFECT(ttep)) ? SFMMU_SIDEFFECT : 0;
273 		attr |= (TTE_IS_IE(ttep)) ? HAT_STRUCTURE_LE : 0;
274 
275 		size = COMBINE(promt->size_hi, promt->size_lo);
276 		offset = 0;
277 		basepfn = TTE_TO_PFN((caddr_t)COMBINE(promt->virt_hi,
278 		    promt->virt_lo), ttep);
279 		while (size) {
280 			vaddr = (caddr_t)(COMBINE(promt->virt_hi,
281 			    promt->virt_lo) + offset);
282 
283 			/*
284 			 * make sure address is not in virt-avail list
285 			 */
286 			if (address_in_memlist(virt_avail, (uint64_t)vaddr,
287 			    size)) {
288 				cmn_err(CE_PANIC, "map_prom: inconsistent "
289 				    "translation/avail lists");
290 			}
291 
292 			pfn = basepfn + mmu_btop(offset);
293 			if (pf_is_memory(pfn)) {
294 				if (attr & SFMMU_UNCACHEPTTE) {
295 					cmn_err(CE_PANIC, "map_prom: "
296 					    "uncached prom memory page");
297 				}
298 			} else {
299 				if (!(attr & SFMMU_SIDEFFECT)) {
300 					cmn_err(CE_PANIC, "map_prom: prom "
301 					    "i/o page without side-effect");
302 				}
303 			}
304 			oldpfn = sfmmu_vatopfn(vaddr, KHATID, &oldtte);
305 			ASSERT(oldpfn != PFN_SUSPENDED);
306 			ASSERT(page_relocate_ready == 0);
307 
308 			if (oldpfn != PFN_INVALID) {
309 				/*
310 				 * mapping already exists.
311 				 * Verify they are equal
312 				 */
313 				if (pfn != oldpfn) {
314 					cmn_err(CE_PANIC, "map_prom: mapping "
315 					    "conflict (va=0x%p pfn=%p, "
316 					    "oldpfn=%p)",
317 					    (void *)vaddr, (void *)pfn,
318 					    (void *)oldpfn);
319 				}
320 				size -= MMU_PAGESIZE;
321 				offset += MMU_PAGESIZE;
322 				continue;
323 			}
324 
325 			pp = page_numtopp_nolock(pfn);
326 			if ((pp != NULL) && PP_ISFREE((page_t *)pp)) {
327 				cmn_err(CE_PANIC, "map_prom: "
328 				    "prom-mapped page (va 0x%p, pfn 0x%p) "
329 				    "on free list", (void *)vaddr, (void *)pfn);
330 			}
331 
332 			sfmmu_memtte(ttep, pfn, attr, TTE8K);
333 			sfmmu_tteload(kas.a_hat, ttep, vaddr, pp,
334 			    HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD);
335 			size -= MMU_PAGESIZE;
336 			offset += MMU_PAGESIZE;
337 		}
338 	}
339 }
340 
341 #undef COMBINE	/* local to previous routine */
342 
343 /*
344  * This routine reads in the "translations" property in to a buffer and
345  * returns a pointer to this buffer and the number of translations.
346  */
347 static struct translation *
348 read_prom_mappings(size_t *ntransrootp)
349 {
350 	char *prop = "translations";
351 	size_t translen;
352 	pnode_t node;
353 	struct translation *transroot;
354 
355 	/*
356 	 * the "translations" property is associated with the mmu node
357 	 */
358 	node = (pnode_t)prom_getphandle(prom_mmu_ihandle());
359 
360 	/*
361 	 * We use the TSB space to read in the prom mappings.  This space
362 	 * is currently not being used because we haven't taken over the
363 	 * trap table yet.  It should be big enough to hold the mappings.
364 	 */
365 	if ((translen = prom_getproplen(node, prop)) == -1)
366 		cmn_err(CE_PANIC, "no translations property");
367 	*ntransrootp = translen / sizeof (*transroot);
368 	translen = roundup(translen, MMU_PAGESIZE);
369 	PRM_DEBUG(translen);
370 	if (translen > TSB_BYTES(ktsb_szcode))
371 		cmn_err(CE_PANIC, "not enough space for translations");
372 
373 	transroot = (struct translation *)ktsb_base;
374 	ASSERT(transroot);
375 	if (prom_getprop(node, prop, (caddr_t)transroot) == -1) {
376 		cmn_err(CE_PANIC, "translations getprop failed");
377 	}
378 	return (transroot);
379 }
380 
381 /*
382  * Init routine of the nucleus data memory allocator.
383  *
384  * The nucleus data memory allocator is organized in ecache_alignsize'd
385  * memory chunks. Memory allocated by ndata_alloc() will never be freed.
386  *
387  * The ndata argument is used as header of the ndata freelist.
388  * Other freelist nodes are placed in the nucleus memory itself
389  * at the beginning of a free memory chunk. Therefore a freelist
390  * node (struct memlist) must fit into the smallest allocatable
391  * memory chunk (ecache_alignsize bytes).
392  *
393  * The memory interval [base, end] passed to ndata_alloc_init() must be
394  * bzero'd to allow the allocator to return bzero'd memory easily.
395  */
396 void
397 ndata_alloc_init(struct memlist *ndata, uintptr_t base, uintptr_t end)
398 {
399 	ASSERT(sizeof (struct memlist) <= ecache_alignsize);
400 
401 	base = roundup(base, ecache_alignsize);
402 	end = end - end % ecache_alignsize;
403 
404 	ASSERT(base < end);
405 
406 	ndata->address = base;
407 	ndata->size = end - base;
408 	ndata->next = NULL;
409 	ndata->prev = NULL;
410 }
411 
412 /*
413  * Deliver the size of the largest free memory chunk.
414  */
415 size_t
416 ndata_maxsize(struct memlist *ndata)
417 {
418 	size_t chunksize = ndata->size;
419 
420 	while ((ndata = ndata->next) != NULL) {
421 		if (chunksize < ndata->size)
422 			chunksize = ndata->size;
423 	}
424 
425 	return (chunksize);
426 }
427 
428 /*
429  * This is a special function to figure out if the memory chunk needed
430  * for the page structs can fit in the nucleus or not. If it fits the
431  * function calculates and returns the possible remaining ndata size
432  * in the last element if the size needed for page structs would be
433  * allocated from the nucleus.
434  */
435 size_t
436 ndata_spare(struct memlist *ndata, size_t wanted, size_t alignment)
437 {
438 	struct memlist *frlist;
439 	uintptr_t base;
440 	uintptr_t end;
441 
442 	for (frlist = ndata; frlist != NULL; frlist = frlist->next) {
443 		base = roundup(frlist->address, alignment);
444 		end = roundup(base + wanted, ecache_alignsize);
445 
446 		if (end <= frlist->address + frlist->size) {
447 			if (frlist->next == NULL)
448 				return (frlist->address + frlist->size - end);
449 
450 			while (frlist->next != NULL)
451 				frlist = frlist->next;
452 
453 			return (frlist->size);
454 		}
455 	}
456 
457 	return (0);
458 }
459 
460 /*
461  * Allocate the last properly aligned memory chunk.
462  * This function is called when no more large nucleus memory chunks
463  * will be allocated.  The remaining free nucleus memory at the end
464  * of the nucleus can be added to the phys_avail list.
465  */
466 void *
467 ndata_extra_base(struct memlist *ndata, size_t alignment)
468 {
469 	uintptr_t base;
470 	size_t wasteage = 0;
471 #ifdef	DEBUG
472 	static int called = 0;
473 
474 	if (called++ > 0)
475 		cmn_err(CE_PANIC, "ndata_extra_base() called more than once");
476 #endif /* DEBUG */
477 
478 	/*
479 	 * The alignment needs to be a multiple of ecache_alignsize.
480 	 */
481 	ASSERT((alignment % ecache_alignsize) ==  0);
482 
483 	while (ndata->next != NULL) {
484 		wasteage += ndata->size;
485 		ndata = ndata->next;
486 	}
487 
488 	base = roundup(ndata->address, alignment);
489 
490 	if (base >= ndata->address + ndata->size)
491 		return (NULL);
492 
493 	if (base == ndata->address) {
494 		if (ndata->prev != NULL)
495 			ndata->prev->next = NULL;
496 		else
497 			ndata->size = 0;
498 
499 		bzero((void *)base, sizeof (struct memlist));
500 
501 	} else {
502 		ndata->size = base - ndata->address;
503 		wasteage += ndata->size;
504 	}
505 	PRM_DEBUG(wasteage);
506 
507 	return ((void *)base);
508 }
509 
510 /*
511  * Select the best matching buffer, avoid memory fragmentation.
512  */
513 static struct memlist *
514 ndata_select_chunk(struct memlist *ndata, size_t wanted, size_t alignment)
515 {
516 	struct memlist *fnd_below = NULL;
517 	struct memlist *fnd_above = NULL;
518 	struct memlist *fnd_unused = NULL;
519 	struct memlist *frlist;
520 	uintptr_t base;
521 	uintptr_t end;
522 	size_t below;
523 	size_t above;
524 	size_t unused;
525 	size_t best_below = ULONG_MAX;
526 	size_t best_above = ULONG_MAX;
527 	size_t best_unused = ULONG_MAX;
528 
529 	ASSERT(ndata != NULL);
530 
531 	/*
532 	 * Look for the best matching buffer, avoid memory fragmentation.
533 	 * The following strategy is used, try to find
534 	 *   1. an exact fitting buffer
535 	 *   2. avoid wasting any space below the buffer, take first
536 	 *	fitting buffer
537 	 *   3. avoid wasting any space above the buffer, take first
538 	 *	fitting buffer
539 	 *   4. avoid wasting space, take first fitting buffer
540 	 *   5. take the last buffer in chain
541 	 */
542 	for (frlist = ndata; frlist != NULL; frlist = frlist->next) {
543 		base = roundup(frlist->address, alignment);
544 		end = roundup(base + wanted, ecache_alignsize);
545 
546 		if (end > frlist->address + frlist->size)
547 			continue;
548 
549 		below = (base - frlist->address) / ecache_alignsize;
550 		above = (frlist->address + frlist->size - end) /
551 		    ecache_alignsize;
552 		unused = below + above;
553 
554 		if (unused == 0)
555 			return (frlist);
556 
557 		if (frlist->next == NULL)
558 			break;
559 
560 		if (below < best_below) {
561 			best_below = below;
562 			fnd_below = frlist;
563 		}
564 
565 		if (above < best_above) {
566 			best_above = above;
567 			fnd_above = frlist;
568 		}
569 
570 		if (unused < best_unused) {
571 			best_unused = unused;
572 			fnd_unused = frlist;
573 		}
574 	}
575 
576 	if (best_below == 0)
577 		return (fnd_below);
578 	if (best_above == 0)
579 		return (fnd_above);
580 	if (best_unused < ULONG_MAX)
581 		return (fnd_unused);
582 
583 	return (frlist);
584 }
585 
586 /*
587  * Nucleus data memory allocator.
588  * The granularity of the allocator is ecache_alignsize.
589  * See also comment for ndata_alloc_init().
590  */
591 void *
592 ndata_alloc(struct memlist *ndata, size_t wanted, size_t alignment)
593 {
594 	struct memlist *found;
595 	struct memlist *fnd_above;
596 	uintptr_t base;
597 	uintptr_t end;
598 	size_t below;
599 	size_t above;
600 
601 	/*
602 	 * Look for the best matching buffer, avoid memory fragmentation.
603 	 */
604 	if ((found = ndata_select_chunk(ndata, wanted, alignment)) == NULL)
605 		return (NULL);
606 
607 	/*
608 	 * Allocate the nucleus data buffer.
609 	 */
610 	base = roundup(found->address, alignment);
611 	end = roundup(base + wanted, ecache_alignsize);
612 	ASSERT(end <= found->address + found->size);
613 
614 	below = base - found->address;
615 	above = found->address + found->size - end;
616 	ASSERT(above == 0 || (above % ecache_alignsize) == 0);
617 
618 	if (below >= ecache_alignsize) {
619 		/*
620 		 * There is free memory below the allocated memory chunk.
621 		 */
622 		found->size = below - below % ecache_alignsize;
623 
624 		if (above) {
625 			fnd_above = (struct memlist *)end;
626 			fnd_above->address = end;
627 			fnd_above->size = above;
628 
629 			if ((fnd_above->next = found->next) != NULL)
630 				found->next->prev = fnd_above;
631 			fnd_above->prev = found;
632 			found->next = fnd_above;
633 		}
634 
635 		return ((void *)base);
636 	}
637 
638 	if (found->prev == NULL) {
639 		/*
640 		 * The first chunk (ndata) is selected.
641 		 */
642 		ASSERT(found == ndata);
643 		if (above) {
644 			found->address = end;
645 			found->size = above;
646 		} else if (found->next != NULL) {
647 			found->address = found->next->address;
648 			found->size = found->next->size;
649 			if ((found->next = found->next->next) != NULL)
650 				found->next->prev = found;
651 
652 			bzero((void *)found->address, sizeof (struct memlist));
653 		} else {
654 			found->address = end;
655 			found->size = 0;
656 		}
657 
658 		return ((void *)base);
659 	}
660 
661 	/*
662 	 * Not the first chunk.
663 	 */
664 	if (above) {
665 		fnd_above = (struct memlist *)end;
666 		fnd_above->address = end;
667 		fnd_above->size = above;
668 
669 		if ((fnd_above->next = found->next) != NULL)
670 			fnd_above->next->prev = fnd_above;
671 		fnd_above->prev = found->prev;
672 		found->prev->next = fnd_above;
673 
674 	} else {
675 		if ((found->prev->next = found->next) != NULL)
676 			found->next->prev = found->prev;
677 	}
678 
679 	bzero((void *)found->address, sizeof (struct memlist));
680 
681 	return ((void *)base);
682 }
683 
684 /*
685  * Size the kernel TSBs based upon the amount of physical
686  * memory in the system.
687  */
688 static void
689 calc_tsb_sizes(pgcnt_t npages)
690 {
691 	PRM_DEBUG(npages);
692 
693 	if (npages <= TSB_FREEMEM_MIN) {
694 		ktsb_szcode = TSB_128K_SZCODE;
695 		enable_bigktsb = 0;
696 	} else if (npages <= TSB_FREEMEM_LARGE / 2) {
697 		ktsb_szcode = TSB_256K_SZCODE;
698 		enable_bigktsb = 0;
699 	} else if (npages <= TSB_FREEMEM_LARGE) {
700 		ktsb_szcode = TSB_512K_SZCODE;
701 		enable_bigktsb = 0;
702 	} else if (npages <= TSB_FREEMEM_LARGE * 2 ||
703 	    enable_bigktsb == 0) {
704 		ktsb_szcode = TSB_1M_SZCODE;
705 		enable_bigktsb = 0;
706 	} else {
707 		ktsb_szcode = highbit(npages - 1);
708 		ktsb_szcode -= TSB_START_SIZE;
709 		ktsb_szcode = MAX(ktsb_szcode, MIN_BIGKTSB_SZCODE);
710 		ktsb_szcode = MIN(ktsb_szcode, MAX_BIGKTSB_SZCODE);
711 	}
712 
713 	/*
714 	 * We choose the TSB to hold kernel 4M mappings to have twice
715 	 * the reach as the primary kernel TSB since this TSB will
716 	 * potentially (currently) be shared by both mappings to all of
717 	 * physical memory plus user TSBs.  Since the current
718 	 * limit on primary kernel TSB size is 16MB this will top out
719 	 * at 64K which we can certainly afford.
720 	 */
721 	ktsb4m_szcode = ktsb_szcode - (MMU_PAGESHIFT4M - MMU_PAGESHIFT) + 1;
722 	if (ktsb4m_szcode < TSB_MIN_SZCODE)
723 		ktsb4m_szcode = TSB_MIN_SZCODE;
724 
725 	ktsb_sz = TSB_BYTES(ktsb_szcode);	/* kernel 8K tsb size */
726 	ktsb4m_sz = TSB_BYTES(ktsb4m_szcode);	/* kernel 4M tsb size */
727 }
728 
729 /*
730  * Allocate kernel TSBs from nucleus data memory.
731  * The function return 0 on success and -1 on failure.
732  */
733 int
734 ndata_alloc_tsbs(struct memlist *ndata, pgcnt_t npages)
735 {
736 	/*
737 	 * Size the kernel TSBs based upon the amount of physical
738 	 * memory in the system.
739 	 */
740 	calc_tsb_sizes(npages);
741 
742 	/*
743 	 * Allocate the 8K kernel TSB if it belongs inside the nucleus.
744 	 */
745 	if (enable_bigktsb == 0) {
746 		if ((ktsb_base = ndata_alloc(ndata, ktsb_sz, ktsb_sz)) == NULL)
747 			return (-1);
748 		ASSERT(!((uintptr_t)ktsb_base & (ktsb_sz - 1)));
749 
750 		PRM_DEBUG(ktsb_base);
751 		PRM_DEBUG(ktsb_sz);
752 		PRM_DEBUG(ktsb_szcode);
753 	}
754 
755 	/*
756 	 * Next, allocate 4M kernel TSB from the nucleus since it's small.
757 	 */
758 	if ((ktsb4m_base = ndata_alloc(ndata, ktsb4m_sz, ktsb4m_sz)) == NULL)
759 		return (-1);
760 	ASSERT(!((uintptr_t)ktsb4m_base & (ktsb4m_sz - 1)));
761 
762 	PRM_DEBUG(ktsb4m_base);
763 	PRM_DEBUG(ktsb4m_sz);
764 	PRM_DEBUG(ktsb4m_szcode);
765 
766 	return (0);
767 }
768 
769 /*
770  * Allocate hat structs from the nucleus data memory.
771  */
772 int
773 ndata_alloc_hat(struct memlist *ndata, pgcnt_t npages, pgcnt_t kpm_npages)
774 {
775 	size_t	mml_alloc_sz;
776 	size_t	cb_alloc_sz;
777 	int	max_nucuhme_buckets = MAX_NUCUHME_BUCKETS;
778 	int	max_nuckhme_buckets = MAX_NUCKHME_BUCKETS;
779 	ulong_t hme_buckets;
780 
781 	if (enable_bigktsb) {
782 		ASSERT((max_nucuhme_buckets + max_nuckhme_buckets) *
783 		    sizeof (struct hmehash_bucket) <=
784 			TSB_BYTES(TSB_1M_SZCODE));
785 
786 		max_nucuhme_buckets *= 2;
787 		max_nuckhme_buckets *= 2;
788 	}
789 
790 	/*
791 	 * The number of buckets in the hme hash tables
792 	 * is a power of 2 such that the average hash chain length is
793 	 * HMENT_HASHAVELEN.  The number of buckets for the user hash is
794 	 * a function of physical memory and a predefined overmapping factor.
795 	 * The number of buckets for the kernel hash is a function of
796 	 * physical memory only.
797 	 */
798 	hme_buckets = (npages * HMEHASH_FACTOR) /
799 		(HMENT_HASHAVELEN * (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT));
800 
801 	uhmehash_num = (int)MIN(hme_buckets, MAX_UHME_BUCKETS);
802 
803 	if (uhmehash_num > USER_BUCKETS_THRESHOLD) {
804 		/*
805 		 * if uhmehash_num is not power of 2 round it down to the
806 		 *  next power of 2.
807 		 */
808 		uint_t align = 1 << (highbit(uhmehash_num - 1) - 1);
809 		uhmehash_num = P2ALIGN(uhmehash_num, align);
810 	} else
811 		uhmehash_num = 1 << highbit(uhmehash_num - 1);
812 
813 	hme_buckets = npages / (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT);
814 	khmehash_num = (int)MIN(hme_buckets, MAX_KHME_BUCKETS);
815 	khmehash_num = 1 << highbit(khmehash_num - 1);
816 	khmehash_num = MAX(khmehash_num, MIN_KHME_BUCKETS);
817 
818 	if ((khmehash_num > max_nuckhme_buckets) ||
819 		(uhmehash_num > max_nucuhme_buckets)) {
820 		khme_hash = NULL;
821 		uhme_hash = NULL;
822 	} else {
823 		size_t hmehash_sz = (uhmehash_num + khmehash_num) *
824 		    sizeof (struct hmehash_bucket);
825 
826 		if ((khme_hash = ndata_alloc(ndata, hmehash_sz,
827 		    ecache_alignsize)) != NULL)
828 			uhme_hash = &khme_hash[khmehash_num];
829 		else
830 			uhme_hash = NULL;
831 
832 		PRM_DEBUG(hmehash_sz);
833 	}
834 
835 	PRM_DEBUG(khme_hash);
836 	PRM_DEBUG(khmehash_num);
837 	PRM_DEBUG(uhme_hash);
838 	PRM_DEBUG(uhmehash_num);
839 
840 	/*
841 	 * For the page mapping list mutex array we allocate one mutex
842 	 * for every 128 pages (1 MB) with a minimum of 64 entries and
843 	 * a maximum of 8K entries. For the initial computation npages
844 	 * is rounded up (ie. 1 << highbit(npages * 1.5 / 128))
845 	 *
846 	 * mml_shift is roughly log2(mml_table_sz) + 3 for MLIST_HASH
847 	 *
848 	 * It is not required that this be allocated from the nucleus,
849 	 * but it is desirable.  So we first allocate from the nucleus
850 	 * everything that must be there.  Having done so, if mml_table
851 	 * will fit within what remains of the nucleus then it will be
852 	 * allocated here.  If not, set mml_table to NULL, which will cause
853 	 * startup_memlist() to BOP_ALLOC() space for it after our return...
854 	 */
855 	mml_table_sz = 1 << highbit((npages * 3) / 256);
856 	if (mml_table_sz < 64)
857 		mml_table_sz = 64;
858 	else if (mml_table_sz > 8192)
859 		mml_table_sz = 8192;
860 	mml_shift = highbit(mml_table_sz) + 3;
861 
862 	PRM_DEBUG(mml_table_sz);
863 	PRM_DEBUG(mml_shift);
864 
865 	mml_alloc_sz = mml_table_sz * sizeof (kmutex_t);
866 
867 	mml_table = ndata_alloc(ndata, mml_alloc_sz, ecache_alignsize);
868 
869 	PRM_DEBUG(mml_table);
870 
871 	cb_alloc_sz = sfmmu_max_cb_id * sizeof (struct sfmmu_callback);
872 	PRM_DEBUG(cb_alloc_sz);
873 	sfmmu_cb_table = ndata_alloc(ndata, cb_alloc_sz, ecache_alignsize);
874 	PRM_DEBUG(sfmmu_cb_table);
875 
876 	/*
877 	 * For the kpm_page mutex array we allocate one mutex every 16
878 	 * kpm pages (64MB). In smallpage mode we allocate one mutex
879 	 * every 8K pages. The minimum is set to 64 entries and the
880 	 * maximum to 8K entries.
881 	 *
882 	 * It is not required that this be allocated from the nucleus,
883 	 * but it is desirable.  So we first allocate from the nucleus
884 	 * everything that must be there.  Having done so, if kpmp_table
885 	 * or kpmp_stable will fit within what remains of the nucleus
886 	 * then it will be allocated here.  If not, startup_memlist()
887 	 * will use BOP_ALLOC() space for it after our return...
888 	 */
889 	if (kpm_enable) {
890 		size_t	kpmp_alloc_sz;
891 
892 		if (kpm_smallpages == 0) {
893 			kpmp_shift = highbit(sizeof (kpm_page_t)) - 1;
894 			kpmp_table_sz = 1 << highbit(kpm_npages / 16);
895 			kpmp_table_sz = (kpmp_table_sz < 64) ? 64 :
896 			    ((kpmp_table_sz > 8192) ? 8192 : kpmp_table_sz);
897 			kpmp_alloc_sz = kpmp_table_sz * sizeof (kpm_hlk_t);
898 
899 			kpmp_table = ndata_alloc(ndata, kpmp_alloc_sz,
900 			    ecache_alignsize);
901 
902 			PRM_DEBUG(kpmp_table);
903 			PRM_DEBUG(kpmp_table_sz);
904 
905 			kpmp_stable_sz = 0;
906 			kpmp_stable = NULL;
907 		} else {
908 			ASSERT(kpm_pgsz == PAGESIZE);
909 			kpmp_shift = highbit(sizeof (kpm_shlk_t)) + 1;
910 			kpmp_stable_sz = 1 << highbit(kpm_npages / 8192);
911 			kpmp_stable_sz = (kpmp_stable_sz < 64) ? 64 :
912 			    ((kpmp_stable_sz > 8192) ? 8192 : kpmp_stable_sz);
913 			kpmp_alloc_sz = kpmp_stable_sz * sizeof (kpm_shlk_t);
914 
915 			kpmp_stable = ndata_alloc(ndata, kpmp_alloc_sz,
916 			    ecache_alignsize);
917 
918 			PRM_DEBUG(kpmp_stable);
919 			PRM_DEBUG(kpmp_stable_sz);
920 
921 			kpmp_table_sz = 0;
922 			kpmp_table = NULL;
923 		}
924 		PRM_DEBUG(kpmp_shift);
925 	}
926 
927 	return (0);
928 }
929 
930 caddr_t
931 alloc_hme_buckets(caddr_t base, int pagesize)
932 {
933 	size_t hmehash_sz = (uhmehash_num + khmehash_num) *
934 	sizeof (struct hmehash_bucket);
935 
936 	ASSERT(khme_hash == NULL);
937 	ASSERT(uhme_hash == NULL);
938 
939 	/* If no pagesize specified, use default MMU pagesize */
940 	if (!pagesize)
941 		pagesize = MMU_PAGESIZE;
942 
943 	/*
944 	 * If we start aligned and ask for a multiple of a pagesize, and OBP
945 	 * supports large pages, we will then use mappings of the largest size
946 	 * possible for the BOP_ALLOC, possibly saving us tens of thousands of
947 	 * TLB miss-induced traversals of the TSBs and/or the HME hashes...
948 	 */
949 	base = (caddr_t)roundup((uintptr_t)base, pagesize);
950 	hmehash_sz = roundup(hmehash_sz, pagesize);
951 
952 	khme_hash = (struct hmehash_bucket *)BOP_ALLOC(bootops, base,
953 		hmehash_sz, pagesize);
954 
955 	if ((caddr_t)khme_hash != base)
956 		cmn_err(CE_PANIC, "Cannot bop_alloc hme hash buckets.");
957 
958 	uhme_hash = (struct hmehash_bucket *)((caddr_t)khme_hash +
959 		khmehash_num * sizeof (struct hmehash_bucket));
960 	base += hmehash_sz;
961 	return (base);
962 }
963 
964 /*
965  * This function bop allocs the kernel TSB.
966  */
967 caddr_t
968 sfmmu_ktsb_alloc(caddr_t tsbbase)
969 {
970 	caddr_t vaddr;
971 
972 	if (enable_bigktsb) {
973 		ktsb_base = (caddr_t)roundup((uintptr_t)tsbbase, ktsb_sz);
974 		vaddr = (caddr_t)BOP_ALLOC(bootops, ktsb_base, ktsb_sz,
975 		    ktsb_sz);
976 		if (vaddr != ktsb_base)
977 			cmn_err(CE_PANIC, "sfmmu_ktsb_alloc: can't alloc"
978 			    " bigktsb");
979 		ktsb_base = vaddr;
980 		tsbbase = ktsb_base + ktsb_sz;
981 		PRM_DEBUG(ktsb_base);
982 		PRM_DEBUG(tsbbase);
983 	}
984 	return (tsbbase);
985 }
986 
987 /*
988  * Moves code assembled outside of the trap table into the trap
989  * table taking care to relocate relative branches to code outside
990  * of the trap handler.
991  */
992 static void
993 sfmmu_reloc_trap_handler(void *tablep, void *start, size_t count)
994 {
995 	size_t i;
996 	uint32_t *src;
997 	uint32_t *dst;
998 	uint32_t inst;
999 	int op, op2;
1000 	int32_t offset;
1001 	int disp;
1002 
1003 	src = start;
1004 	dst = tablep;
1005 	offset = src - dst;
1006 	for (src = start, i = 0; i < count; i++, src++, dst++) {
1007 		inst = *dst = *src;
1008 		op = (inst >> 30) & 0x2;
1009 		if (op == 1) {
1010 			/* call */
1011 			disp = ((int32_t)inst << 2) >> 2; /* sign-extend */
1012 			if (disp + i >= 0 && disp + i < count)
1013 				continue;
1014 			disp += offset;
1015 			inst = 0x40000000u | (disp & 0x3fffffffu);
1016 			*dst = inst;
1017 		} else if (op == 0) {
1018 			/* branch or sethi */
1019 			op2 = (inst >> 22) & 0x7;
1020 
1021 			switch (op2) {
1022 			case 0x3: /* BPr */
1023 				disp = (((inst >> 20) & 0x3) << 14) |
1024 				    (inst & 0x3fff);
1025 				disp = (disp << 16) >> 16; /* sign-extend */
1026 				if (disp + i >= 0 && disp + i < count)
1027 					continue;
1028 				disp += offset;
1029 				if (((disp << 16) >> 16) != disp)
1030 					cmn_err(CE_PANIC, "bad reloc");
1031 				inst &= ~0x303fff;
1032 				inst |= (disp & 0x3fff);
1033 				inst |= (disp & 0xc000) << 6;
1034 				break;
1035 
1036 			case 0x2: /* Bicc */
1037 				disp = ((int32_t)inst << 10) >> 10;
1038 				if (disp + i >= 0 && disp + i < count)
1039 					continue;
1040 				disp += offset;
1041 				if (((disp << 10) >> 10) != disp)
1042 					cmn_err(CE_PANIC, "bad reloc");
1043 				inst &= ~0x3fffff;
1044 				inst |= (disp & 0x3fffff);
1045 				break;
1046 
1047 			case 0x1: /* Bpcc */
1048 				disp = ((int32_t)inst << 13) >> 13;
1049 				if (disp + i >= 0 && disp + i < count)
1050 					continue;
1051 				disp += offset;
1052 				if (((disp << 13) >> 13) != disp)
1053 					cmn_err(CE_PANIC, "bad reloc");
1054 				inst &= ~0x7ffff;
1055 				inst |= (disp & 0x7ffffu);
1056 				break;
1057 			}
1058 			*dst = inst;
1059 		}
1060 	}
1061 	flush_instr_mem(tablep, count * sizeof (uint32_t));
1062 }
1063 
1064 /*
1065  * Routine to allocate a large page to use in the TSB caches.
1066  */
1067 /*ARGSUSED*/
1068 static page_t *
1069 sfmmu_tsb_page_create(void *addr, size_t size, int vmflag, void *arg)
1070 {
1071 	int pgflags;
1072 
1073 	pgflags = PG_EXCL;
1074 	if ((vmflag & VM_NOSLEEP) == 0)
1075 		pgflags |= PG_WAIT;
1076 	if (vmflag & VM_PANIC)
1077 		pgflags |= PG_PANIC;
1078 	if (vmflag & VM_PUSHPAGE)
1079 		pgflags |= PG_PUSHPAGE;
1080 
1081 	return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size,
1082 	    pgflags, &kvseg, addr, arg));
1083 }
1084 
1085 /*
1086  * Allocate a large page to back the virtual address range
1087  * [addr, addr + size).  If addr is NULL, allocate the virtual address
1088  * space as well.
1089  */
1090 static void *
1091 sfmmu_tsb_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
1092     uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
1093     void *pcarg)
1094 {
1095 	page_t *ppl;
1096 	page_t *rootpp;
1097 	caddr_t addr = inaddr;
1098 	pgcnt_t npages = btopr(size);
1099 	page_t **ppa;
1100 	int i = 0;
1101 
1102 	/*
1103 	 * Assuming that only TSBs will call this with size > PAGESIZE
1104 	 * There is no reason why this couldn't be expanded to 8k pages as
1105 	 * well, or other page sizes in the future .... but for now, we
1106 	 * only support fixed sized page requests.
1107 	 */
1108 	if ((inaddr == NULL) && ((addr = vmem_xalloc(vmp, size, size, 0, 0,
1109 	    NULL, NULL, vmflag)) == NULL))
1110 		return (NULL);
1111 
1112 	/* If we ever don't want TSB slab-sized pages, this will panic */
1113 	ASSERT(((uintptr_t)addr & (tsb_slab_size - 1)) == 0);
1114 
1115 	if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
1116 		if (inaddr == NULL)
1117 			vmem_xfree(vmp, addr, size);
1118 		return (NULL);
1119 	}
1120 
1121 	ppl = page_create_func(addr, size, vmflag, pcarg);
1122 	if (ppl == NULL) {
1123 		if (inaddr == NULL)
1124 			vmem_xfree(vmp, addr, size);
1125 		page_unresv(npages);
1126 		return (NULL);
1127 	}
1128 
1129 	rootpp = ppl;
1130 	ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1131 	while (ppl != NULL) {
1132 		page_t *pp = ppl;
1133 		ppa[i++] = pp;
1134 		page_sub(&ppl, pp);
1135 		ASSERT(page_iolock_assert(pp));
1136 		page_io_unlock(pp);
1137 	}
1138 
1139 	/*
1140 	 * Load the locked entry.  It's OK to preload the entry into
1141 	 * the TSB since we now support large mappings in the kernel TSB.
1142 	 */
1143 	hat_memload_array(kas.a_hat, (caddr_t)rootpp->p_offset, size,
1144 	    ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, HAT_LOAD_LOCK);
1145 
1146 	for (--i; i >= 0; --i) {
1147 		(void) page_pp_lock(ppa[i], 0, 1);
1148 		page_unlock(ppa[i]);
1149 	}
1150 
1151 	kmem_free(ppa, npages * sizeof (page_t *));
1152 	return (addr);
1153 }
1154 
1155 /* Called to import new spans into the TSB vmem arenas */
1156 void *
1157 sfmmu_tsb_segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
1158 {
1159 	lgrp_id_t lgrpid = LGRP_NONE;
1160 
1161 	if (tsb_lgrp_affinity) {
1162 		/*
1163 		 * Search for the vmp->lgrpid mapping by brute force;
1164 		 * some day vmp will have an lgrp, until then we have
1165 		 * to do this the hard way.
1166 		 */
1167 		for (lgrpid = 0; lgrpid < NLGRPS_MAX &&
1168 		    vmp != kmem_tsb_default_arena[lgrpid]; lgrpid++);
1169 		if (lgrpid == NLGRPS_MAX)
1170 			lgrpid = LGRP_NONE;
1171 	}
1172 
1173 	return (sfmmu_tsb_xalloc(vmp, NULL, size, vmflag, 0,
1174 	    sfmmu_tsb_page_create, lgrpid != LGRP_NONE? &lgrpid : NULL));
1175 }
1176 
1177 /* Called to free spans from the TSB vmem arenas */
1178 void
1179 sfmmu_tsb_segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
1180 {
1181 	page_t *pp;
1182 	caddr_t addr = inaddr;
1183 	caddr_t eaddr;
1184 	pgcnt_t npages = btopr(size);
1185 	pgcnt_t pgs_left = npages;
1186 	page_t *rootpp = NULL;
1187 
1188 	ASSERT(((uintptr_t)addr & (tsb_slab_size - 1)) == 0);
1189 
1190 	hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1191 
1192 	for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
1193 		pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
1194 		if (pp == NULL)
1195 			panic("sfmmu_tsb_segkmem_free: page not found");
1196 
1197 		ASSERT(PAGE_EXCL(pp));
1198 		page_pp_unlock(pp, 0, 1);
1199 
1200 		if (rootpp == NULL)
1201 			rootpp = pp;
1202 		if (--pgs_left == 0) {
1203 			/*
1204 			 * similar logic to segspt_free_pages, but we know we
1205 			 * have one large page.
1206 			 */
1207 			page_destroy_pages(rootpp);
1208 		}
1209 	}
1210 	page_unresv(npages);
1211 
1212 	if (vmp != NULL)
1213 		vmem_xfree(vmp, inaddr, size);
1214 }
1215