xref: /titanic_52/usr/src/uts/sun4/vm/sfmmu.c (revision 68c47f65208790c466e5e484f2293d3baed71c6a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <vm/hat.h>
29 #include <vm/hat_sfmmu.h>
30 #include <vm/page.h>
31 #include <sys/pte.h>
32 #include <sys/systm.h>
33 #include <sys/mman.h>
34 #include <sys/sysmacros.h>
35 #include <sys/machparam.h>
36 #include <sys/vtrace.h>
37 #include <sys/kmem.h>
38 #include <sys/mmu.h>
39 #include <sys/cmn_err.h>
40 #include <sys/cpu.h>
41 #include <sys/cpuvar.h>
42 #include <sys/debug.h>
43 #include <sys/lgrp.h>
44 #include <sys/archsystm.h>
45 #include <sys/machsystm.h>
46 #include <sys/vmsystm.h>
47 #include <sys/bitmap.h>
48 #include <vm/as.h>
49 #include <vm/seg.h>
50 #include <vm/seg_kmem.h>
51 #include <vm/seg_kp.h>
52 #include <vm/seg_kpm.h>
53 #include <vm/rm.h>
54 #include <vm/vm_dep.h>
55 #include <sys/t_lock.h>
56 #include <sys/vm_machparam.h>
57 #include <sys/promif.h>
58 #include <sys/prom_isa.h>
59 #include <sys/prom_plat.h>
60 #include <sys/prom_debug.h>
61 #include <sys/privregs.h>
62 #include <sys/bootconf.h>
63 #include <sys/memlist.h>
64 #include <sys/memlist_plat.h>
65 #include <sys/cpu_module.h>
66 #include <sys/reboot.h>
67 #include <sys/kdi.h>
68 
69 /*
70  * Static routines
71  */
72 static void	sfmmu_map_prom_mappings(struct translation *, size_t);
73 static struct translation *read_prom_mappings(size_t *);
74 static void	sfmmu_reloc_trap_handler(void *, void *, size_t);
75 
76 /*
77  * External routines
78  */
79 extern void sfmmu_remap_kernel(void);
80 extern void sfmmu_patch_utsb(void);
81 
82 /*
83  * Global Data:
84  */
85 extern caddr_t	textva, datava;
86 extern tte_t	ktext_tte, kdata_tte;	/* ttes for kernel text and data */
87 extern int	enable_bigktsb;
88 extern int	kmem64_smchunks;
89 
90 uint64_t memsegspa = (uintptr_t)MSEG_NULLPTR_PA; /* memsegs physical linkage */
91 uint64_t memseg_phash[N_MEM_SLOTS];	/* use physical memseg addresses */
92 
93 int	sfmmu_kern_mapped = 0;	/* set to 1 at the end of hat_kern_setup() */
94 
95 /*
96  * DMMU primary context register for the kernel context. Machine specific code
97  * inserts correct page size codes when necessary
98  */
99 uint64_t kcontextreg = KCONTEXT;
100 
101 #ifdef DEBUG
102 static int ndata_middle_hole_detected = 0;
103 #endif
104 
105 /* Extern Global Data */
106 
107 extern int page_relocate_ready;
108 
109 /*
110  * Controls the logic which enables the use of the
111  * QUAD_LDD_PHYS ASI for TSB accesses.
112  */
113 extern int	ktsb_phys;
114 
115 /*
116  * Global Routines called from within:
117  *	usr/src/uts/sun4u
118  *	usr/src/uts/sfmmu
119  *	usr/src/uts/sun
120  */
121 
122 pfn_t
123 va_to_pfn(void *vaddr)
124 {
125 	u_longlong_t physaddr;
126 	int mode, valid;
127 
128 	if (tba_taken_over)
129 		return (hat_getpfnum(kas.a_hat, (caddr_t)vaddr));
130 
131 #if !defined(C_OBP)
132 	if (!kmem64_smchunks &&
133 	    (caddr_t)vaddr >= kmem64_base && (caddr_t)vaddr < kmem64_end) {
134 		if (kmem64_pabase == (uint64_t)-1)
135 			prom_panic("va_to_pfn: kmem64_pabase not init");
136 		physaddr = kmem64_pabase + ((caddr_t)vaddr - kmem64_base);
137 		return ((pfn_t)physaddr >> MMU_PAGESHIFT);
138 	}
139 #endif	/* !C_OBP */
140 
141 	if ((prom_translate_virt(vaddr, &valid, &physaddr, &mode) != -1) &&
142 	    (valid == -1)) {
143 		return ((pfn_t)(physaddr >> MMU_PAGESHIFT));
144 	}
145 	return (PFN_INVALID);
146 }
147 
148 uint64_t
149 va_to_pa(void *vaddr)
150 {
151 	pfn_t pfn;
152 
153 	if ((pfn = va_to_pfn(vaddr)) == PFN_INVALID)
154 		return ((uint64_t)-1);
155 	return (((uint64_t)pfn << MMU_PAGESHIFT) |
156 	    ((uint64_t)vaddr & MMU_PAGEOFFSET));
157 }
158 
159 void
160 hat_kern_setup(void)
161 {
162 	struct translation *trans_root;
163 	size_t ntrans_root;
164 	extern void startup_fixup_physavail(void);
165 
166 	/*
167 	 * These are the steps we take to take over the mmu from the prom.
168 	 *
169 	 * (1)	Read the prom's mappings through the translation property.
170 	 * (2)	Remap the kernel text and kernel data with 2 locked 4MB ttes.
171 	 *	Create the the hmeblks for these 2 ttes at this time.
172 	 * (3)	Create hat structures for all other prom mappings.  Since the
173 	 *	kernel text and data hme_blks have already been created we
174 	 *	skip the equivalent prom's mappings.
175 	 * (4)	Initialize the tsb and its corresponding hardware regs.
176 	 * (5)	Take over the trap table (currently in startup).
177 	 * (6)	Up to this point it is possible the prom required some of its
178 	 *	locked tte's.  Now that we own the trap table we remove them.
179 	 */
180 
181 	ktsb_pbase = va_to_pa(ktsb_base);
182 	ktsb4m_pbase = va_to_pa(ktsb4m_base);
183 	PRM_DEBUG(ktsb_pbase);
184 	PRM_DEBUG(ktsb4m_pbase);
185 
186 	sfmmu_patch_ktsb();
187 	sfmmu_patch_utsb();
188 	sfmmu_patch_mmu_asi(ktsb_phys);
189 
190 	sfmmu_init_tsbs();
191 
192 	if (kpm_enable) {
193 		sfmmu_kpm_patch_tlbm();
194 		if (kpm_smallpages == 0) {
195 			sfmmu_kpm_patch_tsbm();
196 		}
197 	}
198 
199 	if (!shctx_on) {
200 		sfmmu_patch_shctx();
201 	}
202 
203 	/*
204 	 * The 8K-indexed kernel TSB space is used to hold
205 	 * translations below...
206 	 */
207 	trans_root = read_prom_mappings(&ntrans_root);
208 	sfmmu_remap_kernel();
209 	startup_fixup_physavail();
210 	mmu_init_kernel_pgsz(kas.a_hat);
211 	sfmmu_map_prom_mappings(trans_root, ntrans_root);
212 
213 	/*
214 	 * We invalidate 8K kernel TSB because we used it in
215 	 * sfmmu_map_prom_mappings()
216 	 */
217 	sfmmu_inv_tsb(ktsb_base, ktsb_sz);
218 	sfmmu_inv_tsb(ktsb4m_base, ktsb4m_sz);
219 
220 	sfmmu_init_ktsbinfo();
221 
222 
223 	sfmmu_kern_mapped = 1;
224 
225 	/*
226 	 * hments have been created for mapped pages, and thus we're ready
227 	 * for kmdb to start using its own trap table.  It walks the hments
228 	 * to resolve TLB misses, and can't be used until they're ready.
229 	 */
230 	if (boothowto & RB_DEBUG)
231 		kdi_dvec_vmready();
232 }
233 
234 /*
235  * Macro used below to convert the prom's 32-bit high and low fields into
236  * a value appropriate for the 64-bit kernel.
237  */
238 
239 #define	COMBINE(hi, lo) (((uint64_t)(uint32_t)(hi) << 32) | (uint32_t)(lo))
240 
241 /*
242  * Track large pages used.
243  * Provides observability for this feature on non-debug kernels.
244  */
245 ulong_t map_prom_lpcount[MMU_PAGE_SIZES];
246 
247 /*
248  * This function traverses the prom mapping list and creates equivalent
249  * mappings in the sfmmu mapping hash.
250  */
251 static void
252 sfmmu_map_prom_mappings(struct translation *trans_root, size_t ntrans_root)
253 {
254 	struct translation *promt;
255 	tte_t	tte, oldtte, *ttep;
256 	pfn_t	pfn, oldpfn, basepfn;
257 	caddr_t vaddr;
258 	size_t	size, offset;
259 	unsigned long i;
260 	uint_t	attr;
261 	page_t *pp;
262 	extern struct memlist *virt_avail;
263 	char buf[256];
264 
265 	ttep = &tte;
266 	for (i = 0, promt = trans_root; i < ntrans_root; i++, promt++) {
267 		ASSERT(promt->tte_hi != 0);
268 		ASSERT32(promt->virt_hi == 0 && promt->size_hi == 0);
269 
270 		vaddr = (caddr_t)COMBINE(promt->virt_hi, promt->virt_lo);
271 
272 		/*
273 		 * hack until we get rid of map-for-unix
274 		 */
275 		if (vaddr < (caddr_t)KERNELBASE)
276 			continue;
277 
278 		ttep->tte_inthi = promt->tte_hi;
279 		ttep->tte_intlo = promt->tte_lo;
280 		attr = PROC_DATA | HAT_NOSYNC;
281 #if defined(TTE_IS_GLOBAL)
282 		if (TTE_IS_GLOBAL(ttep)) {
283 			/*
284 			 * The prom better not use global translations
285 			 * because a user process might use the same
286 			 * virtual addresses
287 			 */
288 			prom_panic("sfmmu_map_prom_mappings: global"
289 			    " translation");
290 			TTE_SET_LOFLAGS(ttep, TTE_GLB_INT, 0);
291 		}
292 #endif
293 		if (TTE_IS_LOCKED(ttep)) {
294 			/* clear the lock bits */
295 			TTE_CLR_LOCKED(ttep);
296 		}
297 		attr |= (TTE_IS_VCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEVTTE;
298 		attr |= (TTE_IS_PCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEPTTE;
299 		attr |= (TTE_IS_SIDEFFECT(ttep)) ? SFMMU_SIDEFFECT : 0;
300 		attr |= (TTE_IS_IE(ttep)) ? HAT_STRUCTURE_LE : 0;
301 
302 		size = COMBINE(promt->size_hi, promt->size_lo);
303 		offset = 0;
304 		basepfn = TTE_TO_PFN((caddr_t)COMBINE(promt->virt_hi,
305 		    promt->virt_lo), ttep);
306 		while (size) {
307 			vaddr = (caddr_t)(COMBINE(promt->virt_hi,
308 			    promt->virt_lo) + offset);
309 
310 			/*
311 			 * make sure address is not in virt-avail list
312 			 */
313 			if (address_in_memlist(virt_avail, (uint64_t)vaddr,
314 			    size)) {
315 				prom_panic("sfmmu_map_prom_mappings:"
316 				    " inconsistent translation/avail lists");
317 			}
318 
319 			pfn = basepfn + mmu_btop(offset);
320 			if (pf_is_memory(pfn)) {
321 				if (attr & SFMMU_UNCACHEPTTE) {
322 					prom_panic("sfmmu_map_prom_mappings:"
323 					    " uncached prom memory page");
324 				}
325 			} else {
326 				if (!(attr & SFMMU_SIDEFFECT)) {
327 					prom_panic("sfmmu_map_prom_mappings:"
328 					    " prom i/o page without"
329 					    " side-effect");
330 				}
331 			}
332 
333 			/*
334 			 * skip kmem64 area
335 			 */
336 			if (!kmem64_smchunks &&
337 			    vaddr >= kmem64_base &&
338 			    vaddr < kmem64_aligned_end) {
339 #if !defined(C_OBP)
340 				prom_panic("sfmmu_map_prom_mappings:"
341 				    " unexpected kmem64 prom mapping");
342 #else	/* !C_OBP */
343 				size_t mapsz;
344 
345 				if (ptob(pfn) !=
346 				    kmem64_pabase + (vaddr - kmem64_base)) {
347 					prom_panic("sfmmu_map_prom_mappings:"
348 					    " unexpected kmem64 prom mapping");
349 				}
350 
351 				mapsz = kmem64_aligned_end - vaddr;
352 				if (mapsz >= size) {
353 					break;
354 				}
355 				size -= mapsz;
356 				offset += mapsz;
357 				continue;
358 #endif	/* !C_OBP */
359 			}
360 
361 			oldpfn = sfmmu_vatopfn(vaddr, KHATID, &oldtte);
362 			ASSERT(oldpfn != PFN_SUSPENDED);
363 			ASSERT(page_relocate_ready == 0);
364 
365 			if (oldpfn != PFN_INVALID) {
366 				/*
367 				 * mapping already exists.
368 				 * Verify they are equal
369 				 */
370 				if (pfn != oldpfn) {
371 					(void) snprintf(buf, sizeof (buf),
372 					"sfmmu_map_prom_mappings: mapping"
373 					" conflict (va = 0x%p, pfn = 0x%p,"
374 					" oldpfn = 0x%p)", (void *)vaddr,
375 					    (void *)pfn, (void *)oldpfn);
376 					prom_panic(buf);
377 				}
378 				size -= MMU_PAGESIZE;
379 				offset += MMU_PAGESIZE;
380 				continue;
381 			}
382 
383 			pp = page_numtopp_nolock(pfn);
384 			if ((pp != NULL) && PP_ISFREE((page_t *)pp)) {
385 				(void) snprintf(buf, sizeof (buf),
386 				"sfmmu_map_prom_mappings: prom-mapped"
387 				" page (va = 0x%p, pfn = 0x%p) on free list",
388 				    (void *)vaddr, (void *)pfn);
389 				prom_panic(buf);
390 			}
391 
392 			sfmmu_memtte(ttep, pfn, attr, TTE8K);
393 			sfmmu_tteload(kas.a_hat, ttep, vaddr, pp,
394 			    HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD);
395 			size -= MMU_PAGESIZE;
396 			offset += MMU_PAGESIZE;
397 		}
398 	}
399 
400 	/*
401 	 * We claimed kmem64 from prom, so now we need to load tte.
402 	 */
403 	if (!kmem64_smchunks && kmem64_base != NULL) {
404 		pgcnt_t pages;
405 		size_t psize;
406 		int pszc;
407 
408 		pszc = kmem64_szc;
409 #ifdef sun4u
410 		if (pszc > TTE8K) {
411 			pszc = segkmem_lpszc;
412 		}
413 #endif	/* sun4u */
414 		psize = TTEBYTES(pszc);
415 		pages = btop(psize);
416 		basepfn = kmem64_pabase >> MMU_PAGESHIFT;
417 		vaddr = kmem64_base;
418 		while (vaddr < kmem64_end) {
419 			sfmmu_memtte(ttep, basepfn,
420 			    PROC_DATA | HAT_NOSYNC, pszc);
421 			sfmmu_tteload(kas.a_hat, ttep, vaddr, NULL,
422 			    HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD);
423 			vaddr += psize;
424 			basepfn += pages;
425 		}
426 		map_prom_lpcount[pszc] =
427 		    ((caddr_t)P2ROUNDUP((uintptr_t)kmem64_end, psize) -
428 		    kmem64_base) >> TTE_PAGE_SHIFT(pszc);
429 	}
430 }
431 
432 #undef COMBINE	/* local to previous routine */
433 
434 /*
435  * This routine reads in the "translations" property in to a buffer and
436  * returns a pointer to this buffer and the number of translations.
437  */
438 static struct translation *
439 read_prom_mappings(size_t *ntransrootp)
440 {
441 	char *prop = "translations";
442 	size_t translen;
443 	pnode_t node;
444 	struct translation *transroot;
445 
446 	/*
447 	 * the "translations" property is associated with the mmu node
448 	 */
449 	node = (pnode_t)prom_getphandle(prom_mmu_ihandle());
450 
451 	/*
452 	 * We use the TSB space to read in the prom mappings.  This space
453 	 * is currently not being used because we haven't taken over the
454 	 * trap table yet.  It should be big enough to hold the mappings.
455 	 */
456 	if ((translen = prom_getproplen(node, prop)) == -1)
457 		cmn_err(CE_PANIC, "no translations property");
458 	*ntransrootp = translen / sizeof (*transroot);
459 	translen = roundup(translen, MMU_PAGESIZE);
460 	PRM_DEBUG(translen);
461 	if (translen > TSB_BYTES(ktsb_szcode))
462 		cmn_err(CE_PANIC, "not enough space for translations");
463 
464 	transroot = (struct translation *)ktsb_base;
465 	ASSERT(transroot);
466 	if (prom_getprop(node, prop, (caddr_t)transroot) == -1) {
467 		cmn_err(CE_PANIC, "translations getprop failed");
468 	}
469 	return (transroot);
470 }
471 
472 /*
473  * Init routine of the nucleus data memory allocator.
474  *
475  * The nucleus data memory allocator is organized in ecache_alignsize'd
476  * memory chunks. Memory allocated by ndata_alloc() will never be freed.
477  *
478  * The ndata argument is used as header of the ndata freelist.
479  * Other freelist nodes are placed in the nucleus memory itself
480  * at the beginning of a free memory chunk. Therefore a freelist
481  * node (struct memlist) must fit into the smallest allocatable
482  * memory chunk (ecache_alignsize bytes).
483  *
484  * The memory interval [base, end] passed to ndata_alloc_init() must be
485  * bzero'd to allow the allocator to return bzero'd memory easily.
486  */
487 void
488 ndata_alloc_init(struct memlist *ndata, uintptr_t base, uintptr_t end)
489 {
490 	ASSERT(sizeof (struct memlist) <= ecache_alignsize);
491 
492 	base = roundup(base, ecache_alignsize);
493 	end = end - end % ecache_alignsize;
494 
495 	ASSERT(base < end);
496 
497 	ndata->ml_address = base;
498 	ndata->ml_size = end - base;
499 	ndata->ml_next = NULL;
500 	ndata->ml_prev = NULL;
501 }
502 
503 /*
504  * Deliver the size of the largest free memory chunk.
505  */
506 size_t
507 ndata_maxsize(struct memlist *ndata)
508 {
509 	size_t chunksize = ndata->ml_size;
510 
511 	while ((ndata = ndata->ml_next) != NULL) {
512 		if (chunksize < ndata->ml_size)
513 			chunksize = ndata->ml_size;
514 	}
515 
516 	return (chunksize);
517 }
518 
519 
520 /*
521  * Allocate the last properly aligned memory chunk.
522  * This function is called when no more large nucleus memory chunks
523  * will be allocated.  The remaining free nucleus memory at the end
524  * of the nucleus can be added to the phys_avail list.
525  */
526 void *
527 ndata_extra_base(struct memlist *ndata, size_t alignment, caddr_t endaddr)
528 {
529 	uintptr_t base;
530 	size_t wasteage = 0;
531 #ifdef	DEBUG
532 	static int called = 0;
533 
534 	if (called++ > 0)
535 		cmn_err(CE_PANIC, "ndata_extra_base() called more than once");
536 #endif /* DEBUG */
537 
538 	/*
539 	 * The alignment needs to be a multiple of ecache_alignsize.
540 	 */
541 	ASSERT((alignment % ecache_alignsize) ==  0);
542 
543 	while (ndata->ml_next != NULL) {
544 		wasteage += ndata->ml_size;
545 		ndata = ndata->ml_next;
546 	}
547 
548 	base = roundup(ndata->ml_address, alignment);
549 
550 	if (base >= ndata->ml_address + ndata->ml_size)
551 		return (NULL);
552 
553 	if ((caddr_t)(ndata->ml_address + ndata->ml_size) != endaddr) {
554 #ifdef DEBUG
555 		ndata_middle_hole_detected = 1;	/* see if we hit this again */
556 #endif
557 		return (NULL);
558 	}
559 
560 	if (base == ndata->ml_address) {
561 		if (ndata->ml_prev != NULL)
562 			ndata->ml_prev->ml_next = NULL;
563 		else
564 			ndata->ml_size = 0;
565 
566 		bzero((void *)base, sizeof (struct memlist));
567 
568 	} else {
569 		ndata->ml_size = base - ndata->ml_address;
570 		wasteage += ndata->ml_size;
571 	}
572 	PRM_DEBUG(wasteage);
573 
574 	return ((void *)base);
575 }
576 
577 /*
578  * Select the best matching buffer, avoid memory fragmentation.
579  */
580 static struct memlist *
581 ndata_select_chunk(struct memlist *ndata, size_t wanted, size_t alignment)
582 {
583 	struct memlist *fnd_below = NULL;
584 	struct memlist *fnd_above = NULL;
585 	struct memlist *fnd_unused = NULL;
586 	struct memlist *frlist;
587 	uintptr_t base;
588 	uintptr_t end;
589 	size_t below;
590 	size_t above;
591 	size_t unused;
592 	size_t best_below = ULONG_MAX;
593 	size_t best_above = ULONG_MAX;
594 	size_t best_unused = ULONG_MAX;
595 
596 	ASSERT(ndata != NULL);
597 
598 	/*
599 	 * Look for the best matching buffer, avoid memory fragmentation.
600 	 * The following strategy is used, try to find
601 	 *   1. an exact fitting buffer
602 	 *   2. avoid wasting any space below the buffer, take first
603 	 *	fitting buffer
604 	 *   3. avoid wasting any space above the buffer, take first
605 	 *	fitting buffer
606 	 *   4. avoid wasting space, take first fitting buffer
607 	 *   5. take the last buffer in chain
608 	 */
609 	for (frlist = ndata; frlist != NULL; frlist = frlist->ml_next) {
610 		base = roundup(frlist->ml_address, alignment);
611 		end = roundup(base + wanted, ecache_alignsize);
612 
613 		if (end > frlist->ml_address + frlist->ml_size)
614 			continue;
615 
616 		below = (base - frlist->ml_address) / ecache_alignsize;
617 		above = (frlist->ml_address + frlist->ml_size - end) /
618 		    ecache_alignsize;
619 		unused = below + above;
620 
621 		if (unused == 0)
622 			return (frlist);
623 
624 		if (frlist->ml_next == NULL)
625 			break;
626 
627 		if (below < best_below) {
628 			best_below = below;
629 			fnd_below = frlist;
630 		}
631 
632 		if (above < best_above) {
633 			best_above = above;
634 			fnd_above = frlist;
635 		}
636 
637 		if (unused < best_unused) {
638 			best_unused = unused;
639 			fnd_unused = frlist;
640 		}
641 	}
642 
643 	if (best_below == 0)
644 		return (fnd_below);
645 	if (best_above == 0)
646 		return (fnd_above);
647 	if (best_unused < ULONG_MAX)
648 		return (fnd_unused);
649 
650 	return (frlist);
651 }
652 
653 /*
654  * Nucleus data memory allocator.
655  * The granularity of the allocator is ecache_alignsize.
656  * See also comment for ndata_alloc_init().
657  */
658 void *
659 ndata_alloc(struct memlist *ndata, size_t wanted, size_t alignment)
660 {
661 	struct memlist *found;
662 	struct memlist *fnd_above;
663 	uintptr_t base;
664 	uintptr_t end;
665 	size_t below;
666 	size_t above;
667 
668 	/*
669 	 * Look for the best matching buffer, avoid memory fragmentation.
670 	 */
671 	if ((found = ndata_select_chunk(ndata, wanted, alignment)) == NULL)
672 		return (NULL);
673 
674 	/*
675 	 * Allocate the nucleus data buffer.
676 	 */
677 	base = roundup(found->ml_address, alignment);
678 	end = roundup(base + wanted, ecache_alignsize);
679 	ASSERT(end <= found->ml_address + found->ml_size);
680 
681 	below = base - found->ml_address;
682 	above = found->ml_address + found->ml_size - end;
683 	ASSERT(above == 0 || (above % ecache_alignsize) == 0);
684 
685 	if (below >= ecache_alignsize) {
686 		/*
687 		 * There is free memory below the allocated memory chunk.
688 		 */
689 		found->ml_size = below - below % ecache_alignsize;
690 
691 		if (above) {
692 			fnd_above = (struct memlist *)end;
693 			fnd_above->ml_address = end;
694 			fnd_above->ml_size = above;
695 
696 			if ((fnd_above->ml_next = found->ml_next) != NULL)
697 				found->ml_next->ml_prev = fnd_above;
698 			fnd_above->ml_prev = found;
699 			found->ml_next = fnd_above;
700 		}
701 
702 		return ((void *)base);
703 	}
704 
705 	if (found->ml_prev == NULL) {
706 		/*
707 		 * The first chunk (ndata) is selected.
708 		 */
709 		ASSERT(found == ndata);
710 		if (above) {
711 			found->ml_address = end;
712 			found->ml_size = above;
713 		} else if (found->ml_next != NULL) {
714 			found->ml_address = found->ml_next->ml_address;
715 			found->ml_size = found->ml_next->ml_size;
716 			if ((found->ml_next = found->ml_next->ml_next) != NULL)
717 				found->ml_next->ml_prev = found;
718 
719 			bzero((void *)found->ml_address,
720 			    sizeof (struct memlist));
721 		} else {
722 			found->ml_address = end;
723 			found->ml_size = 0;
724 		}
725 
726 		return ((void *)base);
727 	}
728 
729 	/*
730 	 * Not the first chunk.
731 	 */
732 	if (above) {
733 		fnd_above = (struct memlist *)end;
734 		fnd_above->ml_address = end;
735 		fnd_above->ml_size = above;
736 
737 		if ((fnd_above->ml_next = found->ml_next) != NULL)
738 			fnd_above->ml_next->ml_prev = fnd_above;
739 		fnd_above->ml_prev = found->ml_prev;
740 		found->ml_prev->ml_next = fnd_above;
741 
742 	} else {
743 		if ((found->ml_prev->ml_next = found->ml_next) != NULL)
744 			found->ml_next->ml_prev = found->ml_prev;
745 	}
746 
747 	bzero((void *)found->ml_address, sizeof (struct memlist));
748 
749 	return ((void *)base);
750 }
751 
752 /*
753  * Size the kernel TSBs based upon the amount of physical
754  * memory in the system.
755  */
756 static void
757 calc_tsb_sizes(pgcnt_t npages)
758 {
759 	PRM_DEBUG(npages);
760 
761 	if (npages <= TSB_FREEMEM_MIN) {
762 		ktsb_szcode = TSB_128K_SZCODE;
763 		enable_bigktsb = 0;
764 	} else if (npages <= TSB_FREEMEM_LARGE / 2) {
765 		ktsb_szcode = TSB_256K_SZCODE;
766 		enable_bigktsb = 0;
767 	} else if (npages <= TSB_FREEMEM_LARGE) {
768 		ktsb_szcode = TSB_512K_SZCODE;
769 		enable_bigktsb = 0;
770 	} else if (npages <= TSB_FREEMEM_LARGE * 2 ||
771 	    enable_bigktsb == 0) {
772 		ktsb_szcode = TSB_1M_SZCODE;
773 		enable_bigktsb = 0;
774 	} else {
775 		ktsb_szcode = highbit(npages - 1);
776 		ktsb_szcode -= TSB_START_SIZE;
777 		ktsb_szcode = MAX(ktsb_szcode, MIN_BIGKTSB_SZCODE);
778 		ktsb_szcode = MIN(ktsb_szcode, MAX_BIGKTSB_SZCODE);
779 	}
780 
781 	/*
782 	 * We choose the TSB to hold kernel 4M mappings to have twice
783 	 * the reach as the primary kernel TSB since this TSB will
784 	 * potentially (currently) be shared by both mappings to all of
785 	 * physical memory plus user TSBs. If this TSB has to be in nucleus
786 	 * (only for Spitfire and Cheetah) limit its size to 64K.
787 	 */
788 	ktsb4m_szcode = highbit((2 * npages) / TTEPAGES(TTE4M) - 1);
789 	ktsb4m_szcode -= TSB_START_SIZE;
790 	ktsb4m_szcode = MAX(ktsb4m_szcode, TSB_MIN_SZCODE);
791 	ktsb4m_szcode = MIN(ktsb4m_szcode, TSB_SOFTSZ_MASK);
792 	if ((enable_bigktsb == 0 || ktsb_phys == 0) && ktsb4m_szcode >
793 	    TSB_64K_SZCODE) {
794 		ktsb4m_szcode = TSB_64K_SZCODE;
795 		max_bootlp_tteszc = TTE8K;
796 	}
797 
798 	ktsb_sz = TSB_BYTES(ktsb_szcode);	/* kernel 8K tsb size */
799 	ktsb4m_sz = TSB_BYTES(ktsb4m_szcode);	/* kernel 4M tsb size */
800 }
801 
802 /*
803  * Allocate kernel TSBs from nucleus data memory.
804  * The function return 0 on success and -1 on failure.
805  */
806 int
807 ndata_alloc_tsbs(struct memlist *ndata, pgcnt_t npages)
808 {
809 	/*
810 	 * Set ktsb_phys to 1 if the processor supports ASI_QUAD_LDD_PHYS.
811 	 */
812 	(void) sfmmu_setup_4lp();
813 
814 	/*
815 	 * Size the kernel TSBs based upon the amount of physical
816 	 * memory in the system.
817 	 */
818 	calc_tsb_sizes(npages);
819 
820 	/*
821 	 * Allocate the 8K kernel TSB if it belongs inside the nucleus.
822 	 */
823 	if (enable_bigktsb == 0) {
824 		if ((ktsb_base = ndata_alloc(ndata, ktsb_sz, ktsb_sz)) == NULL)
825 			return (-1);
826 		ASSERT(!((uintptr_t)ktsb_base & (ktsb_sz - 1)));
827 
828 		PRM_DEBUG(ktsb_base);
829 		PRM_DEBUG(ktsb_sz);
830 		PRM_DEBUG(ktsb_szcode);
831 	}
832 
833 	/*
834 	 * Next, allocate 4M kernel TSB from the nucleus since it's small.
835 	 */
836 	if (ktsb4m_szcode <= TSB_64K_SZCODE) {
837 
838 		ktsb4m_base = ndata_alloc(ndata, ktsb4m_sz, ktsb4m_sz);
839 		if (ktsb4m_base == NULL)
840 			return (-1);
841 		ASSERT(!((uintptr_t)ktsb4m_base & (ktsb4m_sz - 1)));
842 
843 		PRM_DEBUG(ktsb4m_base);
844 		PRM_DEBUG(ktsb4m_sz);
845 		PRM_DEBUG(ktsb4m_szcode);
846 	}
847 
848 	return (0);
849 }
850 
851 size_t
852 calc_hmehash_sz(pgcnt_t npages)
853 {
854 	ulong_t hme_buckets;
855 
856 	/*
857 	 * The number of buckets in the hme hash tables
858 	 * is a power of 2 such that the average hash chain length is
859 	 * HMENT_HASHAVELEN.  The number of buckets for the user hash is
860 	 * a function of physical memory and a predefined overmapping factor.
861 	 * The number of buckets for the kernel hash is a function of
862 	 * physical memory only.
863 	 */
864 	hme_buckets = (npages * HMEHASH_FACTOR) /
865 	    (HMENT_HASHAVELEN * (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT));
866 
867 	uhmehash_num = (int)MIN(hme_buckets, MAX_UHME_BUCKETS);
868 
869 	if (uhmehash_num > USER_BUCKETS_THRESHOLD) {
870 		/*
871 		 * if uhmehash_num is not power of 2 round it down to the
872 		 *  next power of 2.
873 		 */
874 		uint_t align = 1 << (highbit(uhmehash_num - 1) - 1);
875 		uhmehash_num = P2ALIGN(uhmehash_num, align);
876 	} else
877 		uhmehash_num = 1 << highbit(uhmehash_num - 1);
878 
879 	hme_buckets = npages / (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT);
880 	khmehash_num = (int)MIN(hme_buckets, MAX_KHME_BUCKETS);
881 	khmehash_num = 1 << highbit(khmehash_num - 1);
882 	khmehash_num = MAX(khmehash_num, MIN_KHME_BUCKETS);
883 
884 	return ((uhmehash_num + khmehash_num) * sizeof (struct hmehash_bucket));
885 }
886 
887 caddr_t
888 alloc_hmehash(caddr_t alloc_base)
889 {
890 	size_t khmehash_sz, uhmehash_sz;
891 
892 	khme_hash = (struct hmehash_bucket *)alloc_base;
893 	khmehash_sz = khmehash_num * sizeof (struct hmehash_bucket);
894 	alloc_base += khmehash_sz;
895 
896 	uhme_hash = (struct hmehash_bucket *)alloc_base;
897 	uhmehash_sz = uhmehash_num * sizeof (struct hmehash_bucket);
898 	alloc_base += uhmehash_sz;
899 
900 	PRM_DEBUG(khme_hash);
901 	PRM_DEBUG(uhme_hash);
902 
903 	return (alloc_base);
904 }
905 
906 /*
907  * Allocate hat structs from the nucleus data memory.
908  */
909 int
910 ndata_alloc_hat(struct memlist *ndata, pgcnt_t npages)
911 {
912 	size_t	mml_alloc_sz;
913 	size_t	cb_alloc_sz;
914 
915 	/*
916 	 * For the page mapping list mutex array we allocate one mutex
917 	 * for every 128 pages (1 MB) with a minimum of 64 entries and
918 	 * a maximum of 8K entries. For the initial computation npages
919 	 * is rounded up (ie. 1 << highbit(npages * 1.5 / 128))
920 	 *
921 	 * mml_shift is roughly log2(mml_table_sz) + 3 for MLIST_HASH
922 	 */
923 	mml_table_sz = 1 << highbit((npages * 3) / 256);
924 	if (mml_table_sz < 64)
925 		mml_table_sz = 64;
926 	else if (mml_table_sz > 8192)
927 		mml_table_sz = 8192;
928 	mml_shift = highbit(mml_table_sz) + 3;
929 
930 	PRM_DEBUG(mml_table_sz);
931 	PRM_DEBUG(mml_shift);
932 
933 	mml_alloc_sz = mml_table_sz * sizeof (kmutex_t);
934 
935 	mml_table = ndata_alloc(ndata, mml_alloc_sz, ecache_alignsize);
936 	if (mml_table == NULL)
937 		return (-1);
938 	PRM_DEBUG(mml_table);
939 
940 	cb_alloc_sz = sfmmu_max_cb_id * sizeof (struct sfmmu_callback);
941 	PRM_DEBUG(cb_alloc_sz);
942 	sfmmu_cb_table = ndata_alloc(ndata, cb_alloc_sz, ecache_alignsize);
943 	if (sfmmu_cb_table == NULL)
944 		return (-1);
945 	PRM_DEBUG(sfmmu_cb_table);
946 
947 	return (0);
948 }
949 
950 int
951 ndata_alloc_kpm(struct memlist *ndata, pgcnt_t kpm_npages)
952 {
953 	size_t	kpmp_alloc_sz;
954 
955 	/*
956 	 * For the kpm_page mutex array we allocate one mutex every 16
957 	 * kpm pages (64MB). In smallpage mode we allocate one mutex
958 	 * every 8K pages. The minimum is set to 64 entries and the
959 	 * maximum to 8K entries.
960 	 */
961 	if (kpm_smallpages == 0) {
962 		kpmp_shift = highbit(sizeof (kpm_page_t)) - 1;
963 		kpmp_table_sz = 1 << highbit(kpm_npages / 16);
964 		kpmp_table_sz = (kpmp_table_sz < 64) ? 64 :
965 		    ((kpmp_table_sz > 8192) ? 8192 : kpmp_table_sz);
966 		kpmp_alloc_sz = kpmp_table_sz * sizeof (kpm_hlk_t);
967 
968 		kpmp_table = ndata_alloc(ndata, kpmp_alloc_sz,
969 		    ecache_alignsize);
970 		if (kpmp_table == NULL)
971 			return (-1);
972 
973 		PRM_DEBUG(kpmp_table);
974 		PRM_DEBUG(kpmp_table_sz);
975 
976 		kpmp_stable_sz = 0;
977 		kpmp_stable = NULL;
978 	} else {
979 		ASSERT(kpm_pgsz == PAGESIZE);
980 		kpmp_shift = highbit(sizeof (kpm_shlk_t)) + 1;
981 		kpmp_stable_sz = 1 << highbit(kpm_npages / 8192);
982 		kpmp_stable_sz = (kpmp_stable_sz < 64) ? 64 :
983 		    ((kpmp_stable_sz > 8192) ? 8192 : kpmp_stable_sz);
984 		kpmp_alloc_sz = kpmp_stable_sz * sizeof (kpm_shlk_t);
985 
986 		kpmp_stable = ndata_alloc(ndata, kpmp_alloc_sz,
987 		    ecache_alignsize);
988 		if (kpmp_stable == NULL)
989 			return (-1);
990 
991 		PRM_DEBUG(kpmp_stable);
992 		PRM_DEBUG(kpmp_stable_sz);
993 
994 		kpmp_table_sz = 0;
995 		kpmp_table = NULL;
996 	}
997 	PRM_DEBUG(kpmp_shift);
998 
999 	return (0);
1000 }
1001 
1002 /*
1003  * This function bop allocs kernel TSBs.
1004  */
1005 caddr_t
1006 sfmmu_ktsb_alloc(caddr_t tsbbase)
1007 {
1008 	caddr_t vaddr;
1009 
1010 	if (enable_bigktsb) {
1011 		ktsb_base = (caddr_t)roundup((uintptr_t)tsbbase, ktsb_sz);
1012 		vaddr = prom_alloc(ktsb_base, ktsb_sz, ktsb_sz);
1013 		if (vaddr != ktsb_base)
1014 			cmn_err(CE_PANIC, "sfmmu_ktsb_alloc: can't alloc"
1015 			    " 8K bigktsb");
1016 		ktsb_base = vaddr;
1017 		tsbbase = ktsb_base + ktsb_sz;
1018 		PRM_DEBUG(ktsb_base);
1019 		PRM_DEBUG(tsbbase);
1020 	}
1021 
1022 	if (ktsb4m_szcode > TSB_64K_SZCODE) {
1023 		ASSERT(ktsb_phys && enable_bigktsb);
1024 		ktsb4m_base = (caddr_t)roundup((uintptr_t)tsbbase, ktsb4m_sz);
1025 		vaddr = (caddr_t)BOP_ALLOC(bootops, ktsb4m_base, ktsb4m_sz,
1026 		    ktsb4m_sz);
1027 		if (vaddr != ktsb4m_base)
1028 			cmn_err(CE_PANIC, "sfmmu_ktsb_alloc: can't alloc"
1029 			    " 4M bigktsb");
1030 		ktsb4m_base = vaddr;
1031 		tsbbase = ktsb4m_base + ktsb4m_sz;
1032 		PRM_DEBUG(ktsb4m_base);
1033 		PRM_DEBUG(tsbbase);
1034 	}
1035 	return (tsbbase);
1036 }
1037 
1038 /*
1039  * Moves code assembled outside of the trap table into the trap
1040  * table taking care to relocate relative branches to code outside
1041  * of the trap handler.
1042  */
1043 static void
1044 sfmmu_reloc_trap_handler(void *tablep, void *start, size_t count)
1045 {
1046 	size_t i;
1047 	uint32_t *src;
1048 	uint32_t *dst;
1049 	uint32_t inst;
1050 	int op, op2;
1051 	int32_t offset;
1052 	int disp;
1053 
1054 	src = start;
1055 	dst = tablep;
1056 	offset = src - dst;
1057 	for (src = start, i = 0; i < count; i++, src++, dst++) {
1058 		inst = *dst = *src;
1059 		op = (inst >> 30) & 0x2;
1060 		if (op == 1) {
1061 			/* call */
1062 			disp = ((int32_t)inst << 2) >> 2; /* sign-extend */
1063 			if (disp + i >= 0 && disp + i < count)
1064 				continue;
1065 			disp += offset;
1066 			inst = 0x40000000u | (disp & 0x3fffffffu);
1067 			*dst = inst;
1068 		} else if (op == 0) {
1069 			/* branch or sethi */
1070 			op2 = (inst >> 22) & 0x7;
1071 
1072 			switch (op2) {
1073 			case 0x3: /* BPr */
1074 				disp = (((inst >> 20) & 0x3) << 14) |
1075 				    (inst & 0x3fff);
1076 				disp = (disp << 16) >> 16; /* sign-extend */
1077 				if (disp + i >= 0 && disp + i < count)
1078 					continue;
1079 				disp += offset;
1080 				if (((disp << 16) >> 16) != disp)
1081 					cmn_err(CE_PANIC, "bad reloc");
1082 				inst &= ~0x303fff;
1083 				inst |= (disp & 0x3fff);
1084 				inst |= (disp & 0xc000) << 6;
1085 				break;
1086 
1087 			case 0x2: /* Bicc */
1088 				disp = ((int32_t)inst << 10) >> 10;
1089 				if (disp + i >= 0 && disp + i < count)
1090 					continue;
1091 				disp += offset;
1092 				if (((disp << 10) >> 10) != disp)
1093 					cmn_err(CE_PANIC, "bad reloc");
1094 				inst &= ~0x3fffff;
1095 				inst |= (disp & 0x3fffff);
1096 				break;
1097 
1098 			case 0x1: /* Bpcc */
1099 				disp = ((int32_t)inst << 13) >> 13;
1100 				if (disp + i >= 0 && disp + i < count)
1101 					continue;
1102 				disp += offset;
1103 				if (((disp << 13) >> 13) != disp)
1104 					cmn_err(CE_PANIC, "bad reloc");
1105 				inst &= ~0x7ffff;
1106 				inst |= (disp & 0x7ffffu);
1107 				break;
1108 			}
1109 			*dst = inst;
1110 		}
1111 	}
1112 	flush_instr_mem(tablep, count * sizeof (uint32_t));
1113 }
1114 
1115 /*
1116  * Routine to allocate a large page to use in the TSB caches.
1117  */
1118 /*ARGSUSED*/
1119 static page_t *
1120 sfmmu_tsb_page_create(void *addr, size_t size, int vmflag, void *arg)
1121 {
1122 	int pgflags;
1123 
1124 	pgflags = PG_EXCL;
1125 	if ((vmflag & VM_NOSLEEP) == 0)
1126 		pgflags |= PG_WAIT;
1127 	if (vmflag & VM_PANIC)
1128 		pgflags |= PG_PANIC;
1129 	if (vmflag & VM_PUSHPAGE)
1130 		pgflags |= PG_PUSHPAGE;
1131 
1132 	return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size,
1133 	    pgflags, &kvseg, addr, arg));
1134 }
1135 
1136 /*
1137  * Allocate a large page to back the virtual address range
1138  * [addr, addr + size).  If addr is NULL, allocate the virtual address
1139  * space as well.
1140  */
1141 static void *
1142 sfmmu_tsb_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
1143     uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
1144     void *pcarg)
1145 {
1146 	page_t *ppl;
1147 	page_t *rootpp;
1148 	caddr_t addr = inaddr;
1149 	pgcnt_t npages = btopr(size);
1150 	page_t **ppa;
1151 	int i = 0;
1152 
1153 	/*
1154 	 * Assuming that only TSBs will call this with size > PAGESIZE
1155 	 * There is no reason why this couldn't be expanded to 8k pages as
1156 	 * well, or other page sizes in the future .... but for now, we
1157 	 * only support fixed sized page requests.
1158 	 */
1159 	if ((inaddr == NULL) && ((addr = vmem_xalloc(vmp, size, size, 0, 0,
1160 	    NULL, NULL, vmflag)) == NULL))
1161 		return (NULL);
1162 
1163 	if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
1164 		if (inaddr == NULL)
1165 			vmem_xfree(vmp, addr, size);
1166 		return (NULL);
1167 	}
1168 
1169 	ppl = page_create_func(addr, size, vmflag, pcarg);
1170 	if (ppl == NULL) {
1171 		if (inaddr == NULL)
1172 			vmem_xfree(vmp, addr, size);
1173 		page_unresv(npages);
1174 		return (NULL);
1175 	}
1176 
1177 	rootpp = ppl;
1178 	ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1179 	while (ppl != NULL) {
1180 		page_t *pp = ppl;
1181 		ppa[i++] = pp;
1182 		page_sub(&ppl, pp);
1183 		ASSERT(page_iolock_assert(pp));
1184 		page_io_unlock(pp);
1185 	}
1186 
1187 	/*
1188 	 * Load the locked entry.  It's OK to preload the entry into
1189 	 * the TSB since we now support large mappings in the kernel TSB.
1190 	 */
1191 	hat_memload_array(kas.a_hat, (caddr_t)rootpp->p_offset, size,
1192 	    ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, HAT_LOAD_LOCK);
1193 
1194 	for (--i; i >= 0; --i) {
1195 		(void) page_pp_lock(ppa[i], 0, 1);
1196 		page_unlock(ppa[i]);
1197 	}
1198 
1199 	kmem_free(ppa, npages * sizeof (page_t *));
1200 	return (addr);
1201 }
1202 
1203 /* Called to import new spans into the TSB vmem arenas */
1204 void *
1205 sfmmu_tsb_segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
1206 {
1207 	lgrp_id_t lgrpid = LGRP_NONE;
1208 
1209 	if (tsb_lgrp_affinity) {
1210 		/*
1211 		 * Search for the vmp->lgrpid mapping by brute force;
1212 		 * some day vmp will have an lgrp, until then we have
1213 		 * to do this the hard way.
1214 		 */
1215 		for (lgrpid = 0; lgrpid < NLGRPS_MAX &&
1216 		    vmp != kmem_tsb_default_arena[lgrpid]; lgrpid++)
1217 			;
1218 		if (lgrpid == NLGRPS_MAX)
1219 			lgrpid = LGRP_NONE;
1220 	}
1221 
1222 	return (sfmmu_tsb_xalloc(vmp, NULL, size, vmflag, 0,
1223 	    sfmmu_tsb_page_create, lgrpid != LGRP_NONE? &lgrpid : NULL));
1224 }
1225 
1226 /* Called to free spans from the TSB vmem arenas */
1227 void
1228 sfmmu_tsb_segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
1229 {
1230 	page_t *pp;
1231 	caddr_t addr = inaddr;
1232 	caddr_t eaddr;
1233 	pgcnt_t npages = btopr(size);
1234 	pgcnt_t pgs_left = npages;
1235 	page_t *rootpp = NULL;
1236 
1237 	hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1238 
1239 	for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
1240 		pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
1241 		if (pp == NULL)
1242 			panic("sfmmu_tsb_segkmem_free: page not found");
1243 
1244 		ASSERT(PAGE_EXCL(pp));
1245 		page_pp_unlock(pp, 0, 1);
1246 
1247 		if (rootpp == NULL)
1248 			rootpp = pp;
1249 		if (--pgs_left == 0) {
1250 			/*
1251 			 * similar logic to segspt_free_pages, but we know we
1252 			 * have one large page.
1253 			 */
1254 			page_destroy_pages(rootpp);
1255 		}
1256 	}
1257 	page_unresv(npages);
1258 
1259 	if (vmp != NULL)
1260 		vmem_xfree(vmp, inaddr, size);
1261 }
1262