xref: /titanic_41/usr/src/uts/sun4/vm/sfmmu.c (revision d5ace9454616652a717c9831d949dffa319381f9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <vm/hat.h>
28 #include <vm/hat_sfmmu.h>
29 #include <vm/page.h>
30 #include <sys/pte.h>
31 #include <sys/systm.h>
32 #include <sys/mman.h>
33 #include <sys/sysmacros.h>
34 #include <sys/machparam.h>
35 #include <sys/vtrace.h>
36 #include <sys/kmem.h>
37 #include <sys/mmu.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/cpuvar.h>
41 #include <sys/debug.h>
42 #include <sys/lgrp.h>
43 #include <sys/archsystm.h>
44 #include <sys/machsystm.h>
45 #include <sys/vmsystm.h>
46 #include <sys/bitmap.h>
47 #include <vm/as.h>
48 #include <vm/seg.h>
49 #include <vm/seg_kmem.h>
50 #include <vm/seg_kp.h>
51 #include <vm/seg_kpm.h>
52 #include <vm/rm.h>
53 #include <vm/vm_dep.h>
54 #include <sys/t_lock.h>
55 #include <sys/vm_machparam.h>
56 #include <sys/promif.h>
57 #include <sys/prom_isa.h>
58 #include <sys/prom_plat.h>
59 #include <sys/prom_debug.h>
60 #include <sys/privregs.h>
61 #include <sys/bootconf.h>
62 #include <sys/memlist.h>
63 #include <sys/memlist_plat.h>
64 #include <sys/cpu_module.h>
65 #include <sys/reboot.h>
66 #include <sys/kdi.h>
67 
/*
 * Static routines
 *
 * Forward declarations of helpers that are private to this file.
 */
static void	sfmmu_map_prom_mappings(struct translation *, size_t);
static struct translation *read_prom_mappings(size_t *);
static void	sfmmu_reloc_trap_handler(void *, void *, size_t);

/*
 * External routines
 *
 * Both are called from hat_kern_setup() below.
 */
extern void sfmmu_remap_kernel(void);
extern void sfmmu_patch_utsb(void);

/*
 * Global Data:
 */
extern caddr_t	textva, datava;
extern tte_t	ktext_tte, kdata_tte;	/* ttes for kernel text and data */
/*
 * When zero the 8K kernel TSB is allocated from nucleus data by
 * ndata_alloc_tsbs(); when non-zero it is allocated by sfmmu_ktsb_alloc().
 * calc_tsb_sizes() clears it whenever it selects a TSB of 1M or smaller.
 */
extern int	enable_bigktsb;
/*
 * When non-zero, the kmem64 special cases in va_to_pfn() and
 * sfmmu_map_prom_mappings() are bypassed.
 */
extern int	kmem64_smchunks;

uint64_t memsegspa = (uintptr_t)MSEG_NULLPTR_PA; /* memsegs physical linkage */
uint64_t memseg_phash[N_MEM_SLOTS];	/* use physical memseg addresses */

/* Set to 1 at the end of hat_kern_setup(). */
int	sfmmu_kern_mapped = 0;

/*
 * DMMU primary context register for the kernel context. Machine specific code
 * inserts correct page size codes when necessary
 */
uint64_t kcontextreg = KCONTEXT;

#ifdef DEBUG
/*
 * Set by ndata_extra_base() when the last free ndata chunk does not end
 * at the end address supplied by the caller.
 */
static int ndata_middle_hole_detected = 0;
#endif

/* Extern Global Data */

/* Asserted to be 0 while the prom mappings are being loaded. */
extern int page_relocate_ready;

/*
 * Controls the logic which enables the use of the
 * QUAD_LDD_PHYS ASI for TSB accesses.
 */
extern int	ktsb_phys;
113 
114 /*
115  * Global Routines called from within:
116  *	usr/src/uts/sun4u
117  *	usr/src/uts/sfmmu
118  *	usr/src/uts/sun
119  */
120 
121 pfn_t
122 va_to_pfn(void *vaddr)
123 {
124 	u_longlong_t physaddr;
125 	int mode, valid;
126 
127 	if (tba_taken_over)
128 		return (hat_getpfnum(kas.a_hat, (caddr_t)vaddr));
129 
130 #if !defined(C_OBP)
131 	if (!kmem64_smchunks &&
132 	    (caddr_t)vaddr >= kmem64_base && (caddr_t)vaddr < kmem64_end) {
133 		if (kmem64_pabase == (uint64_t)-1)
134 			prom_panic("va_to_pfn: kmem64_pabase not init");
135 		physaddr = kmem64_pabase + ((caddr_t)vaddr - kmem64_base);
136 		return ((pfn_t)physaddr >> MMU_PAGESHIFT);
137 	}
138 #endif	/* !C_OBP */
139 
140 	if ((prom_translate_virt(vaddr, &valid, &physaddr, &mode) != -1) &&
141 	    (valid == -1)) {
142 		return ((pfn_t)(physaddr >> MMU_PAGESHIFT));
143 	}
144 	return (PFN_INVALID);
145 }
146 
147 uint64_t
148 va_to_pa(void *vaddr)
149 {
150 	pfn_t pfn;
151 
152 	if ((pfn = va_to_pfn(vaddr)) == PFN_INVALID)
153 		return ((uint64_t)-1);
154 	return (((uint64_t)pfn << MMU_PAGESHIFT) |
155 	    ((uint64_t)vaddr & MMU_PAGEOFFSET));
156 }
157 
/*
 * Take over the kernel's address translations from the prom: patch the
 * TSB handling code, initialize the TSBs, convert the prom's translations
 * into kernel hat mappings, and finally mark the kernel as mapped.
 * The order of the calls below is significant; see the step list in the
 * function body.
 */
void
hat_kern_setup(void)
{
	struct translation *trans_root;
	size_t ntrans_root;
	extern void startup_fixup_physavail(void);

	/*
	 * These are the steps we take to take over the mmu from the prom.
	 *
	 * (1)	Read the prom's mappings through the translation property.
	 * (2)	Remap the kernel text and kernel data with 2 locked 4MB ttes.
	 *	Create the hmeblks for these 2 ttes at this time.
	 * (3)	Create hat structures for all other prom mappings.  Since the
	 *	kernel text and data hme_blks have already been created we
	 *	skip the equivalent prom's mappings.
	 * (4)	Initialize the tsb and its corresponding hardware regs.
	 * (5)	Take over the trap table (currently in startup).
	 * (6)	Up to this point it is possible the prom required some of its
	 *	locked tte's.  Now that we own the trap table we remove them.
	 */

	/* Record the physical addresses of both kernel TSBs. */
	ktsb_pbase = va_to_pa(ktsb_base);
	ktsb4m_pbase = va_to_pa(ktsb4m_base);
	PRM_DEBUG(ktsb_pbase);
	PRM_DEBUG(ktsb4m_pbase);

	sfmmu_patch_ktsb();
	sfmmu_patch_utsb();
	sfmmu_patch_mmu_asi(ktsb_phys);

	sfmmu_init_tsbs();

	if (kpm_enable) {
		sfmmu_kpm_patch_tlbm();
		if (kpm_smallpages == 0) {
			sfmmu_kpm_patch_tsbm();
		}
	}

	if (!shctx_on) {
		sfmmu_patch_shctx();
	}

	/*
	 * Only call the routine when its address is non-NULL.
	 * NOTE(review): presumably an optional (weak) platform hook --
	 * confirm against its declaration.
	 */
	if (&mmu_enable_pgsz_search) {
		mmu_enable_pgsz_search();
	}

	/*
	 * The 8K-indexed kernel TSB space is used to hold
	 * translations below...
	 * (read_prom_mappings() returns a buffer located at ktsb_base.)
	 */
	trans_root = read_prom_mappings(&ntrans_root);
	sfmmu_remap_kernel();
	startup_fixup_physavail();
	mmu_init_kernel_pgsz(kas.a_hat);
	sfmmu_map_prom_mappings(trans_root, ntrans_root);

	/*
	 * We invalidate 8K kernel TSB because we used it in
	 * sfmmu_map_prom_mappings()
	 */
	sfmmu_inv_tsb(ktsb_base, ktsb_sz);
	sfmmu_inv_tsb(ktsb4m_base, ktsb4m_sz);

	sfmmu_init_ktsbinfo();


	sfmmu_kern_mapped = 1;

	/*
	 * hments have been created for mapped pages, and thus we're ready
	 * for kmdb to start using its own trap table.  It walks the hments
	 * to resolve TLB misses, and can't be used until they're ready.
	 */
	if (boothowto & RB_DEBUG)
		kdi_dvec_vmready();
}
236 
/*
 * Macro used below to convert the prom's 32-bit high and low fields into
 * a value appropriate for the 64-bit kernel.  Both halves are first
 * narrowed to uint32_t so that no sign extension leaks into the result.
 */

#define	COMBINE(hi, lo) (((uint64_t)(uint32_t)(hi) << 32) | (uint32_t)(lo))

/*
 * Track large pages used.
 * Provides observability for this feature on non-debug kernels.
 * Indexed by page size code; within this file it is written only for the
 * kmem64 range at the end of sfmmu_map_prom_mappings().
 */
ulong_t map_prom_lpcount[MMU_PAGE_SIZES];
249 
/*
 * This function traverses the prom mapping list and creates equivalent
 * mappings in the sfmmu mapping hash.
 *
 *	trans_root	array of prom translation entries
 *	ntrans_root	number of entries in trans_root
 *
 * Entries whose virtual address is below KERNELBASE are skipped.  Every
 * other entry is walked in MMU_PAGESIZE steps and each page is loaded into
 * the kernel hat as a locked 8K mapping (without a TSB load), unless a
 * mapping for it already exists.  Any inconsistency that is detected is
 * reported through prom_panic().
 *
 * After the list has been processed, the kmem64 range is loaded separately
 * (when kmem64_smchunks is not set and kmem64_base is non-NULL).
 */
static void
sfmmu_map_prom_mappings(struct translation *trans_root, size_t ntrans_root)
{
	struct translation *promt;
	tte_t	tte, oldtte, *ttep;
	pfn_t	pfn, oldpfn, basepfn;
	caddr_t vaddr;
	size_t	size, offset;
	unsigned long i;
	uint_t	attr;
	page_t *pp;
	extern struct memlist *virt_avail;
	char buf[256];

	ttep = &tte;
	for (i = 0, promt = trans_root; i < ntrans_root; i++, promt++) {
		ASSERT(promt->tte_hi != 0);
		ASSERT32(promt->virt_hi == 0 && promt->size_hi == 0);

		vaddr = (caddr_t)COMBINE(promt->virt_hi, promt->virt_lo);

		/*
		 * hack until we get rid of map-for-unix
		 */
		if (vaddr < (caddr_t)KERNELBASE)
			continue;

		/* Work on a local copy of the prom's tte. */
		ttep->tte_inthi = promt->tte_hi;
		ttep->tte_intlo = promt->tte_lo;
		attr = PROC_DATA | HAT_NOSYNC;
#if defined(TTE_IS_GLOBAL)
		if (TTE_IS_GLOBAL(ttep)) {
			/*
			 * The prom better not use global translations
			 * because a user process might use the same
			 * virtual addresses
			 */
			prom_panic("sfmmu_map_prom_mappings: global"
			    " translation");
			/*
			 * NOTE(review): only reached if prom_panic()
			 * returns -- presumably it does not; confirm.
			 */
			TTE_SET_LOFLAGS(ttep, TTE_GLB_INT, 0);
		}
#endif
		if (TTE_IS_LOCKED(ttep)) {
			/* clear the lock bits */
			TTE_CLR_LOCKED(ttep);
		}
		/* Carry the prom tte's cacheability/side-effect/IE bits over. */
		attr |= (TTE_IS_VCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEVTTE;
		attr |= (TTE_IS_PCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEPTTE;
		attr |= (TTE_IS_SIDEFFECT(ttep)) ? SFMMU_SIDEFFECT : 0;
		attr |= (TTE_IS_IE(ttep)) ? HAT_STRUCTURE_LE : 0;

		/*
		 * size counts the bytes of this translation that are still
		 * to be processed; offset is the distance already covered
		 * from its first virtual address.
		 */
		size = COMBINE(promt->size_hi, promt->size_lo);
		offset = 0;
		basepfn = TTE_TO_PFN((caddr_t)COMBINE(promt->virt_hi,
		    promt->virt_lo), ttep);
		while (size) {
			vaddr = (caddr_t)(COMBINE(promt->virt_hi,
			    promt->virt_lo) + offset);

			/*
			 * make sure address is not in virt-avail list
			 */
			if (address_in_memlist(virt_avail, (uint64_t)vaddr,
			    size)) {
				prom_panic("sfmmu_map_prom_mappings:"
				    " inconsistent translation/avail lists");
			}

			/*
			 * Memory pages must be physically cacheable and
			 * non-memory pages must have the side-effect bit.
			 */
			pfn = basepfn + mmu_btop(offset);
			if (pf_is_memory(pfn)) {
				if (attr & SFMMU_UNCACHEPTTE) {
					prom_panic("sfmmu_map_prom_mappings:"
					    " uncached prom memory page");
				}
			} else {
				if (!(attr & SFMMU_SIDEFFECT)) {
					prom_panic("sfmmu_map_prom_mappings:"
					    " prom i/o page without"
					    " side-effect");
				}
			}

			/*
			 * skip kmem64 area
			 *
			 * Without C_OBP a prom mapping inside kmem64 is
			 * unexpected and panics.  With C_OBP the mapping
			 * must match kmem64_pabase, and the rest of the
			 * translation up to kmem64_aligned_end is skipped.
			 */
			if (!kmem64_smchunks &&
			    vaddr >= kmem64_base &&
			    vaddr < kmem64_aligned_end) {
#if !defined(C_OBP)
				prom_panic("sfmmu_map_prom_mappings:"
				    " unexpected kmem64 prom mapping");
#else	/* !C_OBP */
				size_t mapsz;

				if (ptob(pfn) !=
				    kmem64_pabase + (vaddr - kmem64_base)) {
					prom_panic("sfmmu_map_prom_mappings:"
					    " unexpected kmem64 prom mapping");
				}

				mapsz = kmem64_aligned_end - vaddr;
				if (mapsz >= size) {
					break;
				}
				size -= mapsz;
				offset += mapsz;
				continue;
#endif	/* !C_OBP */
			}

			oldpfn = sfmmu_vatopfn(vaddr, KHATID, &oldtte);
			ASSERT(oldpfn != PFN_SUSPENDED);
			ASSERT(page_relocate_ready == 0);

			if (oldpfn != PFN_INVALID) {
				/*
				 * mapping already exists.
				 * Verify they are equal
				 */
				if (pfn != oldpfn) {
					(void) snprintf(buf, sizeof (buf),
					"sfmmu_map_prom_mappings: mapping"
					" conflict (va = 0x%p, pfn = 0x%p,"
					" oldpfn = 0x%p)", (void *)vaddr,
					    (void *)pfn, (void *)oldpfn);
					prom_panic(buf);
				}
				size -= MMU_PAGESIZE;
				offset += MMU_PAGESIZE;
				continue;
			}

			/* A page the prom has mapped must not be free. */
			pp = page_numtopp_nolock(pfn);
			if ((pp != NULL) && PP_ISFREE((page_t *)pp)) {
				(void) snprintf(buf, sizeof (buf),
				"sfmmu_map_prom_mappings: prom-mapped"
				" page (va = 0x%p, pfn = 0x%p) on free list",
				    (void *)vaddr, (void *)pfn);
				prom_panic(buf);
			}

			/* Build an 8K tte for this page and load it locked. */
			sfmmu_memtte(ttep, pfn, attr, TTE8K);
			sfmmu_tteload(kas.a_hat, ttep, vaddr, pp,
			    HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD);
			size -= MMU_PAGESIZE;
			offset += MMU_PAGESIZE;
		}
	}

	/*
	 * We claimed kmem64 from prom, so now we need to load tte.
	 */
	if (!kmem64_smchunks && kmem64_base != NULL) {
		pgcnt_t pages;
		size_t psize;
		int pszc;

		pszc = kmem64_szc;
#ifdef sun4u
		if (pszc > TTE8K) {
			pszc = segkmem_lpszc;
		}
#endif	/* sun4u */
		/*
		 * psize is the byte size of one page of size code pszc and
		 * pages is that size expressed in base pages.
		 */
		psize = TTEBYTES(pszc);
		pages = btop(psize);
		basepfn = kmem64_pabase >> MMU_PAGESHIFT;
		vaddr = kmem64_base;
		while (vaddr < kmem64_end) {
			sfmmu_memtte(ttep, basepfn,
			    PROC_DATA | HAT_NOSYNC, pszc);
			sfmmu_tteload(kas.a_hat, ttep, vaddr, NULL,
			    HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD);
			vaddr += psize;
			basepfn += pages;
		}
		/* Number of pszc-sized pages spanned by the kmem64 range. */
		map_prom_lpcount[pszc] =
		    ((caddr_t)P2ROUNDUP((uintptr_t)kmem64_end, psize) -
		    kmem64_base) >> TTE_PAGE_SHIFT(pszc);
	}
}

#undef COMBINE	/* local to previous routine */
436 
437 /*
438  * This routine reads in the "translations" property in to a buffer and
439  * returns a pointer to this buffer and the number of translations.
440  */
441 static struct translation *
442 read_prom_mappings(size_t *ntransrootp)
443 {
444 	char *prop = "translations";
445 	size_t translen;
446 	pnode_t node;
447 	struct translation *transroot;
448 
449 	/*
450 	 * the "translations" property is associated with the mmu node
451 	 */
452 	node = (pnode_t)prom_getphandle(prom_mmu_ihandle());
453 
454 	/*
455 	 * We use the TSB space to read in the prom mappings.  This space
456 	 * is currently not being used because we haven't taken over the
457 	 * trap table yet.  It should be big enough to hold the mappings.
458 	 */
459 	if ((translen = prom_getproplen(node, prop)) == -1)
460 		cmn_err(CE_PANIC, "no translations property");
461 	*ntransrootp = translen / sizeof (*transroot);
462 	translen = roundup(translen, MMU_PAGESIZE);
463 	PRM_DEBUG(translen);
464 	if (translen > TSB_BYTES(ktsb_szcode))
465 		cmn_err(CE_PANIC, "not enough space for translations");
466 
467 	transroot = (struct translation *)ktsb_base;
468 	ASSERT(transroot);
469 	if (prom_getprop(node, prop, (caddr_t)transroot) == -1) {
470 		cmn_err(CE_PANIC, "translations getprop failed");
471 	}
472 	return (transroot);
473 }
474 
475 /*
476  * Init routine of the nucleus data memory allocator.
477  *
478  * The nucleus data memory allocator is organized in ecache_alignsize'd
479  * memory chunks. Memory allocated by ndata_alloc() will never be freed.
480  *
481  * The ndata argument is used as header of the ndata freelist.
482  * Other freelist nodes are placed in the nucleus memory itself
483  * at the beginning of a free memory chunk. Therefore a freelist
484  * node (struct memlist) must fit into the smallest allocatable
485  * memory chunk (ecache_alignsize bytes).
486  *
487  * The memory interval [base, end] passed to ndata_alloc_init() must be
488  * bzero'd to allow the allocator to return bzero'd memory easily.
489  */
490 void
491 ndata_alloc_init(struct memlist *ndata, uintptr_t base, uintptr_t end)
492 {
493 	ASSERT(sizeof (struct memlist) <= ecache_alignsize);
494 
495 	base = roundup(base, ecache_alignsize);
496 	end = end - end % ecache_alignsize;
497 
498 	ASSERT(base < end);
499 
500 	ndata->address = base;
501 	ndata->size = end - base;
502 	ndata->next = NULL;
503 	ndata->prev = NULL;
504 }
505 
506 /*
507  * Deliver the size of the largest free memory chunk.
508  */
509 size_t
510 ndata_maxsize(struct memlist *ndata)
511 {
512 	size_t chunksize = ndata->size;
513 
514 	while ((ndata = ndata->next) != NULL) {
515 		if (chunksize < ndata->size)
516 			chunksize = ndata->size;
517 	}
518 
519 	return (chunksize);
520 }
521 
522 
523 /*
524  * Allocate the last properly aligned memory chunk.
525  * This function is called when no more large nucleus memory chunks
526  * will be allocated.  The remaining free nucleus memory at the end
527  * of the nucleus can be added to the phys_avail list.
528  */
529 void *
530 ndata_extra_base(struct memlist *ndata, size_t alignment, caddr_t endaddr)
531 {
532 	uintptr_t base;
533 	size_t wasteage = 0;
534 #ifdef	DEBUG
535 	static int called = 0;
536 
537 	if (called++ > 0)
538 		cmn_err(CE_PANIC, "ndata_extra_base() called more than once");
539 #endif /* DEBUG */
540 
541 	/*
542 	 * The alignment needs to be a multiple of ecache_alignsize.
543 	 */
544 	ASSERT((alignment % ecache_alignsize) ==  0);
545 
546 	while (ndata->next != NULL) {
547 		wasteage += ndata->size;
548 		ndata = ndata->next;
549 	}
550 
551 	base = roundup(ndata->address, alignment);
552 
553 	if (base >= ndata->address + ndata->size)
554 		return (NULL);
555 
556 	if ((caddr_t)(ndata->address + ndata->size) != endaddr) {
557 #ifdef DEBUG
558 		ndata_middle_hole_detected = 1;	/* see if we hit this again */
559 #endif
560 		return (NULL);
561 	}
562 
563 	if (base == ndata->address) {
564 		if (ndata->prev != NULL)
565 			ndata->prev->next = NULL;
566 		else
567 			ndata->size = 0;
568 
569 		bzero((void *)base, sizeof (struct memlist));
570 
571 	} else {
572 		ndata->size = base - ndata->address;
573 		wasteage += ndata->size;
574 	}
575 	PRM_DEBUG(wasteage);
576 
577 	return ((void *)base);
578 }
579 
580 /*
581  * Select the best matching buffer, avoid memory fragmentation.
582  */
583 static struct memlist *
584 ndata_select_chunk(struct memlist *ndata, size_t wanted, size_t alignment)
585 {
586 	struct memlist *fnd_below = NULL;
587 	struct memlist *fnd_above = NULL;
588 	struct memlist *fnd_unused = NULL;
589 	struct memlist *frlist;
590 	uintptr_t base;
591 	uintptr_t end;
592 	size_t below;
593 	size_t above;
594 	size_t unused;
595 	size_t best_below = ULONG_MAX;
596 	size_t best_above = ULONG_MAX;
597 	size_t best_unused = ULONG_MAX;
598 
599 	ASSERT(ndata != NULL);
600 
601 	/*
602 	 * Look for the best matching buffer, avoid memory fragmentation.
603 	 * The following strategy is used, try to find
604 	 *   1. an exact fitting buffer
605 	 *   2. avoid wasting any space below the buffer, take first
606 	 *	fitting buffer
607 	 *   3. avoid wasting any space above the buffer, take first
608 	 *	fitting buffer
609 	 *   4. avoid wasting space, take first fitting buffer
610 	 *   5. take the last buffer in chain
611 	 */
612 	for (frlist = ndata; frlist != NULL; frlist = frlist->next) {
613 		base = roundup(frlist->address, alignment);
614 		end = roundup(base + wanted, ecache_alignsize);
615 
616 		if (end > frlist->address + frlist->size)
617 			continue;
618 
619 		below = (base - frlist->address) / ecache_alignsize;
620 		above = (frlist->address + frlist->size - end) /
621 		    ecache_alignsize;
622 		unused = below + above;
623 
624 		if (unused == 0)
625 			return (frlist);
626 
627 		if (frlist->next == NULL)
628 			break;
629 
630 		if (below < best_below) {
631 			best_below = below;
632 			fnd_below = frlist;
633 		}
634 
635 		if (above < best_above) {
636 			best_above = above;
637 			fnd_above = frlist;
638 		}
639 
640 		if (unused < best_unused) {
641 			best_unused = unused;
642 			fnd_unused = frlist;
643 		}
644 	}
645 
646 	if (best_below == 0)
647 		return (fnd_below);
648 	if (best_above == 0)
649 		return (fnd_above);
650 	if (best_unused < ULONG_MAX)
651 		return (fnd_unused);
652 
653 	return (frlist);
654 }
655 
/*
 * Nucleus data memory allocator.
 * The granularity of the allocator is ecache_alignsize.
 * See also comment for ndata_alloc_init().
 *
 *	ndata		head of the free list
 *	wanted		number of bytes requested
 *	alignment	required alignment of the returned address
 *
 * Returns the allocated address, or NULL if ndata_select_chunk() finds no
 * suitable chunk.  The selected chunk is split into up to three parts:
 * free space below the allocation, the allocation itself, and free space
 * above it; the free list is updated accordingly.
 */
void *
ndata_alloc(struct memlist *ndata, size_t wanted, size_t alignment)
{
	struct memlist *found;
	struct memlist *fnd_above;
	uintptr_t base;
	uintptr_t end;
	size_t below;
	size_t above;

	/*
	 * Look for the best matching buffer, avoid memory fragmentation.
	 */
	if ((found = ndata_select_chunk(ndata, wanted, alignment)) == NULL)
		return (NULL);

	/*
	 * Allocate the nucleus data buffer.
	 */
	base = roundup(found->address, alignment);
	end = roundup(base + wanted, ecache_alignsize);
	ASSERT(end <= found->address + found->size);

	/* Bytes of the chunk left free below and above [base, end). */
	below = base - found->address;
	above = found->address + found->size - end;
	ASSERT(above == 0 || (above % ecache_alignsize) == 0);

	if (below >= ecache_alignsize) {
		/*
		 * There is free memory below the allocated memory chunk.
		 * The existing node keeps describing that lower part.
		 */
		found->size = below - below % ecache_alignsize;

		if (above) {
			/*
			 * Create a new node at the start of the upper
			 * remainder and link it in right after found.
			 */
			fnd_above = (struct memlist *)end;
			fnd_above->address = end;
			fnd_above->size = above;

			if ((fnd_above->next = found->next) != NULL)
				found->next->prev = fnd_above;
			fnd_above->prev = found;
			found->next = fnd_above;
		}

		return ((void *)base);
	}

	if (found->prev == NULL) {
		/*
		 * The first chunk (ndata) is selected.
		 * The list head itself is updated in place.
		 */
		ASSERT(found == ndata);
		if (above) {
			/* The head now describes the upper remainder. */
			found->address = end;
			found->size = above;
		} else if (found->next != NULL) {
			/*
			 * Nothing remains of the first chunk: copy the
			 * second chunk's description into the head, unlink
			 * the second node, and zero the memory at the
			 * start of that chunk where its node was stored.
			 */
			found->address = found->next->address;
			found->size = found->next->size;
			if ((found->next = found->next->next) != NULL)
				found->next->prev = found;

			bzero((void *)found->address, sizeof (struct memlist));
		} else {
			/* That was the only chunk; the list is now empty. */
			found->address = end;
			found->size = 0;
		}

		return ((void *)base);
	}

	/*
	 * Not the first chunk.
	 */
	if (above) {
		/*
		 * Replace found in the list by a new node placed at the
		 * start of the upper remainder.
		 */
		fnd_above = (struct memlist *)end;
		fnd_above->address = end;
		fnd_above->size = above;

		if ((fnd_above->next = found->next) != NULL)
			fnd_above->next->prev = fnd_above;
		fnd_above->prev = found->prev;
		found->prev->next = fnd_above;

	} else {
		/* The chunk is used up completely; unlink it. */
		if ((found->prev->next = found->next) != NULL)
			found->next->prev = found->prev;
	}

	/*
	 * Zero the memory at the start of the chunk.  This must stay the
	 * last step: found's fields are still read above.
	 */
	bzero((void *)found->address, sizeof (struct memlist));

	return ((void *)base);
}
753 
754 /*
755  * Size the kernel TSBs based upon the amount of physical
756  * memory in the system.
757  */
758 static void
759 calc_tsb_sizes(pgcnt_t npages)
760 {
761 	PRM_DEBUG(npages);
762 
763 	if (npages <= TSB_FREEMEM_MIN) {
764 		ktsb_szcode = TSB_128K_SZCODE;
765 		enable_bigktsb = 0;
766 	} else if (npages <= TSB_FREEMEM_LARGE / 2) {
767 		ktsb_szcode = TSB_256K_SZCODE;
768 		enable_bigktsb = 0;
769 	} else if (npages <= TSB_FREEMEM_LARGE) {
770 		ktsb_szcode = TSB_512K_SZCODE;
771 		enable_bigktsb = 0;
772 	} else if (npages <= TSB_FREEMEM_LARGE * 2 ||
773 	    enable_bigktsb == 0) {
774 		ktsb_szcode = TSB_1M_SZCODE;
775 		enable_bigktsb = 0;
776 	} else {
777 		ktsb_szcode = highbit(npages - 1);
778 		ktsb_szcode -= TSB_START_SIZE;
779 		ktsb_szcode = MAX(ktsb_szcode, MIN_BIGKTSB_SZCODE);
780 		ktsb_szcode = MIN(ktsb_szcode, MAX_BIGKTSB_SZCODE);
781 	}
782 
783 	/*
784 	 * We choose the TSB to hold kernel 4M mappings to have twice
785 	 * the reach as the primary kernel TSB since this TSB will
786 	 * potentially (currently) be shared by both mappings to all of
787 	 * physical memory plus user TSBs. If this TSB has to be in nucleus
788 	 * (only for Spitfire and Cheetah) limit its size to 64K.
789 	 */
790 	ktsb4m_szcode = highbit((2 * npages) / TTEPAGES(TTE4M) - 1);
791 	ktsb4m_szcode -= TSB_START_SIZE;
792 	ktsb4m_szcode = MAX(ktsb4m_szcode, TSB_MIN_SZCODE);
793 	ktsb4m_szcode = MIN(ktsb4m_szcode, TSB_SOFTSZ_MASK);
794 	if ((enable_bigktsb == 0 || ktsb_phys == 0) && ktsb4m_szcode >
795 	    TSB_64K_SZCODE) {
796 		ktsb4m_szcode = TSB_64K_SZCODE;
797 		max_bootlp_tteszc = TTE8K;
798 	}
799 
800 	ktsb_sz = TSB_BYTES(ktsb_szcode);	/* kernel 8K tsb size */
801 	ktsb4m_sz = TSB_BYTES(ktsb4m_szcode);	/* kernel 4M tsb size */
802 }
803 
804 /*
805  * Allocate kernel TSBs from nucleus data memory.
806  * The function return 0 on success and -1 on failure.
807  */
808 int
809 ndata_alloc_tsbs(struct memlist *ndata, pgcnt_t npages)
810 {
811 	/*
812 	 * Set ktsb_phys to 1 if the processor supports ASI_QUAD_LDD_PHYS.
813 	 */
814 	sfmmu_setup_4lp();
815 
816 	/*
817 	 * Size the kernel TSBs based upon the amount of physical
818 	 * memory in the system.
819 	 */
820 	calc_tsb_sizes(npages);
821 
822 	/*
823 	 * Allocate the 8K kernel TSB if it belongs inside the nucleus.
824 	 */
825 	if (enable_bigktsb == 0) {
826 		if ((ktsb_base = ndata_alloc(ndata, ktsb_sz, ktsb_sz)) == NULL)
827 			return (-1);
828 		ASSERT(!((uintptr_t)ktsb_base & (ktsb_sz - 1)));
829 
830 		PRM_DEBUG(ktsb_base);
831 		PRM_DEBUG(ktsb_sz);
832 		PRM_DEBUG(ktsb_szcode);
833 	}
834 
835 	/*
836 	 * Next, allocate 4M kernel TSB from the nucleus since it's small.
837 	 */
838 	if (ktsb4m_szcode <= TSB_64K_SZCODE) {
839 
840 		ktsb4m_base = ndata_alloc(ndata, ktsb4m_sz, ktsb4m_sz);
841 		if (ktsb4m_base == NULL)
842 			return (-1);
843 		ASSERT(!((uintptr_t)ktsb4m_base & (ktsb4m_sz - 1)));
844 
845 		PRM_DEBUG(ktsb4m_base);
846 		PRM_DEBUG(ktsb4m_sz);
847 		PRM_DEBUG(ktsb4m_szcode);
848 	}
849 
850 	return (0);
851 }
852 
853 size_t
854 calc_hmehash_sz(pgcnt_t npages)
855 {
856 	ulong_t hme_buckets;
857 
858 	/*
859 	 * The number of buckets in the hme hash tables
860 	 * is a power of 2 such that the average hash chain length is
861 	 * HMENT_HASHAVELEN.  The number of buckets for the user hash is
862 	 * a function of physical memory and a predefined overmapping factor.
863 	 * The number of buckets for the kernel hash is a function of
864 	 * physical memory only.
865 	 */
866 	hme_buckets = (npages * HMEHASH_FACTOR) /
867 	    (HMENT_HASHAVELEN * (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT));
868 
869 	uhmehash_num = (int)MIN(hme_buckets, MAX_UHME_BUCKETS);
870 
871 	if (uhmehash_num > USER_BUCKETS_THRESHOLD) {
872 		/*
873 		 * if uhmehash_num is not power of 2 round it down to the
874 		 *  next power of 2.
875 		 */
876 		uint_t align = 1 << (highbit(uhmehash_num - 1) - 1);
877 		uhmehash_num = P2ALIGN(uhmehash_num, align);
878 	} else
879 		uhmehash_num = 1 << highbit(uhmehash_num - 1);
880 
881 	hme_buckets = npages / (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT);
882 	khmehash_num = (int)MIN(hme_buckets, MAX_KHME_BUCKETS);
883 	khmehash_num = 1 << highbit(khmehash_num - 1);
884 	khmehash_num = MAX(khmehash_num, MIN_KHME_BUCKETS);
885 
886 	return ((uhmehash_num + khmehash_num) * sizeof (struct hmehash_bucket));
887 }
888 
889 caddr_t
890 alloc_hmehash(caddr_t alloc_base)
891 {
892 	size_t khmehash_sz, uhmehash_sz;
893 
894 	khme_hash = (struct hmehash_bucket *)alloc_base;
895 	khmehash_sz = khmehash_num * sizeof (struct hmehash_bucket);
896 	alloc_base += khmehash_sz;
897 
898 	uhme_hash = (struct hmehash_bucket *)alloc_base;
899 	uhmehash_sz = uhmehash_num * sizeof (struct hmehash_bucket);
900 	alloc_base += uhmehash_sz;
901 
902 	PRM_DEBUG(khme_hash);
903 	PRM_DEBUG(uhme_hash);
904 
905 	return (alloc_base);
906 }
907 
908 /*
909  * Allocate hat structs from the nucleus data memory.
910  */
911 int
912 ndata_alloc_hat(struct memlist *ndata, pgcnt_t npages)
913 {
914 	size_t	mml_alloc_sz;
915 	size_t	cb_alloc_sz;
916 
917 	/*
918 	 * For the page mapping list mutex array we allocate one mutex
919 	 * for every 128 pages (1 MB) with a minimum of 64 entries and
920 	 * a maximum of 8K entries. For the initial computation npages
921 	 * is rounded up (ie. 1 << highbit(npages * 1.5 / 128))
922 	 *
923 	 * mml_shift is roughly log2(mml_table_sz) + 3 for MLIST_HASH
924 	 */
925 	mml_table_sz = 1 << highbit((npages * 3) / 256);
926 	if (mml_table_sz < 64)
927 		mml_table_sz = 64;
928 	else if (mml_table_sz > 8192)
929 		mml_table_sz = 8192;
930 	mml_shift = highbit(mml_table_sz) + 3;
931 
932 	PRM_DEBUG(mml_table_sz);
933 	PRM_DEBUG(mml_shift);
934 
935 	mml_alloc_sz = mml_table_sz * sizeof (kmutex_t);
936 
937 	mml_table = ndata_alloc(ndata, mml_alloc_sz, ecache_alignsize);
938 	if (mml_table == NULL)
939 		return (-1);
940 	PRM_DEBUG(mml_table);
941 
942 	cb_alloc_sz = sfmmu_max_cb_id * sizeof (struct sfmmu_callback);
943 	PRM_DEBUG(cb_alloc_sz);
944 	sfmmu_cb_table = ndata_alloc(ndata, cb_alloc_sz, ecache_alignsize);
945 	if (sfmmu_cb_table == NULL)
946 		return (-1);
947 	PRM_DEBUG(sfmmu_cb_table);
948 
949 	return (0);
950 }
951 
952 int
953 ndata_alloc_kpm(struct memlist *ndata, pgcnt_t kpm_npages)
954 {
955 	size_t	kpmp_alloc_sz;
956 
957 	/*
958 	 * For the kpm_page mutex array we allocate one mutex every 16
959 	 * kpm pages (64MB). In smallpage mode we allocate one mutex
960 	 * every 8K pages. The minimum is set to 64 entries and the
961 	 * maximum to 8K entries.
962 	 */
963 	if (kpm_smallpages == 0) {
964 		kpmp_shift = highbit(sizeof (kpm_page_t)) - 1;
965 		kpmp_table_sz = 1 << highbit(kpm_npages / 16);
966 		kpmp_table_sz = (kpmp_table_sz < 64) ? 64 :
967 		    ((kpmp_table_sz > 8192) ? 8192 : kpmp_table_sz);
968 		kpmp_alloc_sz = kpmp_table_sz * sizeof (kpm_hlk_t);
969 
970 		kpmp_table = ndata_alloc(ndata, kpmp_alloc_sz,
971 		    ecache_alignsize);
972 		if (kpmp_table == NULL)
973 			return (-1);
974 
975 		PRM_DEBUG(kpmp_table);
976 		PRM_DEBUG(kpmp_table_sz);
977 
978 		kpmp_stable_sz = 0;
979 		kpmp_stable = NULL;
980 	} else {
981 		ASSERT(kpm_pgsz == PAGESIZE);
982 		kpmp_shift = highbit(sizeof (kpm_shlk_t)) + 1;
983 		kpmp_stable_sz = 1 << highbit(kpm_npages / 8192);
984 		kpmp_stable_sz = (kpmp_stable_sz < 64) ? 64 :
985 		    ((kpmp_stable_sz > 8192) ? 8192 : kpmp_stable_sz);
986 		kpmp_alloc_sz = kpmp_stable_sz * sizeof (kpm_shlk_t);
987 
988 		kpmp_stable = ndata_alloc(ndata, kpmp_alloc_sz,
989 		    ecache_alignsize);
990 		if (kpmp_stable == NULL)
991 			return (-1);
992 
993 		PRM_DEBUG(kpmp_stable);
994 		PRM_DEBUG(kpmp_stable_sz);
995 
996 		kpmp_table_sz = 0;
997 		kpmp_table = NULL;
998 	}
999 	PRM_DEBUG(kpmp_shift);
1000 
1001 	return (0);
1002 }
1003 
1004 /*
1005  * This function bop allocs kernel TSBs.
1006  */
1007 caddr_t
1008 sfmmu_ktsb_alloc(caddr_t tsbbase)
1009 {
1010 	caddr_t vaddr;
1011 
1012 	if (enable_bigktsb) {
1013 		ktsb_base = (caddr_t)roundup((uintptr_t)tsbbase, ktsb_sz);
1014 		vaddr = prom_alloc(ktsb_base, ktsb_sz, ktsb_sz);
1015 		if (vaddr != ktsb_base)
1016 			cmn_err(CE_PANIC, "sfmmu_ktsb_alloc: can't alloc"
1017 			    " 8K bigktsb");
1018 		ktsb_base = vaddr;
1019 		tsbbase = ktsb_base + ktsb_sz;
1020 		PRM_DEBUG(ktsb_base);
1021 		PRM_DEBUG(tsbbase);
1022 	}
1023 
1024 	if (ktsb4m_szcode > TSB_64K_SZCODE) {
1025 		ASSERT(ktsb_phys && enable_bigktsb);
1026 		ktsb4m_base = (caddr_t)roundup((uintptr_t)tsbbase, ktsb4m_sz);
1027 		vaddr = (caddr_t)BOP_ALLOC(bootops, ktsb4m_base, ktsb4m_sz,
1028 		    ktsb4m_sz);
1029 		if (vaddr != ktsb4m_base)
1030 			cmn_err(CE_PANIC, "sfmmu_ktsb_alloc: can't alloc"
1031 			    " 4M bigktsb");
1032 		ktsb4m_base = vaddr;
1033 		tsbbase = ktsb4m_base + ktsb4m_sz;
1034 		PRM_DEBUG(ktsb4m_base);
1035 		PRM_DEBUG(tsbbase);
1036 	}
1037 	return (tsbbase);
1038 }
1039 
1040 /*
1041  * Moves code assembled outside of the trap table into the trap
1042  * table taking care to relocate relative branches to code outside
1043  * of the trap handler.
1044  */
1045 static void
1046 sfmmu_reloc_trap_handler(void *tablep, void *start, size_t count)
1047 {
1048 	size_t i;
1049 	uint32_t *src;
1050 	uint32_t *dst;
1051 	uint32_t inst;
1052 	int op, op2;
1053 	int32_t offset;
1054 	int disp;
1055 
1056 	src = start;
1057 	dst = tablep;
1058 	offset = src - dst;
1059 	for (src = start, i = 0; i < count; i++, src++, dst++) {
1060 		inst = *dst = *src;
1061 		op = (inst >> 30) & 0x2;
1062 		if (op == 1) {
1063 			/* call */
1064 			disp = ((int32_t)inst << 2) >> 2; /* sign-extend */
1065 			if (disp + i >= 0 && disp + i < count)
1066 				continue;
1067 			disp += offset;
1068 			inst = 0x40000000u | (disp & 0x3fffffffu);
1069 			*dst = inst;
1070 		} else if (op == 0) {
1071 			/* branch or sethi */
1072 			op2 = (inst >> 22) & 0x7;
1073 
1074 			switch (op2) {
1075 			case 0x3: /* BPr */
1076 				disp = (((inst >> 20) & 0x3) << 14) |
1077 				    (inst & 0x3fff);
1078 				disp = (disp << 16) >> 16; /* sign-extend */
1079 				if (disp + i >= 0 && disp + i < count)
1080 					continue;
1081 				disp += offset;
1082 				if (((disp << 16) >> 16) != disp)
1083 					cmn_err(CE_PANIC, "bad reloc");
1084 				inst &= ~0x303fff;
1085 				inst |= (disp & 0x3fff);
1086 				inst |= (disp & 0xc000) << 6;
1087 				break;
1088 
1089 			case 0x2: /* Bicc */
1090 				disp = ((int32_t)inst << 10) >> 10;
1091 				if (disp + i >= 0 && disp + i < count)
1092 					continue;
1093 				disp += offset;
1094 				if (((disp << 10) >> 10) != disp)
1095 					cmn_err(CE_PANIC, "bad reloc");
1096 				inst &= ~0x3fffff;
1097 				inst |= (disp & 0x3fffff);
1098 				break;
1099 
1100 			case 0x1: /* Bpcc */
1101 				disp = ((int32_t)inst << 13) >> 13;
1102 				if (disp + i >= 0 && disp + i < count)
1103 					continue;
1104 				disp += offset;
1105 				if (((disp << 13) >> 13) != disp)
1106 					cmn_err(CE_PANIC, "bad reloc");
1107 				inst &= ~0x7ffff;
1108 				inst |= (disp & 0x7ffffu);
1109 				break;
1110 			}
1111 			*dst = inst;
1112 		}
1113 	}
1114 	flush_instr_mem(tablep, count * sizeof (uint32_t));
1115 }
1116 
1117 /*
1118  * Routine to allocate a large page to use in the TSB caches.
1119  */
1120 /*ARGSUSED*/
1121 static page_t *
1122 sfmmu_tsb_page_create(void *addr, size_t size, int vmflag, void *arg)
1123 {
1124 	int pgflags;
1125 
1126 	pgflags = PG_EXCL;
1127 	if ((vmflag & VM_NOSLEEP) == 0)
1128 		pgflags |= PG_WAIT;
1129 	if (vmflag & VM_PANIC)
1130 		pgflags |= PG_PANIC;
1131 	if (vmflag & VM_PUSHPAGE)
1132 		pgflags |= PG_PUSHPAGE;
1133 
1134 	return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size,
1135 	    pgflags, &kvseg, addr, arg));
1136 }
1137 
1138 /*
1139  * Allocate a large page to back the virtual address range
1140  * [addr, addr + size).  If addr is NULL, allocate the virtual address
1141  * space as well.
1142  */
1143 static void *
1144 sfmmu_tsb_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
1145     uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
1146     void *pcarg)
1147 {
1148 	page_t *ppl;
1149 	page_t *rootpp;
1150 	caddr_t addr = inaddr;
1151 	pgcnt_t npages = btopr(size);
1152 	page_t **ppa;
1153 	int i = 0;
1154 
1155 	/*
1156 	 * Assuming that only TSBs will call this with size > PAGESIZE
1157 	 * There is no reason why this couldn't be expanded to 8k pages as
1158 	 * well, or other page sizes in the future .... but for now, we
1159 	 * only support fixed sized page requests.
1160 	 */
1161 	if ((inaddr == NULL) && ((addr = vmem_xalloc(vmp, size, size, 0, 0,
1162 	    NULL, NULL, vmflag)) == NULL))
1163 		return (NULL);
1164 
1165 	if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
1166 		if (inaddr == NULL)
1167 			vmem_xfree(vmp, addr, size);
1168 		return (NULL);
1169 	}
1170 
1171 	ppl = page_create_func(addr, size, vmflag, pcarg);
1172 	if (ppl == NULL) {
1173 		if (inaddr == NULL)
1174 			vmem_xfree(vmp, addr, size);
1175 		page_unresv(npages);
1176 		return (NULL);
1177 	}
1178 
1179 	rootpp = ppl;
1180 	ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1181 	while (ppl != NULL) {
1182 		page_t *pp = ppl;
1183 		ppa[i++] = pp;
1184 		page_sub(&ppl, pp);
1185 		ASSERT(page_iolock_assert(pp));
1186 		page_io_unlock(pp);
1187 	}
1188 
1189 	/*
1190 	 * Load the locked entry.  It's OK to preload the entry into
1191 	 * the TSB since we now support large mappings in the kernel TSB.
1192 	 */
1193 	hat_memload_array(kas.a_hat, (caddr_t)rootpp->p_offset, size,
1194 	    ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, HAT_LOAD_LOCK);
1195 
1196 	for (--i; i >= 0; --i) {
1197 		(void) page_pp_lock(ppa[i], 0, 1);
1198 		page_unlock(ppa[i]);
1199 	}
1200 
1201 	kmem_free(ppa, npages * sizeof (page_t *));
1202 	return (addr);
1203 }
1204 
1205 /* Called to import new spans into the TSB vmem arenas */
1206 void *
1207 sfmmu_tsb_segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
1208 {
1209 	lgrp_id_t lgrpid = LGRP_NONE;
1210 
1211 	if (tsb_lgrp_affinity) {
1212 		/*
1213 		 * Search for the vmp->lgrpid mapping by brute force;
1214 		 * some day vmp will have an lgrp, until then we have
1215 		 * to do this the hard way.
1216 		 */
1217 		for (lgrpid = 0; lgrpid < NLGRPS_MAX &&
1218 		    vmp != kmem_tsb_default_arena[lgrpid]; lgrpid++)
1219 			;
1220 		if (lgrpid == NLGRPS_MAX)
1221 			lgrpid = LGRP_NONE;
1222 	}
1223 
1224 	return (sfmmu_tsb_xalloc(vmp, NULL, size, vmflag, 0,
1225 	    sfmmu_tsb_page_create, lgrpid != LGRP_NONE? &lgrpid : NULL));
1226 }
1227 
1228 /* Called to free spans from the TSB vmem arenas */
1229 void
1230 sfmmu_tsb_segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
1231 {
1232 	page_t *pp;
1233 	caddr_t addr = inaddr;
1234 	caddr_t eaddr;
1235 	pgcnt_t npages = btopr(size);
1236 	pgcnt_t pgs_left = npages;
1237 	page_t *rootpp = NULL;
1238 
1239 	hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1240 
1241 	for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
1242 		pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
1243 		if (pp == NULL)
1244 			panic("sfmmu_tsb_segkmem_free: page not found");
1245 
1246 		ASSERT(PAGE_EXCL(pp));
1247 		page_pp_unlock(pp, 0, 1);
1248 
1249 		if (rootpp == NULL)
1250 			rootpp = pp;
1251 		if (--pgs_left == 0) {
1252 			/*
1253 			 * similar logic to segspt_free_pages, but we know we
1254 			 * have one large page.
1255 			 */
1256 			page_destroy_pages(rootpp);
1257 		}
1258 	}
1259 	page_unresv(npages);
1260 
1261 	if (vmp != NULL)
1262 		vmem_xfree(vmp, inaddr, size);
1263 }
1264