xref: /titanic_44/usr/src/uts/sun4/vm/vm_dep.c (revision 4cc1ac68c690efa70450ed478a37fe6d78f0f42e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * UNIX machine dependent virtual memory support.
31  */
32 
33 #include <sys/vm.h>
34 #include <sys/exec.h>
35 
36 #include <sys/exechdr.h>
37 #include <vm/seg_kmem.h>
38 #include <sys/atomic.h>
39 #include <sys/archsystm.h>
40 #include <sys/machsystm.h>
41 #include <sys/kdi.h>
42 #include <sys/cpu_module.h>
43 
44 #include <vm/hat_sfmmu.h>
45 
46 #include <sys/memnode.h>
47 
48 #include <sys/mem_config.h>
49 #include <sys/mem_cage.h>
50 #include <vm/vm_dep.h>
51 #include <sys/platform_module.h>
52 
53 /*
54  * These variables are set by module specific config routines.
55  * They are only set by modules which will use physical cache page coloring
56  * and/or virtual cache page coloring.
57  */
58 int do_pg_coloring = 0;
59 int do_virtual_coloring = 0;
60 
61 /*
62  * These variables can be conveniently patched at kernel load time to
63  * prevent do_pg_coloring or do_virtual_coloring from being enabled by
64  * module specific config routines.
65  */
66 
67 int use_page_coloring = 1;
68 int use_virtual_coloring = 1;
69 
70 /*
71  * initialized by page_coloring_init()
72  */
73 extern uint_t page_colors;
74 extern uint_t page_colors_mask;
75 extern uint_t page_coloring_shift;
76 int cpu_page_colors;
77 uint_t vac_colors = 0;
78 uint_t vac_colors_mask = 0;
79 
/*
 * Get the ecache setsize for the current cpu: evaluates to the
 * ecache_setsize field of the cpunodes[] entry indexed by CPU->cpu_id.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
84 
85 plcnt_t		plcnt;		/* page list count */
86 
87 /*
88  * This variable is set by the cpu module to contain the lowest
89  * address not affected by the SF_ERRATA_57 workaround.  It should
90  * remain 0 if the workaround is not needed.
91  */
92 #if defined(SF_ERRATA_57)
93 caddr_t errata57_limit;
94 #endif
95 
96 extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */
97 
98 extern void page_relocate_hash(page_t *, page_t *);
99 
100 /*
101  * these must be defined in platform specific areas
102  */
103 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
104 	struct proc *, uint_t);
105 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
106 	caddr_t, size_t, uint_t, struct lgrp *);
107 /*
108  * Convert page frame number to an OBMEM page frame number
109  * (i.e. put in the type bits -- zero for this implementation)
110  */
111 pfn_t
112 impl_obmem_pfnum(pfn_t pf)
113 {
114 	return (pf);
115 }
116 
117 /*
118  * Use physmax to determine the highest physical page of DRAM memory
119  * It is assumed that any physical addresses above physmax is in IO space.
120  * We don't bother checking the low end because we assume that memory space
121  * begins at physical page frame 0.
122  *
123  * Return 1 if the page frame is onboard DRAM memory, else 0.
124  * Returns 0 for nvram so it won't be cached.
125  */
126 int
127 pf_is_memory(pfn_t pf)
128 {
129 	/* We must be IO space */
130 	if (pf > physmax)
131 		return (0);
132 
133 	/* We must be memory space */
134 	return (1);
135 }
136 
137 /*
138  * Handle a pagefault.
139  */
140 faultcode_t
141 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
142 {
143 	struct as *as;
144 	struct proc *p;
145 	faultcode_t res;
146 	caddr_t base;
147 	size_t len;
148 	int err;
149 
150 	if (INVALID_VADDR(addr))
151 		return (FC_NOMAP);
152 
153 	if (iskernel) {
154 		as = &kas;
155 	} else {
156 		p = curproc;
157 		as = p->p_as;
158 #if defined(SF_ERRATA_57)
159 		/*
160 		 * Prevent infinite loops due to a segment driver
161 		 * setting the execute permissions and the sfmmu hat
162 		 * silently ignoring them.
163 		 */
164 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
165 		    addr < errata57_limit) {
166 			res = FC_NOMAP;
167 			goto out;
168 		}
169 #endif
170 	}
171 
172 	/*
173 	 * Dispatch pagefault.
174 	 */
175 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
176 
177 	/*
178 	 * If this isn't a potential unmapped hole in the user's
179 	 * UNIX data or stack segments, just return status info.
180 	 */
181 	if (!(res == FC_NOMAP && iskernel == 0))
182 		goto out;
183 
184 	/*
185 	 * Check to see if we happened to faulted on a currently unmapped
186 	 * part of the UNIX data or stack segments.  If so, create a zfod
187 	 * mapping there and then try calling the fault routine again.
188 	 */
189 	base = p->p_brkbase;
190 	len = p->p_brksize;
191 
192 	if (addr < base || addr >= base + len) {		/* data seg? */
193 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
194 		len = p->p_stksize;
195 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
196 			/* not in either UNIX data or stack segments */
197 			res = FC_NOMAP;
198 			goto out;
199 		}
200 	}
201 
202 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
203 	/* This code is probably not needed anymore */
204 
205 	/* expand the gap to the page boundaries on each side */
206 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
207 	    ((uintptr_t)base & PAGEMASK);
208 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
209 
210 	as_rangelock(as);
211 	as_purge(as);
212 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
213 		err = as_map(as, base, len, segvn_create, zfod_argsp);
214 		as_rangeunlock(as);
215 		if (err) {
216 			res = FC_MAKE_ERR(err);
217 			goto out;
218 		}
219 	} else {
220 		/*
221 		 * This page is already mapped by another thread after we
222 		 * returned from as_fault() above.  We just fallthrough
223 		 * as_fault() below.
224 		 */
225 		as_rangeunlock(as);
226 	}
227 
228 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
229 
230 out:
231 
232 	return (res);
233 }
234 
235 /*
236  * This is the routine which defines the address limit implied
237  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
238  * mappable address in a 32-bit process on this platform (though
239  * perhaps we should make it be UINT32_MAX here?)
240  */
241 void
242 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
243 {
244 	struct proc *p = curproc;
245 	caddr_t userlimit = flags & _MAP_LOW32 ?
246 		(caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
247 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
248 }
249 
250 /*
251  * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
252  */
253 caddr_t	hole_start, hole_end;
254 
255 /*
256  * kpm mapping window
257  */
258 caddr_t kpm_vbase;
259 size_t  kpm_size;
260 uchar_t kpm_size_shift;
261 
262 /*
263  * Determine whether [base, base+len] contains a mapable range of
264  * addresses at least minlen long. base and len are adjusted if
265  * required to provide a mapable range.
266  */
267 /* ARGSUSED */
268 int
269 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
270 {
271 	caddr_t hi, lo;
272 
273 	lo = *basep;
274 	hi = lo + *lenp;
275 
276 	/*
277 	 * If hi rolled over the top, try cutting back.
278 	 */
279 	if (hi < lo) {
280 		size_t newlen = 0 - (uintptr_t)lo - 1l;
281 
282 		if (newlen + (uintptr_t)hi < minlen)
283 			return (0);
284 		if (newlen < minlen)
285 			return (0);
286 		*lenp = newlen;
287 	} else if (hi - lo < minlen)
288 		return (0);
289 
290 	/*
291 	 * Deal with a possible hole in the address range between
292 	 * hole_start and hole_end that should never be mapped by the MMU.
293 	 */
294 	hi = lo + *lenp;
295 
296 	if (lo < hole_start) {
297 		if (hi > hole_start)
298 			if (hi < hole_end)
299 				hi = hole_start;
300 			else
301 				/* lo < hole_start && hi >= hole_end */
302 				if (dir == AH_LO) {
303 					/*
304 					 * prefer lowest range
305 					 */
306 					if (hole_start - lo >= minlen)
307 						hi = hole_start;
308 					else if (hi - hole_end >= minlen)
309 						lo = hole_end;
310 					else
311 						return (0);
312 				} else {
313 					/*
314 					 * prefer highest range
315 					 */
316 					if (hi - hole_end >= minlen)
317 						lo = hole_end;
318 					else if (hole_start - lo >= minlen)
319 						hi = hole_start;
320 					else
321 						return (0);
322 				}
323 	} else {
324 		/* lo >= hole_start */
325 		if (hi < hole_end)
326 			return (0);
327 		if (lo < hole_end)
328 			lo = hole_end;
329 	}
330 
331 	if (hi - lo < minlen)
332 		return (0);
333 
334 	*basep = lo;
335 	*lenp = hi - lo;
336 
337 	return (1);
338 }
339 
340 /*
341  * Determine whether [addr, addr+len] with protections `prot' are valid
342  * for a user address space.
343  */
344 /*ARGSUSED*/
345 int
346 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
347     caddr_t userlimit)
348 {
349 	caddr_t eaddr = addr + len;
350 
351 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
352 		return (RANGE_BADADDR);
353 
354 	/*
355 	 * Determine if the address range falls within an illegal
356 	 * range of the MMU.
357 	 */
358 	if (eaddr > hole_start && addr < hole_end)
359 		return (RANGE_BADADDR);
360 
361 #if defined(SF_ERRATA_57)
362 	/*
363 	 * Make sure USERLIMIT isn't raised too high
364 	 */
365 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
366 	    errata57_limit == 0);
367 
368 	if (AS_TYPE_64BIT(as) &&
369 	    (addr < errata57_limit) &&
370 	    (prot & PROT_EXEC))
371 		return (RANGE_BADPROT);
372 #endif /* SF_ERRATA57 */
373 	return (RANGE_OKAY);
374 }
375 
376 /*
377  * Routine used to check to see if an a.out can be executed
378  * by the current machine/architecture.
379  */
380 int
381 chkaout(struct exdata *exp)
382 {
383 	if (exp->ux_mach == M_SPARC)
384 		return (0);
385 	else
386 		return (ENOEXEC);
387 }
388 
389 /*
390  * The following functions return information about an a.out
391  * which is used when a program is executed.
392  */
393 
394 /*
395  * Return the load memory address for the data segment.
396  */
397 caddr_t
398 getdmem(struct exec *exp)
399 {
400 	/*
401 	 * XXX - Sparc Reference Hack approaching
402 	 * Remember that we are loading
403 	 * 8k executables into a 4k machine
404 	 * DATA_ALIGN == 2 * PAGESIZE
405 	 */
406 	if (exp->a_text)
407 		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
408 	else
409 		return ((caddr_t)USRTEXT);
410 }
411 
412 /*
413  * Return the starting disk address for the data segment.
414  */
415 ulong_t
416 getdfile(struct exec *exp)
417 {
418 	if (exp->a_magic == ZMAGIC)
419 		return (exp->a_text);
420 	else
421 		return (sizeof (struct exec) + exp->a_text);
422 }
423 
424 /*
425  * Return the load memory address for the text segment.
426  */
427 
428 /*ARGSUSED*/
429 caddr_t
430 gettmem(struct exec *exp)
431 {
432 	return ((caddr_t)USRTEXT);
433 }
434 
435 /*
436  * Return the file byte offset for the text segment.
437  */
438 uint_t
439 gettfile(struct exec *exp)
440 {
441 	if (exp->a_magic == ZMAGIC)
442 		return (0);
443 	else
444 		return (sizeof (struct exec));
445 }
446 
447 void
448 getexinfo(
449 	struct exdata *edp_in,
450 	struct exdata *edp_out,
451 	int *pagetext,
452 	int *pagedata)
453 {
454 	*edp_out = *edp_in;	/* structure copy */
455 
456 	if ((edp_in->ux_mag == ZMAGIC) &&
457 	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
458 		*pagetext = 1;
459 		*pagedata = 1;
460 	} else {
461 		*pagetext = 0;
462 		*pagedata = 0;
463 	}
464 }
465 
/*
 * Walk the page size codes from (upper) down to, but not including,
 * (lower), skipping any size whose bit is set in
 * disable_auto_large_pages.  The first remaining size that is no
 * larger than (len) is stored in (pgsz); if none qualifies, (pgsz) is
 * left untouched.  (n) is the caller's scratch loop variable.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement, e.g. as the unbraced body of an if/else.
 */
#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)			\
	do {								\
		for ((n) = (upper); (n) > (lower); (n)--) {		\
			if (disable_auto_large_pages & (1 << (n)))	\
				continue;				\
			if (hw_page_array[(n)].hp_size <= (len)) {	\
				(pgsz) = hw_page_array[(n)].hp_size;	\
				break;					\
			}						\
		}							\
	} while (0)
475 
476 
477 /*ARGSUSED*/
478 size_t
479 map_pgszva(struct proc *p, caddr_t addr, size_t len)
480 {
481 	size_t		pgsz = MMU_PAGESIZE;
482 	int		n, upper;
483 
484 	/*
485 	 * Select the best fit page size within the constraints of
486 	 * auto_lpg_{min,max}szc.
487 	 *
488 	 * Note that we also take the heap size into account when
489 	 * deciding if we've crossed the threshold at which we should
490 	 * increase the page size.  This isn't perfect since the heap
491 	 * may not have reached its full size yet, but it's better than
492 	 * not considering it at all.
493 	 */
494 	len += p->p_brksize;
495 	if (ptob(auto_lpg_tlb_threshold) <= len) {
496 
497 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
498 
499 		/*
500 		 * Use auto_lpg_minszc - 1 as the limit so we never drop
501 		 * below auto_lpg_minszc.  We don't have a size code to refer
502 		 * to like we have for bss and stack, so we assume 0.
503 		 * auto_lpg_minszc should always be >= 0.  Using
504 		 * auto_lpg_minszc cuts off the loop.
505 		 */
506 		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
507 	}
508 
509 	return (pgsz);
510 }
511 
512 size_t
513 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
514 {
515 	size_t		pgsz;
516 	int		n, upper, lower;
517 
518 	/*
519 	 * If len is zero, retrieve from proc and don't demote the page size.
520 	 */
521 	if (len == 0) {
522 		len = p->p_brksize;
523 	}
524 
525 	/*
526 	 * Still zero?  Then we don't have a heap yet, so pick the default
527 	 * heap size.
528 	 */
529 	if (len == 0) {
530 		pgsz = auto_lpg_heap_default;
531 	} else {
532 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
533 	}
534 
535 	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
536 		/*
537 		 * We're past the threshold, so select the best fit
538 		 * page size within the constraints of
539 		 * auto_lpg_{min,max}szc and the minimum required
540 		 * alignment.
541 		 */
542 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
543 		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
544 		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
545 	}
546 
547 	/*
548 	 * If addr == 0 we were called by memcntl() or exec_args() when the
549 	 * size code is 0.  Don't set pgsz less than current size.
550 	 */
551 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
552 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
553 	}
554 
555 	return (pgsz);
556 }
557 
558 size_t
559 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
560 {
561 	size_t		pgsz;
562 	int		n, upper, lower;
563 
564 	/*
565 	 * If len is zero, retrieve from proc and don't demote the page size.
566 	 */
567 	if (len == 0) {
568 		len = p->p_stksize;
569 	}
570 
571 	/*
572 	 * Still zero?  Then we don't have a heap yet, so pick the default
573 	 * stack size.
574 	 */
575 	if (len == 0) {
576 		pgsz = auto_lpg_stack_default;
577 	} else {
578 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
579 	}
580 
581 	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
582 		/*
583 		 * We're past the threshold, so select the best fit
584 		 * page size within the constraints of
585 		 * auto_lpg_{min,max}szc and the minimum required
586 		 * alignment.
587 		 */
588 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
589 		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
590 		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
591 	}
592 
593 	/*
594 	 * If addr == 0 we were called by memcntl() or exec_args() when the
595 	 * size code is 0.  Don't set pgsz less than current size.
596 	 */
597 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
598 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
599 	}
600 
601 	return (pgsz);
602 }
603 
604 
605 /*
606  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
607  * KPM selects an address such that it's equal offset modulo shm_alignment and
608  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
609  */
610 int
611 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
612 {
613 	if (vac) {
614 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
615 	} else {
616 		return (0);
617 	}
618 }
619 
620 /*
621  * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
622  * can be set in platform or CPU specific code but user can change the
623  * default values via /etc/system.
624  *
625  * Initial values are defined in architecture specific mach_vm_dep.c file.
626  */
627 extern int use_text_pgsz64k;
628 extern int use_text_pgsz4m;
629 extern int use_initdata_pgsz64k;
630 
631 /*
 * disable_text_largepages and disable_initdata_largepages bitmasks are set in
 * platform or CPU specific code to disable page sizes that should not be
 * used. These variables normally shouldn't be changed via /etc/system. A
 * particular page size for text or initialized data will be used by default
 * if both the corresponding use_* variable is set to 1 AND this page size is
 * not disabled in the corresponding disable_* bitmask variable.
638  *
639  * Initial values are defined in architecture specific mach_vm_dep.c file.
640  */
641 extern int disable_text_largepages;
642 extern int disable_initdata_largepages;
643 
644 /*
645  * Minimum segment size tunables before 64K or 4M large pages
646  * should be used to map it.
647  *
648  * Initial values are defined in architecture specific mach_vm_dep.c file.
649  */
650 extern size_t text_pgsz64k_minsize;
651 extern size_t text_pgsz4m_minsize;
652 extern size_t initdata_pgsz64k_minsize;
653 
654 /*
655  * Sanity control. Don't use large pages regardless of user
656  * settings if there's less than execseg_lpg_min_physmem memory installed.
657  * The units for this variable is 8K pages.
658  */
659 pgcnt_t execseg_lpg_min_physmem = 131072;		/* 1GB */
660 
661 
662 /* assumes TTE8K...TTE4M == szc */
663 
664 static uint_t
665 map_text_pgsz4m(caddr_t addr, size_t len)
666 {
667 	caddr_t a;
668 
669 	if (len < text_pgsz4m_minsize) {
670 		return (0);
671 	}
672 
673 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
674 	if (a < addr || a >= addr + len) {
675 		return (0);
676 	}
677 	len -= (a - addr);
678 	if (len < MMU_PAGESIZE4M) {
679 		return (0);
680 	}
681 
682 	return (1 << TTE4M);
683 }
684 
685 static uint_t
686 map_text_pgsz64k(caddr_t addr, size_t len)
687 {
688 	caddr_t a;
689 	size_t svlen = len;
690 
691 	if (len < text_pgsz64k_minsize) {
692 		return (0);
693 	}
694 
695 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
696 	if (a < addr || a >= addr + len) {
697 		return (0);
698 	}
699 	len -= (a - addr);
700 	if (len < MMU_PAGESIZE64K) {
701 		return (0);
702 	}
703 	if (!use_text_pgsz4m ||
704 	    disable_text_largepages & (1 << TTE4M)) {
705 		return (1 << TTE64K);
706 	}
707 	if (svlen < text_pgsz4m_minsize) {
708 		return (1 << TTE64K);
709 	}
710 	addr = a;
711 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
712 	if (a < addr || a >= addr + len) {
713 		return (1 << TTE64K);
714 	}
715 	len -= (a - addr);
716 	if (len < MMU_PAGESIZE4M) {
717 		return (1 << TTE64K);
718 	}
719 	return ((1 << TTE4M) | (1 << TTE64K));
720 }
721 
722 static uint_t
723 map_initdata_pgsz64k(caddr_t addr, size_t len)
724 {
725 	caddr_t a;
726 
727 	if (len < initdata_pgsz64k_minsize) {
728 		return (0);
729 	}
730 
731 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
732 	if (a < addr || a >= addr + len) {
733 		return (0);
734 	}
735 	len -= (a - addr);
736 	if (len < MMU_PAGESIZE64K) {
737 		return (0);
738 	}
739 	return (1 << TTE64K);
740 }
741 
742 /*
743  * Return a bit vector of large page size codes that
744  * can be used to map [addr, addr + len) region.
745  */
746 uint_t
747 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
748 {
749 	uint_t ret = 0;
750 
751 	if (physmem < execseg_lpg_min_physmem) {
752 		return (0);
753 	}
754 
755 	if (text) {
756 		if (use_text_pgsz64k &&
757 		    !(disable_text_largepages & (1 << TTE64K))) {
758 			ret = map_text_pgsz64k(addr, len);
759 		} else if (use_text_pgsz4m &&
760 		    !(disable_text_largepages & (1 << TTE4M))) {
761 			ret = map_text_pgsz4m(addr, len);
762 		}
763 	} else if (use_initdata_pgsz64k &&
764 	    !(disable_initdata_largepages & (1 << TTE64K))) {
765 		ret = map_initdata_pgsz64k(addr, len);
766 	}
767 
768 	return (ret);
769 }
770 
/*
 * Size of a page with the given size code, expressed in units of the
 * size code 0 page (hp_size shifted right by hw_page_array[0].hp_shift).
 */
#define	PNUM_SIZE(size_code)						\
	(hw_page_array[(size_code)].hp_size >> hw_page_array[0].hp_shift)
773 
774 /*
775  * Anchored in the table below are counters used to keep track
776  * of free contiguous physical memory. Each element of the table contains
777  * the array of counters, the size of array which is allocated during
778  * startup based on physmax and a shift value used to convert a pagenum
779  * into a counter array index or vice versa. The table has page size
780  * for rows and region size for columns:
781  *
782  *	page_counters[page_size][region_size]
783  *
784  *	page_size: 	TTE size code of pages on page_size freelist.
785  *
 *	region_size:	TTE size code of a candidate larger page
 *			made up of contiguous free page_size pages.
788  *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 * 	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.	So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */
801  */
802 
803 /*
804  * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
805  * dimensions are allocated dynamically.
806  */
807 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
808 
809 /*
810  * For now there is only a single size cache list.
811  * Allocated dynamically.
812  */
813 page_t ***page_cachelists[MAX_MEM_TYPES];
814 
815 kmutex_t *fpc_mutex[NPC_MUTEX];
816 kmutex_t *cpc_mutex[NPC_MUTEX];
817 
818 caddr_t
819 alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
820 {
821 	int	mtype;
822 	uint_t	szc;
823 
824 	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
825 
826 	/*
827 	 * We only support small pages in the cachelist.
828 	 */
829 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
830 		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
831 		alloc_base += (sizeof (page_t *) * page_colors);
832 		/*
833 		 * Allocate freelists bins for all
834 		 * supported page sizes.
835 		 */
836 		for (szc = 0; szc < mmu_page_sizes; szc++) {
837 			page_freelists[szc][mtype][mnode] =
838 			    (page_t **)alloc_base;
839 			alloc_base += ((sizeof (page_t *) *
840 			    page_get_pagecolors(szc)));
841 		}
842 	}
843 
844 	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
845 
846 	return (alloc_base);
847 }
848 
849 /*
850  * Allocate page_freelists bin headers for a memnode from the
851  * nucleus data area. This is the first time that mmu_page_sizes is
852  * used during sun4u bootup, so check mmu_page_sizes initialization.
853  */
854 int
855 ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
856 {
857 	size_t alloc_sz;
858 	caddr_t alloc_base;
859 	caddr_t end;
860 	int	mtype;
861 	uint_t	szc;
862 	int32_t allp = 0;
863 
864 	if (&mmu_init_mmu_page_sizes) {
865 		if (!mmu_init_mmu_page_sizes(allp)) {
866 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
867 			    mmu_page_sizes);
868 		}
869 	}
870 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
871 
872 	/* first time called - allocate max_mem_nodes dimension */
873 	if (mnode == 0) {
874 		int	i;
875 
876 		/* page_cachelists */
877 		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
878 		    sizeof (page_t **);
879 
880 		/* page_freelists */
881 		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
882 		    sizeof (page_t **);
883 
884 		/* fpc_mutex and cpc_mutex */
885 		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
886 
887 		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
888 		if (alloc_base == NULL)
889 			return (-1);
890 
891 		ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
892 
893 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
894 			page_cachelists[mtype] = (page_t ***)alloc_base;
895 			alloc_base += (max_mem_nodes * sizeof (page_t **));
896 			for (szc = 0; szc < mmu_page_sizes; szc++) {
897 				page_freelists[szc][mtype] =
898 				    (page_t ***)alloc_base;
899 				alloc_base += (max_mem_nodes *
900 				    sizeof (page_t **));
901 			}
902 		}
903 		for (i = 0; i < NPC_MUTEX; i++) {
904 			fpc_mutex[i] = (kmutex_t *)alloc_base;
905 			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
906 			cpc_mutex[i] = (kmutex_t *)alloc_base;
907 			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
908 		}
909 		alloc_sz = 0;
910 	}
911 
912 	/*
913 	 * Calculate the size needed by alloc_page_freelists().
914 	 */
915 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
916 		alloc_sz += sizeof (page_t *) * page_colors;
917 
918 		for (szc = 0; szc < mmu_page_sizes; szc++)
919 			alloc_sz += sizeof (page_t *) *
920 			    page_get_pagecolors(szc);
921 	}
922 
923 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
924 	if (alloc_base == NULL)
925 		return (-1);
926 
927 	end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
928 	ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
929 	    ecache_alignsize));
930 
931 	return (0);
932 }
933 
934 /*
935  * To select our starting bin, we stride through the bins with a stride
936  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
937  * in simulation and practice for different workloads on varying cache sizes.
938  */
939 uint32_t color_start_current = 0;
940 uint32_t color_start_stride = 337;
941 int color_start_random = 0;
942 
943 /* ARGSUSED */
944 uint_t
945 get_color_start(struct as *as)
946 {
947 	uint32_t old, new;
948 
949 	if (consistent_coloring == 2 || color_start_random) {
950 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
951 		    page_colors_mask));
952 	}
953 
954 	do {
955 		old = color_start_current;
956 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
957 	} while (cas32(&color_start_current, old, new) != old);
958 
959 	return ((uint_t)(new));
960 }
961 
962 /*
963  * Called once at startup from kphysm_init() -- before memialloc()
964  * is invoked to do the 1st page_free()/page_freelist_add().
965  *
966  * initializes page_colors and page_colors_mask based on ecache_setsize.
967  *
968  * Also initializes the counter locks.
969  */
970 void
971 page_coloring_init()
972 {
973 	int	a;
974 
975 	if (do_pg_coloring == 0) {
976 		page_colors = 1;
977 		return;
978 	}
979 
980 	/*
981 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
982 	 * the max ecache setsize of all cpus configured in the system or, for
983 	 * cheetah+ systems, the max possible ecache setsize for all possible
984 	 * cheetah+ cpus.
985 	 */
986 	page_colors = ecache_setsize / MMU_PAGESIZE;
987 	page_colors_mask = page_colors - 1;
988 
989 	/*
990 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
991 	 * cpu_page_colors set to -1 during DR operation or during startup
992 	 * if setsizes are heterogenous.
993 	 *
994 	 * The value of cpu_page_colors determines if additional color bins
995 	 * need to be checked for a particular color in the page_get routines.
996 	 */
997 	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize))
998 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
999 
1000 	vac_colors = vac_size / MMU_PAGESIZE;
1001 	vac_colors_mask = vac_colors -1;
1002 
1003 	page_coloring_shift = 0;
1004 	a = ecache_setsize;
1005 	while (a >>= 1) {
1006 		page_coloring_shift++;
1007 	}
1008 }
1009 
1010 int
1011 bp_color(struct buf *bp)
1012 {
1013 	int color = -1;
1014 
1015 	if (vac) {
1016 		if ((bp->b_flags & B_PAGEIO) != 0) {
1017 			color = sfmmu_get_ppvcolor(bp->b_pages);
1018 		} else if (bp->b_un.b_addr != NULL) {
1019 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
1020 		}
1021 	}
1022 	return (color < 0 ? 0 : ptob(color));
1023 }
1024 
1025 /*
1026  * Create & Initialise pageout scanner thread. The thread has to
1027  * start at procedure with process pp and priority pri.
1028  */
1029 void
1030 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
1031 {
1032 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
1033 }
1034 
/*
 * Flush the D-cache when performing module relocations to an
 * alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.  Here the work is done by sfmmu_cache_flushall().
 */
void
dcache_flushall()
{
	/* hand the whole job to the sfmmu layer */
	sfmmu_cache_flushall();
}
1045 
/*
 * Report whether [va1, va1 + sz1) and [va2, va2 + sz2) overlap.
 * Returns 0 when one range starts below the other and ends at or
 * before the other's start; returns 1 in every other case, including
 * when both ranges start at the same address.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	int first_is_below = (va1 < va2 && va1 + sz1 <= va2);
	int second_is_below = (va2 < va1 && va2 + sz2 <= va1);

	return (!(first_is_below || second_is_below));
}
1057 
1058 /*
1059  * Return the number of bytes, relative to the beginning of a given range, that
1060  * are non-toxic (can be read from and written to with relative impunity).
1061  */
1062 size_t
1063 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1064 {
1065 	/* OBP reads are harmless, but we don't want people writing there */
1066 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1067 	    OFW_START_ADDR + 1))
1068 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1069 
1070 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1071 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1072 
1073 	return (sz); /* no overlap */
1074 }
1075 
1076 /*
1077  * Minimum physmem required for enabling large pages for kernel heap
1078  * Currently we do not enable lp for kmem on systems with less
1079  * than 1GB of memory. This value can be changed via /etc/system
1080  */
1081 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
1082 
1083 /*
1084  * this function chooses large page size for kernel heap
1085  */
1086 size_t
1087 get_segkmem_lpsize(size_t lpsize)
1088 {
1089 	size_t memtotal = physmem * PAGESIZE;
1090 
1091 	if (memtotal < segkmem_lpminphysmem)
1092 		return (PAGESIZE);
1093 
1094 	if (plat_lpkmem_is_supported != NULL &&
1095 	    plat_lpkmem_is_supported() == 0)
1096 		return (PAGESIZE);
1097 
1098 	return (mmu_get_kernel_lpsize(lpsize));
1099 }
1100