/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/vm.h>
#include <sys/exec.h>

#include <sys/exechdr.h>
#include <vm/seg_kmem.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kdi.h>
#include <sys/cpu_module.h>

#include <vm/hat_sfmmu.h>

#include <sys/memnode.h>

#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <sys/platform_module.h>

/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring
 * and/or virtual cache page coloring.
 */
int do_pg_coloring = 0;
int do_virtual_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring or do_virtual_coloring from being enabled by
 * module specific config routines.
 */

int use_page_coloring = 1;
int use_virtual_coloring = 1;

/*
 * initialized by page_coloring_init()
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

#ifdef DEBUG
plcnt_t		plcnt;		/* page list count */
#endif

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
	struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
	caddr_t, size_t, uint_t, struct lgrp *);
/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation)
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
	return (pf);
}

/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical address above physmax is in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
int
pf_is_memory(pfn_t pf)
{
	/* We must be IO space */
	if (pf > physmax)
		return (0);

	/* We must be memory space */
	return (1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
{
	struct as *as;
	struct proc *p;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	if (iskernel) {
		as = &kas;
	} else {
		p = curproc;
		as = p->p_as;
#if defined(SF_ERRATA_57)
		/*
		 * Prevent infinite loops due to a segment driver
		 * setting the execute permissions and the sfmmu hat
		 * silently ignoring them.
		 */
		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
		    addr < errata57_limit) {
			res = FC_NOMAP;
			goto out;
		}
#endif
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(as->a_hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (!(res == FC_NOMAP && iskernel == 0))
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {		/* data seg? */
		base = (caddr_t)(p->p_usrstack - p->p_stksize);
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X/4.X/5.X compatibility.
	 * This code is probably no longer needed.
	 */

	/* expand the gap to the page boundaries on each side */
	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
	    ((uintptr_t)base & PAGEMASK);
	base = (caddr_t)((uintptr_t)base & PAGEMASK);

	as_rangelock(as);
	as_purge(as);
	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
		err = as_map(as, base, len, segvn_create, zfod_argsp);
		as_rangeunlock(as);
		if (err) {
			res = FC_MAKE_ERR(err);
			goto out;
		}
	} else {
		/*
		 * This page was mapped by another thread after we
		 * returned from the as_fault() call above.  We simply
		 * fall through to the as_fault() call below.
		 */
		as_rangeunlock(as);
	}

	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);

out:

	return (res);
}

/*
 * This is the routine which defines the address limit implied
 * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 * mappable address in a 32-bit process on this platform (though
 * perhaps we should make it be UINT32_MAX here?)
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = flags & _MAP_LOW32 ?
		(caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
}

/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;

/*
 * Determine whether [base, base+len] contains a mappable range of
 * addresses at least minlen long.  base and len are adjusted if
 * required to provide a mappable range.
 */
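/*
 * For example (the exact hole boundaries are platform specific): if the
 * MMU leaves a VA hole and the caller's range straddles it, the range is
 * trimmed to whichever side of the hole still satisfies minlen, honoring
 * the caller's AH_LO/AH_HI preference when both sides qualify.  The
 * request fails (returns 0) only when neither side is at least minlen
 * bytes long.
 */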
/* ARGSUSED */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	caddr_t hi, lo;

	lo = *basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		size_t newlen = 0 - (uintptr_t)lo - 1L;

		if (newlen + (uintptr_t)hi < minlen)
			return (0);
		if (newlen < minlen)
			return (0);
		*lenp = newlen;
	} else if (hi - lo < minlen)
		return (0);

	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped by the MMU.
	 */
	hi = lo + *lenp;

	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = lo;
	*lenp = hi - lo;

	return (1);
}

/*
 * Determine whether [addr, addr+len] with protections `prot' are valid
 * for a user address space.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

	/*
	 * Determine if the address range falls within an illegal
	 * range of the MMU.
	 */
	if (eaddr > hole_start && addr < hole_end)
		return (RANGE_BADADDR);

#if defined(SF_ERRATA_57)
	/*
	 * Make sure USERLIMIT isn't raised too high
	 */
	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
	    errata57_limit == 0);

	if (AS_TYPE_64BIT(as) &&
	    (addr < errata57_limit) &&
	    (prot & PROT_EXEC))
		return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
	return (RANGE_OKAY);
}

/*
 * Routine used to check whether an a.out can be executed
 * by the current machine/architecture.
 */
int
chkaout(struct exdata *exp)
{
	if (exp->ux_mach == M_SPARC)
		return (0);
	else
		return (ENOEXEC);
}

/*
 * The following functions return information about an a.out
 * which is used when a program is executed.
 */

/*
 * Return the load memory address for the data segment.
 */
caddr_t
getdmem(struct exec *exp)
{
	/*
	 * XXX - Sparc Reference Hack approaching
	 * Remember that we are loading
	 * 8k executables into a 4k machine
	 * DATA_ALIGN == 2 * PAGESIZE
	 */
	if (exp->a_text)
		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
	else
		return ((caddr_t)USRTEXT);
}

/*
 * Return the starting disk address for the data segment.
 */
ulong_t
getdfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (exp->a_text);
	else
		return (sizeof (struct exec) + exp->a_text);
}

/*
 * Return the load memory address for the text segment.
 */

/*ARGSUSED*/
caddr_t
gettmem(struct exec *exp)
{
	return ((caddr_t)USRTEXT);
}

/*
 * Return the file byte offset for the text segment.
 */
uint_t
gettfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (0);
	else
		return (sizeof (struct exec));
}

void
getexinfo(
	struct exdata *edp_in,
	struct exdata *edp_out,
	int *pagetext,
	int *pagedata)
{
	*edp_out = *edp_in;	/* structure copy */

	if ((edp_in->ux_mag == ZMAGIC) &&
	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
		*pagetext = 1;
		*pagedata = 1;
	} else {
		*pagetext = 0;
		*pagedata = 0;
	}
}

#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)	\
	for ((n) = (upper); (n) > (lower); (n)--) {	\
		if (disable_auto_large_pages & (1 << (n)))	\
			continue;				\
		if (hw_page_array[(n)].hp_size <= (len)) {	\
			(pgsz) = hw_page_array[(n)].hp_size;	\
			break;					\
		}						\
	}
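
/*
 * For example, with the sun4u page sizes 8K/64K/512K/4M (size codes
 * 0..3), upper == 3, lower == -1 and len == 1MB, the loop above skips
 * 4M (larger than len), selects 512K and stops.  If the 512K bit were
 * set in disable_auto_large_pages, it would fall through to 64K.
 */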

/*ARGSUSED*/
size_t
map_pgszva(struct proc *p, caddr_t addr, size_t len)
{
	size_t		pgsz = MMU_PAGESIZE;
	int		n, upper;

	/*
	 * Select the best fit page size within the constraints of
	 * auto_lpg_{min,max}szc.
	 *
	 * Note that we also take the heap size into account when
	 * deciding if we've crossed the threshold at which we should
	 * increase the page size.  This isn't perfect since the heap
	 * may not have reached its full size yet, but it's better than
	 * not considering it at all.
	 */
	len += p->p_brksize;
	if (ptob(auto_lpg_tlb_threshold) <= len) {

		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);

		/*
		 * Use auto_lpg_minszc - 1 as the lower bound so we never
		 * drop below auto_lpg_minszc.  Since the loop condition
		 * is (n) > (lower), passing auto_lpg_minszc itself would
		 * cut the loop off before that size code was considered.
		 * We don't have a size code to refer to like we have for
		 * bss and stack, so we assume 0; auto_lpg_minszc should
		 * always be >= 0.
		 */
		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
	}

	return (pgsz);
}

size_t
map_pgszheap(struct proc *p, caddr_t addr, size_t len)
{
	size_t		pgsz;
	int		n, upper, lower;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_brksize;
	}

	/*
	 * Still zero?  Then we don't have a heap yet, so pick the default
	 * heap size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_heap_default;
	} else {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	return (pgsz);
}

size_t
map_pgszstk(struct proc *p, caddr_t addr, size_t len)
{
	size_t		pgsz;
	int		n, upper, lower;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_stksize;
	}

	/*
	 * Still zero?  Then we don't have a stack yet, so pick the default
	 * stack size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_stack_default;
	} else {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		lower = MAX(auto_lpg_minszc - 1, p->p_stkpageszc);
		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	return (pgsz);
}


/*
 * Return a non-0 value if the address may cause a VAC alias with KPM
 * mappings.  KPM selects an address such that its offset modulo
 * shm_alignment matches the file offset, and assumes it can't be in
 * VAC conflict with any mapping larger than PAGESIZE.
 */
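/*
 * For example, with a 64K shm_alignment, addr == 0x10000 and
 * off == 0x2000 hash to different virtual colors:
 * (0x10000 ^ 0x2000) & 0xffff == 0x2000, which is non-zero, so the
 * mapping could alias with a KPM mapping of the same page.
 */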
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	if (vac) {
		return (((uintptr_t)addr ^ off) & (shm_alignment - 1));
	} else {
		return (0);
	}
}

/*
 * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
 * can be set in platform or CPU specific code but the user can change
 * the default values via /etc/system.
 *
 * Initial values are defined in the architecture specific mach_vm_dep.c file.
 */
extern int use_text_pgsz64k;
extern int use_text_pgsz4m;
extern int use_initdata_pgsz64k;

/*
 * The disable_text_largepages and disable_initdata_largepages bitmasks are
 * set in platform or CPU specific code to disable page sizes that should
 * not be used.  These variables normally shouldn't be changed via
 * /etc/system.  A particular page size for text or initialized data will be
 * used by default if the corresponding use_* variable is set to 1 AND this
 * page size is not disabled in the corresponding disable_* bitmask variable.
 *
 * Initial values are defined in the architecture specific mach_vm_dep.c file.
 */
extern int disable_text_largepages;
extern int disable_initdata_largepages;

/*
 * Minimum segment size tunables before 64K or 4M large pages
 * should be used to map it.
 *
 * Initial values are defined in the architecture specific mach_vm_dep.c file.
 */
extern size_t text_pgsz64k_minsize;
extern size_t text_pgsz4m_minsize;
extern size_t initdata_pgsz64k_minsize;

/*
 * Sanity control.  Don't use large pages regardless of user
 * settings if there's less than execseg_lpg_min_physmem memory installed.
 * The unit for this variable is 8K pages.
 */
pgcnt_t execseg_lpg_min_physmem = 131072;		/* 1GB */


/* assumes TTE8K...TTE4M == szc */

static uint_t
map_text_pgsz4m(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < text_pgsz4m_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (0);
	}

	return (1 << TTE4M);
}
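
/*
 * For example, a text segment at addr == 0x110000 with len == 12MB: the
 * first 4M boundary at or above addr is 0x400000, which still leaves
 * more than 4MB of the segment, so map_text_pgsz4m() above returns
 * (1 << TTE4M).  A segment containing no fully aligned 4MB region
 * returns 0.
 */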

static uint_t
map_text_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;
	size_t svlen = len;

	if (len < text_pgsz64k_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}
	if (!use_text_pgsz4m ||
	    disable_text_largepages & (1 << TTE4M)) {
		return (1 << TTE64K);
	}
	if (svlen < text_pgsz4m_minsize) {
		return (1 << TTE64K);
	}
	addr = a;
	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (1 << TTE64K);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (1 << TTE64K);
	}
	return ((1 << TTE4M) | (1 << TTE64K));
}

static uint_t
map_initdata_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < initdata_pgsz64k_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}
	return (1 << TTE64K);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map the [addr, addr + len) region.
 */
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	uint_t ret = 0;

	if (physmem < execseg_lpg_min_physmem) {
		return (0);
	}

	if (text) {
		if (use_text_pgsz64k &&
		    !(disable_text_largepages & (1 << TTE64K))) {
			ret = map_text_pgsz64k(addr, len);
		} else if (use_text_pgsz4m &&
		    !(disable_text_largepages & (1 << TTE4M))) {
			ret = map_text_pgsz4m(addr, len);
		}
	} else if (use_initdata_pgsz64k &&
	    !(disable_initdata_largepages & (1 << TTE64K))) {
		ret = map_initdata_pgsz64k(addr, len);
	}

	return (ret);
}

#define	PNUM_SIZE(size_code)						\
	(hw_page_array[size_code].hp_size >> hw_page_array[0].hp_shift)
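
/*
 * For example, PNUM_SIZE(TTE4M) on sun4u is 4M >> 13 == 512, i.e. the
 * number of 8K base pages that make up one 4M page.
 */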

/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax) and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size: 	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size, each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Yuck!  Let's try an example:
 *
 * 	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M.  It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.	 So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */

/*
 * Per page size free lists.  3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];

caddr_t
alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
{
	int	mtype;
	uint_t	szc;

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

	/*
	 * We only support small pages in the cachelist.
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
		alloc_base += (sizeof (page_t *) * page_colors);
		/*
		 * Allocate freelists bins for all
		 * supported page sizes.
		 */
		for (szc = 0; szc < mmu_page_sizes; szc++) {
			page_freelists[szc][mtype][mnode] =
			    (page_t **)alloc_base;
			alloc_base += ((sizeof (page_t *) *
			    page_get_pagecolors(szc)));
		}
	}

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

	return (alloc_base);
}

/*
 * Allocate page_freelists bin headers for a memnode from the
 * nucleus data area.  This is the first time that mmu_page_sizes is
 * used during sun4u bootup, so check mmu_page_sizes initialization.
 */
int
ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
{
	size_t alloc_sz;
	caddr_t alloc_base;
	caddr_t end;
	int	mtype;
	uint_t	szc;
	int32_t allp = 0;

	if (&mmu_init_mmu_page_sizes) {
		if (!mmu_init_mmu_page_sizes(allp)) {
			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
			    mmu_page_sizes);
		}
	}
	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);

	/* first time called - allocate max_mem_nodes dimension */
	if (mnode == 0) {
		int	i;

		/* page_cachelists */
		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
		    sizeof (page_t **);

		/* page_freelists */
		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
		    sizeof (page_t **);

		/* fpc_mutex and cpc_mutex */
		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);

		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
		if (alloc_base == NULL)
			return (-1);

		ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);

		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
			page_cachelists[mtype] = (page_t ***)alloc_base;
			alloc_base += (max_mem_nodes * sizeof (page_t **));
			for (szc = 0; szc < mmu_page_sizes; szc++) {
				page_freelists[szc][mtype] =
				    (page_t ***)alloc_base;
				alloc_base += (max_mem_nodes *
				    sizeof (page_t **));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			fpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
			cpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
		}
		alloc_sz = 0;
	}

	/*
	 * Calculate the size needed by alloc_page_freelists().
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		alloc_sz += sizeof (page_t *) * page_colors;

		for (szc = 0; szc < mmu_page_sizes; szc++)
			alloc_sz += sizeof (page_t *) *
			    page_get_pagecolors(szc);
	}

	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
	if (alloc_base == NULL)
		return (-1);

	end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
	ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
	    ecache_alignsize));

	return (0);
}

/*
 * To select our starting bin, we stride through the bins with a stride
 * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 * in simulation and practice for different workloads on varying cache sizes.
 */
uint32_t color_start_current = 0;
uint32_t color_start_stride = 337;
int color_start_random = 0;
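
/*
 * Note: page_colors is a power of two and 337 is odd, so the stride is
 * coprime with the number of bins.  In get_color_start() the stride is
 * scaled by (vac_shift - MMU_PAGESHIFT), so successive starting bins
 * all share the initial virtual color and walk through every bin of
 * that color before the sequence repeats.
 */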

/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
	uint32_t old, new;

	if (consistent_coloring == 2 || color_start_random) {
		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
		    page_colors_mask));
	}

	do {
		old = color_start_current;
		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
	} while (cas32(&color_start_current, old, new) != old);

	return ((uint_t)(new));
}

/*
 * Called once at startup from kphysm_init() -- before memialloc()
 * is invoked to do the 1st page_free()/page_freelist_add().
 *
 * Initializes page_colors and page_colors_mask based on ecache_setsize.
 *
 * Also initializes the counter locks.
 */
void
page_coloring_init()
{
	int	a;

	if (do_pg_coloring == 0) {
		page_colors = 1;
		return;
	}

	/*
	 * Calculate page_colors from ecache_setsize.  ecache_setsize contains
	 * the max ecache setsize of all cpus configured in the system or, for
	 * cheetah+ systems, the max possible ecache setsize for all possible
	 * cheetah+ cpus.
	 */
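	/*
	 * For example, a 1MB ecache setsize with 8K pages yields
	 * page_colors = 1MB / 8K = 128 and page_colors_mask = 127; the
	 * loop below then computes page_coloring_shift = log2(1MB) = 20.
	 */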
	page_colors = ecache_setsize / MMU_PAGESIZE;
	page_colors_mask = page_colors - 1;

	/*
	 * Initialize cpu_page_colors if ecache setsizes are homogeneous.
	 * cpu_page_colors is set to -1 during DR operations or during
	 * startup if setsizes are heterogeneous.
	 *
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize))
		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;

	vac_colors = vac_size / MMU_PAGESIZE;
	vac_colors_mask = vac_colors - 1;

	page_coloring_shift = 0;
	a = ecache_setsize;
	while (a >>= 1) {
		page_coloring_shift++;
	}
}

int
bp_color(struct buf *bp)
{
	int color = -1;

	if (vac) {
		if ((bp->b_flags & B_PAGEIO) != 0) {
			color = sfmmu_get_ppvcolor(bp->b_pages);
		} else if (bp->b_un.b_addr != NULL) {
			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
		}
	}
	return (color < 0 ? 0 : ptob(color));
}

/*
 * Create and initialize the pageout scanner thread.  The thread starts
 * at procedure, running in process pp at priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}

static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	if (va1 < va2 && va1 + sz1 <= va2)
		return (0);

	if (va2 < va1 && va2 + sz2 <= va1)
		return (0);

	return (1);
}

/*
 * Return the number of bytes, relative to the beginning of a given range, that
 * are non-toxic (can be read from and written to with relative impunity).
 */
size_t
kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
{
	/* OBP reads are harmless, but we don't want people writing there */
	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
	    OFW_START_ADDR + 1))
		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);

	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);

	return (sz); /* no overlap */
}
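
/*
 * For example, if a debugger write starts 32 bytes below OFW_START_ADDR
 * and runs into the OBP range, kdi_range_is_nontoxic() returns 32: only
 * the bytes before OFW_START_ADDR are safe to touch.  A write beginning
 * inside a protected range gets 0 back.
 */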

/*
 * Minimum physmem required for enabling large pages for kernel heap.
 * Currently we do not enable lp for kmem on systems with less
 * than 1GB of memory.  This value can be changed via /etc/system.
 */
size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */

/*
 * This function chooses the large page size for the kernel heap.
 */
size_t
get_segkmem_lpsize(size_t lpsize)
{
	size_t memtotal = physmem * PAGESIZE;

	if (memtotal < segkmem_lpminphysmem)
		return (PAGESIZE);

	if (plat_lpkmem_is_supported != NULL &&
	    plat_lpkmem_is_supported() == 0)
		return (PAGESIZE);

	return (mmu_get_kernel_lpsize(lpsize));
}