xref: /titanic_41/usr/src/uts/sun4/vm/vm_dep.c (revision 4bac220845f606f60663ed6f3a2b88caa00ae87e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * UNIX machine dependent virtual memory support.
30  */
31 
32 #include <sys/vm.h>
33 #include <sys/exec.h>
34 
35 #include <sys/exechdr.h>
36 #include <vm/seg_kmem.h>
37 #include <sys/atomic.h>
38 #include <sys/archsystm.h>
39 #include <sys/machsystm.h>
40 #include <sys/kdi.h>
41 #include <sys/cpu_module.h>
42 
43 #include <vm/hat_sfmmu.h>
44 
45 #include <sys/memnode.h>
46 
47 #include <sys/mem_config.h>
48 #include <sys/mem_cage.h>
49 #include <vm/vm_dep.h>
50 #include <sys/platform_module.h>
51 
52 /*
53  * These variables are set by module specific config routines.
54  * They are only set by modules which will use physical cache page coloring
55  * and/or virtual cache page coloring.
56  */
57 int do_pg_coloring = 0;
58 int do_virtual_coloring = 0;
59 
60 /*
61  * These variables can be conveniently patched at kernel load time to
62  * prevent do_pg_coloring or do_virtual_coloring from being enabled by
63  * module specific config routines.
64  */
65 
66 int use_page_coloring = 1;
67 int use_virtual_coloring = 1;
68 
69 /*
70  * initialized by page_coloring_init()
71  */
72 extern uint_t page_colors;
73 extern uint_t page_colors_mask;
74 extern uint_t page_coloring_shift;
75 int cpu_page_colors;
76 uint_t vac_colors = 0;
77 uint_t vac_colors_mask = 0;
78 
79 /*
80  * get the ecache setsize for the current cpu.
81  */
82 #define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
83 
84 plcnt_t		plcnt;		/* page list count */
85 
86 /*
87  * This variable is set by the cpu module to contain the lowest
88  * address not affected by the SF_ERRATA_57 workaround.  It should
89  * remain 0 if the workaround is not needed.
90  */
91 #if defined(SF_ERRATA_57)
92 caddr_t errata57_limit;
93 #endif
94 
95 extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */
96 
97 extern void page_relocate_hash(page_t *, page_t *);
98 
99 /*
100  * these must be defined in platform specific areas
101  */
102 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
103 	struct proc *, uint_t);
104 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
105 	caddr_t, size_t, uint_t, struct lgrp *);
106 /*
107  * Convert page frame number to an OBMEM page frame number
108  * (i.e. put in the type bits -- zero for this implementation)
109  */
110 pfn_t
111 impl_obmem_pfnum(pfn_t pf)
112 {
113 	return (pf);
114 }
115 
116 /*
117  * Use physmax to determine the highest physical page of DRAM memory
118  * It is assumed that any physical addresses above physmax is in IO space.
119  * We don't bother checking the low end because we assume that memory space
120  * begins at physical page frame 0.
121  *
122  * Return 1 if the page frame is onboard DRAM memory, else 0.
123  * Returns 0 for nvram so it won't be cached.
124  */
125 int
126 pf_is_memory(pfn_t pf)
127 {
128 	/* We must be IO space */
129 	if (pf > physmax)
130 		return (0);
131 
132 	/* We must be memory space */
133 	return (1);
134 }
135 
136 /*
137  * Handle a pagefault.
138  */
139 faultcode_t
140 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
141 {
142 	struct as *as;
143 	struct proc *p;
144 	faultcode_t res;
145 	caddr_t base;
146 	size_t len;
147 	int err;
148 
149 	if (INVALID_VADDR(addr))
150 		return (FC_NOMAP);
151 
152 	if (iskernel) {
153 		as = &kas;
154 	} else {
155 		p = curproc;
156 		as = p->p_as;
157 #if defined(SF_ERRATA_57)
158 		/*
159 		 * Prevent infinite loops due to a segment driver
160 		 * setting the execute permissions and the sfmmu hat
161 		 * silently ignoring them.
162 		 */
163 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
164 		    addr < errata57_limit) {
165 			res = FC_NOMAP;
166 			goto out;
167 		}
168 #endif
169 	}
170 
171 	/*
172 	 * Dispatch pagefault.
173 	 */
174 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
175 
176 	/*
177 	 * If this isn't a potential unmapped hole in the user's
178 	 * UNIX data or stack segments, just return status info.
179 	 */
180 	if (!(res == FC_NOMAP && iskernel == 0))
181 		goto out;
182 
183 	/*
184 	 * Check to see if we happened to faulted on a currently unmapped
185 	 * part of the UNIX data or stack segments.  If so, create a zfod
186 	 * mapping there and then try calling the fault routine again.
187 	 */
188 	base = p->p_brkbase;
189 	len = p->p_brksize;
190 
191 	if (addr < base || addr >= base + len) {		/* data seg? */
192 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
193 		len = p->p_stksize;
194 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
195 			/* not in either UNIX data or stack segments */
196 			res = FC_NOMAP;
197 			goto out;
198 		}
199 	}
200 
201 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
202 	/* This code is probably not needed anymore */
203 
204 	/* expand the gap to the page boundaries on each side */
205 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
206 	    ((uintptr_t)base & PAGEMASK);
207 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
208 
209 	as_rangelock(as);
210 	as_purge(as);
211 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
212 		err = as_map(as, base, len, segvn_create, zfod_argsp);
213 		as_rangeunlock(as);
214 		if (err) {
215 			res = FC_MAKE_ERR(err);
216 			goto out;
217 		}
218 	} else {
219 		/*
220 		 * This page is already mapped by another thread after we
221 		 * returned from as_fault() above.  We just fallthrough
222 		 * as_fault() below.
223 		 */
224 		as_rangeunlock(as);
225 	}
226 
227 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
228 
229 out:
230 
231 	return (res);
232 }
233 
234 /*
235  * This is the routine which defines the address limit implied
236  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
237  * mappable address in a 32-bit process on this platform (though
238  * perhaps we should make it be UINT32_MAX here?)
239  */
240 void
241 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
242 {
243 	struct proc *p = curproc;
244 	caddr_t userlimit = flags & _MAP_LOW32 ?
245 		(caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
246 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
247 }
248 
249 /*
250  * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
251  */
252 caddr_t	hole_start, hole_end;
253 
254 /*
255  * kpm mapping window
256  */
257 caddr_t kpm_vbase;
258 size_t  kpm_size;
259 uchar_t kpm_size_shift;
260 
261 /*
262  * Determine whether [base, base+len] contains a mapable range of
263  * addresses at least minlen long. base and len are adjusted if
264  * required to provide a mapable range.
265  */
266 /* ARGSUSED */
267 int
268 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
269 {
270 	caddr_t hi, lo;
271 
272 	lo = *basep;
273 	hi = lo + *lenp;
274 
275 	/*
276 	 * If hi rolled over the top, try cutting back.
277 	 */
278 	if (hi < lo) {
279 		size_t newlen = 0 - (uintptr_t)lo - 1l;
280 
281 		if (newlen + (uintptr_t)hi < minlen)
282 			return (0);
283 		if (newlen < minlen)
284 			return (0);
285 		*lenp = newlen;
286 	} else if (hi - lo < minlen)
287 		return (0);
288 
289 	/*
290 	 * Deal with a possible hole in the address range between
291 	 * hole_start and hole_end that should never be mapped by the MMU.
292 	 */
293 	hi = lo + *lenp;
294 
295 	if (lo < hole_start) {
296 		if (hi > hole_start)
297 			if (hi < hole_end)
298 				hi = hole_start;
299 			else
300 				/* lo < hole_start && hi >= hole_end */
301 				if (dir == AH_LO) {
302 					/*
303 					 * prefer lowest range
304 					 */
305 					if (hole_start - lo >= minlen)
306 						hi = hole_start;
307 					else if (hi - hole_end >= minlen)
308 						lo = hole_end;
309 					else
310 						return (0);
311 				} else {
312 					/*
313 					 * prefer highest range
314 					 */
315 					if (hi - hole_end >= minlen)
316 						lo = hole_end;
317 					else if (hole_start - lo >= minlen)
318 						hi = hole_start;
319 					else
320 						return (0);
321 				}
322 	} else {
323 		/* lo >= hole_start */
324 		if (hi < hole_end)
325 			return (0);
326 		if (lo < hole_end)
327 			lo = hole_end;
328 	}
329 
330 	if (hi - lo < minlen)
331 		return (0);
332 
333 	*basep = lo;
334 	*lenp = hi - lo;
335 
336 	return (1);
337 }
338 
339 /*
340  * Determine whether [addr, addr+len] with protections `prot' are valid
341  * for a user address space.
342  */
343 /*ARGSUSED*/
344 int
345 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
346     caddr_t userlimit)
347 {
348 	caddr_t eaddr = addr + len;
349 
350 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
351 		return (RANGE_BADADDR);
352 
353 	/*
354 	 * Determine if the address range falls within an illegal
355 	 * range of the MMU.
356 	 */
357 	if (eaddr > hole_start && addr < hole_end)
358 		return (RANGE_BADADDR);
359 
360 #if defined(SF_ERRATA_57)
361 	/*
362 	 * Make sure USERLIMIT isn't raised too high
363 	 */
364 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
365 	    errata57_limit == 0);
366 
367 	if (AS_TYPE_64BIT(as) &&
368 	    (addr < errata57_limit) &&
369 	    (prot & PROT_EXEC))
370 		return (RANGE_BADPROT);
371 #endif /* SF_ERRATA57 */
372 	return (RANGE_OKAY);
373 }
374 
375 /*
376  * Routine used to check to see if an a.out can be executed
377  * by the current machine/architecture.
378  */
379 int
380 chkaout(struct exdata *exp)
381 {
382 	if (exp->ux_mach == M_SPARC)
383 		return (0);
384 	else
385 		return (ENOEXEC);
386 }
387 
388 /*
389  * The following functions return information about an a.out
390  * which is used when a program is executed.
391  */
392 
393 /*
394  * Return the load memory address for the data segment.
395  */
396 caddr_t
397 getdmem(struct exec *exp)
398 {
399 	/*
400 	 * XXX - Sparc Reference Hack approaching
401 	 * Remember that we are loading
402 	 * 8k executables into a 4k machine
403 	 * DATA_ALIGN == 2 * PAGESIZE
404 	 */
405 	if (exp->a_text)
406 		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
407 	else
408 		return ((caddr_t)USRTEXT);
409 }
410 
411 /*
412  * Return the starting disk address for the data segment.
413  */
414 ulong_t
415 getdfile(struct exec *exp)
416 {
417 	if (exp->a_magic == ZMAGIC)
418 		return (exp->a_text);
419 	else
420 		return (sizeof (struct exec) + exp->a_text);
421 }
422 
423 /*
424  * Return the load memory address for the text segment.
425  */
426 
427 /*ARGSUSED*/
428 caddr_t
429 gettmem(struct exec *exp)
430 {
431 	return ((caddr_t)USRTEXT);
432 }
433 
434 /*
435  * Return the file byte offset for the text segment.
436  */
437 uint_t
438 gettfile(struct exec *exp)
439 {
440 	if (exp->a_magic == ZMAGIC)
441 		return (0);
442 	else
443 		return (sizeof (struct exec));
444 }
445 
446 void
447 getexinfo(
448 	struct exdata *edp_in,
449 	struct exdata *edp_out,
450 	int *pagetext,
451 	int *pagedata)
452 {
453 	*edp_out = *edp_in;	/* structure copy */
454 
455 	if ((edp_in->ux_mag == ZMAGIC) &&
456 	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
457 		*pagetext = 1;
458 		*pagedata = 1;
459 	} else {
460 		*pagetext = 0;
461 		*pagedata = 0;
462 	}
463 }
464 
/*
 * Scan hardware page size codes from `upper' down to, but not including,
 * `lower' and select the first one that is not disabled in
 * disable_auto_large_pages and whose size does not exceed `len'.  On a
 * match (pgsz) is set to that page size; if nothing matches (pgsz) is
 * left untouched.  (n) is a caller supplied int used as the loop index.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct in an unbraced if/else.
 */
#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)			\
	do {								\
		for ((n) = (upper); (n) > (lower); (n)--) {		\
			if (disable_auto_large_pages & (1 << (n)))	\
				continue;				\
			if (hw_page_array[(n)].hp_size <= (len)) {	\
				(pgsz) = hw_page_array[(n)].hp_size;	\
				break;					\
			}						\
		}							\
	} while (0)
474 
475 
476 /*ARGSUSED*/
477 size_t
478 map_pgszva(struct proc *p, caddr_t addr, size_t len)
479 {
480 	size_t		pgsz = MMU_PAGESIZE;
481 	int		n, upper;
482 
483 	/*
484 	 * Select the best fit page size within the constraints of
485 	 * auto_lpg_{min,max}szc.
486 	 *
487 	 * Note that we also take the heap size into account when
488 	 * deciding if we've crossed the threshold at which we should
489 	 * increase the page size.  This isn't perfect since the heap
490 	 * may not have reached its full size yet, but it's better than
491 	 * not considering it at all.
492 	 */
493 	len += p->p_brksize;
494 	if (ptob(auto_lpg_tlb_threshold) <= len) {
495 
496 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
497 
498 		/*
499 		 * Use auto_lpg_minszc - 1 as the limit so we never drop
500 		 * below auto_lpg_minszc.  We don't have a size code to refer
501 		 * to like we have for bss and stack, so we assume 0.
502 		 * auto_lpg_minszc should always be >= 0.  Using
503 		 * auto_lpg_minszc cuts off the loop.
504 		 */
505 		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
506 	}
507 
508 	return (pgsz);
509 }
510 
511 size_t
512 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
513 {
514 	size_t		pgsz;
515 	int		n, upper, lower;
516 
517 	/*
518 	 * If len is zero, retrieve from proc and don't demote the page size.
519 	 */
520 	if (len == 0) {
521 		len = p->p_brksize;
522 	}
523 
524 	/*
525 	 * Still zero?  Then we don't have a heap yet, so pick the default
526 	 * heap size.
527 	 */
528 	if (len == 0) {
529 		pgsz = auto_lpg_heap_default;
530 	} else {
531 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
532 	}
533 
534 	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
535 		/*
536 		 * We're past the threshold, so select the best fit
537 		 * page size within the constraints of
538 		 * auto_lpg_{min,max}szc and the minimum required
539 		 * alignment.
540 		 */
541 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
542 		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
543 		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
544 	}
545 
546 	/*
547 	 * If addr == 0 we were called by memcntl() or exec_args() when the
548 	 * size code is 0.  Don't set pgsz less than current size.
549 	 */
550 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
551 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
552 	}
553 
554 	return (pgsz);
555 }
556 
557 size_t
558 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
559 {
560 	size_t		pgsz;
561 	int		n, upper, lower;
562 
563 	/*
564 	 * If len is zero, retrieve from proc and don't demote the page size.
565 	 */
566 	if (len == 0) {
567 		len = p->p_stksize;
568 	}
569 
570 	/*
571 	 * Still zero?  Then we don't have a heap yet, so pick the default
572 	 * stack size.
573 	 */
574 	if (len == 0) {
575 		pgsz = auto_lpg_stack_default;
576 	} else {
577 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
578 	}
579 
580 	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
581 		/*
582 		 * We're past the threshold, so select the best fit
583 		 * page size within the constraints of
584 		 * auto_lpg_{min,max}szc and the minimum required
585 		 * alignment.
586 		 */
587 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
588 		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
589 		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
590 	}
591 
592 	/*
593 	 * If addr == 0 we were called by memcntl() or exec_args() when the
594 	 * size code is 0.  Don't set pgsz less than current size.
595 	 */
596 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
597 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
598 	}
599 
600 	return (pgsz);
601 }
602 
603 
604 /*
605  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
606  * KPM selects an address such that it's equal offset modulo shm_alignment and
607  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
608  */
609 int
610 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
611 {
612 	if (vac) {
613 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
614 	} else {
615 		return (0);
616 	}
617 }
618 
619 /*
620  * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
621  * can be set in platform or CPU specific code but user can change the
622  * default values via /etc/system.
623  *
624  * Initial values are defined in architecture specific mach_vm_dep.c file.
625  */
626 extern int use_text_pgsz64k;
627 extern int use_text_pgsz4m;
628 extern int use_initdata_pgsz64k;
629 
630 /*
 * disable_text_largepages and disable_initdata_largepages bitmasks are set in
 * platform or CPU specific code to disable page sizes that should not be
 * used. These variables normally shouldn't be changed via /etc/system. A
 * particular page size for text or initialized data will be used by default
 * if both the corresponding use_* variable is set to 1 AND this page size is
 * not disabled in the corresponding disable_* bitmask variable.
637  *
638  * Initial values are defined in architecture specific mach_vm_dep.c file.
639  */
640 extern int disable_text_largepages;
641 extern int disable_initdata_largepages;
642 
643 /*
644  * Minimum segment size tunables before 64K or 4M large pages
645  * should be used to map it.
646  *
647  * Initial values are defined in architecture specific mach_vm_dep.c file.
648  */
649 extern size_t text_pgsz64k_minsize;
650 extern size_t text_pgsz4m_minsize;
651 extern size_t initdata_pgsz64k_minsize;
652 
653 /*
654  * Sanity control. Don't use large pages regardless of user
655  * settings if there's less than execseg_lpg_min_physmem memory installed.
656  * The units for this variable is 8K pages.
657  */
658 pgcnt_t execseg_lpg_min_physmem = 131072;		/* 1GB */
659 
660 extern int disable_shm_large_pages;
661 pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
662 extern size_t max_shm_lpsize;
663 
664 
665 /* assumes TTE8K...TTE4M == szc */
666 
667 static uint_t
668 map_text_pgsz4m(caddr_t addr, size_t len)
669 {
670 	caddr_t a;
671 
672 	if (len < text_pgsz4m_minsize) {
673 		return (0);
674 	}
675 
676 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
677 	if (a < addr || a >= addr + len) {
678 		return (0);
679 	}
680 	len -= (a - addr);
681 	if (len < MMU_PAGESIZE4M) {
682 		return (0);
683 	}
684 
685 	return (1 << TTE4M);
686 }
687 
688 static uint_t
689 map_text_pgsz64k(caddr_t addr, size_t len)
690 {
691 	caddr_t a;
692 	size_t svlen = len;
693 
694 	if (len < text_pgsz64k_minsize) {
695 		return (0);
696 	}
697 
698 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
699 	if (a < addr || a >= addr + len) {
700 		return (0);
701 	}
702 	len -= (a - addr);
703 	if (len < MMU_PAGESIZE64K) {
704 		return (0);
705 	}
706 	if (!use_text_pgsz4m ||
707 	    disable_text_largepages & (1 << TTE4M)) {
708 		return (1 << TTE64K);
709 	}
710 	if (svlen < text_pgsz4m_minsize) {
711 		return (1 << TTE64K);
712 	}
713 	addr = a;
714 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
715 	if (a < addr || a >= addr + len) {
716 		return (1 << TTE64K);
717 	}
718 	len -= (a - addr);
719 	if (len < MMU_PAGESIZE4M) {
720 		return (1 << TTE64K);
721 	}
722 	return ((1 << TTE4M) | (1 << TTE64K));
723 }
724 
725 static uint_t
726 map_initdata_pgsz64k(caddr_t addr, size_t len)
727 {
728 	caddr_t a;
729 
730 	if (len < initdata_pgsz64k_minsize) {
731 		return (0);
732 	}
733 
734 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
735 	if (a < addr || a >= addr + len) {
736 		return (0);
737 	}
738 	len -= (a - addr);
739 	if (len < MMU_PAGESIZE64K) {
740 		return (0);
741 	}
742 	return (1 << TTE64K);
743 }
744 
745 /*
746  * Return a bit vector of large page size codes that
747  * can be used to map [addr, addr + len) region.
748  */
749 uint_t
750 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
751 {
752 	uint_t ret = 0;
753 
754 	if (physmem < execseg_lpg_min_physmem) {
755 		return (0);
756 	}
757 
758 	if (text) {
759 		if (use_text_pgsz64k &&
760 		    !(disable_text_largepages & (1 << TTE64K))) {
761 			ret = map_text_pgsz64k(addr, len);
762 		} else if (use_text_pgsz4m &&
763 		    !(disable_text_largepages & (1 << TTE4M))) {
764 			ret = map_text_pgsz4m(addr, len);
765 		}
766 	} else if (use_initdata_pgsz64k &&
767 	    !(disable_initdata_largepages & (1 << TTE64K))) {
768 		ret = map_initdata_pgsz64k(addr, len);
769 	}
770 
771 	return (ret);
772 }
773 
774 uint_t
775 map_shm_pgszcvec(caddr_t addr, size_t size, uintptr_t off)
776 {
777 	caddr_t eaddr = addr + size;
778 	uint_t szcvec = 0;
779 	int i;
780 	caddr_t raddr;
781 	caddr_t readdr;
782 	size_t pgsz;
783 
784 	if (physmem < shm_lpg_min_physmem || mmu_page_sizes <= 1 ||
785 	    max_shm_lpsize <= MMU_PAGESIZE) {
786 		return (0);
787 	}
788 
789 	for (i = mmu_page_sizes - 1; i > 0; i--) {
790 		if (disable_shm_large_pages & (1 << i)) {
791 			continue;
792 		}
793 		pgsz = page_get_pagesize(i);
794 		if (pgsz > max_shm_lpsize) {
795 			continue;
796 		}
797 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
798 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
799 		if (raddr < addr || raddr >= readdr) {
800 			continue;
801 		}
802 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
803 			continue;
804 		}
805 		szcvec |= (1 << i);
806 		/*
807 		 * And or in the remaining enabled page sizes.
808 		 */
809 		szcvec |= P2PHASE(~disable_shm_large_pages, (1 << i));
810 		szcvec &= ~1; /* no need to return 8K pagesize */
811 		break;
812 	}
813 	return (szcvec);
814 }
815 
/*
 * Size of a page with the given size code, expressed in units of the
 * size code 0 page (hp_size shifted right by the base page shift).
 */
#define	PNUM_SIZE(szc)							\
	(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
818 
819 /*
820  * Anchored in the table below are counters used to keep track
821  * of free contiguous physical memory. Each element of the table contains
822  * the array of counters, the size of array which is allocated during
823  * startup based on physmax and a shift value used to convert a pagenum
824  * into a counter array index or vice versa. The table has page size
825  * for rows and region size for columns:
826  *
827  *	page_counters[page_size][region_size]
828  *
829  *	page_size: 	TTE size code of pages on page_size freelist.
830  *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 * 	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.	So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
846  */
847 
848 /*
849  * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
850  * dimensions are allocated dynamically.
851  */
852 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
853 
854 /*
855  * For now there is only a single size cache list.
856  * Allocated dynamically.
857  */
858 page_t ***page_cachelists[MAX_MEM_TYPES];
859 
860 kmutex_t *fpc_mutex[NPC_MUTEX];
861 kmutex_t *cpc_mutex[NPC_MUTEX];
862 
863 caddr_t
864 alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
865 {
866 	int	mtype;
867 	uint_t	szc;
868 
869 	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
870 
871 	/*
872 	 * We only support small pages in the cachelist.
873 	 */
874 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
875 		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
876 		alloc_base += (sizeof (page_t *) * page_colors);
877 		/*
878 		 * Allocate freelists bins for all
879 		 * supported page sizes.
880 		 */
881 		for (szc = 0; szc < mmu_page_sizes; szc++) {
882 			page_freelists[szc][mtype][mnode] =
883 			    (page_t **)alloc_base;
884 			alloc_base += ((sizeof (page_t *) *
885 			    page_get_pagecolors(szc)));
886 		}
887 	}
888 
889 	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
890 
891 	return (alloc_base);
892 }
893 
894 /*
895  * Allocate page_freelists bin headers for a memnode from the
896  * nucleus data area. This is the first time that mmu_page_sizes is
897  * used during sun4u bootup, so check mmu_page_sizes initialization.
898  */
899 int
900 ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
901 {
902 	size_t alloc_sz;
903 	caddr_t alloc_base;
904 	caddr_t end;
905 	int	mtype;
906 	uint_t	szc;
907 	int32_t allp = 0;
908 
909 	if (&mmu_init_mmu_page_sizes) {
910 		if (!mmu_init_mmu_page_sizes(allp)) {
911 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
912 			    mmu_page_sizes);
913 		}
914 	}
915 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
916 
917 	/* first time called - allocate max_mem_nodes dimension */
918 	if (mnode == 0) {
919 		int	i;
920 
921 		/* page_cachelists */
922 		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
923 		    sizeof (page_t **);
924 
925 		/* page_freelists */
926 		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
927 		    sizeof (page_t **);
928 
929 		/* fpc_mutex and cpc_mutex */
930 		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
931 
932 		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
933 		if (alloc_base == NULL)
934 			return (-1);
935 
936 		ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
937 
938 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
939 			page_cachelists[mtype] = (page_t ***)alloc_base;
940 			alloc_base += (max_mem_nodes * sizeof (page_t **));
941 			for (szc = 0; szc < mmu_page_sizes; szc++) {
942 				page_freelists[szc][mtype] =
943 				    (page_t ***)alloc_base;
944 				alloc_base += (max_mem_nodes *
945 				    sizeof (page_t **));
946 			}
947 		}
948 		for (i = 0; i < NPC_MUTEX; i++) {
949 			fpc_mutex[i] = (kmutex_t *)alloc_base;
950 			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
951 			cpc_mutex[i] = (kmutex_t *)alloc_base;
952 			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
953 		}
954 		alloc_sz = 0;
955 	}
956 
957 	/*
958 	 * Calculate the size needed by alloc_page_freelists().
959 	 */
960 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
961 		alloc_sz += sizeof (page_t *) * page_colors;
962 
963 		for (szc = 0; szc < mmu_page_sizes; szc++)
964 			alloc_sz += sizeof (page_t *) *
965 			    page_get_pagecolors(szc);
966 	}
967 
968 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
969 	if (alloc_base == NULL)
970 		return (-1);
971 
972 	end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
973 	ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
974 	    ecache_alignsize));
975 
976 	return (0);
977 }
978 
979 /*
980  * To select our starting bin, we stride through the bins with a stride
981  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
982  * in simulation and practice for different workloads on varying cache sizes.
983  */
984 uint32_t color_start_current = 0;
985 uint32_t color_start_stride = 337;
986 int color_start_random = 0;
987 
988 /* ARGSUSED */
989 uint_t
990 get_color_start(struct as *as)
991 {
992 	uint32_t old, new;
993 
994 	if (consistent_coloring == 2 || color_start_random) {
995 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
996 		    page_colors_mask));
997 	}
998 
999 	do {
1000 		old = color_start_current;
1001 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
1002 	} while (cas32(&color_start_current, old, new) != old);
1003 
1004 	return ((uint_t)(new));
1005 }
1006 
1007 /*
1008  * Called once at startup from kphysm_init() -- before memialloc()
1009  * is invoked to do the 1st page_free()/page_freelist_add().
1010  *
1011  * initializes page_colors and page_colors_mask based on ecache_setsize.
1012  *
1013  * Also initializes the counter locks.
1014  */
1015 void
1016 page_coloring_init()
1017 {
1018 	int	a;
1019 
1020 	if (do_pg_coloring == 0) {
1021 		page_colors = 1;
1022 		return;
1023 	}
1024 
1025 	/*
1026 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
1027 	 * the max ecache setsize of all cpus configured in the system or, for
1028 	 * cheetah+ systems, the max possible ecache setsize for all possible
1029 	 * cheetah+ cpus.
1030 	 */
1031 	page_colors = ecache_setsize / MMU_PAGESIZE;
1032 	page_colors_mask = page_colors - 1;
1033 
1034 	/*
1035 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
1036 	 * cpu_page_colors set to -1 during DR operation or during startup
1037 	 * if setsizes are heterogenous.
1038 	 *
1039 	 * The value of cpu_page_colors determines if additional color bins
1040 	 * need to be checked for a particular color in the page_get routines.
1041 	 */
1042 	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize))
1043 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
1044 
1045 	vac_colors = vac_size / MMU_PAGESIZE;
1046 	vac_colors_mask = vac_colors -1;
1047 
1048 	page_coloring_shift = 0;
1049 	a = ecache_setsize;
1050 	while (a >>= 1) {
1051 		page_coloring_shift++;
1052 	}
1053 }
1054 
1055 int
1056 bp_color(struct buf *bp)
1057 {
1058 	int color = -1;
1059 
1060 	if (vac) {
1061 		if ((bp->b_flags & B_PAGEIO) != 0) {
1062 			color = sfmmu_get_ppvcolor(bp->b_pages);
1063 		} else if (bp->b_un.b_addr != NULL) {
1064 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
1065 		}
1066 	}
1067 	return (color < 0 ? 0 : ptob(color));
1068 }
1069 
1070 /*
1071  * Create & Initialise pageout scanner thread. The thread has to
1072  * start at procedure with process pp and priority pri.
1073  */
1074 void
1075 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
1076 {
1077 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
1078 }
1079 
/*
 * Flush the D-cache when performing module relocations to an alternate
 * mapping.  Stubbed out on all platforms except sun4u, at least for
 * now.  The work is delegated to sfmmu_cache_flushall().
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}
1090 
/*
 * Report whether the ranges [va1, va1 + sz1) and [va2, va2 + sz2)
 * overlap.  Returns 1 if they do and 0 if they do not.  Two ranges that
 * start at the same address are always reported as overlapping,
 * whatever their sizes.
 *
 * The distance between the two start addresses is compared with the
 * size of the lower range, rather than computing va + sz, so that a
 * range extending past the top of the address space cannot wrap around
 * and be mistaken for one that ends below the other range.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	if (va1 < va2)
		return (va2 - va1 < sz1);

	if (va2 < va1)
		return (va1 - va2 < sz2);

	return (1);
}
1102 
1103 /*
1104  * Return the number of bytes, relative to the beginning of a given range, that
1105  * are non-toxic (can be read from and written to with relative impunity).
1106  */
1107 size_t
1108 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1109 {
1110 	/* OBP reads are harmless, but we don't want people writing there */
1111 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1112 	    OFW_START_ADDR + 1))
1113 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1114 
1115 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1116 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1117 
1118 	return (sz); /* no overlap */
1119 }
1120 
1121 /*
1122  * Minimum physmem required for enabling large pages for kernel heap
1123  * Currently we do not enable lp for kmem on systems with less
1124  * than 1GB of memory. This value can be changed via /etc/system
1125  */
1126 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
1127 
1128 /*
1129  * this function chooses large page size for kernel heap
1130  */
1131 size_t
1132 get_segkmem_lpsize(size_t lpsize)
1133 {
1134 	size_t memtotal = physmem * PAGESIZE;
1135 	size_t mmusz;
1136 	uint_t szc;
1137 	extern int disable_large_pages;
1138 
1139 	if (memtotal < segkmem_lpminphysmem)
1140 		return (PAGESIZE);
1141 
1142 	if (plat_lpkmem_is_supported != NULL &&
1143 	    plat_lpkmem_is_supported() == 0)
1144 		return (PAGESIZE);
1145 
1146 	mmusz = mmu_get_kernel_lpsize(lpsize);
1147 	szc = page_szc(mmusz);
1148 
1149 	while (szc) {
1150 		if (!(disable_large_pages & (1 << szc)))
1151 			return (page_get_pagesize(szc));
1152 		szc--;
1153 	}
1154 	return (PAGESIZE);
1155 }
1156