xref: /illumos-gate/usr/src/uts/sun4/vm/vm_dep.c (revision a0e56b0eb1fdc159ff8348ca0e77d884bb7d126b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * UNIX machine dependent virtual memory support.
30  */
31 
32 #include <sys/vm.h>
33 #include <sys/exec.h>
34 
35 #include <sys/exechdr.h>
36 #include <vm/seg_kmem.h>
37 #include <sys/atomic.h>
38 #include <sys/archsystm.h>
39 #include <sys/machsystm.h>
40 #include <sys/kdi.h>
41 #include <sys/cpu_module.h>
42 
43 #include <vm/hat_sfmmu.h>
44 
45 #include <sys/memnode.h>
46 
47 #include <sys/mem_config.h>
48 #include <sys/mem_cage.h>
49 #include <vm/vm_dep.h>
50 #include <sys/platform_module.h>
51 
/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring
 * and/or virtual cache page coloring.
 *
 * page_coloring_init() forces page_colors to 1 and returns early while
 * do_pg_coloring is still zero.
 */
int do_pg_coloring = 0;
int do_virtual_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring or do_virtual_coloring from being enabled by
 * module specific config routines.
 */

int use_page_coloring = 1;
int use_virtual_coloring = 1;

/*
 * Coloring parameters initialized by page_coloring_init():
 *
 *	page_colors		ecache_setsize / MMU_PAGESIZE
 *	page_colors_mask	page_colors - 1
 *	page_coloring_shift	number of times ecache_setsize can be
 *				shifted right by one before reaching zero
 *	cpu_page_colors		cpu_setsize / MMU_PAGESIZE, written only if
 *				it was 0 and cpu_setsize < ecache_setsize
 *	vac_colors		vac_size / MMU_PAGESIZE
 *	vac_colors_mask		vac_colors - 1
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;
78 
/*
 * Get the ecache setsize for the current cpu: looks up the cpunodes[]
 * entry indexed by CPU->cpu_id.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t		plcnt;		/* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 *
 * pagefault() and valid_usr_range() compare addresses against it to
 * refuse execute access below the limit in 64-bit address spaces.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif
94 
95 extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */
96 
97 extern void page_relocate_hash(page_t *, page_t *);
98 
99 /*
100  * these must be defined in platform specific areas
101  */
102 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
103 	struct proc *, uint_t);
104 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
105 	caddr_t, size_t, uint_t, struct lgrp *);
106 /*
107  * Convert page frame number to an OBMEM page frame number
108  * (i.e. put in the type bits -- zero for this implementation)
109  */
110 pfn_t
111 impl_obmem_pfnum(pfn_t pf)
112 {
113 	return (pf);
114 }
115 
116 /*
117  * Use physmax to determine the highest physical page of DRAM memory
118  * It is assumed that any physical addresses above physmax is in IO space.
119  * We don't bother checking the low end because we assume that memory space
120  * begins at physical page frame 0.
121  *
122  * Return 1 if the page frame is onboard DRAM memory, else 0.
123  * Returns 0 for nvram so it won't be cached.
124  */
125 int
126 pf_is_memory(pfn_t pf)
127 {
128 	/* We must be IO space */
129 	if (pf > physmax)
130 		return (0);
131 
132 	/* We must be memory space */
133 	return (1);
134 }
135 
136 /*
137  * Handle a pagefault.
138  */
139 faultcode_t
140 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
141 {
142 	struct as *as;
143 	struct proc *p;
144 	faultcode_t res;
145 	caddr_t base;
146 	size_t len;
147 	int err;
148 
149 	if (INVALID_VADDR(addr))
150 		return (FC_NOMAP);
151 
152 	if (iskernel) {
153 		as = &kas;
154 	} else {
155 		p = curproc;
156 		as = p->p_as;
157 #if defined(SF_ERRATA_57)
158 		/*
159 		 * Prevent infinite loops due to a segment driver
160 		 * setting the execute permissions and the sfmmu hat
161 		 * silently ignoring them.
162 		 */
163 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
164 		    addr < errata57_limit) {
165 			res = FC_NOMAP;
166 			goto out;
167 		}
168 #endif
169 	}
170 
171 	/*
172 	 * Dispatch pagefault.
173 	 */
174 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
175 
176 	/*
177 	 * If this isn't a potential unmapped hole in the user's
178 	 * UNIX data or stack segments, just return status info.
179 	 */
180 	if (!(res == FC_NOMAP && iskernel == 0))
181 		goto out;
182 
183 	/*
184 	 * Check to see if we happened to faulted on a currently unmapped
185 	 * part of the UNIX data or stack segments.  If so, create a zfod
186 	 * mapping there and then try calling the fault routine again.
187 	 */
188 	base = p->p_brkbase;
189 	len = p->p_brksize;
190 
191 	if (addr < base || addr >= base + len) {		/* data seg? */
192 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
193 		len = p->p_stksize;
194 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
195 			/* not in either UNIX data or stack segments */
196 			res = FC_NOMAP;
197 			goto out;
198 		}
199 	}
200 
201 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
202 	/* This code is probably not needed anymore */
203 
204 	/* expand the gap to the page boundaries on each side */
205 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
206 	    ((uintptr_t)base & PAGEMASK);
207 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
208 
209 	as_rangelock(as);
210 	as_purge(as);
211 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
212 		err = as_map(as, base, len, segvn_create, zfod_argsp);
213 		as_rangeunlock(as);
214 		if (err) {
215 			res = FC_MAKE_ERR(err);
216 			goto out;
217 		}
218 	} else {
219 		/*
220 		 * This page is already mapped by another thread after we
221 		 * returned from as_fault() above.  We just fallthrough
222 		 * as_fault() below.
223 		 */
224 		as_rangeunlock(as);
225 	}
226 
227 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
228 
229 out:
230 
231 	return (res);
232 }
233 
234 /*
235  * This is the routine which defines the address limit implied
236  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
237  * mappable address in a 32-bit process on this platform (though
238  * perhaps we should make it be UINT32_MAX here?)
239  */
240 void
241 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
242 {
243 	struct proc *p = curproc;
244 	caddr_t userlimit = flags & _MAP_LOW32 ?
245 		(caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
246 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
247 }
248 
/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 * valid_va_range() and valid_usr_range() keep ranges out of
 * [hole_start, hole_end).  Neither variable is assigned in this file.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 * NOTE(review): presumably base address, size and log2 size of the window;
 * none of these are assigned or read in this file -- confirm against the
 * kpm setup code.
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;
260 
261 /*
262  * Determine whether [base, base+len] contains a mapable range of
263  * addresses at least minlen long. base and len are adjusted if
264  * required to provide a mapable range.
265  */
266 /* ARGSUSED */
267 int
268 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
269 {
270 	caddr_t hi, lo;
271 
272 	lo = *basep;
273 	hi = lo + *lenp;
274 
275 	/*
276 	 * If hi rolled over the top, try cutting back.
277 	 */
278 	if (hi < lo) {
279 		size_t newlen = 0 - (uintptr_t)lo - 1l;
280 
281 		if (newlen + (uintptr_t)hi < minlen)
282 			return (0);
283 		if (newlen < minlen)
284 			return (0);
285 		*lenp = newlen;
286 	} else if (hi - lo < minlen)
287 		return (0);
288 
289 	/*
290 	 * Deal with a possible hole in the address range between
291 	 * hole_start and hole_end that should never be mapped by the MMU.
292 	 */
293 	hi = lo + *lenp;
294 
295 	if (lo < hole_start) {
296 		if (hi > hole_start)
297 			if (hi < hole_end)
298 				hi = hole_start;
299 			else
300 				/* lo < hole_start && hi >= hole_end */
301 				if (dir == AH_LO) {
302 					/*
303 					 * prefer lowest range
304 					 */
305 					if (hole_start - lo >= minlen)
306 						hi = hole_start;
307 					else if (hi - hole_end >= minlen)
308 						lo = hole_end;
309 					else
310 						return (0);
311 				} else {
312 					/*
313 					 * prefer highest range
314 					 */
315 					if (hi - hole_end >= minlen)
316 						lo = hole_end;
317 					else if (hole_start - lo >= minlen)
318 						hi = hole_start;
319 					else
320 						return (0);
321 				}
322 	} else {
323 		/* lo >= hole_start */
324 		if (hi < hole_end)
325 			return (0);
326 		if (lo < hole_end)
327 			lo = hole_end;
328 	}
329 
330 	if (hi - lo < minlen)
331 		return (0);
332 
333 	*basep = lo;
334 	*lenp = hi - lo;
335 
336 	return (1);
337 }
338 
339 /*
340  * Determine whether [addr, addr+len] with protections `prot' are valid
341  * for a user address space.
342  */
343 /*ARGSUSED*/
344 int
345 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
346     caddr_t userlimit)
347 {
348 	caddr_t eaddr = addr + len;
349 
350 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
351 		return (RANGE_BADADDR);
352 
353 	/*
354 	 * Determine if the address range falls within an illegal
355 	 * range of the MMU.
356 	 */
357 	if (eaddr > hole_start && addr < hole_end)
358 		return (RANGE_BADADDR);
359 
360 #if defined(SF_ERRATA_57)
361 	/*
362 	 * Make sure USERLIMIT isn't raised too high
363 	 */
364 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
365 	    errata57_limit == 0);
366 
367 	if (AS_TYPE_64BIT(as) &&
368 	    (addr < errata57_limit) &&
369 	    (prot & PROT_EXEC))
370 		return (RANGE_BADPROT);
371 #endif /* SF_ERRATA57 */
372 	return (RANGE_OKAY);
373 }
374 
375 /*
376  * Routine used to check to see if an a.out can be executed
377  * by the current machine/architecture.
378  */
379 int
380 chkaout(struct exdata *exp)
381 {
382 	if (exp->ux_mach == M_SPARC)
383 		return (0);
384 	else
385 		return (ENOEXEC);
386 }
387 
388 /*
389  * The following functions return information about an a.out
390  * which is used when a program is executed.
391  */
392 
393 /*
394  * Return the load memory address for the data segment.
395  */
396 caddr_t
397 getdmem(struct exec *exp)
398 {
399 	/*
400 	 * XXX - Sparc Reference Hack approaching
401 	 * Remember that we are loading
402 	 * 8k executables into a 4k machine
403 	 * DATA_ALIGN == 2 * PAGESIZE
404 	 */
405 	if (exp->a_text)
406 		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
407 	else
408 		return ((caddr_t)USRTEXT);
409 }
410 
411 /*
412  * Return the starting disk address for the data segment.
413  */
414 ulong_t
415 getdfile(struct exec *exp)
416 {
417 	if (exp->a_magic == ZMAGIC)
418 		return (exp->a_text);
419 	else
420 		return (sizeof (struct exec) + exp->a_text);
421 }
422 
423 /*
424  * Return the load memory address for the text segment.
425  */
426 
427 /*ARGSUSED*/
428 caddr_t
429 gettmem(struct exec *exp)
430 {
431 	return ((caddr_t)USRTEXT);
432 }
433 
434 /*
435  * Return the file byte offset for the text segment.
436  */
437 uint_t
438 gettfile(struct exec *exp)
439 {
440 	if (exp->a_magic == ZMAGIC)
441 		return (0);
442 	else
443 		return (sizeof (struct exec));
444 }
445 
446 void
447 getexinfo(
448 	struct exdata *edp_in,
449 	struct exdata *edp_out,
450 	int *pagetext,
451 	int *pagedata)
452 {
453 	*edp_out = *edp_in;	/* structure copy */
454 
455 	if ((edp_in->ux_mag == ZMAGIC) &&
456 	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
457 		*pagetext = 1;
458 		*pagedata = 1;
459 	} else {
460 		*pagetext = 0;
461 		*pagedata = 0;
462 	}
463 }
464 
/*
 * Shared page size search used by the map_pgsz*() routines.
 *
 * Walks size codes n from `upper' down to, but not including, `lower'.
 * Size codes whose bit is set in disable_auto_large_pages are skipped.
 * The first remaining size code whose hw_page_array[] size fits in `len'
 * has that size stored in `pgsz' and ends the search; if none fits,
 * `pgsz' is left untouched.  `n' is a caller supplied scratch variable.
 */
#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)	\
	for ((n) = (upper); (n) > (lower); (n)--) {		\
		if (disable_auto_large_pages & (1 << (n)))		\
			continue;				\
		if (hw_page_array[(n)].hp_size <= (len)) {	\
			(pgsz) = hw_page_array[(n)].hp_size;	\
			break;					\
		}						\
	}
474 
475 
476 /*ARGSUSED*/
477 static size_t
478 map_pgszva(struct proc *p, caddr_t addr, size_t len)
479 {
480 	size_t		pgsz = MMU_PAGESIZE;
481 	int		n, upper;
482 
483 	/*
484 	 * Select the best fit page size within the constraints of
485 	 * auto_lpg_{min,max}szc.
486 	 *
487 	 * Note that we also take the heap size into account when
488 	 * deciding if we've crossed the threshold at which we should
489 	 * increase the page size.  This isn't perfect since the heap
490 	 * may not have reached its full size yet, but it's better than
491 	 * not considering it at all.
492 	 */
493 	len += p->p_brksize;
494 	if (ptob(auto_lpg_tlb_threshold) <= len) {
495 
496 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
497 
498 		/*
499 		 * Use auto_lpg_minszc - 1 as the limit so we never drop
500 		 * below auto_lpg_minszc.  We don't have a size code to refer
501 		 * to like we have for bss and stack, so we assume 0.
502 		 * auto_lpg_minszc should always be >= 0.  Using
503 		 * auto_lpg_minszc cuts off the loop.
504 		 */
505 		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
506 	}
507 
508 	return (pgsz);
509 }
510 
511 static size_t
512 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
513 {
514 	size_t		pgsz;
515 	int		n, upper, lower;
516 
517 	/*
518 	 * If len is zero, retrieve from proc and don't demote the page size.
519 	 */
520 	if (len == 0) {
521 		len = p->p_brksize;
522 	}
523 
524 	/*
525 	 * Still zero?  Then we don't have a heap yet, so pick the default
526 	 * heap size.
527 	 */
528 	if (len == 0) {
529 		pgsz = auto_lpg_heap_default;
530 	} else {
531 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
532 	}
533 
534 	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
535 		/*
536 		 * We're past the threshold, so select the best fit
537 		 * page size within the constraints of
538 		 * auto_lpg_{min,max}szc and the minimum required
539 		 * alignment.
540 		 */
541 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
542 		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
543 		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
544 	}
545 
546 	/*
547 	 * If addr == 0 we were called by memcntl() or exec_args() when the
548 	 * size code is 0.  Don't set pgsz less than current size.
549 	 */
550 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
551 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
552 	}
553 
554 	return (pgsz);
555 }
556 
557 static size_t
558 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
559 {
560 	size_t		pgsz;
561 	int		n, upper, lower;
562 
563 	/*
564 	 * If len is zero, retrieve from proc and don't demote the page size.
565 	 */
566 	if (len == 0) {
567 		len = p->p_stksize;
568 	}
569 
570 	/*
571 	 * Still zero?  Then we don't have a heap yet, so pick the default
572 	 * stack size.
573 	 */
574 	if (len == 0) {
575 		pgsz = auto_lpg_stack_default;
576 	} else {
577 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
578 	}
579 
580 	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
581 		/*
582 		 * We're past the threshold, so select the best fit
583 		 * page size within the constraints of
584 		 * auto_lpg_{min,max}szc and the minimum required
585 		 * alignment.
586 		 */
587 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
588 		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
589 		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
590 	}
591 
592 	/*
593 	 * If addr == 0 we were called by memcntl() or exec_args() when the
594 	 * size code is 0.  Don't set pgsz less than current size.
595 	 */
596 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
597 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
598 	}
599 
600 	return (pgsz);
601 }
602 
603 static size_t
604 map_pgszism(caddr_t addr, size_t len)
605 {
606 	uint_t szc;
607 	size_t pgsz;
608 	extern int disable_ism_large_pages;
609 
610 	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
611 		if (disable_ism_large_pages & (1 << szc))
612 			continue;
613 
614 		pgsz = hw_page_array[szc].hp_size;
615 		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
616 			return (pgsz);
617 	}
618 	return (DEFAULT_ISM_PAGESIZE);
619 }
620 
621 /*
622  * Suggest a page size to be used to map a segment of type maptype and length
623  * len.  Returns a page size (not a size code).
624  * If remap is non-NULL, fill in a value suggesting whether or not to remap
625  * this segment.
626  */
627 size_t
628 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
629 {
630 	size_t	pgsz = 0;
631 
632 	if (remap != NULL)
633 		*remap = (len > auto_lpg_remap_threshold);
634 
635 	switch (maptype) {
636 	case MAPPGSZ_ISM:
637 		pgsz = map_pgszism(addr, len);
638 		break;
639 
640 	case MAPPGSZ_VA:
641 		pgsz = map_pgszva(p, addr, len);
642 		break;
643 
644 	case MAPPGSZ_STK:
645 		pgsz = map_pgszstk(p, addr, len);
646 		break;
647 
648 	case MAPPGSZ_HEAP:
649 		pgsz = map_pgszheap(p, addr, len);
650 		break;
651 	}
652 	return (pgsz);
653 }
654 
655 /*
656  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
657  * KPM selects an address such that it's equal offset modulo shm_alignment and
658  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
659  */
660 int
661 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
662 {
663 	if (vac) {
664 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
665 	} else {
666 		return (0);
667 	}
668 }
669 
670 /*
671  * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
672  * can be set in platform or CPU specific code but user can change the
673  * default values via /etc/system.
674  *
675  * Initial values are defined in architecture specific mach_vm_dep.c file.
676  */
677 extern int use_text_pgsz64k;
678 extern int use_text_pgsz4m;
679 extern int use_initdata_pgsz64k;
680 
681 /*
 * disable_text_largepages and disable_initdata_largepages bitmasks are set in
 * platform or CPU specific code to disable page sizes that should not be
 * used. These variables normally shouldn't be changed via /etc/system. A
 * particular page size for text or initialized data will be used by default
 * if both the corresponding use_* variable is set to 1 AND this page size is
 * not disabled in the corresponding disable_* bitmask variable.
688  *
689  * Initial values are defined in architecture specific mach_vm_dep.c file.
690  */
691 extern int disable_text_largepages;
692 extern int disable_initdata_largepages;
693 
694 /*
695  * Minimum segment size tunables before 64K or 4M large pages
696  * should be used to map it.
697  *
698  * Initial values are defined in architecture specific mach_vm_dep.c file.
699  */
700 extern size_t text_pgsz64k_minsize;
701 extern size_t text_pgsz4m_minsize;
702 extern size_t initdata_pgsz64k_minsize;
703 
/*
 * Sanity control. Don't use large pages regardless of user
 * settings if there's less than execseg_lpg_min_physmem memory installed.
 * The units for this variable is 8K pages.
 * map_execseg_pgszcvec() returns 0 when physmem is below this value.
 */
pgcnt_t execseg_lpg_min_physmem = 131072;		/* 1GB */

/*
 * Controls consulted by map_shm_pgszcvec():
 *	disable_shm_large_pages	bit i set disables size code i
 *	shm_lpg_min_physmem	no large pages when physmem is below this
 *	max_shm_lpsize		page sizes above this are not considered
 */
extern int disable_shm_large_pages;
pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
extern size_t max_shm_lpsize;
714 
715 
716 /* assumes TTE8K...TTE4M == szc */
717 
718 static uint_t
719 map_text_pgsz4m(caddr_t addr, size_t len)
720 {
721 	caddr_t a;
722 
723 	if (len < text_pgsz4m_minsize) {
724 		return (0);
725 	}
726 
727 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
728 	if (a < addr || a >= addr + len) {
729 		return (0);
730 	}
731 	len -= (a - addr);
732 	if (len < MMU_PAGESIZE4M) {
733 		return (0);
734 	}
735 
736 	return (1 << TTE4M);
737 }
738 
739 static uint_t
740 map_text_pgsz64k(caddr_t addr, size_t len)
741 {
742 	caddr_t a;
743 	size_t svlen = len;
744 
745 	if (len < text_pgsz64k_minsize) {
746 		return (0);
747 	}
748 
749 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
750 	if (a < addr || a >= addr + len) {
751 		return (0);
752 	}
753 	len -= (a - addr);
754 	if (len < MMU_PAGESIZE64K) {
755 		return (0);
756 	}
757 	if (!use_text_pgsz4m ||
758 	    disable_text_largepages & (1 << TTE4M)) {
759 		return (1 << TTE64K);
760 	}
761 	if (svlen < text_pgsz4m_minsize) {
762 		return (1 << TTE64K);
763 	}
764 	addr = a;
765 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
766 	if (a < addr || a >= addr + len) {
767 		return (1 << TTE64K);
768 	}
769 	len -= (a - addr);
770 	if (len < MMU_PAGESIZE4M) {
771 		return (1 << TTE64K);
772 	}
773 	return ((1 << TTE4M) | (1 << TTE64K));
774 }
775 
776 static uint_t
777 map_initdata_pgsz64k(caddr_t addr, size_t len)
778 {
779 	caddr_t a;
780 
781 	if (len < initdata_pgsz64k_minsize) {
782 		return (0);
783 	}
784 
785 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
786 	if (a < addr || a >= addr + len) {
787 		return (0);
788 	}
789 	len -= (a - addr);
790 	if (len < MMU_PAGESIZE64K) {
791 		return (0);
792 	}
793 	return (1 << TTE64K);
794 }
795 
796 /*
797  * Return a bit vector of large page size codes that
798  * can be used to map [addr, addr + len) region.
799  */
800 uint_t
801 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
802 {
803 	uint_t ret = 0;
804 
805 	if (physmem < execseg_lpg_min_physmem) {
806 		return (0);
807 	}
808 
809 	if (text) {
810 		if (use_text_pgsz64k &&
811 		    !(disable_text_largepages & (1 << TTE64K))) {
812 			ret = map_text_pgsz64k(addr, len);
813 		} else if (use_text_pgsz4m &&
814 		    !(disable_text_largepages & (1 << TTE4M))) {
815 			ret = map_text_pgsz4m(addr, len);
816 		}
817 	} else if (use_initdata_pgsz64k &&
818 	    !(disable_initdata_largepages & (1 << TTE64K))) {
819 		ret = map_initdata_pgsz64k(addr, len);
820 	}
821 
822 	return (ret);
823 }
824 
825 uint_t
826 map_shm_pgszcvec(caddr_t addr, size_t size, uintptr_t off)
827 {
828 	caddr_t eaddr = addr + size;
829 	uint_t szcvec = 0;
830 	int i;
831 	caddr_t raddr;
832 	caddr_t readdr;
833 	size_t pgsz;
834 
835 	if (physmem < shm_lpg_min_physmem || mmu_page_sizes <= 1 ||
836 	    max_shm_lpsize <= MMU_PAGESIZE) {
837 		return (0);
838 	}
839 
840 	for (i = mmu_page_sizes - 1; i > 0; i--) {
841 		if (disable_shm_large_pages & (1 << i)) {
842 			continue;
843 		}
844 		pgsz = page_get_pagesize(i);
845 		if (pgsz > max_shm_lpsize) {
846 			continue;
847 		}
848 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
849 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
850 		if (raddr < addr || raddr >= readdr) {
851 			continue;
852 		}
853 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
854 			continue;
855 		}
856 		szcvec |= (1 << i);
857 		/*
858 		 * And or in the remaining enabled page sizes.
859 		 */
860 		szcvec |= P2PHASE(~disable_shm_large_pages, (1 << i));
861 		szcvec &= ~1; /* no need to return 8K pagesize */
862 		break;
863 	}
864 	return (szcvec);
865 }
866 
/*
 * Size of a page of the given size code, expressed in pages of size
 * code 0: the hw_page_array[] byte size shifted right by the size code 0
 * page shift.
 */
#define	PNUM_SIZE(size_code)						\
	(hw_page_array[size_code].hp_size >> hw_page_array[0].hp_shift)
869 
870 /*
871  * Anchored in the table below are counters used to keep track
872  * of free contiguous physical memory. Each element of the table contains
873  * the array of counters, the size of array which is allocated during
874  * startup based on physmax and a shift value used to convert a pagenum
875  * into a counter array index or vice versa. The table has page size
876  * for rows and region size for columns:
877  *
878  *	page_counters[page_size][region_size]
879  *
880  *	page_size: 	TTE size code of pages on page_size freelist.
881  *
 *	region_size:	TTE size code of a candidate larger page
 *			made up of contiguous free page_size pages.
884  *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 * 	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.	So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */
897  */
898 
/*
 * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 *
 * Indexed as page_freelists[szc][mtype][mnode][bin]; the memnode
 * dimension is set up by ndata_alloc_page_freelists() and the bins by
 * alloc_page_freelists(), page_get_pagecolors(szc) bins per size code.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 *
 * Indexed as page_cachelists[mtype][mnode][bin] with page_colors bins.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

/*
 * Free list and cache list mutex arrays.  Each of the NPC_MUTEX entries
 * is pointed at storage for max_mem_nodes kmutex_t by
 * ndata_alloc_page_freelists().
 */
kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];
913 
914 caddr_t
915 alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
916 {
917 	int	mtype;
918 	uint_t	szc;
919 
920 	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
921 
922 	/*
923 	 * We only support small pages in the cachelist.
924 	 */
925 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
926 		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
927 		alloc_base += (sizeof (page_t *) * page_colors);
928 		/*
929 		 * Allocate freelists bins for all
930 		 * supported page sizes.
931 		 */
932 		for (szc = 0; szc < mmu_page_sizes; szc++) {
933 			page_freelists[szc][mtype][mnode] =
934 			    (page_t **)alloc_base;
935 			alloc_base += ((sizeof (page_t *) *
936 			    page_get_pagecolors(szc)));
937 		}
938 	}
939 
940 	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
941 
942 	return (alloc_base);
943 }
944 
945 /*
946  * Allocate page_freelists bin headers for a memnode from the
947  * nucleus data area. This is the first time that mmu_page_sizes is
948  * used during sun4u bootup, so check mmu_page_sizes initialization.
949  */
950 int
951 ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
952 {
953 	size_t alloc_sz;
954 	caddr_t alloc_base;
955 	caddr_t end;
956 	int	mtype;
957 	uint_t	szc;
958 	int32_t allp = 0;
959 
960 	if (&mmu_init_mmu_page_sizes) {
961 		if (!mmu_init_mmu_page_sizes(allp)) {
962 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
963 			    mmu_page_sizes);
964 		}
965 	}
966 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
967 
968 	/* first time called - allocate max_mem_nodes dimension */
969 	if (mnode == 0) {
970 		int	i;
971 
972 		/* page_cachelists */
973 		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
974 		    sizeof (page_t **);
975 
976 		/* page_freelists */
977 		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
978 		    sizeof (page_t **);
979 
980 		/* fpc_mutex and cpc_mutex */
981 		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
982 
983 		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
984 		if (alloc_base == NULL)
985 			return (-1);
986 
987 		ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
988 
989 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
990 			page_cachelists[mtype] = (page_t ***)alloc_base;
991 			alloc_base += (max_mem_nodes * sizeof (page_t **));
992 			for (szc = 0; szc < mmu_page_sizes; szc++) {
993 				page_freelists[szc][mtype] =
994 				    (page_t ***)alloc_base;
995 				alloc_base += (max_mem_nodes *
996 				    sizeof (page_t **));
997 			}
998 		}
999 		for (i = 0; i < NPC_MUTEX; i++) {
1000 			fpc_mutex[i] = (kmutex_t *)alloc_base;
1001 			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
1002 			cpc_mutex[i] = (kmutex_t *)alloc_base;
1003 			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
1004 		}
1005 		alloc_sz = 0;
1006 	}
1007 
1008 	/*
1009 	 * Calculate the size needed by alloc_page_freelists().
1010 	 */
1011 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
1012 		alloc_sz += sizeof (page_t *) * page_colors;
1013 
1014 		for (szc = 0; szc < mmu_page_sizes; szc++)
1015 			alloc_sz += sizeof (page_t *) *
1016 			    page_get_pagecolors(szc);
1017 	}
1018 
1019 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
1020 	if (alloc_base == NULL)
1021 		return (-1);
1022 
1023 	end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
1024 	ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
1025 	    ecache_alignsize));
1026 
1027 	return (0);
1028 }
1029 
1030 /*
1031  * To select our starting bin, we stride through the bins with a stride
1032  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
1033  * in simulation and practice for different workloads on varying cache sizes.
1034  */
1035 uint32_t color_start_current = 0;
1036 uint32_t color_start_stride = 337;
1037 int color_start_random = 0;
1038 
1039 /* ARGSUSED */
1040 uint_t
1041 get_color_start(struct as *as)
1042 {
1043 	uint32_t old, new;
1044 
1045 	if (consistent_coloring == 2 || color_start_random) {
1046 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
1047 		    page_colors_mask));
1048 	}
1049 
1050 	do {
1051 		old = color_start_current;
1052 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
1053 	} while (cas32(&color_start_current, old, new) != old);
1054 
1055 	return ((uint_t)(new));
1056 }
1057 
1058 /*
1059  * Called once at startup from kphysm_init() -- before memialloc()
1060  * is invoked to do the 1st page_free()/page_freelist_add().
1061  *
1062  * initializes page_colors and page_colors_mask based on ecache_setsize.
1063  *
1064  * Also initializes the counter locks.
1065  */
1066 void
1067 page_coloring_init()
1068 {
1069 	int	a;
1070 
1071 	if (do_pg_coloring == 0) {
1072 		page_colors = 1;
1073 		return;
1074 	}
1075 
1076 	/*
1077 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
1078 	 * the max ecache setsize of all cpus configured in the system or, for
1079 	 * cheetah+ systems, the max possible ecache setsize for all possible
1080 	 * cheetah+ cpus.
1081 	 */
1082 	page_colors = ecache_setsize / MMU_PAGESIZE;
1083 	page_colors_mask = page_colors - 1;
1084 
1085 	/*
1086 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
1087 	 * cpu_page_colors set to -1 during DR operation or during startup
1088 	 * if setsizes are heterogenous.
1089 	 *
1090 	 * The value of cpu_page_colors determines if additional color bins
1091 	 * need to be checked for a particular color in the page_get routines.
1092 	 */
1093 	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize))
1094 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
1095 
1096 	vac_colors = vac_size / MMU_PAGESIZE;
1097 	vac_colors_mask = vac_colors -1;
1098 
1099 	page_coloring_shift = 0;
1100 	a = ecache_setsize;
1101 	while (a >>= 1) {
1102 		page_coloring_shift++;
1103 	}
1104 }
1105 
1106 int
1107 bp_color(struct buf *bp)
1108 {
1109 	int color = -1;
1110 
1111 	if (vac) {
1112 		if ((bp->b_flags & B_PAGEIO) != 0) {
1113 			color = sfmmu_get_ppvcolor(bp->b_pages);
1114 		} else if (bp->b_un.b_addr != NULL) {
1115 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
1116 		}
1117 	}
1118 	return (color < 0 ? 0 : ptob(color));
1119 }
1120 
1121 /*
1122  * Create & Initialise pageout scanner thread. The thread has to
1123  * start at procedure with process pp and priority pri.
1124  */
1125 void
1126 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
1127 {
1128 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
1129 }
1130 
/*
 * Flush the D-cache when performing module relocations to an alternate
 * mapping.  According to the original note this is stubbed out on all
 * platforms except sun4u, at least for now.  Here it simply forwards to
 * sfmmu_cache_flushall().
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}
1141 
/*
 * Report whether [va1, va1 + sz1) and [va2, va2 + sz2) overlap.
 *
 * Returns 0 when one range starts strictly below the other and ends at
 * or before the other's start, 1 in every other case -- including two
 * ranges that share the same start address.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	int first_below = (va1 < va2 && va1 + sz1 <= va2);
	int second_below = (va2 < va1 && va2 + sz2 <= va1);

	return (!(first_below || second_below));
}
1153 
1154 /*
1155  * Return the number of bytes, relative to the beginning of a given range, that
1156  * are non-toxic (can be read from and written to with relative impunity).
1157  */
1158 size_t
1159 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1160 {
1161 	/* OBP reads are harmless, but we don't want people writing there */
1162 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1163 	    OFW_START_ADDR + 1))
1164 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1165 
1166 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1167 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1168 
1169 	return (sz); /* no overlap */
1170 }
1171 
1172 /*
1173  * Minimum physmem required for enabling large pages for kernel heap
1174  * Currently we do not enable lp for kmem on systems with less
1175  * than 1GB of memory. This value can be changed via /etc/system
1176  */
1177 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
1178 
1179 /*
1180  * this function chooses large page size for kernel heap
1181  */
1182 size_t
1183 get_segkmem_lpsize(size_t lpsize)
1184 {
1185 	size_t memtotal = physmem * PAGESIZE;
1186 	size_t mmusz;
1187 	uint_t szc;
1188 	extern int disable_large_pages;
1189 
1190 	if (memtotal < segkmem_lpminphysmem)
1191 		return (PAGESIZE);
1192 
1193 	if (plat_lpkmem_is_supported != NULL &&
1194 	    plat_lpkmem_is_supported() == 0)
1195 		return (PAGESIZE);
1196 
1197 	mmusz = mmu_get_kernel_lpsize(lpsize);
1198 	szc = page_szc(mmusz);
1199 
1200 	while (szc) {
1201 		if (!(disable_large_pages & (1 << szc)))
1202 			return (page_get_pagesize(szc));
1203 		szc--;
1204 	}
1205 	return (PAGESIZE);
1206 }
1207