xref: /titanic_52/usr/src/uts/sun4/vm/vm_dep.c (revision 3906e0c22bea9bf690c20f62b0575c1b1d0ace2e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * UNIX machine dependent virtual memory support.
30  */
31 
32 #include <sys/vm.h>
33 #include <sys/exec.h>
34 
35 #include <sys/exechdr.h>
36 #include <vm/seg_kmem.h>
37 #include <sys/atomic.h>
38 #include <sys/archsystm.h>
39 #include <sys/machsystm.h>
40 #include <sys/kdi.h>
41 #include <sys/cpu_module.h>
42 
43 #include <vm/hat_sfmmu.h>
44 
45 #include <sys/memnode.h>
46 
47 #include <sys/mem_config.h>
48 #include <sys/mem_cage.h>
49 #include <vm/vm_dep.h>
50 #include <vm/page.h>
51 #include <sys/platform_module.h>
52 
53 /*
54  * These variables are set by module specific config routines.
55  * They are only set by modules which will use physical cache page coloring
56  * and/or virtual cache page coloring.
57  */
58 int do_pg_coloring = 0;
59 int do_virtual_coloring = 0;
60 
61 /*
62  * These variables can be conveniently patched at kernel load time to
63  * prevent do_pg_coloring or do_virtual_coloring from being enabled by
64  * module specific config routines.
65  */
66 
67 int use_page_coloring = 1;
68 int use_virtual_coloring = 1;
69 
70 /*
71  * initialized by page_coloring_init()
72  */
73 extern uint_t page_colors;
74 extern uint_t page_colors_mask;
75 extern uint_t page_coloring_shift;
76 int cpu_page_colors;
77 uint_t vac_colors = 0;
78 uint_t vac_colors_mask = 0;
79 
80 /* cpu specific coloring initialization */
81 extern void page_coloring_init_cpu();
82 #pragma weak page_coloring_init_cpu
83 
84 /*
85  * get the ecache setsize for the current cpu.
86  */
87 #define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
88 
89 plcnt_t		plcnt;		/* page list count */
90 
91 /*
92  * This variable is set by the cpu module to contain the lowest
93  * address not affected by the SF_ERRATA_57 workaround.  It should
94  * remain 0 if the workaround is not needed.
95  */
96 #if defined(SF_ERRATA_57)
97 caddr_t errata57_limit;
98 #endif
99 
100 extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */
101 
102 extern void page_relocate_hash(page_t *, page_t *);
103 
104 /*
105  * these must be defined in platform specific areas
106  */
107 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
108 	struct proc *, uint_t);
109 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
110 	caddr_t, size_t, uint_t, struct lgrp *);
111 /*
112  * Convert page frame number to an OBMEM page frame number
113  * (i.e. put in the type bits -- zero for this implementation)
114  */
115 pfn_t
116 impl_obmem_pfnum(pfn_t pf)
117 {
118 	return (pf);
119 }
120 
121 /*
122  * Use physmax to determine the highest physical page of DRAM memory
123  * It is assumed that any physical addresses above physmax is in IO space.
124  * We don't bother checking the low end because we assume that memory space
125  * begins at physical page frame 0.
126  *
127  * Return 1 if the page frame is onboard DRAM memory, else 0.
128  * Returns 0 for nvram so it won't be cached.
129  */
130 int
131 pf_is_memory(pfn_t pf)
132 {
133 	/* We must be IO space */
134 	if (pf > physmax)
135 		return (0);
136 
137 	/* We must be memory space */
138 	return (1);
139 }
140 
141 /*
142  * Handle a pagefault.
143  */
144 faultcode_t
145 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
146 {
147 	struct as *as;
148 	struct proc *p;
149 	faultcode_t res;
150 	caddr_t base;
151 	size_t len;
152 	int err;
153 
154 	if (INVALID_VADDR(addr))
155 		return (FC_NOMAP);
156 
157 	if (iskernel) {
158 		as = &kas;
159 	} else {
160 		p = curproc;
161 		as = p->p_as;
162 #if defined(SF_ERRATA_57)
163 		/*
164 		 * Prevent infinite loops due to a segment driver
165 		 * setting the execute permissions and the sfmmu hat
166 		 * silently ignoring them.
167 		 */
168 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
169 		    addr < errata57_limit) {
170 			res = FC_NOMAP;
171 			goto out;
172 		}
173 #endif
174 	}
175 
176 	/*
177 	 * Dispatch pagefault.
178 	 */
179 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
180 
181 	/*
182 	 * If this isn't a potential unmapped hole in the user's
183 	 * UNIX data or stack segments, just return status info.
184 	 */
185 	if (!(res == FC_NOMAP && iskernel == 0))
186 		goto out;
187 
188 	/*
189 	 * Check to see if we happened to faulted on a currently unmapped
190 	 * part of the UNIX data or stack segments.  If so, create a zfod
191 	 * mapping there and then try calling the fault routine again.
192 	 */
193 	base = p->p_brkbase;
194 	len = p->p_brksize;
195 
196 	if (addr < base || addr >= base + len) {		/* data seg? */
197 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
198 		len = p->p_stksize;
199 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
200 			/* not in either UNIX data or stack segments */
201 			res = FC_NOMAP;
202 			goto out;
203 		}
204 	}
205 
206 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
207 	/* This code is probably not needed anymore */
208 
209 	/* expand the gap to the page boundaries on each side */
210 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
211 	    ((uintptr_t)base & PAGEMASK);
212 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
213 
214 	as_rangelock(as);
215 	as_purge(as);
216 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
217 		err = as_map(as, base, len, segvn_create, zfod_argsp);
218 		as_rangeunlock(as);
219 		if (err) {
220 			res = FC_MAKE_ERR(err);
221 			goto out;
222 		}
223 	} else {
224 		/*
225 		 * This page is already mapped by another thread after we
226 		 * returned from as_fault() above.  We just fallthrough
227 		 * as_fault() below.
228 		 */
229 		as_rangeunlock(as);
230 	}
231 
232 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
233 
234 out:
235 
236 	return (res);
237 }
238 
239 /*
240  * This is the routine which defines the address limit implied
241  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
242  * mappable address in a 32-bit process on this platform (though
243  * perhaps we should make it be UINT32_MAX here?)
244  */
245 void
246 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
247 {
248 	struct proc *p = curproc;
249 	caddr_t userlimit = flags & _MAP_LOW32 ?
250 		(caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
251 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
252 }
253 
254 /*
255  * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
256  */
257 caddr_t	hole_start, hole_end;
258 
259 /*
260  * kpm mapping window
261  */
262 caddr_t kpm_vbase;
263 size_t  kpm_size;
264 uchar_t kpm_size_shift;
265 
266 /*
267  * Determine whether [base, base+len] contains a mapable range of
268  * addresses at least minlen long. base and len are adjusted if
269  * required to provide a mapable range.
270  */
271 /* ARGSUSED */
272 int
273 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
274 {
275 	caddr_t hi, lo;
276 
277 	lo = *basep;
278 	hi = lo + *lenp;
279 
280 	/*
281 	 * If hi rolled over the top, try cutting back.
282 	 */
283 	if (hi < lo) {
284 		size_t newlen = 0 - (uintptr_t)lo - 1l;
285 
286 		if (newlen + (uintptr_t)hi < minlen)
287 			return (0);
288 		if (newlen < minlen)
289 			return (0);
290 		*lenp = newlen;
291 	} else if (hi - lo < minlen)
292 		return (0);
293 
294 	/*
295 	 * Deal with a possible hole in the address range between
296 	 * hole_start and hole_end that should never be mapped by the MMU.
297 	 */
298 	hi = lo + *lenp;
299 
300 	if (lo < hole_start) {
301 		if (hi > hole_start)
302 			if (hi < hole_end)
303 				hi = hole_start;
304 			else
305 				/* lo < hole_start && hi >= hole_end */
306 				if (dir == AH_LO) {
307 					/*
308 					 * prefer lowest range
309 					 */
310 					if (hole_start - lo >= minlen)
311 						hi = hole_start;
312 					else if (hi - hole_end >= minlen)
313 						lo = hole_end;
314 					else
315 						return (0);
316 				} else {
317 					/*
318 					 * prefer highest range
319 					 */
320 					if (hi - hole_end >= minlen)
321 						lo = hole_end;
322 					else if (hole_start - lo >= minlen)
323 						hi = hole_start;
324 					else
325 						return (0);
326 				}
327 	} else {
328 		/* lo >= hole_start */
329 		if (hi < hole_end)
330 			return (0);
331 		if (lo < hole_end)
332 			lo = hole_end;
333 	}
334 
335 	if (hi - lo < minlen)
336 		return (0);
337 
338 	*basep = lo;
339 	*lenp = hi - lo;
340 
341 	return (1);
342 }
343 
344 /*
345  * Determine whether [addr, addr+len] with protections `prot' are valid
346  * for a user address space.
347  */
348 /*ARGSUSED*/
349 int
350 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
351     caddr_t userlimit)
352 {
353 	caddr_t eaddr = addr + len;
354 
355 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
356 		return (RANGE_BADADDR);
357 
358 	/*
359 	 * Determine if the address range falls within an illegal
360 	 * range of the MMU.
361 	 */
362 	if (eaddr > hole_start && addr < hole_end)
363 		return (RANGE_BADADDR);
364 
365 #if defined(SF_ERRATA_57)
366 	/*
367 	 * Make sure USERLIMIT isn't raised too high
368 	 */
369 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
370 	    errata57_limit == 0);
371 
372 	if (AS_TYPE_64BIT(as) &&
373 	    (addr < errata57_limit) &&
374 	    (prot & PROT_EXEC))
375 		return (RANGE_BADPROT);
376 #endif /* SF_ERRATA57 */
377 	return (RANGE_OKAY);
378 }
379 
380 /*
381  * Routine used to check to see if an a.out can be executed
382  * by the current machine/architecture.
383  */
384 int
385 chkaout(struct exdata *exp)
386 {
387 	if (exp->ux_mach == M_SPARC)
388 		return (0);
389 	else
390 		return (ENOEXEC);
391 }
392 
393 /*
394  * The following functions return information about an a.out
395  * which is used when a program is executed.
396  */
397 
398 /*
399  * Return the load memory address for the data segment.
400  */
401 caddr_t
402 getdmem(struct exec *exp)
403 {
404 	/*
405 	 * XXX - Sparc Reference Hack approaching
406 	 * Remember that we are loading
407 	 * 8k executables into a 4k machine
408 	 * DATA_ALIGN == 2 * PAGESIZE
409 	 */
410 	if (exp->a_text)
411 		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
412 	else
413 		return ((caddr_t)USRTEXT);
414 }
415 
416 /*
417  * Return the starting disk address for the data segment.
418  */
419 ulong_t
420 getdfile(struct exec *exp)
421 {
422 	if (exp->a_magic == ZMAGIC)
423 		return (exp->a_text);
424 	else
425 		return (sizeof (struct exec) + exp->a_text);
426 }
427 
428 /*
429  * Return the load memory address for the text segment.
430  */
431 
432 /*ARGSUSED*/
433 caddr_t
434 gettmem(struct exec *exp)
435 {
436 	return ((caddr_t)USRTEXT);
437 }
438 
439 /*
440  * Return the file byte offset for the text segment.
441  */
442 uint_t
443 gettfile(struct exec *exp)
444 {
445 	if (exp->a_magic == ZMAGIC)
446 		return (0);
447 	else
448 		return (sizeof (struct exec));
449 }
450 
451 void
452 getexinfo(
453 	struct exdata *edp_in,
454 	struct exdata *edp_out,
455 	int *pagetext,
456 	int *pagedata)
457 {
458 	*edp_out = *edp_in;	/* structure copy */
459 
460 	if ((edp_in->ux_mag == ZMAGIC) &&
461 	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
462 		*pagetext = 1;
463 		*pagedata = 1;
464 	} else {
465 		*pagetext = 0;
466 		*pagedata = 0;
467 	}
468 }
469 
/*
 * Walk size codes from upper down to (but not including) lower and set
 * pgsz to the size of the first one that is not masked off in
 * disable_auto_large_pages and whose page size does not exceed len.
 * pgsz is left untouched when no size code qualifies.  n is a
 * caller-supplied scratch loop variable.
 */
#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)			\
	for ((n) = (upper); (n) > (lower); (n)--) {			\
		if ((disable_auto_large_pages & (1 << (n))) == 0 &&	\
		    hw_page_array[(n)].hp_size <= (len)) {		\
			(pgsz) = hw_page_array[(n)].hp_size;		\
			break;						\
		}							\
	}
479 
480 
481 /*ARGSUSED*/
482 static size_t
483 map_pgszva(struct proc *p, caddr_t addr, size_t len)
484 {
485 	size_t		pgsz = MMU_PAGESIZE;
486 	int		n, upper;
487 
488 	/*
489 	 * Select the best fit page size within the constraints of
490 	 * auto_lpg_{min,max}szc.
491 	 *
492 	 * Note that we also take the heap size into account when
493 	 * deciding if we've crossed the threshold at which we should
494 	 * increase the page size.  This isn't perfect since the heap
495 	 * may not have reached its full size yet, but it's better than
496 	 * not considering it at all.
497 	 */
498 	len += p->p_brksize;
499 	if (ptob(auto_lpg_tlb_threshold) <= len) {
500 
501 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
502 
503 		/*
504 		 * Use auto_lpg_minszc - 1 as the limit so we never drop
505 		 * below auto_lpg_minszc.  We don't have a size code to refer
506 		 * to like we have for bss and stack, so we assume 0.
507 		 * auto_lpg_minszc should always be >= 0.  Using
508 		 * auto_lpg_minszc cuts off the loop.
509 		 */
510 		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
511 	}
512 
513 	return (pgsz);
514 }
515 
516 static size_t
517 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
518 {
519 	size_t		pgsz;
520 	int		n, upper, lower;
521 
522 	/*
523 	 * If len is zero, retrieve from proc and don't demote the page size.
524 	 */
525 	if (len == 0) {
526 		len = p->p_brksize;
527 	}
528 
529 	/*
530 	 * Still zero?  Then we don't have a heap yet, so pick the default
531 	 * heap size.
532 	 */
533 	if (len == 0) {
534 		pgsz = auto_lpg_heap_default;
535 	} else {
536 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
537 	}
538 
539 	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
540 		/*
541 		 * We're past the threshold, so select the best fit
542 		 * page size within the constraints of
543 		 * auto_lpg_{min,max}szc and the minimum required
544 		 * alignment.
545 		 */
546 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
547 		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
548 		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
549 	}
550 
551 	/*
552 	 * If addr == 0 we were called by memcntl() or exec_args() when the
553 	 * size code is 0.  Don't set pgsz less than current size.
554 	 */
555 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
556 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
557 	}
558 
559 	return (pgsz);
560 }
561 
562 static size_t
563 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
564 {
565 	size_t		pgsz;
566 	int		n, upper, lower;
567 
568 	/*
569 	 * If len is zero, retrieve from proc and don't demote the page size.
570 	 */
571 	if (len == 0) {
572 		len = p->p_stksize;
573 	}
574 
575 	/*
576 	 * Still zero?  Then we don't have a heap yet, so pick the default
577 	 * stack size.
578 	 */
579 	if (len == 0) {
580 		pgsz = auto_lpg_stack_default;
581 	} else {
582 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
583 	}
584 
585 	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
586 		/*
587 		 * We're past the threshold, so select the best fit
588 		 * page size within the constraints of
589 		 * auto_lpg_{min,max}szc and the minimum required
590 		 * alignment.
591 		 */
592 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
593 		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
594 		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
595 	}
596 
597 	/*
598 	 * If addr == 0 we were called by memcntl() or exec_args() when the
599 	 * size code is 0.  Don't set pgsz less than current size.
600 	 */
601 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
602 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
603 	}
604 
605 	return (pgsz);
606 }
607 
608 static size_t
609 map_pgszism(caddr_t addr, size_t len)
610 {
611 	uint_t szc;
612 	size_t pgsz;
613 	extern int disable_ism_large_pages;
614 
615 	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
616 		if (disable_ism_large_pages & (1 << szc))
617 			continue;
618 
619 		pgsz = hw_page_array[szc].hp_size;
620 		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
621 			return (pgsz);
622 	}
623 	return (DEFAULT_ISM_PAGESIZE);
624 }
625 
626 /*
627  * Suggest a page size to be used to map a segment of type maptype and length
628  * len.  Returns a page size (not a size code).
629  * If remap is non-NULL, fill in a value suggesting whether or not to remap
630  * this segment.
631  */
632 size_t
633 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
634 {
635 	size_t	pgsz = 0;
636 
637 	if (remap != NULL)
638 		*remap = (len > auto_lpg_remap_threshold);
639 
640 	switch (maptype) {
641 	case MAPPGSZ_ISM:
642 		pgsz = map_pgszism(addr, len);
643 		break;
644 
645 	case MAPPGSZ_VA:
646 		pgsz = map_pgszva(p, addr, len);
647 		break;
648 
649 	case MAPPGSZ_STK:
650 		pgsz = map_pgszstk(p, addr, len);
651 		break;
652 
653 	case MAPPGSZ_HEAP:
654 		pgsz = map_pgszheap(p, addr, len);
655 		break;
656 	}
657 	return (pgsz);
658 }
659 
660 /*
661  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
662  * KPM selects an address such that it's equal offset modulo shm_alignment and
663  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
664  */
665 int
666 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
667 {
668 	if (vac) {
669 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
670 	} else {
671 		return (0);
672 	}
673 }
674 
675 /*
676  * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
677  * can be set in platform or CPU specific code but user can change the
678  * default values via /etc/system.
679  *
680  * Initial values are defined in architecture specific mach_vm_dep.c file.
681  */
682 extern int use_text_pgsz64k;
683 extern int use_text_pgsz4m;
684 extern int use_initdata_pgsz64k;
685 
/*
 * The disable_text_largepages and disable_initdata_largepages bitmasks are
 * set in platform or CPU specific code to disable page sizes that should not
 * be used. These variables normally shouldn't be changed via /etc/system. A
 * particular page size for text or initialized data will be used by default
 * if both the corresponding use_* variable is set to 1 AND this page size is
 * not disabled in the corresponding disable_* bitmask variable.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
696 extern int disable_text_largepages;
697 extern int disable_initdata_largepages;
698 
699 /*
700  * Minimum segment size tunables before 64K or 4M large pages
701  * should be used to map it.
702  *
703  * Initial values are defined in architecture specific mach_vm_dep.c file.
704  */
705 extern size_t text_pgsz64k_minsize;
706 extern size_t text_pgsz4m_minsize;
707 extern size_t initdata_pgsz64k_minsize;
708 
709 /*
710  * Sanity control. Don't use large pages regardless of user
711  * settings if there's less than execseg_lpg_min_physmem memory installed.
712  * The units for this variable is 8K pages.
713  */
714 pgcnt_t execseg_lpg_min_physmem = 131072;		/* 1GB */
715 
716 extern int disable_shm_large_pages;
717 pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
718 extern size_t max_shm_lpsize;
719 
720 
721 /* assumes TTE8K...TTE4M == szc */
722 
723 static uint_t
724 map_text_pgsz4m(caddr_t addr, size_t len)
725 {
726 	caddr_t a;
727 
728 	if (len < text_pgsz4m_minsize) {
729 		return (0);
730 	}
731 
732 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
733 	if (a < addr || a >= addr + len) {
734 		return (0);
735 	}
736 	len -= (a - addr);
737 	if (len < MMU_PAGESIZE4M) {
738 		return (0);
739 	}
740 
741 	return (1 << TTE4M);
742 }
743 
744 static uint_t
745 map_text_pgsz64k(caddr_t addr, size_t len)
746 {
747 	caddr_t a;
748 	size_t svlen = len;
749 
750 	if (len < text_pgsz64k_minsize) {
751 		return (0);
752 	}
753 
754 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
755 	if (a < addr || a >= addr + len) {
756 		return (0);
757 	}
758 	len -= (a - addr);
759 	if (len < MMU_PAGESIZE64K) {
760 		return (0);
761 	}
762 	if (!use_text_pgsz4m ||
763 	    disable_text_largepages & (1 << TTE4M)) {
764 		return (1 << TTE64K);
765 	}
766 	if (svlen < text_pgsz4m_minsize) {
767 		return (1 << TTE64K);
768 	}
769 	addr = a;
770 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
771 	if (a < addr || a >= addr + len) {
772 		return (1 << TTE64K);
773 	}
774 	len -= (a - addr);
775 	if (len < MMU_PAGESIZE4M) {
776 		return (1 << TTE64K);
777 	}
778 	return ((1 << TTE4M) | (1 << TTE64K));
779 }
780 
781 static uint_t
782 map_initdata_pgsz64k(caddr_t addr, size_t len)
783 {
784 	caddr_t a;
785 
786 	if (len < initdata_pgsz64k_minsize) {
787 		return (0);
788 	}
789 
790 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
791 	if (a < addr || a >= addr + len) {
792 		return (0);
793 	}
794 	len -= (a - addr);
795 	if (len < MMU_PAGESIZE64K) {
796 		return (0);
797 	}
798 	return (1 << TTE64K);
799 }
800 
801 /*
802  * Return a bit vector of large page size codes that
803  * can be used to map [addr, addr + len) region.
804  */
805 uint_t
806 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
807 {
808 	uint_t ret = 0;
809 
810 	if (physmem < execseg_lpg_min_physmem) {
811 		return (0);
812 	}
813 
814 	if (text) {
815 		if (use_text_pgsz64k &&
816 		    !(disable_text_largepages & (1 << TTE64K))) {
817 			ret = map_text_pgsz64k(addr, len);
818 		} else if (use_text_pgsz4m &&
819 		    !(disable_text_largepages & (1 << TTE4M))) {
820 			ret = map_text_pgsz4m(addr, len);
821 		}
822 	} else if (use_initdata_pgsz64k &&
823 	    !(disable_initdata_largepages & (1 << TTE64K))) {
824 		ret = map_initdata_pgsz64k(addr, len);
825 	}
826 
827 	return (ret);
828 }
829 
830 uint_t
831 map_shm_pgszcvec(caddr_t addr, size_t size, uintptr_t off)
832 {
833 	caddr_t eaddr = addr + size;
834 	uint_t szcvec = 0;
835 	int i;
836 	caddr_t raddr;
837 	caddr_t readdr;
838 	size_t pgsz;
839 
840 	if (physmem < shm_lpg_min_physmem || mmu_page_sizes <= 1 ||
841 	    max_shm_lpsize <= MMU_PAGESIZE) {
842 		return (0);
843 	}
844 
845 	for (i = mmu_page_sizes - 1; i > 0; i--) {
846 		if (disable_shm_large_pages & (1 << i)) {
847 			continue;
848 		}
849 		pgsz = page_get_pagesize(i);
850 		if (pgsz > max_shm_lpsize) {
851 			continue;
852 		}
853 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
854 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
855 		if (raddr < addr || raddr >= readdr) {
856 			continue;
857 		}
858 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
859 			continue;
860 		}
861 		szcvec |= (1 << i);
862 		/*
863 		 * And or in the remaining enabled page sizes.
864 		 */
865 		szcvec |= P2PHASE(~disable_shm_large_pages, (1 << i));
866 		szcvec &= ~1; /* no need to return 8K pagesize */
867 		break;
868 	}
869 	return (szcvec);
870 }
871 
/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory. Each element of the table contains
 * the array of counters, the size of array which is allocated during
 * startup based on physmax and a shift value used to convert a pagenum
 * into a counter array index or vice versa. The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size: 	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page
 *			made up of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 * 	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.	So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */
900 
901 /*
902  * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
903  * dimensions are allocated dynamically.
904  */
905 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
906 
907 /*
908  * For now there is only a single size cache list.
909  * Allocated dynamically.
910  */
911 page_t ***page_cachelists[MAX_MEM_TYPES];
912 
913 kmutex_t *fpc_mutex[NPC_MUTEX];
914 kmutex_t *cpc_mutex[NPC_MUTEX];
915 
916 caddr_t
917 alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
918 {
919 	int	mtype;
920 	uint_t	szc;
921 
922 	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
923 
924 	/*
925 	 * We only support small pages in the cachelist.
926 	 */
927 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
928 		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
929 		alloc_base += (sizeof (page_t *) * page_get_pagecolors(0));
930 		/*
931 		 * Allocate freelists bins for all
932 		 * supported page sizes.
933 		 */
934 		for (szc = 0; szc < mmu_page_sizes; szc++) {
935 			page_freelists[szc][mtype][mnode] =
936 			    (page_t **)alloc_base;
937 			alloc_base += ((sizeof (page_t *) *
938 			    page_get_pagecolors(szc)));
939 		}
940 	}
941 
942 	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
943 
944 	return (alloc_base);
945 }
946 
947 /*
948  * Allocate page_freelists bin headers for a memnode from the
949  * nucleus data area. This is the first time that mmu_page_sizes is
950  * used during sun4u bootup, so check mmu_page_sizes initialization.
951  */
952 int
953 ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
954 {
955 	size_t alloc_sz;
956 	caddr_t alloc_base;
957 	caddr_t end;
958 	int	mtype;
959 	uint_t	szc;
960 	int32_t allp = 0;
961 
962 	if (&mmu_init_mmu_page_sizes) {
963 		if (!mmu_init_mmu_page_sizes(allp)) {
964 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
965 			    mmu_page_sizes);
966 		}
967 	}
968 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
969 
970 	/* first time called - allocate max_mem_nodes dimension */
971 	if (mnode == 0) {
972 		int	i;
973 
974 		/* page_cachelists */
975 		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
976 		    sizeof (page_t **);
977 
978 		/* page_freelists */
979 		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
980 		    sizeof (page_t **);
981 
982 		/* fpc_mutex and cpc_mutex */
983 		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
984 
985 		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
986 		if (alloc_base == NULL)
987 			return (-1);
988 
989 		ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
990 
991 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
992 			page_cachelists[mtype] = (page_t ***)alloc_base;
993 			alloc_base += (max_mem_nodes * sizeof (page_t **));
994 			for (szc = 0; szc < mmu_page_sizes; szc++) {
995 				page_freelists[szc][mtype] =
996 				    (page_t ***)alloc_base;
997 				alloc_base += (max_mem_nodes *
998 				    sizeof (page_t **));
999 			}
1000 		}
1001 		for (i = 0; i < NPC_MUTEX; i++) {
1002 			fpc_mutex[i] = (kmutex_t *)alloc_base;
1003 			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
1004 			cpc_mutex[i] = (kmutex_t *)alloc_base;
1005 			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
1006 		}
1007 		alloc_sz = 0;
1008 	}
1009 
1010 	/*
1011 	 * Calculate the size needed by alloc_page_freelists().
1012 	 */
1013 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
1014 		alloc_sz += sizeof (page_t *) * page_get_pagecolors(0);
1015 
1016 		for (szc = 0; szc < mmu_page_sizes; szc++)
1017 			alloc_sz += sizeof (page_t *) *
1018 			    page_get_pagecolors(szc);
1019 	}
1020 
1021 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
1022 	if (alloc_base == NULL)
1023 		return (-1);
1024 
1025 	end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
1026 	ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
1027 	    ecache_alignsize));
1028 
1029 	return (0);
1030 }
1031 
1032 /*
1033  * To select our starting bin, we stride through the bins with a stride
1034  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
1035  * in simulation and practice for different workloads on varying cache sizes.
1036  */
1037 uint32_t color_start_current = 0;
1038 uint32_t color_start_stride = 337;
1039 int color_start_random = 0;
1040 
1041 /* ARGSUSED */
1042 uint_t
1043 get_color_start(struct as *as)
1044 {
1045 	uint32_t old, new;
1046 
1047 	if (consistent_coloring == 2 || color_start_random) {
1048 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
1049 		    (hw_page_array[0].hp_colors - 1)));
1050 	}
1051 
1052 	do {
1053 		old = color_start_current;
1054 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
1055 	} while (cas32(&color_start_current, old, new) != old);
1056 
1057 	return ((uint_t)(new));
1058 }
1059 
1060 /*
1061  * Called once at startup from kphysm_init() -- before memialloc()
1062  * is invoked to do the 1st page_free()/page_freelist_add().
1063  *
1064  * initializes page_colors and page_colors_mask based on ecache_setsize.
1065  *
1066  * Also initializes the counter locks.
1067  */
1068 void
1069 page_coloring_init()
1070 {
1071 	int	a, i;
1072 	uint_t colors;
1073 
1074 	if (do_pg_coloring == 0) {
1075 		page_colors = 1;
1076 		for (i = 0; i < mmu_page_sizes; i++)
1077 			hw_page_array[i].hp_colors = 1;
1078 		return;
1079 	}
1080 
1081 	/*
1082 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
1083 	 * the max ecache setsize of all cpus configured in the system or, for
1084 	 * cheetah+ systems, the max possible ecache setsize for all possible
1085 	 * cheetah+ cpus.
1086 	 */
1087 	page_colors = ecache_setsize / MMU_PAGESIZE;
1088 	page_colors_mask = page_colors - 1;
1089 
1090 	vac_colors = vac_size / MMU_PAGESIZE;
1091 	vac_colors_mask = vac_colors -1;
1092 
1093 	page_coloring_shift = 0;
1094 	a = ecache_setsize;
1095 	while (a >>= 1) {
1096 		page_coloring_shift++;
1097 	}
1098 
1099 	/* initialize number of colors per page size */
1100 	for (i = 0; i < mmu_page_sizes; i++) {
1101 		hw_page_array[i].hp_colors = (page_colors_mask >>
1102 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1103 		    + 1;
1104 	}
1105 
1106 	/*
1107 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
1108 	 * cpu_page_colors set to -1 during DR operation or during startup
1109 	 * if setsizes are heterogenous.
1110 	 *
1111 	 * The value of cpu_page_colors determines if additional color bins
1112 	 * need to be checked for a particular color in the page_get routines.
1113 	 */
1114 	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize)) {
1115 
1116 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
1117 		a = lowbit(page_colors) - lowbit(cpu_page_colors);
1118 		ASSERT(a > 0);
1119 		ASSERT(a < 16);
1120 
1121 		for (i = 0; i < mmu_page_sizes; i++) {
1122 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
1123 				colorequivszc[i] = 0;
1124 				continue;
1125 			}
1126 			while ((colors >> a) == 0)
1127 				a--;
1128 			ASSERT(a >= 0);
1129 
1130 			/* higher 4 bits encodes color equiv mask */
1131 			colorequivszc[i] = (a << 4);
1132 		}
1133 	}
1134 
1135 	/* factor in colorequiv to check additional 'equivalent' bins. */
1136 	if (colorequiv > 1 && &page_coloring_init_cpu == NULL) {
1137 
1138 		a = lowbit(colorequiv) - 1;
1139 
1140 		if (a > 15)
1141 			a = 15;
1142 
1143 		for (i = 0; i < mmu_page_sizes; i++) {
1144 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
1145 				continue;
1146 			}
1147 			while ((colors >> a) == 0)
1148 				a--;
1149 			if ((a << 4) > colorequivszc[i]) {
1150 				colorequivszc[i] = (a << 4);
1151 			}
1152 		}
1153 	}
1154 
1155 	/* do cpu specific color initialization */
1156 	if (&page_coloring_init_cpu) {
1157 		page_coloring_init_cpu();
1158 	}
1159 }
1160 
1161 int
1162 bp_color(struct buf *bp)
1163 {
1164 	int color = -1;
1165 
1166 	if (vac) {
1167 		if ((bp->b_flags & B_PAGEIO) != 0) {
1168 			color = sfmmu_get_ppvcolor(bp->b_pages);
1169 		} else if (bp->b_un.b_addr != NULL) {
1170 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
1171 		}
1172 	}
1173 	return (color < 0 ? 0 : ptob(color));
1174 }
1175 
1176 /*
1177  * Create & Initialise pageout scanner thread. The thread has to
1178  * start at procedure with process pp and priority pri.
1179  */
1180 void
1181 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
1182 {
1183 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
1184 }
1185 
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 *
 * Here it simply forwards to sfmmu_cache_flushall().
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}
1196 
/*
 * Return 1 if the ranges [va1, va1 + sz1) and [va2, va2 + sz2) overlap,
 * 0 if one of them lies entirely below the other.  Two ranges with the
 * same start address are always reported as overlapping.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	int first_below = (va1 < va2 && va1 + sz1 <= va2);
	int second_below = (va2 < va1 && va2 + sz2 <= va1);

	return (!(first_below || second_below));
}
1208 
1209 /*
1210  * Return the number of bytes, relative to the beginning of a given range, that
1211  * are non-toxic (can be read from and written to with relative impunity).
1212  */
1213 size_t
1214 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1215 {
1216 	/* OBP reads are harmless, but we don't want people writing there */
1217 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1218 	    OFW_START_ADDR + 1))
1219 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1220 
1221 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1222 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1223 
1224 	return (sz); /* no overlap */
1225 }
1226 
1227 /*
1228  * Minimum physmem required for enabling large pages for kernel heap
1229  * Currently we do not enable lp for kmem on systems with less
1230  * than 1GB of memory. This value can be changed via /etc/system
1231  */
1232 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
1233 
1234 /*
1235  * this function chooses large page size for kernel heap
1236  */
1237 size_t
1238 get_segkmem_lpsize(size_t lpsize)
1239 {
1240 	size_t memtotal = physmem * PAGESIZE;
1241 	size_t mmusz;
1242 	uint_t szc;
1243 	extern int disable_large_pages;
1244 
1245 	if (memtotal < segkmem_lpminphysmem)
1246 		return (PAGESIZE);
1247 
1248 	if (plat_lpkmem_is_supported != NULL &&
1249 	    plat_lpkmem_is_supported() == 0)
1250 		return (PAGESIZE);
1251 
1252 	mmusz = mmu_get_kernel_lpsize(lpsize);
1253 	szc = page_szc(mmusz);
1254 
1255 	while (szc) {
1256 		if (!(disable_large_pages & (1 << szc)))
1257 			return (page_get_pagesize(szc));
1258 		szc--;
1259 	}
1260 	return (PAGESIZE);
1261 }
1262