xref: /titanic_52/usr/src/uts/sun4/vm/vm_dep.c (revision 2ee92411ae65ea7cb80c2a46adfc22b983dcea7f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * UNIX machine dependent virtual memory support.
27  */
28 
29 #include <sys/vm.h>
30 #include <sys/exec.h>
31 
32 #include <sys/exechdr.h>
33 #include <vm/seg_kmem.h>
34 #include <sys/atomic.h>
35 #include <sys/archsystm.h>
36 #include <sys/machsystm.h>
37 #include <sys/kdi.h>
38 #include <sys/cpu_module.h>
39 
40 #include <vm/hat_sfmmu.h>
41 
42 #include <sys/memnode.h>
43 
44 #include <sys/mem_config.h>
45 #include <sys/mem_cage.h>
46 #include <vm/vm_dep.h>
47 #include <vm/page.h>
48 #include <sys/platform_module.h>
49 
/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring.
 *
 * When do_pg_coloring is 0, page_coloring_init() configures exactly one
 * page color for every page size and returns without looking at the
 * cache geometry.
 */
int do_pg_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring from being enabled by
 * module specific config routines.
 */

int use_page_coloring = 1;

/*
 * initialized by page_coloring_init()
 *
 * page_colors, page_colors_mask and page_coloring_shift are derived there
 * from ecache_setsize; vac_colors and vac_colors_mask from vac_size.
 * cpu_page_colors is only assigned there when cpu_setsize is non-zero,
 * smaller than ecache_setsize, and cpu_page_colors is still 0.
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/*
 * cpu specific coloring initialization
 *
 * Declared weak: page_coloring_init() tests the symbol's address and only
 * calls the hook when it is non-NULL.
 */
extern void page_coloring_init_cpu();
#pragma weak page_coloring_init_cpu
77 
/*
 * get the ecache setsize for the current cpu.
 * Expands to a lookup in cpunodes[] indexed by the cpu_id of CPU.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t		plcnt;		/* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 *
 * In this file pagefault() and valid_usr_range() compare user addresses
 * of 64-bit address spaces against it when execute access is involved.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

/* Forward declaration; the definition appears later in this file. */
static void page_flt_init(page_freelist_type_t *);

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
	struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
	caddr_t, size_t, uint_t, struct lgrp *);
105 /*
106  * Convert page frame number to an OBMEM page frame number
107  * (i.e. put in the type bits -- zero for this implementation)
108  */
109 pfn_t
110 impl_obmem_pfnum(pfn_t pf)
111 {
112 	return (pf);
113 }
114 
115 /*
116  * Use physmax to determine the highest physical page of DRAM memory
117  * It is assumed that any physical addresses above physmax is in IO space.
118  * We don't bother checking the low end because we assume that memory space
119  * begins at physical page frame 0.
120  *
121  * Return 1 if the page frame is onboard DRAM memory, else 0.
122  * Returns 0 for nvram so it won't be cached.
123  */
124 int
125 pf_is_memory(pfn_t pf)
126 {
127 	/* We must be IO space */
128 	if (pf > physmax)
129 		return (0);
130 
131 	/* We must be memory space */
132 	return (1);
133 }
134 
135 /*
136  * Handle a pagefault.
137  */
138 faultcode_t
139 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
140 {
141 	struct as *as;
142 	struct proc *p;
143 	faultcode_t res;
144 	caddr_t base;
145 	size_t len;
146 	int err;
147 
148 	if (INVALID_VADDR(addr))
149 		return (FC_NOMAP);
150 
151 	if (iskernel) {
152 		as = &kas;
153 	} else {
154 		p = curproc;
155 		as = p->p_as;
156 #if defined(SF_ERRATA_57)
157 		/*
158 		 * Prevent infinite loops due to a segment driver
159 		 * setting the execute permissions and the sfmmu hat
160 		 * silently ignoring them.
161 		 */
162 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
163 		    addr < errata57_limit) {
164 			res = FC_NOMAP;
165 			goto out;
166 		}
167 #endif
168 	}
169 
170 	/*
171 	 * Dispatch pagefault.
172 	 */
173 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
174 
175 	/*
176 	 * If this isn't a potential unmapped hole in the user's
177 	 * UNIX data or stack segments, just return status info.
178 	 */
179 	if (!(res == FC_NOMAP && iskernel == 0))
180 		goto out;
181 
182 	/*
183 	 * Check to see if we happened to faulted on a currently unmapped
184 	 * part of the UNIX data or stack segments.  If so, create a zfod
185 	 * mapping there and then try calling the fault routine again.
186 	 */
187 	base = p->p_brkbase;
188 	len = p->p_brksize;
189 
190 	if (addr < base || addr >= base + len) {		/* data seg? */
191 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
192 		len = p->p_stksize;
193 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
194 			/* not in either UNIX data or stack segments */
195 			res = FC_NOMAP;
196 			goto out;
197 		}
198 	}
199 
200 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
201 	/* This code is probably not needed anymore */
202 
203 	/* expand the gap to the page boundaries on each side */
204 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
205 	    ((uintptr_t)base & PAGEMASK);
206 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
207 
208 	as_rangelock(as);
209 	as_purge(as);
210 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
211 		err = as_map(as, base, len, segvn_create, zfod_argsp);
212 		as_rangeunlock(as);
213 		if (err) {
214 			res = FC_MAKE_ERR(err);
215 			goto out;
216 		}
217 	} else {
218 		/*
219 		 * This page is already mapped by another thread after we
220 		 * returned from as_fault() above.  We just fallthrough
221 		 * as_fault() below.
222 		 */
223 		as_rangeunlock(as);
224 	}
225 
226 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
227 
228 out:
229 
230 	return (res);
231 }
232 
233 /*
234  * This is the routine which defines the address limit implied
235  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
236  * mappable address in a 32-bit process on this platform (though
237  * perhaps we should make it be UINT32_MAX here?)
238  */
239 void
240 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
241 {
242 	struct proc *p = curproc;
243 	caddr_t userlimit = flags & _MAP_LOW32 ?
244 	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
245 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
246 }
247 
/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 *
 * In this file valid_va_range_aligned() trims candidate ranges so they do
 * not cover [hole_start, hole_end), and valid_usr_range() rejects any
 * range that intersects it.  Neither variable is assigned in this file.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 *
 * These are only defined here; nothing in this file reads or writes them.
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;
259 
260 int valid_va_range_aligned_wraparound;
261 /*
262  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
263  * addresses at least "minlen" long, where the base of the range is at "off"
264  * phase from an "align" boundary and there is space for a "redzone"-sized
265  * redzone on either side of the range.  On success, 1 is returned and *basep
266  * and *lenp are adjusted to describe the acceptable range (including
267  * the redzone).  On failure, 0 is returned.
268  */
269 int
270 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
271     size_t align, size_t redzone, size_t off)
272 {
273 	caddr_t hi, lo;
274 	size_t tot_len;
275 
276 	ASSERT(align == 0 ? off == 0 : off < align);
277 	ASSERT(ISP2(align));
278 	ASSERT(align == 0 || align >= PAGESIZE);
279 
280 	lo = *basep;
281 	hi = lo + *lenp;
282 	tot_len = minlen + 2 * redzone;	/* need at least this much space */
283 
284 	/* If hi rolled over the top try cutting back. */
285 	if (hi < lo) {
286 		*lenp = 0UL - (uintptr_t)lo - 1UL;
287 		/* Trying to see if this really happens, and then if so, why */
288 		valid_va_range_aligned_wraparound++;
289 		hi = lo + *lenp;
290 	}
291 	if (*lenp < tot_len) {
292 		return (0);
293 	}
294 
295 	/*
296 	 * Deal with a possible hole in the address range between
297 	 * hole_start and hole_end that should never be mapped by the MMU.
298 	 */
299 
300 	if (lo < hole_start) {
301 		if (hi > hole_start)
302 			if (hi < hole_end)
303 				hi = hole_start;
304 			else
305 				/* lo < hole_start && hi >= hole_end */
306 				if (dir == AH_LO) {
307 					/*
308 					 * prefer lowest range
309 					 */
310 					if (hole_start - lo >= tot_len)
311 						hi = hole_start;
312 					else if (hi - hole_end >= tot_len)
313 						lo = hole_end;
314 					else
315 						return (0);
316 				} else {
317 					/*
318 					 * prefer highest range
319 					 */
320 					if (hi - hole_end >= tot_len)
321 						lo = hole_end;
322 					else if (hole_start - lo >= tot_len)
323 						hi = hole_start;
324 					else
325 						return (0);
326 				}
327 	} else {
328 		/* lo >= hole_start */
329 		if (hi < hole_end)
330 			return (0);
331 		if (lo < hole_end)
332 			lo = hole_end;
333 	}
334 
335 	/* Check if remaining length is too small */
336 	if (hi - lo < tot_len) {
337 		return (0);
338 	}
339 	if (align > 1) {
340 		caddr_t tlo = lo + redzone;
341 		caddr_t thi = hi - redzone;
342 		tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
343 		if (tlo < lo + redzone) {
344 			return (0);
345 		}
346 		if (thi < tlo || thi - tlo < minlen) {
347 			return (0);
348 		}
349 	}
350 	*basep = lo;
351 	*lenp = hi - lo;
352 	return (1);
353 }
354 
355 /*
356  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
357  * addresses at least "minlen" long.  On success, 1 is returned and *basep
358  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
359  * is returned.
360  */
361 int
362 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
363 {
364 	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
365 }
366 
367 /*
368  * Determine whether [addr, addr+len] with protections `prot' are valid
369  * for a user address space.
370  */
371 /*ARGSUSED*/
372 int
373 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
374     caddr_t userlimit)
375 {
376 	caddr_t eaddr = addr + len;
377 
378 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
379 		return (RANGE_BADADDR);
380 
381 	/*
382 	 * Determine if the address range falls within an illegal
383 	 * range of the MMU.
384 	 */
385 	if (eaddr > hole_start && addr < hole_end)
386 		return (RANGE_BADADDR);
387 
388 #if defined(SF_ERRATA_57)
389 	/*
390 	 * Make sure USERLIMIT isn't raised too high
391 	 */
392 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
393 	    errata57_limit == 0);
394 
395 	if (AS_TYPE_64BIT(as) &&
396 	    (addr < errata57_limit) &&
397 	    (prot & PROT_EXEC))
398 		return (RANGE_BADPROT);
399 #endif /* SF_ERRATA57 */
400 	return (RANGE_OKAY);
401 }
402 
403 /*
404  * Routine used to check to see if an a.out can be executed
405  * by the current machine/architecture.
406  */
407 int
408 chkaout(struct exdata *exp)
409 {
410 	if (exp->ux_mach == M_SPARC)
411 		return (0);
412 	else
413 		return (ENOEXEC);
414 }
415 
416 /*
417  * The following functions return information about an a.out
418  * which is used when a program is executed.
419  */
420 
421 /*
422  * Return the load memory address for the data segment.
423  */
424 caddr_t
425 getdmem(struct exec *exp)
426 {
427 	/*
428 	 * XXX - Sparc Reference Hack approaching
429 	 * Remember that we are loading
430 	 * 8k executables into a 4k machine
431 	 * DATA_ALIGN == 2 * PAGESIZE
432 	 */
433 	if (exp->a_text)
434 		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
435 	else
436 		return ((caddr_t)USRTEXT);
437 }
438 
439 /*
440  * Return the starting disk address for the data segment.
441  */
442 ulong_t
443 getdfile(struct exec *exp)
444 {
445 	if (exp->a_magic == ZMAGIC)
446 		return (exp->a_text);
447 	else
448 		return (sizeof (struct exec) + exp->a_text);
449 }
450 
451 /*
452  * Return the load memory address for the text segment.
453  */
454 
455 /*ARGSUSED*/
456 caddr_t
457 gettmem(struct exec *exp)
458 {
459 	return ((caddr_t)USRTEXT);
460 }
461 
462 /*
463  * Return the file byte offset for the text segment.
464  */
465 uint_t
466 gettfile(struct exec *exp)
467 {
468 	if (exp->a_magic == ZMAGIC)
469 		return (0);
470 	else
471 		return (sizeof (struct exec));
472 }
473 
474 void
475 getexinfo(
476 	struct exdata *edp_in,
477 	struct exdata *edp_out,
478 	int *pagetext,
479 	int *pagedata)
480 {
481 	*edp_out = *edp_in;	/* structure copy */
482 
483 	if ((edp_in->ux_mag == ZMAGIC) &&
484 	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
485 		*pagetext = 1;
486 		*pagedata = 1;
487 	} else {
488 		*pagetext = 0;
489 		*pagedata = 0;
490 	}
491 }
492 
493 /*
494  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
495  * KPM selects an address such that it's equal offset modulo shm_alignment and
496  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
497  */
498 int
499 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
500 {
501 	if (vac) {
502 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
503 	} else {
504 		return (0);
505 	}
506 }
507 
/*
 * Sanity control.  Large pages are not used, regardless of user settings,
 * when less physical memory is installed than the applicable threshold
 * below.  Both thresholds are expressed in units of 8K pages.
 *
 * In this file map_pgsz() compares physmem against privm_lpg_min_physmem
 * for non-ISM requests, and map_pgszcvec() passes one threshold or the
 * other to map_szcvec() depending on the kind of mapping.
 */
pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
pgcnt_t privm_lpg_min_physmem = 131072;			/* 1GB */
515 
516 static size_t
517 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
518 {
519 	size_t		pgsz = MMU_PAGESIZE;
520 	int		szc;
521 
522 	/*
523 	 * If len is zero, retrieve from proc and don't demote the page size.
524 	 * Use atleast the default pagesize.
525 	 */
526 	if (len == 0) {
527 		len = p->p_brkbase + p->p_brksize - p->p_bssbase;
528 	}
529 	len = MAX(len, default_uheap_lpsize);
530 
531 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
532 		pgsz = hw_page_array[szc].hp_size;
533 		if ((disable_auto_data_large_pages & (1 << szc)) ||
534 		    pgsz > max_uheap_lpsize)
535 			continue;
536 		if (len >= pgsz) {
537 			break;
538 		}
539 	}
540 
541 	/*
542 	 * If addr == 0 we were called by memcntl() when the
543 	 * size code is 0.  Don't set pgsz less than current size.
544 	 */
545 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
546 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
547 	}
548 
549 	return (pgsz);
550 }
551 
552 static size_t
553 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
554 {
555 	size_t		pgsz = MMU_PAGESIZE;
556 	int		szc;
557 
558 	/*
559 	 * If len is zero, retrieve from proc and don't demote the page size.
560 	 * Use atleast the default pagesize.
561 	 */
562 	if (len == 0) {
563 		len = p->p_stksize;
564 	}
565 	len = MAX(len, default_ustack_lpsize);
566 
567 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
568 		pgsz = hw_page_array[szc].hp_size;
569 		if ((disable_auto_data_large_pages & (1 << szc)) ||
570 		    pgsz > max_ustack_lpsize)
571 			continue;
572 		if (len >= pgsz) {
573 			break;
574 		}
575 	}
576 
577 	/*
578 	 * If addr == 0 we were called by memcntl() or exec_args() when the
579 	 * size code is 0.  Don't set pgsz less than current size.
580 	 */
581 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
582 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
583 	}
584 
585 	return (pgsz);
586 }
587 
588 static size_t
589 map_pgszism(caddr_t addr, size_t len)
590 {
591 	uint_t szc;
592 	size_t pgsz;
593 
594 	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
595 		if (disable_ism_large_pages & (1 << szc))
596 			continue;
597 
598 		pgsz = hw_page_array[szc].hp_size;
599 		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
600 			return (pgsz);
601 	}
602 
603 	return (DEFAULT_ISM_PAGESIZE);
604 }
605 
606 /*
607  * Suggest a page size to be used to map a segment of type maptype and length
608  * len.  Returns a page size (not a size code).
609  */
610 /* ARGSUSED */
611 size_t
612 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
613 {
614 	size_t	pgsz = MMU_PAGESIZE;
615 
616 	ASSERT(maptype != MAPPGSZ_VA);
617 
618 	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
619 		return (MMU_PAGESIZE);
620 	}
621 
622 	switch (maptype) {
623 	case MAPPGSZ_ISM:
624 		pgsz = map_pgszism(addr, len);
625 		break;
626 
627 	case MAPPGSZ_STK:
628 		if (max_ustack_lpsize > MMU_PAGESIZE) {
629 			pgsz = map_pgszstk(p, addr, len);
630 		}
631 		break;
632 
633 	case MAPPGSZ_HEAP:
634 		if (max_uheap_lpsize > MMU_PAGESIZE) {
635 			pgsz = map_pgszheap(p, addr, len);
636 		}
637 		break;
638 	}
639 	return (pgsz);
640 }
641 
642 
643 /* assumes TTE8K...TTE4M == szc */
644 
645 static uint_t
646 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
647     size_t max_lpsize, size_t min_physmem)
648 {
649 	caddr_t eaddr = addr + size;
650 	uint_t szcvec = 0;
651 	caddr_t raddr;
652 	caddr_t readdr;
653 	size_t pgsz;
654 	int i;
655 
656 	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
657 		return (0);
658 	}
659 	for (i = mmu_page_sizes - 1; i > 0; i--) {
660 		if (disable_lpgs & (1 << i)) {
661 			continue;
662 		}
663 		pgsz = page_get_pagesize(i);
664 		if (pgsz > max_lpsize) {
665 			continue;
666 		}
667 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
668 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
669 		if (raddr < addr || raddr >= readdr) {
670 			continue;
671 		}
672 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
673 			continue;
674 		}
675 		szcvec |= (1 << i);
676 		/*
677 		 * And or in the remaining enabled page sizes.
678 		 */
679 		szcvec |= P2PHASE(~disable_lpgs, (1 << i));
680 		szcvec &= ~1; /* no need to return 8K pagesize */
681 		break;
682 	}
683 	return (szcvec);
684 }
685 
686 /*
687  * Return a bit vector of large page size codes that
688  * can be used to map [addr, addr + len) region.
689  */
690 /* ARGSUSED */
691 uint_t
692 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
693     int memcntl)
694 {
695 	if (flags & MAP_TEXT) {
696 		return (map_szcvec(addr, size, off,
697 		    disable_auto_text_large_pages,
698 		    max_utext_lpsize, shm_lpg_min_physmem));
699 
700 	} else if (flags & MAP_INITDATA) {
701 		return (map_szcvec(addr, size, off,
702 		    disable_auto_data_large_pages,
703 		    max_uidata_lpsize, privm_lpg_min_physmem));
704 
705 	} else if (type == MAPPGSZC_SHM) {
706 		return (map_szcvec(addr, size, off,
707 		    disable_auto_data_large_pages,
708 		    max_shm_lpsize, shm_lpg_min_physmem));
709 
710 	} else if (type == MAPPGSZC_HEAP) {
711 		return (map_szcvec(addr, size, off,
712 		    disable_auto_data_large_pages,
713 		    max_uheap_lpsize, privm_lpg_min_physmem));
714 
715 	} else if (type == MAPPGSZC_STACK) {
716 		return (map_szcvec(addr, size, off,
717 		    disable_auto_data_large_pages,
718 		    max_ustack_lpsize, privm_lpg_min_physmem));
719 
720 	} else {
721 		return (map_szcvec(addr, size, off,
722 		    disable_auto_data_large_pages,
723 		    max_privmap_lpsize, privm_lpg_min_physmem));
724 	}
725 }
726 
/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 *
 * alloc_page_freelists() points page_cachelists[mtype] at an array of
 * max_mem_nodes entries, each of which points at an array with one
 * page_t pointer per color of the smallest page size.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

/*
 * Freelist and cachelist mutex arrays.  ndata_alloc_page_mutexs() points
 * each of the NPC_MUTEX entries at storage for max_mem_nodes mutexes.
 */
kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];
735 
736 /*
737  * Calculate space needed for page freelists and counters
738  */
739 size_t
740 calc_free_pagelist_sz(void)
741 {
742 	int szc;
743 	size_t alloc_sz, cache_sz, free_sz;
744 
745 	/*
746 	 * one cachelist per color, node, and type
747 	 */
748 	cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
749 	    sizeof (page_t **);
750 	cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
751 
752 	/*
753 	 * one freelist per size, color, node, and type
754 	 */
755 	free_sz = sizeof (page_t **);
756 	for (szc = 0; szc < mmu_page_sizes; szc++)
757 		free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
758 	free_sz *= max_mem_nodes * MAX_MEM_TYPES;
759 
760 	alloc_sz = cache_sz + free_sz + page_ctrs_sz();
761 	return (alloc_sz);
762 }
763 
764 caddr_t
765 alloc_page_freelists(caddr_t alloc_base)
766 {
767 	int	mnode, mtype;
768 	int	szc, clrs;
769 
770 	/*
771 	 * We only support small pages in the cachelist.
772 	 */
773 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
774 		page_cachelists[mtype] = (page_t ***)alloc_base;
775 		alloc_base += (max_mem_nodes * sizeof (page_t **));
776 		for (mnode = 0; mnode < max_mem_nodes; mnode++) {
777 			page_cachelists[mtype][mnode] = (page_t **)alloc_base;
778 			alloc_base +=
779 			    (page_get_pagecolors(0) * sizeof (page_t *));
780 		}
781 	}
782 
783 	/*
784 	 * Allocate freelists bins for all
785 	 * supported page sizes.
786 	 */
787 	for (szc = 0; szc < mmu_page_sizes; szc++) {
788 		clrs = page_get_pagecolors(szc);
789 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
790 			ufltp->pflt_freelists[szc][mtype] =
791 			    (page_t ***)alloc_base;
792 			alloc_base += (max_mem_nodes * sizeof (page_t **));
793 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
794 				ufltp->pflt_freelists[szc][mtype][mnode] =
795 				    (page_t **)alloc_base;
796 				alloc_base += (clrs * (sizeof (page_t *)));
797 			}
798 		}
799 	}
800 
801 	alloc_base = page_ctrs_alloc(alloc_base);
802 
803 	page_flt_init(ufltp);
804 	return (alloc_base);
805 }
806 
807 /*
808  * Allocate page_freelists locks for a memnode from the nucleus data
809  * area. This is the first time that mmu_page_sizes is used during
810  * bootup, so check mmu_page_sizes initialization.
811  */
812 int
813 ndata_alloc_page_mutexs(struct memlist *ndata)
814 {
815 	size_t alloc_sz;
816 	caddr_t alloc_base;
817 	int	i;
818 	void	page_coloring_init();
819 
820 	page_coloring_init();
821 	if (&mmu_init_mmu_page_sizes) {
822 		if (!mmu_init_mmu_page_sizes(0)) {
823 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
824 			    mmu_page_sizes);
825 		}
826 	}
827 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
828 
829 	/* fpc_mutex and cpc_mutex */
830 	alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
831 
832 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
833 	if (alloc_base == NULL)
834 		return (-1);
835 
836 	ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
837 
838 	for (i = 0; i < NPC_MUTEX; i++) {
839 		fpc_mutex[i] = (kmutex_t *)alloc_base;
840 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
841 		cpc_mutex[i] = (kmutex_t *)alloc_base;
842 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
843 	}
844 	return (0);
845 }
846 
847 /*
848  * To select our starting bin, we stride through the bins with a stride
849  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
850  * in simulation and practice for different workloads on varying cache sizes.
851  */
852 uint32_t color_start_current = 0;
853 uint32_t color_start_stride = 337;
854 int color_start_random = 0;
855 
856 /* ARGSUSED */
857 uint_t
858 get_color_start(struct as *as)
859 {
860 	uint32_t old, new;
861 
862 	if (consistent_coloring == 2 || color_start_random) {
863 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
864 		    (hw_page_array[0].hp_colors - 1)));
865 	}
866 
867 	do {
868 		old = color_start_current;
869 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
870 	} while (cas32(&color_start_current, old, new) != old);
871 
872 	return ((uint_t)(new));
873 }
874 
875 /*
876  * Called once at startup from kphysm_init() -- before memialloc()
877  * is invoked to do the 1st page_free()/page_freelist_add().
878  *
879  * initializes page_colors and page_colors_mask based on ecache_setsize.
880  *
881  * Also initializes the counter locks.
882  */
883 void
884 page_coloring_init()
885 {
886 	int	a, i;
887 	uint_t colors;
888 
889 	if (do_pg_coloring == 0) {
890 		page_colors = 1;
891 		for (i = 0; i < mmu_page_sizes; i++) {
892 			colorequivszc[i] = 0;
893 			hw_page_array[i].hp_colors = 1;
894 		}
895 		return;
896 	}
897 
898 	/*
899 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
900 	 * the max ecache setsize of all cpus configured in the system or, for
901 	 * cheetah+ systems, the max possible ecache setsize for all possible
902 	 * cheetah+ cpus.
903 	 */
904 	page_colors = ecache_setsize / MMU_PAGESIZE;
905 	page_colors_mask = page_colors - 1;
906 
907 	vac_colors = vac_size / MMU_PAGESIZE;
908 	vac_colors_mask = vac_colors -1;
909 
910 	page_coloring_shift = 0;
911 	a = ecache_setsize;
912 	while (a >>= 1) {
913 		page_coloring_shift++;
914 	}
915 
916 	/* initialize number of colors per page size */
917 	for (i = 0; i < mmu_page_sizes; i++) {
918 		hw_page_array[i].hp_colors = (page_colors_mask >>
919 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
920 		    + 1;
921 		colorequivszc[i] = 0;
922 	}
923 
924 	/*
925 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
926 	 * cpu_page_colors set to -1 during DR operation or during startup
927 	 * if setsizes are heterogenous.
928 	 *
929 	 * The value of cpu_page_colors determines if additional color bins
930 	 * need to be checked for a particular color in the page_get routines.
931 	 */
932 	if (cpu_setsize > 0 && cpu_page_colors == 0 &&
933 	    cpu_setsize < ecache_setsize) {
934 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
935 		a = lowbit(page_colors) - lowbit(cpu_page_colors);
936 		ASSERT(a > 0);
937 		ASSERT(a < 16);
938 
939 		for (i = 0; i < mmu_page_sizes; i++) {
940 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
941 				continue;
942 			}
943 			while ((colors >> a) == 0)
944 				a--;
945 			ASSERT(a >= 0);
946 
947 			/* higher 4 bits encodes color equiv mask */
948 			colorequivszc[i] = (a << 4);
949 		}
950 	}
951 
952 	/* do cpu specific color initialization */
953 	if (&page_coloring_init_cpu) {
954 		page_coloring_init_cpu();
955 	}
956 }
957 
958 int
959 bp_color(struct buf *bp)
960 {
961 	int color = -1;
962 
963 	if (vac) {
964 		if ((bp->b_flags & B_PAGEIO) != 0) {
965 			color = sfmmu_get_ppvcolor(bp->b_pages);
966 		} else if (bp->b_un.b_addr != NULL) {
967 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
968 		}
969 	}
970 	return (color < 0 ? 0 : ptob(color));
971 }
972 
/*
 * Flush the D-cache; used when module relocations are performed through
 * an alternate mapping.  For now this is a stub everywhere except sun4u.
 * The work itself is done by sfmmu_cache_flushall().
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}
983 
/*
 * Report whether the ranges [va1, va1 + sz1) and [va2, va2 + sz2)
 * overlap.  Returns 0 when they are disjoint (adjacent ranges count as
 * disjoint) and 1 otherwise.  Ranges with the same start address are
 * always reported as overlapping.
 *
 * The distance between the two start addresses is compared with the size
 * of the lower range.  Adding a size to a start address could wrap past
 * the top of the address space and make a range that runs off the top
 * look disjoint; the subtraction cannot wrap because the operands are
 * ordered first.  A range that runs off the top is treated as
 * overlapping everything above its start.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	/* Range 1 lies entirely below the start of range 2. */
	if (va1 < va2 && sz1 <= va2 - va1)
		return (0);

	/* Range 2 lies entirely below the start of range 1. */
	if (va2 < va1 && sz2 <= va1 - va2)
		return (0);

	return (1);
}
995 
996 /*
997  * Return the number of bytes, relative to the beginning of a given range, that
998  * are non-toxic (can be read from and written to with relative impunity).
999  */
1000 size_t
1001 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1002 {
1003 	/* OBP reads are harmless, but we don't want people writing there */
1004 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1005 	    OFW_START_ADDR + 1))
1006 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1007 
1008 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1009 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1010 
1011 	return (sz); /* no overlap */
1012 }
1013 
1014 /*
1015  * Minimum physmem required for enabling large pages for kernel heap
1016  * Currently we do not enable lp for kmem on systems with less
1017  * than 1GB of memory. This value can be changed via /etc/system
1018  */
1019 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
1020 
1021 /*
1022  * this function chooses large page size for kernel heap
1023  */
1024 size_t
1025 get_segkmem_lpsize(size_t lpsize)
1026 {
1027 	size_t memtotal = physmem * PAGESIZE;
1028 	size_t mmusz;
1029 	uint_t szc;
1030 
1031 	if (memtotal < segkmem_lpminphysmem)
1032 		return (PAGESIZE);
1033 
1034 	if (plat_lpkmem_is_supported != NULL &&
1035 	    plat_lpkmem_is_supported() == 0)
1036 		return (PAGESIZE);
1037 
1038 	mmusz = mmu_get_kernel_lpsize(lpsize);
1039 	szc = page_szc(mmusz);
1040 
1041 	while (szc) {
1042 		if (!(disable_large_pages & (1 << szc)))
1043 			return (page_get_pagesize(szc));
1044 		szc--;
1045 	}
1046 	return (PAGESIZE);
1047 }
1048 /*
1049  * Initializes the user page freelist type structures.
1050  */
1051 static void
1052 page_flt_init(page_freelist_type_t *ufp)
1053 {
1054 	ufp->pflt_type = PFLT_USER;
1055 	ufp->pflt_get_free = &page_get_uflt;
1056 	ufp->pflt_walk_init = page_list_walk_init;
1057 	ufp->pflt_walk_next = page_list_walk_next_bin;
1058 	ufp->pflt_num_policies = 2;
1059 	ufp->pflt_policy[0] = page_get_mnode_freelist;
1060 	ufp->pflt_policy[1] = page_get_contig_pages;
1061 }
1062