xref: /illumos-gate/usr/src/uts/sun4/vm/vm_dep.c (revision ff31d5bfa079d4db9f78f481637d7ed9f9fa4a49)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2016 Joyent, Inc.
25  */
26 
27 /*
28  * UNIX machine dependent virtual memory support.
29  */
30 
31 #include <sys/vm.h>
32 #include <sys/exec.h>
33 
34 #include <sys/exechdr.h>
35 #include <vm/seg_kmem.h>
36 #include <sys/atomic.h>
37 #include <sys/archsystm.h>
38 #include <sys/machsystm.h>
39 #include <sys/kdi.h>
40 #include <sys/cpu_module.h>
41 #include <sys/secflags.h>
42 
43 #include <vm/hat_sfmmu.h>
44 
45 #include <sys/memnode.h>
46 
47 #include <sys/mem_config.h>
48 #include <sys/mem_cage.h>
49 #include <vm/vm_dep.h>
50 #include <vm/page.h>
51 #include <sys/platform_module.h>
52 
53 /*
54  * These variables are set by module specific config routines.
55  * They are only set by modules which will use physical cache page coloring.
56  */
57 int do_pg_coloring = 0;
58 
59 /*
60  * These variables can be conveniently patched at kernel load time to
61  * prevent do_pg_coloring from being enabled by
62  * module specific config routines.
63  */
64 
65 int use_page_coloring = 1;
66 
67 /*
68  * initialized by page_coloring_init()
69  */
70 extern uint_t page_colors;
71 extern uint_t page_colors_mask;
72 extern uint_t page_coloring_shift;
73 int cpu_page_colors;
74 uint_t vac_colors = 0;
75 uint_t vac_colors_mask = 0;
76 
77 /* cpu specific coloring initialization */
78 extern void page_coloring_init_cpu();
79 #pragma weak page_coloring_init_cpu
80 
81 /*
82  * get the ecache setsize for the current cpu.
83  */
84 #define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
85 
86 plcnt_t		plcnt;		/* page list count */
87 
88 /*
89  * This variable is set by the cpu module to contain the lowest
90  * address not affected by the SF_ERRATA_57 workaround.  It should
91  * remain 0 if the workaround is not needed.
92  */
93 #if defined(SF_ERRATA_57)
94 caddr_t errata57_limit;
95 #endif
96 
97 extern void page_relocate_hash(page_t *, page_t *);
98 
99 /*
100  * these must be defined in platform specific areas
101  */
102 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
103 	struct proc *, uint_t);
104 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
105 	caddr_t, size_t, uint_t, struct lgrp *);
106 /*
107  * Convert page frame number to an OBMEM page frame number
108  * (i.e. put in the type bits -- zero for this implementation)
109  */
110 pfn_t
111 impl_obmem_pfnum(pfn_t pf)
112 {
113 	return (pf);
114 }
115 
116 /*
117  * Use physmax to determine the highest physical page of DRAM memory
118  * It is assumed that any physical addresses above physmax is in IO space.
119  * We don't bother checking the low end because we assume that memory space
120  * begins at physical page frame 0.
121  *
122  * Return 1 if the page frame is onboard DRAM memory, else 0.
123  * Returns 0 for nvram so it won't be cached.
124  */
125 int
126 pf_is_memory(pfn_t pf)
127 {
128 	/* We must be IO space */
129 	if (pf > physmax)
130 		return (0);
131 
132 	/* We must be memory space */
133 	return (1);
134 }
135 
136 /*
137  * Handle a pagefault.
138  */
139 faultcode_t
140 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
141 {
142 	struct as *as;
143 	struct proc *p;
144 	faultcode_t res;
145 	caddr_t base;
146 	size_t len;
147 	int err;
148 
149 	if (INVALID_VADDR(addr))
150 		return (FC_NOMAP);
151 
152 	if (iskernel) {
153 		as = &kas;
154 	} else {
155 		p = curproc;
156 		as = p->p_as;
157 #if defined(SF_ERRATA_57)
158 		/*
159 		 * Prevent infinite loops due to a segment driver
160 		 * setting the execute permissions and the sfmmu hat
161 		 * silently ignoring them.
162 		 */
163 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
164 		    addr < errata57_limit) {
165 			res = FC_NOMAP;
166 			goto out;
167 		}
168 #endif
169 	}
170 
171 	/*
172 	 * Dispatch pagefault.
173 	 */
174 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
175 
176 	/*
177 	 * If this isn't a potential unmapped hole in the user's
178 	 * UNIX data or stack segments, just return status info.
179 	 */
180 	if (!(res == FC_NOMAP && iskernel == 0))
181 		goto out;
182 
183 	/*
184 	 * Check to see if we happened to faulted on a currently unmapped
185 	 * part of the UNIX data or stack segments.  If so, create a zfod
186 	 * mapping there and then try calling the fault routine again.
187 	 */
188 	base = p->p_brkbase;
189 	len = p->p_brksize;
190 
191 	if (addr < base || addr >= base + len) {		/* data seg? */
192 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
193 		len = p->p_stksize;
194 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
195 			/* not in either UNIX data or stack segments */
196 			res = FC_NOMAP;
197 			goto out;
198 		}
199 	}
200 
201 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
202 	/* This code is probably not needed anymore */
203 
204 	/* expand the gap to the page boundaries on each side */
205 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
206 	    ((uintptr_t)base & PAGEMASK);
207 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
208 
209 	as_rangelock(as);
210 	as_purge(as);
211 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
212 		err = as_map(as, base, len, segvn_create, zfod_argsp);
213 		as_rangeunlock(as);
214 		if (err) {
215 			res = FC_MAKE_ERR(err);
216 			goto out;
217 		}
218 	} else {
219 		/*
220 		 * This page is already mapped by another thread after we
221 		 * returned from as_fault() above.  We just fallthrough
222 		 * as_fault() below.
223 		 */
224 		as_rangeunlock(as);
225 	}
226 
227 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
228 
229 out:
230 
231 	return (res);
232 }
233 
234 /*
235  * This is the routine which defines the address limit implied
236  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
237  * mappable address in a 32-bit process on this platform (though
238  * perhaps we should make it be UINT32_MAX here?)
239  */
240 void
241 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
242 {
243 	struct proc *p = curproc;
244 	caddr_t userlimit = flags & _MAP_LOW32 ?
245 	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
246 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
247 }
248 
249 /*
250  * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
251  */
252 caddr_t	hole_start, hole_end;
253 
254 /*
255  * kpm mapping window
256  */
257 caddr_t kpm_vbase;
258 size_t  kpm_size;
259 uchar_t kpm_size_shift;
260 
261 int valid_va_range_aligned_wraparound;
262 /*
263  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
264  * addresses at least "minlen" long, where the base of the range is at "off"
265  * phase from an "align" boundary and there is space for a "redzone"-sized
266  * redzone on either side of the range.  On success, 1 is returned and *basep
267  * and *lenp are adjusted to describe the acceptable range (including
268  * the redzone).  On failure, 0 is returned.
269  */
270 int
271 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
272     size_t align, size_t redzone, size_t off)
273 {
274 	caddr_t hi, lo;
275 	size_t tot_len;
276 
277 	ASSERT(align == 0 ? off == 0 : off < align);
278 	ASSERT(ISP2(align));
279 	ASSERT(align == 0 || align >= PAGESIZE);
280 
281 	lo = *basep;
282 	hi = lo + *lenp;
283 	tot_len = minlen + 2 * redzone;	/* need at least this much space */
284 
285 	/* If hi rolled over the top try cutting back. */
286 	if (hi < lo) {
287 		*lenp = 0UL - (uintptr_t)lo - 1UL;
288 		/* Trying to see if this really happens, and then if so, why */
289 		valid_va_range_aligned_wraparound++;
290 		hi = lo + *lenp;
291 	}
292 	if (*lenp < tot_len) {
293 		return (0);
294 	}
295 
296 	/*
297 	 * Deal with a possible hole in the address range between
298 	 * hole_start and hole_end that should never be mapped by the MMU.
299 	 */
300 
301 	if (lo < hole_start) {
302 		if (hi > hole_start)
303 			if (hi < hole_end)
304 				hi = hole_start;
305 			else
306 				/* lo < hole_start && hi >= hole_end */
307 				if (dir == AH_LO) {
308 					/*
309 					 * prefer lowest range
310 					 */
311 					if (hole_start - lo >= tot_len)
312 						hi = hole_start;
313 					else if (hi - hole_end >= tot_len)
314 						lo = hole_end;
315 					else
316 						return (0);
317 				} else {
318 					/*
319 					 * prefer highest range
320 					 */
321 					if (hi - hole_end >= tot_len)
322 						lo = hole_end;
323 					else if (hole_start - lo >= tot_len)
324 						hi = hole_start;
325 					else
326 						return (0);
327 				}
328 	} else {
329 		/* lo >= hole_start */
330 		if (hi < hole_end)
331 			return (0);
332 		if (lo < hole_end)
333 			lo = hole_end;
334 	}
335 
336 	/* Check if remaining length is too small */
337 	if (hi - lo < tot_len) {
338 		return (0);
339 	}
340 	if (align > 1) {
341 		caddr_t tlo = lo + redzone;
342 		caddr_t thi = hi - redzone;
343 		tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
344 		if (tlo < lo + redzone) {
345 			return (0);
346 		}
347 		if (thi < tlo || thi - tlo < minlen) {
348 			return (0);
349 		}
350 	}
351 	*basep = lo;
352 	*lenp = hi - lo;
353 	return (1);
354 }
355 
356 /*
357  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
358  * addresses at least "minlen" long.  On success, 1 is returned and *basep
359  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
360  * is returned.
361  */
362 int
363 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
364 {
365 	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
366 }
367 
368 /*
369  * Default to forbidding the first 64k of address space.  This protects most
370  * reasonably sized structures from dereferences through NULL:
371  *     ((foo_t *)0)->bar
372  */
373 uintptr_t forbidden_null_mapping_sz = 0x10000;
374 
375 /*
376  * Determine whether [addr, addr+len] with protections `prot' are valid
377  * for a user address space.
378  */
379 /*ARGSUSED*/
380 int
381 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
382     caddr_t userlimit)
383 {
384 	caddr_t eaddr = addr + len;
385 
386 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
387 		return (RANGE_BADADDR);
388 
389 	if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
390 	    as->a_proc != NULL &&
391 	    secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
392 		return (RANGE_BADADDR);
393 
394 	/*
395 	 * Determine if the address range falls within an illegal
396 	 * range of the MMU.
397 	 */
398 	if (eaddr > hole_start && addr < hole_end)
399 		return (RANGE_BADADDR);
400 
401 #if defined(SF_ERRATA_57)
402 	/*
403 	 * Make sure USERLIMIT isn't raised too high
404 	 */
405 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
406 	    errata57_limit == 0);
407 
408 	if (AS_TYPE_64BIT(as) &&
409 	    (addr < errata57_limit) &&
410 	    (prot & PROT_EXEC))
411 		return (RANGE_BADPROT);
412 #endif /* SF_ERRATA57 */
413 	return (RANGE_OKAY);
414 }
415 
416 /*
417  * Routine used to check to see if an a.out can be executed
418  * by the current machine/architecture.
419  */
420 int
421 chkaout(struct exdata *exp)
422 {
423 	if (exp->ux_mach == M_SPARC)
424 		return (0);
425 	else
426 		return (ENOEXEC);
427 }
428 
429 /*
430  * The following functions return information about an a.out
431  * which is used when a program is executed.
432  */
433 
434 /*
435  * Return the load memory address for the data segment.
436  */
437 caddr_t
438 getdmem(struct exec *exp)
439 {
440 	/*
441 	 * XXX - Sparc Reference Hack approaching
442 	 * Remember that we are loading
443 	 * 8k executables into a 4k machine
444 	 * DATA_ALIGN == 2 * PAGESIZE
445 	 */
446 	if (exp->a_text)
447 		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
448 	else
449 		return ((caddr_t)USRTEXT);
450 }
451 
452 /*
453  * Return the starting disk address for the data segment.
454  */
455 ulong_t
456 getdfile(struct exec *exp)
457 {
458 	if (exp->a_magic == ZMAGIC)
459 		return (exp->a_text);
460 	else
461 		return (sizeof (struct exec) + exp->a_text);
462 }
463 
464 /*
465  * Return the load memory address for the text segment.
466  */
467 
468 /*ARGSUSED*/
469 caddr_t
470 gettmem(struct exec *exp)
471 {
472 	return ((caddr_t)USRTEXT);
473 }
474 
475 /*
476  * Return the file byte offset for the text segment.
477  */
478 uint_t
479 gettfile(struct exec *exp)
480 {
481 	if (exp->a_magic == ZMAGIC)
482 		return (0);
483 	else
484 		return (sizeof (struct exec));
485 }
486 
487 void
488 getexinfo(
489 	struct exdata *edp_in,
490 	struct exdata *edp_out,
491 	int *pagetext,
492 	int *pagedata)
493 {
494 	*edp_out = *edp_in;	/* structure copy */
495 
496 	if ((edp_in->ux_mag == ZMAGIC) &&
497 	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
498 		*pagetext = 1;
499 		*pagedata = 1;
500 	} else {
501 		*pagetext = 0;
502 		*pagedata = 0;
503 	}
504 }
505 
506 /*
507  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
508  * KPM selects an address such that it's equal offset modulo shm_alignment and
509  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
510  */
511 int
512 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
513 {
514 	if (vac) {
515 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
516 	} else {
517 		return (0);
518 	}
519 }
520 
521 /*
522  * Sanity control. Don't use large pages regardless of user
523  * settings if there's less than priv or shm_lpg_min_physmem memory installed.
524  * The units for this variable is 8K pages.
525  */
526 pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
527 pgcnt_t privm_lpg_min_physmem = 131072;			/* 1GB */
528 
529 static size_t
530 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
531 {
532 	size_t		pgsz = MMU_PAGESIZE;
533 	int		szc;
534 
535 	/*
536 	 * If len is zero, retrieve from proc and don't demote the page size.
537 	 * Use atleast the default pagesize.
538 	 */
539 	if (len == 0) {
540 		len = p->p_brkbase + p->p_brksize - p->p_bssbase;
541 	}
542 	len = MAX(len, default_uheap_lpsize);
543 
544 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
545 		pgsz = hw_page_array[szc].hp_size;
546 		if ((disable_auto_data_large_pages & (1 << szc)) ||
547 		    pgsz > max_uheap_lpsize)
548 			continue;
549 		if (len >= pgsz) {
550 			break;
551 		}
552 	}
553 
554 	/*
555 	 * If addr == 0 we were called by memcntl() when the
556 	 * size code is 0.  Don't set pgsz less than current size.
557 	 */
558 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
559 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
560 	}
561 
562 	return (pgsz);
563 }
564 
565 static size_t
566 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
567 {
568 	size_t		pgsz = MMU_PAGESIZE;
569 	int		szc;
570 
571 	/*
572 	 * If len is zero, retrieve from proc and don't demote the page size.
573 	 * Use atleast the default pagesize.
574 	 */
575 	if (len == 0) {
576 		len = p->p_stksize;
577 	}
578 	len = MAX(len, default_ustack_lpsize);
579 
580 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
581 		pgsz = hw_page_array[szc].hp_size;
582 		if ((disable_auto_data_large_pages & (1 << szc)) ||
583 		    pgsz > max_ustack_lpsize)
584 			continue;
585 		if (len >= pgsz) {
586 			break;
587 		}
588 	}
589 
590 	/*
591 	 * If addr == 0 we were called by memcntl() or exec_args() when the
592 	 * size code is 0.  Don't set pgsz less than current size.
593 	 */
594 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
595 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
596 	}
597 
598 	return (pgsz);
599 }
600 
601 static size_t
602 map_pgszism(caddr_t addr, size_t len)
603 {
604 	uint_t szc;
605 	size_t pgsz;
606 
607 	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
608 		if (disable_ism_large_pages & (1 << szc))
609 			continue;
610 
611 		pgsz = hw_page_array[szc].hp_size;
612 		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
613 			return (pgsz);
614 	}
615 
616 	return (DEFAULT_ISM_PAGESIZE);
617 }
618 
619 /*
620  * Suggest a page size to be used to map a segment of type maptype and length
621  * len.  Returns a page size (not a size code).
622  */
623 /* ARGSUSED */
624 size_t
625 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
626 {
627 	size_t	pgsz = MMU_PAGESIZE;
628 
629 	ASSERT(maptype != MAPPGSZ_VA);
630 
631 	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
632 		return (MMU_PAGESIZE);
633 	}
634 
635 	switch (maptype) {
636 	case MAPPGSZ_ISM:
637 		pgsz = map_pgszism(addr, len);
638 		break;
639 
640 	case MAPPGSZ_STK:
641 		if (max_ustack_lpsize > MMU_PAGESIZE) {
642 			pgsz = map_pgszstk(p, addr, len);
643 		}
644 		break;
645 
646 	case MAPPGSZ_HEAP:
647 		if (max_uheap_lpsize > MMU_PAGESIZE) {
648 			pgsz = map_pgszheap(p, addr, len);
649 		}
650 		break;
651 	}
652 	return (pgsz);
653 }
654 
655 
656 /* assumes TTE8K...TTE4M == szc */
657 
658 static uint_t
659 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
660     size_t max_lpsize, size_t min_physmem)
661 {
662 	caddr_t eaddr = addr + size;
663 	uint_t szcvec = 0;
664 	caddr_t raddr;
665 	caddr_t readdr;
666 	size_t pgsz;
667 	int i;
668 
669 	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
670 		return (0);
671 	}
672 	for (i = mmu_page_sizes - 1; i > 0; i--) {
673 		if (disable_lpgs & (1 << i)) {
674 			continue;
675 		}
676 		pgsz = page_get_pagesize(i);
677 		if (pgsz > max_lpsize) {
678 			continue;
679 		}
680 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
681 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
682 		if (raddr < addr || raddr >= readdr) {
683 			continue;
684 		}
685 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
686 			continue;
687 		}
688 		szcvec |= (1 << i);
689 		/*
690 		 * And or in the remaining enabled page sizes.
691 		 */
692 		szcvec |= P2PHASE(~disable_lpgs, (1 << i));
693 		szcvec &= ~1; /* no need to return 8K pagesize */
694 		break;
695 	}
696 	return (szcvec);
697 }
698 
699 /*
700  * Return a bit vector of large page size codes that
701  * can be used to map [addr, addr + len) region.
702  */
703 /* ARGSUSED */
704 uint_t
705 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
706     int memcntl)
707 {
708 	if (flags & MAP_TEXT) {
709 		return (map_szcvec(addr, size, off,
710 		    disable_auto_text_large_pages,
711 		    max_utext_lpsize, shm_lpg_min_physmem));
712 
713 	} else if (flags & MAP_INITDATA) {
714 		return (map_szcvec(addr, size, off,
715 		    disable_auto_data_large_pages,
716 		    max_uidata_lpsize, privm_lpg_min_physmem));
717 
718 	} else if (type == MAPPGSZC_SHM) {
719 		return (map_szcvec(addr, size, off,
720 		    disable_auto_data_large_pages,
721 		    max_shm_lpsize, shm_lpg_min_physmem));
722 
723 	} else if (type == MAPPGSZC_HEAP) {
724 		return (map_szcvec(addr, size, off,
725 		    disable_auto_data_large_pages,
726 		    max_uheap_lpsize, privm_lpg_min_physmem));
727 
728 	} else if (type == MAPPGSZC_STACK) {
729 		return (map_szcvec(addr, size, off,
730 		    disable_auto_data_large_pages,
731 		    max_ustack_lpsize, privm_lpg_min_physmem));
732 
733 	} else {
734 		return (map_szcvec(addr, size, off,
735 		    disable_auto_data_large_pages,
736 		    max_privmap_lpsize, privm_lpg_min_physmem));
737 	}
738 }
739 
740 /*
741  * Anchored in the table below are counters used to keep track
742  * of free contiguous physical memory. Each element of the table contains
743  * the array of counters, the size of array which is allocated during
744  * startup based on physmax and a shift value used to convert a pagenum
745  * into a counter array index or vice versa. The table has page size
746  * for rows and region size for columns:
747  *
748  *	page_counters[page_size][region_size]
749  *
750  *	page_size: 	TTE size code of pages on page_size freelist.
751  *
 *	region_size:	TTE size code of a candidate larger page
 *			made up of contiguous free page_size pages.
754  *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
759  *
 * 	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.	So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
767  */
768 
769 /*
770  * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
771  * dimensions are allocated dynamically.
772  */
773 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
774 
775 /*
776  * For now there is only a single size cache list.
777  * Allocated dynamically.
778  */
779 page_t ***page_cachelists[MAX_MEM_TYPES];
780 
781 kmutex_t *fpc_mutex[NPC_MUTEX];
782 kmutex_t *cpc_mutex[NPC_MUTEX];
783 
784 /*
785  * Calculate space needed for page freelists and counters
786  */
787 size_t
788 calc_free_pagelist_sz(void)
789 {
790 	int szc;
791 	size_t alloc_sz, cache_sz, free_sz;
792 
793 	/*
794 	 * one cachelist per color, node, and type
795 	 */
796 	cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
797 	    sizeof (page_t **);
798 	cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
799 
800 	/*
801 	 * one freelist per size, color, node, and type
802 	 */
803 	free_sz = sizeof (page_t **);
804 	for (szc = 0; szc < mmu_page_sizes; szc++)
805 		free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
806 	free_sz *= max_mem_nodes * MAX_MEM_TYPES;
807 
808 	alloc_sz = cache_sz + free_sz + page_ctrs_sz();
809 	return (alloc_sz);
810 }
811 
812 caddr_t
813 alloc_page_freelists(caddr_t alloc_base)
814 {
815 	int	mnode, mtype;
816 	int	szc, clrs;
817 
818 	/*
819 	 * We only support small pages in the cachelist.
820 	 */
821 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
822 		page_cachelists[mtype] = (page_t ***)alloc_base;
823 		alloc_base += (max_mem_nodes * sizeof (page_t **));
824 		for (mnode = 0; mnode < max_mem_nodes; mnode++) {
825 			page_cachelists[mtype][mnode] = (page_t **)alloc_base;
826 			alloc_base +=
827 			    (page_get_pagecolors(0) * sizeof (page_t *));
828 		}
829 	}
830 
831 	/*
832 	 * Allocate freelists bins for all
833 	 * supported page sizes.
834 	 */
835 	for (szc = 0; szc < mmu_page_sizes; szc++) {
836 		clrs = page_get_pagecolors(szc);
837 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
838 			page_freelists[szc][mtype] = (page_t ***)alloc_base;
839 			alloc_base += (max_mem_nodes * sizeof (page_t **));
840 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
841 				page_freelists[szc][mtype][mnode] =
842 				    (page_t **)alloc_base;
843 				alloc_base += (clrs * (sizeof (page_t *)));
844 			}
845 		}
846 	}
847 
848 	alloc_base = page_ctrs_alloc(alloc_base);
849 	return (alloc_base);
850 }
851 
852 /*
853  * Allocate page_freelists locks for a memnode from the nucleus data
854  * area. This is the first time that mmu_page_sizes is used during
855  * bootup, so check mmu_page_sizes initialization.
856  */
857 int
858 ndata_alloc_page_mutexs(struct memlist *ndata)
859 {
860 	size_t alloc_sz;
861 	caddr_t alloc_base;
862 	int	i;
863 	void	page_coloring_init();
864 
865 	page_coloring_init();
866 	if (&mmu_init_mmu_page_sizes) {
867 		if (!mmu_init_mmu_page_sizes(0)) {
868 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
869 			    mmu_page_sizes);
870 		}
871 	}
872 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
873 
874 	/* fpc_mutex and cpc_mutex */
875 	alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
876 
877 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
878 	if (alloc_base == NULL)
879 		return (-1);
880 
881 	ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
882 
883 	for (i = 0; i < NPC_MUTEX; i++) {
884 		fpc_mutex[i] = (kmutex_t *)alloc_base;
885 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
886 		cpc_mutex[i] = (kmutex_t *)alloc_base;
887 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
888 	}
889 	return (0);
890 }
891 
892 /*
893  * To select our starting bin, we stride through the bins with a stride
894  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
895  * in simulation and practice for different workloads on varying cache sizes.
896  */
897 uint32_t color_start_current = 0;
898 uint32_t color_start_stride = 337;
899 int color_start_random = 0;
900 
901 /* ARGSUSED */
902 uint_t
903 get_color_start(struct as *as)
904 {
905 	uint32_t old, new;
906 
907 	if (consistent_coloring == 2 || color_start_random) {
908 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
909 		    (hw_page_array[0].hp_colors - 1)));
910 	}
911 
912 	do {
913 		old = color_start_current;
914 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
915 	} while (atomic_cas_32(&color_start_current, old, new) != old);
916 
917 	return ((uint_t)(new));
918 }
919 
920 /*
921  * Called once at startup from kphysm_init() -- before memialloc()
922  * is invoked to do the 1st page_free()/page_freelist_add().
923  *
924  * initializes page_colors and page_colors_mask based on ecache_setsize.
925  *
926  * Also initializes the counter locks.
927  */
928 void
929 page_coloring_init()
930 {
931 	int	a, i;
932 	uint_t colors;
933 
934 	if (do_pg_coloring == 0) {
935 		page_colors = 1;
936 		for (i = 0; i < mmu_page_sizes; i++) {
937 			colorequivszc[i] = 0;
938 			hw_page_array[i].hp_colors = 1;
939 		}
940 		return;
941 	}
942 
943 	/*
944 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
945 	 * the max ecache setsize of all cpus configured in the system or, for
946 	 * cheetah+ systems, the max possible ecache setsize for all possible
947 	 * cheetah+ cpus.
948 	 */
949 	page_colors = ecache_setsize / MMU_PAGESIZE;
950 	page_colors_mask = page_colors - 1;
951 
952 	vac_colors = vac_size / MMU_PAGESIZE;
953 	vac_colors_mask = vac_colors -1;
954 
955 	page_coloring_shift = 0;
956 	a = ecache_setsize;
957 	while (a >>= 1) {
958 		page_coloring_shift++;
959 	}
960 
961 	/* initialize number of colors per page size */
962 	for (i = 0; i < mmu_page_sizes; i++) {
963 		hw_page_array[i].hp_colors = (page_colors_mask >>
964 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
965 		    + 1;
966 		colorequivszc[i] = 0;
967 	}
968 
969 	/*
970 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
971 	 * cpu_page_colors set to -1 during DR operation or during startup
972 	 * if setsizes are heterogenous.
973 	 *
974 	 * The value of cpu_page_colors determines if additional color bins
975 	 * need to be checked for a particular color in the page_get routines.
976 	 */
977 	if (cpu_setsize > 0 && cpu_page_colors == 0 &&
978 	    cpu_setsize < ecache_setsize) {
979 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
980 		a = lowbit(page_colors) - lowbit(cpu_page_colors);
981 		ASSERT(a > 0);
982 		ASSERT(a < 16);
983 
984 		for (i = 0; i < mmu_page_sizes; i++) {
985 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
986 				continue;
987 			}
988 			while ((colors >> a) == 0)
989 				a--;
990 			ASSERT(a >= 0);
991 
992 			/* higher 4 bits encodes color equiv mask */
993 			colorequivszc[i] = (a << 4);
994 		}
995 	}
996 
997 	/* do cpu specific color initialization */
998 	if (&page_coloring_init_cpu) {
999 		page_coloring_init_cpu();
1000 	}
1001 }
1002 
1003 int
1004 bp_color(struct buf *bp)
1005 {
1006 	int color = -1;
1007 
1008 	if (vac) {
1009 		if ((bp->b_flags & B_PAGEIO) != 0) {
1010 			color = sfmmu_get_ppvcolor(bp->b_pages);
1011 		} else if (bp->b_un.b_addr != NULL) {
1012 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
1013 		}
1014 	}
1015 	return (color < 0 ? 0 : ptob(color));
1016 }
1017 
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 *
 * Here it simply forwards to sfmmu_cache_flushall().
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}
1028 
/*
 * Report whether the ranges [va1, va1 + sz1) and [va2, va2 + sz2) overlap.
 *
 * Returns 0 when one range ends at or before the start of the other,
 * 1 otherwise.  Two ranges with the same start address are always
 * reported as overlapping, regardless of their sizes.
 *
 * The distance between the two start addresses is compared against the
 * size of the lower range instead of computing va + sz, so a range that
 * runs up to (or past) the top of the address space cannot wrap around
 * and be mistaken for one that ends below the other range.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	/* range 1 lies entirely below range 2 */
	if (va1 < va2 && sz1 <= va2 - va1)
		return (0);

	/* range 2 lies entirely below range 1 */
	if (va2 < va1 && sz2 <= va1 - va2)
		return (0);

	return (1);
}
1040 
1041 /*
1042  * Return the number of bytes, relative to the beginning of a given range, that
1043  * are non-toxic (can be read from and written to with relative impunity).
1044  */
1045 size_t
1046 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1047 {
1048 	/* OBP reads are harmless, but we don't want people writing there */
1049 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1050 	    OFW_START_ADDR + 1))
1051 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1052 
1053 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1054 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1055 
1056 	return (sz); /* no overlap */
1057 }
1058 
1059 /*
1060  * Minimum physmem required for enabling large pages for kernel heap
1061  * Currently we do not enable lp for kmem on systems with less
1062  * than 1GB of memory. This value can be changed via /etc/system
1063  */
1064 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
1065 
1066 /*
1067  * this function chooses large page size for kernel heap
1068  */
1069 size_t
1070 get_segkmem_lpsize(size_t lpsize)
1071 {
1072 	size_t memtotal = physmem * PAGESIZE;
1073 	size_t mmusz;
1074 	uint_t szc;
1075 
1076 	if (memtotal < segkmem_lpminphysmem)
1077 		return (PAGESIZE);
1078 
1079 	if (plat_lpkmem_is_supported != NULL &&
1080 	    plat_lpkmem_is_supported() == 0)
1081 		return (PAGESIZE);
1082 
1083 	mmusz = mmu_get_kernel_lpsize(lpsize);
1084 	szc = page_szc(mmusz);
1085 
1086 	while (szc) {
1087 		if (!(disable_large_pages & (1 << szc)))
1088 			return (page_get_pagesize(szc));
1089 		szc--;
1090 	}
1091 	return (PAGESIZE);
1092 }
1093