xref: /titanic_52/usr/src/uts/sun4/vm/vm_dep.c (revision 02b4e56ca3a4e4a4fe9e52fca9c2972101f0e57f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * UNIX machine dependent virtual memory support.
28  */
29 
30 #include <sys/vm.h>
31 #include <sys/exec.h>
32 
33 #include <sys/exechdr.h>
34 #include <vm/seg_kmem.h>
35 #include <sys/atomic.h>
36 #include <sys/archsystm.h>
37 #include <sys/machsystm.h>
38 #include <sys/kdi.h>
39 #include <sys/cpu_module.h>
40 
41 #include <vm/hat_sfmmu.h>
42 
43 #include <sys/memnode.h>
44 
45 #include <sys/mem_config.h>
46 #include <sys/mem_cage.h>
47 #include <vm/vm_dep.h>
48 #include <vm/page.h>
49 #include <sys/platform_module.h>
50 
/*
 * Set by module specific config routines, and only by modules which
 * will use physical cache page coloring.  While it is zero,
 * page_coloring_init() gives every page size a single color and returns.
 */
int do_pg_coloring = 0;

/*
 * This variable can be conveniently patched at kernel load time to
 * prevent do_pg_coloring from being enabled by
 * module specific config routines.
 */

int use_page_coloring = 1;

/*
 * initialized by page_coloring_init()
 *
 * page_colors, page_colors_mask and page_coloring_shift are defined
 * elsewhere.  vac_colors and vac_colors_mask are computed from vac_size
 * in page_coloring_init().  cpu_page_colors is set there when the cpu
 * setsize is smaller than ecache_setsize; it is set to -1 during DR
 * operation or during startup if setsizes are heterogenous.
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/*
 * cpu specific coloring initialization; weak, so page_coloring_init()
 * tests the symbol's address before calling it.
 */
extern void page_coloring_init_cpu();
#pragma weak page_coloring_init_cpu

/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t		plcnt;		/* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
	struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
	caddr_t, size_t, uint_t, struct lgrp *);
/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation).
 *
 * Because the type bits are zero here, the frame number is returned
 * unchanged.
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
	return (pf);
}
113 
114 /*
115  * Use physmax to determine the highest physical page of DRAM memory
116  * It is assumed that any physical addresses above physmax is in IO space.
117  * We don't bother checking the low end because we assume that memory space
118  * begins at physical page frame 0.
119  *
120  * Return 1 if the page frame is onboard DRAM memory, else 0.
121  * Returns 0 for nvram so it won't be cached.
122  */
123 int
124 pf_is_memory(pfn_t pf)
125 {
126 	/* We must be IO space */
127 	if (pf > physmax)
128 		return (0);
129 
130 	/* We must be memory space */
131 	return (1);
132 }
133 
134 /*
135  * Handle a pagefault.
136  */
137 faultcode_t
138 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
139 {
140 	struct as *as;
141 	struct proc *p;
142 	faultcode_t res;
143 	caddr_t base;
144 	size_t len;
145 	int err;
146 
147 	if (INVALID_VADDR(addr))
148 		return (FC_NOMAP);
149 
150 	if (iskernel) {
151 		as = &kas;
152 	} else {
153 		p = curproc;
154 		as = p->p_as;
155 #if defined(SF_ERRATA_57)
156 		/*
157 		 * Prevent infinite loops due to a segment driver
158 		 * setting the execute permissions and the sfmmu hat
159 		 * silently ignoring them.
160 		 */
161 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
162 		    addr < errata57_limit) {
163 			res = FC_NOMAP;
164 			goto out;
165 		}
166 #endif
167 	}
168 
169 	/*
170 	 * Dispatch pagefault.
171 	 */
172 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
173 
174 	/*
175 	 * If this isn't a potential unmapped hole in the user's
176 	 * UNIX data or stack segments, just return status info.
177 	 */
178 	if (!(res == FC_NOMAP && iskernel == 0))
179 		goto out;
180 
181 	/*
182 	 * Check to see if we happened to faulted on a currently unmapped
183 	 * part of the UNIX data or stack segments.  If so, create a zfod
184 	 * mapping there and then try calling the fault routine again.
185 	 */
186 	base = p->p_brkbase;
187 	len = p->p_brksize;
188 
189 	if (addr < base || addr >= base + len) {		/* data seg? */
190 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
191 		len = p->p_stksize;
192 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
193 			/* not in either UNIX data or stack segments */
194 			res = FC_NOMAP;
195 			goto out;
196 		}
197 	}
198 
199 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
200 	/* This code is probably not needed anymore */
201 
202 	/* expand the gap to the page boundaries on each side */
203 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
204 	    ((uintptr_t)base & PAGEMASK);
205 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
206 
207 	as_rangelock(as);
208 	as_purge(as);
209 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
210 		err = as_map(as, base, len, segvn_create, zfod_argsp);
211 		as_rangeunlock(as);
212 		if (err) {
213 			res = FC_MAKE_ERR(err);
214 			goto out;
215 		}
216 	} else {
217 		/*
218 		 * This page is already mapped by another thread after we
219 		 * returned from as_fault() above.  We just fallthrough
220 		 * as_fault() below.
221 		 */
222 		as_rangeunlock(as);
223 	}
224 
225 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
226 
227 out:
228 
229 	return (res);
230 }
231 
232 /*
233  * This is the routine which defines the address limit implied
234  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
235  * mappable address in a 32-bit process on this platform (though
236  * perhaps we should make it be UINT32_MAX here?)
237  */
238 void
239 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
240 {
241 	struct proc *p = curproc;
242 	caddr_t userlimit = flags & _MAP_LOW32 ?
243 	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
244 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
245 }
246 
/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 * valid_usr_range() rejects any range that intersects
 * [hole_start, hole_end), and valid_va_range_aligned() trims candidate
 * ranges so that they avoid it.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;

/*
 * Counts how often valid_va_range_aligned() was handed a range whose
 * end (*basep + *lenp) wrapped past the top of the address space.
 */
int valid_va_range_aligned_wraparound;
260 /*
261  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
262  * addresses at least "minlen" long, where the base of the range is at "off"
263  * phase from an "align" boundary and there is space for a "redzone"-sized
264  * redzone on either side of the range.  On success, 1 is returned and *basep
265  * and *lenp are adjusted to describe the acceptable range (including
266  * the redzone).  On failure, 0 is returned.
267  */
268 int
269 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
270     size_t align, size_t redzone, size_t off)
271 {
272 	caddr_t hi, lo;
273 	size_t tot_len;
274 
275 	ASSERT(align == 0 ? off == 0 : off < align);
276 	ASSERT(ISP2(align));
277 	ASSERT(align == 0 || align >= PAGESIZE);
278 
279 	lo = *basep;
280 	hi = lo + *lenp;
281 	tot_len = minlen + 2 * redzone;	/* need at least this much space */
282 
283 	/* If hi rolled over the top try cutting back. */
284 	if (hi < lo) {
285 		*lenp = 0UL - (uintptr_t)lo - 1UL;
286 		/* Trying to see if this really happens, and then if so, why */
287 		valid_va_range_aligned_wraparound++;
288 		hi = lo + *lenp;
289 	}
290 	if (*lenp < tot_len) {
291 		return (0);
292 	}
293 
294 	/*
295 	 * Deal with a possible hole in the address range between
296 	 * hole_start and hole_end that should never be mapped by the MMU.
297 	 */
298 
299 	if (lo < hole_start) {
300 		if (hi > hole_start)
301 			if (hi < hole_end)
302 				hi = hole_start;
303 			else
304 				/* lo < hole_start && hi >= hole_end */
305 				if (dir == AH_LO) {
306 					/*
307 					 * prefer lowest range
308 					 */
309 					if (hole_start - lo >= tot_len)
310 						hi = hole_start;
311 					else if (hi - hole_end >= tot_len)
312 						lo = hole_end;
313 					else
314 						return (0);
315 				} else {
316 					/*
317 					 * prefer highest range
318 					 */
319 					if (hi - hole_end >= tot_len)
320 						lo = hole_end;
321 					else if (hole_start - lo >= tot_len)
322 						hi = hole_start;
323 					else
324 						return (0);
325 				}
326 	} else {
327 		/* lo >= hole_start */
328 		if (hi < hole_end)
329 			return (0);
330 		if (lo < hole_end)
331 			lo = hole_end;
332 	}
333 
334 	/* Check if remaining length is too small */
335 	if (hi - lo < tot_len) {
336 		return (0);
337 	}
338 	if (align > 1) {
339 		caddr_t tlo = lo + redzone;
340 		caddr_t thi = hi - redzone;
341 		tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
342 		if (tlo < lo + redzone) {
343 			return (0);
344 		}
345 		if (thi < tlo || thi - tlo < minlen) {
346 			return (0);
347 		}
348 	}
349 	*basep = lo;
350 	*lenp = hi - lo;
351 	return (1);
352 }
353 
/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 * is returned.
 *
 * This is valid_va_range_aligned() with no alignment, no redzone and
 * no phase offset.
 */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
}
365 
366 /*
367  * Determine whether [addr, addr+len] with protections `prot' are valid
368  * for a user address space.
369  */
370 /*ARGSUSED*/
371 int
372 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
373     caddr_t userlimit)
374 {
375 	caddr_t eaddr = addr + len;
376 
377 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
378 		return (RANGE_BADADDR);
379 
380 	/*
381 	 * Determine if the address range falls within an illegal
382 	 * range of the MMU.
383 	 */
384 	if (eaddr > hole_start && addr < hole_end)
385 		return (RANGE_BADADDR);
386 
387 #if defined(SF_ERRATA_57)
388 	/*
389 	 * Make sure USERLIMIT isn't raised too high
390 	 */
391 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
392 	    errata57_limit == 0);
393 
394 	if (AS_TYPE_64BIT(as) &&
395 	    (addr < errata57_limit) &&
396 	    (prot & PROT_EXEC))
397 		return (RANGE_BADPROT);
398 #endif /* SF_ERRATA57 */
399 	return (RANGE_OKAY);
400 }
401 
402 /*
403  * Routine used to check to see if an a.out can be executed
404  * by the current machine/architecture.
405  */
406 int
407 chkaout(struct exdata *exp)
408 {
409 	if (exp->ux_mach == M_SPARC)
410 		return (0);
411 	else
412 		return (ENOEXEC);
413 }
414 
415 /*
416  * The following functions return information about an a.out
417  * which is used when a program is executed.
418  */
419 
420 /*
421  * Return the load memory address for the data segment.
422  */
423 caddr_t
424 getdmem(struct exec *exp)
425 {
426 	/*
427 	 * XXX - Sparc Reference Hack approaching
428 	 * Remember that we are loading
429 	 * 8k executables into a 4k machine
430 	 * DATA_ALIGN == 2 * PAGESIZE
431 	 */
432 	if (exp->a_text)
433 		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
434 	else
435 		return ((caddr_t)USRTEXT);
436 }
437 
438 /*
439  * Return the starting disk address for the data segment.
440  */
441 ulong_t
442 getdfile(struct exec *exp)
443 {
444 	if (exp->a_magic == ZMAGIC)
445 		return (exp->a_text);
446 	else
447 		return (sizeof (struct exec) + exp->a_text);
448 }
449 
/*
 * Return the load memory address for the text segment.
 * This is always USRTEXT; the exec header is not consulted.
 */

/*ARGSUSED*/
caddr_t
gettmem(struct exec *exp)
{
	return ((caddr_t)USRTEXT);
}
460 
461 /*
462  * Return the file byte offset for the text segment.
463  */
464 uint_t
465 gettfile(struct exec *exp)
466 {
467 	if (exp->a_magic == ZMAGIC)
468 		return (0);
469 	else
470 		return (sizeof (struct exec));
471 }
472 
473 void
474 getexinfo(
475 	struct exdata *edp_in,
476 	struct exdata *edp_out,
477 	int *pagetext,
478 	int *pagedata)
479 {
480 	*edp_out = *edp_in;	/* structure copy */
481 
482 	if ((edp_in->ux_mag == ZMAGIC) &&
483 	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
484 		*pagetext = 1;
485 		*pagedata = 1;
486 	} else {
487 		*pagetext = 0;
488 		*pagedata = 0;
489 	}
490 }
491 
492 /*
493  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
494  * KPM selects an address such that it's equal offset modulo shm_alignment and
495  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
496  */
497 int
498 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
499 {
500 	if (vac) {
501 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
502 	} else {
503 		return (0);
504 	}
505 }
506 
/*
 * Sanity control. Don't use large pages regardless of user
 * settings if there's less than privm_lpg_min_physmem or
 * shm_lpg_min_physmem memory installed.
 * The unit for these variables is 8K pages.
 */
pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
pgcnt_t privm_lpg_min_physmem = 131072;			/* 1GB */
514 
515 static size_t
516 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
517 {
518 	size_t		pgsz = MMU_PAGESIZE;
519 	int		szc;
520 
521 	/*
522 	 * If len is zero, retrieve from proc and don't demote the page size.
523 	 * Use atleast the default pagesize.
524 	 */
525 	if (len == 0) {
526 		len = p->p_brkbase + p->p_brksize - p->p_bssbase;
527 	}
528 	len = MAX(len, default_uheap_lpsize);
529 
530 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
531 		pgsz = hw_page_array[szc].hp_size;
532 		if ((disable_auto_data_large_pages & (1 << szc)) ||
533 		    pgsz > max_uheap_lpsize)
534 			continue;
535 		if (len >= pgsz) {
536 			break;
537 		}
538 	}
539 
540 	/*
541 	 * If addr == 0 we were called by memcntl() when the
542 	 * size code is 0.  Don't set pgsz less than current size.
543 	 */
544 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
545 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
546 	}
547 
548 	return (pgsz);
549 }
550 
551 static size_t
552 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
553 {
554 	size_t		pgsz = MMU_PAGESIZE;
555 	int		szc;
556 
557 	/*
558 	 * If len is zero, retrieve from proc and don't demote the page size.
559 	 * Use atleast the default pagesize.
560 	 */
561 	if (len == 0) {
562 		len = p->p_stksize;
563 	}
564 	len = MAX(len, default_ustack_lpsize);
565 
566 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
567 		pgsz = hw_page_array[szc].hp_size;
568 		if ((disable_auto_data_large_pages & (1 << szc)) ||
569 		    pgsz > max_ustack_lpsize)
570 			continue;
571 		if (len >= pgsz) {
572 			break;
573 		}
574 	}
575 
576 	/*
577 	 * If addr == 0 we were called by memcntl() or exec_args() when the
578 	 * size code is 0.  Don't set pgsz less than current size.
579 	 */
580 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
581 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
582 	}
583 
584 	return (pgsz);
585 }
586 
587 static size_t
588 map_pgszism(caddr_t addr, size_t len)
589 {
590 	uint_t szc;
591 	size_t pgsz;
592 
593 	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
594 		if (disable_ism_large_pages & (1 << szc))
595 			continue;
596 
597 		pgsz = hw_page_array[szc].hp_size;
598 		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
599 			return (pgsz);
600 	}
601 
602 	return (DEFAULT_ISM_PAGESIZE);
603 }
604 
605 /*
606  * Suggest a page size to be used to map a segment of type maptype and length
607  * len.  Returns a page size (not a size code).
608  */
609 /* ARGSUSED */
610 size_t
611 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
612 {
613 	size_t	pgsz = MMU_PAGESIZE;
614 
615 	ASSERT(maptype != MAPPGSZ_VA);
616 
617 	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
618 		return (MMU_PAGESIZE);
619 	}
620 
621 	switch (maptype) {
622 	case MAPPGSZ_ISM:
623 		pgsz = map_pgszism(addr, len);
624 		break;
625 
626 	case MAPPGSZ_STK:
627 		if (max_ustack_lpsize > MMU_PAGESIZE) {
628 			pgsz = map_pgszstk(p, addr, len);
629 		}
630 		break;
631 
632 	case MAPPGSZ_HEAP:
633 		if (max_uheap_lpsize > MMU_PAGESIZE) {
634 			pgsz = map_pgszheap(p, addr, len);
635 		}
636 		break;
637 	}
638 	return (pgsz);
639 }
640 
641 
642 /* assumes TTE8K...TTE4M == szc */
643 
644 static uint_t
645 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
646     size_t max_lpsize, size_t min_physmem)
647 {
648 	caddr_t eaddr = addr + size;
649 	uint_t szcvec = 0;
650 	caddr_t raddr;
651 	caddr_t readdr;
652 	size_t pgsz;
653 	int i;
654 
655 	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
656 		return (0);
657 	}
658 	for (i = mmu_page_sizes - 1; i > 0; i--) {
659 		if (disable_lpgs & (1 << i)) {
660 			continue;
661 		}
662 		pgsz = page_get_pagesize(i);
663 		if (pgsz > max_lpsize) {
664 			continue;
665 		}
666 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
667 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
668 		if (raddr < addr || raddr >= readdr) {
669 			continue;
670 		}
671 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
672 			continue;
673 		}
674 		szcvec |= (1 << i);
675 		/*
676 		 * And or in the remaining enabled page sizes.
677 		 */
678 		szcvec |= P2PHASE(~disable_lpgs, (1 << i));
679 		szcvec &= ~1; /* no need to return 8K pagesize */
680 		break;
681 	}
682 	return (szcvec);
683 }
684 
685 /*
686  * Return a bit vector of large page size codes that
687  * can be used to map [addr, addr + len) region.
688  */
689 /* ARGSUSED */
690 uint_t
691 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
692     int memcntl)
693 {
694 	if (flags & MAP_TEXT) {
695 		return (map_szcvec(addr, size, off,
696 		    disable_auto_text_large_pages,
697 		    max_utext_lpsize, shm_lpg_min_physmem));
698 
699 	} else if (flags & MAP_INITDATA) {
700 		return (map_szcvec(addr, size, off,
701 		    disable_auto_data_large_pages,
702 		    max_uidata_lpsize, privm_lpg_min_physmem));
703 
704 	} else if (type == MAPPGSZC_SHM) {
705 		return (map_szcvec(addr, size, off,
706 		    disable_auto_data_large_pages,
707 		    max_shm_lpsize, shm_lpg_min_physmem));
708 
709 	} else if (type == MAPPGSZC_HEAP) {
710 		return (map_szcvec(addr, size, off,
711 		    disable_auto_data_large_pages,
712 		    max_uheap_lpsize, privm_lpg_min_physmem));
713 
714 	} else if (type == MAPPGSZC_STACK) {
715 		return (map_szcvec(addr, size, off,
716 		    disable_auto_data_large_pages,
717 		    max_ustack_lpsize, privm_lpg_min_physmem));
718 
719 	} else {
720 		return (map_szcvec(addr, size, off,
721 		    disable_auto_data_large_pages,
722 		    max_privmap_lpsize, privm_lpg_min_physmem));
723 	}
724 }
725 
726 /*
727  * Anchored in the table below are counters used to keep track
728  * of free contiguous physical memory. Each element of the table contains
729  * the array of counters, the size of array which is allocated during
730  * startup based on physmax and a shift value used to convert a pagenum
731  * into a counter array index or vice versa. The table has page size
732  * for rows and region size for columns:
733  *
734  *	page_counters[page_size][region_size]
735  *
736  *	page_size: 	TTE size code of pages on page_size freelist.
737  *
 *	region_size:	TTE size code of a candidate larger page
 *			made up of contiguous free page_size pages.
740  *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 * 	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.	So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
753  */
754 
/*
 * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 *
 * alloc_page_freelists() wires these up so that
 * page_freelists[szc][mtype][mnode] points at an array of
 * page_get_pagecolors(szc) page_t pointers.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 *
 * page_cachelists[mtype][mnode] points at an array of
 * page_get_pagecolors(0) page_t pointers.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

/*
 * Freelist and cachelist mutexes.  ndata_alloc_page_mutexs() points
 * each entry at an array of max_mem_nodes kmutex_t.
 */
kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];
769 
770 /*
771  * Calculate space needed for page freelists and counters
772  */
773 size_t
774 calc_free_pagelist_sz(void)
775 {
776 	int szc;
777 	size_t alloc_sz, cache_sz, free_sz;
778 
779 	/*
780 	 * one cachelist per color, node, and type
781 	 */
782 	cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
783 	    sizeof (page_t **);
784 	cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
785 
786 	/*
787 	 * one freelist per size, color, node, and type
788 	 */
789 	free_sz = sizeof (page_t **);
790 	for (szc = 0; szc < mmu_page_sizes; szc++)
791 		free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
792 	free_sz *= max_mem_nodes * MAX_MEM_TYPES;
793 
794 	alloc_sz = cache_sz + free_sz + page_ctrs_sz();
795 	return (alloc_sz);
796 }
797 
798 caddr_t
799 alloc_page_freelists(caddr_t alloc_base)
800 {
801 	int	mnode, mtype;
802 	int	szc, clrs;
803 
804 	/*
805 	 * We only support small pages in the cachelist.
806 	 */
807 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
808 		page_cachelists[mtype] = (page_t ***)alloc_base;
809 		alloc_base += (max_mem_nodes * sizeof (page_t **));
810 		for (mnode = 0; mnode < max_mem_nodes; mnode++) {
811 			page_cachelists[mtype][mnode] = (page_t **)alloc_base;
812 			alloc_base +=
813 			    (page_get_pagecolors(0) * sizeof (page_t *));
814 		}
815 	}
816 
817 	/*
818 	 * Allocate freelists bins for all
819 	 * supported page sizes.
820 	 */
821 	for (szc = 0; szc < mmu_page_sizes; szc++) {
822 		clrs = page_get_pagecolors(szc);
823 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
824 			page_freelists[szc][mtype] = (page_t ***)alloc_base;
825 			alloc_base += (max_mem_nodes * sizeof (page_t **));
826 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
827 				page_freelists[szc][mtype][mnode] =
828 				    (page_t **)alloc_base;
829 				alloc_base += (clrs * (sizeof (page_t *)));
830 			}
831 		}
832 	}
833 
834 	alloc_base = page_ctrs_alloc(alloc_base);
835 	return (alloc_base);
836 }
837 
838 /*
839  * Allocate page_freelists locks for a memnode from the nucleus data
840  * area. This is the first time that mmu_page_sizes is used during
841  * bootup, so check mmu_page_sizes initialization.
842  */
843 int
844 ndata_alloc_page_mutexs(struct memlist *ndata)
845 {
846 	size_t alloc_sz;
847 	caddr_t alloc_base;
848 	int	i;
849 	void	page_coloring_init();
850 
851 	page_coloring_init();
852 	if (&mmu_init_mmu_page_sizes) {
853 		if (!mmu_init_mmu_page_sizes(0)) {
854 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
855 			    mmu_page_sizes);
856 		}
857 	}
858 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
859 
860 	/* fpc_mutex and cpc_mutex */
861 	alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
862 
863 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
864 	if (alloc_base == NULL)
865 		return (-1);
866 
867 	ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
868 
869 	for (i = 0; i < NPC_MUTEX; i++) {
870 		fpc_mutex[i] = (kmutex_t *)alloc_base;
871 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
872 		cpc_mutex[i] = (kmutex_t *)alloc_base;
873 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
874 	}
875 	return (0);
876 }
877 
878 /*
879  * To select our starting bin, we stride through the bins with a stride
880  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
881  * in simulation and practice for different workloads on varying cache sizes.
882  */
883 uint32_t color_start_current = 0;
884 uint32_t color_start_stride = 337;
885 int color_start_random = 0;
886 
887 /* ARGSUSED */
888 uint_t
889 get_color_start(struct as *as)
890 {
891 	uint32_t old, new;
892 
893 	if (consistent_coloring == 2 || color_start_random) {
894 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
895 		    (hw_page_array[0].hp_colors - 1)));
896 	}
897 
898 	do {
899 		old = color_start_current;
900 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
901 	} while (cas32(&color_start_current, old, new) != old);
902 
903 	return ((uint_t)(new));
904 }
905 
906 /*
907  * Called once at startup from kphysm_init() -- before memialloc()
908  * is invoked to do the 1st page_free()/page_freelist_add().
909  *
910  * initializes page_colors and page_colors_mask based on ecache_setsize.
911  *
912  * Also initializes the counter locks.
913  */
914 void
915 page_coloring_init()
916 {
917 	int	a, i;
918 	uint_t colors;
919 
920 	if (do_pg_coloring == 0) {
921 		page_colors = 1;
922 		for (i = 0; i < mmu_page_sizes; i++) {
923 			colorequivszc[i] = 0;
924 			hw_page_array[i].hp_colors = 1;
925 		}
926 		return;
927 	}
928 
929 	/*
930 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
931 	 * the max ecache setsize of all cpus configured in the system or, for
932 	 * cheetah+ systems, the max possible ecache setsize for all possible
933 	 * cheetah+ cpus.
934 	 */
935 	page_colors = ecache_setsize / MMU_PAGESIZE;
936 	page_colors_mask = page_colors - 1;
937 
938 	vac_colors = vac_size / MMU_PAGESIZE;
939 	vac_colors_mask = vac_colors -1;
940 
941 	page_coloring_shift = 0;
942 	a = ecache_setsize;
943 	while (a >>= 1) {
944 		page_coloring_shift++;
945 	}
946 
947 	/* initialize number of colors per page size */
948 	for (i = 0; i < mmu_page_sizes; i++) {
949 		hw_page_array[i].hp_colors = (page_colors_mask >>
950 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
951 		    + 1;
952 		colorequivszc[i] = 0;
953 	}
954 
955 	/*
956 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
957 	 * cpu_page_colors set to -1 during DR operation or during startup
958 	 * if setsizes are heterogenous.
959 	 *
960 	 * The value of cpu_page_colors determines if additional color bins
961 	 * need to be checked for a particular color in the page_get routines.
962 	 */
963 	if (cpu_setsize > 0 && cpu_page_colors == 0 &&
964 	    cpu_setsize < ecache_setsize) {
965 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
966 		a = lowbit(page_colors) - lowbit(cpu_page_colors);
967 		ASSERT(a > 0);
968 		ASSERT(a < 16);
969 
970 		for (i = 0; i < mmu_page_sizes; i++) {
971 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
972 				continue;
973 			}
974 			while ((colors >> a) == 0)
975 				a--;
976 			ASSERT(a >= 0);
977 
978 			/* higher 4 bits encodes color equiv mask */
979 			colorequivszc[i] = (a << 4);
980 		}
981 	}
982 
983 	/* do cpu specific color initialization */
984 	if (&page_coloring_init_cpu) {
985 		page_coloring_init_cpu();
986 	}
987 }
988 
989 int
990 bp_color(struct buf *bp)
991 {
992 	int color = -1;
993 
994 	if (vac) {
995 		if ((bp->b_flags & B_PAGEIO) != 0) {
996 			color = sfmmu_get_ppvcolor(bp->b_pages);
997 		} else if (bp->b_un.b_addr != NULL) {
998 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
999 		}
1000 	}
1001 	return (color < 0 ? 0 : ptob(color));
1002 }
1003 
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 *
 * Here it simply forwards to sfmmu_cache_flushall().
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}
1014 
/*
 * Report whether the ranges [va1, va1 + sz1) and [va2, va2 + sz2) overlap.
 *
 * Returns 0 when one range lies entirely below the other, 1 otherwise.
 * Two ranges with the same start address are always reported as
 * overlapping.
 *
 * The size is compared against the distance between the two start
 * addresses rather than computing va + sz, because that sum can wrap
 * past the top of the address space and make a huge range look as if
 * it ended below the other one.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	/* range 1 starts lower and ends at or before range 2 begins */
	if (va1 < va2 && sz1 <= va2 - va1)
		return (0);

	/* range 2 starts lower and ends at or before range 1 begins */
	if (va2 < va1 && sz2 <= va1 - va2)
		return (0);

	return (1);
}
1026 
1027 /*
1028  * Return the number of bytes, relative to the beginning of a given range, that
1029  * are non-toxic (can be read from and written to with relative impunity).
1030  */
1031 size_t
1032 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1033 {
1034 	/* OBP reads are harmless, but we don't want people writing there */
1035 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1036 	    OFW_START_ADDR + 1))
1037 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1038 
1039 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1040 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1041 
1042 	return (sz); /* no overlap */
1043 }
1044 
1045 /*
1046  * Minimum physmem required for enabling large pages for kernel heap
1047  * Currently we do not enable lp for kmem on systems with less
1048  * than 1GB of memory. This value can be changed via /etc/system
1049  */
1050 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
1051 
1052 /*
1053  * this function chooses large page size for kernel heap
1054  */
1055 size_t
1056 get_segkmem_lpsize(size_t lpsize)
1057 {
1058 	size_t memtotal = physmem * PAGESIZE;
1059 	size_t mmusz;
1060 	uint_t szc;
1061 
1062 	if (memtotal < segkmem_lpminphysmem)
1063 		return (PAGESIZE);
1064 
1065 	if (plat_lpkmem_is_supported != NULL &&
1066 	    plat_lpkmem_is_supported() == 0)
1067 		return (PAGESIZE);
1068 
1069 	mmusz = mmu_get_kernel_lpsize(lpsize);
1070 	szc = page_szc(mmusz);
1071 
1072 	while (szc) {
1073 		if (!(disable_large_pages & (1 << szc)))
1074 			return (page_get_pagesize(szc));
1075 		szc--;
1076 	}
1077 	return (PAGESIZE);
1078 }
1079