xref: /illumos-gate/usr/src/uts/sun4/vm/vm_dep.c (revision 8119dad84d6416f13557b0ba8e2aaf9064cbcfd3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2016 Joyent, Inc.
25  * Copyright 2022 Garrett D'Amore <garrett@damore.org>
26  */
27 
28 /*
29  * UNIX machine dependent virtual memory support.
30  */
31 
32 #include <sys/vm.h>
33 #include <sys/exec.h>
34 
35 #include <sys/exechdr.h>
36 #include <vm/seg_kmem.h>
37 #include <sys/atomic.h>
38 #include <sys/archsystm.h>
39 #include <sys/machsystm.h>
40 #include <sys/kdi.h>
41 #include <sys/cpu_module.h>
42 #include <sys/secflags.h>
43 
44 #include <vm/hat_sfmmu.h>
45 
46 #include <sys/memnode.h>
47 
48 #include <sys/mem_config.h>
49 #include <sys/mem_cage.h>
50 #include <vm/vm_dep.h>
51 #include <vm/page.h>
52 #include <sys/platform_module.h>
53 
54 /*
55  * These variables are set by module specific config routines.
56  * They are only set by modules which will use physical cache page coloring.
57  */
58 int do_pg_coloring = 0;
59 
60 /*
61  * These variables can be conveniently patched at kernel load time to
62  * prevent do_pg_coloring from being enabled by
63  * module specific config routines.
64  */
65 
66 int use_page_coloring = 1;
67 
68 /*
69  * initialized by page_coloring_init()
70  */
71 extern uint_t page_colors;
72 extern uint_t page_colors_mask;
73 extern uint_t page_coloring_shift;
74 int cpu_page_colors;
75 uint_t vac_colors = 0;
76 uint_t vac_colors_mask = 0;
77 
78 /* cpu specific coloring initialization */
79 extern void page_coloring_init_cpu();
80 #pragma weak page_coloring_init_cpu
81 
82 /*
83  * get the ecache setsize for the current cpu.
84  */
85 #define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
86 
87 plcnt_t		plcnt;		/* page list count */
88 
89 /*
90  * This variable is set by the cpu module to contain the lowest
91  * address not affected by the SF_ERRATA_57 workaround.  It should
92  * remain 0 if the workaround is not needed.
93  */
94 #if defined(SF_ERRATA_57)
95 caddr_t errata57_limit;
96 #endif
97 
98 extern void page_relocate_hash(page_t *, page_t *);
99 
100 /*
101  * these must be defined in platform specific areas
102  */
103 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
104 	struct proc *, uint_t);
105 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
106 	caddr_t, size_t, uint_t, struct lgrp *);
107 /*
108  * Convert page frame number to an OBMEM page frame number
109  * (i.e. put in the type bits -- zero for this implementation)
110  */
111 pfn_t
112 impl_obmem_pfnum(pfn_t pf)
113 {
114 	return (pf);
115 }
116 
117 /*
118  * Use physmax to determine the highest physical page of DRAM memory
119  * It is assumed that any physical addresses above physmax is in IO space.
120  * We don't bother checking the low end because we assume that memory space
121  * begins at physical page frame 0.
122  *
123  * Return 1 if the page frame is onboard DRAM memory, else 0.
124  * Returns 0 for nvram so it won't be cached.
125  */
126 int
127 pf_is_memory(pfn_t pf)
128 {
129 	/* We must be IO space */
130 	if (pf > physmax)
131 		return (0);
132 
133 	/* We must be memory space */
134 	return (1);
135 }
136 
137 /*
138  * Handle a pagefault.
139  */
140 faultcode_t
141 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
142 {
143 	struct as *as;
144 	struct proc *p;
145 	faultcode_t res;
146 	caddr_t base;
147 	size_t len;
148 	int err;
149 
150 	if (INVALID_VADDR(addr))
151 		return (FC_NOMAP);
152 
153 	if (iskernel) {
154 		as = &kas;
155 	} else {
156 		p = curproc;
157 		as = p->p_as;
158 #if defined(SF_ERRATA_57)
159 		/*
160 		 * Prevent infinite loops due to a segment driver
161 		 * setting the execute permissions and the sfmmu hat
162 		 * silently ignoring them.
163 		 */
164 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
165 		    addr < errata57_limit) {
166 			res = FC_NOMAP;
167 			goto out;
168 		}
169 #endif
170 	}
171 
172 	/*
173 	 * Dispatch pagefault.
174 	 */
175 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
176 
177 	/*
178 	 * If this isn't a potential unmapped hole in the user's
179 	 * UNIX data or stack segments, just return status info.
180 	 */
181 	if (!(res == FC_NOMAP && iskernel == 0))
182 		goto out;
183 
184 	/*
185 	 * Check to see if we happened to faulted on a currently unmapped
186 	 * part of the UNIX data or stack segments.  If so, create a zfod
187 	 * mapping there and then try calling the fault routine again.
188 	 */
189 	base = p->p_brkbase;
190 	len = p->p_brksize;
191 
192 	if (addr < base || addr >= base + len) {		/* data seg? */
193 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
194 		len = p->p_stksize;
195 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
196 			/* not in either UNIX data or stack segments */
197 			res = FC_NOMAP;
198 			goto out;
199 		}
200 	}
201 
202 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
203 	/* This code is probably not needed anymore */
204 
205 	/* expand the gap to the page boundaries on each side */
206 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
207 	    ((uintptr_t)base & PAGEMASK);
208 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
209 
210 	as_rangelock(as);
211 	as_purge(as);
212 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
213 		err = as_map(as, base, len, segvn_create, zfod_argsp);
214 		as_rangeunlock(as);
215 		if (err) {
216 			res = FC_MAKE_ERR(err);
217 			goto out;
218 		}
219 	} else {
220 		/*
221 		 * This page is already mapped by another thread after we
222 		 * returned from as_fault() above.  We just fallthrough
223 		 * as_fault() below.
224 		 */
225 		as_rangeunlock(as);
226 	}
227 
228 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
229 
230 out:
231 
232 	return (res);
233 }
234 
235 /*
236  * This is the routine which defines the address limit implied
237  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
238  * mappable address in a 32-bit process on this platform (though
239  * perhaps we should make it be UINT32_MAX here?)
240  */
241 void
242 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
243 {
244 	struct proc *p = curproc;
245 	caddr_t userlimit = flags & _MAP_LOW32 ?
246 	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
247 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
248 }
249 
250 /*
251  * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
252  */
253 caddr_t	hole_start, hole_end;
254 
255 /*
256  * kpm mapping window
257  */
258 caddr_t kpm_vbase;
259 size_t  kpm_size;
260 uchar_t kpm_size_shift;
261 
262 int valid_va_range_aligned_wraparound;
263 /*
264  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
265  * addresses at least "minlen" long, where the base of the range is at "off"
266  * phase from an "align" boundary and there is space for a "redzone"-sized
267  * redzone on either side of the range.  On success, 1 is returned and *basep
268  * and *lenp are adjusted to describe the acceptable range (including
269  * the redzone).  On failure, 0 is returned.
270  */
271 int
272 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
273     size_t align, size_t redzone, size_t off)
274 {
275 	caddr_t hi, lo;
276 	size_t tot_len;
277 
278 	ASSERT(align == 0 ? off == 0 : off < align);
279 	ASSERT(ISP2(align));
280 	ASSERT(align == 0 || align >= PAGESIZE);
281 
282 	lo = *basep;
283 	hi = lo + *lenp;
284 	tot_len = minlen + 2 * redzone;	/* need at least this much space */
285 
286 	/* If hi rolled over the top try cutting back. */
287 	if (hi < lo) {
288 		*lenp = 0UL - (uintptr_t)lo - 1UL;
289 		/* Trying to see if this really happens, and then if so, why */
290 		valid_va_range_aligned_wraparound++;
291 		hi = lo + *lenp;
292 	}
293 	if (*lenp < tot_len) {
294 		return (0);
295 	}
296 
297 	/*
298 	 * Deal with a possible hole in the address range between
299 	 * hole_start and hole_end that should never be mapped by the MMU.
300 	 */
301 
302 	if (lo < hole_start) {
303 		if (hi > hole_start)
304 			if (hi < hole_end)
305 				hi = hole_start;
306 			else
307 				/* lo < hole_start && hi >= hole_end */
308 				if (dir == AH_LO) {
309 					/*
310 					 * prefer lowest range
311 					 */
312 					if (hole_start - lo >= tot_len)
313 						hi = hole_start;
314 					else if (hi - hole_end >= tot_len)
315 						lo = hole_end;
316 					else
317 						return (0);
318 				} else {
319 					/*
320 					 * prefer highest range
321 					 */
322 					if (hi - hole_end >= tot_len)
323 						lo = hole_end;
324 					else if (hole_start - lo >= tot_len)
325 						hi = hole_start;
326 					else
327 						return (0);
328 				}
329 	} else {
330 		/* lo >= hole_start */
331 		if (hi < hole_end)
332 			return (0);
333 		if (lo < hole_end)
334 			lo = hole_end;
335 	}
336 
337 	/* Check if remaining length is too small */
338 	if (hi - lo < tot_len) {
339 		return (0);
340 	}
341 	if (align > 1) {
342 		caddr_t tlo = lo + redzone;
343 		caddr_t thi = hi - redzone;
344 		tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
345 		if (tlo < lo + redzone) {
346 			return (0);
347 		}
348 		if (thi < tlo || thi - tlo < minlen) {
349 			return (0);
350 		}
351 	}
352 	*basep = lo;
353 	*lenp = hi - lo;
354 	return (1);
355 }
356 
357 /*
358  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
359  * addresses at least "minlen" long.  On success, 1 is returned and *basep
360  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
361  * is returned.
362  */
363 int
364 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
365 {
366 	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
367 }
368 
369 /*
370  * Default to forbidding the first 64k of address space.  This protects most
371  * reasonably sized structures from dereferences through NULL:
372  *     ((foo_t *)0)->bar
373  */
374 uintptr_t forbidden_null_mapping_sz = 0x10000;
375 
376 /*
377  * Determine whether [addr, addr+len] with protections `prot' are valid
378  * for a user address space.
379  */
380 /*ARGSUSED*/
381 int
382 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
383     caddr_t userlimit)
384 {
385 	caddr_t eaddr = addr + len;
386 
387 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
388 		return (RANGE_BADADDR);
389 
390 	if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
391 	    as->a_proc != NULL &&
392 	    secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
393 		return (RANGE_BADADDR);
394 
395 	/*
396 	 * Determine if the address range falls within an illegal
397 	 * range of the MMU.
398 	 */
399 	if (eaddr > hole_start && addr < hole_end)
400 		return (RANGE_BADADDR);
401 
402 #if defined(SF_ERRATA_57)
403 	/*
404 	 * Make sure USERLIMIT isn't raised too high
405 	 */
406 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
407 	    errata57_limit == 0);
408 
409 	if (AS_TYPE_64BIT(as) &&
410 	    (addr < errata57_limit) &&
411 	    (prot & PROT_EXEC))
412 		return (RANGE_BADPROT);
413 #endif /* SF_ERRATA57 */
414 	return (RANGE_OKAY);
415 }
416 
417 /*
418  * Routine used to check to see if an a.out can be executed
419  * by the current machine/architecture.
420  */
421 int
422 chkaout(struct exdata *exp)
423 {
424 	if (exp->ux_mach == M_SPARC)
425 		return (0);
426 	else
427 		return (ENOEXEC);
428 }
429 
430 
431 /*
432  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
433  * KPM selects an address such that it's equal offset modulo shm_alignment and
434  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
435  */
436 int
437 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
438 {
439 	if (vac) {
440 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
441 	} else {
442 		return (0);
443 	}
444 }
445 
446 /*
447  * Sanity control. Don't use large pages regardless of user settings
448  * if there's less than privm_lpg_min_physmem or shm_lpg_min_physmem memory
449  * installed.  The units for these variables are 8K pages.
450  */
451 pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
452 pgcnt_t privm_lpg_min_physmem = 131072;			/* 1GB */
453 
454 static size_t
455 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
456 {
457 	size_t		pgsz = MMU_PAGESIZE;
458 	int		szc;
459 
460 	/*
461 	 * If len is zero, retrieve from proc and don't demote the page size.
462 	 * Use atleast the default pagesize.
463 	 */
464 	if (len == 0) {
465 		len = p->p_brkbase + p->p_brksize - p->p_bssbase;
466 	}
467 	len = MAX(len, default_uheap_lpsize);
468 
469 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
470 		pgsz = hw_page_array[szc].hp_size;
471 		if ((disable_auto_data_large_pages & (1 << szc)) ||
472 		    pgsz > max_uheap_lpsize)
473 			continue;
474 		if (len >= pgsz) {
475 			break;
476 		}
477 	}
478 
479 	/*
480 	 * If addr == 0 we were called by memcntl() when the
481 	 * size code is 0.  Don't set pgsz less than current size.
482 	 */
483 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
484 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
485 	}
486 
487 	return (pgsz);
488 }
489 
490 static size_t
491 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
492 {
493 	size_t		pgsz = MMU_PAGESIZE;
494 	int		szc;
495 
496 	/*
497 	 * If len is zero, retrieve from proc and don't demote the page size.
498 	 * Use atleast the default pagesize.
499 	 */
500 	if (len == 0) {
501 		len = p->p_stksize;
502 	}
503 	len = MAX(len, default_ustack_lpsize);
504 
505 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
506 		pgsz = hw_page_array[szc].hp_size;
507 		if ((disable_auto_data_large_pages & (1 << szc)) ||
508 		    pgsz > max_ustack_lpsize)
509 			continue;
510 		if (len >= pgsz) {
511 			break;
512 		}
513 	}
514 
515 	/*
516 	 * If addr == 0 we were called by memcntl() or exec_args() when the
517 	 * size code is 0.  Don't set pgsz less than current size.
518 	 */
519 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
520 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
521 	}
522 
523 	return (pgsz);
524 }
525 
526 static size_t
527 map_pgszism(caddr_t addr, size_t len)
528 {
529 	uint_t szc;
530 	size_t pgsz;
531 
532 	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
533 		if (disable_ism_large_pages & (1 << szc))
534 			continue;
535 
536 		pgsz = hw_page_array[szc].hp_size;
537 		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
538 			return (pgsz);
539 	}
540 
541 	return (DEFAULT_ISM_PAGESIZE);
542 }
543 
544 /*
545  * Suggest a page size to be used to map a segment of type maptype and length
546  * len.  Returns a page size (not a size code).
547  */
548 /* ARGSUSED */
549 size_t
550 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
551 {
552 	size_t	pgsz = MMU_PAGESIZE;
553 
554 	ASSERT(maptype != MAPPGSZ_VA);
555 
556 	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
557 		return (MMU_PAGESIZE);
558 	}
559 
560 	switch (maptype) {
561 	case MAPPGSZ_ISM:
562 		pgsz = map_pgszism(addr, len);
563 		break;
564 
565 	case MAPPGSZ_STK:
566 		if (max_ustack_lpsize > MMU_PAGESIZE) {
567 			pgsz = map_pgszstk(p, addr, len);
568 		}
569 		break;
570 
571 	case MAPPGSZ_HEAP:
572 		if (max_uheap_lpsize > MMU_PAGESIZE) {
573 			pgsz = map_pgszheap(p, addr, len);
574 		}
575 		break;
576 	}
577 	return (pgsz);
578 }
579 
580 
581 /* assumes TTE8K...TTE4M == szc */
582 
583 static uint_t
584 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
585     size_t max_lpsize, size_t min_physmem)
586 {
587 	caddr_t eaddr = addr + size;
588 	uint_t szcvec = 0;
589 	caddr_t raddr;
590 	caddr_t readdr;
591 	size_t pgsz;
592 	int i;
593 
594 	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
595 		return (0);
596 	}
597 	for (i = mmu_page_sizes - 1; i > 0; i--) {
598 		if (disable_lpgs & (1 << i)) {
599 			continue;
600 		}
601 		pgsz = page_get_pagesize(i);
602 		if (pgsz > max_lpsize) {
603 			continue;
604 		}
605 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
606 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
607 		if (raddr < addr || raddr >= readdr) {
608 			continue;
609 		}
610 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
611 			continue;
612 		}
613 		szcvec |= (1 << i);
614 		/*
615 		 * And or in the remaining enabled page sizes.
616 		 */
617 		szcvec |= P2PHASE(~disable_lpgs, (1 << i));
618 		szcvec &= ~1; /* no need to return 8K pagesize */
619 		break;
620 	}
621 	return (szcvec);
622 }
623 
624 /*
625  * Return a bit vector of large page size codes that
626  * can be used to map [addr, addr + len) region.
627  */
628 /* ARGSUSED */
629 uint_t
630 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
631     int memcntl)
632 {
633 	if (flags & MAP_TEXT) {
634 		return (map_szcvec(addr, size, off,
635 		    disable_auto_text_large_pages,
636 		    max_utext_lpsize, shm_lpg_min_physmem));
637 
638 	} else if (flags & MAP_INITDATA) {
639 		return (map_szcvec(addr, size, off,
640 		    disable_auto_data_large_pages,
641 		    max_uidata_lpsize, privm_lpg_min_physmem));
642 
643 	} else if (type == MAPPGSZC_SHM) {
644 		return (map_szcvec(addr, size, off,
645 		    disable_auto_data_large_pages,
646 		    max_shm_lpsize, shm_lpg_min_physmem));
647 
648 	} else if (type == MAPPGSZC_HEAP) {
649 		return (map_szcvec(addr, size, off,
650 		    disable_auto_data_large_pages,
651 		    max_uheap_lpsize, privm_lpg_min_physmem));
652 
653 	} else if (type == MAPPGSZC_STACK) {
654 		return (map_szcvec(addr, size, off,
655 		    disable_auto_data_large_pages,
656 		    max_ustack_lpsize, privm_lpg_min_physmem));
657 
658 	} else {
659 		return (map_szcvec(addr, size, off,
660 		    disable_auto_data_large_pages,
661 		    max_privmap_lpsize, privm_lpg_min_physmem));
662 	}
663 }
664 
665 /*
666  * Anchored in the table below are counters used to keep track
667  * of free contiguous physical memory. Each element of the table contains
668  * the array of counters, the size of array which is allocated during
669  * startup based on physmax and a shift value used to convert a pagenum
670  * into a counter array index or vice versa. The table has page size
671  * for rows and region size for columns:
672  *
673  *	page_counters[page_size][region_size]
674  *
675  *	page_size:	TTE size code of pages on page_size freelist.
676  *
677  *	region_size:	TTE size code of a candidate larger page made up
678  *			of contiguous free page_size pages.
679  *
680  * As you go across a page_size row increasing region_size, each
681  * element keeps track of how many (region_size - 1) size groups
682  * made up of page_size free pages can be coalesced into a
683  * region_size page. Yuck! Let's try an example:
684  *
685  *	page_counters[1][3] is the table element used for identifying
686  *	candidate 4M pages from contiguous pages off the 64K free list.
687  *	Each index in the page_counters[1][3].array spans 4M. It's the
688  *	number of free 512K size (region_size - 1) groups of contiguous
689  *	64K free pages.	So when page_counters[1][3].counters[n] == 8
690  *	we know we have a candidate 4M page made up of 512K size groups
691  *	of 64K free pages.
692  */
693 
694 /*
695  * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
696  * dimensions are allocated dynamically.
697  */
698 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
699 
700 /*
701  * For now there is only a single size cache list.
702  * Allocated dynamically.
703  */
704 page_t ***page_cachelists[MAX_MEM_TYPES];
705 
706 kmutex_t *fpc_mutex[NPC_MUTEX];
707 kmutex_t *cpc_mutex[NPC_MUTEX];
708 
709 /*
710  * Calculate space needed for page freelists and counters
711  */
712 size_t
713 calc_free_pagelist_sz(void)
714 {
715 	int szc;
716 	size_t alloc_sz, cache_sz, free_sz;
717 
718 	/*
719 	 * one cachelist per color, node, and type
720 	 */
721 	cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
722 	    sizeof (page_t **);
723 	cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
724 
725 	/*
726 	 * one freelist per size, color, node, and type
727 	 */
728 	free_sz = sizeof (page_t **);
729 	for (szc = 0; szc < mmu_page_sizes; szc++)
730 		free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
731 	free_sz *= max_mem_nodes * MAX_MEM_TYPES;
732 
733 	alloc_sz = cache_sz + free_sz + page_ctrs_sz();
734 	return (alloc_sz);
735 }
736 
737 caddr_t
738 alloc_page_freelists(caddr_t alloc_base)
739 {
740 	int	mnode, mtype;
741 	int	szc, clrs;
742 
743 	/*
744 	 * We only support small pages in the cachelist.
745 	 */
746 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
747 		page_cachelists[mtype] = (page_t ***)alloc_base;
748 		alloc_base += (max_mem_nodes * sizeof (page_t **));
749 		for (mnode = 0; mnode < max_mem_nodes; mnode++) {
750 			page_cachelists[mtype][mnode] = (page_t **)alloc_base;
751 			alloc_base +=
752 			    (page_get_pagecolors(0) * sizeof (page_t *));
753 		}
754 	}
755 
756 	/*
757 	 * Allocate freelists bins for all
758 	 * supported page sizes.
759 	 */
760 	for (szc = 0; szc < mmu_page_sizes; szc++) {
761 		clrs = page_get_pagecolors(szc);
762 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
763 			page_freelists[szc][mtype] = (page_t ***)alloc_base;
764 			alloc_base += (max_mem_nodes * sizeof (page_t **));
765 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
766 				page_freelists[szc][mtype][mnode] =
767 				    (page_t **)alloc_base;
768 				alloc_base += (clrs * (sizeof (page_t *)));
769 			}
770 		}
771 	}
772 
773 	alloc_base = page_ctrs_alloc(alloc_base);
774 	return (alloc_base);
775 }
776 
777 /*
778  * Allocate page_freelists locks for a memnode from the nucleus data
779  * area. This is the first time that mmu_page_sizes is used during
780  * bootup, so check mmu_page_sizes initialization.
781  */
782 int
783 ndata_alloc_page_mutexs(struct memlist *ndata)
784 {
785 	size_t alloc_sz;
786 	caddr_t alloc_base;
787 	int	i;
788 	void	page_coloring_init();
789 
790 	page_coloring_init();
791 	if (&mmu_init_mmu_page_sizes) {
792 		if (!mmu_init_mmu_page_sizes(0)) {
793 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
794 			    mmu_page_sizes);
795 		}
796 	}
797 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
798 
799 	/* fpc_mutex and cpc_mutex */
800 	alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
801 
802 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
803 	if (alloc_base == NULL)
804 		return (-1);
805 
806 	ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
807 
808 	for (i = 0; i < NPC_MUTEX; i++) {
809 		fpc_mutex[i] = (kmutex_t *)alloc_base;
810 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
811 		cpc_mutex[i] = (kmutex_t *)alloc_base;
812 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
813 	}
814 	return (0);
815 }
816 
817 /*
818  * To select our starting bin, we stride through the bins with a stride
819  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
820  * in simulation and practice for different workloads on varying cache sizes.
821  */
822 uint32_t color_start_current = 0;
823 uint32_t color_start_stride = 337;
824 int color_start_random = 0;
825 
826 /* ARGSUSED */
827 uint_t
828 get_color_start(struct as *as)
829 {
830 	uint32_t old, new;
831 
832 	if (consistent_coloring == 2 || color_start_random) {
833 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
834 		    (hw_page_array[0].hp_colors - 1)));
835 	}
836 
837 	do {
838 		old = color_start_current;
839 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
840 	} while (atomic_cas_32(&color_start_current, old, new) != old);
841 
842 	return ((uint_t)(new));
843 }
844 
845 /*
846  * Called once at startup from kphysm_init() -- before memialloc()
847  * is invoked to do the 1st page_free()/page_freelist_add().
848  *
849  * initializes page_colors and page_colors_mask based on ecache_setsize.
850  *
851  * Also initializes the counter locks.
852  */
853 void
854 page_coloring_init()
855 {
856 	int	a, i;
857 	uint_t colors;
858 
859 	if (do_pg_coloring == 0) {
860 		page_colors = 1;
861 		for (i = 0; i < mmu_page_sizes; i++) {
862 			colorequivszc[i] = 0;
863 			hw_page_array[i].hp_colors = 1;
864 		}
865 		return;
866 	}
867 
868 	/*
869 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
870 	 * the max ecache setsize of all cpus configured in the system or, for
871 	 * cheetah+ systems, the max possible ecache setsize for all possible
872 	 * cheetah+ cpus.
873 	 */
874 	page_colors = ecache_setsize / MMU_PAGESIZE;
875 	page_colors_mask = page_colors - 1;
876 
877 	vac_colors = vac_size / MMU_PAGESIZE;
878 	vac_colors_mask = vac_colors -1;
879 
880 	page_coloring_shift = 0;
881 	a = ecache_setsize;
882 	while (a >>= 1) {
883 		page_coloring_shift++;
884 	}
885 
886 	/* initialize number of colors per page size */
887 	for (i = 0; i < mmu_page_sizes; i++) {
888 		hw_page_array[i].hp_colors = (page_colors_mask >>
889 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
890 		    + 1;
891 		colorequivszc[i] = 0;
892 	}
893 
894 	/*
895 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
896 	 * cpu_page_colors set to -1 during DR operation or during startup
897 	 * if setsizes are heterogenous.
898 	 *
899 	 * The value of cpu_page_colors determines if additional color bins
900 	 * need to be checked for a particular color in the page_get routines.
901 	 */
902 	if (cpu_setsize > 0 && cpu_page_colors == 0 &&
903 	    cpu_setsize < ecache_setsize) {
904 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
905 		a = lowbit(page_colors) - lowbit(cpu_page_colors);
906 		ASSERT(a > 0);
907 		ASSERT(a < 16);
908 
909 		for (i = 0; i < mmu_page_sizes; i++) {
910 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
911 				continue;
912 			}
913 			while ((colors >> a) == 0)
914 				a--;
915 			ASSERT(a >= 0);
916 
917 			/* higher 4 bits encodes color equiv mask */
918 			colorequivszc[i] = (a << 4);
919 		}
920 	}
921 
922 	/* do cpu specific color initialization */
923 	if (&page_coloring_init_cpu) {
924 		page_coloring_init_cpu();
925 	}
926 }
927 
928 int
929 bp_color(struct buf *bp)
930 {
931 	int color = -1;
932 
933 	if (vac) {
934 		if ((bp->b_flags & B_PAGEIO) != 0) {
935 			color = sfmmu_get_ppvcolor(bp->b_pages);
936 		} else if (bp->b_un.b_addr != NULL) {
937 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
938 		}
939 	}
940 	return (color < 0 ? 0 : ptob(color));
941 }
942 
/*
 * Flush the D-cache; used when module relocations are performed to an
 * alternate mapping.  This is a thin wrapper around sfmmu_cache_flushall(),
 * which is stubbed out on all platforms except sun4u, at least for now.
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}
953 
/*
 * Report whether [va1, va1 + sz1) and [va2, va2 + sz2) overlap.
 *
 * Returns 0 when the lower range ends at or before the start of the higher
 * one, and 1 otherwise (including when both ranges start at the same
 * address).
 *
 * Each size is compared against the gap between the two bases rather than
 * computing base + size, which would wrap for a range reaching the top of
 * the address space and make an overlapping range look disjoint.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	/* range 1 lies entirely below range 2 */
	if (va1 < va2 && sz1 <= va2 - va1)
		return (0);

	/* range 2 lies entirely below range 1 */
	if (va2 < va1 && sz2 <= va1 - va2)
		return (0);

	return (1);
}
965 
966 /*
967  * Return the number of bytes, relative to the beginning of a given range, that
968  * are non-toxic (can be read from and written to with relative impunity).
969  */
970 size_t
971 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
972 {
973 	/* OBP reads are harmless, but we don't want people writing there */
974 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
975 	    OFW_START_ADDR + 1))
976 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
977 
978 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
979 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
980 
981 	return (sz); /* no overlap */
982 }
983 
984 /*
985  * Minimum physmem required for enabling large pages for kernel heap
986  * Currently we do not enable lp for kmem on systems with less
987  * than 1GB of memory. This value can be changed via /etc/system
988  */
989 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
990 
991 /*
992  * this function chooses large page size for kernel heap
993  */
994 size_t
995 get_segkmem_lpsize(size_t lpsize)
996 {
997 	size_t memtotal = physmem * PAGESIZE;
998 	size_t mmusz;
999 	uint_t szc;
1000 
1001 	if (memtotal < segkmem_lpminphysmem)
1002 		return (PAGESIZE);
1003 
1004 	if (plat_lpkmem_is_supported != NULL &&
1005 	    plat_lpkmem_is_supported() == 0)
1006 		return (PAGESIZE);
1007 
1008 	mmusz = mmu_get_kernel_lpsize(lpsize);
1009 	szc = page_szc(mmusz);
1010 
1011 	while (szc) {
1012 		if (!(disable_large_pages & (1 << szc)))
1013 			return (page_get_pagesize(szc));
1014 		szc--;
1015 	}
1016 	return (PAGESIZE);
1017 }
1018