xref: /titanic_50/usr/src/uts/i86pc/vm/vm_machdep.c (revision ae115bc77f6fcde83175c75b4206dc2e50747966)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 /*
37  * UNIX machine dependent virtual memory support.
38  */
39 
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/user.h>
44 #include <sys/proc.h>
45 #include <sys/kmem.h>
46 #include <sys/vmem.h>
47 #include <sys/buf.h>
48 #include <sys/cpuvar.h>
49 #include <sys/lgrp.h>
50 #include <sys/disp.h>
51 #include <sys/vm.h>
52 #include <sys/mman.h>
53 #include <sys/vnode.h>
54 #include <sys/cred.h>
55 #include <sys/exec.h>
56 #include <sys/exechdr.h>
57 #include <sys/debug.h>
58 #include <sys/vmsystm.h>
59 
60 #include <vm/hat.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_kp.h>
64 #include <vm/seg_vn.h>
65 #include <vm/page.h>
66 #include <vm/seg_kmem.h>
67 #include <vm/seg_kpm.h>
68 #include <vm/vm_dep.h>
69 
70 #include <sys/cpu.h>
71 #include <sys/vm_machparam.h>
72 #include <sys/memlist.h>
73 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
74 #include <vm/hat_i86.h>
75 #include <sys/x86_archext.h>
76 #include <sys/elf_386.h>
77 #include <sys/cmn_err.h>
78 #include <sys/archsystm.h>
79 #include <sys/machsystm.h>
80 
81 #include <sys/vtrace.h>
82 #include <sys/ddidmareq.h>
83 #include <sys/promif.h>
84 #include <sys/memnode.h>
85 #include <sys/stack.h>
86 
87 uint_t vac_colors = 1;
88 
89 int largepagesupport = 0;
90 extern uint_t page_create_new;
91 extern uint_t page_create_exists;
92 extern uint_t page_create_putbacks;
93 extern uint_t page_create_putbacks;
94 /*
95  * Allow users to disable the kernel's use of SSE.
96  */
97 extern int use_sse_pagecopy, use_sse_pagezero;
98 
/*
 * 4g memory management.
 *
 * physmax4g is set to 1 by page_coloring_init() when memrange_num(physmax)
 * returns 0, and mtype_modify_max() adds to maxmem4g only while physmax4g
 * is set and the mnode range index is at or below mtype4g.
 */
pgcnt_t		maxmem4g;
pgcnt_t		freemem4g;
int		physmax4g;
int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
/* NOTE(review): presumably the LOTSFREE4G counterpart of the shift above */
int		lotsfree4gshift = 3;

/* 16m memory management: desired number of free pages below 16m. */
pgcnt_t		desfree16m = 0x380;
108 
#ifdef VM_STATS
/*
 * Statistics counters; only compiled in when VM_STATS is defined.
 * NOTE(review): the pga_ and pgma_ prefixes presumably correspond to two
 * different allocation paths whose code is not in this part of the file;
 * confirm against the users of pga_vmstats.
 */
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif
122 
/*
 * Number of page sizes.  map_szcvec() treats mmu_page_sizes - 1 as the
 * largest page size code it considers.
 */
uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.  map_pgsz() and map_szcvec() compare physmem
 * against these thresholds before choosing anything above MMU_PAGESIZE.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
 * These can be patched via /etc/system to allow large pages
 * to be used for mapping application private and shared anon memory.
 *
 * While a max_*_lpsize value is left at MMU_PAGESIZE, map_pgsz() and
 * map_szcvec() select no large page size for that kind of mapping.
 * mcntl0_lpsize is the limit applied instead of the per-type maximum
 * when the caller passes a non-zero memcntl argument.
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;
150 
151 /*
152  * Return the optimum page size for a given mapping
153  */
154 /*ARGSUSED*/
155 size_t
156 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
157 {
158 	level_t l = 0;
159 	size_t pgsz = MMU_PAGESIZE;
160 	size_t max_lpsize;
161 	uint_t mszc;
162 
163 	ASSERT(maptype != MAPPGSZ_VA);
164 
165 	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
166 		return (MMU_PAGESIZE);
167 	}
168 
169 	switch (maptype) {
170 	case MAPPGSZ_HEAP:
171 	case MAPPGSZ_STK:
172 		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
173 		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
174 		if (max_lpsize == MMU_PAGESIZE) {
175 			return (MMU_PAGESIZE);
176 		}
177 		if (len == 0) {
178 			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
179 			    p->p_brksize - p->p_bssbase : p->p_stksize;
180 		}
181 		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
182 		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
183 
184 		/*
185 		 * use the pages size that best fits len
186 		 */
187 		for (l = mmu.max_page_level; l > 0; --l) {
188 			if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
189 				continue;
190 			} else {
191 				pgsz = LEVEL_SIZE(l);
192 			}
193 			break;
194 		}
195 
196 		mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
197 		    p->p_stkpageszc);
198 		if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
199 			pgsz = hw_page_array[mszc].hp_size;
200 		}
201 		return (pgsz);
202 
203 	/*
204 	 * for ISM use the 1st large page size.
205 	 */
206 	case MAPPGSZ_ISM:
207 		if (mmu.max_page_level == 0)
208 			return (MMU_PAGESIZE);
209 		return (LEVEL_SIZE(1));
210 	}
211 	return (pgsz);
212 }
213 
214 static uint_t
215 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
216     size_t min_physmem)
217 {
218 	caddr_t eaddr = addr + size;
219 	uint_t szcvec = 0;
220 	caddr_t raddr;
221 	caddr_t readdr;
222 	size_t	pgsz;
223 	int i;
224 
225 	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
226 		return (0);
227 	}
228 
229 	for (i = mmu_page_sizes - 1; i > 0; i--) {
230 		pgsz = page_get_pagesize(i);
231 		if (pgsz > max_lpsize) {
232 			continue;
233 		}
234 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
235 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
236 		if (raddr < addr || raddr >= readdr) {
237 			continue;
238 		}
239 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
240 			continue;
241 		}
242 		/*
243 		 * Set szcvec to the remaining page sizes.
244 		 */
245 		szcvec = ((1 << (i + 1)) - 1) & ~1;
246 		break;
247 	}
248 	return (szcvec);
249 }
250 
251 /*
252  * Return a bit vector of large page size codes that
253  * can be used to map [addr, addr + len) region.
254  */
255 /*ARGSUSED*/
256 uint_t
257 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
258     int memcntl)
259 {
260 	size_t max_lpsize = mcntl0_lpsize;
261 
262 	if (mmu.max_page_level == 0)
263 		return (0);
264 
265 	if (flags & MAP_TEXT) {
266 	    if (!memcntl)
267 		max_lpsize = max_utext_lpsize;
268 	    return (map_szcvec(addr, size, off, max_lpsize,
269 		    shm_lpg_min_physmem));
270 
271 	} else if (flags & MAP_INITDATA) {
272 	    if (!memcntl)
273 		max_lpsize = max_uidata_lpsize;
274 	    return (map_szcvec(addr, size, off, max_lpsize,
275 		    privm_lpg_min_physmem));
276 
277 	} else if (type == MAPPGSZC_SHM) {
278 	    if (!memcntl)
279 		max_lpsize = max_shm_lpsize;
280 	    return (map_szcvec(addr, size, off, max_lpsize,
281 		    shm_lpg_min_physmem));
282 
283 	} else if (type == MAPPGSZC_HEAP) {
284 	    if (!memcntl)
285 		max_lpsize = max_uheap_lpsize;
286 	    return (map_szcvec(addr, size, off, max_lpsize,
287 		    privm_lpg_min_physmem));
288 
289 	} else if (type == MAPPGSZC_STACK) {
290 	    if (!memcntl)
291 		max_lpsize = max_ustack_lpsize;
292 	    return (map_szcvec(addr, size, off, max_lpsize,
293 		    privm_lpg_min_physmem));
294 
295 	} else {
296 	    if (!memcntl)
297 		max_lpsize = max_privmap_lpsize;
298 	    return (map_szcvec(addr, size, off, max_lpsize,
299 		    privm_lpg_min_physmem));
300 	}
301 }
302 
303 /*
304  * Handle a pagefault.
305  */
306 faultcode_t
307 pagefault(
308 	caddr_t addr,
309 	enum fault_type type,
310 	enum seg_rw rw,
311 	int iskernel)
312 {
313 	struct as *as;
314 	struct hat *hat;
315 	struct proc *p;
316 	kthread_t *t;
317 	faultcode_t res;
318 	caddr_t base;
319 	size_t len;
320 	int err;
321 	int mapped_red;
322 	uintptr_t ea;
323 
324 	ASSERT_STACK_ALIGNED();
325 
326 	if (INVALID_VADDR(addr))
327 		return (FC_NOMAP);
328 
329 	mapped_red = segkp_map_red();
330 
331 	if (iskernel) {
332 		as = &kas;
333 		hat = as->a_hat;
334 	} else {
335 		t = curthread;
336 		p = ttoproc(t);
337 		as = p->p_as;
338 		hat = as->a_hat;
339 	}
340 
341 	/*
342 	 * Dispatch pagefault.
343 	 */
344 	res = as_fault(hat, as, addr, 1, type, rw);
345 
346 	/*
347 	 * If this isn't a potential unmapped hole in the user's
348 	 * UNIX data or stack segments, just return status info.
349 	 */
350 	if (res != FC_NOMAP || iskernel)
351 		goto out;
352 
353 	/*
354 	 * Check to see if we happened to faulted on a currently unmapped
355 	 * part of the UNIX data or stack segments.  If so, create a zfod
356 	 * mapping there and then try calling the fault routine again.
357 	 */
358 	base = p->p_brkbase;
359 	len = p->p_brksize;
360 
361 	if (addr < base || addr >= base + len) {		/* data seg? */
362 		base = (caddr_t)p->p_usrstack - p->p_stksize;
363 		len = p->p_stksize;
364 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
365 			/* not in either UNIX data or stack segments */
366 			res = FC_NOMAP;
367 			goto out;
368 		}
369 	}
370 
371 	/*
372 	 * the rest of this function implements a 3.X 4.X 5.X compatibility
373 	 * This code is probably not needed anymore
374 	 */
375 	if (p->p_model == DATAMODEL_ILP32) {
376 
377 		/* expand the gap to the page boundaries on each side */
378 		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
379 		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
380 		len = ea - (uintptr_t)base;
381 
382 		as_rangelock(as);
383 		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
384 		    0) {
385 			err = as_map(as, base, len, segvn_create, zfod_argsp);
386 			as_rangeunlock(as);
387 			if (err) {
388 				res = FC_MAKE_ERR(err);
389 				goto out;
390 			}
391 		} else {
392 			/*
393 			 * This page is already mapped by another thread after
394 			 * we returned from as_fault() above.  We just fall
395 			 * through as_fault() below.
396 			 */
397 			as_rangeunlock(as);
398 		}
399 
400 		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
401 	}
402 
403 out:
404 	if (mapped_red)
405 		segkp_unmap_red();
406 
407 	return (res);
408 }
409 
410 void
411 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
412 {
413 	struct proc *p = curproc;
414 	caddr_t userlimit = (flags & _MAP_LOW32) ?
415 	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
416 
417 	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
418 }
419 
420 /*ARGSUSED*/
421 int
422 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
423 {
424 	return (0);
425 }
426 
/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below userlimit.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  The hint is ignored, except that when
 *	MAP_ALIGN is set in flags the input value is taken as a requested
 *	alignment and used if it is larger than the alignment computed here.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 *	align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	/* the search range is [base, base + slen), starting at the brk base */
	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 *
			 * The range stops short of p_usrstack by the
			 * page-rounded RLIMIT_STACK value.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		/* this statement is the body of the else above on __amd64 */
		slen = userlimit - base;

	/* round the request up to a whole number of pages */
	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		/* largest page level whose size does not exceed len */
		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	/* with MAP_ALIGN, *addrp is a requested alignment; keep the larger */
	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	/* leave room to slide the result up to an aligned address */
	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}
564 
/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 *
 * Returns 1 when a usable range of at least minlen bytes exists (with
 * *basep and *lenp possibly trimmed), 0 otherwise.  dir is only consulted
 * on __amd64, where AH_LO prefers the part of the range below the VA hole
 * and any other value prefers the part above it.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 *
	 * NOTE(review): only *lenp is cut back here; hi keeps its wrapped
	 * value.  On __amd64 the hole checks below therefore run with the
	 * wrapped hi, and *lenp is overwritten with hi - lo at the end.
	 * Confirm whether a wrapping range can actually reach this function.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				/* range ends inside the hole: clip the top */
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);	/* ends below hole_end: nothing usable */
		if (lo < hole_end)
			lo = hole_end;	/* starts inside the hole: clip the bottom */
	}

	/* re-check the length after any clipping */
	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}
641 
642 /*
643  * Determine whether [addr, addr+len] are valid user addresses.
644  */
645 /*ARGSUSED*/
646 int
647 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
648     caddr_t userlimit)
649 {
650 	caddr_t eaddr = addr + len;
651 
652 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
653 		return (RANGE_BADADDR);
654 
655 #if defined(__amd64)
656 	/*
657 	 * Check for the VA hole
658 	 */
659 	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
660 		return (RANGE_BADADDR);
661 #endif
662 
663 	return (RANGE_OKAY);
664 }
665 
666 /*
667  * Return 1 if the page frame is onboard memory, else 0.
668  */
669 int
670 pf_is_memory(pfn_t pf)
671 {
672 	if (pfn_is_foreign(pf))
673 		return (0);
674 	return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
675 }
676 
677 
/*
 * initialized by page_coloring_init().
 *
 * page_colors is at least PAGE_COLORS_MIN and page_colors_mask is
 * page_colors - 1.  cpu_page_colors is set to l2_colors only when
 * l2_colors is smaller than page_colors.
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;
698 
/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 *
 * Each entry is the lowest pfn of its range, ordered from the highest
 * range to the lowest.
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
    0x100000,	/* pfn range for 4G and above */
    0x80000,	/* pfn range for 2G-4G */
    0x01000,	/* pfn range for 16M-2G */
    0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory:
 * page_coloring_init() advances memranges and reduces nranges by
 * memrange_num(physmax).
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;
718 
/*
 * Used by page layer to know about page sizes.
 * page_coloring_init() fills in one entry per level up to
 * mmu.max_page_level.
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 *
 * Both branches currently give the same initial value.  On __i386,
 * page_coloring_init() sets it back to 0 when memrange_num(physmax) is
 * greater than 0.
 */
#if defined(__i386)
int restricted_kmemalloc = 0;
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

/*
 * NOTE(review): presumably the free page list and cache page list mutex
 * arrays; they are not used in this part of the file.
 */
kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];
736 
737 
738 /*
739  * return the memrange containing pfn
740  */
741 int
742 memrange_num(pfn_t pfn)
743 {
744 	int n;
745 
746 	for (n = 0; n < nranges - 1; ++n) {
747 		if (pfn >= memranges[n])
748 			break;
749 	}
750 	return (n);
751 }
752 
753 /*
754  * return the mnoderange containing pfn
755  */
756 int
757 pfn_2_mtype(pfn_t pfn)
758 {
759 	int	n;
760 
761 	for (n = mnoderangecnt - 1; n >= 0; n--) {
762 		if (pfn >= mnoderanges[n].mnr_pfnlo) {
763 			break;
764 		}
765 	}
766 	return (n);
767 }
768 
/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 *
 *	On success *pgcnt is reduced by the number of pages on the returned
 *	list and the pages are exclusively locked (and io-locked when iolock
 *	is set).  On failure NULL is returned, *pgcnt is unchanged and any
 *	pages taken so far are put back on the free list and unlocked.
 *	pfnseg is a mask: a run never crosses a pfn where (pfn & pfnseg)
 *	is 0.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			/* no page or lock not available: skip past this pfn */
			(*pfnp)++;
			break;
		}
		/* the lookup was unlocked; re-check the identity under lock */
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		/* take the page off whichever list it is on */
		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		/* always the free list: cache pages were hashed out above */
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}
865 
/*
 * verify that pages being returned from allocator have correct DMA attribute
 *
 * DEBUG kernels walk cnt pages along p_next and panic if a page's machine
 * address is below dma_attr_addr_lo or not below dma_attr_addr_hi.  A NULL
 * dma_attr disables the check.  Non-DEBUG kernels compile this away.
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	int	n;

	if (dma_attr == NULL)
		return;

	for (n = 0; n < cnt; n++, pp = pp->p_next) {
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
	}
}
#endif
889 
890 static kmutex_t	contig_lock;
891 
892 #define	CONTIG_LOCK()	mutex_enter(&contig_lock);
893 #define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);
894 
895 #define	PFN_16M		(mmu_btop((uint64_t)0x1000000))
896 
/*
 * Gather free pages in physically contiguous runs.
 *
 * *pgcnt is the number of pages wanted and is reduced as runs are found
 * by is_contigpage_free().  With mattr the pfn range, scatter-gather
 * length, segment mask and alignment come from the DMA attributes;
 * without it the whole of memory is searched for a single run.  When
 * iolock is set the pages are returned io-locked.  The search resumes
 * from where the previous call stopped (startpfn) unless the request
 * needs a smaller minimum run than last time, and wraps around to lo
 * once.  Returns the page list, or NULL if the request cannot be met.
 *
 * NOTE(review): on the NULL-return paths any runs already gathered on
 * pplist (possible when sgllen > 1) do not appear to be released here;
 * confirm.
 */
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		/* lowest whole page at or above addr_lo; hi capped at physmax */
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		/* only alignments larger than a page constrain the pfn */
		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* first pass: from startpfn up to hi */
	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 *
			 * NOTE(review): *pgcnt here is the number of pages
			 * still outstanding, not the number on pplist, so
			 * check_dma() verifies few or no pages; confirm the
			 * intended count.
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			/* recompute the minimum run for what is left */
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}
1018 
1019 /*
1020  * combine mem_node_config and memrange memory ranges into one data
1021  * structure to be used for page list management.
1022  *
1023  * mnode_range_cnt() calculates the number of memory ranges for mnode and
1024  * memranges[]. Used to determine the size of page lists and mnoderanges.
1025  *
1026  * mnode_range_setup() initializes mnoderanges.
1027  */
1028 mnoderange_t	*mnoderanges;
1029 int		mnoderangecnt;
1030 int		mtype4g;
1031 
1032 int
1033 mnode_range_cnt(int mnode)
1034 {
1035 	int	mri;
1036 	int	mnrcnt = 0;
1037 
1038 	if (mem_node_config[mnode].exists != 0) {
1039 		mri = nranges - 1;
1040 
1041 		/* find the memranges index below contained in mnode range */
1042 
1043 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1044 			mri--;
1045 
1046 		/*
1047 		 * increment mnode range counter when memranges or mnode
1048 		 * boundary is reached.
1049 		 */
1050 		while (mri >= 0 &&
1051 		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1052 			mnrcnt++;
1053 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1054 				mri--;
1055 			else
1056 				break;
1057 		}
1058 	}
1059 	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1060 	return (mnrcnt);
1061 }
1062 
1063 void
1064 mnode_range_setup(mnoderange_t *mnoderanges)
1065 {
1066 	int	mnode, mri;
1067 
1068 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
1069 		if (mem_node_config[mnode].exists == 0)
1070 			continue;
1071 
1072 		mri = nranges - 1;
1073 
1074 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1075 			mri--;
1076 
1077 		while (mri >= 0 && mem_node_config[mnode].physmax >=
1078 		    MEMRANGELO(mri)) {
1079 			mnoderanges->mnr_pfnlo =
1080 			    MAX(MEMRANGELO(mri),
1081 				mem_node_config[mnode].physbase);
1082 			mnoderanges->mnr_pfnhi =
1083 			    MIN(MEMRANGEHI(mri),
1084 				mem_node_config[mnode].physmax);
1085 			mnoderanges->mnr_mnode = mnode;
1086 			mnoderanges->mnr_memrange = mri;
1087 			mnoderanges++;
1088 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1089 				mri--;
1090 			else
1091 				break;
1092 		}
1093 	}
1094 }
1095 
1096 /*
1097  * Determine if the mnode range specified in mtype contains memory belonging
1098  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1099  * the range of indices from high pfn to 0, 16m or 4g.
1100  *
1101  * Return first mnode range type index found otherwise return -1 if none found.
1102  */
1103 int
1104 mtype_func(int mnode, int mtype, uint_t flags)
1105 {
1106 	if (flags & PGI_MT_RANGE) {
1107 		int	mtlim;
1108 
1109 		if (flags & PGI_MT_NEXT)
1110 			mtype--;
1111 		if (flags & PGI_MT_RANGE0)
1112 			mtlim = 0;
1113 		else if (flags & PGI_MT_RANGE4G)
1114 			mtlim = mtype4g + 1;	/* exclude 0-4g range */
1115 		else if (flags & PGI_MT_RANGE16M)
1116 			mtlim = 1;		/* exclude 0-16m range */
1117 		while (mtype >= mtlim) {
1118 			if (mnoderanges[mtype].mnr_mnode == mnode)
1119 				return (mtype);
1120 			mtype--;
1121 		}
1122 	} else {
1123 		if (mnoderanges[mtype].mnr_mnode == mnode)
1124 			return (mtype);
1125 	}
1126 	return (-1);
1127 }
1128 
1129 /*
1130  * Update the page list max counts with the pfn range specified by the
1131  * input parameters.  Called from add_physmem() when physical memory with
1132  * page_t's are initially added to the page lists.
1133  */
1134 void
1135 mtype_modify_max(pfn_t startpfn, long cnt)
1136 {
1137 	int	mtype = 0;
1138 	pfn_t	endpfn = startpfn + cnt, pfn;
1139 	pgcnt_t	inc;
1140 
1141 	ASSERT(cnt > 0);
1142 
1143 	for (pfn = startpfn; pfn < endpfn; ) {
1144 		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
1145 			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
1146 				inc = endpfn - pfn;
1147 			} else {
1148 				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
1149 			}
1150 			mnoderanges[mtype].mnr_mt_pgmax += inc;
1151 			if (physmax4g && mtype <= mtype4g)
1152 				maxmem4g += inc;
1153 			pfn += inc;
1154 		}
1155 		mtype++;
1156 		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
1157 	}
1158 }
1159 
1160 /*
1161  * Returns the free page count for mnode
1162  */
1163 int
1164 mnode_pgcnt(int mnode)
1165 {
1166 	int	mtype = mnoderangecnt - 1;
1167 	int	flags = PGI_MT_RANGE0;
1168 	pgcnt_t	pgcnt = 0;
1169 
1170 	mtype = mtype_func(mnode, mtype, flags);
1171 
1172 	while (mtype != -1) {
1173 		pgcnt += MTYPE_FREEMEM(mtype);
1174 		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1175 	}
1176 	return (pgcnt);
1177 }
1178 
/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 *
 * l2_sz     - l2 cache size; asserted to be a power of two and larger
 *             than MMU_PAGESIZE
 * l2_linesz - l2 cache line size; only asserted to be a power of two here
 * l2_assoc  - l2 associativity; 0 means fully associative
 *
 * Side effects: trims memranges/nranges, may set physmax4g, and sets
 * l2_colors, page_colors, page_colors_mask, cpu_page_colors,
 * page_coloring_shift, hw_page_array[], colorequivszc[] and mnoderangecnt.
 *
 * The returned byte count covers, in this order: the mnoderanges array,
 * the fpc/cpc mutex arrays, the page_freelists pointer tree and the
 * page_cachelists pointer tree.  page_coloring_setup() carves up a buffer
 * using the same sizes.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/*
	 * initialize number of colors per page size; each larger page size
	 * gets the base color mask shifted right by the difference between
	 * its page shift and the base page shift, plus one.
	 */
	for (i = 0; i <= mmu.max_page_level; i++) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
		hw_page_array[i].hp_colors = (page_colors_mask >>
		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
		    + 1;
	}

	/*
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if (cpu_page_colors != 0) {

		/* shift between page_colors and cpu_page_colors */
		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
		ASSERT(a > 0);
		ASSERT(a < 16);

		for (i = 0; i <= mmu.max_page_level; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				colorequivszc[i] = 0;
				continue;
			}
			/*
			 * Clamp the shift so it never exceeds this page
			 * size's color count.  'a' is not reset between page
			 * sizes, so it only ever shrinks as i grows.
			 */
			while ((colors >> a) == 0)
				a--;
			ASSERT(a >= 0);

			/* higher 4 bits encodes color equiv mask */
			colorequivszc[i] = (a << 4);
		}
	}

	/* factor in colorequiv to check additional 'equivalent' bins. */
	if (colorequiv > 1) {

		int a = lowbit(colorequiv) - 1;
		/* the shift is stored in 4 bits, so cap it at 15 */
		if (a > 15)
			a = 15;

		for (i = 0; i <= mmu.max_page_level; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			/* keep whichever equivalence (cpu or tunable) is wider */
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}

	/* size for mnoderanges */
	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
		mnoderangecnt += mnode_range_cnt(i);
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}
1308 
1309 /*
1310  * Called once at startup to configure page_coloring data structures and
1311  * does the 1st page_free()/page_freelist_add().
1312  */
1313 void
1314 page_coloring_setup(caddr_t pcmemaddr)
1315 {
1316 	int	i;
1317 	int	j;
1318 	int	k;
1319 	caddr_t	addr;
1320 	int	colors;
1321 
1322 	/*
1323 	 * do page coloring setup
1324 	 */
1325 	addr = pcmemaddr;
1326 
1327 	mnoderanges = (mnoderange_t *)addr;
1328 	addr += (mnoderangecnt * sizeof (mnoderange_t));
1329 
1330 	mnode_range_setup(mnoderanges);
1331 
1332 	if (physmax4g)
1333 		mtype4g = pfn_2_mtype(0xfffff);
1334 
1335 	for (k = 0; k < NPC_MUTEX; k++) {
1336 		fpc_mutex[k] = (kmutex_t *)addr;
1337 		addr += (max_mem_nodes * sizeof (kmutex_t));
1338 	}
1339 	for (k = 0; k < NPC_MUTEX; k++) {
1340 		cpc_mutex[k] = (kmutex_t *)addr;
1341 		addr += (max_mem_nodes * sizeof (kmutex_t));
1342 	}
1343 	page_freelists = (page_t ****)addr;
1344 	addr += (mnoderangecnt * sizeof (page_t ***));
1345 
1346 	page_cachelists = (page_t ***)addr;
1347 	addr += (mnoderangecnt * sizeof (page_t **));
1348 
1349 	for (i = 0; i < mnoderangecnt; i++) {
1350 		page_freelists[i] = (page_t ***)addr;
1351 		addr += (mmu_page_sizes * sizeof (page_t **));
1352 
1353 		for (j = 0; j < mmu_page_sizes; j++) {
1354 			colors = page_get_pagecolors(j);
1355 			page_freelists[i][j] = (page_t **)addr;
1356 			addr += (colors * sizeof (page_t *));
1357 		}
1358 		page_cachelists[i] = (page_t **)addr;
1359 		addr += (page_colors * sizeof (page_t *));
1360 	}
1361 }
1362 
/*
 * Return the color for the given buf.  The argument is not examined;
 * the result is always 0.
 */
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}
1369 
1370 /*
1371  * get a page from any list with the given mnode
1372  */
1373 page_t *
1374 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
1375     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
1376 {
1377 	kmutex_t		*pcm;
1378 	int			i;
1379 	page_t			*pp;
1380 	page_t			*first_pp;
1381 	uint64_t		pgaddr;
1382 	ulong_t			bin;
1383 	int			mtypestart;
1384 	int			plw_initialized;
1385 	page_list_walker_t	plw;
1386 
1387 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
1388 
1389 	ASSERT((flags & PG_MATCH_COLOR) == 0);
1390 	ASSERT(szc == 0);
1391 	ASSERT(dma_attr != NULL);
1392 
1393 	MTYPE_START(mnode, mtype, flags);
1394 	if (mtype < 0) {
1395 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
1396 		return (NULL);
1397 	}
1398 
1399 	mtypestart = mtype;
1400 
1401 	bin = origbin;
1402 
1403 	/*
1404 	 * check up to page_colors + 1 bins - origbin may be checked twice
1405 	 * because of BIN_STEP skip
1406 	 */
1407 	do {
1408 		plw_initialized = 0;
1409 
1410 		for (plw.plw_count = 0;
1411 		    plw.plw_count < page_colors; plw.plw_count++) {
1412 
1413 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
1414 				goto nextfreebin;
1415 
1416 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1417 			mutex_enter(pcm);
1418 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
1419 			first_pp = pp;
1420 			while (pp != NULL) {
1421 				if (page_trylock(pp, SE_EXCL) == 0) {
1422 					pp = pp->p_next;
1423 					if (pp == first_pp) {
1424 						pp = NULL;
1425 					}
1426 					continue;
1427 				}
1428 
1429 				ASSERT(PP_ISFREE(pp));
1430 				ASSERT(PP_ISAGED(pp));
1431 				ASSERT(pp->p_vnode == NULL);
1432 				ASSERT(pp->p_hash == NULL);
1433 				ASSERT(pp->p_offset == (u_offset_t)-1);
1434 				ASSERT(pp->p_szc == szc);
1435 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1436 				/* check if page within DMA attributes */
1437 				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
1438 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1439 				    (pgaddr + MMU_PAGESIZE - 1 <=
1440 				    dma_attr->dma_attr_addr_hi)) {
1441 					break;
1442 				}
1443 
1444 				/* continue looking */
1445 				page_unlock(pp);
1446 				pp = pp->p_next;
1447 				if (pp == first_pp)
1448 					pp = NULL;
1449 
1450 			}
1451 			if (pp != NULL) {
1452 				ASSERT(mtype == PP_2_MTYPE(pp));
1453 				ASSERT(pp->p_szc == 0);
1454 
1455 				/* found a page with specified DMA attributes */
1456 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
1457 				    mtype), pp);
1458 				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1459 
1460 				if ((PP_ISFREE(pp) == 0) ||
1461 				    (PP_ISAGED(pp) == 0)) {
1462 					cmn_err(CE_PANIC, "page %p is not free",
1463 					    (void *)pp);
1464 				}
1465 
1466 				mutex_exit(pcm);
1467 				check_dma(dma_attr, pp, 1);
1468 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1469 				return (pp);
1470 			}
1471 			mutex_exit(pcm);
1472 nextfreebin:
1473 			if (plw_initialized == 0) {
1474 				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
1475 				ASSERT(plw.plw_ceq_dif == page_colors);
1476 				plw_initialized = 1;
1477 			}
1478 
1479 			if (plw.plw_do_split) {
1480 				pp = page_freelist_split(szc, bin, mnode,
1481 				    mtype,
1482 				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
1483 				    &plw);
1484 				if (pp != NULL)
1485 					return (pp);
1486 			}
1487 
1488 			bin = page_list_walk_next_bin(szc, bin, &plw);
1489 		}
1490 
1491 		MTYPE_NEXT(mnode, mtype, flags);
1492 	} while (mtype >= 0);
1493 
1494 	/* failed to find a page in the freelist; try it in the cachelist */
1495 
1496 	/* reset mtype start for cachelist search */
1497 	mtype = mtypestart;
1498 	ASSERT(mtype >= 0);
1499 
1500 	/* start with the bin of matching color */
1501 	bin = origbin;
1502 
1503 	do {
1504 		for (i = 0; i <= page_colors; i++) {
1505 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
1506 				goto nextcachebin;
1507 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
1508 			mutex_enter(pcm);
1509 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
1510 			first_pp = pp;
1511 			while (pp != NULL) {
1512 				if (page_trylock(pp, SE_EXCL) == 0) {
1513 					pp = pp->p_next;
1514 					if (pp == first_pp)
1515 						break;
1516 					continue;
1517 				}
1518 				ASSERT(pp->p_vnode);
1519 				ASSERT(PP_ISAGED(pp) == 0);
1520 				ASSERT(pp->p_szc == 0);
1521 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1522 
1523 				/* check if page within DMA attributes */
1524 
1525 				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
1526 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1527 				    (pgaddr + MMU_PAGESIZE - 1 <=
1528 				    dma_attr->dma_attr_addr_hi)) {
1529 					break;
1530 				}
1531 
1532 				/* continue looking */
1533 				page_unlock(pp);
1534 				pp = pp->p_next;
1535 				if (pp == first_pp)
1536 					pp = NULL;
1537 			}
1538 
1539 			if (pp != NULL) {
1540 				ASSERT(mtype == PP_2_MTYPE(pp));
1541 				ASSERT(pp->p_szc == 0);
1542 
1543 				/* found a page with specified DMA attributes */
1544 				page_sub(&PAGE_CACHELISTS(mnode, bin,
1545 				    mtype), pp);
1546 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
1547 
1548 				mutex_exit(pcm);
1549 				ASSERT(pp->p_vnode);
1550 				ASSERT(PP_ISAGED(pp) == 0);
1551 				check_dma(dma_attr, pp, 1);
1552 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1553 				return (pp);
1554 			}
1555 			mutex_exit(pcm);
1556 nextcachebin:
1557 			bin += (i == 0) ? BIN_STEP : 1;
1558 			bin &= page_colors_mask;
1559 		}
1560 		MTYPE_NEXT(mnode, mtype, flags);
1561 	} while (mtype >= 0);
1562 
1563 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
1564 	return (NULL);
1565 }
1566 
/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Only size == MMU_PAGESIZE is supported, and a dma_attr asking for an
 * alignment larger than MMU_PAGESIZE is refused; both cases return NULL.
 * A NULL dma_attr means every mtype (0 .. mnoderangecnt - 1) is eligible.
 * If lgrp does not exist (per LGRP_EXISTS()), the caller's home lgroup is
 * used.  Returns the page found, or NULL.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;		/* lowest mtype to search */
	int		m;		/* highest mtype to search */
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, 0);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = 0;
		m = mnoderangecnt - 1;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can guarantee alignment only for page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		/*
		 * fullrange is set when the DMA window starts exactly at
		 * the bottom of mtype n and reaches at least the top of
		 * mtype m; in that case the per-page address check done by
		 * page_get_mnode_anylist() is skipped below.
		 */
		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	if (n > m)
		return (NULL);

	szc = 0;

	/*
	 * cycling thru mtype handled by RANGE0 if n == 0; collapsing n to m
	 * makes the mtype loop below run exactly once per mnode.
	 */
	if (n == 0) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		for (mtype = m; mtype >= n; mtype--) {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
						bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		}
		/* count the allocation failure once, on the first mnode tried */
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}
1684 
1685 /*
1686  * page_create_io()
1687  *
1688  * This function is a copy of page_create_va() with an additional
1689  * argument 'mattr' that specifies DMA memory requirements to
1690  * the page list functions. This function is used by the segkmem
1691  * allocator so it is only to create new pages (i.e PG_EXCL is
1692  * set).
1693  *
1694  * Note: This interface is currently used by x86 PSM only and is
1695  *	 not fully specified so the commitment level is only for
1696  *	 private interface specific to x86. This interface uses PSM
1697  *	 specific page_get_anylist() interface.
1698  */
1699 
/*
 * Walk the page_hash[] chain at 'index' looking for the page identified
 * by <vp, off>.  On exit 'pp' is the matching page, or NULL if the chain
 * was exhausted without a match.
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}
1706 
1707 
/*
 * Create mmu_btopr(bytes) pages named <vp, off>, <vp, off + MMU_PAGESIZE>,
 * ... whose placement satisfies 'mattr' (which may be NULL).
 *
 * flags may only contain PG_EXCL, PG_WAIT and PG_PHYSCONTIG.  With
 * PG_PHYSCONTIG, page_get_contigpage() is tried first and any pages it
 * leaves outstanding are then collected one at a time through
 * page_get_anylist().
 *
 * Returns the list of created pages, or NULL on failure.  On failure the
 * pages gathered so far are disposed of and the outstanding part of the
 * page_create_wait() accounting is undone with page_create_putback().
 */
page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
		"page_create_start:vp %p off %llx bytes %u flags %x",
		vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
		"page_create_success:vp %p off %llx",
		vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
			"pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		/* npages is updated to the number of pages still needed */
		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		/* name each contiguous page and mark it in use */
		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			/* skip vaddr past the pages already obtained */
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use it.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}
				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 *	 with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(!VN_ISKAS(vp));
			if (VN_ISKAS(vp))
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			/* npp is still set here and is put back at fail */
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		/* advance the head after each add to keep creation order */
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}
1985 
1986 
1987 /*
1988  * Copy the data from the physical page represented by "frompp" to
1989  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
1990  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
1991  * level and no one sleeps with an active mapping there.
1992  *
1993  * Note that the ref/mod bits in the page_t's are not affected by
1994  * this operation, hence it is up to the caller to update them appropriately.
1995  */
1996 int
1997 ppcopy(page_t *frompp, page_t *topp)
1998 {
1999 	caddr_t		pp_addr1;
2000 	caddr_t		pp_addr2;
2001 	hat_mempte_t	pte1;
2002 	hat_mempte_t	pte2;
2003 	kmutex_t	*ppaddr_mutex;
2004 	label_t		ljb;
2005 	int		ret = 1;
2006 
2007 	ASSERT_STACK_ALIGNED();
2008 	ASSERT(PAGE_LOCKED(frompp));
2009 	ASSERT(PAGE_LOCKED(topp));
2010 
2011 	if (kpm_enable) {
2012 		pp_addr1 = hat_kpm_page2va(frompp, 0);
2013 		pp_addr2 = hat_kpm_page2va(topp, 0);
2014 		kpreempt_disable();
2015 	} else {
2016 		/*
2017 		 * disable pre-emption so that CPU can't change
2018 		 */
2019 		kpreempt_disable();
2020 
2021 		pp_addr1 = CPU->cpu_caddr1;
2022 		pp_addr2 = CPU->cpu_caddr2;
2023 		pte1 = CPU->cpu_caddr1pte;
2024 		pte2 = CPU->cpu_caddr2pte;
2025 
2026 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
2027 		mutex_enter(ppaddr_mutex);
2028 
2029 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
2030 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
2031 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
2032 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
2033 		    HAT_LOAD_NOCONSIST);
2034 	}
2035 
2036 	if (on_fault(&ljb)) {
2037 		ret = 0;
2038 		goto faulted;
2039 	}
2040 	if (use_sse_pagecopy)
2041 		hwblkpagecopy(pp_addr1, pp_addr2);
2042 	else
2043 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
2044 
2045 	no_fault();
2046 faulted:
2047 	if (!kpm_enable) {
2048 		mutex_exit(ppaddr_mutex);
2049 	}
2050 	kpreempt_enable();
2051 	return (ret);
2052 }
2053 
2054 /*
2055  * Zero the physical page from off to off + len given by `pp'
2056  * without changing the reference and modified bits of page.
2057  *
2058  * We use this using CPU private page address #2, see ppcopy() for more info.
2059  * pagezero() must not be called at interrupt level.
2060  */
2061 void
2062 pagezero(page_t *pp, uint_t off, uint_t len)
2063 {
2064 	caddr_t		pp_addr2;
2065 	hat_mempte_t	pte2;
2066 	kmutex_t	*ppaddr_mutex;
2067 
2068 	ASSERT_STACK_ALIGNED();
2069 	ASSERT(len <= MMU_PAGESIZE);
2070 	ASSERT(off <= MMU_PAGESIZE);
2071 	ASSERT(off + len <= MMU_PAGESIZE);
2072 	ASSERT(PAGE_LOCKED(pp));
2073 
2074 	if (kpm_enable) {
2075 		pp_addr2 = hat_kpm_page2va(pp, 0);
2076 		kpreempt_disable();
2077 	} else {
2078 		kpreempt_disable();
2079 
2080 		pp_addr2 = CPU->cpu_caddr2;
2081 		pte2 = CPU->cpu_caddr2pte;
2082 
2083 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
2084 		mutex_enter(ppaddr_mutex);
2085 
2086 		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
2087 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
2088 		    HAT_LOAD_NOCONSIST);
2089 	}
2090 
2091 	if (use_sse_pagezero) {
2092 		hwblkclr(pp_addr2 + off, len);
2093 	} else {
2094 		bzero(pp_addr2 + off, len);
2095 	}
2096 
2097 	if (!kpm_enable)
2098 		mutex_exit(ppaddr_mutex);
2099 	kpreempt_enable();
2100 }
2101 
2102 /*
2103  * Platform-dependent page scrub call.
2104  */
2105 void
2106 pagescrub(page_t *pp, uint_t off, uint_t len)
2107 {
2108 	/*
2109 	 * For now, we rely on the fact that pagezero() will
2110 	 * always clear UEs.
2111 	 */
2112 	pagezero(pp, off, len);
2113 }
2114 
2115 /*
2116  * set up two private addresses for use on a given CPU for use in ppcopy()
2117  */
2118 void
2119 setup_vaddr_for_ppcopy(struct cpu *cpup)
2120 {
2121 	void *addr;
2122 	hat_mempte_t pte_pa;
2123 
2124 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
2125 	pte_pa = hat_mempte_setup(addr);
2126 	cpup->cpu_caddr1 = addr;
2127 	cpup->cpu_caddr1pte = pte_pa;
2128 
2129 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
2130 	pte_pa = hat_mempte_setup(addr);
2131 	cpup->cpu_caddr2 = addr;
2132 	cpup->cpu_caddr2pte = pte_pa;
2133 
2134 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
2135 }
2136 
2137 /*
2138  * Undo setup_vaddr_for_ppcopy
2139  */
2140 void
2141 teardown_vaddr_for_ppcopy(struct cpu *cpup)
2142 {
2143 	mutex_destroy(&cpup->cpu_ppaddr_mutex);
2144 
2145 	hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
2146 	cpup->cpu_caddr2pte = 0;
2147 	vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
2148 	cpup->cpu_caddr2 = 0;
2149 
2150 	hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
2151 	cpup->cpu_caddr1pte = 0;
2152 	vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
2153 	cpup->cpu_caddr1 = 0;
2154 }
2155 
2156 /*
2157  * Create the pageout scanner thread. The thread has to
2158  * start at procedure with process pp and priority pri.
2159  */
2160 void
2161 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
2162 {
2163 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
2164 }
2165 
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms,
 * so this is deliberately a no-op.
 */
void
dcache_flushall()
{
	/* nothing to do */
}
2173 
/*
 * Return the stack pointer slew to apply at exec time; always 0 here.
 */
size_t
exec_get_spslew(void)
{
	return (0);
}
2179 
2180 /*
2181  * Allocate a memory page.  The argument 'seed' can be any pseudo-random
2182  * number to vary where the pages come from.  This is quite a hacked up
2183  * method -- it works for now, but really needs to be fixed up a bit.
2184  *
2185  * We currently use page_create_va() on the kvp with fake offsets,
2186  * segments and virt address.  This is pretty bogus, but was copied from the
2187  * old hat_i86.c code.  A better approach would be to specify either mnode
2188  * random or mnode local and takes a page from whatever color has the MOST
2189  * available - this would have a minimal impact on page coloring.
2190  */
2191 page_t *
2192 page_get_physical(uintptr_t seed)
2193 {
2194 	page_t *pp;
2195 	u_offset_t offset;
2196 	static struct seg tmpseg;
2197 	static uintptr_t ctr = 0;
2198 
2199 	/*
2200 	 * This code is gross, we really need a simpler page allocator.
2201 	 *
2202 	 * We need assign an offset for the page to call page_create_va().
2203 	 * To avoid conflicts with other pages, we get creative with the offset.
2204 	 * For 32 bits, we pick an offset > 4Gig
2205 	 * For 64 bits, pick an offset somewhere in the VA hole.
2206 	 */
2207 	offset = seed;
2208 	if (offset > kernelbase)
2209 		offset -= kernelbase;
2210 	offset <<= MMU_PAGESHIFT;
2211 #if defined(__amd64)
2212 	offset += mmu.hole_start;	/* something in VA hole */
2213 #else
2214 	offset += 1ULL << 40;		/* something > 4 Gig */
2215 #endif
2216 
2217 	if (page_resv(1, KM_NOSLEEP) == 0)
2218 		return (NULL);
2219 
2220 #ifdef	DEBUG
2221 	pp = page_exists(&kvp, offset);
2222 	if (pp != NULL)
2223 		panic("page already exists %p", pp);
2224 #endif
2225 
2226 	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL | PG_NORELOC,
2227 	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */
2228 	if (pp == NULL)
2229 		return (NULL);
2230 	page_io_unlock(pp);
2231 	page_hashout(pp, NULL);
2232 	return (pp);
2233 }
2234