xref: /titanic_50/usr/src/uts/i86pc/vm/vm_machdep.c (revision bdfc6d18da790deeec2e0eb09c625902defe2498)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved   */
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 /*
38  * UNIX machine dependent virtual memory support.
39  */
40 
41 #include <sys/types.h>
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/user.h>
45 #include <sys/proc.h>
46 #include <sys/kmem.h>
47 #include <sys/vmem.h>
48 #include <sys/buf.h>
49 #include <sys/cpuvar.h>
50 #include <sys/lgrp.h>
51 #include <sys/disp.h>
52 #include <sys/vm.h>
53 #include <sys/mman.h>
54 #include <sys/vnode.h>
55 #include <sys/cred.h>
56 #include <sys/exec.h>
57 #include <sys/exechdr.h>
58 #include <sys/debug.h>
59 
60 #include <vm/hat.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_kp.h>
64 #include <vm/seg_vn.h>
65 #include <vm/page.h>
66 #include <vm/seg_kmem.h>
67 #include <vm/seg_kpm.h>
68 #include <vm/vm_dep.h>
69 
70 #include <sys/cpu.h>
71 #include <sys/vm_machparam.h>
72 #include <sys/memlist.h>
73 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
74 #include <vm/hat_i86.h>
75 #include <sys/x86_archext.h>
76 #include <sys/elf_386.h>
77 #include <sys/cmn_err.h>
78 #include <sys/archsystm.h>
79 #include <sys/machsystm.h>
80 
81 #include <sys/vtrace.h>
82 #include <sys/ddidmareq.h>
83 #include <sys/promif.h>
84 #include <sys/memnode.h>
85 #include <sys/stack.h>
86 
87 uint_t vac_colors = 0;
88 
89 int largepagesupport = 0;
90 extern uint_t page_create_new;
91 extern uint_t page_create_exists;
92 extern uint_t page_create_putbacks;
94 extern uintptr_t eprom_kernelbase;
95 extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */
96 
97 /* 4g memory management */
98 pgcnt_t		maxmem4g;
99 pgcnt_t		freemem4g;
100 int		physmax4g;
101 int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
102 int		lotsfree4gshift = 3;
103 
104 #ifdef VM_STATS
105 struct {
106 	ulong_t	pga_alloc;
107 	ulong_t	pga_notfullrange;
108 	ulong_t	pga_nulldmaattr;
109 	ulong_t	pga_allocok;
110 	ulong_t	pga_allocfailed;
111 	ulong_t	pgma_alloc;
112 	ulong_t	pgma_allocok;
113 	ulong_t	pgma_allocfailed;
114 	ulong_t	pgma_allocempty;
115 } pga_vmstats;
116 #endif
117 
118 uint_t mmu_page_sizes;
119 
120 /* How many page sizes the users can see */
121 uint_t mmu_exported_page_sizes;
122 
123 size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
124 
125 /*
126  * Return the optimum page size for a given mapping
127  */
128 /*ARGSUSED*/
129 size_t
130 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
131 {
132 	level_t l;
133 
134 	if (remap)
135 		*remap = 0;
136 
137 	switch (maptype) {
138 
139 	case MAPPGSZ_STK:
140 	case MAPPGSZ_HEAP:
141 	case MAPPGSZ_VA:
142 		/*
143 		 * use the page size that best fits len
144 		 */
145 		for (l = mmu.max_page_level; l > 0; --l) {
146 			if (len < LEVEL_SIZE(l))
147 				continue;
148 			break;
149 		}
150 		return (LEVEL_SIZE(l));
151 
152 	/*
153 	 * for ISM use the 1st large page size.
154 	 */
155 	case MAPPGSZ_ISM:
156 		if (mmu.max_page_level == 0)
157 			return (MMU_PAGESIZE);
158 		return (LEVEL_SIZE(1));
159 	}
160 	return (0);
161 }
162 
163 /*
164  * This can be patched via /etc/system to allow large pages
165  * to be used for mapping application and library text segments.
166  */
167 int	use_text_largepages = 0;
168 
169 /*
170  * Return a bit vector of large page size codes that
171  * can be used to map the [addr, addr + len) region.
172  */
173 
174 /*ARGSUSED*/
175 uint_t
176 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
177 {
178 	size_t	pgsz;
179 	caddr_t a;
180 
181 	if (!text || !use_text_largepages ||
182 	    mmu.max_page_level == 0)
183 		return (0);
184 
185 	pgsz = LEVEL_SIZE(1);
186 	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
187 	if (a < addr || a >= addr + len) {
188 		return (0);
189 	}
190 	len -= (a - addr);
191 	if (len < pgsz) {
192 		return (0);
193 	}
194 	return (1 << 1);
195 }
196 
197 /*
198  * Handle a pagefault.
199  */
200 faultcode_t
201 pagefault(
202 	caddr_t addr,
203 	enum fault_type type,
204 	enum seg_rw rw,
205 	int iskernel)
206 {
207 	struct as *as;
208 	struct hat *hat;
209 	struct proc *p;
210 	kthread_t *t;
211 	faultcode_t res;
212 	caddr_t base;
213 	size_t len;
214 	int err;
215 	int mapped_red;
216 	uintptr_t ea;
217 
218 	ASSERT_STACK_ALIGNED();
219 
220 	if (INVALID_VADDR(addr))
221 		return (FC_NOMAP);
222 
223 	mapped_red = segkp_map_red();
224 
225 	if (iskernel) {
226 		as = &kas;
227 		hat = as->a_hat;
228 	} else {
229 		t = curthread;
230 		p = ttoproc(t);
231 		as = p->p_as;
232 		hat = as->a_hat;
233 	}
234 
235 	/*
236 	 * Dispatch pagefault.
237 	 */
238 	res = as_fault(hat, as, addr, 1, type, rw);
239 
240 	/*
241 	 * If this isn't a potential unmapped hole in the user's
242 	 * UNIX data or stack segments, just return status info.
243 	 */
244 	if (res != FC_NOMAP || iskernel)
245 		goto out;
246 
247 	/*
248 	 * Check to see if we happened to fault on a currently unmapped
249 	 * part of the UNIX data or stack segments.  If so, create a zfod
250 	 * mapping there and then try calling the fault routine again.
251 	 */
252 	base = p->p_brkbase;
253 	len = p->p_brksize;
254 
255 	if (addr < base || addr >= base + len) {		/* data seg? */
256 		base = (caddr_t)p->p_usrstack - p->p_stksize;
257 		len = p->p_stksize;
258 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
259 			/* not in either UNIX data or stack segments */
260 			res = FC_NOMAP;
261 			goto out;
262 		}
263 	}
264 
265 	/*
266 	 * The rest of this function implements 3.X/4.X/5.X compatibility.
267 	 * This code is probably not needed anymore.
268 	 */
269 	if (p->p_model == DATAMODEL_ILP32) {
270 
271 		/* expand the gap to the page boundaries on each side */
272 		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
273 		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
274 		len = ea - (uintptr_t)base;
275 
276 		as_rangelock(as);
277 		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
278 		    0) {
279 			err = as_map(as, base, len, segvn_create, zfod_argsp);
280 			as_rangeunlock(as);
281 			if (err) {
282 				res = FC_MAKE_ERR(err);
283 				goto out;
284 			}
285 		} else {
286 			/*
287 			 * The page was already mapped by another thread after
288 			 * we returned from as_fault() above.  Just fall through
289 			 * to the as_fault() call below.
290 			 */
291 			as_rangeunlock(as);
292 		}
293 
294 		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
295 	}
296 
297 out:
298 	if (mapped_red)
299 		segkp_unmap_red();
300 
301 	return (res);
302 }
303 
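/*
 * Pick a mapping address for the user: a thin wrapper around
 * map_addr_proc() that selects the 32-bit user limit when _MAP_LOW32
 * is requested and the process's full address-space limit otherwise.
 */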
304 void
305 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
306 {
307 	struct proc *p = curproc;
308 	caddr_t userlimit = (flags & _MAP_LOW32) ?
309 	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
310 
311 	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
312 }
313 
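/*
 * x86 has no virtually addressed caches, so there is no VAC alignment
 * constraint to check; this always returns 0.
 */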
314 /*ARGSUSED*/
315 int
316 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
317 {
318 	return (0);
319 }
320 
321 /*
322  * map_addr_proc() is the routine called when the system is to
323  * choose an address for the user.  We will pick an address
324  * range which is the highest available below kernelbase.
325  *
326  * addrp is a value/result parameter.
327  *	On input it is a hint from the user to be used in a completely
328  *	machine dependent fashion; on x86 we simply ignore this hint.
329  *
330  *	On output it is NULL if no address can be found in the current
331  *	process's address space, or else an address that is currently
332  *	not mapped for len bytes with a page of red zone on either side.
333  *
334  *	align is not needed on x86 (it's for virtually addressed caches)
335  */
336 /*ARGSUSED*/
337 void
338 map_addr_proc(
339 	caddr_t *addrp,
340 	size_t len,
341 	offset_t off,
342 	int vacalign,
343 	caddr_t userlimit,
344 	struct proc *p,
345 	uint_t flags)
346 {
347 	struct as *as = p->p_as;
348 	caddr_t addr;
349 	caddr_t base;
350 	size_t slen;
351 	size_t align_amount;
352 
353 	ASSERT32(userlimit == as->a_userlimit);
354 
355 	base = p->p_brkbase;
356 #if defined(__amd64)
357 	/*
358 	 * XX64 Yes, this needs more work.
359 	 */
360 	if (p->p_model == DATAMODEL_NATIVE) {
361 		if (userlimit < as->a_userlimit) {
362 			/*
363 			 * This happens when a program wants to map
364 			 * something in a range that's accessible to a
365 			 * program in a smaller address space.  For example,
366 			 * a 64-bit program calling mmap32(2) to guarantee
367 			 * that the returned address is below 4Gbytes.
368 			 */
369 			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
370 
371 			if (userlimit > base)
372 				slen = userlimit - base;
373 			else {
374 				*addrp = NULL;
375 				return;
376 			}
377 		} else {
378 			/*
379 			 * XX64 This layout is probably wrong .. but in
380 			 * the event we make the amd64 address space look
381 			 * like sparcv9 i.e. with the stack -above- the
382 			 * heap, this bit of code might even be correct.
383 			 */
384 			slen = p->p_usrstack - base -
385 			    (((size_t)rctl_enforced_value(
386 			    rctlproc_legacy[RLIMIT_STACK],
387 			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
388 		}
389 	} else
390 #endif
391 		slen = userlimit - base;
392 
393 	len = (len + PAGEOFFSET) & PAGEMASK;
394 
395 	/*
396 	 * Redzone for each side of the request. This is done to leave
397 	 * one page unmapped between segments. This is not required, but
398 	 * it's useful for the user because if their program strays across
399 	 * a segment boundary, it will take a fault immediately, making
400 	 * debugging a little easier.
401 	 */
402 	len += 2 * MMU_PAGESIZE;
403 
404 	/*
405 	 * figure out what the alignment should be
406 	 *
407 	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
408 	 */
409 	if (len <= ELF_386_MAXPGSZ) {
410 		/*
411 		 * Align virtual addresses to ensure that ELF shared libraries
412 		 * are mapped with the appropriate alignment constraints by
413 		 * the run-time linker.
414 		 */
415 		align_amount = ELF_386_MAXPGSZ;
416 	} else {
417 		int l = mmu.max_page_level;
418 
419 		while (l && len < LEVEL_SIZE(l))
420 			--l;
421 
422 		align_amount = LEVEL_SIZE(l);
423 	}
424 
425 	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
426 		align_amount = (uintptr_t)*addrp;
427 
428 	len += align_amount;
429 
430 	/*
431 	 * Look for a large enough hole starting below userlimit.
432 	 * After finding it, use the upper part.  Addition of PAGESIZE
433 	 * is for the redzone as described above.
434 	 */
435 	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
436 		caddr_t as_addr;
437 
438 		addr = base + slen - len + MMU_PAGESIZE;
439 		as_addr = addr;
440 		/*
441 		 * Round address DOWN to the alignment amount,
442 		 * add the offset, and if this address is less
443 		 * than the original address, add alignment amount.
444 		 */
445 		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
446 		addr += (uintptr_t)(off & (align_amount - 1));
447 		if (addr < as_addr)
448 			addr += align_amount;
449 
450 		ASSERT(addr <= (as_addr + align_amount));
451 		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
452 		    ((uintptr_t)(off & (align_amount - 1))));
453 		*addrp = addr;
454 	} else {
455 		*addrp = NULL;	/* no more virtual space */
456 	}
457 }
458 
459 /*
460  * Determine whether [base, base+len] contains a valid range of
461  * addresses at least minlen long. base and len are adjusted if
462  * required to provide a valid range.
463  */
464 /*ARGSUSED3*/
465 int
466 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
467 {
468 	uintptr_t hi, lo;
469 
470 	lo = (uintptr_t)*basep;
471 	hi = lo + *lenp;
472 
473 	/*
474 	 * If hi rolled over the top, try cutting back.
475 	 */
476 	if (hi < lo) {
477 		if (0 - lo + hi < minlen)
478 			return (0);
479 		if (0 - lo < minlen)
480 			return (0);
481 		*lenp = 0 - lo;
482 	} else if (hi - lo < minlen) {
483 		return (0);
484 	}
485 #if defined(__amd64)
486 	/*
487 	 * Deal with a possible hole in the address range between
488 	 * hole_start and hole_end that should never be mapped.
489 	 */
490 	if (lo < hole_start) {
491 		if (hi > hole_start) {
492 			if (hi < hole_end) {
493 				hi = hole_start;
494 			} else {
495 				/* lo < hole_start && hi >= hole_end */
496 				if (dir == AH_LO) {
497 					/*
498 					 * prefer lowest range
499 					 */
500 					if (hole_start - lo >= minlen)
501 						hi = hole_start;
502 					else if (hi - hole_end >= minlen)
503 						lo = hole_end;
504 					else
505 						return (0);
506 				} else {
507 					/*
508 					 * prefer highest range
509 					 */
510 					if (hi - hole_end >= minlen)
511 						lo = hole_end;
512 					else if (hole_start - lo >= minlen)
513 						hi = hole_start;
514 					else
515 						return (0);
516 				}
517 			}
518 		}
519 	} else {
520 		/* lo >= hole_start */
521 		if (hi < hole_end)
522 			return (0);
523 		if (lo < hole_end)
524 			lo = hole_end;
525 	}
526 
527 	if (hi - lo < minlen)
528 		return (0);
529 
530 	*basep = (caddr_t)lo;
531 	*lenp = hi - lo;
532 #endif
533 	return (1);
534 }
535 
536 /*
537  * Determine whether [addr, addr+len] are valid user addresses.
538  */
539 /*ARGSUSED*/
540 int
541 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
542     caddr_t userlimit)
543 {
544 	caddr_t eaddr = addr + len;
545 
546 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
547 		return (RANGE_BADADDR);
548 
549 #if defined(__amd64)
550 	/*
551 	 * Check for the VA hole
552 	 */
553 	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
554 		return (RANGE_BADADDR);
555 #endif
556 
557 	return (RANGE_OKAY);
558 }
559 
560 /*
561  * Return 1 if the page frame is onboard memory, else 0.
562  */
563 int
564 pf_is_memory(pfn_t pf)
565 {
566 	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
567 }
568 
569 
570 /*
571  * initialized by page_coloring_init().
572  */
573 uint_t	page_colors;
574 uint_t	page_colors_mask;
575 uint_t	page_coloring_shift;
576 int	cpu_page_colors;
577 static uint_t	l2_colors;
578 
579 /*
580  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
581  * and page_colors are calculated from the l2 cache n-way set size.  Within a
582  * mnode range, the page freelist and cachelist are hashed into bins based on
583  * color. This makes it easier to search for a page within a specific memory
584  * range.
585  */
586 #define	PAGE_COLORS_MIN	16
587 
588 page_t ****page_freelists;
589 page_t ***page_cachelists;
590 
591 /*
592  * As the PC architecture evolved, memory was clumped into several
593  * ranges for various historical I/O devices to do DMA.
594  * < 16Meg - ISA bus
595  * < 2Gig - ???
596  * < 4Gig - PCI bus or drivers that don't understand PAE mode
597  */
598 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
599     0x100000,	/* pfn range for 4G and above */
600     0x80000,	/* pfn range for 2G-4G */
601     0x01000,	/* pfn range for 16M-2G */
602     0x00000,	/* pfn range for 0-16M */
603 };
604 
605 /*
606  * These are changed during startup if the machine has limited memory.
607  */
608 pfn_t *memranges = &arch_memranges[0];
609 int nranges = NUM_MEM_RANGES;
610 
611 /*
612  * Used by page layer to know about page sizes
613  */
614 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
615 
616 /*
617  * This can be patched via /etc/system to allow old non-PAE aware device
618  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
619  */
620 #if defined(__i386)
621 int restricted_kmemalloc = 1;	/* XX64 re-examine with PSARC 2004/405 */
622 #elif defined(__amd64)
623 int restricted_kmemalloc = 0;
624 #endif
625 
626 kmutex_t	*fpc_mutex[NPC_MUTEX];
627 kmutex_t	*cpc_mutex[NPC_MUTEX];
628 
629 
630 /*
631  * return the memrange containing pfn
632  */
633 int
634 memrange_num(pfn_t pfn)
635 {
636 	int n;
637 
638 	for (n = 0; n < nranges - 1; ++n) {
639 		if (pfn >= memranges[n])
640 			break;
641 	}
642 	return (n);
643 }
644 
645 /*
646  * return the mnoderange containing pfn
647  */
648 int
649 pfn_2_mtype(pfn_t pfn)
650 {
651 	int	n;
652 
653 	for (n = mnoderangecnt - 1; n >= 0; n--) {
654 		if (pfn >= mnoderanges[n].mnr_pfnlo) {
655 			break;
656 		}
657 	}
658 	return (n);
659 }
660 
661 /*
662  * is_contigpage_free:
663  *	returns a page list of contiguous pages. It minimally has to return
664  *	minctg pages. Caller determines minctg based on the scatter-gather
665  *	list length.
666  *
667  *	pfnp is set to the next page frame to search on return.
668  */
669 static page_t *
670 is_contigpage_free(
671 	pfn_t *pfnp,
672 	pgcnt_t *pgcnt,
673 	pgcnt_t minctg,
674 	uint64_t pfnseg,
675 	int iolock)
676 {
677 	int	i = 0;
678 	pfn_t	pfn = *pfnp;
679 	page_t	*pp;
680 	page_t	*plist = NULL;
681 
682 	/*
683 	 * fail if pfn + minctg crosses a segment boundary.
684 	 * Adjust the next starting pfn to begin at the segment boundary.
685 	 */
686 
687 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
688 		*pfnp = roundup(*pfnp, pfnseg + 1);
689 		return (NULL);
690 	}
691 
692 	do {
693 retry:
694 		pp = page_numtopp_nolock(pfn + i);
695 		if ((pp == NULL) ||
696 		    (page_trylock(pp, SE_EXCL) == 0)) {
697 			(*pfnp)++;
698 			break;
699 		}
700 		if (page_pptonum(pp) != pfn + i) {
701 			page_unlock(pp);
702 			goto retry;
703 		}
704 
705 		if (!(PP_ISFREE(pp))) {
706 			page_unlock(pp);
707 			(*pfnp)++;
708 			break;
709 		}
710 
711 		if (!PP_ISAGED(pp)) {
712 			page_list_sub(pp, PG_CACHE_LIST);
713 			page_hashout(pp, (kmutex_t *)NULL);
714 		} else {
715 			page_list_sub(pp, PG_FREE_LIST);
716 		}
717 
718 		if (iolock)
719 			page_io_lock(pp);
720 		page_list_concat(&plist, &pp);
721 
722 		/*
723 		 * exit loop when pgcnt satisfied or segment boundary reached.
724 		 */
725 
726 	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
727 
728 	*pfnp += i;		/* set to next pfn to search */
729 
730 	if (i >= minctg) {
731 		*pgcnt -= i;
732 		return (plist);
733 	}
734 
735 	/*
736 	 * failure: minctg not satisfied.
737 	 *
738 	 * if next request crosses segment boundary, set next pfn
739 	 * to search from the segment boundary.
740 	 */
741 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
742 		*pfnp = roundup(*pfnp, pfnseg + 1);
743 
744 	/* clean up any pages already allocated */
745 
746 	while (plist) {
747 		pp = plist;
748 		page_sub(&plist, pp);
749 		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
750 		if (iolock)
751 			page_io_unlock(pp);
752 		page_unlock(pp);
753 	}
754 
755 	return (NULL);
756 }
757 
758 /*
759  * verify that pages being returned from the allocator have the correct DMA attributes
760  */
761 #ifndef DEBUG
762 #define	check_dma(a, b, c) (0)
763 #else
764 static void
765 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
766 {
767 	if (dma_attr == NULL)
768 		return;
769 
770 	while (cnt-- > 0) {
771 		if (mmu_ptob((uint64_t)pp->p_pagenum) <
772 		    dma_attr->dma_attr_addr_lo)
773 			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
774 		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
775 		    dma_attr->dma_attr_addr_hi)
776 			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
777 		pp = pp->p_next;
778 	}
779 }
780 #endif
781 
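/*
 * contig_lock serializes contiguous-page allocations and protects the
 * static search state (startpfn, lastctgcnt) in page_get_contigpage().
 */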
782 static kmutex_t	contig_lock;
783 
784 #define	CONTIG_LOCK()	mutex_enter(&contig_lock);
785 #define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);
786 
787 #define	PFN_16M		(mmu_btop((uint64_t)0x1000000))
788 
789 static page_t *
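/*
 * Allocate physically contiguous pages satisfying the DMA attributes in
 * 'mattr' (or any pages if mattr is NULL).  The search resumes from the
 * remembered startpfn, prefers pfns above 16M to conserve low memory,
 * and wraps around to the low end of the range before giving up.
 */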
790 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
791 {
792 	pfn_t		pfn;
793 	int		sgllen;
794 	uint64_t	pfnseg;
795 	pgcnt_t		minctg;
796 	page_t		*pplist = NULL, *plist;
797 	uint64_t	lo, hi;
798 	pgcnt_t		pfnalign = 0;
799 	static pfn_t	startpfn;
800 	static pgcnt_t	lastctgcnt;
801 	uintptr_t	align;
802 
803 	CONTIG_LOCK();
804 
805 	if (mattr) {
806 		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
807 		hi = mmu_btop(mattr->dma_attr_addr_hi);
808 		if (hi >= physmax)
809 			hi = physmax - 1;
810 		sgllen = mattr->dma_attr_sgllen;
811 		pfnseg = mmu_btop(mattr->dma_attr_seg);
812 
813 		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
814 		if (align > MMU_PAGESIZE)
815 			pfnalign = mmu_btop(align);
816 
817 		/*
818 		 * in order to satisfy the request, we must acquire at
819 		 * least minctg contiguous pages
820 		 */
821 		minctg = howmany(*pgcnt, sgllen);
822 
823 		ASSERT(hi >= lo);
824 
825 		/*
826 		 * start from where we last searched if minctg >= lastctgcnt
827 		 */
828 		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
829 			startpfn = lo;
830 	} else {
831 		hi = physmax - 1;
832 		lo = 0;
833 		sgllen = 1;
834 		pfnseg = mmu.highest_pfn;
835 		minctg = *pgcnt;
836 
837 		if (minctg < lastctgcnt)
838 			startpfn = lo;
839 	}
840 	lastctgcnt = minctg;
841 
842 	ASSERT(pfnseg + 1 >= (uint64_t)minctg);
843 
844 	/* conserve 16m memory - start search above 16m when possible */
845 	if (hi > PFN_16M && startpfn < PFN_16M)
846 		startpfn = PFN_16M;
847 
848 	pfn = startpfn;
849 	if (pfnalign)
850 		pfn = P2ROUNDUP(pfn, pfnalign);
851 
852 	while (pfn + minctg - 1 <= hi) {
853 
854 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
855 		if (plist) {
856 			page_list_concat(&pplist, &plist);
857 			sgllen--;
858 			/*
859 			 * return when contig pages no longer needed
860 			 */
861 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
862 				startpfn = pfn;
863 				CONTIG_UNLOCK();
864 				check_dma(mattr, pplist, *pgcnt);
865 				return (pplist);
866 			}
867 			minctg = howmany(*pgcnt, sgllen);
868 		}
869 		if (pfnalign)
870 			pfn = P2ROUNDUP(pfn, pfnalign);
871 	}
872 
873 	/* cannot find contig pages in specified range */
874 	if (startpfn == lo) {
875 		CONTIG_UNLOCK();
876 		return (NULL);
877 	}
878 
879 	/* did not start with lo previously */
880 	pfn = lo;
881 	if (pfnalign)
882 		pfn = P2ROUNDUP(pfn, pfnalign);
883 
884 	/* allow search to go above startpfn */
885 	while (pfn < startpfn) {
886 
887 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
888 		if (plist != NULL) {
889 
890 			page_list_concat(&pplist, &plist);
891 			sgllen--;
892 
893 			/*
894 			 * return when contig pages no longer needed
895 			 */
896 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
897 				startpfn = pfn;
898 				CONTIG_UNLOCK();
899 				check_dma(mattr, pplist, *pgcnt);
900 				return (pplist);
901 			}
902 			minctg = howmany(*pgcnt, sgllen);
903 		}
904 		if (pfnalign)
905 			pfn = P2ROUNDUP(pfn, pfnalign);
906 	}
907 	CONTIG_UNLOCK();
908 	return (NULL);
909 }
910 
911 /*
912  * combine mem_node_config and memrange memory ranges into one data
913  * structure to be used for page list management.
914  *
915  * mnode_range_cnt() calculates the number of memory ranges for mnode and
916  * memranges[]. Used to determine the size of page lists and mnoderanges.
917  *
918  * mnode_range_setup() initializes mnoderanges.
919  */
920 mnoderange_t	*mnoderanges;
921 int		mnoderangecnt;
922 int		mtype4g;
923 
924 int
925 mnode_range_cnt()
926 {
927 	int	mri;
928 	int	mnrcnt = 0;
929 	int	mnode;
930 
931 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
932 		if (mem_node_config[mnode].exists == 0)
933 			continue;
934 
935 		mri = nranges - 1;
936 
937 		/* find the memranges index containing the base of the mnode range */
938 
939 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
940 			mri--;
941 
942 		/*
943 		 * increment mnode range counter when memranges or mnode
944 		 * boundary is reached.
945 		 */
946 		while (mri >= 0 &&
947 		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
948 			mnrcnt++;
949 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
950 				mri--;
951 			else
952 				break;
953 		}
954 	}
955 	return (mnrcnt);
956 }
957 
958 void
959 mnode_range_setup(mnoderange_t *mnoderanges)
960 {
961 	int	mnode, mri;
962 
963 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
964 		if (mem_node_config[mnode].exists == 0)
965 			continue;
966 
967 		mri = nranges - 1;
968 
969 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
970 			mri--;
971 
972 		while (mri >= 0 && mem_node_config[mnode].physmax >=
973 		    MEMRANGELO(mri)) {
974 			mnoderanges->mnr_pfnlo =
975 			    MAX(MEMRANGELO(mri),
976 				mem_node_config[mnode].physbase);
977 			mnoderanges->mnr_pfnhi =
978 			    MIN(MEMRANGEHI(mri),
979 				mem_node_config[mnode].physmax);
980 			mnoderanges->mnr_mnode = mnode;
981 			mnoderanges->mnr_memrange = mri;
982 			mnoderanges++;
983 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
984 				mri--;
985 			else
986 				break;
987 		}
988 	}
989 }
990 
991 /*
992  * Determine if the mnode range specified in mtype contains memory belonging
993  * to memory node mnode.  If flags & PGI_MT_RANGE is set, then mtype
994  * specifies a range of indices to search, down to 0 or to the 4g boundary.
995  *
996  * Return the first mnode range type index found, otherwise return -1.
997  */
998 int
999 mtype_func(int mnode, int mtype, uint_t flags)
1000 {
1001 	if (flags & PGI_MT_RANGE) {
1002 		int	mtlim = 0;	/* default to PGI_MT_RANGE0 */
1003 
1004 		if (flags & PGI_MT_NEXT)
1005 			mtype--;
1006 		if (flags & PGI_MT_RANGE4G)
1007 			mtlim = mtype4g + 1;
1008 		while (mtype >= mtlim) {
1009 			if (mnoderanges[mtype].mnr_mnode == mnode)
1010 				return (mtype);
1011 			mtype--;
1012 		}
1013 	} else {
1014 		if (mnoderanges[mtype].mnr_mnode == mnode)
1015 			return (mtype);
1016 	}
1017 	return (-1);
1018 }
1019 
1020 /*
1021  * Returns the free page count for mnode
1022  */
1023 int
1024 mnode_pgcnt(int mnode)
1025 {
1026 	int	mtype = mnoderangecnt - 1;
1027 	int	flags = PGI_MT_RANGE0;
1028 	pgcnt_t	pgcnt = 0;
1029 
1030 	mtype = mtype_func(mnode, mtype, flags);
1031 
1032 	while (mtype != -1) {
1033 		pgcnt += (mnoderanges[mtype].mnr_mt_flpgcnt +
1034 		    mnoderanges[mtype].mnr_mt_lgpgcnt +
1035 		    mnoderanges[mtype].mnr_mt_clpgcnt);
1036 		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1037 	}
1038 	return (pgcnt);
1039 }
1040 
1041 /*
1042  * Initialize page coloring variables based on the l2 cache parameters.
1043  * Calculate and return memory needed for page coloring data structures.
1044  */
1045 size_t
1046 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1047 {
1048 	size_t	colorsz = 0;
1049 	int	i;
1050 	int	colors;
1051 
1052 	/*
1053 	 * Reduce the memory range lists if we don't have large amounts
1054 	 * of memory. This avoids searching known empty free lists.
1055 	 */
1056 	i = memrange_num(physmax);
1057 	memranges += i;
1058 	nranges -= i;
1059 #if defined(__i386)
1060 	if (i > 0)
1061 		restricted_kmemalloc = 0;
1062 #endif
1063 	/* physmax greater than 4g */
1064 	if (i == 0)
1065 		physmax4g = 1;
1066 
1067 	/*
1068 	 * setup pagesize for generic page layer
1069 	 */
1070 	for (i = 0; i <= mmu.max_page_level; ++i) {
1071 		hw_page_array[i].hp_size = LEVEL_SIZE(i);
1072 		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1073 		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1074 	}
1075 
1076 	ASSERT(ISP2(l2_sz));
1077 	ASSERT(ISP2(l2_linesz));
1078 	ASSERT(l2_sz > MMU_PAGESIZE);
1079 
1080 	/* l2_assoc is 0 for fully associative l2 cache */
1081 	if (l2_assoc)
1082 		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1083 	else
1084 		l2_colors = 1;
1085 
1086 	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
1087 	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1088 
1089 	/*
1090 	 * cpu_page_colors is non-zero when a page color may be spread across
1091 	 * multiple bins.
1092 	 */
1093 	if (l2_colors < page_colors)
1094 		cpu_page_colors = l2_colors;
1095 
1096 	ASSERT(ISP2(page_colors));
1097 
1098 	page_colors_mask = page_colors - 1;
1099 
1100 	ASSERT(ISP2(CPUSETSIZE()));
1101 	page_coloring_shift = lowbit(CPUSETSIZE());
1102 
1103 	/* size for mnoderanges */
1104 	mnoderangecnt = mnode_range_cnt();
1105 	colorsz = mnoderangecnt * sizeof (mnoderange_t);
1106 
1107 	/* size for fpc_mutex and cpc_mutex */
1108 	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1109 
1110 	/* size of page_freelists */
1111 	colorsz += mnoderangecnt * sizeof (page_t ***);
1112 	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1113 
1114 	for (i = 0; i < mmu_page_sizes; i++) {
1115 		colors = page_get_pagecolors(i);
1116 		colorsz += mnoderangecnt * colors * sizeof (page_t *);
1117 	}
1118 
1119 	/* size of page_cachelists */
1120 	colorsz += mnoderangecnt * sizeof (page_t **);
1121 	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1122 
1123 	return (colorsz);
1124 }
1125 
1126 /*
1127  * Called once at startup to configure the page coloring data structures
1128  * and do the first page_free()/page_freelist_add().
1129  */
1130 void
1131 page_coloring_setup(caddr_t pcmemaddr)
1132 {
1133 	int	i;
1134 	int	j;
1135 	int	k;
1136 	caddr_t	addr;
1137 	int	colors;
1138 
1139 	/*
1140 	 * do page coloring setup
1141 	 */
1142 	addr = pcmemaddr;
1143 
1144 	mnoderanges = (mnoderange_t *)addr;
1145 	addr += (mnoderangecnt * sizeof (mnoderange_t));
1146 
1147 	mnode_range_setup(mnoderanges);
1148 
1149 	if (physmax4g)
1150 		mtype4g = pfn_2_mtype(0xfffff);
1151 
1152 	for (k = 0; k < NPC_MUTEX; k++) {
1153 		fpc_mutex[k] = (kmutex_t *)addr;
1154 		addr += (max_mem_nodes * sizeof (kmutex_t));
1155 	}
1156 	for (k = 0; k < NPC_MUTEX; k++) {
1157 		cpc_mutex[k] = (kmutex_t *)addr;
1158 		addr += (max_mem_nodes * sizeof (kmutex_t));
1159 	}
1160 	page_freelists = (page_t ****)addr;
1161 	addr += (mnoderangecnt * sizeof (page_t ***));
1162 
1163 	page_cachelists = (page_t ***)addr;
1164 	addr += (mnoderangecnt * sizeof (page_t **));
1165 
1166 	for (i = 0; i < mnoderangecnt; i++) {
1167 		page_freelists[i] = (page_t ***)addr;
1168 		addr += (mmu_page_sizes * sizeof (page_t **));
1169 
1170 		for (j = 0; j < mmu_page_sizes; j++) {
1171 			colors = page_get_pagecolors(j);
1172 			page_freelists[i][j] = (page_t **)addr;
1173 			addr += (colors * sizeof (page_t *));
1174 		}
1175 		page_cachelists[i] = (page_t **)addr;
1176 		addr += (page_colors * sizeof (page_t *));
1177 	}
1178 }
1179 
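/*
 * Buffer coloring only matters on machines with virtually addressed
 * caches; on x86 every buffer gets color 0.
 */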
1180 /*ARGSUSED*/
1181 int
1182 bp_color(struct buf *bp)
1183 {
1184 	return (0);
1185 }
1186 
1187 /*
1188  * get a page from any list with the given mnode
1189  */
1190 page_t *
1191 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
1192     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
1193 {
1194 	kmutex_t	*pcm;
1195 	int		i;
1196 	page_t		*pp;
1197 	page_t		*first_pp;
1198 	uint64_t	pgaddr;
1199 	ulong_t		bin;
1200 	int		mtypestart;
1201 
1202 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
1203 
1204 	ASSERT((flags & PG_MATCH_COLOR) == 0);
1205 	ASSERT(szc == 0);
1206 	ASSERT(dma_attr != NULL);
1207 
1208 
1209 	MTYPE_START(mnode, mtype, flags);
1210 	if (mtype < 0) {
1211 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
1212 		return (NULL);
1213 	}
1214 
1215 	mtypestart = mtype;
1216 
1217 	bin = origbin;
1218 
1219 	/*
1220 	 * check up to page_colors + 1 bins - origbin may be checked twice
1221 	 * because of BIN_STEP skip
1222 	 */
1223 	do {
1224 		i = 0;
1225 		while (i <= page_colors) {
1226 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
1227 				goto nextfreebin;
1228 
1229 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1230 			mutex_enter(pcm);
1231 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
1232 			first_pp = pp;
1233 			while (pp != NULL) {
1234 				if (page_trylock(pp, SE_EXCL) == 0) {
1235 					pp = pp->p_next;
1236 					if (pp == first_pp) {
1237 						pp = NULL;
1238 					}
1239 					continue;
1240 				}
1241 
1242 				ASSERT(PP_ISFREE(pp));
1243 				ASSERT(PP_ISAGED(pp));
1244 				ASSERT(pp->p_vnode == NULL);
1245 				ASSERT(pp->p_hash == NULL);
1246 				ASSERT(pp->p_offset == (u_offset_t)-1);
1247 				ASSERT(pp->p_szc == szc);
1248 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1249 				/* check if page within DMA attributes */
1250 				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));
1251 
1252 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1253 				    (pgaddr + MMU_PAGESIZE - 1 <=
1254 				    dma_attr->dma_attr_addr_hi)) {
1255 					break;
1256 				}
1257 
1258 				/* continue looking */
1259 				page_unlock(pp);
1260 				pp = pp->p_next;
1261 				if (pp == first_pp)
1262 					pp = NULL;
1263 
1264 			}
1265 			if (pp != NULL) {
1266 				ASSERT(mtype == PP_2_MTYPE(pp));
1267 				ASSERT(pp->p_szc == 0);
1268 
1269 				/* found a page with specified DMA attributes */
1270 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
1271 				    mtype), pp);
1272 				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1273 
1274 				if ((PP_ISFREE(pp) == 0) ||
1275 				    (PP_ISAGED(pp) == 0)) {
1276 					cmn_err(CE_PANIC, "page %p is not free",
1277 					    (void *)pp);
1278 				}
1279 
1280 				mutex_exit(pcm);
1281 				check_dma(dma_attr, pp, 1);
1282 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1283 				return (pp);
1284 			}
1285 			mutex_exit(pcm);
1286 nextfreebin:
1287 			pp = page_freelist_fill(szc, bin, mnode, mtype,
1288 			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
1289 			if (pp)
1290 				return (pp);
1291 
1292 			/* try next bin */
1293 			bin += (i == 0) ? BIN_STEP : 1;
1294 			bin &= page_colors_mask;
1295 			i++;
1296 		}
1297 		MTYPE_NEXT(mnode, mtype, flags);
1298 	} while (mtype >= 0);
1299 
1300 	/* failed to find a page in the freelist; try it in the cachelist */
1301 
1302 	/* reset mtype start for cachelist search */
1303 	mtype = mtypestart;
1304 	ASSERT(mtype >= 0);
1305 
1306 	/* start with the bin of matching color */
1307 	bin = origbin;
1308 
1309 	do {
1310 		for (i = 0; i <= page_colors; i++) {
1311 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
1312 				goto nextcachebin;
1313 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
1314 			mutex_enter(pcm);
1315 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
1316 			first_pp = pp;
1317 			while (pp != NULL) {
1318 				if (page_trylock(pp, SE_EXCL) == 0) {
1319 					pp = pp->p_next;
1320 					if (pp == first_pp)
1321 						break;
1322 					continue;
1323 				}
1324 				ASSERT(pp->p_vnode);
1325 				ASSERT(PP_ISAGED(pp) == 0);
1326 				ASSERT(pp->p_szc == 0);
1327 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1328 
1329 				/* check if page within DMA attributes */
1330 
1331 				pgaddr = ptob((uint64_t)(pp->p_pagenum));
1332 
1333 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1334 				    (pgaddr + MMU_PAGESIZE - 1 <=
1335 				    dma_attr->dma_attr_addr_hi)) {
1336 					break;
1337 				}
1338 
1339 				/* continue looking */
1340 				page_unlock(pp);
1341 				pp = pp->p_next;
1342 				if (pp == first_pp)
1343 					pp = NULL;
1344 			}
1345 
1346 			if (pp != NULL) {
1347 				ASSERT(mtype == PP_2_MTYPE(pp));
1348 				ASSERT(pp->p_szc == 0);
1349 
1350 				/* found a page with specified DMA attributes */
1351 				page_sub(&PAGE_CACHELISTS(mnode, bin,
1352 				    mtype), pp);
1353 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
1354 
1355 				mutex_exit(pcm);
1356 				ASSERT(pp->p_vnode);
1357 				ASSERT(PP_ISAGED(pp) == 0);
1358 				check_dma(dma_attr, pp, 1);
1359 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1360 				return (pp);
1361 			}
1362 			mutex_exit(pcm);
1363 nextcachebin:
1364 			bin += (i == 0) ? BIN_STEP : 1;
1365 			bin &= page_colors_mask;
1366 		}
1367 		MTYPE_NEXT(mnode, mtype, flags);
1368 	} while (mtype >= 0);
1369 
1370 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
1371 	return (NULL);
1372 }
1373 
1374 /*
1375  * This function is similar to page_get_freelist()/page_get_cachelist()
1376  * but it searches both the lists to find a page with the specified
1377  * color (or no color) and DMA attributes. The search is done in the
1378  * freelist first and then in the cache list within the highest memory
1379  * range (based on DMA attributes) before searching in the lower
1380  * memory ranges.
1381  *
1382  * Note: This function is called only by page_create_io().
1383  */
1384 /*ARGSUSED*/
1385 page_t *
1386 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
1387     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
1388 {
1389 	uint_t		bin;
1390 	int		mtype;
1391 	page_t		*pp;
1392 	int		n;
1393 	int		m;
1394 	int		szc;
1395 	int		fullrange;
1396 	int		mnode;
1397 	int		local_failed_stat = 0;
1398 	lgrp_mnode_cookie_t	lgrp_cookie;
1399 
1400 	VM_STAT_ADD(pga_vmstats.pga_alloc);
1401 
1402 	/* only base pagesize currently supported */
1403 	if (size != MMU_PAGESIZE)
1404 		return (NULL);
1405 
1406 	/*
1407 	 * If we're passed a specific lgroup, we use it.  Otherwise,
1408 	 * assume first-touch placement is desired.
1409 	 */
1410 	if (!LGRP_EXISTS(lgrp))
1411 		lgrp = lgrp_home_lgrp();
1412 
1413 	/* LINTED */
1414 	AS_2_BIN(as, seg, vp, vaddr, bin);
1415 
1416 	/*
1417 	 * Only hold one freelist or cachelist lock at a time, that way we
1418 	 * can start anywhere and not have to worry about lock
1419 	 * ordering.
1420 	 */
1421 	if (dma_attr == NULL) {
1422 		n = 0;
1423 		m = mnoderangecnt - 1;
1424 		fullrange = 1;
1425 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
1426 	} else {
1427 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
1428 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
1429 
1430 		/*
1431 		 * We can only guarantee alignment to a page boundary.
1432 		 */
1433 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
1434 			return (NULL);
1435 
1436 		n = pfn_2_mtype(pfnlo);
1437 		m = pfn_2_mtype(pfnhi);
1438 
1439 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
1440 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
1441 	}
1442 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
1443 
1444 	if (n > m)
1445 		return (NULL);
1446 
1447 	szc = 0;
1448 
1449 	/* cycling through mtype is handled by RANGE0 if n == 0 */
1450 	if (n == 0) {
1451 		flags |= PGI_MT_RANGE0;
1452 		n = m;
1453 	}
1454 
1455 	/*
1456 	 * Try local memory node first, but try remote if we can't
1457 	 * get a page of the right color.
1458 	 */
1459 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
1460 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
1461 		/*
1462 		 * allocate pages from high pfn to low.
1463 		 */
1464 		for (mtype = m; mtype >= n; mtype--) {
1465 			if (fullrange != 0) {
1466 				pp = page_get_mnode_freelist(mnode,
1467 				    bin, mtype, szc, flags);
1468 				if (pp == NULL) {
1469 					pp = page_get_mnode_cachelist(
1470 						bin, flags, mnode, mtype);
1471 				}
1472 			} else {
1473 				pp = page_get_mnode_anylist(bin, szc,
1474 				    flags, mnode, mtype, dma_attr);
1475 			}
1476 			if (pp != NULL) {
1477 				VM_STAT_ADD(pga_vmstats.pga_allocok);
1478 				check_dma(dma_attr, pp, 1);
1479 				return (pp);
1480 			}
1481 		}
1482 		if (!local_failed_stat) {
1483 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
1484 			local_failed_stat = 1;
1485 		}
1486 	}
1487 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
1488 
1489 	return (NULL);
1490 }
1491 
1492 /*
1493  * page_create_io()
1494  *
1495  * This function is a copy of page_create_va() with an additional
1496  * argument 'mattr' that specifies DMA memory requirements to
1497  * the page list functions. This function is used by the segkmem
1498  * allocator, so it is only used to create new pages (i.e., PG_EXCL is
1499  * set).
1500  *
1501  * Note: This interface is currently used by x86 PSM only and is
1502  *	 not fully specified, so the commitment level is that of a
1503  *	 private interface specific to x86.  This interface uses the
1504  *	 PSM-specific page_get_anylist() interface.
1505  */
1506 
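/*
 * Walk the page hash chain at 'index' looking for the page identified by
 * (vp, off); on exit 'pp' points to the page found, or is NULL.
 */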
1507 #define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
1508 	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
1509 		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
1510 			break; \
1511 	} \
1512 }
1513 
1514 
1515 page_t *
1516 page_create_io(
1517 	struct vnode	*vp,
1518 	u_offset_t	off,
1519 	uint_t		bytes,
1520 	uint_t		flags,
1521 	struct as	*as,
1522 	caddr_t		vaddr,
1523 	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
1524 {
1525 	page_t		*plist = NULL;
1526 	uint_t		plist_len = 0;
1527 	pgcnt_t		npages;
1528 	page_t		*npp = NULL;
1529 	uint_t		pages_req;
1530 	page_t		*pp;
1531 	kmutex_t	*phm = NULL;
1532 	uint_t		index;
1533 
1534 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
1535 		"page_create_start:vp %p off %llx bytes %u flags %x",
1536 		vp, off, bytes, flags);
1537 
1538 	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
1539 
1540 	pages_req = npages = mmu_btopr(bytes);
1541 
1542 	/*
1543 	 * Do the freemem and pcf accounting.
1544 	 */
1545 	if (!page_create_wait(npages, flags)) {
1546 		return (NULL);
1547 	}
1548 
1549 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
1550 		"page_create_success:vp %p off %llx",
1551 		vp, off);
1552 
1553 	/*
1554 	 * If satisfying this request has left us with too little
1555 	 * memory, start the wheels turning to get some back.  The
1556 	 * first clause of the test prevents waking up the pageout
1557 	 * daemon in situations where it would decide that there's
1558 	 * nothing to do.
1559 	 */
1560 	if (nscan < desscan && freemem < minfree) {
1561 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
1562 			"pageout_cv_signal:freemem %ld", freemem);
1563 		cv_signal(&proc_pageout->p_cv);
1564 	}
1565 
1566 	if (flags & PG_PHYSCONTIG) {
1567 
1568 		plist = page_get_contigpage(&npages, mattr, 1);
1569 		if (plist == NULL) {
1570 			page_create_putback(npages);
1571 			return (NULL);
1572 		}
1573 
1574 		pp = plist;
1575 
1576 		do {
1577 			if (!page_hashin(pp, vp, off, NULL)) {
1578 				panic("pg_creat_io: hashin failed %p %p %llx",
1579 				    (void *)pp, (void *)vp, off);
1580 			}
1581 			VM_STAT_ADD(page_create_new);
1582 			off += MMU_PAGESIZE;
1583 			PP_CLRFREE(pp);
1584 			PP_CLRAGED(pp);
1585 			page_set_props(pp, P_REF);
1586 			pp = pp->p_next;
1587 		} while (pp != plist);
1588 
1589 		if (!npages) {
1590 			check_dma(mattr, plist, pages_req);
1591 			return (plist);
1592 		} else {
1593 			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
1594 		}
1595 
1596 		/*
1597 		 * fall-thru:
1598 		 *
1599 		 * page_get_contigpage returns when npages <= sgllen.
1600 		 * Grab the rest of the non-contig pages below from anylist.
1601 		 */
1602 	}
1603 
1604 	/*
1605 	 * Loop around collecting the requested number of pages.
1606 	 * Most of the time, we have to `create' a new page. With
1607 	 * this in mind, pull the page off the free list before
1608 	 * getting the hash lock.  This will minimize the hash
1609 	 * lock hold time, nesting, and the like.  If it turns
1610 	 * out we don't need the page, we put it back at the end.
1611 	 */
1612 	while (npages--) {
1613 		phm = NULL;
1614 
1615 		index = PAGE_HASH_FUNC(vp, off);
1616 top:
1617 		ASSERT(phm == NULL);
1618 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
1619 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1620 
1621 		if (npp == NULL) {
1622 			/*
1623 			 * Try to get a page of any color either from
1624 			 * the freelist or from the cache list.
1625 			 */
1626 			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
1627 			    flags & ~PG_MATCH_COLOR, mattr, NULL);
1628 			if (npp == NULL) {
1629 				if (mattr == NULL) {
1630 					/*
1631 					 * Not looking for a special page;
1632 					 * panic!
1633 					 */
1634 					panic("no page found %d", (int)npages);
1635 				}
1636 				/*
1637 				 * No page found! This can happen
1638 				 * if we are looking for a page
1639 				 * within a specific memory range
1640 				 * for DMA purposes. If PG_WAIT is
1641 				 * specified then we wait for a
1642 				 * while and then try again. The
1643 				 * wait could be forever if we
1644 				 * don't get the page(s) we need.
1645 				 *
1646 				 * Note: XXX We really need a mechanism
1647 				 * to wait for pages in the desired
1648 				 * range. For now, we wait for any
1649 				 * pages and see if we can use them.
1650 				 */
1651 
1652 				if ((mattr != NULL) && (flags & PG_WAIT)) {
1653 					delay(10);
1654 					goto top;
1655 				}
1656 
1657 				goto fail; /* undo accounting stuff */
1658 			}
1659 
1660 			if (PP_ISAGED(npp) == 0) {
1661 				/*
1662 				 * Since this page came from the
1663 				 * cachelist, we must destroy the
1664 				 * old vnode association.
1665 				 */
1666 				page_hashout(npp, (kmutex_t *)NULL);
1667 			}
1668 		}
1669 
1670 		/*
1671 		 * We own this page!
1672 		 */
1673 		ASSERT(PAGE_EXCL(npp));
1674 		ASSERT(npp->p_vnode == NULL);
1675 		ASSERT(!hat_page_is_mapped(npp));
1676 		PP_CLRFREE(npp);
1677 		PP_CLRAGED(npp);
1678 
1679 		/*
1680 		 * Here we have a page in our hot little mitts and are
1681 		 * just waiting to stuff it on the appropriate lists.
1682 		 * Get the mutex and check to see if it really does
1683 		 * not exist.
1684 		 */
1685 		phm = PAGE_HASH_MUTEX(index);
1686 		mutex_enter(phm);
1687 		PAGE_HASH_SEARCH(index, pp, vp, off);
1688 		if (pp == NULL) {
1689 			VM_STAT_ADD(page_create_new);
1690 			pp = npp;
1691 			npp = NULL;
1692 			if (!page_hashin(pp, vp, off, phm)) {
1693 				/*
1694 				 * Since we hold the page hash mutex and
1695 				 * just searched for this page, page_hashin
1696 				 * had better not fail.  If it does, that
1697 				 * means some thread did not follow the
1698 				 * page hash mutex rules.  Panic now and
1699 				 * get it over with.  As usual, go down
1700 				 * holding all the locks.
1701 				 */
1702 				ASSERT(MUTEX_HELD(phm));
1703 				panic("page_create: hashin fail %p %p %llx %p",
1704 				    (void *)pp, (void *)vp, off, (void *)phm);
1705 
1706 			}
1707 			ASSERT(MUTEX_HELD(phm));
1708 			mutex_exit(phm);
1709 			phm = NULL;
1710 
1711 			/*
1712 			 * Hat layer locking need not be done to set
1713 			 * the following bits since the page is not hashed
1714 			 * and was on the free list (i.e., had no mappings).
1715 			 *
1716 			 * Set the reference bit to protect
1717 			 * against immediate pageout
1718 			 *
1719 			 * XXXmh modify freelist code to set reference
1720 			 * bit so we don't have to do it here.
1721 			 */
1722 			page_set_props(pp, P_REF);
1723 		} else {
1724 			ASSERT(MUTEX_HELD(phm));
1725 			mutex_exit(phm);
1726 			phm = NULL;
1727 			/*
1728 			 * NOTE: This should not happen for pages associated
1729 			 *	 with kernel vnode 'kvp'.
1730 			 */
1731 			/* XX64 - to debug why this happens! */
1732 			ASSERT(vp != &kvp);
1733 			if (vp == &kvp)
1734 				cmn_err(CE_NOTE,
1735 				    "page_create: page not expected "
1736 				    "in hash list for kernel vnode - pp 0x%p",
1737 				    (void *)pp);
1738 			VM_STAT_ADD(page_create_exists);
1739 			goto fail;
1740 		}
1741 
1742 		/*
1743 		 * Got a page!  It is locked.  Acquire the i/o
1744 		 * lock since we are going to use the p_next and
1745 		 * p_prev fields to link the requested pages together.
1746 		 */
1747 		page_io_lock(pp);
1748 		page_add(&plist, pp);
1749 		plist = plist->p_next;
1750 		off += MMU_PAGESIZE;
1751 		vaddr += MMU_PAGESIZE;
1752 	}
1753 
1754 	check_dma(mattr, plist, pages_req);
1755 	return (plist);
1756 
1757 fail:
1758 	if (npp != NULL) {
1759 		/*
1760 		 * Did not need this page after all.
1761 		 * Put it back on the free list.
1762 		 */
1763 		VM_STAT_ADD(page_create_putbacks);
1764 		PP_SETFREE(npp);
1765 		PP_SETAGED(npp);
1766 		npp->p_offset = (u_offset_t)-1;
1767 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
1768 		page_unlock(npp);
1769 	}
1770 
1771 	/*
1772 	 * Give up the pages we already got.
1773 	 */
1774 	while (plist != NULL) {
1775 		pp = plist;
1776 		page_sub(&plist, pp);
1777 		page_io_unlock(pp);
1778 		plist_len++;
1779 		/*LINTED: constant in conditional ctx*/
1780 		VN_DISPOSE(pp, B_INVAL, 0, kcred);
1781 	}
1782 
1783 	/*
1784 	 * VN_DISPOSE does freemem accounting for the pages in plist
1785 	 * by calling page_free. So, we need to undo the pcf accounting
1786 	 * for only the remaining pages.
1787 	 */
1788 	VM_STAT_ADD(page_create_putbacks);
1789 	page_create_putback(pages_req - plist_len);
1790 
1791 	return (NULL);
1792 }
1793 
1794 
1795 /*
1796  * Copy the data from the physical page represented by "frompp" to
1797  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
1798  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
1799  * level and no one sleeps with an active mapping there.
1800  *
1801  * Note that the ref/mod bits in the page_t's are not affected by
1802  * this operation, hence it is up to the caller to update them appropriately.
1803  */
1804 void
1805 ppcopy(page_t *frompp, page_t *topp)
1806 {
1807 	caddr_t		pp_addr1;
1808 	caddr_t		pp_addr2;
1809 	void		*pte1;
1810 	void		*pte2;
1811 	kmutex_t	*ppaddr_mutex;
1812 
1813 	ASSERT_STACK_ALIGNED();
1814 	ASSERT(PAGE_LOCKED(frompp));
1815 	ASSERT(PAGE_LOCKED(topp));
1816 
1817 	if (kpm_enable) {
1818 		pp_addr1 = hat_kpm_page2va(frompp, 0);
1819 		pp_addr2 = hat_kpm_page2va(topp, 0);
1820 		kpreempt_disable();
1821 	} else {
1822 		/*
1823 		 * disable preemption so that the CPU we are on can't change
1824 		 */
1825 		kpreempt_disable();
1826 
1827 		pp_addr1 = CPU->cpu_caddr1;
1828 		pp_addr2 = CPU->cpu_caddr2;
1829 		pte1 = (void *)CPU->cpu_caddr1pte;
1830 		pte2 = (void *)CPU->cpu_caddr2pte;
1831 
1832 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1833 		mutex_enter(ppaddr_mutex);
1834 
1835 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
1836 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
1837 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
1838 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1839 		    HAT_LOAD_NOCONSIST);
1840 	}
1841 
1842 	if (use_sse_pagecopy)
1843 		hwblkpagecopy(pp_addr1, pp_addr2);
1844 	else
1845 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
1846 
1847 	if (!kpm_enable)
1848 		mutex_exit(ppaddr_mutex);
1849 	kpreempt_enable();
1850 }
1851 
1852 /*
1853  * Zero the physical page from off to off + len given by `pp'
1854  * without changing the reference and modified bits of the page.
1855  *
1856  * We do this using CPU private page address #2; see ppcopy() for more info.
1857  * pagezero() must not be called at interrupt level.
1858  */
1859 void
1860 pagezero(page_t *pp, uint_t off, uint_t len)
1861 {
1862 	caddr_t		pp_addr2;
1863 	void		*pte2;
1864 	kmutex_t	*ppaddr_mutex;
1865 
1866 	ASSERT_STACK_ALIGNED();
1867 	ASSERT(len <= MMU_PAGESIZE);
1868 	ASSERT(off <= MMU_PAGESIZE);
1869 	ASSERT(off + len <= MMU_PAGESIZE);
1870 	ASSERT(PAGE_LOCKED(pp));
1871 
1872 	if (kpm_enable) {
1873 		pp_addr2 = hat_kpm_page2va(pp, 0);
1874 		kpreempt_disable();
1875 	} else {
1876 		kpreempt_disable();
1877 
1878 		pp_addr2 = CPU->cpu_caddr2;
1879 		pte2 = (void *)CPU->cpu_caddr2pte;
1880 
1881 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1882 		mutex_enter(ppaddr_mutex);
1883 
1884 		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
1885 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1886 		    HAT_LOAD_NOCONSIST);
1887 	}
1888 
1889 	if (use_sse_pagezero)
1890 		hwblkclr(pp_addr2 + off, len);
1891 	else
1892 		bzero(pp_addr2 + off, len);
1893 
1894 	if (!kpm_enable)
1895 		mutex_exit(ppaddr_mutex);
1896 	kpreempt_enable();
1897 }
1898 
1899 /*
1900  * Platform-dependent page scrub call.
1901  */
1902 void
1903 pagescrub(page_t *pp, uint_t off, uint_t len)
1904 {
1905 	/*
1906 	 * For now, we rely on the fact that pagezero() will
1907 	 * always clear UEs.
1908 	 */
1909 	pagezero(pp, off, len);
1910 }
1911 
1912 /*
1913  * set up two private virtual addresses on a given CPU for use in ppcopy()
1914  */
1915 void
1916 setup_vaddr_for_ppcopy(struct cpu *cpup)
1917 {
1918 	void *addr;
1919 	void *pte;
1920 
1921 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
1922 	pte = hat_mempte_setup(addr);
1923 	cpup->cpu_caddr1 = addr;
1924 	cpup->cpu_caddr1pte = (pteptr_t)pte;
1925 
1926 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
1927 	pte = hat_mempte_setup(addr);
1928 	cpup->cpu_caddr2 = addr;
1929 	cpup->cpu_caddr2pte = (pteptr_t)pte;
1930 
1931 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
1932 }
1933 
1934 
1935 /*
1936  * Create the pageout scanner thread.  The thread starts at
1937  * 'procedure', runs in process pp, and has priority pri.
1938  */
1939 void
1940 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
1941 {
1942 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
1943 }
1944 
1945 /*
1946  * any use for this?
1947  */
1948 void
1949 post_startup_mmu_initialization(void)
1950 {}
1951 
1952 /*
1953  * Function for flushing D-cache when performing module relocations
1954  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
1955  */
1956 void
1957 dcache_flushall()
1958 {}
1959