xref: /titanic_41/usr/src/uts/i86pc/vm/vm_machdep.c (revision ba2e4443695ee6a6f420a35cd4fc3d3346d22932)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 /*
37  * UNIX machine dependent virtual memory support.
38  */
39 
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/user.h>
44 #include <sys/proc.h>
45 #include <sys/kmem.h>
46 #include <sys/vmem.h>
47 #include <sys/buf.h>
48 #include <sys/cpuvar.h>
49 #include <sys/lgrp.h>
50 #include <sys/disp.h>
51 #include <sys/vm.h>
52 #include <sys/mman.h>
53 #include <sys/vnode.h>
54 #include <sys/cred.h>
55 #include <sys/exec.h>
56 #include <sys/exechdr.h>
57 #include <sys/debug.h>
58 
59 #include <vm/hat.h>
60 #include <vm/as.h>
61 #include <vm/seg.h>
62 #include <vm/seg_kp.h>
63 #include <vm/seg_vn.h>
64 #include <vm/page.h>
65 #include <vm/seg_kmem.h>
66 #include <vm/seg_kpm.h>
67 #include <vm/vm_dep.h>
68 
69 #include <sys/cpu.h>
70 #include <sys/vm_machparam.h>
71 #include <sys/memlist.h>
72 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
73 #include <vm/hat_i86.h>
74 #include <sys/x86_archext.h>
75 #include <sys/elf_386.h>
76 #include <sys/cmn_err.h>
77 #include <sys/archsystm.h>
78 #include <sys/machsystm.h>
79 
80 #include <sys/vtrace.h>
81 #include <sys/ddidmareq.h>
82 #include <sys/promif.h>
83 #include <sys/memnode.h>
84 #include <sys/stack.h>
85 
86 uint_t vac_colors = 0;
87 
88 int largepagesupport = 0;
89 extern uint_t page_create_new;
90 extern uint_t page_create_exists;
91 extern uint_t page_create_putbacks;
92 extern uint_t page_create_putbacks;
93 extern uintptr_t eprom_kernelbase;
94 extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */
95 
96 /* 4g memory management */
97 pgcnt_t		maxmem4g;
98 pgcnt_t		freemem4g;
99 int		physmax4g;
100 int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
101 int		lotsfree4gshift = 3;
102 
103 /* 16m memory management: desired number of free pages below 16m. */
104 pgcnt_t		desfree16m = 0x380;
105 
#ifdef VM_STATS
/*
 * Event counters, only compiled in when VM_STATS is defined.  They are
 * bumped with VM_STAT_ADD(); the pgma_* members are updated by
 * page_get_mnode_anylist().
 */
struct {
	/* pga_* counters */
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;

	/* pgma_* counters */
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif	/* VM_STATS */
119 
120 uint_t mmu_page_sizes;
121 
122 /* How many page sizes the users can see */
123 uint_t mmu_exported_page_sizes;
124 
125 size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
126 /*
127  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
128  * fewer than this many pages.
129  */
130 pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
131 
132 /*
133  * Return the optimum page size for a given mapping
134  */
135 /*ARGSUSED*/
136 size_t
137 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
138 {
139 	level_t l;
140 
141 	if (remap)
142 		*remap = 0;
143 
144 	switch (maptype) {
145 
146 	case MAPPGSZ_STK:
147 	case MAPPGSZ_HEAP:
148 	case MAPPGSZ_VA:
149 		/*
150 		 * use the pages size that best fits len
151 		 */
152 		for (l = mmu.max_page_level; l > 0; --l) {
153 			if (len < LEVEL_SIZE(l))
154 				continue;
155 			break;
156 		}
157 		return (LEVEL_SIZE(l));
158 
159 	/*
160 	 * for ISM use the 1st large page size.
161 	 */
162 	case MAPPGSZ_ISM:
163 		if (mmu.max_page_level == 0)
164 			return (MMU_PAGESIZE);
165 		return (LEVEL_SIZE(1));
166 	}
167 	return (0);
168 }
169 
170 /*
171  * This can be patched via /etc/system to allow large pages
172  * to be used for mapping application and libraries text segments.
173  */
174 int	use_text_largepages = 0;
175 
176 /*
177  * Return a bit vector of large page size codes that
178  * can be used to map [addr, addr + len) region.
179  */
180 
181 /*ARGSUSED*/
182 uint_t
183 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
184 {
185 	size_t	pgsz;
186 	caddr_t a;
187 
188 	if (!text || !use_text_largepages ||
189 	    mmu.max_page_level == 0)
190 		return (0);
191 
192 	pgsz = LEVEL_SIZE(1);
193 	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
194 	if (a < addr || a >= addr + len) {
195 		return (0);
196 	}
197 	len -= (a - addr);
198 	if (len < pgsz) {
199 		return (0);
200 	}
201 	return (1 << 1);
202 }
203 
204 /*
205  * Handle a pagefault.
206  */
207 faultcode_t
208 pagefault(
209 	caddr_t addr,
210 	enum fault_type type,
211 	enum seg_rw rw,
212 	int iskernel)
213 {
214 	struct as *as;
215 	struct hat *hat;
216 	struct proc *p;
217 	kthread_t *t;
218 	faultcode_t res;
219 	caddr_t base;
220 	size_t len;
221 	int err;
222 	int mapped_red;
223 	uintptr_t ea;
224 
225 	ASSERT_STACK_ALIGNED();
226 
227 	if (INVALID_VADDR(addr))
228 		return (FC_NOMAP);
229 
230 	mapped_red = segkp_map_red();
231 
232 	if (iskernel) {
233 		as = &kas;
234 		hat = as->a_hat;
235 	} else {
236 		t = curthread;
237 		p = ttoproc(t);
238 		as = p->p_as;
239 		hat = as->a_hat;
240 	}
241 
242 	/*
243 	 * Dispatch pagefault.
244 	 */
245 	res = as_fault(hat, as, addr, 1, type, rw);
246 
247 	/*
248 	 * If this isn't a potential unmapped hole in the user's
249 	 * UNIX data or stack segments, just return status info.
250 	 */
251 	if (res != FC_NOMAP || iskernel)
252 		goto out;
253 
254 	/*
255 	 * Check to see if we happened to faulted on a currently unmapped
256 	 * part of the UNIX data or stack segments.  If so, create a zfod
257 	 * mapping there and then try calling the fault routine again.
258 	 */
259 	base = p->p_brkbase;
260 	len = p->p_brksize;
261 
262 	if (addr < base || addr >= base + len) {		/* data seg? */
263 		base = (caddr_t)p->p_usrstack - p->p_stksize;
264 		len = p->p_stksize;
265 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
266 			/* not in either UNIX data or stack segments */
267 			res = FC_NOMAP;
268 			goto out;
269 		}
270 	}
271 
272 	/*
273 	 * the rest of this function implements a 3.X 4.X 5.X compatibility
274 	 * This code is probably not needed anymore
275 	 */
276 	if (p->p_model == DATAMODEL_ILP32) {
277 
278 		/* expand the gap to the page boundaries on each side */
279 		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
280 		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
281 		len = ea - (uintptr_t)base;
282 
283 		as_rangelock(as);
284 		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
285 		    0) {
286 			err = as_map(as, base, len, segvn_create, zfod_argsp);
287 			as_rangeunlock(as);
288 			if (err) {
289 				res = FC_MAKE_ERR(err);
290 				goto out;
291 			}
292 		} else {
293 			/*
294 			 * This page is already mapped by another thread after
295 			 * we returned from as_fault() above.  We just fall
296 			 * through as_fault() below.
297 			 */
298 			as_rangeunlock(as);
299 		}
300 
301 		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
302 	}
303 
304 out:
305 	if (mapped_red)
306 		segkp_unmap_red();
307 
308 	return (res);
309 }
310 
311 void
312 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
313 {
314 	struct proc *p = curproc;
315 	caddr_t userlimit = (flags & _MAP_LOW32) ?
316 	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
317 
318 	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
319 }
320 
321 /*ARGSUSED*/
322 int
323 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
324 {
325 	return (0);
326 }
327 
328 /*
329  * map_addr_proc() is the routine called when the system is to
330  * choose an address for the user.  We will pick an address
331  * range which is the highest available below kernelbase.
332  *
333  * addrp is a value/result parameter.
334  *	On input it is a hint from the user to be used in a completely
335  *	machine dependent fashion.  We decide to completely ignore this hint.
336  *
337  *	On output it is NULL if no address can be found in the current
338  *	processes address space or else an address that is currently
339  *	not mapped for len bytes with a page of red zone on either side.
340  *
341  *	align is not needed on x86 (it's for viturally addressed caches)
342  */
343 /*ARGSUSED*/
344 void
345 map_addr_proc(
346 	caddr_t *addrp,
347 	size_t len,
348 	offset_t off,
349 	int vacalign,
350 	caddr_t userlimit,
351 	struct proc *p,
352 	uint_t flags)
353 {
354 	struct as *as = p->p_as;
355 	caddr_t addr;
356 	caddr_t base;
357 	size_t slen;
358 	size_t align_amount;
359 
360 	ASSERT32(userlimit == as->a_userlimit);
361 
362 	base = p->p_brkbase;
363 #if defined(__amd64)
364 	/*
365 	 * XX64 Yes, this needs more work.
366 	 */
367 	if (p->p_model == DATAMODEL_NATIVE) {
368 		if (userlimit < as->a_userlimit) {
369 			/*
370 			 * This happens when a program wants to map
371 			 * something in a range that's accessible to a
372 			 * program in a smaller address space.  For example,
373 			 * a 64-bit program calling mmap32(2) to guarantee
374 			 * that the returned address is below 4Gbytes.
375 			 */
376 			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
377 
378 			if (userlimit > base)
379 				slen = userlimit - base;
380 			else {
381 				*addrp = NULL;
382 				return;
383 			}
384 		} else {
385 			/*
386 			 * XX64 This layout is probably wrong .. but in
387 			 * the event we make the amd64 address space look
388 			 * like sparcv9 i.e. with the stack -above- the
389 			 * heap, this bit of code might even be correct.
390 			 */
391 			slen = p->p_usrstack - base -
392 			    (((size_t)rctl_enforced_value(
393 			    rctlproc_legacy[RLIMIT_STACK],
394 			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
395 		}
396 	} else
397 #endif
398 		slen = userlimit - base;
399 
400 	len = (len + PAGEOFFSET) & PAGEMASK;
401 
402 	/*
403 	 * Redzone for each side of the request. This is done to leave
404 	 * one page unmapped between segments. This is not required, but
405 	 * it's useful for the user because if their program strays across
406 	 * a segment boundary, it will catch a fault immediately making
407 	 * debugging a little easier.
408 	 */
409 	len += 2 * MMU_PAGESIZE;
410 
411 	/*
412 	 * figure out what the alignment should be
413 	 *
414 	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
415 	 */
416 	if (len <= ELF_386_MAXPGSZ) {
417 		/*
418 		 * Align virtual addresses to ensure that ELF shared libraries
419 		 * are mapped with the appropriate alignment constraints by
420 		 * the run-time linker.
421 		 */
422 		align_amount = ELF_386_MAXPGSZ;
423 	} else {
424 		int l = mmu.max_page_level;
425 
426 		while (l && len < LEVEL_SIZE(l))
427 			--l;
428 
429 		align_amount = LEVEL_SIZE(l);
430 	}
431 
432 	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
433 		align_amount = (uintptr_t)*addrp;
434 
435 	len += align_amount;
436 
437 	/*
438 	 * Look for a large enough hole starting below userlimit.
439 	 * After finding it, use the upper part.  Addition of PAGESIZE
440 	 * is for the redzone as described above.
441 	 */
442 	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
443 		caddr_t as_addr;
444 
445 		addr = base + slen - len + MMU_PAGESIZE;
446 		as_addr = addr;
447 		/*
448 		 * Round address DOWN to the alignment amount,
449 		 * add the offset, and if this address is less
450 		 * than the original address, add alignment amount.
451 		 */
452 		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
453 		addr += (uintptr_t)(off & (align_amount - 1));
454 		if (addr < as_addr)
455 			addr += align_amount;
456 
457 		ASSERT(addr <= (as_addr + align_amount));
458 		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
459 		    ((uintptr_t)(off & (align_amount - 1))));
460 		*addrp = addr;
461 	} else {
462 		*addrp = NULL;	/* no more virtual space */
463 	}
464 }
465 
466 /*
467  * Determine whether [base, base+len] contains a valid range of
468  * addresses at least minlen long. base and len are adjusted if
469  * required to provide a valid range.
470  */
471 /*ARGSUSED3*/
472 int
473 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
474 {
475 	uintptr_t hi, lo;
476 
477 	lo = (uintptr_t)*basep;
478 	hi = lo + *lenp;
479 
480 	/*
481 	 * If hi rolled over the top, try cutting back.
482 	 */
483 	if (hi < lo) {
484 		if (0 - lo + hi < minlen)
485 			return (0);
486 		if (0 - lo < minlen)
487 			return (0);
488 		*lenp = 0 - lo;
489 	} else if (hi - lo < minlen) {
490 		return (0);
491 	}
492 #if defined(__amd64)
493 	/*
494 	 * Deal with a possible hole in the address range between
495 	 * hole_start and hole_end that should never be mapped.
496 	 */
497 	if (lo < hole_start) {
498 		if (hi > hole_start) {
499 			if (hi < hole_end) {
500 				hi = hole_start;
501 			} else {
502 				/* lo < hole_start && hi >= hole_end */
503 				if (dir == AH_LO) {
504 					/*
505 					 * prefer lowest range
506 					 */
507 					if (hole_start - lo >= minlen)
508 						hi = hole_start;
509 					else if (hi - hole_end >= minlen)
510 						lo = hole_end;
511 					else
512 						return (0);
513 				} else {
514 					/*
515 					 * prefer highest range
516 					 */
517 					if (hi - hole_end >= minlen)
518 						lo = hole_end;
519 					else if (hole_start - lo >= minlen)
520 						hi = hole_start;
521 					else
522 						return (0);
523 				}
524 			}
525 		}
526 	} else {
527 		/* lo >= hole_start */
528 		if (hi < hole_end)
529 			return (0);
530 		if (lo < hole_end)
531 			lo = hole_end;
532 	}
533 
534 	if (hi - lo < minlen)
535 		return (0);
536 
537 	*basep = (caddr_t)lo;
538 	*lenp = hi - lo;
539 #endif
540 	return (1);
541 }
542 
543 /*
544  * Determine whether [addr, addr+len] are valid user addresses.
545  */
546 /*ARGSUSED*/
547 int
548 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
549     caddr_t userlimit)
550 {
551 	caddr_t eaddr = addr + len;
552 
553 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
554 		return (RANGE_BADADDR);
555 
556 #if defined(__amd64)
557 	/*
558 	 * Check for the VA hole
559 	 */
560 	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
561 		return (RANGE_BADADDR);
562 #endif
563 
564 	return (RANGE_OKAY);
565 }
566 
567 /*
568  * Return 1 if the page frame is onboard memory, else 0.
569  */
570 int
571 pf_is_memory(pfn_t pf)
572 {
573 	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
574 }
575 
576 
577 /*
578  * initialized by page_coloring_init().
579  */
580 uint_t	page_colors;
581 uint_t	page_colors_mask;
582 uint_t	page_coloring_shift;
583 int	cpu_page_colors;
584 static uint_t	l2_colors;
585 
586 /*
587  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
588  * and page_colors are calculated from the l2 cache n-way set size.  Within a
589  * mnode range, the page freelist and cachelist are hashed into bins based on
590  * color. This makes it easier to search for a page within a specific memory
591  * range.
592  */
593 #define	PAGE_COLORS_MIN	16
594 
595 page_t ****page_freelists;
596 page_t ***page_cachelists;
597 
598 /*
599  * As the PC architecture evolved memory up was clumped into several
600  * ranges for various historical I/O devices to do DMA.
601  * < 16Meg - ISA bus
602  * < 2Gig - ???
603  * < 4Gig - PCI bus or drivers that don't understand PAE mode
604  */
605 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
606     0x100000,	/* pfn range for 4G and above */
607     0x80000,	/* pfn range for 2G-4G */
608     0x01000,	/* pfn range for 16M-2G */
609     0x00000,	/* pfn range for 0-16M */
610 };
611 
612 /*
613  * These are changed during startup if the machine has limited memory.
614  */
615 pfn_t *memranges = &arch_memranges[0];
616 int nranges = NUM_MEM_RANGES;
617 
618 /*
619  * Used by page layer to know about page sizes
620  */
621 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
622 
623 /*
624  * This can be patched via /etc/system to allow old non-PAE aware device
625  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
626  */
627 #if defined(__i386)
628 int restricted_kmemalloc = 0;
629 #elif defined(__amd64)
630 int restricted_kmemalloc = 0;
631 #endif
632 
633 kmutex_t	*fpc_mutex[NPC_MUTEX];
634 kmutex_t	*cpc_mutex[NPC_MUTEX];
635 
636 
637 /*
638  * return the memrange containing pfn
639  */
640 int
641 memrange_num(pfn_t pfn)
642 {
643 	int n;
644 
645 	for (n = 0; n < nranges - 1; ++n) {
646 		if (pfn >= memranges[n])
647 			break;
648 	}
649 	return (n);
650 }
651 
652 /*
653  * return the mnoderange containing pfn
654  */
655 int
656 pfn_2_mtype(pfn_t pfn)
657 {
658 	int	n;
659 
660 	for (n = mnoderangecnt - 1; n >= 0; n--) {
661 		if (pfn >= mnoderanges[n].mnr_pfnlo) {
662 			break;
663 		}
664 	}
665 	return (n);
666 }
667 
668 /*
669  * is_contigpage_free:
670  *	returns a page list of contiguous pages. It minimally has to return
671  *	minctg pages. Caller determines minctg based on the scatter-gather
672  *	list length.
673  *
674  *	pfnp is set to the next page frame to search on return.
675  */
676 static page_t *
677 is_contigpage_free(
678 	pfn_t *pfnp,
679 	pgcnt_t *pgcnt,
680 	pgcnt_t minctg,
681 	uint64_t pfnseg,
682 	int iolock)
683 {
684 	int	i = 0;
685 	pfn_t	pfn = *pfnp;
686 	page_t	*pp;
687 	page_t	*plist = NULL;
688 
689 	/*
690 	 * fail if pfn + minctg crosses a segment boundary.
691 	 * Adjust for next starting pfn to begin at segment boundary.
692 	 */
693 
694 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
695 		*pfnp = roundup(*pfnp, pfnseg + 1);
696 		return (NULL);
697 	}
698 
699 	do {
700 retry:
701 		pp = page_numtopp_nolock(pfn + i);
702 		if ((pp == NULL) ||
703 		    (page_trylock(pp, SE_EXCL) == 0)) {
704 			(*pfnp)++;
705 			break;
706 		}
707 		if (page_pptonum(pp) != pfn + i) {
708 			page_unlock(pp);
709 			goto retry;
710 		}
711 
712 		if (!(PP_ISFREE(pp))) {
713 			page_unlock(pp);
714 			(*pfnp)++;
715 			break;
716 		}
717 
718 		if (!PP_ISAGED(pp)) {
719 			page_list_sub(pp, PG_CACHE_LIST);
720 			page_hashout(pp, (kmutex_t *)NULL);
721 		} else {
722 			page_list_sub(pp, PG_FREE_LIST);
723 		}
724 
725 		if (iolock)
726 			page_io_lock(pp);
727 		page_list_concat(&plist, &pp);
728 
729 		/*
730 		 * exit loop when pgcnt satisfied or segment boundary reached.
731 		 */
732 
733 	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
734 
735 	*pfnp += i;		/* set to next pfn to search */
736 
737 	if (i >= minctg) {
738 		*pgcnt -= i;
739 		return (plist);
740 	}
741 
742 	/*
743 	 * failure: minctg not satisfied.
744 	 *
745 	 * if next request crosses segment boundary, set next pfn
746 	 * to search from the segment boundary.
747 	 */
748 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
749 		*pfnp = roundup(*pfnp, pfnseg + 1);
750 
751 	/* clean up any pages already allocated */
752 
753 	while (plist) {
754 		pp = plist;
755 		page_sub(&plist, pp);
756 		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
757 		if (iolock)
758 			page_io_unlock(pp);
759 		page_unlock(pp);
760 	}
761 
762 	return (NULL);
763 }
764 
/*
 * verify that pages being returned from allocator have correct DMA attribute
 *
 * On DEBUG kernels this walks cnt pages along p_next starting at pp and
 * panics if a page's byte address is below dma_attr_addr_lo or not below
 * dma_attr_addr_hi.  A NULL dma_attr disables the check.  On non-DEBUG
 * kernels the call compiles away.
 */
#ifdef DEBUG
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	uint64_t paddr;

	if (dma_attr == NULL)
		return;

	for (; cnt > 0; cnt--, pp = pp->p_next) {
		paddr = mmu_ptob((uint64_t)pp->p_pagenum);

		if (paddr < dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (paddr >= dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
	}
}
#else
#define	check_dma(a, b, c) (0)
#endif
788 
789 static kmutex_t	contig_lock;
790 
791 #define	CONTIG_LOCK()	mutex_enter(&contig_lock);
792 #define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);
793 
794 #define	PFN_16M		(mmu_btop((uint64_t)0x1000000))
795 
796 static page_t *
797 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
798 {
799 	pfn_t		pfn;
800 	int		sgllen;
801 	uint64_t	pfnseg;
802 	pgcnt_t		minctg;
803 	page_t		*pplist = NULL, *plist;
804 	uint64_t	lo, hi;
805 	pgcnt_t		pfnalign = 0;
806 	static pfn_t	startpfn;
807 	static pgcnt_t	lastctgcnt;
808 	uintptr_t	align;
809 
810 	CONTIG_LOCK();
811 
812 	if (mattr) {
813 		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
814 		hi = mmu_btop(mattr->dma_attr_addr_hi);
815 		if (hi >= physmax)
816 			hi = physmax - 1;
817 		sgllen = mattr->dma_attr_sgllen;
818 		pfnseg = mmu_btop(mattr->dma_attr_seg);
819 
820 		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
821 		if (align > MMU_PAGESIZE)
822 			pfnalign = mmu_btop(align);
823 
824 		/*
825 		 * in order to satisfy the request, must minimally
826 		 * acquire minctg contiguous pages
827 		 */
828 		minctg = howmany(*pgcnt, sgllen);
829 
830 		ASSERT(hi >= lo);
831 
832 		/*
833 		 * start from where last searched if the minctg >= lastctgcnt
834 		 */
835 		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
836 			startpfn = lo;
837 	} else {
838 		hi = physmax - 1;
839 		lo = 0;
840 		sgllen = 1;
841 		pfnseg = mmu.highest_pfn;
842 		minctg = *pgcnt;
843 
844 		if (minctg < lastctgcnt)
845 			startpfn = lo;
846 	}
847 	lastctgcnt = minctg;
848 
849 	ASSERT(pfnseg + 1 >= (uint64_t)minctg);
850 
851 	/* conserve 16m memory - start search above 16m when possible */
852 	if (hi > PFN_16M && startpfn < PFN_16M)
853 		startpfn = PFN_16M;
854 
855 	pfn = startpfn;
856 	if (pfnalign)
857 		pfn = P2ROUNDUP(pfn, pfnalign);
858 
859 	while (pfn + minctg - 1 <= hi) {
860 
861 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
862 		if (plist) {
863 			page_list_concat(&pplist, &plist);
864 			sgllen--;
865 			/*
866 			 * return when contig pages no longer needed
867 			 */
868 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
869 				startpfn = pfn;
870 				CONTIG_UNLOCK();
871 				check_dma(mattr, pplist, *pgcnt);
872 				return (pplist);
873 			}
874 			minctg = howmany(*pgcnt, sgllen);
875 		}
876 		if (pfnalign)
877 			pfn = P2ROUNDUP(pfn, pfnalign);
878 	}
879 
880 	/* cannot find contig pages in specified range */
881 	if (startpfn == lo) {
882 		CONTIG_UNLOCK();
883 		return (NULL);
884 	}
885 
886 	/* did not start with lo previously */
887 	pfn = lo;
888 	if (pfnalign)
889 		pfn = P2ROUNDUP(pfn, pfnalign);
890 
891 	/* allow search to go above startpfn */
892 	while (pfn < startpfn) {
893 
894 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
895 		if (plist != NULL) {
896 
897 			page_list_concat(&pplist, &plist);
898 			sgllen--;
899 
900 			/*
901 			 * return when contig pages no longer needed
902 			 */
903 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
904 				startpfn = pfn;
905 				CONTIG_UNLOCK();
906 				check_dma(mattr, pplist, *pgcnt);
907 				return (pplist);
908 			}
909 			minctg = howmany(*pgcnt, sgllen);
910 		}
911 		if (pfnalign)
912 			pfn = P2ROUNDUP(pfn, pfnalign);
913 	}
914 	CONTIG_UNLOCK();
915 	return (NULL);
916 }
917 
918 /*
919  * combine mem_node_config and memrange memory ranges into one data
920  * structure to be used for page list management.
921  *
922  * mnode_range_cnt() calculates the number of memory ranges for mnode and
923  * memranges[]. Used to determine the size of page lists and mnoderanges.
924  *
925  * mnode_range_setup() initializes mnoderanges.
926  */
927 mnoderange_t	*mnoderanges;
928 int		mnoderangecnt;
929 int		mtype4g;
930 
931 int
932 mnode_range_cnt()
933 {
934 	int	mri;
935 	int	mnrcnt = 0;
936 	int	mnode;
937 
938 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
939 		if (mem_node_config[mnode].exists == 0)
940 			continue;
941 
942 		mri = nranges - 1;
943 
944 		/* find the memranges index below contained in mnode range */
945 
946 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
947 			mri--;
948 
949 		/*
950 		 * increment mnode range counter when memranges or mnode
951 		 * boundary is reached.
952 		 */
953 		while (mri >= 0 &&
954 		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
955 			mnrcnt++;
956 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
957 				mri--;
958 			else
959 				break;
960 		}
961 	}
962 	return (mnrcnt);
963 }
964 
965 void
966 mnode_range_setup(mnoderange_t *mnoderanges)
967 {
968 	int	mnode, mri;
969 
970 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
971 		if (mem_node_config[mnode].exists == 0)
972 			continue;
973 
974 		mri = nranges - 1;
975 
976 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
977 			mri--;
978 
979 		while (mri >= 0 && mem_node_config[mnode].physmax >=
980 		    MEMRANGELO(mri)) {
981 			mnoderanges->mnr_pfnlo =
982 			    MAX(MEMRANGELO(mri),
983 				mem_node_config[mnode].physbase);
984 			mnoderanges->mnr_pfnhi =
985 			    MIN(MEMRANGEHI(mri),
986 				mem_node_config[mnode].physmax);
987 			mnoderanges->mnr_mnode = mnode;
988 			mnoderanges->mnr_memrange = mri;
989 			mnoderanges++;
990 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
991 				mri--;
992 			else
993 				break;
994 		}
995 	}
996 }
997 
998 /*
999  * Determine if the mnode range specified in mtype contains memory belonging
1000  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1001  * the range of indices from high pfn to 0, 16m or 4g.
1002  *
1003  * Return first mnode range type index found otherwise return -1 if none found.
1004  */
1005 int
1006 mtype_func(int mnode, int mtype, uint_t flags)
1007 {
1008 	if (flags & PGI_MT_RANGE) {
1009 		int	mtlim;
1010 
1011 		if (flags & PGI_MT_NEXT)
1012 			mtype--;
1013 		if (flags & PGI_MT_RANGE0)
1014 			mtlim = 0;
1015 		else if (flags & PGI_MT_RANGE4G)
1016 			mtlim = mtype4g + 1;	/* exclude 0-4g range */
1017 		else if (flags & PGI_MT_RANGE16M)
1018 			mtlim = 1;		/* exclude 0-16m range */
1019 		while (mtype >= mtlim) {
1020 			if (mnoderanges[mtype].mnr_mnode == mnode)
1021 				return (mtype);
1022 			mtype--;
1023 		}
1024 	} else {
1025 		if (mnoderanges[mtype].mnr_mnode == mnode)
1026 			return (mtype);
1027 	}
1028 	return (-1);
1029 }
1030 
1031 /*
1032  * Update the page list max counts with the pfn range specified by the
1033  * input parameters.  Called from add_physmem() when physical memory with
1034  * page_t's are initially added to the page lists.
1035  */
1036 void
1037 mtype_modify_max(pfn_t startpfn, long cnt)
1038 {
1039 	int	mtype = 0;
1040 	pfn_t	endpfn = startpfn + cnt, pfn;
1041 	pgcnt_t	inc;
1042 
1043 	ASSERT(cnt > 0);
1044 
1045 	for (pfn = startpfn; pfn < endpfn; ) {
1046 		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
1047 			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
1048 				inc = endpfn - pfn;
1049 			} else {
1050 				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
1051 			}
1052 			mnoderanges[mtype].mnr_mt_pgmax += inc;
1053 			if (physmax4g && mtype <= mtype4g)
1054 				maxmem4g += inc;
1055 			pfn += inc;
1056 		}
1057 		mtype++;
1058 		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
1059 	}
1060 }
1061 
1062 /*
1063  * Returns the free page count for mnode
1064  */
1065 int
1066 mnode_pgcnt(int mnode)
1067 {
1068 	int	mtype = mnoderangecnt - 1;
1069 	int	flags = PGI_MT_RANGE0;
1070 	pgcnt_t	pgcnt = 0;
1071 
1072 	mtype = mtype_func(mnode, mtype, flags);
1073 
1074 	while (mtype != -1) {
1075 		pgcnt += MTYPE_FREEMEM(mtype);
1076 		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1077 	}
1078 	return (pgcnt);
1079 }
1080 
1081 /*
1082  * Initialize page coloring variables based on the l2 cache parameters.
1083  * Calculate and return memory needed for page coloring data structures.
1084  */
1085 size_t
1086 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1087 {
1088 	size_t	colorsz = 0;
1089 	int	i;
1090 	int	colors;
1091 
1092 	/*
1093 	 * Reduce the memory ranges lists if we don't have large amounts
1094 	 * of memory. This avoids searching known empty free lists.
1095 	 */
1096 	i = memrange_num(physmax);
1097 	memranges += i;
1098 	nranges -= i;
1099 #if defined(__i386)
1100 	if (i > 0)
1101 		restricted_kmemalloc = 0;
1102 #endif
1103 	/* physmax greater than 4g */
1104 	if (i == 0)
1105 		physmax4g = 1;
1106 
1107 	/*
1108 	 * setup pagesize for generic page layer
1109 	 */
1110 	for (i = 0; i <= mmu.max_page_level; ++i) {
1111 		hw_page_array[i].hp_size = LEVEL_SIZE(i);
1112 		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1113 		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1114 	}
1115 
1116 	ASSERT(ISP2(l2_sz));
1117 	ASSERT(ISP2(l2_linesz));
1118 	ASSERT(l2_sz > MMU_PAGESIZE);
1119 
1120 	/* l2_assoc is 0 for fully associative l2 cache */
1121 	if (l2_assoc)
1122 		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1123 	else
1124 		l2_colors = 1;
1125 
1126 	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
1127 	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1128 
1129 	/*
1130 	 * cpu_page_colors is non-zero when a page color may be spread across
1131 	 * multiple bins.
1132 	 */
1133 	if (l2_colors < page_colors)
1134 		cpu_page_colors = l2_colors;
1135 
1136 	ASSERT(ISP2(page_colors));
1137 
1138 	page_colors_mask = page_colors - 1;
1139 
1140 	ASSERT(ISP2(CPUSETSIZE()));
1141 	page_coloring_shift = lowbit(CPUSETSIZE());
1142 
1143 	/* size for mnoderanges */
1144 	mnoderangecnt = mnode_range_cnt();
1145 	colorsz = mnoderangecnt * sizeof (mnoderange_t);
1146 
1147 	/* size for fpc_mutex and cpc_mutex */
1148 	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1149 
1150 	/* size of page_freelists */
1151 	colorsz += mnoderangecnt * sizeof (page_t ***);
1152 	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1153 
1154 	for (i = 0; i < mmu_page_sizes; i++) {
1155 		colors = page_get_pagecolors(i);
1156 		colorsz += mnoderangecnt * colors * sizeof (page_t *);
1157 	}
1158 
1159 	/* size of page_cachelists */
1160 	colorsz += mnoderangecnt * sizeof (page_t **);
1161 	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1162 
1163 	return (colorsz);
1164 }
1165 
1166 /*
1167  * Called once at startup to configure page_coloring data structures and
1168  * does the 1st page_free()/page_freelist_add().
1169  */
1170 void
1171 page_coloring_setup(caddr_t pcmemaddr)
1172 {
1173 	int	i;
1174 	int	j;
1175 	int	k;
1176 	caddr_t	addr;
1177 	int	colors;
1178 
1179 	/*
1180 	 * do page coloring setup
1181 	 */
1182 	addr = pcmemaddr;
1183 
1184 	mnoderanges = (mnoderange_t *)addr;
1185 	addr += (mnoderangecnt * sizeof (mnoderange_t));
1186 
1187 	mnode_range_setup(mnoderanges);
1188 
1189 	if (physmax4g)
1190 		mtype4g = pfn_2_mtype(0xfffff);
1191 
1192 	for (k = 0; k < NPC_MUTEX; k++) {
1193 		fpc_mutex[k] = (kmutex_t *)addr;
1194 		addr += (max_mem_nodes * sizeof (kmutex_t));
1195 	}
1196 	for (k = 0; k < NPC_MUTEX; k++) {
1197 		cpc_mutex[k] = (kmutex_t *)addr;
1198 		addr += (max_mem_nodes * sizeof (kmutex_t));
1199 	}
1200 	page_freelists = (page_t ****)addr;
1201 	addr += (mnoderangecnt * sizeof (page_t ***));
1202 
1203 	page_cachelists = (page_t ***)addr;
1204 	addr += (mnoderangecnt * sizeof (page_t **));
1205 
1206 	for (i = 0; i < mnoderangecnt; i++) {
1207 		page_freelists[i] = (page_t ***)addr;
1208 		addr += (mmu_page_sizes * sizeof (page_t **));
1209 
1210 		for (j = 0; j < mmu_page_sizes; j++) {
1211 			colors = page_get_pagecolors(j);
1212 			page_freelists[i][j] = (page_t **)addr;
1213 			addr += (colors * sizeof (page_t *));
1214 		}
1215 		page_cachelists[i] = (page_t **)addr;
1216 		addr += (page_colors * sizeof (page_t *));
1217 	}
1218 }
1219 
/*
 * Report the color for a buf.  This implementation ignores 'bp' and
 * always answers 0.
 */
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	const int	color = 0;

	return (color);
}
1226 
1227 /*
1228  * get a page from any list with the given mnode
1229  */
1230 page_t *
1231 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
1232     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
1233 {
1234 	kmutex_t	*pcm;
1235 	int		i;
1236 	page_t		*pp;
1237 	page_t		*first_pp;
1238 	uint64_t	pgaddr;
1239 	ulong_t		bin;
1240 	int		mtypestart;
1241 
1242 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
1243 
1244 	ASSERT((flags & PG_MATCH_COLOR) == 0);
1245 	ASSERT(szc == 0);
1246 	ASSERT(dma_attr != NULL);
1247 
1248 
1249 	MTYPE_START(mnode, mtype, flags);
1250 	if (mtype < 0) {
1251 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
1252 		return (NULL);
1253 	}
1254 
1255 	mtypestart = mtype;
1256 
1257 	bin = origbin;
1258 
1259 	/*
1260 	 * check up to page_colors + 1 bins - origbin may be checked twice
1261 	 * because of BIN_STEP skip
1262 	 */
1263 	do {
1264 		i = 0;
1265 		while (i <= page_colors) {
1266 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
1267 				goto nextfreebin;
1268 
1269 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1270 			mutex_enter(pcm);
1271 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
1272 			first_pp = pp;
1273 			while (pp != NULL) {
1274 				if (page_trylock(pp, SE_EXCL) == 0) {
1275 					pp = pp->p_next;
1276 					if (pp == first_pp) {
1277 						pp = NULL;
1278 					}
1279 					continue;
1280 				}
1281 
1282 				ASSERT(PP_ISFREE(pp));
1283 				ASSERT(PP_ISAGED(pp));
1284 				ASSERT(pp->p_vnode == NULL);
1285 				ASSERT(pp->p_hash == NULL);
1286 				ASSERT(pp->p_offset == (u_offset_t)-1);
1287 				ASSERT(pp->p_szc == szc);
1288 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1289 				/* check if page within DMA attributes */
1290 				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));
1291 
1292 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1293 				    (pgaddr + MMU_PAGESIZE - 1 <=
1294 				    dma_attr->dma_attr_addr_hi)) {
1295 					break;
1296 				}
1297 
1298 				/* continue looking */
1299 				page_unlock(pp);
1300 				pp = pp->p_next;
1301 				if (pp == first_pp)
1302 					pp = NULL;
1303 
1304 			}
1305 			if (pp != NULL) {
1306 				ASSERT(mtype == PP_2_MTYPE(pp));
1307 				ASSERT(pp->p_szc == 0);
1308 
1309 				/* found a page with specified DMA attributes */
1310 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
1311 				    mtype), pp);
1312 				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1313 
1314 				if ((PP_ISFREE(pp) == 0) ||
1315 				    (PP_ISAGED(pp) == 0)) {
1316 					cmn_err(CE_PANIC, "page %p is not free",
1317 					    (void *)pp);
1318 				}
1319 
1320 				mutex_exit(pcm);
1321 				check_dma(dma_attr, pp, 1);
1322 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1323 				return (pp);
1324 			}
1325 			mutex_exit(pcm);
1326 nextfreebin:
1327 			pp = page_freelist_fill(szc, bin, mnode, mtype,
1328 			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
1329 			if (pp)
1330 				return (pp);
1331 
1332 			/* try next bin */
1333 			bin += (i == 0) ? BIN_STEP : 1;
1334 			bin &= page_colors_mask;
1335 			i++;
1336 		}
1337 		MTYPE_NEXT(mnode, mtype, flags);
1338 	} while (mtype >= 0);
1339 
1340 	/* failed to find a page in the freelist; try it in the cachelist */
1341 
1342 	/* reset mtype start for cachelist search */
1343 	mtype = mtypestart;
1344 	ASSERT(mtype >= 0);
1345 
1346 	/* start with the bin of matching color */
1347 	bin = origbin;
1348 
1349 	do {
1350 		for (i = 0; i <= page_colors; i++) {
1351 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
1352 				goto nextcachebin;
1353 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
1354 			mutex_enter(pcm);
1355 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
1356 			first_pp = pp;
1357 			while (pp != NULL) {
1358 				if (page_trylock(pp, SE_EXCL) == 0) {
1359 					pp = pp->p_next;
1360 					if (pp == first_pp)
1361 						break;
1362 					continue;
1363 				}
1364 				ASSERT(pp->p_vnode);
1365 				ASSERT(PP_ISAGED(pp) == 0);
1366 				ASSERT(pp->p_szc == 0);
1367 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1368 
1369 				/* check if page within DMA attributes */
1370 
1371 				pgaddr = ptob((uint64_t)(pp->p_pagenum));
1372 
1373 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1374 				    (pgaddr + MMU_PAGESIZE - 1 <=
1375 				    dma_attr->dma_attr_addr_hi)) {
1376 					break;
1377 				}
1378 
1379 				/* continue looking */
1380 				page_unlock(pp);
1381 				pp = pp->p_next;
1382 				if (pp == first_pp)
1383 					pp = NULL;
1384 			}
1385 
1386 			if (pp != NULL) {
1387 				ASSERT(mtype == PP_2_MTYPE(pp));
1388 				ASSERT(pp->p_szc == 0);
1389 
1390 				/* found a page with specified DMA attributes */
1391 				page_sub(&PAGE_CACHELISTS(mnode, bin,
1392 				    mtype), pp);
1393 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
1394 
1395 				mutex_exit(pcm);
1396 				ASSERT(pp->p_vnode);
1397 				ASSERT(PP_ISAGED(pp) == 0);
1398 				check_dma(dma_attr, pp, 1);
1399 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1400 				return (pp);
1401 			}
1402 			mutex_exit(pcm);
1403 nextcachebin:
1404 			bin += (i == 0) ? BIN_STEP : 1;
1405 			bin &= page_colors_mask;
1406 		}
1407 		MTYPE_NEXT(mnode, mtype, flags);
1408 	} while (mtype >= 0);
1409 
1410 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
1411 	return (NULL);
1412 }
1413 
1414 /*
1415  * This function is similar to page_get_freelist()/page_get_cachelist()
1416  * but it searches both the lists to find a page with the specified
1417  * color (or no color) and DMA attributes. The search is done in the
1418  * freelist first and then in the cache list within the highest memory
1419  * range (based on DMA attributes) before searching in the lower
1420  * memory ranges.
1421  *
1422  * Note: This function is called only by page_create_io().
1423  */
1424 /*ARGSUSED*/
1425 page_t *
1426 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
1427     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
1428 {
1429 	uint_t		bin;
1430 	int		mtype;
1431 	page_t		*pp;
1432 	int		n;
1433 	int		m;
1434 	int		szc;
1435 	int		fullrange;
1436 	int		mnode;
1437 	int		local_failed_stat = 0;
1438 	lgrp_mnode_cookie_t	lgrp_cookie;
1439 
1440 	VM_STAT_ADD(pga_vmstats.pga_alloc);
1441 
1442 	/* only base pagesize currently supported */
1443 	if (size != MMU_PAGESIZE)
1444 		return (NULL);
1445 
1446 	/*
1447 	 * If we're passed a specific lgroup, we use it.  Otherwise,
1448 	 * assume first-touch placement is desired.
1449 	 */
1450 	if (!LGRP_EXISTS(lgrp))
1451 		lgrp = lgrp_home_lgrp();
1452 
1453 	/* LINTED */
1454 	AS_2_BIN(as, seg, vp, vaddr, bin);
1455 
1456 	/*
1457 	 * Only hold one freelist or cachelist lock at a time, that way we
1458 	 * can start anywhere and not have to worry about lock
1459 	 * ordering.
1460 	 */
1461 	if (dma_attr == NULL) {
1462 		n = 0;
1463 		m = mnoderangecnt - 1;
1464 		fullrange = 1;
1465 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
1466 	} else {
1467 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
1468 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
1469 
1470 		/*
1471 		 * We can guarantee alignment only for page boundary.
1472 		 */
1473 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
1474 			return (NULL);
1475 
1476 		n = pfn_2_mtype(pfnlo);
1477 		m = pfn_2_mtype(pfnhi);
1478 
1479 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
1480 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
1481 	}
1482 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
1483 
1484 	if (n > m)
1485 		return (NULL);
1486 
1487 	szc = 0;
1488 
1489 	/* cylcing thru mtype handled by RANGE0 if n == 0 */
1490 	if (n == 0) {
1491 		flags |= PGI_MT_RANGE0;
1492 		n = m;
1493 	}
1494 
1495 	/*
1496 	 * Try local memory node first, but try remote if we can't
1497 	 * get a page of the right color.
1498 	 */
1499 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
1500 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
1501 		/*
1502 		 * allocate pages from high pfn to low.
1503 		 */
1504 		for (mtype = m; mtype >= n; mtype--) {
1505 			if (fullrange != 0) {
1506 				pp = page_get_mnode_freelist(mnode,
1507 				    bin, mtype, szc, flags);
1508 				if (pp == NULL) {
1509 					pp = page_get_mnode_cachelist(
1510 						bin, flags, mnode, mtype);
1511 				}
1512 			} else {
1513 				pp = page_get_mnode_anylist(bin, szc,
1514 				    flags, mnode, mtype, dma_attr);
1515 			}
1516 			if (pp != NULL) {
1517 				VM_STAT_ADD(pga_vmstats.pga_allocok);
1518 				check_dma(dma_attr, pp, 1);
1519 				return (pp);
1520 			}
1521 		}
1522 		if (!local_failed_stat) {
1523 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
1524 			local_failed_stat = 1;
1525 		}
1526 	}
1527 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
1528 
1529 	return (NULL);
1530 }
1531 
1532 /*
1533  * page_create_io()
1534  *
1535  * This function is a copy of page_create_va() with an additional
1536  * argument 'mattr' that specifies DMA memory requirements to
1537  * the page list functions. This function is used by the segkmem
1538  * allocator so it is only to create new pages (i.e PG_EXCL is
1539  * set).
1540  *
1541  * Note: This interface is currently used by x86 PSM only and is
1542  *	 not fully specified so the commitment level is only for
1543  *	 private interface specific to x86. This interface uses PSM
1544  *	 specific page_get_anylist() interface.
1545  */
1546 
1547 #define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
1548 	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
1549 		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
1550 			break; \
1551 	} \
1552 }
1553 
1554 
1555 page_t *
1556 page_create_io(
1557 	struct vnode	*vp,
1558 	u_offset_t	off,
1559 	uint_t		bytes,
1560 	uint_t		flags,
1561 	struct as	*as,
1562 	caddr_t		vaddr,
1563 	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
1564 {
1565 	page_t		*plist = NULL;
1566 	uint_t		plist_len = 0;
1567 	pgcnt_t		npages;
1568 	page_t		*npp = NULL;
1569 	uint_t		pages_req;
1570 	page_t		*pp;
1571 	kmutex_t	*phm = NULL;
1572 	uint_t		index;
1573 
1574 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
1575 		"page_create_start:vp %p off %llx bytes %u flags %x",
1576 		vp, off, bytes, flags);
1577 
1578 	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
1579 
1580 	pages_req = npages = mmu_btopr(bytes);
1581 
1582 	/*
1583 	 * Do the freemem and pcf accounting.
1584 	 */
1585 	if (!page_create_wait(npages, flags)) {
1586 		return (NULL);
1587 	}
1588 
1589 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
1590 		"page_create_success:vp %p off %llx",
1591 		vp, off);
1592 
1593 	/*
1594 	 * If satisfying this request has left us with too little
1595 	 * memory, start the wheels turning to get some back.  The
1596 	 * first clause of the test prevents waking up the pageout
1597 	 * daemon in situations where it would decide that there's
1598 	 * nothing to do.
1599 	 */
1600 	if (nscan < desscan && freemem < minfree) {
1601 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
1602 			"pageout_cv_signal:freemem %ld", freemem);
1603 		cv_signal(&proc_pageout->p_cv);
1604 	}
1605 
1606 	if (flags & PG_PHYSCONTIG) {
1607 
1608 		plist = page_get_contigpage(&npages, mattr, 1);
1609 		if (plist == NULL) {
1610 			page_create_putback(npages);
1611 			return (NULL);
1612 		}
1613 
1614 		pp = plist;
1615 
1616 		do {
1617 			if (!page_hashin(pp, vp, off, NULL)) {
1618 				panic("pg_creat_io: hashin failed %p %p %llx",
1619 				    (void *)pp, (void *)vp, off);
1620 			}
1621 			VM_STAT_ADD(page_create_new);
1622 			off += MMU_PAGESIZE;
1623 			PP_CLRFREE(pp);
1624 			PP_CLRAGED(pp);
1625 			page_set_props(pp, P_REF);
1626 			pp = pp->p_next;
1627 		} while (pp != plist);
1628 
1629 		if (!npages) {
1630 			check_dma(mattr, plist, pages_req);
1631 			return (plist);
1632 		} else {
1633 			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
1634 		}
1635 
1636 		/*
1637 		 * fall-thru:
1638 		 *
1639 		 * page_get_contigpage returns when npages <= sgllen.
1640 		 * Grab the rest of the non-contig pages below from anylist.
1641 		 */
1642 	}
1643 
1644 	/*
1645 	 * Loop around collecting the requested number of pages.
1646 	 * Most of the time, we have to `create' a new page. With
1647 	 * this in mind, pull the page off the free list before
1648 	 * getting the hash lock.  This will minimize the hash
1649 	 * lock hold time, nesting, and the like.  If it turns
1650 	 * out we don't need the page, we put it back at the end.
1651 	 */
1652 	while (npages--) {
1653 		phm = NULL;
1654 
1655 		index = PAGE_HASH_FUNC(vp, off);
1656 top:
1657 		ASSERT(phm == NULL);
1658 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
1659 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1660 
1661 		if (npp == NULL) {
1662 			/*
1663 			 * Try to get the page of any color either from
1664 			 * the freelist or from the cache list.
1665 			 */
1666 			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
1667 			    flags & ~PG_MATCH_COLOR, mattr, NULL);
1668 			if (npp == NULL) {
1669 				if (mattr == NULL) {
1670 					/*
1671 					 * Not looking for a special page;
1672 					 * panic!
1673 					 */
1674 					panic("no page found %d", (int)npages);
1675 				}
1676 				/*
1677 				 * No page found! This can happen
1678 				 * if we are looking for a page
1679 				 * within a specific memory range
1680 				 * for DMA purposes. If PG_WAIT is
1681 				 * specified then we wait for a
1682 				 * while and then try again. The
1683 				 * wait could be forever if we
1684 				 * don't get the page(s) we need.
1685 				 *
1686 				 * Note: XXX We really need a mechanism
1687 				 * to wait for pages in the desired
1688 				 * range. For now, we wait for any
1689 				 * pages and see if we can use it.
1690 				 */
1691 
1692 				if ((mattr != NULL) && (flags & PG_WAIT)) {
1693 					delay(10);
1694 					goto top;
1695 				}
1696 
1697 				goto fail; /* undo accounting stuff */
1698 			}
1699 
1700 			if (PP_ISAGED(npp) == 0) {
1701 				/*
1702 				 * Since this page came from the
1703 				 * cachelist, we must destroy the
1704 				 * old vnode association.
1705 				 */
1706 				page_hashout(npp, (kmutex_t *)NULL);
1707 			}
1708 		}
1709 
1710 		/*
1711 		 * We own this page!
1712 		 */
1713 		ASSERT(PAGE_EXCL(npp));
1714 		ASSERT(npp->p_vnode == NULL);
1715 		ASSERT(!hat_page_is_mapped(npp));
1716 		PP_CLRFREE(npp);
1717 		PP_CLRAGED(npp);
1718 
1719 		/*
1720 		 * Here we have a page in our hot little mits and are
1721 		 * just waiting to stuff it on the appropriate lists.
1722 		 * Get the mutex and check to see if it really does
1723 		 * not exist.
1724 		 */
1725 		phm = PAGE_HASH_MUTEX(index);
1726 		mutex_enter(phm);
1727 		PAGE_HASH_SEARCH(index, pp, vp, off);
1728 		if (pp == NULL) {
1729 			VM_STAT_ADD(page_create_new);
1730 			pp = npp;
1731 			npp = NULL;
1732 			if (!page_hashin(pp, vp, off, phm)) {
1733 				/*
1734 				 * Since we hold the page hash mutex and
1735 				 * just searched for this page, page_hashin
1736 				 * had better not fail.  If it does, that
1737 				 * means somethread did not follow the
1738 				 * page hash mutex rules.  Panic now and
1739 				 * get it over with.  As usual, go down
1740 				 * holding all the locks.
1741 				 */
1742 				ASSERT(MUTEX_HELD(phm));
1743 				panic("page_create: hashin fail %p %p %llx %p",
1744 				    (void *)pp, (void *)vp, off, (void *)phm);
1745 
1746 			}
1747 			ASSERT(MUTEX_HELD(phm));
1748 			mutex_exit(phm);
1749 			phm = NULL;
1750 
1751 			/*
1752 			 * Hat layer locking need not be done to set
1753 			 * the following bits since the page is not hashed
1754 			 * and was on the free list (i.e., had no mappings).
1755 			 *
1756 			 * Set the reference bit to protect
1757 			 * against immediate pageout
1758 			 *
1759 			 * XXXmh modify freelist code to set reference
1760 			 * bit so we don't have to do it here.
1761 			 */
1762 			page_set_props(pp, P_REF);
1763 		} else {
1764 			ASSERT(MUTEX_HELD(phm));
1765 			mutex_exit(phm);
1766 			phm = NULL;
1767 			/*
1768 			 * NOTE: This should not happen for pages associated
1769 			 *	 with kernel vnode 'kvp'.
1770 			 */
1771 			/* XX64 - to debug why this happens! */
1772 			ASSERT(vp != &kvp);
1773 			if (vp == &kvp)
1774 				cmn_err(CE_NOTE,
1775 				    "page_create: page not expected "
1776 				    "in hash list for kernel vnode - pp 0x%p",
1777 				    (void *)pp);
1778 			VM_STAT_ADD(page_create_exists);
1779 			goto fail;
1780 		}
1781 
1782 		/*
1783 		 * Got a page!  It is locked.  Acquire the i/o
1784 		 * lock since we are going to use the p_next and
1785 		 * p_prev fields to link the requested pages together.
1786 		 */
1787 		page_io_lock(pp);
1788 		page_add(&plist, pp);
1789 		plist = plist->p_next;
1790 		off += MMU_PAGESIZE;
1791 		vaddr += MMU_PAGESIZE;
1792 	}
1793 
1794 	check_dma(mattr, plist, pages_req);
1795 	return (plist);
1796 
1797 fail:
1798 	if (npp != NULL) {
1799 		/*
1800 		 * Did not need this page after all.
1801 		 * Put it back on the free list.
1802 		 */
1803 		VM_STAT_ADD(page_create_putbacks);
1804 		PP_SETFREE(npp);
1805 		PP_SETAGED(npp);
1806 		npp->p_offset = (u_offset_t)-1;
1807 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
1808 		page_unlock(npp);
1809 	}
1810 
1811 	/*
1812 	 * Give up the pages we already got.
1813 	 */
1814 	while (plist != NULL) {
1815 		pp = plist;
1816 		page_sub(&plist, pp);
1817 		page_io_unlock(pp);
1818 		plist_len++;
1819 		/*LINTED: constant in conditional ctx*/
1820 		VN_DISPOSE(pp, B_INVAL, 0, kcred);
1821 	}
1822 
1823 	/*
1824 	 * VN_DISPOSE does freemem accounting for the pages in plist
1825 	 * by calling page_free. So, we need to undo the pcf accounting
1826 	 * for only the remaining pages.
1827 	 */
1828 	VM_STAT_ADD(page_create_putbacks);
1829 	page_create_putback(pages_req - plist_len);
1830 
1831 	return (NULL);
1832 }
1833 
1834 
1835 /*
1836  * Copy the data from the physical page represented by "frompp" to
1837  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
1838  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
1839  * level and no one sleeps with an active mapping there.
1840  *
1841  * Note that the ref/mod bits in the page_t's are not affected by
1842  * this operation, hence it is up to the caller to update them appropriately.
1843  */
1844 void
1845 ppcopy(page_t *frompp, page_t *topp)
1846 {
1847 	caddr_t		pp_addr1;
1848 	caddr_t		pp_addr2;
1849 	void		*pte1;
1850 	void		*pte2;
1851 	kmutex_t	*ppaddr_mutex;
1852 
1853 	ASSERT_STACK_ALIGNED();
1854 	ASSERT(PAGE_LOCKED(frompp));
1855 	ASSERT(PAGE_LOCKED(topp));
1856 
1857 	if (kpm_enable) {
1858 		pp_addr1 = hat_kpm_page2va(frompp, 0);
1859 		pp_addr2 = hat_kpm_page2va(topp, 0);
1860 		kpreempt_disable();
1861 	} else {
1862 		/*
1863 		 * disable pre-emption so that CPU can't change
1864 		 */
1865 		kpreempt_disable();
1866 
1867 		pp_addr1 = CPU->cpu_caddr1;
1868 		pp_addr2 = CPU->cpu_caddr2;
1869 		pte1 = (void *)CPU->cpu_caddr1pte;
1870 		pte2 = (void *)CPU->cpu_caddr2pte;
1871 
1872 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1873 		mutex_enter(ppaddr_mutex);
1874 
1875 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
1876 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
1877 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
1878 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1879 		    HAT_LOAD_NOCONSIST);
1880 	}
1881 
1882 	if (use_sse_pagecopy)
1883 		hwblkpagecopy(pp_addr1, pp_addr2);
1884 	else
1885 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
1886 
1887 	if (!kpm_enable)
1888 		mutex_exit(ppaddr_mutex);
1889 	kpreempt_enable();
1890 }
1891 
1892 /*
1893  * Zero the physical page from off to off + len given by `pp'
1894  * without changing the reference and modified bits of page.
1895  *
1896  * We use this using CPU private page address #2, see ppcopy() for more info.
1897  * pagezero() must not be called at interrupt level.
1898  */
1899 void
1900 pagezero(page_t *pp, uint_t off, uint_t len)
1901 {
1902 	caddr_t		pp_addr2;
1903 	void		*pte2;
1904 	kmutex_t	*ppaddr_mutex;
1905 
1906 	ASSERT_STACK_ALIGNED();
1907 	ASSERT(len <= MMU_PAGESIZE);
1908 	ASSERT(off <= MMU_PAGESIZE);
1909 	ASSERT(off + len <= MMU_PAGESIZE);
1910 	ASSERT(PAGE_LOCKED(pp));
1911 
1912 	if (kpm_enable) {
1913 		pp_addr2 = hat_kpm_page2va(pp, 0);
1914 		kpreempt_disable();
1915 	} else {
1916 		kpreempt_disable();
1917 
1918 		pp_addr2 = CPU->cpu_caddr2;
1919 		pte2 = (void *)CPU->cpu_caddr2pte;
1920 
1921 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1922 		mutex_enter(ppaddr_mutex);
1923 
1924 		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
1925 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1926 		    HAT_LOAD_NOCONSIST);
1927 	}
1928 
1929 	if (use_sse_pagezero)
1930 		hwblkclr(pp_addr2 + off, len);
1931 	else
1932 		bzero(pp_addr2 + off, len);
1933 
1934 	if (!kpm_enable)
1935 		mutex_exit(ppaddr_mutex);
1936 	kpreempt_enable();
1937 }
1938 
1939 /*
1940  * Platform-dependent page scrub call.
1941  */
1942 void
1943 pagescrub(page_t *pp, uint_t off, uint_t len)
1944 {
1945 	/*
1946 	 * For now, we rely on the fact that pagezero() will
1947 	 * always clear UEs.
1948 	 */
1949 	pagezero(pp, off, len);
1950 }
1951 
1952 /*
1953  * set up two private addresses for use on a given CPU for use in ppcopy()
1954  */
1955 void
1956 setup_vaddr_for_ppcopy(struct cpu *cpup)
1957 {
1958 	void *addr;
1959 	void *pte;
1960 
1961 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
1962 	pte = hat_mempte_setup(addr);
1963 	cpup->cpu_caddr1 = addr;
1964 	cpup->cpu_caddr1pte = (pteptr_t)pte;
1965 
1966 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
1967 	pte = hat_mempte_setup(addr);
1968 	cpup->cpu_caddr2 = addr;
1969 	cpup->cpu_caddr2pte = (pteptr_t)pte;
1970 
1971 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
1972 }
1973 
1974 
1975 /*
1976  * Create the pageout scanner thread. The thread has to
1977  * start at procedure with process pp and priority pri.
1978  */
1979 void
1980 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
1981 {
1982 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
1983 }
1984 
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms, so
 * this is intentionally a no-op.
 *
 * Declared with an explicit (void) parameter list so that a call
 * passing arguments is diagnosed by the compiler.
 */
void
dcache_flushall(void)
{
}
1992