xref: /illumos-gate/usr/src/uts/i86pc/vm/vm_machdep.c (revision 5cce9d40d191f7d11762f0803b81ddffaabafd3e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved   */
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 /*
38  * UNIX machine dependent virtual memory support.
39  */
40 
41 #include <sys/types.h>
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/user.h>
45 #include <sys/proc.h>
46 #include <sys/kmem.h>
47 #include <sys/vmem.h>
48 #include <sys/buf.h>
49 #include <sys/cpuvar.h>
50 #include <sys/lgrp.h>
51 #include <sys/disp.h>
52 #include <sys/vm.h>
53 #include <sys/mman.h>
54 #include <sys/vnode.h>
55 #include <sys/cred.h>
56 #include <sys/exec.h>
57 #include <sys/exechdr.h>
58 #include <sys/debug.h>
59 
60 #include <vm/hat.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_kp.h>
64 #include <vm/seg_vn.h>
65 #include <vm/page.h>
66 #include <vm/seg_kmem.h>
67 #include <vm/seg_kpm.h>
68 #include <vm/vm_dep.h>
69 
70 #include <sys/cpu.h>
71 #include <sys/vm_machparam.h>
72 #include <sys/memlist.h>
73 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
74 #include <vm/hat_i86.h>
75 #include <sys/x86_archext.h>
76 #include <sys/elf_386.h>
77 #include <sys/cmn_err.h>
78 #include <sys/archsystm.h>
79 #include <sys/machsystm.h>
80 
81 #include <sys/vtrace.h>
82 #include <sys/ddidmareq.h>
83 #include <sys/promif.h>
84 #include <sys/memnode.h>
85 #include <sys/stack.h>
86 
87 uint_t vac_colors = 0;
88 
89 int largepagesupport = 0;
90 extern uint_t page_create_new;
91 extern uint_t page_create_exists;
92 extern uint_t page_create_putbacks;
93 extern uint_t page_create_putbacks;
94 extern uintptr_t eprom_kernelbase;
95 extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */
96 
97 /* 4g memory management */
98 pgcnt_t		maxmem4g;
99 pgcnt_t		freemem4g;
100 int		physmax4g;
101 int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
102 int		lotsfree4gshift = 3;
103 
#ifdef VM_STATS
/*
 * Statistics counters, only compiled in when VM_STATS is defined.
 * They are bumped through VM_STAT_ADD(); e.g. page_get_mnode_anylist()
 * counts every call in pgma_alloc and every call that finds no usable
 * mnode range in pgma_allocempty.
 */
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif
117 
/*
 * Number of page sizes; bounds the per-page-size loops that size and lay
 * out the page freelists in page_coloring_init()/page_coloring_setup().
 */
uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
129 
130 /*
131  * Return the optimum page size for a given mapping
132  */
133 /*ARGSUSED*/
134 size_t
135 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
136 {
137 	level_t l;
138 
139 	if (remap)
140 		*remap = 0;
141 
142 	switch (maptype) {
143 
144 	case MAPPGSZ_STK:
145 	case MAPPGSZ_HEAP:
146 	case MAPPGSZ_VA:
147 		/*
148 		 * use the pages size that best fits len
149 		 */
150 		for (l = mmu.max_page_level; l > 0; --l) {
151 			if (len < LEVEL_SIZE(l))
152 				continue;
153 			break;
154 		}
155 		return (LEVEL_SIZE(l));
156 
157 	/*
158 	 * for ISM use the 1st large page size.
159 	 */
160 	case MAPPGSZ_ISM:
161 		if (mmu.max_page_level == 0)
162 			return (MMU_PAGESIZE);
163 		return (LEVEL_SIZE(1));
164 	}
165 	return (0);
166 }
167 
168 /*
169  * This can be patched via /etc/system to allow large pages
170  * to be used for mapping application and libraries text segments.
171  */
172 int	use_text_largepages = 0;
173 
174 /*
175  * Return a bit vector of large page size codes that
176  * can be used to map [addr, addr + len) region.
177  */
178 
179 /*ARGSUSED*/
180 uint_t
181 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
182 {
183 	size_t	pgsz;
184 	caddr_t a;
185 
186 	if (!text || !use_text_largepages ||
187 	    mmu.max_page_level == 0)
188 		return (0);
189 
190 	pgsz = LEVEL_SIZE(1);
191 	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
192 	if (a < addr || a >= addr + len) {
193 		return (0);
194 	}
195 	len -= (a - addr);
196 	if (len < pgsz) {
197 		return (0);
198 	}
199 	return (1 << 1);
200 }
201 
202 /*
203  * Handle a pagefault.
204  */
205 faultcode_t
206 pagefault(
207 	caddr_t addr,
208 	enum fault_type type,
209 	enum seg_rw rw,
210 	int iskernel)
211 {
212 	struct as *as;
213 	struct hat *hat;
214 	struct proc *p;
215 	kthread_t *t;
216 	faultcode_t res;
217 	caddr_t base;
218 	size_t len;
219 	int err;
220 	int mapped_red;
221 	uintptr_t ea;
222 
223 	ASSERT_STACK_ALIGNED();
224 
225 	if (INVALID_VADDR(addr))
226 		return (FC_NOMAP);
227 
228 	mapped_red = segkp_map_red();
229 
230 	if (iskernel) {
231 		as = &kas;
232 		hat = as->a_hat;
233 	} else {
234 		t = curthread;
235 		p = ttoproc(t);
236 		as = p->p_as;
237 		hat = as->a_hat;
238 	}
239 
240 	/*
241 	 * Dispatch pagefault.
242 	 */
243 	res = as_fault(hat, as, addr, 1, type, rw);
244 
245 	/*
246 	 * If this isn't a potential unmapped hole in the user's
247 	 * UNIX data or stack segments, just return status info.
248 	 */
249 	if (res != FC_NOMAP || iskernel)
250 		goto out;
251 
252 	/*
253 	 * Check to see if we happened to faulted on a currently unmapped
254 	 * part of the UNIX data or stack segments.  If so, create a zfod
255 	 * mapping there and then try calling the fault routine again.
256 	 */
257 	base = p->p_brkbase;
258 	len = p->p_brksize;
259 
260 	if (addr < base || addr >= base + len) {		/* data seg? */
261 		base = (caddr_t)p->p_usrstack - p->p_stksize;
262 		len = p->p_stksize;
263 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
264 			/* not in either UNIX data or stack segments */
265 			res = FC_NOMAP;
266 			goto out;
267 		}
268 	}
269 
270 	/*
271 	 * the rest of this function implements a 3.X 4.X 5.X compatibility
272 	 * This code is probably not needed anymore
273 	 */
274 	if (p->p_model == DATAMODEL_ILP32) {
275 
276 		/* expand the gap to the page boundaries on each side */
277 		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
278 		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
279 		len = ea - (uintptr_t)base;
280 
281 		as_rangelock(as);
282 		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
283 		    0) {
284 			err = as_map(as, base, len, segvn_create, zfod_argsp);
285 			as_rangeunlock(as);
286 			if (err) {
287 				res = FC_MAKE_ERR(err);
288 				goto out;
289 			}
290 		} else {
291 			/*
292 			 * This page is already mapped by another thread after
293 			 * we returned from as_fault() above.  We just fall
294 			 * through as_fault() below.
295 			 */
296 			as_rangeunlock(as);
297 		}
298 
299 		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
300 	}
301 
302 out:
303 	if (mapped_red)
304 		segkp_unmap_red();
305 
306 	return (res);
307 }
308 
309 void
310 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
311 {
312 	struct proc *p = curproc;
313 	caddr_t userlimit = (flags & _MAP_LOW32) ?
314 	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
315 
316 	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
317 }
318 
319 /*ARGSUSED*/
320 int
321 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
322 {
323 	return (0);
324 }
325 
/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below userlimit.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  The hint is ignored as an address;
 *	the only use made of it is that, when MAP_ALIGN is set in flags,
 *	the input value is taken as the alignment if it is larger than
 *	the alignment computed here.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 *	vacalign is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	/* the search range starts at the base of the heap */
	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 *
			 * The range ends below the stack, leaving room for
			 * the page-rounded RLIMIT_STACK value.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		/* NB: this statement is the body of the "else" above on amd64 */
		slen = userlimit - base;

	/* round the request up to a whole number of pages */
	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		/* largest page level whose size does not exceed len */
		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	/* with MAP_ALIGN, a larger caller-supplied alignment wins */
	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	/* leave slack so the result can be moved to an aligned address */
	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}
463 
/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 *
 * basep/lenp are value/result parameters.  dir is only examined on amd64,
 * where AH_LO selects the range below the VA hole in preference to the
 * one above it when the input spans the whole hole; any other value
 * prefers the range above.
 *
 * Returns 1 if a valid range was found, 0 otherwise.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 *
	 * NOTE(review): only *lenp is cut back here; hi keeps its wrapped
	 * value and is what the amd64 hole checks below operate on --
	 * confirm that this is intended.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				/* range ends inside the hole: trim the top */
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		/* range starts inside the hole: trim the bottom */
		if (lo < hole_end)
			lo = hole_end;
	}

	/* the trimmed range must still be long enough */
	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}
540 
541 /*
542  * Determine whether [addr, addr+len] are valid user addresses.
543  */
544 /*ARGSUSED*/
545 int
546 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
547     caddr_t userlimit)
548 {
549 	caddr_t eaddr = addr + len;
550 
551 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
552 		return (RANGE_BADADDR);
553 
554 #if defined(__amd64)
555 	/*
556 	 * Check for the VA hole
557 	 */
558 	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
559 		return (RANGE_BADADDR);
560 #endif
561 
562 	return (RANGE_OKAY);
563 }
564 
565 /*
566  * Return 1 if the page frame is onboard memory, else 0.
567  */
568 int
569 pf_is_memory(pfn_t pf)
570 {
571 	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
572 }
573 
574 
/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;		/* MAX(l2_colors, PAGE_COLORS_MIN) */
uint_t	page_colors_mask;	/* page_colors - 1 */
uint_t	page_coloring_shift;	/* lowbit(CPUSETSIZE()) */
int	cpu_page_colors;	/* l2_colors, if less than page_colors */
static uint_t	l2_colors;	/* colors derived from the l2 geometry */

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

/*
 * As laid out by page_coloring_setup(): page_freelists is indexed by
 * mnode range, then page size, then color; page_cachelists by mnode range,
 * then color.
 */
page_t ****page_freelists;
page_t ***page_cachelists;
595 
/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 *
 * Each entry is the lowest pfn of its range; the entries are in
 * descending order.
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
    0x100000,	/* pfn range for 4G and above */
    0x80000,	/* pfn range for 2G-4G */
    0x01000,	/* pfn range for 16M-2G */
    0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory:
 * page_coloring_init() advances memranges past, and drops from nranges,
 * the ranges that lie above physmax.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;
615 
/*
 * Used by page layer to know about page sizes.
 * Filled in per page level by page_coloring_init().
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 *
 * On i386, page_coloring_init() clears it again when the 4G-and-above
 * memory range is not in use.
 */
#if defined(__i386)
int restricted_kmemalloc = 1;	/* XX64 re-examine with PSARC 2004/405 */
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

/*
 * Per-index arrays of max_mem_nodes mutexes each, carved out of the page
 * coloring memory by page_coloring_setup().
 */
kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];
633 
634 
635 /*
636  * return the memrange containing pfn
637  */
638 int
639 memrange_num(pfn_t pfn)
640 {
641 	int n;
642 
643 	for (n = 0; n < nranges - 1; ++n) {
644 		if (pfn >= memranges[n])
645 			break;
646 	}
647 	return (n);
648 }
649 
650 /*
651  * return the mnoderange containing pfn
652  */
653 int
654 pfn_2_mtype(pfn_t pfn)
655 {
656 	int	n;
657 
658 	for (n = mnoderangecnt - 1; n >= 0; n--) {
659 		if (pfn >= mnoderanges[n].mnr_pfnlo) {
660 			break;
661 		}
662 	}
663 	return (n);
664 }
665 
/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 *
 *	pgcnt is the number of pages still wanted; on success it is reduced
 *	by the number of pages returned.  pfnseg is the pfn mask of the DMA
 *	segment; a run is never extended across a segment boundary.  When
 *	iolock is set each page is also io-locked.
 *
 *	Returns NULL when fewer than minctg contiguous free pages could be
 *	claimed starting at *pfnp; any pages claimed so far are put back
 *	on the free list and unlocked.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			/* no page_t or cannot lock it: skip past this pfn */
			(*pfnp)++;
			break;
		}
		/* the lookup was done unlocked; recheck identity under lock */
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		/* take the page off whichever list it is currently on */
		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}
762 
/*
 * verify that pages being returned from allocator have correct DMA attribute
 *
 * DEBUG kernels walk cnt pages along the p_next links starting at pp and
 * panic if a page's physical address is below dma_attr_addr_lo or not
 * below dma_attr_addr_hi.  A NULL dma_attr disables the check.  In
 * non-DEBUG kernels the check compiles away.
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	uint64_t pa;

	if (dma_attr == NULL)
		return;

	for (; cnt > 0; cnt--, pp = pp->p_next) {
		pa = mmu_ptob((uint64_t)pp->p_pagenum);

		if (pa < dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (pa >= dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
	}
}
#endif
786 
787 static kmutex_t	contig_lock;
788 
789 #define	CONTIG_LOCK()	mutex_enter(&contig_lock);
790 #define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);
791 
792 #define	PFN_16M		(mmu_btop((uint64_t)0x1000000))
793 
/*
 * Build a list of *pgcnt pages out of at most sgllen physically contiguous
 * runs that satisfy the DMA attributes in mattr (or, with a NULL mattr, a
 * single contiguous run anywhere below physmax).
 *
 * The search resumes at the pfn where the previous successful search
 * stopped (startpfn) unless the minimum run length shrank or startpfn is
 * outside [lo, hi]; it skips memory below 16M when the range allows it and
 * wraps around to lo once if nothing was found above startpfn.
 *
 * *pgcnt is reduced as runs are claimed.  When iolock is set the pages are
 * io-locked by is_contigpage_free().  Returns the page list, or NULL if
 * the request could not be satisfied.
 *
 * NOTE(review): on the NULL return paths, runs already collected in pplist
 * are not released here and *pgcnt has already been reduced -- confirm how
 * callers are expected to cope with that.
 */
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;	/* where the next search resumes */
	static pgcnt_t	lastctgcnt;	/* minctg of the previous call */
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		/* convert the byte limits of the DMA attributes to pfns */
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		/* only alignments larger than a page constrain the pfn */
		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		/* no attributes: one contiguous run anywhere in memory */
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* first pass: from startpfn up to hi */
	while (pfn + minctg - 1 <= hi) {

		/* on return pfn has been advanced to the next candidate */
		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				/*
				 * NOTE(review): *pgcnt is the count still
				 * outstanding at this point, not the number
				 * of pages on pplist -- confirm it is the
				 * intended count to check.
				 */
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			/* spread what is left over the remaining sgl slots */
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}
915 
916 /*
917  * combine mem_node_config and memrange memory ranges into one data
918  * structure to be used for page list management.
919  *
920  * mnode_range_cnt() calculates the number of memory ranges for mnode and
921  * memranges[]. Used to determine the size of page lists and mnoderanges.
922  *
923  * mnode_range_setup() initializes mnoderanges.
924  */
925 mnoderange_t	*mnoderanges;
926 int		mnoderangecnt;
927 int		mtype4g;
928 
929 int
930 mnode_range_cnt()
931 {
932 	int	mri;
933 	int	mnrcnt = 0;
934 	int	mnode;
935 
936 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
937 		if (mem_node_config[mnode].exists == 0)
938 			continue;
939 
940 		mri = nranges - 1;
941 
942 		/* find the memranges index below contained in mnode range */
943 
944 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
945 			mri--;
946 
947 		/*
948 		 * increment mnode range counter when memranges or mnode
949 		 * boundary is reached.
950 		 */
951 		while (mri >= 0 &&
952 		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
953 			mnrcnt++;
954 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
955 				mri--;
956 			else
957 				break;
958 		}
959 	}
960 	return (mnrcnt);
961 }
962 
963 void
964 mnode_range_setup(mnoderange_t *mnoderanges)
965 {
966 	int	mnode, mri;
967 
968 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
969 		if (mem_node_config[mnode].exists == 0)
970 			continue;
971 
972 		mri = nranges - 1;
973 
974 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
975 			mri--;
976 
977 		while (mri >= 0 && mem_node_config[mnode].physmax >=
978 		    MEMRANGELO(mri)) {
979 			mnoderanges->mnr_pfnlo =
980 			    MAX(MEMRANGELO(mri),
981 				mem_node_config[mnode].physbase);
982 			mnoderanges->mnr_pfnhi =
983 			    MIN(MEMRANGEHI(mri),
984 				mem_node_config[mnode].physmax);
985 			mnoderanges->mnr_mnode = mnode;
986 			mnoderanges->mnr_memrange = mri;
987 			mnoderanges++;
988 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
989 				mri--;
990 			else
991 				break;
992 		}
993 	}
994 }
995 
996 /*
997  * Determine if the mnode range specified in mtype contains memory belonging
998  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
999  * the range of indices to 0 or 4g.
1000  *
1001  * Return first mnode range type index found otherwise return -1 if none found.
1002  */
1003 int
1004 mtype_func(int mnode, int mtype, uint_t flags)
1005 {
1006 	if (flags & PGI_MT_RANGE) {
1007 		int	mtlim = 0;	/* default to PGI_MT_RANGEO */
1008 
1009 		if (flags & PGI_MT_NEXT)
1010 			mtype--;
1011 		if (flags & PGI_MT_RANGE4G)
1012 			mtlim = mtype4g + 1;
1013 		while (mtype >= mtlim) {
1014 			if (mnoderanges[mtype].mnr_mnode == mnode)
1015 				return (mtype);
1016 			mtype--;
1017 		}
1018 	} else {
1019 		if (mnoderanges[mtype].mnr_mnode == mnode)
1020 			return (mtype);
1021 	}
1022 	return (-1);
1023 }
1024 
1025 /*
1026  * Update the page list max counts with the pfn range specified by the
1027  * input parameters.  Called from add_physmem() when physical memory with
1028  * page_t's are initially added to the page lists.
1029  */
1030 void
1031 mtype_modify_max(pfn_t startpfn, long cnt)
1032 {
1033 	int	mtype = 0;
1034 	pfn_t	endpfn = startpfn + cnt, pfn;
1035 	pgcnt_t	inc;
1036 
1037 	ASSERT(cnt > 0);
1038 
1039 	for (pfn = startpfn; pfn < endpfn; ) {
1040 		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
1041 			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
1042 				inc = endpfn - pfn;
1043 			} else {
1044 				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
1045 			}
1046 			mnoderanges[mtype].mnr_mt_pgmax += inc;
1047 			if (physmax4g && mtype <= mtype4g)
1048 				maxmem4g += inc;
1049 			pfn += inc;
1050 		}
1051 		mtype++;
1052 		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
1053 	}
1054 }
1055 
1056 /*
1057  * Returns the free page count for mnode
1058  */
1059 int
1060 mnode_pgcnt(int mnode)
1061 {
1062 	int	mtype = mnoderangecnt - 1;
1063 	int	flags = PGI_MT_RANGE0;
1064 	pgcnt_t	pgcnt = 0;
1065 
1066 	mtype = mtype_func(mnode, mtype, flags);
1067 
1068 	while (mtype != -1) {
1069 		pgcnt += (mnoderanges[mtype].mnr_mt_flpgcnt +
1070 		    mnoderanges[mtype].mnr_mt_lgpgcnt +
1071 		    mnoderanges[mtype].mnr_mt_clpgcnt);
1072 		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1073 	}
1074 	return (pgcnt);
1075 }
1076 
1077 /*
1078  * Initialize page coloring variables based on the l2 cache parameters.
1079  * Calculate and return memory needed for page coloring data structures.
1080  */
1081 size_t
1082 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1083 {
1084 	size_t	colorsz = 0;
1085 	int	i;
1086 	int	colors;
1087 
1088 	/*
1089 	 * Reduce the memory ranges lists if we don't have large amounts
1090 	 * of memory. This avoids searching known empty free lists.
1091 	 */
1092 	i = memrange_num(physmax);
1093 	memranges += i;
1094 	nranges -= i;
1095 #if defined(__i386)
1096 	if (i > 0)
1097 		restricted_kmemalloc = 0;
1098 #endif
1099 	/* physmax greater than 4g */
1100 	if (i == 0)
1101 		physmax4g = 1;
1102 
1103 	/*
1104 	 * setup pagesize for generic page layer
1105 	 */
1106 	for (i = 0; i <= mmu.max_page_level; ++i) {
1107 		hw_page_array[i].hp_size = LEVEL_SIZE(i);
1108 		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1109 		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1110 	}
1111 
1112 	ASSERT(ISP2(l2_sz));
1113 	ASSERT(ISP2(l2_linesz));
1114 	ASSERT(l2_sz > MMU_PAGESIZE);
1115 
1116 	/* l2_assoc is 0 for fully associative l2 cache */
1117 	if (l2_assoc)
1118 		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1119 	else
1120 		l2_colors = 1;
1121 
1122 	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
1123 	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1124 
1125 	/*
1126 	 * cpu_page_colors is non-zero when a page color may be spread across
1127 	 * multiple bins.
1128 	 */
1129 	if (l2_colors < page_colors)
1130 		cpu_page_colors = l2_colors;
1131 
1132 	ASSERT(ISP2(page_colors));
1133 
1134 	page_colors_mask = page_colors - 1;
1135 
1136 	ASSERT(ISP2(CPUSETSIZE()));
1137 	page_coloring_shift = lowbit(CPUSETSIZE());
1138 
1139 	/* size for mnoderanges */
1140 	mnoderangecnt = mnode_range_cnt();
1141 	colorsz = mnoderangecnt * sizeof (mnoderange_t);
1142 
1143 	/* size for fpc_mutex and cpc_mutex */
1144 	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1145 
1146 	/* size of page_freelists */
1147 	colorsz += mnoderangecnt * sizeof (page_t ***);
1148 	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1149 
1150 	for (i = 0; i < mmu_page_sizes; i++) {
1151 		colors = page_get_pagecolors(i);
1152 		colorsz += mnoderangecnt * colors * sizeof (page_t *);
1153 	}
1154 
1155 	/* size of page_cachelists */
1156 	colorsz += mnoderangecnt * sizeof (page_t **);
1157 	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1158 
1159 	return (colorsz);
1160 }
1161 
1162 /*
1163  * Called once at startup to configure page_coloring data structures and
1164  * does the 1st page_free()/page_freelist_add().
1165  */
1166 void
1167 page_coloring_setup(caddr_t pcmemaddr)
1168 {
1169 	int	i;
1170 	int	j;
1171 	int	k;
1172 	caddr_t	addr;
1173 	int	colors;
1174 
1175 	/*
1176 	 * do page coloring setup
1177 	 */
1178 	addr = pcmemaddr;
1179 
1180 	mnoderanges = (mnoderange_t *)addr;
1181 	addr += (mnoderangecnt * sizeof (mnoderange_t));
1182 
1183 	mnode_range_setup(mnoderanges);
1184 
1185 	if (physmax4g)
1186 		mtype4g = pfn_2_mtype(0xfffff);
1187 
1188 	for (k = 0; k < NPC_MUTEX; k++) {
1189 		fpc_mutex[k] = (kmutex_t *)addr;
1190 		addr += (max_mem_nodes * sizeof (kmutex_t));
1191 	}
1192 	for (k = 0; k < NPC_MUTEX; k++) {
1193 		cpc_mutex[k] = (kmutex_t *)addr;
1194 		addr += (max_mem_nodes * sizeof (kmutex_t));
1195 	}
1196 	page_freelists = (page_t ****)addr;
1197 	addr += (mnoderangecnt * sizeof (page_t ***));
1198 
1199 	page_cachelists = (page_t ***)addr;
1200 	addr += (mnoderangecnt * sizeof (page_t **));
1201 
1202 	for (i = 0; i < mnoderangecnt; i++) {
1203 		page_freelists[i] = (page_t ***)addr;
1204 		addr += (mmu_page_sizes * sizeof (page_t **));
1205 
1206 		for (j = 0; j < mmu_page_sizes; j++) {
1207 			colors = page_get_pagecolors(j);
1208 			page_freelists[i][j] = (page_t **)addr;
1209 			addr += (colors * sizeof (page_t *));
1210 		}
1211 		page_cachelists[i] = (page_t **)addr;
1212 		addr += (page_colors * sizeof (page_t *));
1213 	}
1214 }
1215 
/*
 * Return the color for the given buf.  This implementation ignores `bp'
 * and always reports color 0.
 */
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	const int	color = 0;

	return (color);
}
1222 
1223 /*
1224  * get a page from any list with the given mnode
1225  */
1226 page_t *
1227 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
1228     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
1229 {
1230 	kmutex_t	*pcm;
1231 	int		i;
1232 	page_t		*pp;
1233 	page_t		*first_pp;
1234 	uint64_t	pgaddr;
1235 	ulong_t		bin;
1236 	int		mtypestart;
1237 
1238 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
1239 
1240 	ASSERT((flags & PG_MATCH_COLOR) == 0);
1241 	ASSERT(szc == 0);
1242 	ASSERT(dma_attr != NULL);
1243 
1244 
1245 	MTYPE_START(mnode, mtype, flags);
1246 	if (mtype < 0) {
1247 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
1248 		return (NULL);
1249 	}
1250 
1251 	mtypestart = mtype;
1252 
1253 	bin = origbin;
1254 
1255 	/*
1256 	 * check up to page_colors + 1 bins - origbin may be checked twice
1257 	 * because of BIN_STEP skip
1258 	 */
1259 	do {
1260 		i = 0;
1261 		while (i <= page_colors) {
1262 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
1263 				goto nextfreebin;
1264 
1265 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1266 			mutex_enter(pcm);
1267 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
1268 			first_pp = pp;
1269 			while (pp != NULL) {
1270 				if (page_trylock(pp, SE_EXCL) == 0) {
1271 					pp = pp->p_next;
1272 					if (pp == first_pp) {
1273 						pp = NULL;
1274 					}
1275 					continue;
1276 				}
1277 
1278 				ASSERT(PP_ISFREE(pp));
1279 				ASSERT(PP_ISAGED(pp));
1280 				ASSERT(pp->p_vnode == NULL);
1281 				ASSERT(pp->p_hash == NULL);
1282 				ASSERT(pp->p_offset == (u_offset_t)-1);
1283 				ASSERT(pp->p_szc == szc);
1284 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1285 				/* check if page within DMA attributes */
1286 				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));
1287 
1288 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1289 				    (pgaddr + MMU_PAGESIZE - 1 <=
1290 				    dma_attr->dma_attr_addr_hi)) {
1291 					break;
1292 				}
1293 
1294 				/* continue looking */
1295 				page_unlock(pp);
1296 				pp = pp->p_next;
1297 				if (pp == first_pp)
1298 					pp = NULL;
1299 
1300 			}
1301 			if (pp != NULL) {
1302 				ASSERT(mtype == PP_2_MTYPE(pp));
1303 				ASSERT(pp->p_szc == 0);
1304 
1305 				/* found a page with specified DMA attributes */
1306 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
1307 				    mtype), pp);
1308 				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1309 
1310 				if ((PP_ISFREE(pp) == 0) ||
1311 				    (PP_ISAGED(pp) == 0)) {
1312 					cmn_err(CE_PANIC, "page %p is not free",
1313 					    (void *)pp);
1314 				}
1315 
1316 				mutex_exit(pcm);
1317 				check_dma(dma_attr, pp, 1);
1318 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1319 				return (pp);
1320 			}
1321 			mutex_exit(pcm);
1322 nextfreebin:
1323 			pp = page_freelist_fill(szc, bin, mnode, mtype,
1324 			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
1325 			if (pp)
1326 				return (pp);
1327 
1328 			/* try next bin */
1329 			bin += (i == 0) ? BIN_STEP : 1;
1330 			bin &= page_colors_mask;
1331 			i++;
1332 		}
1333 		MTYPE_NEXT(mnode, mtype, flags);
1334 	} while (mtype >= 0);
1335 
1336 	/* failed to find a page in the freelist; try it in the cachelist */
1337 
1338 	/* reset mtype start for cachelist search */
1339 	mtype = mtypestart;
1340 	ASSERT(mtype >= 0);
1341 
1342 	/* start with the bin of matching color */
1343 	bin = origbin;
1344 
1345 	do {
1346 		for (i = 0; i <= page_colors; i++) {
1347 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
1348 				goto nextcachebin;
1349 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
1350 			mutex_enter(pcm);
1351 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
1352 			first_pp = pp;
1353 			while (pp != NULL) {
1354 				if (page_trylock(pp, SE_EXCL) == 0) {
1355 					pp = pp->p_next;
1356 					if (pp == first_pp)
1357 						break;
1358 					continue;
1359 				}
1360 				ASSERT(pp->p_vnode);
1361 				ASSERT(PP_ISAGED(pp) == 0);
1362 				ASSERT(pp->p_szc == 0);
1363 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1364 
1365 				/* check if page within DMA attributes */
1366 
1367 				pgaddr = ptob((uint64_t)(pp->p_pagenum));
1368 
1369 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1370 				    (pgaddr + MMU_PAGESIZE - 1 <=
1371 				    dma_attr->dma_attr_addr_hi)) {
1372 					break;
1373 				}
1374 
1375 				/* continue looking */
1376 				page_unlock(pp);
1377 				pp = pp->p_next;
1378 				if (pp == first_pp)
1379 					pp = NULL;
1380 			}
1381 
1382 			if (pp != NULL) {
1383 				ASSERT(mtype == PP_2_MTYPE(pp));
1384 				ASSERT(pp->p_szc == 0);
1385 
1386 				/* found a page with specified DMA attributes */
1387 				page_sub(&PAGE_CACHELISTS(mnode, bin,
1388 				    mtype), pp);
1389 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
1390 
1391 				mutex_exit(pcm);
1392 				ASSERT(pp->p_vnode);
1393 				ASSERT(PP_ISAGED(pp) == 0);
1394 				check_dma(dma_attr, pp, 1);
1395 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1396 				return (pp);
1397 			}
1398 			mutex_exit(pcm);
1399 nextcachebin:
1400 			bin += (i == 0) ? BIN_STEP : 1;
1401 			bin &= page_colors_mask;
1402 		}
1403 		MTYPE_NEXT(mnode, mtype, flags);
1404 	} while (mtype >= 0);
1405 
1406 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
1407 	return (NULL);
1408 }
1409 
1410 /*
1411  * This function is similar to page_get_freelist()/page_get_cachelist()
1412  * but it searches both the lists to find a page with the specified
1413  * color (or no color) and DMA attributes. The search is done in the
1414  * freelist first and then in the cache list within the highest memory
1415  * range (based on DMA attributes) before searching in the lower
1416  * memory ranges.
1417  *
1418  * Note: This function is called only by page_create_io().
1419  */
1420 /*ARGSUSED*/
1421 page_t *
1422 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
1423     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
1424 {
1425 	uint_t		bin;
1426 	int		mtype;
1427 	page_t		*pp;
1428 	int		n;
1429 	int		m;
1430 	int		szc;
1431 	int		fullrange;
1432 	int		mnode;
1433 	int		local_failed_stat = 0;
1434 	lgrp_mnode_cookie_t	lgrp_cookie;
1435 
1436 	VM_STAT_ADD(pga_vmstats.pga_alloc);
1437 
1438 	/* only base pagesize currently supported */
1439 	if (size != MMU_PAGESIZE)
1440 		return (NULL);
1441 
1442 	/*
1443 	 * If we're passed a specific lgroup, we use it.  Otherwise,
1444 	 * assume first-touch placement is desired.
1445 	 */
1446 	if (!LGRP_EXISTS(lgrp))
1447 		lgrp = lgrp_home_lgrp();
1448 
1449 	/* LINTED */
1450 	AS_2_BIN(as, seg, vp, vaddr, bin);
1451 
1452 	/*
1453 	 * Only hold one freelist or cachelist lock at a time, that way we
1454 	 * can start anywhere and not have to worry about lock
1455 	 * ordering.
1456 	 */
1457 	if (dma_attr == NULL) {
1458 		n = 0;
1459 		m = mnoderangecnt - 1;
1460 		fullrange = 1;
1461 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
1462 	} else {
1463 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
1464 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
1465 
1466 		/*
1467 		 * We can guarantee alignment only for page boundary.
1468 		 */
1469 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
1470 			return (NULL);
1471 
1472 		n = pfn_2_mtype(pfnlo);
1473 		m = pfn_2_mtype(pfnhi);
1474 
1475 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
1476 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
1477 	}
1478 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
1479 
1480 	if (n > m)
1481 		return (NULL);
1482 
1483 	szc = 0;
1484 
1485 	/* cylcing thru mtype handled by RANGE0 if n == 0 */
1486 	if (n == 0) {
1487 		flags |= PGI_MT_RANGE0;
1488 		n = m;
1489 	}
1490 
1491 	/*
1492 	 * Try local memory node first, but try remote if we can't
1493 	 * get a page of the right color.
1494 	 */
1495 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
1496 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
1497 		/*
1498 		 * allocate pages from high pfn to low.
1499 		 */
1500 		for (mtype = m; mtype >= n; mtype--) {
1501 			if (fullrange != 0) {
1502 				pp = page_get_mnode_freelist(mnode,
1503 				    bin, mtype, szc, flags);
1504 				if (pp == NULL) {
1505 					pp = page_get_mnode_cachelist(
1506 						bin, flags, mnode, mtype);
1507 				}
1508 			} else {
1509 				pp = page_get_mnode_anylist(bin, szc,
1510 				    flags, mnode, mtype, dma_attr);
1511 			}
1512 			if (pp != NULL) {
1513 				VM_STAT_ADD(pga_vmstats.pga_allocok);
1514 				check_dma(dma_attr, pp, 1);
1515 				return (pp);
1516 			}
1517 		}
1518 		if (!local_failed_stat) {
1519 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
1520 			local_failed_stat = 1;
1521 		}
1522 	}
1523 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
1524 
1525 	return (NULL);
1526 }
1527 
/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator, so it is only used to create new pages (i.e., PG_EXCL
 * is set).
 *
 * Note: This interface is currently used by the x86 PSM only and is
 *	 not fully specified, so its commitment level is only that of
 *	 a private interface specific to x86. This interface uses the
 *	 PSM-specific page_get_anylist() interface.
 */
1542 
/*
 * Walk the page_hash[] chain selected by `index' looking for the page
 * whose identity is <vp, off>.  On completion `pp' points at the matching
 * page, or is NULL if the chain holds no such page.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed
 * by a semicolon is exactly one statement and is therefore safe as the
 * body of an unbraced if/else.
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) do { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
} while (0)
1549 
1550 
/*
 * Create the pages <vp, off>, <vp, off + MMU_PAGESIZE>, ... needed to
 * cover `bytes' bytes (rounded up to whole pages), taking physical pages
 * that satisfy the DMA attributes in `mattr'.
 *
 *	vp, off	identity of the first page to create
 *	bytes	size of the request in bytes
 *	flags	only PG_EXCL, PG_WAIT and PG_PHYSCONTIG are accepted
 *	as, vaddr
 *		passed through to page_get_anylist()
 *	mattr	DMA memory attributes, or NULL for no constraint
 *
 * Returns the list of created pages linked through p_next/p_prev, or
 * NULL on failure.  On failure, pages already collected are disposed of
 * with VN_DISPOSE() and the accounting done by page_create_wait() is
 * undone for the pages that were not obtained.
 *
 * Failure happens when page_create_wait() refuses the request, when a
 * PG_PHYSCONTIG request gets nothing from page_get_contigpage(), when no
 * page meeting `mattr' is available and PG_WAIT is not set, or when a page
 * with the requested identity is already in the page hash.  With a NULL
 * `mattr', failing to find any page at all is a panic.
 */
page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;	/* pages collected so far */
	uint_t		plist_len = 0;	/* pages disposed of on failure */
	pgcnt_t		npages;		/* pages still to be obtained */
	page_t		*npp = NULL;	/* candidate page not yet hashed in */
	uint_t		pages_req;	/* total pages in the request */
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
		"page_create_start:vp %p off %llx bytes %u flags %x",
		vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
		"page_create_success:vp %p off %llx",
		vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
			"pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		/*
		 * npages is passed by reference; the code below relies on
		 * it holding the number of pages still outstanding once
		 * page_get_contigpage() returns.
		 */
		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		/* give every contiguous page its <vp, off> identity */
		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			/* the whole request was satisfied contiguously */
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			/* skip vaddr past the pages obtained so far */
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use it.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}

				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mits and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			/* hand the candidate over; npp no longer owns it */
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means somethread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			/*
			 * A page with this identity already exists, so the
			 * request cannot be met; npp is still set and is
			 * given back in the fail path.
			 */
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 *	 with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(vp != &kvp);
			if (vp == &kvp)
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		/*
		 * NOTE(review): stepping plist forward after page_add()
		 * presumably keeps the list in ascending offset order -
		 * confirm against page_add().
		 */
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}
1829 
1830 
1831 /*
1832  * Copy the data from the physical page represented by "frompp" to
1833  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
1834  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
1835  * level and no one sleeps with an active mapping there.
1836  *
1837  * Note that the ref/mod bits in the page_t's are not affected by
1838  * this operation, hence it is up to the caller to update them appropriately.
1839  */
1840 void
1841 ppcopy(page_t *frompp, page_t *topp)
1842 {
1843 	caddr_t		pp_addr1;
1844 	caddr_t		pp_addr2;
1845 	void		*pte1;
1846 	void		*pte2;
1847 	kmutex_t	*ppaddr_mutex;
1848 
1849 	ASSERT_STACK_ALIGNED();
1850 	ASSERT(PAGE_LOCKED(frompp));
1851 	ASSERT(PAGE_LOCKED(topp));
1852 
1853 	if (kpm_enable) {
1854 		pp_addr1 = hat_kpm_page2va(frompp, 0);
1855 		pp_addr2 = hat_kpm_page2va(topp, 0);
1856 		kpreempt_disable();
1857 	} else {
1858 		/*
1859 		 * disable pre-emption so that CPU can't change
1860 		 */
1861 		kpreempt_disable();
1862 
1863 		pp_addr1 = CPU->cpu_caddr1;
1864 		pp_addr2 = CPU->cpu_caddr2;
1865 		pte1 = (void *)CPU->cpu_caddr1pte;
1866 		pte2 = (void *)CPU->cpu_caddr2pte;
1867 
1868 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1869 		mutex_enter(ppaddr_mutex);
1870 
1871 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
1872 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
1873 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
1874 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1875 		    HAT_LOAD_NOCONSIST);
1876 	}
1877 
1878 	if (use_sse_pagecopy)
1879 		hwblkpagecopy(pp_addr1, pp_addr2);
1880 	else
1881 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
1882 
1883 	if (!kpm_enable)
1884 		mutex_exit(ppaddr_mutex);
1885 	kpreempt_enable();
1886 }
1887 
1888 /*
1889  * Zero the physical page from off to off + len given by `pp'
1890  * without changing the reference and modified bits of page.
1891  *
1892  * We use this using CPU private page address #2, see ppcopy() for more info.
1893  * pagezero() must not be called at interrupt level.
1894  */
1895 void
1896 pagezero(page_t *pp, uint_t off, uint_t len)
1897 {
1898 	caddr_t		pp_addr2;
1899 	void		*pte2;
1900 	kmutex_t	*ppaddr_mutex;
1901 
1902 	ASSERT_STACK_ALIGNED();
1903 	ASSERT(len <= MMU_PAGESIZE);
1904 	ASSERT(off <= MMU_PAGESIZE);
1905 	ASSERT(off + len <= MMU_PAGESIZE);
1906 	ASSERT(PAGE_LOCKED(pp));
1907 
1908 	if (kpm_enable) {
1909 		pp_addr2 = hat_kpm_page2va(pp, 0);
1910 		kpreempt_disable();
1911 	} else {
1912 		kpreempt_disable();
1913 
1914 		pp_addr2 = CPU->cpu_caddr2;
1915 		pte2 = (void *)CPU->cpu_caddr2pte;
1916 
1917 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1918 		mutex_enter(ppaddr_mutex);
1919 
1920 		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
1921 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1922 		    HAT_LOAD_NOCONSIST);
1923 	}
1924 
1925 	if (use_sse_pagezero)
1926 		hwblkclr(pp_addr2 + off, len);
1927 	else
1928 		bzero(pp_addr2 + off, len);
1929 
1930 	if (!kpm_enable)
1931 		mutex_exit(ppaddr_mutex);
1932 	kpreempt_enable();
1933 }
1934 
1935 /*
1936  * Platform-dependent page scrub call.
1937  */
1938 void
1939 pagescrub(page_t *pp, uint_t off, uint_t len)
1940 {
1941 	/*
1942 	 * For now, we rely on the fact that pagezero() will
1943 	 * always clear UEs.
1944 	 */
1945 	pagezero(pp, off, len);
1946 }
1947 
1948 /*
1949  * set up two private addresses for use on a given CPU for use in ppcopy()
1950  */
1951 void
1952 setup_vaddr_for_ppcopy(struct cpu *cpup)
1953 {
1954 	void *addr;
1955 	void *pte;
1956 
1957 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
1958 	pte = hat_mempte_setup(addr);
1959 	cpup->cpu_caddr1 = addr;
1960 	cpup->cpu_caddr1pte = (pteptr_t)pte;
1961 
1962 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
1963 	pte = hat_mempte_setup(addr);
1964 	cpup->cpu_caddr2 = addr;
1965 	cpup->cpu_caddr2pte = (pteptr_t)pte;
1966 
1967 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
1968 }
1969 
1970 
1971 /*
1972  * Create the pageout scanner thread. The thread has to
1973  * start at procedure with process pp and priority pri.
1974  */
1975 void
1976 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
1977 {
1978 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
1979 }
1980 
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms, so
 * this is intentionally a no-op.
 *
 * The parameter list is spelled (void) so the definition provides a real
 * prototype rather than an old-style unspecified-argument declaration.
 */
void
dcache_flushall(void)
{
}
1988