xref: /illumos-gate/usr/src/uts/i86pc/vm/vm_machdep.c (revision 4c28a617e3922d92a58e813a5b955eb526b9c386)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /*
25  * Copyright (c) 2010, Intel Corporation.
26  * All rights reserved.
27  * Copyright 2018 Joyent, Inc.
28  */
29 
30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
31 /*	All Rights Reserved   */
32 
33 /*
34  * Portions of this source code were derived from Berkeley 4.3 BSD
35  * under license from the Regents of the University of California.
36  */
37 
38 /*
39  * UNIX machine dependent virtual memory support.
40  */
41 
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/user.h>
46 #include <sys/proc.h>
47 #include <sys/kmem.h>
48 #include <sys/vmem.h>
49 #include <sys/buf.h>
50 #include <sys/cpuvar.h>
51 #include <sys/lgrp.h>
52 #include <sys/disp.h>
53 #include <sys/vm.h>
54 #include <sys/mman.h>
55 #include <sys/vnode.h>
56 #include <sys/cred.h>
57 #include <sys/exec.h>
58 #include <sys/exechdr.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/swap.h>
62 #include <sys/dumphdr.h>
63 #include <sys/random.h>
64 
65 #include <vm/hat.h>
66 #include <vm/as.h>
67 #include <vm/seg.h>
68 #include <vm/seg_kp.h>
69 #include <vm/seg_vn.h>
70 #include <vm/page.h>
71 #include <vm/seg_kmem.h>
72 #include <vm/seg_kpm.h>
73 #include <vm/vm_dep.h>
74 
75 #include <sys/cpu.h>
76 #include <sys/vm_machparam.h>
77 #include <sys/memlist.h>
78 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
79 #include <vm/hat_i86.h>
80 #include <sys/x86_archext.h>
81 #include <sys/elf_386.h>
82 #include <sys/cmn_err.h>
83 #include <sys/archsystm.h>
84 #include <sys/machsystm.h>
85 #include <sys/secflags.h>
86 
87 #include <sys/vtrace.h>
88 #include <sys/ddidmareq.h>
89 #include <sys/promif.h>
90 #include <sys/memnode.h>
91 #include <sys/stack.h>
92 #include <util/qsort.h>
93 #include <sys/taskq.h>
94 
95 #ifdef __xpv
96 
97 #include <sys/hypervisor.h>
98 #include <sys/xen_mmu.h>
99 #include <sys/balloon_impl.h>
100 
101 /*
102  * domain 0 pages usable for DMA are pre-allocated and kept in
103  * distinct lists, ordered by increasing mfn.
104  */
105 static kmutex_t io_pool_lock;
106 static kmutex_t contig_list_lock;
107 static page_t *io_pool_4g;	/* pool for 32 bit dma limited devices */
108 static page_t *io_pool_16m;	/* pool for 24 bit dma limited legacy devices */
109 static long io_pool_cnt;
110 static long io_pool_cnt_max = 0;
111 #define	DEFAULT_IO_POOL_MIN	128
112 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
113 static long io_pool_cnt_lowater = 0;
114 static long io_pool_shrink_attempts; /* how many times did we try to shrink */
115 static long io_pool_shrinks;	/* how many times did we really shrink */
116 static long io_pool_grows;	/* how many times did we grow */
117 static mfn_t start_mfn = 1;
118 static caddr_t io_pool_kva;	/* used to alloc pages when needed */
119 
120 static int create_contig_pfnlist(uint_t);
121 
122 /*
123  * percentage of phys mem to hold in the i/o pool
124  */
125 #define	DEFAULT_IO_POOL_PCT	2
126 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
127 static void page_io_pool_sub(page_t **, page_t *, page_t *);
128 int ioalloc_dbg = 0;
129 
130 #endif /* __xpv */
131 
132 uint_t vac_colors = 1;
133 
134 int largepagesupport = 0;
135 extern uint_t page_create_new;
136 extern uint_t page_create_exists;
137 extern uint_t page_create_putbacks;
138 /*
139  * Allow users to disable the kernel's use of SSE.
140  */
141 extern int use_sse_pagecopy, use_sse_pagezero;
142 
143 /*
144  * combined memory ranges from mnode and memranges[], used to manage a single
145  * mnode/mtype dimension in the page lists.
146  */
147 typedef struct {
148 	pfn_t	mnr_pfnlo;
149 	pfn_t	mnr_pfnhi;
150 	int	mnr_mnode;
151 	int	mnr_memrange;		/* index into memranges[] */
152 	int	mnr_next;		/* next lower PA mnoderange */
153 	int	mnr_exists;
154 	/* maintain page list stats */
155 	pgcnt_t	mnr_mt_clpgcnt;		/* cache list cnt */
156 	pgcnt_t	mnr_mt_flpgcnt[MMU_PAGE_SIZES];	/* free list cnt per szc */
157 	pgcnt_t	mnr_mt_totcnt;		/* sum of cache and free lists */
158 #ifdef DEBUG
159 	struct mnr_mts {		/* mnode/mtype szc stats */
160 		pgcnt_t	mnr_mts_pgcnt;
161 		int	mnr_mts_colors;
162 		pgcnt_t *mnr_mtsc_pgcnt;
163 	} 	*mnr_mts;
164 #endif
165 } mnoderange_t;
166 
167 #define	MEMRANGEHI(mtype)						\
168 	((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
169 #define	MEMRANGELO(mtype)	(memranges[mtype])
170 
171 #define	MTYPE_FREEMEM(mt)	(mnoderanges[mt].mnr_mt_totcnt)
172 
173 /*
174  * As the PC architecture evolved, memory was clumped into several
175  * ranges for various historical I/O devices to do DMA.
176  * < 16Meg - ISA bus
177  * < 2Gig - ???
178  * < 4Gig - PCI bus or drivers that don't understand PAE mode
179  *
180  * These are listed in reverse order, so that we can skip over unused
181  * ranges on machines with small memories.
182  *
183  * For now under the Hypervisor, we'll only ever have one memrange.
184  */
185 #define	PFN_4GIG	0x100000
186 #define	PFN_16MEG	0x1000
187 /* Indices into the memory range (arch_memranges) array. */
188 #define	MRI_4G		0
189 #define	MRI_2G		1
190 #define	MRI_16M		2
191 #define	MRI_0		3
192 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
193     PFN_4GIG,	/* pfn range for 4G and above */
194     0x80000,	/* pfn range for 2G-4G */
195     PFN_16MEG,	/* pfn range for 16M-2G */
196     0x00000,	/* pfn range for 0-16M */
197 };
198 pfn_t *memranges = &arch_memranges[0];
199 int nranges = NUM_MEM_RANGES;
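
/*
 * To make the macros above concrete (a worked reading of the defaults,
 * where physmax is the highest pfn on the system):
 *
 *	MEMRANGELO(MRI_16M) == memranges[2] == 0x1000
 *	MEMRANGEHI(MRI_16M) == memranges[1] - 1 == 0x7ffff
 *	MEMRANGELO(MRI_4G)  == memranges[0] == 0x100000
 *	MEMRANGEHI(MRI_4G)  == physmax
 *
 * i.e. the 16M-2G range covers pfns 0x1000 through 0x7ffff, and the topmost
 * range runs from the 4G boundary up to the last pfn.
 */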
200 
201 /*
202  * This combines mem_node_config and memranges into one data
203  * structure to be used for page list management.
204  */
205 mnoderange_t	*mnoderanges;
206 int		mnoderangecnt;
207 int		mtype4g;
208 int		mtype16m;
209 int		mtypetop;	/* index of highest pfn'ed mnoderange */
210 
211 /*
212  * 4g memory management variables for systems with more than 4g of memory:
213  *
214  * physical memory below 4g is required for 32bit dma devices and, currently,
215  * for kmem memory. On systems with more than 4g of memory, the pool of memory
216  * below 4g can be depleted without any paging activity given that there is
217  * likely to be sufficient memory above 4g.
218  *
219  * physmax4g is set true if the largest pfn is over 4g. The rest of the
220  * 4g memory management code is enabled only when physmax4g is true.
221  *
222  * maxmem4g is the count of the maximum number of pages on the page lists
223  * with physical addresses below 4g. It can be a lot less than 4g given that
224  * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
225  * agp aperture etc.
226  *
227  * freemem4g maintains the count of the number of available pages on the
228  * page lists with physical addresses below 4g.
229  *
230  * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
231  * 1/16 (about 6%; desfree4gshift = 4) of maxmem4g.
232  *
233  * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
234  * and the amount of physical memory above 4g is greater than freemem4g.
235  * In this case, page_get_* routines will restrict below 4g allocations
236  * for requests that don't specifically require it.
237  */
238 
239 #define	DESFREE4G	(maxmem4g >> desfree4gshift)
240 
241 #define	RESTRICT4G_ALLOC					\
242 	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
243 
244 static pgcnt_t	maxmem4g;
245 static pgcnt_t	freemem4g;
246 static int	physmax4g;
247 static int	desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
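
/*
 * A worked example of the thresholds above, assuming 4K pages and a machine
 * with roughly 3GB of pfns below the 4G boundary:
 *
 *	maxmem4g  ~= 3GB / 4KB           == 786432 pages
 *	DESFREE4G == maxmem4g >> 4       == 49152 pages (~192MB)
 *
 * RESTRICT4G_ALLOC then becomes true once freemem4g drops below 49152 pages
 * while less than half of freemem lies below 4g, at which point allocations
 * that don't specifically need low memory are steered above 4g.
 */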
248 
249 /*
250  * 16m memory management:
251  *
252  * reserve some amount of physical memory below 16m for legacy devices.
253  *
254  * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
255  * 16m or if the 16m pool drops below DESFREE16M.
256  *
257  * In this case, general page allocations via page_get_{free,cache}list
258  * routines will be restricted from allocating from the 16m pool. Allocations
259  * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
260  * are not restricted.
261  */
262 
263 #define	FREEMEM16M	MTYPE_FREEMEM(mtype16m)
264 #define	DESFREE16M	desfree16m
265 #define	RESTRICT16M_ALLOC(freemem, pgcnt, flags)		\
266 	((freemem != 0) && ((flags & PG_PANIC) == 0) &&		\
267 	    ((freemem >= (FREEMEM16M)) ||			\
268 	    (FREEMEM16M  < (DESFREE16M + pgcnt))))
269 
270 static pgcnt_t	desfree16m = 0x380;
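
/*
 * With 4K pages the default desfree16m of 0x380 is 896 pages, i.e. the
 * allocator tries to keep about 3.5MB of the 16MB low-memory pool free for
 * legacy DMA unless PG_PANIC forces an allocation from it.
 */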
271 
272 /*
273  * This can be patched via /etc/system to allow old non-PAE aware device
274  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
275  */
276 int restricted_kmemalloc = 0;
277 
278 #ifdef VM_STATS
279 struct {
280 	ulong_t	pga_alloc;
281 	ulong_t	pga_notfullrange;
282 	ulong_t	pga_nulldmaattr;
283 	ulong_t	pga_allocok;
284 	ulong_t	pga_allocfailed;
285 	ulong_t	pgma_alloc;
286 	ulong_t	pgma_allocok;
287 	ulong_t	pgma_allocfailed;
288 	ulong_t	pgma_allocempty;
289 } pga_vmstats;
290 #endif
291 
292 uint_t mmu_page_sizes;
293 
294 /* How many page sizes the users can see */
295 uint_t mmu_exported_page_sizes;
296 
297 /* page sizes that legacy applications can see */
298 uint_t mmu_legacy_page_sizes;
299 
300 /*
301  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
302  * fewer than this many pages.
303  */
304 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
305 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
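
/*
 * With 4K base pages (MMU_PAGESHIFT == 12) both values above evaluate to
 * 1 << 18 == 262144 pages, so automatic large pages are only considered on
 * machines with at least 1GB of physical memory.
 */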
306 
307 /*
308  * Maximum and default segment size tunables for user private
309  * and shared anon memory, and user text and initialized data.
310  * These can be patched via /etc/system to allow large pages
311  * to be used for mapping application private and shared anon memory.
312  */
313 size_t mcntl0_lpsize = MMU_PAGESIZE;
314 size_t max_uheap_lpsize = MMU_PAGESIZE;
315 size_t default_uheap_lpsize = MMU_PAGESIZE;
316 size_t max_ustack_lpsize = MMU_PAGESIZE;
317 size_t default_ustack_lpsize = MMU_PAGESIZE;
318 size_t max_privmap_lpsize = MMU_PAGESIZE;
319 size_t max_uidata_lpsize = MMU_PAGESIZE;
320 size_t max_utext_lpsize = MMU_PAGESIZE;
321 size_t max_shm_lpsize = MMU_PAGESIZE;
322 
323 
324 /*
325  * initialized by page_coloring_init().
326  */
327 uint_t	page_colors;
328 uint_t	page_colors_mask;
329 uint_t	page_coloring_shift;
330 int	cpu_page_colors;
331 static uint_t	l2_colors;
332 
333 /*
334  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
335  * and page_colors are calculated from the l2 cache n-way set size.  Within a
336  * mnode range, the page freelist and cachelist are hashed into bins based on
337  * color. This makes it easier to search for a page within a specific memory
338  * range.
339  */
340 #define	PAGE_COLORS_MIN	16
341 
342 page_t ****page_freelists;
343 page_t ***page_cachelists;
344 
345 
346 /*
347  * Used by page layer to know about page sizes
348  */
349 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
350 
351 kmutex_t	*fpc_mutex[NPC_MUTEX];
352 kmutex_t	*cpc_mutex[NPC_MUTEX];
353 
354 /* Lock to protect mnoderanges array for memory DR operations. */
355 static kmutex_t mnoderange_lock;
356 
357 /*
358  * Only let one thread at a time try to coalesce large pages, to
359  * prevent them from working against each other.
360  */
361 static kmutex_t	contig_lock;
362 #define	CONTIG_LOCK()	mutex_enter(&contig_lock);
363 #define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);
364 
365 #define	PFN_16M		(mmu_btop((uint64_t)0x1000000))
366 
367 caddr_t
368 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
369 {
370 	caddr_t addr;
371 	caddr_t addr1;
372 	page_t *pp;
373 
374 	addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
375 
376 	for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
377 		pp = page_numtopp_nolock(pf);
378 		if (pp == NULL) {
379 			hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
380 			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
381 		} else {
382 			hat_memload(kas.a_hat, addr, pp,
383 			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
384 		}
385 	}
386 
387 	return (addr1);
388 }
389 
390 /*
391  * This routine is like page_numtopp, but accepts only free pages, which
392  * it allocates (unfrees) and returns with the exclusive lock held.
393  * It is used by machdep.c/dma_init() to find contiguous free pages.
394  */
395 page_t *
396 page_numtopp_alloc(pfn_t pfnum)
397 {
398 	page_t *pp;
399 
400 retry:
401 	pp = page_numtopp_nolock(pfnum);
402 	if (pp == NULL) {
403 		return (NULL);
404 	}
405 
406 	if (!page_trylock(pp, SE_EXCL)) {
407 		return (NULL);
408 	}
409 
410 	if (page_pptonum(pp) != pfnum) {
411 		page_unlock(pp);
412 		goto retry;
413 	}
414 
415 	if (!PP_ISFREE(pp)) {
416 		page_unlock(pp);
417 		return (NULL);
418 	}
419 	if (pp->p_szc) {
420 		page_demote_free_pages(pp);
421 		page_unlock(pp);
422 		goto retry;
423 	}
424 
425 	/* If associated with a vnode, destroy mappings */
426 
427 	if (pp->p_vnode) {
428 
429 		page_destroy_free(pp);
430 
431 		if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
432 			return (NULL);
433 		}
434 
435 		if (page_pptonum(pp) != pfnum) {
436 			page_unlock(pp);
437 			goto retry;
438 		}
439 	}
440 
441 	if (!PP_ISFREE(pp)) {
442 		page_unlock(pp);
443 		return (NULL);
444 	}
445 
446 	if (!page_reclaim(pp, (kmutex_t *)NULL))
447 		return (NULL);
448 
449 	return (pp);
450 }
451 
452 /*
453  * Return the optimum page size for a given mapping
454  */
455 /*ARGSUSED*/
456 size_t
457 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
458 {
459 	level_t l = 0;
460 	size_t pgsz = MMU_PAGESIZE;
461 	size_t max_lpsize;
462 	uint_t mszc;
463 
464 	ASSERT(maptype != MAPPGSZ_VA);
465 
466 	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
467 		return (MMU_PAGESIZE);
468 	}
469 
470 	switch (maptype) {
471 	case MAPPGSZ_HEAP:
472 	case MAPPGSZ_STK:
473 		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
474 		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
475 		if (max_lpsize == MMU_PAGESIZE) {
476 			return (MMU_PAGESIZE);
477 		}
478 		if (len == 0) {
479 			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
480 			    p->p_brksize - p->p_bssbase : p->p_stksize;
481 		}
482 		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
483 		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
484 
485 		/*
486 		 * use the page size that best fits len
487 		 */
488 		for (l = mmu.umax_page_level; l > 0; --l) {
489 			if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
490 				continue;
491 			} else {
492 				pgsz = LEVEL_SIZE(l);
493 			}
494 			break;
495 		}
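		/*
		 * For example, on amd64 (2MB level-1 pages) with len of 3MB
		 * and max_uheap_lpsize tuned up to 2MB, LEVEL_SIZE(1) is the
		 * largest size that fits, so pgsz becomes 2MB.
		 */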
496 
497 		mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
498 		    p->p_stkpageszc);
499 		if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
500 			pgsz = hw_page_array[mszc].hp_size;
501 		}
502 		return (pgsz);
503 
504 	case MAPPGSZ_ISM:
505 		for (l = mmu.umax_page_level; l > 0; --l) {
506 			if (len >= LEVEL_SIZE(l))
507 				return (LEVEL_SIZE(l));
508 		}
509 		return (LEVEL_SIZE(0));
510 	}
511 	return (pgsz);
512 }
513 
514 static uint_t
515 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
516     size_t min_physmem)
517 {
518 	caddr_t eaddr = addr + size;
519 	uint_t szcvec = 0;
520 	caddr_t raddr;
521 	caddr_t readdr;
522 	size_t	pgsz;
523 	int i;
524 
525 	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
526 		return (0);
527 	}
528 
529 	for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
530 		pgsz = page_get_pagesize(i);
531 		if (pgsz > max_lpsize) {
532 			continue;
533 		}
534 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
535 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
536 		if (raddr < addr || raddr >= readdr) {
537 			continue;
538 		}
539 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
540 			continue;
541 		}
542 		/*
543 		 * Set szcvec to the remaining page sizes.
544 		 */
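		/*
		 * For example, stopping at i == 2 yields szcvec == 0x6:
		 * size codes 1 and 2 are usable, while bit 0 (the base
		 * page size) is always cleared.
		 */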
545 		szcvec = ((1 << (i + 1)) - 1) & ~1;
546 		break;
547 	}
548 	return (szcvec);
549 }
550 
551 /*
552  * Return a bit vector of large page size codes that
553  * can be used to map the [addr, addr + len) region.
554  */
555 /*ARGSUSED*/
556 uint_t
557 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
558     int memcntl)
559 {
560 	size_t max_lpsize = mcntl0_lpsize;
561 
562 	if (mmu.max_page_level == 0)
563 		return (0);
564 
565 	if (flags & MAP_TEXT) {
566 		if (!memcntl)
567 			max_lpsize = max_utext_lpsize;
568 		return (map_szcvec(addr, size, off, max_lpsize,
569 		    shm_lpg_min_physmem));
570 
571 	} else if (flags & MAP_INITDATA) {
572 		if (!memcntl)
573 			max_lpsize = max_uidata_lpsize;
574 		return (map_szcvec(addr, size, off, max_lpsize,
575 		    privm_lpg_min_physmem));
576 
577 	} else if (type == MAPPGSZC_SHM) {
578 		if (!memcntl)
579 			max_lpsize = max_shm_lpsize;
580 		return (map_szcvec(addr, size, off, max_lpsize,
581 		    shm_lpg_min_physmem));
582 
583 	} else if (type == MAPPGSZC_HEAP) {
584 		if (!memcntl)
585 			max_lpsize = max_uheap_lpsize;
586 		return (map_szcvec(addr, size, off, max_lpsize,
587 		    privm_lpg_min_physmem));
588 
589 	} else if (type == MAPPGSZC_STACK) {
590 		if (!memcntl)
591 			max_lpsize = max_ustack_lpsize;
592 		return (map_szcvec(addr, size, off, max_lpsize,
593 		    privm_lpg_min_physmem));
594 
595 	} else {
596 		if (!memcntl)
597 			max_lpsize = max_privmap_lpsize;
598 		return (map_szcvec(addr, size, off, max_lpsize,
599 		    privm_lpg_min_physmem));
600 	}
601 }
602 
603 /*
604  * Handle a pagefault.
605  */
606 faultcode_t
607 pagefault(
608 	caddr_t addr,
609 	enum fault_type type,
610 	enum seg_rw rw,
611 	int iskernel)
612 {
613 	struct as *as;
614 	struct hat *hat;
615 	struct proc *p;
616 	kthread_t *t;
617 	faultcode_t res;
618 	caddr_t base;
619 	size_t len;
620 	int err;
621 	int mapped_red;
622 	uintptr_t ea;
623 
624 	ASSERT_STACK_ALIGNED();
625 
626 	if (INVALID_VADDR(addr))
627 		return (FC_NOMAP);
628 
629 	mapped_red = segkp_map_red();
630 
631 	if (iskernel) {
632 		as = &kas;
633 		hat = as->a_hat;
634 	} else {
635 		t = curthread;
636 		p = ttoproc(t);
637 		as = p->p_as;
638 		hat = as->a_hat;
639 	}
640 
641 	/*
642 	 * Dispatch pagefault.
643 	 */
644 	res = as_fault(hat, as, addr, 1, type, rw);
645 
646 	/*
647 	 * If this isn't a potential unmapped hole in the user's
648 	 * UNIX data or stack segments, just return status info.
649 	 */
650 	if (res != FC_NOMAP || iskernel)
651 		goto out;
652 
653 	/*
654 	 * Check to see if we happened to fault on a currently unmapped
655 	 * part of the UNIX data or stack segments.  If so, create a zfod
656 	 * mapping there and then try calling the fault routine again.
657 	 */
658 	base = p->p_brkbase;
659 	len = p->p_brksize;
660 
661 	if (addr < base || addr >= base + len) {		/* data seg? */
662 		base = (caddr_t)p->p_usrstack - p->p_stksize;
663 		len = p->p_stksize;
664 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
665 			/* not in either UNIX data or stack segments */
666 			res = FC_NOMAP;
667 			goto out;
668 		}
669 	}
670 
671 	/*
672 	 * The rest of this function implements 3.X, 4.X, and 5.X compatibility.
673 	 * This code is probably not needed anymore.
674 	 */
675 	if (p->p_model == DATAMODEL_ILP32) {
676 
677 		/* expand the gap to the page boundaries on each side */
678 		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
679 		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
680 		len = ea - (uintptr_t)base;
681 
682 		as_rangelock(as);
683 		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
684 		    0) {
685 			err = as_map(as, base, len, segvn_create, zfod_argsp);
686 			as_rangeunlock(as);
687 			if (err) {
688 				res = FC_MAKE_ERR(err);
689 				goto out;
690 			}
691 		} else {
692 			/*
693 			 * This page was already mapped by another thread after
694 			 * we returned from as_fault() above.  We just fall
695 			 * through to as_fault() below.
696 			 */
697 			as_rangeunlock(as);
698 		}
699 
700 		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
701 	}
702 
703 out:
704 	if (mapped_red)
705 		segkp_unmap_red();
706 
707 	return (res);
708 }
709 
710 void
711 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
712 {
713 	struct proc *p = curproc;
714 	caddr_t userlimit = (flags & _MAP_LOW32) ?
715 	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
716 
717 	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
718 }
719 
720 /*ARGSUSED*/
721 int
722 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
723 {
724 	return (0);
725 }
726 
727 /*
728  * The maximum amount a randomized mapping will be slewed.  We should perhaps
729  * arrange things so these tunables can be separate for mmap, mmapobj, and
730  * ld.so.
731  */
732 size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */
733 
734 /*
735  * map_addr_proc() is the routine called when the system is to
736  * choose an address for the user.  We will pick an address
737  * range which is the highest available below userlimit.
738  *
739  * Every mapping will have a redzone of a single page on either side of
740  * the request. This is done to leave one page unmapped between segments.
741  * This is not required, but it's useful for the user because if their
742  * program strays across a segment boundary, it will catch a fault
743  * immediately, making debugging a little easier.  Currently the redzone
744  * is mandatory.
745  *
746  * addrp is a value/result parameter.
747  *	On input it is a hint from the user to be used in a completely
748  *	machine dependent fashion.  We decide to completely ignore this hint.
749  *	If MAP_ALIGN was specified, addrp contains the minimal alignment, which
750  *	must be some "power of two" multiple of pagesize.
751  *
752  *	On output it is NULL if no address can be found in the current
753  *	process's address space or else an address that is currently
754  *	not mapped for len bytes with a page of red zone on either side.
755  *
756  *	vacalign is not needed on x86 (it's for virtually addressed caches)
757  */
758 /*ARGSUSED*/
759 void
760 map_addr_proc(
761 	caddr_t *addrp,
762 	size_t len,
763 	offset_t off,
764 	int vacalign,
765 	caddr_t userlimit,
766 	struct proc *p,
767 	uint_t flags)
768 {
769 	struct as *as = p->p_as;
770 	caddr_t addr;
771 	caddr_t base;
772 	size_t slen;
773 	size_t align_amount;
774 
775 	ASSERT32(userlimit == as->a_userlimit);
776 
777 	base = p->p_brkbase;
778 #if defined(__amd64)
779 	/*
780 	 * XX64 Yes, this needs more work.
781 	 */
782 	if (p->p_model == DATAMODEL_NATIVE) {
783 		if (userlimit < as->a_userlimit) {
784 			/*
785 			 * This happens when a program wants to map
786 			 * something in a range that's accessible to a
787 			 * program in a smaller address space.  For example,
788 			 * a 64-bit program calling mmap32(2) to guarantee
789 			 * that the returned address is below 4Gbytes.
790 			 */
791 			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
792 
793 			if (userlimit > base)
794 				slen = userlimit - base;
795 			else {
796 				*addrp = NULL;
797 				return;
798 			}
799 		} else {
800 			/*
801 			 * XX64 This layout is probably wrong .. but in
802 			 * the event we make the amd64 address space look
803 			 * like sparcv9 i.e. with the stack -above- the
804 			 * heap, this bit of code might even be correct.
805 			 */
806 			slen = p->p_usrstack - base -
807 			    ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
808 		}
809 	} else
810 #endif
811 		slen = userlimit - base;
812 
813 	/* Make len be a multiple of PAGESIZE */
814 	len = (len + PAGEOFFSET) & PAGEMASK;
815 
816 	/*
817 	 * figure out what the alignment should be
818 	 *
819 	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
820 	 */
821 	if (len <= ELF_386_MAXPGSZ) {
822 		/*
823 		 * Align virtual addresses to ensure that ELF shared libraries
824 		 * are mapped with the appropriate alignment constraints by
825 		 * the run-time linker.
826 		 */
827 		align_amount = ELF_386_MAXPGSZ;
828 	} else {
829 		/*
830 		 * For 32-bit processes, only those which have specified
831 		 * MAP_ALIGN and an addr will be aligned on a larger page size.
832 		 * Not doing so can potentially waste up to 1G of process
833 		 * address space.
834 		 */
835 		int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
836 		    mmu.umax_page_level;
837 
838 		while (lvl && len < LEVEL_SIZE(lvl))
839 			--lvl;
840 
841 		align_amount = LEVEL_SIZE(lvl);
842 	}
843 	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
844 		align_amount = (uintptr_t)*addrp;
845 
846 	ASSERT(ISP2(align_amount));
847 	ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
848 
849 	off = off & (align_amount - 1);
850 
851 	/*
852 	 * Look for a large enough hole starting below userlimit.
853 	 * After finding it, use the upper part.
854 	 */
855 	if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
856 	    PAGESIZE, off) == 0) {
857 		caddr_t as_addr;
858 
859 		/*
860 		 * addr is the highest possible address to use since we have
861 		 * a PAGESIZE redzone at the beginning and end.
862 		 */
863 		addr = base + slen - (PAGESIZE + len);
864 		as_addr = addr;
865 		/*
866 		 * Round address DOWN to the alignment amount and
867 		 * add the offset in.
868 		 * If addr is greater than as_addr, len would not be large
869 		 * enough to include the redzone, so we must adjust down
870 		 * by the alignment amount.
871 		 */
872 		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
873 		addr += (uintptr_t)off;
874 		if (addr > as_addr) {
875 			addr -= align_amount;
876 		}
877 
878 		/*
879 		 * If randomization is requested, slew the allocation
880 		 * backwards, within the same gap, by a random amount.
881 		 */
882 		if (flags & _MAP_RANDOMIZE) {
883 			uint32_t slew;
884 
885 			(void) random_get_pseudo_bytes((uint8_t *)&slew,
886 			    sizeof (slew));
887 
888 			slew = slew % MIN(aslr_max_map_skew, (addr - base));
889 			addr -= P2ALIGN(slew, align_amount);
890 		}
891 
892 		ASSERT(addr > base);
893 		ASSERT(addr + len < base + slen);
894 		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
895 		    ((uintptr_t)(off)));
896 		*addrp = addr;
897 	} else {
898 		*addrp = NULL;	/* no more virtual space */
899 	}
900 }
901 
902 int valid_va_range_aligned_wraparound;
903 
904 /*
905  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
906  * addresses at least "minlen" long, where the base of the range is at "off"
907  * phase from an "align" boundary and there is space for a "redzone"-sized
908  * redzone on either side of the range.  On success, 1 is returned and *basep
909  * and *lenp are adjusted to describe the acceptable range (including
910  * the redzone).  On failure, 0 is returned.
911  */
912 /*ARGSUSED3*/
913 int
914 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
915     size_t align, size_t redzone, size_t off)
916 {
917 	uintptr_t hi, lo;
918 	size_t tot_len;
919 
920 	ASSERT(align == 0 ? off == 0 : off < align);
921 	ASSERT(ISP2(align));
922 	ASSERT(align == 0 || align >= PAGESIZE);
923 
924 	lo = (uintptr_t)*basep;
925 	hi = lo + *lenp;
926 	tot_len = minlen + 2 * redzone; /* need at least this much space */
927 
928 	/*
929 	 * If hi rolled over the top, try cutting back.
930 	 */
931 	if (hi < lo) {
932 		*lenp = 0UL - lo - 1UL;
933 		/* See if this really happens. If so, then we figure out why */
934 		valid_va_range_aligned_wraparound++;
935 		hi = lo + *lenp;
936 	}
937 	if (*lenp < tot_len) {
938 		return (0);
939 	}
940 
941 #if defined(__amd64)
942 	/*
943 	 * Deal with a possible hole in the address range between
944 	 * hole_start and hole_end that should never be mapped.
945 	 */
946 	if (lo < hole_start) {
947 		if (hi > hole_start) {
948 			if (hi < hole_end) {
949 				hi = hole_start;
950 			} else {
951 				/* lo < hole_start && hi >= hole_end */
952 				if (dir == AH_LO) {
953 					/*
954 					 * prefer lowest range
955 					 */
956 					if (hole_start - lo >= tot_len)
957 						hi = hole_start;
958 					else if (hi - hole_end >= tot_len)
959 						lo = hole_end;
960 					else
961 						return (0);
962 				} else {
963 					/*
964 					 * prefer highest range
965 					 */
966 					if (hi - hole_end >= tot_len)
967 						lo = hole_end;
968 					else if (hole_start - lo >= tot_len)
969 						hi = hole_start;
970 					else
971 						return (0);
972 				}
973 			}
974 		}
975 	} else {
976 		/* lo >= hole_start */
977 		if (hi < hole_end)
978 			return (0);
979 		if (lo < hole_end)
980 			lo = hole_end;
981 	}
982 #endif
983 
984 	if (hi - lo < tot_len)
985 		return (0);
986 
987 	if (align > 1) {
988 		uintptr_t tlo = lo + redzone;
989 		uintptr_t thi = hi - redzone;
990 		tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
991 		if (tlo < lo + redzone) {
992 			return (0);
993 		}
994 		if (thi < tlo || thi - tlo < minlen) {
995 			return (0);
996 		}
997 	}
998 
999 	*basep = (caddr_t)lo;
1000 	*lenp = hi - lo;
1001 	return (1);
1002 }
1003 
1004 /*
1005  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
1006  * addresses at least "minlen" long.  On success, 1 is returned and *basep
1007  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
1008  * is returned.
1009  */
1010 int
1011 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
1012 {
1013 	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
1014 }
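
/*
 * For example, a hypothetical caller that needs a 1MB sub-range aligned to
 * 2MB, with a one-page redzone on each side, could use:
 *
 *	if (valid_va_range_aligned(&base, &len, 0x100000, AH_LO,
 *	    0x200000, PAGESIZE, 0) == 0)
 *		return (ENOMEM);
 *
 * On success, *basep and *lenp (here base and len) describe a sub-range of
 * the original [base, base + len) that can hold the aligned allocation plus
 * its redzones.
 */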
1015 
1016 /*
1017  * Default to forbidding the first 64k of address space.  This protects most
1018  * reasonably sized structures from dereferences through NULL:
1019  *     ((foo_t *)0)->bar
1020  */
1021 uintptr_t forbidden_null_mapping_sz = 0x10000;
1022 
1023 /*
1024  * Determine whether [addr, addr+len] are valid user addresses.
1025  */
1026 /*ARGSUSED*/
1027 int
1028 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
1029     caddr_t userlimit)
1030 {
1031 	caddr_t eaddr = addr + len;
1032 
1033 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
1034 		return (RANGE_BADADDR);
1035 
1036 	if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
1037 	    as->a_proc != NULL &&
1038 	    secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
1039 		return (RANGE_BADADDR);
1040 
1041 #if defined(__amd64)
1042 	/*
1043 	 * Check for the VA hole
1044 	 */
1045 	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
1046 		return (RANGE_BADADDR);
1047 #endif
1048 
1049 	return (RANGE_OKAY);
1050 }
1051 
1052 /*
1053  * Return 1 if the page frame is onboard memory, else 0.
1054  */
1055 int
1056 pf_is_memory(pfn_t pf)
1057 {
1058 	if (pfn_is_foreign(pf))
1059 		return (0);
1060 	return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
1061 }
1062 
1063 /*
1064  * return the memrange containing pfn
1065  */
1066 int
1067 memrange_num(pfn_t pfn)
1068 {
1069 	int n;
1070 
1071 	for (n = 0; n < nranges - 1; ++n) {
1072 		if (pfn >= memranges[n])
1073 			break;
1074 	}
1075 	return (n);
1076 }
1077 
1078 /*
1079  * return the mnoderange containing pfn
1080  */
1081 /*ARGSUSED*/
1082 int
1083 pfn_2_mtype(pfn_t pfn)
1084 {
1085 #if defined(__xpv)
1086 	return (0);
1087 #else
1088 	int	n;
1089 
1090 	/* Always start from highest pfn and work our way down */
1091 	for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1092 		if (pfn >= mnoderanges[n].mnr_pfnlo) {
1093 			break;
1094 		}
1095 	}
1096 	return (n);
1097 #endif
1098 }
1099 
1100 #if !defined(__xpv)
1101 /*
1102  * is_contigpage_free:
1103  *	returns a page list of contiguous pages. It minimally has to return
1104  *	minctg pages. Caller determines minctg based on the scatter-gather
1105  *	list length.
1106  *
1107  *	pfnp is set to the next page frame to search on return.
1108  *	pfnp is set to the next page frame to search on return.
 */
1109 static page_t *
1110 is_contigpage_free(
1111 	pfn_t *pfnp,
1112 	pgcnt_t *pgcnt,
1113 	pgcnt_t minctg,
1114 	uint64_t pfnseg,
1115 	int iolock)
1116 {
1117 	int	i = 0;
1118 	pfn_t	pfn = *pfnp;
1119 	page_t	*pp;
1120 	page_t	*plist = NULL;
1121 
1122 	/*
1123 	 * fail if pfn + minctg crosses a segment boundary.
1124 	 * Adjust for next starting pfn to begin at segment boundary.
1125 	 */
1126 
1127 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1128 		*pfnp = roundup(*pfnp, pfnseg + 1);
1129 		return (NULL);
1130 	}
1131 
1132 	do {
1133 retry:
1134 		pp = page_numtopp_nolock(pfn + i);
1135 		if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
1136 		    (page_trylock(pp, SE_EXCL) == 0)) {
1137 			(*pfnp)++;
1138 			break;
1139 		}
1140 		if (page_pptonum(pp) != pfn + i) {
1141 			page_unlock(pp);
1142 			goto retry;
1143 		}
1144 
1145 		if (!(PP_ISFREE(pp))) {
1146 			page_unlock(pp);
1147 			(*pfnp)++;
1148 			break;
1149 		}
1150 
1151 		if (!PP_ISAGED(pp)) {
1152 			page_list_sub(pp, PG_CACHE_LIST);
1153 			page_hashout(pp, (kmutex_t *)NULL);
1154 		} else {
1155 			page_list_sub(pp, PG_FREE_LIST);
1156 		}
1157 
1158 		if (iolock)
1159 			page_io_lock(pp);
1160 		page_list_concat(&plist, &pp);
1161 
1162 		/*
1163 		 * exit loop when pgcnt satisfied or segment boundary reached.
1164 		 */
1165 
1166 	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1167 
1168 	*pfnp += i;		/* set to next pfn to search */
1169 
1170 	if (i >= minctg) {
1171 		*pgcnt -= i;
1172 		return (plist);
1173 	}
1174 
1175 	/*
1176 	 * failure: minctg not satisfied.
1177 	 *
1178 	 * if next request crosses segment boundary, set next pfn
1179 	 * to search from the segment boundary.
1180 	 */
1181 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1182 		*pfnp = roundup(*pfnp, pfnseg + 1);
1183 
1184 	/* clean up any pages already allocated */
1185 
1186 	while (plist) {
1187 		pp = plist;
1188 		page_sub(&plist, pp);
1189 		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
1190 		if (iolock)
1191 			page_io_unlock(pp);
1192 		page_unlock(pp);
1193 	}
1194 
1195 	return (NULL);
1196 }
1197 #endif	/* !__xpv */
1198 
1199 /*
1200  * verify that pages being returned from allocator have correct DMA attribute
1201  */
1202 #ifndef DEBUG
1203 #define	check_dma(a, b, c) (void)(0)
1204 #else
1205 static void
1206 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
1207 {
1208 	if (dma_attr == NULL)
1209 		return;
1210 
1211 	while (cnt-- > 0) {
1212 		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
1213 		    dma_attr->dma_attr_addr_lo)
1214 			panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
1215 		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
1216 		    dma_attr->dma_attr_addr_hi)
1217 			panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
1218 		pp = pp->p_next;
1219 	}
1220 }
1221 #endif
1222 
1223 #if !defined(__xpv)
1224 static page_t *
1225 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1226 {
1227 	pfn_t		pfn;
1228 	int		sgllen;
1229 	uint64_t	pfnseg;
1230 	pgcnt_t		minctg;
1231 	page_t		*pplist = NULL, *plist;
1232 	uint64_t	lo, hi;
1233 	pgcnt_t		pfnalign = 0;
1234 	static pfn_t	startpfn;
1235 	static pgcnt_t	lastctgcnt;
1236 	uintptr_t	align;
1237 
1238 	CONTIG_LOCK();
1239 
1240 	if (mattr) {
1241 		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1242 		hi = mmu_btop(mattr->dma_attr_addr_hi);
1243 		if (hi >= physmax)
1244 			hi = physmax - 1;
1245 		sgllen = mattr->dma_attr_sgllen;
1246 		pfnseg = mmu_btop(mattr->dma_attr_seg);
1247 
1248 		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1249 		if (align > MMU_PAGESIZE)
1250 			pfnalign = mmu_btop(align);
1251 
1252 		/*
1253 		 * in order to satisfy the request, must minimally
1254 		 * acquire minctg contiguous pages
1255 		 */
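		/*
		 * For example, a request for 16 pages with a scatter-gather
		 * length of 4 only needs runs of howmany(16, 4) == 4
		 * contiguous pages each.
		 */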
1256 		minctg = howmany(*pgcnt, sgllen);
1257 
1258 		ASSERT(hi >= lo);
1259 
1260 		/*
1261 		 * start from where last searched if the minctg >= lastctgcnt
1262 		 */
1263 		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1264 			startpfn = lo;
1265 	} else {
1266 		hi = physmax - 1;
1267 		lo = 0;
1268 		sgllen = 1;
1269 		pfnseg = mmu.highest_pfn;
1270 		minctg = *pgcnt;
1271 
1272 		if (minctg < lastctgcnt)
1273 			startpfn = lo;
1274 	}
1275 	lastctgcnt = minctg;
1276 
1277 	ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1278 
1279 	/* conserve 16m memory - start search above 16m when possible */
1280 	if (hi > PFN_16M && startpfn < PFN_16M)
1281 		startpfn = PFN_16M;
1282 
1283 	pfn = startpfn;
1284 	if (pfnalign)
1285 		pfn = P2ROUNDUP(pfn, pfnalign);
1286 
1287 	while (pfn + minctg - 1 <= hi) {
1288 
1289 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1290 		if (plist) {
1291 			page_list_concat(&pplist, &plist);
1292 			sgllen--;
1293 			/*
1294 			 * return when contig pages no longer needed
1295 			 */
1296 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1297 				startpfn = pfn;
1298 				CONTIG_UNLOCK();
1299 				check_dma(mattr, pplist, *pgcnt);
1300 				return (pplist);
1301 			}
1302 			minctg = howmany(*pgcnt, sgllen);
1303 		}
1304 		if (pfnalign)
1305 			pfn = P2ROUNDUP(pfn, pfnalign);
1306 	}
1307 
1308 	/* cannot find contig pages in specified range */
1309 	if (startpfn == lo) {
1310 		CONTIG_UNLOCK();
1311 		return (NULL);
1312 	}
1313 
1314 	/* did not start with lo previously */
1315 	pfn = lo;
1316 	if (pfnalign)
1317 		pfn = P2ROUNDUP(pfn, pfnalign);
1318 
1319 	/* allow search to go above startpfn */
1320 	while (pfn < startpfn) {
1321 
1322 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1323 		if (plist != NULL) {
1324 
1325 			page_list_concat(&pplist, &plist);
1326 			sgllen--;
1327 
1328 			/*
1329 			 * return when contig pages no longer needed
1330 			 */
1331 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1332 				startpfn = pfn;
1333 				CONTIG_UNLOCK();
1334 				check_dma(mattr, pplist, *pgcnt);
1335 				return (pplist);
1336 			}
1337 			minctg = howmany(*pgcnt, sgllen);
1338 		}
1339 		if (pfnalign)
1340 			pfn = P2ROUNDUP(pfn, pfnalign);
1341 	}
1342 	CONTIG_UNLOCK();
1343 	return (NULL);
1344 }
1345 #endif	/* !__xpv */
1346 
1347 /*
1348  * mnode_range_cnt() calculates the number of memory ranges for mnode and
1349  * memranges[]. Used to determine the size of page lists and mnoderanges.
1350  */
1351 int
1352 mnode_range_cnt(int mnode)
1353 {
1354 #if defined(__xpv)
1355 	ASSERT(mnode == 0);
1356 	return (1);
1357 #else	/* __xpv */
1358 	int	mri;
1359 	int	mnrcnt = 0;
1360 
1361 	if (mem_node_config[mnode].exists != 0) {
1362 		mri = nranges - 1;
1363 
1364 		/* find the memrange index containing the mnode's base pfn */
1365 
1366 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1367 			mri--;
1368 
1369 		/*
1370 		 * increment mnode range counter when memranges or mnode
1371 		 * boundary is reached.
1372 		 */
1373 		while (mri >= 0 &&
1374 		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1375 			mnrcnt++;
1376 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1377 				mri--;
1378 			else
1379 				break;
1380 		}
1381 	}
1382 	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1383 	return (mnrcnt);
1384 #endif	/* __xpv */
1385 }
1386 
1387 /*
1388  * mnode_range_setup() initializes mnoderanges.
1389  */
1390 void
1391 mnode_range_setup(mnoderange_t *mnoderanges)
1392 {
1393 	mnoderange_t *mp = mnoderanges;
1394 	int	mnode, mri;
1395 	int	mindex = 0;	/* current index into mnoderanges array */
1396 	int	i, j;
1397 	pfn_t	hipfn;
1398 	int	last, hi;
1399 
1400 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
1401 		if (mem_node_config[mnode].exists == 0)
1402 			continue;
1403 
1404 		mri = nranges - 1;
1405 
1406 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1407 			mri--;
1408 
1409 		while (mri >= 0 && mem_node_config[mnode].physmax >=
1410 		    MEMRANGELO(mri)) {
1411 			mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
1412 			    mem_node_config[mnode].physbase);
1413 			mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1414 			    mem_node_config[mnode].physmax);
1415 			mnoderanges->mnr_mnode = mnode;
1416 			mnoderanges->mnr_memrange = mri;
1417 			mnoderanges->mnr_exists = 1;
1418 			mnoderanges++;
1419 			mindex++;
1420 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1421 				mri--;
1422 			else
1423 				break;
1424 		}
1425 	}
1426 
1427 	/*
1428 	 * For now do a simple sort of the mnoderanges array to fill in
1429 	 * the mnr_next fields.  Since mindex is expected to be relatively
1430  * small, a simple O(N^2) algorithm is used.
1431 	 */
1432 	for (i = 0; i < mindex; i++) {
1433 		if (mp[i].mnr_pfnlo == 0)	/* find lowest */
1434 			break;
1435 	}
1436 	ASSERT(i < mindex);
1437 	last = i;
1438 	mtype16m = last;
1439 	mp[last].mnr_next = -1;
1440 	for (i = 0; i < mindex - 1; i++) {
1441 		hipfn = (pfn_t)(-1);
1442 		hi = -1;
1443 		/* find next highest mnode range */
1444 		for (j = 0; j < mindex; j++) {
1445 			if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
1446 			    mp[j].mnr_pfnlo < hipfn) {
1447 				hipfn = mp[j].mnr_pfnlo;
1448 				hi = j;
1449 			}
1450 		}
1451 		mp[hi].mnr_next = last;
1452 		last = hi;
1453 	}
1454 	mtypetop = last;
1455 }
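
/*
 * For example, a hypothetical single-node machine with 16GB of memory
 * (physmax == 0x3fffff, 4K pages) ends up with four mnoderanges:
 *
 *	[0x100000, 0x3fffff]  MRI_4G   <- mtypetop
 *	[0x080000, 0x0fffff]  MRI_2G
 *	[0x001000, 0x07ffff]  MRI_16M
 *	[0x000000, 0x000fff]  MRI_0    <- mtype16m
 *
 * with mnr_next chaining each entry to the next lower one, so walks such as
 * pfn_2_mtype() proceed from the highest pfns downward.
 */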
1456 
1457 #ifndef	__xpv
1458 /*
1459  * Update mnoderanges for memory hot-add DR operations.
1460  */
1461 static void
1462 mnode_range_add(int mnode)
1463 {
1464 	int	*prev;
1465 	int	n, mri;
1466 	pfn_t	start, end;
1467 	extern	void membar_sync(void);
1468 
1469 	ASSERT(0 <= mnode && mnode < max_mem_nodes);
1470 	ASSERT(mem_node_config[mnode].exists);
1471 	start = mem_node_config[mnode].physbase;
1472 	end = mem_node_config[mnode].physmax;
1473 	ASSERT(start <= end);
1474 	mutex_enter(&mnoderange_lock);
1475 
1476 #ifdef	DEBUG
1477 	/* Check whether it interleaves with other memory nodes. */
1478 	for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1479 		ASSERT(mnoderanges[n].mnr_exists);
1480 		if (mnoderanges[n].mnr_mnode == mnode)
1481 			continue;
1482 		ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1483 		    end < mnoderanges[n].mnr_pfnlo);
1484 	}
1485 #endif	/* DEBUG */
1486 
1487 	mri = nranges - 1;
1488 	while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1489 		mri--;
1490 	while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1491 		/* Check whether mtype already exists. */
1492 		for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1493 			if (mnoderanges[n].mnr_mnode == mnode &&
1494 			    mnoderanges[n].mnr_memrange == mri) {
1495 				mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1496 				    start);
1497 				mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1498 				    end);
1499 				break;
1500 			}
1501 		}
1502 
1503 		/* Add a new entry if it doesn't exist yet. */
1504 		if (n == -1) {
1505 			/* Try to find an unused entry in mnoderanges array. */
1506 			for (n = 0; n < mnoderangecnt; n++) {
1507 				if (mnoderanges[n].mnr_exists == 0)
1508 					break;
1509 			}
1510 			ASSERT(n < mnoderangecnt);
1511 			mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1512 			mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1513 			mnoderanges[n].mnr_mnode = mnode;
1514 			mnoderanges[n].mnr_memrange = mri;
1515 			mnoderanges[n].mnr_exists = 1;
1516 			/* Page 0 should always be present. */
1517 			for (prev = &mtypetop;
1518 			    mnoderanges[*prev].mnr_pfnlo > start;
1519 			    prev = &mnoderanges[*prev].mnr_next) {
1520 				ASSERT(mnoderanges[*prev].mnr_next >= 0);
1521 				ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1522 			}
1523 			mnoderanges[n].mnr_next = *prev;
1524 			membar_sync();
1525 			*prev = n;
1526 		}
1527 
1528 		if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1529 			mri--;
1530 		else
1531 			break;
1532 	}
1533 
1534 	mutex_exit(&mnoderange_lock);
1535 }
1536 
1537 /*
1538  * Update mnoderanges for memory hot-removal DR operations.
1539  */
1540 static void
1541 mnode_range_del(int mnode)
1542 {
1543 	_NOTE(ARGUNUSED(mnode));
1544 	ASSERT(0 <= mnode && mnode < max_mem_nodes);
1545 	/* TODO: support deletion operation. */
1546 	ASSERT(0);
1547 }
1548 
1549 void
1550 plat_slice_add(pfn_t start, pfn_t end)
1551 {
1552 	mem_node_add_slice(start, end);
1553 	if (plat_dr_enabled()) {
1554 		mnode_range_add(PFN_2_MEM_NODE(start));
1555 	}
1556 }
1557 
1558 void
1559 plat_slice_del(pfn_t start, pfn_t end)
1560 {
1561 	ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1562 	ASSERT(plat_dr_enabled());
1563 	mnode_range_del(PFN_2_MEM_NODE(start));
1564 	mem_node_del_slice(start, end);
1565 }
1566 #endif	/* __xpv */
1567 
1568 /*ARGSUSED*/
1569 int
1570 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1571 {
1572 	int mtype = mtypetop;
1573 
1574 #if !defined(__xpv)
1575 #if defined(__i386)
1576 	/*
1577 	 * set the mtype range
1578 	 * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1579 	 * - for non kmem requests, set range to above 4g if memory below 4g
1580 	 * runs low.
1581 	 */
1582 	if (restricted_kmemalloc && VN_ISKAS(vp) &&
1583 	    (caddr_t)(vaddr) >= kernelheap &&
1584 	    (caddr_t)(vaddr) < ekernelheap) {
1585 		ASSERT(physmax4g);
1586 		mtype = mtype4g;
1587 		if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
1588 		    btop(pgsz), *flags)) {
1589 			*flags |= PGI_MT_RANGE16M;
1590 		} else {
1591 			VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1592 			VM_STAT_COND_ADD((*flags & PG_PANIC),
1593 			    vmm_vmstats.pgpanicalloc);
1594 			*flags |= PGI_MT_RANGE0;
1595 		}
1596 		return (mtype);
1597 	}
1598 #endif	/* __i386 */
1599 
1600 	if (RESTRICT4G_ALLOC) {
1601 		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1602 		/* here only for > 4g systems */
1603 		*flags |= PGI_MT_RANGE4G;
1604 	} else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1605 		*flags |= PGI_MT_RANGE16M;
1606 	} else {
1607 		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1608 		VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1609 		*flags |= PGI_MT_RANGE0;
1610 	}
1611 #endif /* !__xpv */
1612 	return (mtype);
1613 }
1614 
1615 
1616 /* mtype init for page_get_replacement_page */
1617 /*ARGSUSED*/
1618 int
1619 mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
1620 {
1621 	int mtype = mtypetop;
1622 #if !defined(__xpv)
1623 	if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1624 		*flags |= PGI_MT_RANGE16M;
1625 	} else {
1626 		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1627 		*flags |= PGI_MT_RANGE0;
1628 	}
1629 #endif
1630 	return (mtype);
1631 }
1632 
1633 /*
1634  * Determine if the mnode range specified in mtype contains memory belonging
1635  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1636  * the range from high pfn to 0, 16m or 4g.
1637  *
1638  * Return the first mnode range type index found; otherwise return -1.
1639  */
1640 int
1641 mtype_func(int mnode, int mtype, uint_t flags)
1642 {
1643 	if (flags & PGI_MT_RANGE) {
1644 		int	mnr_lim = MRI_0;
1645 
1646 		if (flags & PGI_MT_NEXT) {
1647 			mtype = mnoderanges[mtype].mnr_next;
1648 		}
1649 		if (flags & PGI_MT_RANGE4G)
1650 			mnr_lim = MRI_4G;	/* exclude 0-4g range */
1651 		else if (flags & PGI_MT_RANGE16M)
1652 			mnr_lim = MRI_16M;	/* exclude 0-16m range */
1653 		while (mtype != -1 &&
1654 		    mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1655 			if (mnoderanges[mtype].mnr_mnode == mnode)
1656 				return (mtype);
1657 			mtype = mnoderanges[mtype].mnr_next;
1658 		}
1659 	} else if (mnoderanges[mtype].mnr_mnode == mnode) {
1660 		return (mtype);
1661 	}
1662 	return (-1);
1663 }
1664 
1665 /*
1666  * Update the page list max counts with the pfn range specified by the
1667  * input parameters.
1668  */
1669 void
1670 mtype_modify_max(pfn_t startpfn, long cnt)
1671 {
1672 	int		mtype;
1673 	pgcnt_t		inc;
1674 	spgcnt_t	scnt = (spgcnt_t)(cnt);
1675 	pgcnt_t		acnt = ABS(scnt);
1676 	pfn_t		endpfn = startpfn + acnt;
1677 	pfn_t		pfn, lo;
1678 
1679 	if (!physmax4g)
1680 		return;
1681 
1682 	mtype = mtypetop;
1683 	for (pfn = endpfn; pfn > startpfn; ) {
1684 		ASSERT(mtype != -1);
1685 		lo = mnoderanges[mtype].mnr_pfnlo;
1686 		if (pfn > lo) {
1687 			if (startpfn >= lo) {
1688 				inc = pfn - startpfn;
1689 			} else {
1690 				inc = pfn - lo;
1691 			}
1692 			if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1693 				if (scnt > 0)
1694 					maxmem4g += inc;
1695 				else
1696 					maxmem4g -= inc;
1697 			}
1698 			pfn -= inc;
1699 		}
1700 		mtype = mnoderanges[mtype].mnr_next;
1701 	}
1702 }
1703 
1704 int
1705 mtype_2_mrange(int mtype)
1706 {
1707 	return (mnoderanges[mtype].mnr_memrange);
1708 }
1709 
1710 void
1711 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1712 {
1713 	_NOTE(ARGUNUSED(mnode));
1714 	ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1715 	*pfnlo = mnoderanges[mtype].mnr_pfnlo;
1716 	*pfnhi = mnoderanges[mtype].mnr_pfnhi;
1717 }
1718 
1719 size_t
1720 plcnt_sz(size_t ctrs_sz)
1721 {
1722 #ifdef DEBUG
1723 	int	szc, colors;
1724 
1725 	ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1726 	for (szc = 0; szc < mmu_page_sizes; szc++) {
1727 		colors = page_get_pagecolors(szc);
1728 		ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1729 	}
1730 #endif
1731 	return (ctrs_sz);
1732 }
1733 
1734 caddr_t
1735 plcnt_init(caddr_t addr)
1736 {
1737 #ifdef DEBUG
1738 	int	mt, szc, colors;
1739 
1740 	for (mt = 0; mt < mnoderangecnt; mt++) {
1741 		mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1742 		addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1743 		for (szc = 0; szc < mmu_page_sizes; szc++) {
1744 			colors = page_get_pagecolors(szc);
1745 			mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1746 			mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1747 			    (pgcnt_t *)addr;
1748 			addr += (sizeof (pgcnt_t) * colors);
1749 		}
1750 	}
1751 #endif
1752 	return (addr);
1753 }
1754 
1755 void
1756 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1757 {
1758 	_NOTE(ARGUNUSED(pp));
1759 #ifdef DEBUG
1760 	int	bin = PP_2_BIN(pp);
1761 
1762 	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1763 	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1764 	    cnt);
1765 #endif
1766 	ASSERT(mtype == PP_2_MTYPE(pp));
1767 	if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1768 		atomic_add_long(&freemem4g, cnt);
1769 	if (flags & PG_CACHE_LIST)
1770 		atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1771 	else
1772 		atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1773 	atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1774 }
1775 
1776 /*
1777  * Returns the free page count for mnode
1778  */
1779 int
1780 mnode_pgcnt(int mnode)
1781 {
1782 	int	mtype = mtypetop;
1783 	int	flags = PGI_MT_RANGE0;
1784 	pgcnt_t	pgcnt = 0;
1785 
1786 	mtype = mtype_func(mnode, mtype, flags);
1787 
1788 	while (mtype != -1) {
1789 		pgcnt += MTYPE_FREEMEM(mtype);
1790 		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1791 	}
1792 	return (pgcnt);
1793 }
1794 
1795 /*
1796  * Initialize page coloring variables based on the l2 cache parameters.
1797  * Calculate and return memory needed for page coloring data structures.
1798  */
1799 size_t
1800 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1801 {
1802 	_NOTE(ARGUNUSED(l2_linesz));
1803 	size_t	colorsz = 0;
1804 	int	i;
1805 	int	colors;
1806 
1807 #if defined(__xpv)
1808 	/*
1809 	 * Hypervisor domains currently don't have any concept of NUMA.
1810 	 * Hence we'll act like there is only 1 memrange.
1811 	 */
1812 	i = memrange_num(1);
1813 #else /* !__xpv */
1814 	/*
1815  * Reduce the memory range lists if we don't have large amounts
1816 	 * of memory. This avoids searching known empty free lists.
1817 	 * To support memory DR operations, we need to keep memory ranges
1818 	 * for possible memory hot-add operations.
1819 	 */
1820 	if (plat_dr_physmax > physmax)
1821 		i = memrange_num(plat_dr_physmax);
1822 	else
1823 		i = memrange_num(physmax);
1824 #if defined(__i386)
1825 	if (i > MRI_4G)
1826 		restricted_kmemalloc = 0;
1827 #endif
1828 	/* physmax greater than 4g */
1829 	if (i == MRI_4G)
1830 		physmax4g = 1;
1831 #endif /* !__xpv */
1832 	memranges += i;
1833 	nranges -= i;
1834 
1835 	ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
1836 
1837 	ASSERT(ISP2(l2_linesz));
1838 	ASSERT(l2_sz > MMU_PAGESIZE);
1839 
1840 	/* l2_assoc is 0 for fully associative l2 cache */
1841 	if (l2_assoc)
1842 		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1843 	else
1844 		l2_colors = 1;
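	/*
	 * For example, a 2MB 8-way set-associative L2 with 4K pages gives
	 * l2_colors = 2MB / (8 * 4KB) == 64, and page_colors below becomes
	 * MAX(64, PAGE_COLORS_MIN) == 64.
	 */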
1845 
1846 	ASSERT(ISP2(l2_colors));
1847 
1848 	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
1849 	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1850 
1851 	/*
1852 	 * cpu_page_colors is non-zero when a page color may be spread across
1853 	 * multiple bins.
1854 	 */
1855 	if (l2_colors < page_colors)
1856 		cpu_page_colors = l2_colors;
1857 
1858 	ASSERT(ISP2(page_colors));
1859 
1860 	page_colors_mask = page_colors - 1;
1861 
1862 	ASSERT(ISP2(CPUSETSIZE()));
1863 	page_coloring_shift = lowbit(CPUSETSIZE());
1864 
1865 	/* initialize number of colors per page size */
1866 	for (i = 0; i <= mmu.max_page_level; i++) {
1867 		hw_page_array[i].hp_size = LEVEL_SIZE(i);
1868 		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1869 		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1870 		hw_page_array[i].hp_colors = (page_colors_mask >>
1871 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1872 		    + 1;
1873 		colorequivszc[i] = 0;
1874 	}
1875 
1876 	/*
1877 	 * The value of cpu_page_colors determines if additional color bins
1878 	 * need to be checked for a particular color in the page_get routines.
1879 	 */
1880 	if (cpu_page_colors != 0) {
1881 
1882 		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1883 		ASSERT(a > 0);
1884 		ASSERT(a < 16);
1885 
1886 		for (i = 0; i <= mmu.max_page_level; i++) {
1887 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
1888 				colorequivszc[i] = 0;
1889 				continue;
1890 			}
1891 			while ((colors >> a) == 0)
1892 				a--;
1893 			ASSERT(a >= 0);
1894 
1895 			/* the higher 4 bits encode the color equiv mask */
1896 			colorequivszc[i] = (a << 4);
1897 		}
1898 	}
1899 
1900 	/* factor in colorequiv to check additional 'equivalent' bins. */
1901 	if (colorequiv > 1) {
1902 
1903 		int a = lowbit(colorequiv) - 1;
1904 		if (a > 15)
1905 			a = 15;
1906 
1907 		for (i = 0; i <= mmu.max_page_level; i++) {
1908 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
1909 				continue;
1910 			}
1911 			while ((colors >> a) == 0)
1912 				a--;
1913 			if ((a << 4) > colorequivszc[i]) {
1914 				colorequivszc[i] = (a << 4);
1915 			}
1916 		}
1917 	}
1918 
1919 	/* size for mnoderanges */
1920 	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1921 		mnoderangecnt += mnode_range_cnt(i);
1922 	if (plat_dr_support_memory()) {
1923 		/*
1924 		 * Reserve enough space for memory DR operations.
1925 		 * Two extra mnoderanges for possible fragmentation,
1926 		 * one for the 2G boundary and the other for the 4G boundary.
1927 		 * We don't expect a memory board crossing the 16M boundary
1928 		 * for memory hot-add operations on x86 platforms.
1929 		 */
1930 		mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
1931 	}
1932 	colorsz = mnoderangecnt * sizeof (mnoderange_t);
1933 
1934 	/* size for fpc_mutex and cpc_mutex */
1935 	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1936 
1937 	/* size of page_freelists */
1938 	colorsz += mnoderangecnt * sizeof (page_t ***);
1939 	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1940 
1941 	for (i = 0; i < mmu_page_sizes; i++) {
1942 		colors = page_get_pagecolors(i);
1943 		colorsz += mnoderangecnt * colors * sizeof (page_t *);
1944 	}
1945 
1946 	/* size of page_cachelists */
1947 	colorsz += mnoderangecnt * sizeof (page_t **);
1948 	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1949 
1950 	return (colorsz);
1951 }
1952 
1953 /*
1954  * Called once at startup to configure the page coloring data structures and
1955  * do the first page_free()/page_freelist_add().
1956  */
1957 void
1958 page_coloring_setup(caddr_t pcmemaddr)
1959 {
1960 	int	i;
1961 	int	j;
1962 	int	k;
1963 	caddr_t	addr;
1964 	int	colors;
1965 
1966 	/*
1967 	 * do page coloring setup
1968 	 */
1969 	addr = pcmemaddr;
1970 
1971 	mnoderanges = (mnoderange_t *)addr;
1972 	addr += (mnoderangecnt * sizeof (mnoderange_t));
1973 
1974 	mnode_range_setup(mnoderanges);
1975 
1976 	if (physmax4g)
1977 		mtype4g = pfn_2_mtype(0xfffff);
1978 
1979 	for (k = 0; k < NPC_MUTEX; k++) {
1980 		fpc_mutex[k] = (kmutex_t *)addr;
1981 		addr += (max_mem_nodes * sizeof (kmutex_t));
1982 	}
1983 	for (k = 0; k < NPC_MUTEX; k++) {
1984 		cpc_mutex[k] = (kmutex_t *)addr;
1985 		addr += (max_mem_nodes * sizeof (kmutex_t));
1986 	}
1987 	page_freelists = (page_t ****)addr;
1988 	addr += (mnoderangecnt * sizeof (page_t ***));
1989 
1990 	page_cachelists = (page_t ***)addr;
1991 	addr += (mnoderangecnt * sizeof (page_t **));
1992 
1993 	for (i = 0; i < mnoderangecnt; i++) {
1994 		page_freelists[i] = (page_t ***)addr;
1995 		addr += (mmu_page_sizes * sizeof (page_t **));
1996 
1997 		for (j = 0; j < mmu_page_sizes; j++) {
1998 			colors = page_get_pagecolors(j);
1999 			page_freelists[i][j] = (page_t **)addr;
2000 			addr += (colors * sizeof (page_t *));
2001 		}
2002 		page_cachelists[i] = (page_t **)addr;
2003 		addr += (page_colors * sizeof (page_t *));
2004 	}
2005 }
2006 
2007 #if defined(__xpv)
2008 /*
2009  * Give back 10% of the io_pool pages to the free list.
2010  * Don't shrink the pool below some absolute minimum.
2011  */
2012 static void
2013 page_io_pool_shrink()
2014 {
2015 	int retcnt;
2016 	page_t *pp, *pp_first, *pp_last, **curpool;
2017 	mfn_t mfn;
2018 	int bothpools = 0;
2019 
2020 	mutex_enter(&io_pool_lock);
2021 	io_pool_shrink_attempts++;	/* should be a kstat? */
2022 	retcnt = io_pool_cnt / 10;
2023 	if (io_pool_cnt - retcnt < io_pool_cnt_min)
2024 		retcnt = io_pool_cnt - io_pool_cnt_min;
2025 	if (retcnt <= 0)
2026 		goto done;
2027 	io_pool_shrinks++;	/* should be a kstat? */
2028 	curpool = &io_pool_4g;
2029 domore:
2030 	/*
2031 	 * Loop through taking pages from the end of the list
2032 	 * (highest mfns) until the amount to return is reached.
2033 	 */
2034 	for (pp = *curpool; pp && retcnt > 0; ) {
2035 		pp_first = pp_last = pp->p_prev;
2036 		if (pp_first == *curpool)
2037 			break;
2038 		retcnt--;
2039 		io_pool_cnt--;
2040 		page_io_pool_sub(curpool, pp_first, pp_last);
2041 		if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
2042 			start_mfn = mfn;
2043 		page_free(pp_first, 1);
2044 		pp = *curpool;
2045 	}
2046 	if (retcnt != 0 && !bothpools) {
2047 		/*
2048 		 * If not enough pages were found in the less constrained pool,
2049 		 * try the more constrained one.
2050 		 */
2051 		curpool = &io_pool_16m;
2052 		bothpools = 1;
2053 		goto domore;
2054 	}
2055 done:
2056 	mutex_exit(&io_pool_lock);
2057 }
2058 
2059 #endif	/* __xpv */
2060 
2061 uint_t
2062 page_create_update_flags_x86(uint_t flags)
2063 {
2064 #if defined(__xpv)
2065 	/*
2066 	 * Shrink the io pool if this is an urgent (no-wait) allocation and free pages are depleted.
2067 	 */
2068 	if (!(flags & PG_WAIT) && freemem < desfree)
2069 		page_io_pool_shrink();
2070 #else /* !__xpv */
2071 	/*
2072 	 * page_create_get_something may call this because 4g memory may be
2073 	 * depleted. Set flags to allow for relocation of base page below
2074 	 * 4g if necessary.
2075 	 */
2076 	if (physmax4g)
2077 		flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
2078 #endif /* __xpv */
2079 	return (flags);
2080 }
2081 
2082 /*ARGSUSED*/
2083 int
2084 bp_color(struct buf *bp)
2085 {
2086 	return (0);
2087 }
2088 
2089 #if defined(__xpv)
2090 
2091 /*
2092  * Take pages out of an io_pool
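 * The run [pp_first .. pp_last] is unlinked from the circular, doubly
 * linked pool list; the removed run itself is left circularly linked so
 * it can be handed back directly as a page list.  The pool head pointer
 * is advanced, or cleared, if the run included the head page.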
2093  */
2094 static void
2095 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
2096 {
2097 	if (*poolp == pp_first) {
2098 		*poolp = pp_last->p_next;
2099 		if (*poolp == pp_first)
2100 			*poolp = NULL;
2101 	}
2102 	pp_first->p_prev->p_next = pp_last->p_next;
2103 	pp_last->p_next->p_prev = pp_first->p_prev;
2104 	pp_first->p_prev = pp_last;
2105 	pp_last->p_next = pp_first;
2106 }
2107 
2108 /*
2109  * Put a page on the io_pool list. The list is ordered by increasing MFN.
2110  */
2111 static void
2112 page_io_pool_add(page_t **poolp, page_t *pp)
2113 {
2114 	page_t	*look;
2115 	mfn_t	mfn = mfn_list[pp->p_pagenum];
2116 
2117 	if (*poolp == NULL) {
2118 		*poolp = pp;
2119 		pp->p_next = pp;
2120 		pp->p_prev = pp;
2121 		return;
2122 	}
2123 
2124 	/*
2125 	 * Since we try to take pages from the high end of the pool,
2126 	 * chances are good that the pages to be put on the list will
2127 	 * go at or near the end of the list, so start at the end and
2128 	 * work backwards.
2129 	 */
2130 	look = (*poolp)->p_prev;
2131 	while (mfn < mfn_list[look->p_pagenum]) {
2132 		look = look->p_prev;
2133 		if (look == (*poolp)->p_prev)
2134 			break; /* backed all the way to front of list */
2135 	}
2136 
2137 	/* insert after look */
2138 	pp->p_prev = look;
2139 	pp->p_next = look->p_next;
2140 	pp->p_next->p_prev = pp;
2141 	look->p_next = pp;
2142 	if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2143 		/*
2144 		 * We inserted a new first list element; adjust the pool
2145 		 * pointer to the newly inserted element.
2146 		 */
2147 		*poolp = pp;
2148 	}
2149 }
2150 
2151 /*
2152  * Add a page to the io_pool.  Setting the force flag will force the page
2153  * into the io_pool no matter what.
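 * Pages below 16M always go on the 16M pool.  Other pages go on the 4G
 * pool when there is room, when force is set, or when that pool is
 * empty; otherwise the new page displaces the highest-mfn entry only if
 * its own mfn is lower, and whichever page loses is freed back to the
 * system.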
2154  */
2155 static void
2156 add_page_to_pool(page_t *pp, int force)
2157 {
2158 	page_t *highest;
2159 	page_t *freep = NULL;
2160 
2161 	mutex_enter(&io_pool_lock);
2162 	/*
2163 	 * Always keep the scarce low memory pages
2164 	 */
2165 	if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2166 		++io_pool_cnt;
2167 		page_io_pool_add(&io_pool_16m, pp);
2168 		goto done;
2169 	}
2170 	if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2171 		++io_pool_cnt;
2172 		page_io_pool_add(&io_pool_4g, pp);
2173 	} else {
2174 		highest = io_pool_4g->p_prev;
2175 		if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2176 			page_io_pool_sub(&io_pool_4g, highest, highest);
2177 			page_io_pool_add(&io_pool_4g, pp);
2178 			freep = highest;
2179 		} else {
2180 			freep = pp;
2181 		}
2182 	}
2183 done:
2184 	mutex_exit(&io_pool_lock);
2185 	if (freep)
2186 		page_free(freep, 1);
2187 }
2188 
2189 
2190 int contig_pfn_cnt;	/* no of pfns in the contig pfn list */
2191 int contig_pfn_max;	/* capacity of the contig pfn list */
2192 int next_alloc_pfn;	/* next position in list to start a contig search */
2193 int contig_pfnlist_updates;	/* pfn list update count */
2194 int contig_pfnlist_builds;	/* how many times have we (re)built list */
2195 int contig_pfnlist_buildfailed;	/* how many times has list build failed */
2196 int create_contig_pending;	/* nonzero means taskq creating contig list */
2197 pfn_t *contig_pfn_list = NULL;	/* list of contig pfns in ascending mfn order */
2198 
2199 /*
2200  * Function to use in sorting a list of pfns by their underlying mfns.
2201  */
2202 static int
2203 mfn_compare(const void *pfnp1, const void *pfnp2)
2204 {
2205 	mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2206 	mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2207 
2208 	if (mfn1 > mfn2)
2209 		return (1);
2210 	if (mfn1 < mfn2)
2211 		return (-1);
2212 	return (0);
2213 }
2214 
2215 /*
2216  * Compact the contig_pfn_list by tossing all the non-contiguous
2217  * elements from the list.
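 * For example (illustrative mfns): if the sorted pfns map to mfns
 * 5, 6, 9, 10, 11, 20, the pfns backing 5, 6, 9, 10 and 11 are kept
 * (runs of two or more) and the lone 20 is dropped.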
2218  */
2219 static void
2220 compact_contig_pfn_list(void)
2221 {
2222 	pfn_t pfn, lapfn, prev_lapfn;
2223 	mfn_t mfn;
2224 	int i, newcnt = 0;
2225 
2226 	prev_lapfn = 0;
2227 	for (i = 0; i < contig_pfn_cnt - 1; i++) {
2228 		pfn = contig_pfn_list[i];
2229 		lapfn = contig_pfn_list[i + 1];
2230 		mfn = mfn_list[pfn];
2231 		/*
2232 		 * See if next pfn is for a contig mfn
2233 		 */
2234 		if (mfn_list[lapfn] != mfn + 1)
2235 			continue;
2236 		/*
2237 		 * Both pfn and the lookahead pfn are put in the list,
2238 		 * unless pfn is the previous lookahead (already added).
2239 		 */
2240 		if (pfn != prev_lapfn)
2241 			contig_pfn_list[newcnt++] = pfn;
2242 		contig_pfn_list[newcnt++] = lapfn;
2243 		prev_lapfn = lapfn;
2244 	}
2245 	for (i = newcnt; i < contig_pfn_cnt; i++)
2246 		contig_pfn_list[i] = 0;
2247 	contig_pfn_cnt = newcnt;
2248 }
2249 
2250 /*ARGSUSED*/
2251 static void
2252 call_create_contiglist(void *arg)
2253 {
2254 	(void) create_contig_pfnlist(PG_WAIT);
2255 }
2256 
2257 /*
2258  * Create list of freelist pfns that have underlying
2259  * contiguous mfns.  The list is kept in ascending mfn order.
2260  * Returns 1 if the list was created, else 0.
2261  */
2262 static int
2263 create_contig_pfnlist(uint_t flags)
2264 {
2265 	pfn_t pfn;
2266 	page_t *pp;
2267 	int ret = 1;
2268 
2269 	mutex_enter(&contig_list_lock);
2270 	if (contig_pfn_list != NULL)
2271 		goto out;
2272 	contig_pfn_max = freemem + (freemem / 10);
2273 	contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2274 	    (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2275 	if (contig_pfn_list == NULL) {
2276 		/*
2277 		 * We could not create the contig list because we could not
2278 		 * sleep for memory.  Dispatch a taskq job that can sleep to
2279 		 * get the memory.
2280 		 */
2281 		if (!create_contig_pending) {
2282 			if (taskq_dispatch(system_taskq, call_create_contiglist,
2283 			    NULL, TQ_NOSLEEP) != NULL)
2284 				create_contig_pending = 1;
2285 		}
2286 		contig_pfnlist_buildfailed++;	/* count list build failures */
2287 		ret = 0;
2288 		goto out;
2289 	}
2290 	create_contig_pending = 0;
2291 	ASSERT(contig_pfn_cnt == 0);
2292 	for (pfn = 0; pfn < mfn_count; pfn++) {
2293 		pp = page_numtopp_nolock(pfn);
2294 		if (pp == NULL || !PP_ISFREE(pp))
2295 			continue;
2296 		contig_pfn_list[contig_pfn_cnt] = pfn;
2297 		if (++contig_pfn_cnt == contig_pfn_max)
2298 			break;
2299 	}
2300 	/*
2301 	 * Sanity check the new list.
2302 	 */
2303 	if (contig_pfn_cnt < 2) { /* no contig pfns */
2304 		contig_pfn_cnt = 0;
2305 		contig_pfnlist_buildfailed++;
2306 		kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2307 		contig_pfn_list = NULL;
2308 		contig_pfn_max = 0;
2309 		ret = 0;
2310 		goto out;
2311 	}
2312 	qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2313 	compact_contig_pfn_list();
2314 	/*
2315 	 * Make sure next search of the newly created contiguous pfn
2316 	 * list starts at the beginning of the list.
2317 	 */
2318 	next_alloc_pfn = 0;
2319 	contig_pfnlist_builds++;	/* count list builds */
2320 out:
2321 	mutex_exit(&contig_list_lock);
2322 	return (ret);
2323 }
2324 
2325 
2326 /*
2327  * Toss the current contig pfnlist.  Someone is about to do a massive
2328  * update to pfn<->mfn mappings.  So we have them destroy the list and lock
2329  * update to pfn<->mfn mappings.  Destroy the list and hold the lock
2330  * until the update is done.
2331 void
2332 clear_and_lock_contig_pfnlist()
2333 {
2334 	pfn_t *listp = NULL;
2335 	size_t listsize;
2336 
2337 	mutex_enter(&contig_list_lock);
2338 	if (contig_pfn_list != NULL) {
2339 		listp = contig_pfn_list;
2340 		listsize = contig_pfn_max * sizeof (pfn_t);
2341 		contig_pfn_list = NULL;
2342 		contig_pfn_max = contig_pfn_cnt = 0;
2343 	}
2344 	if (listp != NULL)
2345 		kmem_free(listp, listsize);
2346 }
2347 
2348 /*
2349  * Unlock the contig_pfn_list.  The next attempted use of it will cause
2350  * it to be re-created.
2351  */
2352 void
2353 unlock_contig_pfnlist()
2354 {
2355 	mutex_exit(&contig_list_lock);
2356 }
2357 
2358 /*
2359  * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2360  */
2361 void
2362 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2363 {
2364 	int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2365 	pfn_t probe_pfn;
2366 	mfn_t probe_mfn;
2367 	int drop_lock = 0;
2368 
2369 	if (mutex_owner(&contig_list_lock) != curthread) {
2370 		drop_lock = 1;
2371 		mutex_enter(&contig_list_lock);
2372 	}
2373 	if (contig_pfn_list == NULL)
2374 		goto done;
2375 	contig_pfnlist_updates++;
2376 	/*
2377 	 * Find the pfn in the current list.  Use a binary chop to locate it.
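	 * The list is kept sorted by underlying mfn, so the probes compare
	 * against oldmfn, the mfn this pfn mapped to before the
	 * reassignment.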
2378 	 */
2379 	probe_hi = contig_pfn_cnt - 1;
2380 	probe_lo = 0;
2381 	probe_pos = (probe_hi + probe_lo) / 2;
2382 	while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2383 		if (probe_pos == probe_lo) { /* pfn not in list */
2384 			probe_pos = -1;
2385 			break;
2386 		}
2387 		if (pfn_to_mfn(probe_pfn) <= oldmfn)
2388 			probe_lo = probe_pos;
2389 		else
2390 			probe_hi = probe_pos;
2391 		probe_pos = (probe_hi + probe_lo) / 2;
2392 	}
2393 	if (probe_pos >= 0) {
2394 		/*
2395 		 * Remove pfn from list and ensure next alloc
2396 		 * position stays in bounds.
2397 		 */
2398 		if (--contig_pfn_cnt <= next_alloc_pfn)
2399 			next_alloc_pfn = 0;
2400 		if (contig_pfn_cnt < 2) { /* no contig pfns */
2401 			contig_pfn_cnt = 0;
2402 			kmem_free(contig_pfn_list,
2403 			    contig_pfn_max * sizeof (pfn_t));
2404 			contig_pfn_list = NULL;
2405 			contig_pfn_max = 0;
2406 			goto done;
2407 		}
2408 		ovbcopy(&contig_pfn_list[probe_pos + 1],
2409 		    &contig_pfn_list[probe_pos],
2410 		    (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2411 	}
2412 	if (newmfn == MFN_INVALID)
2413 		goto done;
2414 	/*
2415 	 * Check if new mfn has adjacent mfns in the list
2416 	 */
2417 	probe_hi = contig_pfn_cnt - 1;
2418 	probe_lo = 0;
2419 	insert_after = -2;
2420 	do {
2421 		probe_pos = (probe_hi + probe_lo) / 2;
2422 		probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2423 		if (newmfn == probe_mfn + 1)
2424 			insert_after = probe_pos;
2425 		else if (newmfn == probe_mfn - 1)
2426 			insert_after = probe_pos - 1;
2427 		if (probe_pos == probe_lo)
2428 			break;
2429 		if (probe_mfn <= newmfn)
2430 			probe_lo = probe_pos;
2431 		else
2432 			probe_hi = probe_pos;
2433 	} while (insert_after == -2);
2434 	/*
2435 	 * If there is space in the list and there are adjacent mfns
2436 	 * insert the pfn in to its proper place in the list.
2437 	 */
2438 	if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2439 		insert_point = insert_after + 1;
2440 		ovbcopy(&contig_pfn_list[insert_point],
2441 		    &contig_pfn_list[insert_point + 1],
2442 		    (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2443 		contig_pfn_list[insert_point] = pfn;
2444 		contig_pfn_cnt++;
2445 	}
2446 done:
2447 	if (drop_lock)
2448 		mutex_exit(&contig_list_lock);
2449 }
2450 
2451 /*
2452  * Called to (re-)populate the io_pool from the free page lists.
2453  */
2454 long
2455 populate_io_pool(void)
2456 {
2457 	pfn_t pfn;
2458 	mfn_t mfn, max_mfn;
2459 	page_t *pp;
2460 
2461 	/*
2462 	 * Figure out the bounds of the pool on first invocation.
2463 	 * We use a percentage of memory for the io pool size;
2464 	 * we allow that to shrink, but not below a fixed minimum.
2465 	 */
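	/*
	 * Sizing sketch (values illustrative): note the integer division in
	 * the computation below; with io_pool_physmem_pct set to 2, the max
	 * works out to physmem / (100 / 2) == physmem / 50, i.e. 2% of
	 * physical memory.
	 */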
2466 	if (io_pool_cnt_max == 0) {
2467 		io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
2468 		io_pool_cnt_lowater = io_pool_cnt_max;
2469 		/*
2470 		 * This is the first time in populate_io_pool; grab a va to use
2471 		 * when we need to allocate pages.
2472 		 */
2473 		io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2474 	}
2475 	/*
2476 	 * If we are out of pages in the pool, then grow the size of the pool
2477 	 */
2478 	if (io_pool_cnt == 0) {
2479 		/*
2480 		 * Grow the max size of the io pool by 5%, but never more than
2481 		 * 25% of physical memory.
2482 		 */
2483 		if (io_pool_cnt_max < physmem / 4)
2484 			io_pool_cnt_max += io_pool_cnt_max / 20;
2485 	}
2486 	io_pool_grows++;	/* should be a kstat? */
2487 
2488 	/*
2489 	 * Get highest mfn on this platform, but limit to the 32 bit DMA max.
2490 	 */
2491 	(void) mfn_to_pfn(start_mfn);
2492 	max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2493 	for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2494 		pfn = mfn_to_pfn(mfn);
2495 		if (pfn & PFN_IS_FOREIGN_MFN)
2496 			continue;
2497 		/*
2498 		 * try to allocate it from free pages
2499 		 */
2500 		pp = page_numtopp_alloc(pfn);
2501 		if (pp == NULL)
2502 			continue;
2503 		PP_CLRFREE(pp);
2504 		add_page_to_pool(pp, 1);
2505 		if (io_pool_cnt >= io_pool_cnt_max)
2506 			break;
2507 	}
2508 
2509 	return (io_pool_cnt);
2510 }
2511 
2512 /*
2513  * Destroy a page that was being used for DMA I/O. It may or
2514  * may not actually go back to the io_pool.
2515  */
2516 void
2517 page_destroy_io(page_t *pp)
2518 {
2519 	mfn_t mfn = mfn_list[pp->p_pagenum];
2520 
2521 	/*
2522 	 * A reservation was made when the page was allocated; release it now.
2523 	 */
2524 	page_unresv(1);
2525 	/*
2526 	 * Unload translations, if any, then hash out the
2527 	 * page to erase its identity.
2528 	 */
2529 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2530 	page_hashout(pp, NULL);
2531 
2532 	/*
2533 	 * If the page came from the free lists, just put it back to them.
2534 	 * DomU pages always go on the free lists as well.
2535 	 */
2536 	if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2537 		page_free(pp, 1);
2538 		return;
2539 	}
2540 
2541 	add_page_to_pool(pp, 0);
2542 }
2543 
2544 
2545 long contig_searches;		/* count of times contig pages requested */
2546 long contig_search_restarts;	/* count of contig ranges tried */
2547 long contig_search_failed;	/* count of contig alloc failures */
2548 
2549 /*
2550  * Free partial page list
2551  */
2552 static void
2553 free_partial_list(page_t **pplist)
2554 {
2555 	page_t *pp;
2556 
2557 	while (*pplist != NULL) {
2558 		pp = *pplist;
2559 		page_io_pool_sub(pplist, pp, pp);
2560 		page_free(pp, 1);
2561 	}
2562 }
2563 
2564 /*
2565  * Look through the contig pfns that are not part of the io_pool for
2566  * runs of contiguous free pages.  Return a list of the found pages or NULL.
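 * The search resumes where the previous one stopped (next_alloc_pfn) and
 * wraps around the contig pfn list; candidate runs are rejected when they
 * would cross the DMA segment boundary given by pfnseg or when their
 * starting mfn does not satisfy pfnalign.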
2567  */
2568 page_t *
2569 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2570     pgcnt_t pfnalign)
2571 {
2572 	page_t *pp, *plist = NULL;
2573 	mfn_t mfn, prev_mfn, start_mfn;
2574 	pfn_t pfn;
2575 	int pages_needed, pages_requested;
2576 	int search_start;
2577 
2578 	/*
2579 	 * create the contig pfn list if not already done
2580 	 */
2581 retry:
2582 	mutex_enter(&contig_list_lock);
2583 	if (contig_pfn_list == NULL) {
2584 		mutex_exit(&contig_list_lock);
2585 		if (!create_contig_pfnlist(flags)) {
2586 			return (NULL);
2587 		}
2588 		goto retry;
2589 	}
2590 	contig_searches++;
2591 	/*
2592 	 * Search contiguous pfn list for physically contiguous pages not in
2593 	 * the io_pool.  Start the search where the last search left off.
2594 	 */
2595 	pages_requested = pages_needed = npages;
2596 	search_start = next_alloc_pfn;
2597 	start_mfn = prev_mfn = 0;
2598 	while (pages_needed) {
2599 		pfn = contig_pfn_list[next_alloc_pfn];
2600 		mfn = pfn_to_mfn(pfn);
2601 		/*
2602 		 * Check that this mfn is either the first one or contiguous with
2603 		 * the previous one, that the page corresponding to it is free,
2604 		 * and that the mfn range does not cross a segment boundary.
2605 		 */
2606 		if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2607 		    (pp = page_numtopp_alloc(pfn)) != NULL &&
2608 		    !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2609 			PP_CLRFREE(pp);
2610 			page_io_pool_add(&plist, pp);
2611 			pages_needed--;
2612 			if (prev_mfn == 0) {
2613 				if (pfnalign &&
2614 				    mfn != P2ROUNDUP(mfn, pfnalign)) {
2615 					/*
2616 					 * not properly aligned
2617 					 */
2618 					contig_search_restarts++;
2619 					free_partial_list(&plist);
2620 					pages_needed = pages_requested;
2621 					start_mfn = prev_mfn = 0;
2622 					goto skip;
2623 				}
2624 				start_mfn = mfn;
2625 			}
2626 			prev_mfn = mfn;
2627 		} else {
2628 			contig_search_restarts++;
2629 			free_partial_list(&plist);
2630 			pages_needed = pages_requested;
2631 			start_mfn = prev_mfn = 0;
2632 		}
2633 skip:
2634 		if (++next_alloc_pfn == contig_pfn_cnt)
2635 			next_alloc_pfn = 0;
2636 		if (next_alloc_pfn == search_start)
2637 			break; /* all pfns searched */
2638 	}
2639 	mutex_exit(&contig_list_lock);
2640 	if (pages_needed) {
2641 		contig_search_failed++;
2642 		/*
2643 		 * Failed to find enough contig pages.
2644 		 * free partial page list
2645 		 */
2646 		free_partial_list(&plist);
2647 	}
2648 	return (plist);
2649 }
2650 
2651 /*
2652  * Search the reserved io pool pages for a page range with the
2653  * desired characteristics.
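 * The pool list is kept in ascending mfn order, so the loop below walks
 * it backwards from the tail, highest mfns first, building a candidate
 * run downward; pp_first ends up as the lowest-mfn page of the run and
 * pp_last as the highest.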
2654  */
2655 page_t *
2656 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2657 {
2658 	page_t *pp_first, *pp_last;
2659 	page_t *pp, **poolp;
2660 	pgcnt_t nwanted, pfnalign;
2661 	uint64_t pfnseg;
2662 	mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2663 	int align, attempt = 0;
2664 
2665 	if (minctg == 1)
2666 		contig = 0;
2667 	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2668 	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2669 	pfnseg = mmu_btop(mattr->dma_attr_seg);
2670 	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2671 	if (align > MMU_PAGESIZE)
2672 		pfnalign = mmu_btop(align);
2673 	else
2674 		pfnalign = 0;
2675 
2676 try_again:
2677 	/*
2678 	 * See if we want pages for a legacy device
2679 	 */
2680 	if (hi_mfn < PFN_16MEG)
2681 		poolp = &io_pool_16m;
2682 	else
2683 		poolp = &io_pool_4g;
2684 try_smaller:
2685 	/*
2686 	 * Take pages from I/O pool. We'll use pages from the highest
2687 	 * MFN range possible.
2688 	 */
2689 	pp_first = pp_last = NULL;
2690 	mutex_enter(&io_pool_lock);
2691 	nwanted = minctg;
2692 	for (pp = *poolp; pp && nwanted > 0; ) {
2693 		pp = pp->p_prev;
2694 
2695 		/*
2696 		 * skip pages above allowable range
2697 		 */
2698 		mfn = mfn_list[pp->p_pagenum];
2699 		if (hi_mfn < mfn)
2700 			goto skip;
2701 
2702 		/*
2703 		 * stop at pages below allowable range
2704 		 */
2705 		if (lo_mfn > mfn)
2706 			break;
2707 restart:
2708 		if (pp_last == NULL) {
2709 			/*
2710 			 * Check alignment
2711 			 */
2712 			tmfn = mfn - (minctg - 1);
2713 			if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2714 				goto skip; /* not properly aligned */
2715 			/*
2716 			 * Check segment
2717 			 */
2718 			if ((mfn & pfnseg) < (tmfn & pfnseg))
2719 				goto skip; /* crosses seg boundary */
2720 			/*
2721 			 * Start building page list
2722 			 */
2723 			pp_first = pp_last = pp;
2724 			nwanted--;
2725 		} else {
2726 			/*
2727 			 * check physical contiguity if required
2728 			 */
2729 			if (contig &&
2730 			    mfn_list[pp_first->p_pagenum] != mfn + 1) {
2731 				/*
2732 				 * not a contiguous page, restart list.
2733 				 */
2734 				pp_last = NULL;
2735 				nwanted = minctg;
2736 				goto restart;
2737 			} else { /* add page to list */
2738 				pp_first = pp;
2739 				nwanted--;
2740 			}
2741 		}
2742 skip:
2743 		if (pp == *poolp)
2744 			break;
2745 	}
2746 
2747 	/*
2748 	 * If we didn't find enough memory, try the more constrained pool, then
2749 	 * sweep free pages into the DMA pool and try again.
2750 	 */
2751 	if (nwanted != 0) {
2752 		mutex_exit(&io_pool_lock);
2753 		/*
2754 		 * If we were looking in the less constrained pool and
2755 		 * didn't find pages, try the more constrained pool.
2756 		 */
2757 		if (poolp == &io_pool_4g) {
2758 			poolp = &io_pool_16m;
2759 			goto try_smaller;
2760 		}
2761 		kmem_reap();
2762 		if (++attempt < 4) {
2763 			/*
2764 			 * Grab some more io_pool pages
2765 			 */
2766 			(void) populate_io_pool();
2767 			goto try_again; /* go around and retry */
2768 		}
2769 		return (NULL);
2770 	}
2771 	/*
2772 	 * Found the pages, now snip them from the list
2773 	 */
2774 	page_io_pool_sub(poolp, pp_first, pp_last);
2775 	io_pool_cnt -= minctg;
2776 	/*
2777 	 * reset low water mark
2778 	 */
2779 	if (io_pool_cnt < io_pool_cnt_lowater)
2780 		io_pool_cnt_lowater = io_pool_cnt;
2781 	mutex_exit(&io_pool_lock);
2782 	return (pp_first);
2783 }
2784 
2785 page_t *
2786 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2787     ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2788 {
2789 	uint_t kflags;
2790 	int order, extra, extpages, i, contig, nbits, extents;
2791 	page_t *pp, *expp, *pp_first, **pplist = NULL;
2792 	mfn_t *mfnlist = NULL;
2793 
2794 	contig = flags & PG_PHYSCONTIG;
2795 	if (minctg == 1)
2796 		contig = 0;
2797 	flags &= ~PG_PHYSCONTIG;
2798 	kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2799 	/*
2800 	 * The hypervisor allocates extents; if we want contig
2801 	 * pages, the extent must be >= minctg.
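	 * For example (illustrative): minctg == 5 gives order == 3 and
	 * extpages == 8, so extra == 3 surplus pages are reserved up front
	 * and returned to the free list once the exchange succeeds.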
2802 	 */
2803 	if (contig) {
2804 		order = highbit(minctg) - 1;
2805 		if (minctg & ((1 << order) - 1))
2806 			order++;
2807 		extpages = 1 << order;
2808 	} else {
2809 		order = 0;
2810 		extpages = minctg;
2811 	}
2812 	if (extpages > minctg) {
2813 		extra = extpages - minctg;
2814 		if (!page_resv(extra, kflags))
2815 			return (NULL);
2816 	}
2817 	pp_first = NULL;
2818 	pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2819 	if (pplist == NULL)
2820 		goto balloon_fail;
2821 	mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2822 	if (mfnlist == NULL)
2823 		goto balloon_fail;
2824 	pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2825 	if (pp == NULL)
2826 		goto balloon_fail;
2827 	pp_first = pp;
2828 	if (extpages > minctg) {
2829 		/*
2830 		 * fill out the rest of extent pages to swap
2831 		 * with the hypervisor
2832 		 */
2833 		for (i = 0; i < extra; i++) {
2834 			expp = page_create_va(vp,
2835 			    (u_offset_t)(uintptr_t)io_pool_kva,
2836 			    PAGESIZE, flags, &kvseg, io_pool_kva);
2837 			if (expp == NULL)
2838 				goto balloon_fail;
2839 			(void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2840 			page_io_unlock(expp);
2841 			page_hashout(expp, NULL);
2842 			page_io_lock(expp);
2843 			/*
2844 			 * add page to end of list
2845 			 */
2846 			expp->p_prev = pp_first->p_prev;
2847 			expp->p_next = pp_first;
2848 			expp->p_prev->p_next = expp;
2849 			pp_first->p_prev = expp;
2850 		}
2851 
2852 	}
2853 	for (i = 0; i < extpages; i++) {
2854 		pplist[i] = pp;
2855 		pp = pp->p_next;
2856 	}
2857 	nbits = highbit(mattr->dma_attr_addr_hi);
2858 	extents = contig ? 1 : minctg;
2859 	if (balloon_replace_pages(extents, pplist, nbits, order,
2860 	    mfnlist) != extents) {
2861 		if (ioalloc_dbg)
2862 			cmn_err(CE_NOTE, "request to hypervisor"
2863 			    " for %d pages, maxaddr %" PRIx64 " failed",
2864 			    extpages, mattr->dma_attr_addr_hi);
2865 		goto balloon_fail;
2866 	}
2867 
2868 	kmem_free(pplist, extpages * sizeof (page_t *));
2869 	kmem_free(mfnlist, extpages * sizeof (mfn_t));
2870 	/*
2871 	 * Return any excess pages to free list
2872 	 */
2873 	if (extpages > minctg) {
2874 		for (i = 0; i < extra; i++) {
2875 			pp = pp_first->p_prev;
2876 			page_sub(&pp_first, pp);
2877 			page_io_unlock(pp);
2878 			page_unresv(1);
2879 			page_free(pp, 1);
2880 		}
2881 	}
2882 	return (pp_first);
2883 balloon_fail:
2884 	/*
2885 	 * Return pages to free list and return failure
2886 	 */
2887 	while (pp_first != NULL) {
2888 		pp = pp_first;
2889 		page_sub(&pp_first, pp);
2890 		page_io_unlock(pp);
2891 		if (pp->p_vnode != NULL)
2892 			page_hashout(pp, NULL);
2893 		page_free(pp, 1);
2894 	}
2895 	if (pplist)
2896 		kmem_free(pplist, extpages * sizeof (page_t *));
2897 	if (mfnlist)
2898 		kmem_free(mfnlist, extpages * sizeof (mfn_t));
2899 	page_unresv(extpages - minctg);
2900 	return (NULL);
2901 }
2902 
2903 static void
2904 return_partial_alloc(page_t *plist)
2905 {
2906 	page_t *pp;
2907 
2908 	while (plist != NULL) {
2909 		pp = plist;
2910 		page_sub(&plist, pp);
2911 		page_io_unlock(pp);
2912 		page_destroy_io(pp);
2913 	}
2914 }
2915 
2916 static page_t *
2917 page_get_contigpages(
2918 	struct vnode	*vp,
2919 	u_offset_t	off,
2920 	int		*npagesp,
2921 	uint_t		flags,
2922 	caddr_t		vaddr,
2923 	ddi_dma_attr_t	*mattr)
2924 {
2925 	mfn_t	max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2926 	page_t	*plist;	/* list to return */
2927 	page_t	*pp, *mcpl;
2928 	int	contig, anyaddr, npages, getone = 0;
2929 	mfn_t	lo_mfn;
2930 	mfn_t	hi_mfn;
2931 	pgcnt_t	pfnalign = 0;
2932 	int	align, sgllen;
2933 	uint64_t pfnseg;
2934 	pgcnt_t	minctg;
2935 
2936 	npages = *npagesp;
2937 	ASSERT(mattr != NULL);
2938 	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2939 	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2940 	sgllen = mattr->dma_attr_sgllen;
2941 	pfnseg = mmu_btop(mattr->dma_attr_seg);
2942 	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2943 	if (align > MMU_PAGESIZE)
2944 		pfnalign = mmu_btop(align);
2945 
2946 	contig = flags & PG_PHYSCONTIG;
2947 	if (npages == -1) {
2948 		npages = 1;
2949 		pfnalign = 0;
2950 	}
2951 	/*
2952 	 * Clear the contig flag if only one page is needed.
2953 	 */
2954 	if (npages == 1) {
2955 		getone = 1;
2956 		contig = 0;
2957 	}
2958 
2959 	/*
2960 	 * Check if any page in the system is fine.
2961 	 */
2962 	anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2963 	if (!contig && anyaddr && !pfnalign) {
2964 		flags &= ~PG_PHYSCONTIG;
2965 		plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2966 		    flags, &kvseg, vaddr);
2967 		if (plist != NULL) {
2968 			*npagesp = 0;
2969 			return (plist);
2970 		}
2971 	}
2972 	plist = NULL;
2973 	minctg = howmany(npages, sgllen);
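	/*
	 * Illustrative example: npages == 10 with sgllen == 4 gives
	 * minctg == 3, so the loop below gathers runs of up to 3 pages,
	 * decrementing sgllen each time, until npages <= sgllen; any
	 * remainder is left in *npagesp for the caller to collect.
	 */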
2974 	while (npages > sgllen || getone) {
2975 		if (minctg > npages)
2976 			minctg = npages;
2977 		mcpl = NULL;
2978 		/*
2979 		 * We could want contig pages with no address range limits.
2980 		 */
2981 		if (anyaddr && contig) {
2982 			/*
2983 			 * Look for free contig pages to satisfy the request.
2984 			 */
2985 			mcpl = find_contig_free(minctg, flags, pfnseg,
2986 			    pfnalign);
2987 		}
2988 		/*
2989 		 * Try the reserved io pools next
2990 		 */
2991 		if (mcpl == NULL)
2992 			mcpl = page_io_pool_alloc(mattr, contig, minctg);
2993 		if (mcpl != NULL) {
2994 			pp = mcpl;
2995 			do {
2996 				if (!page_hashin(pp, vp, off, NULL)) {
2997 					panic("page_get_contigpages:"
2998 					    " hashin failed"
2999 					    " pp %p, vp %p, off %llx",
3000 					    (void *)pp, (void *)vp, off);
3001 				}
3002 				off += MMU_PAGESIZE;
3003 				PP_CLRFREE(pp);
3004 				PP_CLRAGED(pp);
3005 				page_set_props(pp, P_REF);
3006 				page_io_lock(pp);
3007 				pp = pp->p_next;
3008 			} while (pp != mcpl);
3009 		} else {
3010 			/*
3011 			 * Hypervisor exchange doesn't handle segment or
3012 			 * alignment constraints
3013 			 */
3014 			if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
3015 			    pfnalign)
3016 				goto fail;
3017 			/*
3018 			 * Try exchanging pages with the hypervisor
3019 			 */
3020 			mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
3021 			    flags, minctg);
3022 			if (mcpl == NULL)
3023 				goto fail;
3024 			off += minctg * MMU_PAGESIZE;
3025 		}
3026 		check_dma(mattr, mcpl, minctg);
3027 		/*
3028 		 * Here with a minctg run of contiguous pages, add them to the
3029 		 * list we will return for this request.
3030 		 */
3031 		page_list_concat(&plist, &mcpl);
3032 		npages -= minctg;
3033 		*npagesp = npages;
3034 		sgllen--;
3035 		if (getone)
3036 			break;
3037 	}
3038 	return (plist);
3039 fail:
3040 	return_partial_alloc(plist);
3041 	return (NULL);
3042 }
3043 
3044 /*
3045  * Allocator for domain 0 I/O pages. We match the required
3046  * DMA attributes and contiguity constraints.
3047  */
3048 /*ARGSUSED*/
3049 page_t *
3050 page_create_io(
3051 	struct vnode	*vp,
3052 	u_offset_t	off,
3053 	uint_t		bytes,
3054 	uint_t		flags,
3055 	struct as	*as,
3056 	caddr_t		vaddr,
3057 	ddi_dma_attr_t	*mattr)
3058 {
3059 	page_t	*plist = NULL, *pp;
3060 	int	npages = 0, contig, anyaddr, pages_req;
3061 	mfn_t	lo_mfn;
3062 	mfn_t	hi_mfn;
3063 	pgcnt_t	pfnalign = 0;
3064 	int	align;
3065 	int	is_domu = 0;
3066 	int	dummy, bytes_got;
3067 	mfn_t	max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
3068 
3069 	ASSERT(mattr != NULL);
3070 	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
3071 	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
3072 	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
3073 	if (align > MMU_PAGESIZE)
3074 		pfnalign = mmu_btop(align);
3075 
3076 	/*
3077 	 * Clear the contig flag if only one page is needed or the scatter
3078 	 * gather list length is >= npages.
3079 	 */
3080 	pages_req = npages = mmu_btopr(bytes);
3081 	contig = (flags & PG_PHYSCONTIG);
3082 	bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
3083 	if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
3084 		contig = 0;
3085 
3086 	/*
3087 	 * Check if any old page in the system is fine.
3088 	 * DomU should always go down this path.
3089 	 */
3090 	is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
3091 	anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
3092 	if ((!contig && anyaddr) || is_domu) {
3093 		flags &= ~PG_PHYSCONTIG;
3094 		plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
3095 		if (plist != NULL)
3096 			return (plist);
3097 		else if (is_domu)
3098 			return (NULL); /* no memory available */
3099 	}
3100 	/*
3101 	 * DomU should never reach here
3102 	 */
3103 	if (contig) {
3104 		plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
3105 		    mattr);
3106 		if (plist == NULL)
3107 			goto fail;
3108 		bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
3109 		vaddr += bytes_got;
3110 		off += bytes_got;
3111 		/*
3112 		 * We now have all the contiguous pages we need, but
3113 		 * we may still need additional non-contiguous pages.
3114 		 */
3115 	}
3116 	/*
3117 	 * Now loop collecting the requested number of pages.  These do
3118 	 * not have to be contiguous pages, but we use the contig
3119 	 * page alloc code to get them since it will honor any
3120 	 * other constraints the pages may have.
3121 	 */
3122 	while (npages--) {
3123 		dummy = -1;
3124 		pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3125 		if (pp == NULL)
3126 			goto fail;
3127 		page_add(&plist, pp);
3128 		vaddr += MMU_PAGESIZE;
3129 		off += MMU_PAGESIZE;
3130 	}
3131 	return (plist);
3132 fail:
3133 	/*
3134 	 * Failed to get enough pages, return ones we did get
3135 	 */
3136 	return_partial_alloc(plist);
3137 	return (NULL);
3138 }
3139 
3140 /*
3141  * Lock and return the page with the highest mfn that we can find.  last_mfn
3142  * holds the last one found, so the next search can start from there.  We
3143  * also keep a counter so that we don't loop forever if the machine has no
3144  * free pages.
3145  *
3146  * This is called from the balloon thread to find pages to give away.  new_high
3147  * is used when new mfn's have been added to the system - we will reset our
3148  * search if the new mfn's are higher than our current search position.
3149  */
3150 page_t *
3151 page_get_high_mfn(mfn_t new_high)
3152 {
3153 	static mfn_t last_mfn = 0;
3154 	pfn_t pfn;
3155 	page_t *pp;
3156 	ulong_t loop_count = 0;
3157 
3158 	if (new_high > last_mfn)
3159 		last_mfn = new_high;
3160 
3161 	for (; loop_count < mfn_count; loop_count++, last_mfn--) {
3162 		if (last_mfn == 0) {
3163 			last_mfn = cached_max_mfn;
3164 		}
3165 
3166 		pfn = mfn_to_pfn(last_mfn);
3167 		if (pfn & PFN_IS_FOREIGN_MFN)
3168 			continue;
3169 
3170 		/* See if the page is free.  If so, lock it. */
3171 		pp = page_numtopp_alloc(pfn);
3172 		if (pp == NULL)
3173 			continue;
3174 		PP_CLRFREE(pp);
3175 
3176 		ASSERT(PAGE_EXCL(pp));
3177 		ASSERT(pp->p_vnode == NULL);
3178 		ASSERT(!hat_page_is_mapped(pp));
3179 		last_mfn--;
3180 		return (pp);
3181 	}
3182 	return (NULL);
3183 }
3184 
3185 #else /* !__xpv */
3186 
3187 /*
3188  * get a page from any list with the given mnode
3189  */
3190 static page_t *
3191 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3192     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3193 {
3194 	kmutex_t		*pcm;
3195 	int			i;
3196 	page_t			*pp;
3197 	page_t			*first_pp;
3198 	uint64_t		pgaddr;
3199 	ulong_t			bin;
3200 	int			mtypestart;
3201 	int			plw_initialized;
3202 	page_list_walker_t	plw;
3203 
3204 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
3205 
3206 	ASSERT((flags & PG_MATCH_COLOR) == 0);
3207 	ASSERT(szc == 0);
3208 	ASSERT(dma_attr != NULL);
3209 
3210 	MTYPE_START(mnode, mtype, flags);
3211 	if (mtype < 0) {
3212 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3213 		return (NULL);
3214 	}
3215 
3216 	mtypestart = mtype;
3217 
3218 	bin = origbin;
3219 
3220 	/*
3221 	 * check up to page_colors + 1 bins - origbin may be checked twice
3222 	 * because of BIN_STEP skip
3223 	 */
3224 	do {
3225 		plw_initialized = 0;
3226 
3227 		for (plw.plw_count = 0;
3228 		    plw.plw_count < page_colors; plw.plw_count++) {
3229 
3230 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3231 				goto nextfreebin;
3232 
3233 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3234 			mutex_enter(pcm);
3235 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
3236 			first_pp = pp;
3237 			while (pp != NULL) {
3238 				if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3239 				    SE_EXCL) == 0) {
3240 					pp = pp->p_next;
3241 					if (pp == first_pp) {
3242 						pp = NULL;
3243 					}
3244 					continue;
3245 				}
3246 
3247 				ASSERT(PP_ISFREE(pp));
3248 				ASSERT(PP_ISAGED(pp));
3249 				ASSERT(pp->p_vnode == NULL);
3250 				ASSERT(pp->p_hash == NULL);
3251 				ASSERT(pp->p_offset == (u_offset_t)-1);
3252 				ASSERT(pp->p_szc == szc);
3253 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3254 				/* check if page within DMA attributes */
3255 				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3256 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3257 				    (pgaddr + MMU_PAGESIZE - 1 <=
3258 				    dma_attr->dma_attr_addr_hi)) {
3259 					break;
3260 				}
3261 
3262 				/* continue looking */
3263 				page_unlock(pp);
3264 				pp = pp->p_next;
3265 				if (pp == first_pp)
3266 					pp = NULL;
3267 
3268 			}
3269 			if (pp != NULL) {
3270 				ASSERT(mtype == PP_2_MTYPE(pp));
3271 				ASSERT(pp->p_szc == 0);
3272 
3273 				/* found a page with specified DMA attributes */
3274 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3275 				    mtype), pp);
3276 				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3277 
3278 				if ((PP_ISFREE(pp) == 0) ||
3279 				    (PP_ISAGED(pp) == 0)) {
3280 					cmn_err(CE_PANIC, "page %p is not free",
3281 					    (void *)pp);
3282 				}
3283 
3284 				mutex_exit(pcm);
3285 				check_dma(dma_attr, pp, 1);
3286 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
3287 				return (pp);
3288 			}
3289 			mutex_exit(pcm);
3290 nextfreebin:
3291 			if (plw_initialized == 0) {
3292 				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3293 				ASSERT(plw.plw_ceq_dif == page_colors);
3294 				plw_initialized = 1;
3295 			}
3296 
3297 			if (plw.plw_do_split) {
3298 				pp = page_freelist_split(szc, bin, mnode,
3299 				    mtype,
3300 				    mmu_btop(dma_attr->dma_attr_addr_lo),
3301 				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3302 				    &plw);
3303 				if (pp != NULL) {
3304 					check_dma(dma_attr, pp, 1);
3305 					return (pp);
3306 				}
3307 			}
3308 
3309 			bin = page_list_walk_next_bin(szc, bin, &plw);
3310 		}
3311 
3312 		MTYPE_NEXT(mnode, mtype, flags);
3313 	} while (mtype >= 0);
3314 
3315 	/* failed to find a page in the freelist; try it in the cachelist */
3316 
3317 	/* reset mtype start for cachelist search */
3318 	mtype = mtypestart;
3319 	ASSERT(mtype >= 0);
3320 
3321 	/* start with the bin of matching color */
3322 	bin = origbin;
3323 
3324 	do {
3325 		for (i = 0; i <= page_colors; i++) {
3326 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3327 				goto nextcachebin;
3328 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3329 			mutex_enter(pcm);
3330 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
3331 			first_pp = pp;
3332 			while (pp != NULL) {
3333 				if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3334 				    SE_EXCL) == 0) {
3335 					pp = pp->p_next;
3336 					if (pp == first_pp)
3337 						pp = NULL;
3338 					continue;
3339 				}
3340 				ASSERT(pp->p_vnode);
3341 				ASSERT(PP_ISAGED(pp) == 0);
3342 				ASSERT(pp->p_szc == 0);
3343 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3344 
3345 				/* check if page within DMA attributes */
3346 
3347 				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3348 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3349 				    (pgaddr + MMU_PAGESIZE - 1 <=
3350 				    dma_attr->dma_attr_addr_hi)) {
3351 					break;
3352 				}
3353 
3354 				/* continue looking */
3355 				page_unlock(pp);
3356 				pp = pp->p_next;
3357 				if (pp == first_pp)
3358 					pp = NULL;
3359 			}
3360 
3361 			if (pp != NULL) {
3362 				ASSERT(mtype == PP_2_MTYPE(pp));
3363 				ASSERT(pp->p_szc == 0);
3364 
3365 				/* found a page with specified DMA attributes */
3366 				page_sub(&PAGE_CACHELISTS(mnode, bin,
3367 				    mtype), pp);
3368 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3369 
3370 				mutex_exit(pcm);
3371 				ASSERT(pp->p_vnode);
3372 				ASSERT(PP_ISAGED(pp) == 0);
3373 				check_dma(dma_attr, pp, 1);
3374 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
3375 				return (pp);
3376 			}
3377 			mutex_exit(pcm);
3378 nextcachebin:
3379 			bin += (i == 0) ? BIN_STEP : 1;
3380 			bin &= page_colors_mask;
3381 		}
3382 		MTYPE_NEXT(mnode, mtype, flags);
3383 	} while (mtype >= 0);
3384 
3385 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3386 	return (NULL);
3387 }
3388 
3389 /*
3390  * This function is similar to page_get_freelist()/page_get_cachelist()
3391  * but it searches both the lists to find a page with the specified
3392  * color (or no color) and DMA attributes. The search is done in the
3393  * freelist first and then in the cache list within the highest memory
3394  * range (based on DMA attributes) before searching in the lower
3395  * memory ranges.
3396  *
3397  * Note: This function is called only by page_create_io().
3398  */
3399 /*ARGSUSED*/
3400 static page_t *
3401 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3402     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
3403 {
3404 	uint_t		bin;
3405 	int		mtype;
3406 	page_t		*pp;
3407 	int		n;
3408 	int		m;
3409 	int		szc;
3410 	int		fullrange;
3411 	int		mnode;
3412 	int		local_failed_stat = 0;
3413 	lgrp_mnode_cookie_t	lgrp_cookie;
3414 
3415 	VM_STAT_ADD(pga_vmstats.pga_alloc);
3416 
3417 	/* only base pagesize currently supported */
3418 	if (size != MMU_PAGESIZE)
3419 		return (NULL);
3420 
3421 	/*
3422 	 * If we're passed a specific lgroup, we use it.  Otherwise,
3423 	 * assume first-touch placement is desired.
3424 	 */
3425 	if (!LGRP_EXISTS(lgrp))
3426 		lgrp = lgrp_home_lgrp();
3427 
3428 	/* LINTED */
3429 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3430 
3431 	/*
3432 	 * Only hold one freelist or cachelist lock at a time, that way we
3433 	 * can start anywhere and not have to worry about lock
3434 	 * ordering.
3435 	 */
3436 	if (dma_attr == NULL) {
3437 		n = mtype16m;
3438 		m = mtypetop;
3439 		fullrange = 1;
3440 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3441 	} else {
3442 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3443 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3444 
3445 		/*
3446 		 * We can only guarantee alignment to a page boundary.
3447 		 */
3448 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3449 			return (NULL);
3450 
3451 		/* Sanity check the dma_attr */
3452 		if (pfnlo > pfnhi)
3453 			return (NULL);
3454 
3455 		n = pfn_2_mtype(pfnlo);
3456 		m = pfn_2_mtype(pfnhi);
3457 
3458 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3459 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
3460 	}
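	/*
	 * When fullrange is set, [pfnlo, pfnhi] completely covers its end
	 * mtype ranges, so the ordinary freelist/cachelist getters can be
	 * used below; otherwise page_get_mnode_anylist() must check each
	 * candidate page against the DMA address limits.
	 */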
3461 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3462 
3463 	szc = 0;
3464 
3465 	/* cycling through mtypes is handled by RANGE0 if n == mtype16m */
3466 	if (n == mtype16m) {
3467 		flags |= PGI_MT_RANGE0;
3468 		n = m;
3469 	}
3470 
3471 	/*
3472 	 * Try local memory node first, but try remote if we can't
3473 	 * get a page of the right color.
3474 	 */
3475 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3476 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3477 		/*
3478 		 * allocate pages from high pfn to low.
3479 		 */
3480 		mtype = m;
3481 		do {
3482 			if (fullrange != 0) {
3483 				pp = page_get_mnode_freelist(mnode,
3484 				    bin, mtype, szc, flags);
3485 				if (pp == NULL) {
3486 					pp = page_get_mnode_cachelist(
3487 					    bin, flags, mnode, mtype);
3488 				}
3489 			} else {
3490 				pp = page_get_mnode_anylist(bin, szc,
3491 				    flags, mnode, mtype, dma_attr);
3492 			}
3493 			if (pp != NULL) {
3494 				VM_STAT_ADD(pga_vmstats.pga_allocok);
3495 				check_dma(dma_attr, pp, 1);
3496 				return (pp);
3497 			}
3498 		} while (mtype != n &&
3499 		    (mtype = mnoderanges[mtype].mnr_next) != -1);
3500 		if (!local_failed_stat) {
3501 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3502 			local_failed_stat = 1;
3503 		}
3504 	}
3505 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3506 
3507 	return (NULL);
3508 }
3509 
3510 /*
3511  * page_create_io()
3512  *
3513  * This function is a copy of page_create_va() with an additional
3514  * argument 'mattr' that specifies DMA memory requirements to
3515  * the page list functions. This function is used by the segkmem
3516  * allocator so it is only to create new pages (i.e PG_EXCL is
3517  * set).
3518  *
3519  * Note: This interface is currently used by x86 PSM only and is
3520  *	 not fully specified, so the commitment level is only that of a
3521  *	 private x86-specific interface.  This interface uses the PSM-
3522  *	 specific page_get_anylist() interface.
3523  */
3524 
3525 #define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
3526 	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
3527 		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
3528 			break; \
3529 	} \
3530 }
3531 
3532 
3533 page_t *
3534 page_create_io(
3535 	struct vnode	*vp,
3536 	u_offset_t	off,
3537 	uint_t		bytes,
3538 	uint_t		flags,
3539 	struct as	*as,
3540 	caddr_t		vaddr,
3541 	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
3542 {
3543 	page_t		*plist = NULL;
3544 	uint_t		plist_len = 0;
3545 	pgcnt_t		npages;
3546 	page_t		*npp = NULL;
3547 	uint_t		pages_req;
3548 	page_t		*pp;
3549 	kmutex_t	*phm = NULL;
3550 	uint_t		index;
3551 
3552 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
3553 	    "page_create_start:vp %p off %llx bytes %u flags %x",
3554 	    vp, off, bytes, flags);
3555 
3556 	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
3557 
3558 	pages_req = npages = mmu_btopr(bytes);
3559 
3560 	/*
3561 	 * Do the freemem and pcf accounting.
3562 	 */
3563 	if (!page_create_wait(npages, flags)) {
3564 		return (NULL);
3565 	}
3566 
3567 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
3568 	    "page_create_success:vp %p off %llx", vp, off);
3569 
3570 	/*
3571 	 * If satisfying this request has left us with too little
3572 	 * memory, start the wheels turning to get some back.  The
3573 	 * first clause of the test prevents waking up the pageout
3574 	 * daemon in situations where it would decide that there's
3575 	 * nothing to do.
3576 	 */
3577 	if (nscan < desscan && freemem < minfree) {
3578 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
3579 		    "pageout_cv_signal:freemem %ld", freemem);
3580 		cv_signal(&proc_pageout->p_cv);
3581 	}
3582 
3583 	if (flags & PG_PHYSCONTIG) {
3584 
3585 		plist = page_get_contigpage(&npages, mattr, 1);
3586 		if (plist == NULL) {
3587 			page_create_putback(npages);
3588 			return (NULL);
3589 		}
3590 
3591 		pp = plist;
3592 
3593 		do {
3594 			if (!page_hashin(pp, vp, off, NULL)) {
3595 				panic("pg_creat_io: hashin failed %p %p %llx",
3596 				    (void *)pp, (void *)vp, off);
3597 			}
3598 			VM_STAT_ADD(page_create_new);
3599 			off += MMU_PAGESIZE;
3600 			PP_CLRFREE(pp);
3601 			PP_CLRAGED(pp);
3602 			page_set_props(pp, P_REF);
3603 			pp = pp->p_next;
3604 		} while (pp != plist);
3605 
3606 		if (!npages) {
3607 			check_dma(mattr, plist, pages_req);
3608 			return (plist);
3609 		} else {
3610 			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
3611 		}
3612 
3613 		/*
3614 		 * fall-thru:
3615 		 *
3616 		 * page_get_contigpage returns when npages <= sgllen.
3617 		 * Grab the rest of the non-contig pages below from anylist.
3618 		 */
3619 	}
3620 
3621 	/*
3622 	 * Loop around collecting the requested number of pages.
3623 	 * Most of the time, we have to `create' a new page. With
3624 	 * this in mind, pull the page off the free list before
3625 	 * getting the hash lock.  This will minimize the hash
3626 	 * lock hold time, nesting, and the like.  If it turns
3627 	 * out we don't need the page, we put it back at the end.
3628 	 */
3629 	while (npages--) {
3630 		phm = NULL;
3631 
3632 		index = PAGE_HASH_FUNC(vp, off);
3633 top:
3634 		ASSERT(phm == NULL);
3635 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
3636 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3637 
3638 		if (npp == NULL) {
3639 			/*
3640 			 * Try to get the page of any color either from
3641 			 * the freelist or from the cache list.
3642 			 */
3643 			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
3644 			    flags & ~PG_MATCH_COLOR, mattr, NULL);
3645 			if (npp == NULL) {
3646 				if (mattr == NULL) {
3647 					/*
3648 					 * Not looking for a special page;
3649 					 * panic!
3650 					 */
3651 					panic("no page found %d", (int)npages);
3652 				}
3653 				/*
3654 				 * No page found! This can happen
3655 				 * if we are looking for a page
3656 				 * within a specific memory range
3657 				 * for DMA purposes. If PG_WAIT is
3658 				 * specified then we wait for a
3659 				 * while and then try again. The
3660 				 * wait could be forever if we
3661 				 * don't get the page(s) we need.
3662 				 *
3663 				 * Note: XXX We really need a mechanism
3664 				 * to wait for pages in the desired
3665 				 * range. For now, we wait for any
3666 				 * pages and see if we can use them.
3667 				 */
3668 
3669 				if ((mattr != NULL) && (flags & PG_WAIT)) {
3670 					delay(10);
3671 					goto top;
3672 				}
3673 				goto fail; /* undo accounting stuff */
3674 			}
3675 
3676 			if (PP_ISAGED(npp) == 0) {
3677 				/*
3678 				 * Since this page came from the
3679 				 * cachelist, we must destroy the
3680 				 * old vnode association.
3681 				 */
3682 				page_hashout(npp, (kmutex_t *)NULL);
3683 			}
3684 		}
3685 
3686 		/*
3687 		 * We own this page!
3688 		 */
3689 		ASSERT(PAGE_EXCL(npp));
3690 		ASSERT(npp->p_vnode == NULL);
3691 		ASSERT(!hat_page_is_mapped(npp));
3692 		PP_CLRFREE(npp);
3693 		PP_CLRAGED(npp);
3694 
3695 		/*
3696 		 * Here we have a page in our hot little mitts and are
3697 		 * just waiting to stuff it on the appropriate lists.
3698 		 * Get the mutex and check to see if it really does
3699 		 * not exist.
3700 		 */
3701 		phm = PAGE_HASH_MUTEX(index);
3702 		mutex_enter(phm);
3703 		PAGE_HASH_SEARCH(index, pp, vp, off);
3704 		if (pp == NULL) {
3705 			VM_STAT_ADD(page_create_new);
3706 			pp = npp;
3707 			npp = NULL;
3708 			if (!page_hashin(pp, vp, off, phm)) {
3709 				/*
3710 				 * Since we hold the page hash mutex and
3711 				 * just searched for this page, page_hashin
3712 				 * had better not fail.  If it does, that
3713 				 * means some thread did not follow the
3714 				 * page hash mutex rules.  Panic now and
3715 				 * get it over with.  As usual, go down
3716 				 * holding all the locks.
3717 				 */
3718 				ASSERT(MUTEX_HELD(phm));
3719 				panic("page_create: hashin fail %p %p %llx %p",
3720 				    (void *)pp, (void *)vp, off, (void *)phm);
3721 
3722 			}
3723 			ASSERT(MUTEX_HELD(phm));
3724 			mutex_exit(phm);
3725 			phm = NULL;
3726 
3727 			/*
3728 			 * Hat layer locking need not be done to set
3729 			 * the following bits since the page is not hashed
3730 			 * and was on the free list (i.e., had no mappings).
3731 			 *
3732 			 * Set the reference bit to protect
3733 			 * against immediate pageout
3734 			 *
3735 			 * XXXmh modify freelist code to set reference
3736 			 * bit so we don't have to do it here.
3737 			 */
3738 			page_set_props(pp, P_REF);
3739 		} else {
3740 			ASSERT(MUTEX_HELD(phm));
3741 			mutex_exit(phm);
3742 			phm = NULL;
3743 			/*
3744 			 * NOTE: This should not happen for pages associated
3745 			 *	 with kernel vnode 'kvp'.
3746 			 */
3747 			/* XX64 - to debug why this happens! */
3748 			ASSERT(!VN_ISKAS(vp));
3749 			if (VN_ISKAS(vp))
3750 				cmn_err(CE_NOTE,
3751 				    "page_create: page not expected "
3752 				    "in hash list for kernel vnode - pp 0x%p",
3753 				    (void *)pp);
3754 			VM_STAT_ADD(page_create_exists);
3755 			goto fail;
3756 		}
3757 
3758 		/*
3759 		 * Got a page!  It is locked.  Acquire the i/o
3760 		 * lock since we are going to use the p_next and
3761 		 * p_prev fields to link the requested pages together.
3762 		 */
3763 		page_io_lock(pp);
3764 		page_add(&plist, pp);
3765 		plist = plist->p_next;
3766 		off += MMU_PAGESIZE;
3767 		vaddr += MMU_PAGESIZE;
3768 	}
3769 
3770 	check_dma(mattr, plist, pages_req);
3771 	return (plist);
3772 
3773 fail:
3774 	if (npp != NULL) {
3775 		/*
3776 		 * Did not need this page after all.
3777 		 * Put it back on the free list.
3778 		 */
3779 		VM_STAT_ADD(page_create_putbacks);
3780 		PP_SETFREE(npp);
3781 		PP_SETAGED(npp);
3782 		npp->p_offset = (u_offset_t)-1;
3783 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
3784 		page_unlock(npp);
3785 	}
3786 
3787 	/*
3788 	 * Give up the pages we already got.
3789 	 */
3790 	while (plist != NULL) {
3791 		pp = plist;
3792 		page_sub(&plist, pp);
3793 		page_io_unlock(pp);
3794 		plist_len++;
3795 		/*LINTED: constant in conditional ctx*/
3796 		VN_DISPOSE(pp, B_INVAL, 0, kcred);
3797 	}
3798 
3799 	/*
3800 	 * VN_DISPOSE does freemem accounting for the pages in plist
3801 	 * by calling page_free. So, we need to undo the pcf accounting
3802 	 * for only the remaining pages.
3803 	 */
3804 	VM_STAT_ADD(page_create_putbacks);
3805 	page_create_putback(pages_req - plist_len);
3806 
3807 	return (NULL);
3808 }
3809 #endif /* !__xpv */
3810 
3811 
3812 /*
3813  * Copy the data from the physical page represented by "frompp" to
3814  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3815  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
3816  * level and no one sleeps with an active mapping there.
3817  *
3818  * Note that the ref/mod bits in the page_t's are not affected by
3819  * this operation, hence it is up to the caller to update them appropriately.
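 *
 * Returns 1 on success, or 0 if a fault occurred while the copy was in
 * progress.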
3820  */
3821 int
3822 ppcopy(page_t *frompp, page_t *topp)
3823 {
3824 	caddr_t		pp_addr1;
3825 	caddr_t		pp_addr2;
3826 	hat_mempte_t	pte1;
3827 	hat_mempte_t	pte2;
3828 	kmutex_t	*ppaddr_mutex;
3829 	label_t		ljb;
3830 	int		ret = 1;
3831 
3832 	ASSERT_STACK_ALIGNED();
3833 	ASSERT(PAGE_LOCKED(frompp));
3834 	ASSERT(PAGE_LOCKED(topp));
3835 
3836 	if (kpm_enable) {
3837 		pp_addr1 = hat_kpm_page2va(frompp, 0);
3838 		pp_addr2 = hat_kpm_page2va(topp, 0);
3839 		kpreempt_disable();
3840 	} else {
3841 		/*
3842 		 * disable preemption so that the CPU can't change underneath us
3843 		 */
3844 		kpreempt_disable();
3845 
3846 		pp_addr1 = CPU->cpu_caddr1;
3847 		pp_addr2 = CPU->cpu_caddr2;
3848 		pte1 = CPU->cpu_caddr1pte;
3849 		pte2 = CPU->cpu_caddr2pte;
3850 
3851 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3852 		mutex_enter(ppaddr_mutex);
3853 
3854 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3855 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3856 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3857 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3858 		    HAT_LOAD_NOCONSIST);
3859 	}
3860 
3861 	if (on_fault(&ljb)) {
3862 		ret = 0;
3863 		goto faulted;
3864 	}
3865 	if (use_sse_pagecopy)
3866 #ifdef __xpv
3867 		page_copy_no_xmm(pp_addr2, pp_addr1);
3868 #else
3869 		hwblkpagecopy(pp_addr1, pp_addr2);
3870 #endif
3871 	else
3872 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
3873 
3874 	no_fault();
3875 faulted:
3876 	if (!kpm_enable) {
3877 #ifdef __xpv
3878 		/*
3879 		 * We can't leave unused mappings lying about under the
3880 		 * hypervisor, so blow them away.
3881 		 */
3882 		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3883 		    UVMF_INVLPG | UVMF_LOCAL) < 0)
3884 			panic("HYPERVISOR_update_va_mapping() failed");
3885 		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3886 		    UVMF_INVLPG | UVMF_LOCAL) < 0)
3887 			panic("HYPERVISOR_update_va_mapping() failed");
3888 #endif
3889 		mutex_exit(ppaddr_mutex);
3890 	}
3891 	kpreempt_enable();
3892 	return (ret);
3893 }
3894 
3895 void
3896 pagezero(page_t *pp, uint_t off, uint_t len)
3897 {
3898 	ASSERT(PAGE_LOCKED(pp));
3899 	pfnzero(page_pptonum(pp), off, len);
3900 }
3901 
3902 /*
3903  * Zero the physical page from off to off + len given by pfn
3904  * without changing the reference and modified bits of page.
3905  *
3906  * We do this using CPU private page address #2; see ppcopy() for more info.
3907  * pfnzero() must not be called at interrupt level.
3908  */
3909 void
3910 pfnzero(pfn_t pfn, uint_t off, uint_t len)
3911 {
3912 	caddr_t		pp_addr2;
3913 	hat_mempte_t	pte2;
3914 	kmutex_t	*ppaddr_mutex = NULL;
3915 
3916 	ASSERT_STACK_ALIGNED();
3917 	ASSERT(len <= MMU_PAGESIZE);
3918 	ASSERT(off <= MMU_PAGESIZE);
3919 	ASSERT(off + len <= MMU_PAGESIZE);
3920 
3921 	if (kpm_enable && !pfn_is_foreign(pfn)) {
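		/*
		 * Use the permanent kpm mapping when we can.  Foreign MFNs
		 * (pages belonging to another domain under the hypervisor)
		 * are not covered by kpm, so they take the remap path below.
		 */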
3922 		pp_addr2 = hat_kpm_pfn2va(pfn);
3923 		kpreempt_disable();
3924 	} else {
3925 		kpreempt_disable();
3926 
3927 		pp_addr2 = CPU->cpu_caddr2;
3928 		pte2 = CPU->cpu_caddr2pte;
3929 
3930 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3931 		mutex_enter(ppaddr_mutex);
3932 
3933 		hat_mempte_remap(pfn, pp_addr2, pte2,
3934 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3935 		    HAT_LOAD_NOCONSIST);
3936 	}
3937 
3938 	if (use_sse_pagezero) {
3939 #ifdef __xpv
3940 		uint_t rem;
3941 
3942 		/*
3943 		 * zero a byte at a time until properly aligned for
3944 		 * block_zero_no_xmm().
3945 		 */
3946 		for (; P2PHASE(off, (uint_t)BLOCKZEROALIGN) && len != 0; len--)
3947 			pp_addr2[off++] = 0;
3948 
3949 		/*
3950 		 * Now use faster block_zero_no_xmm() for any range
3951 		 * that is properly aligned and sized.
3952 		 */
3953 		rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3954 		len -= rem;
3955 		if (len != 0) {
3956 			block_zero_no_xmm(pp_addr2 + off, len);
3957 			off += len;
3958 		}
3959 
3960 		/*
3961 		 * zero remainder with byte stores.
3962 		 */
3963 		while (rem-- > 0)
3964 			pp_addr2[off++] = 0;
3965 #else
3966 		hwblkclr(pp_addr2 + off, len);
3967 #endif
3968 	} else {
3969 		bzero(pp_addr2 + off, len);
3970 	}
3971 
3972 	if (!kpm_enable || pfn_is_foreign(pfn)) {
3973 #ifdef __xpv
3974 		/*
3975 		 * On the hypervisor this page might get used for a page
3976 		 * table before any intervening change to this mapping,
3977 		 * so blow it away.
3978 		 */
3979 		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3980 		    UVMF_INVLPG) < 0)
3981 			panic("HYPERVISOR_update_va_mapping() failed");
3982 #endif
3983 		mutex_exit(ppaddr_mutex);
3984 	}
3985 
3986 	kpreempt_enable();
3987 }
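
/*
 * Illustrative sketch (hypothetical caller, not from the original source):
 * zero everything from 'off' to the end of a locked page.  Like all
 * pagezero()/pfnzero() callers, it must keep the range within a single page
 * and must not be running at interrupt level.
 */
static void
example_zero_page_tail(page_t *pp, uint_t off)
{
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(off <= MMU_PAGESIZE);

	pagezero(pp, off, MMU_PAGESIZE - off);
}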
3988 
3989 /*
3990  * Platform-dependent page scrub call.
3991  */
3992 void
3993 pagescrub(page_t *pp, uint_t off, uint_t len)
3994 {
3995 	/*
3996 	 * For now, we rely on the fact that pagezero() will
3997 	 * always clear uncorrectable errors (UEs).
3998 	 */
3999 	pagezero(pp, off, len);
4000 }
4001 
4002 /*
4003  * Set up two private addresses on a given CPU for use by ppcopy() and pfnzero()
4004  */
4005 void
4006 setup_vaddr_for_ppcopy(struct cpu *cpup)
4007 {
4008 	void *addr;
4009 	hat_mempte_t pte_pa;
4010 
4011 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4012 	pte_pa = hat_mempte_setup(addr);
4013 	cpup->cpu_caddr1 = addr;
4014 	cpup->cpu_caddr1pte = pte_pa;
4015 
4016 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4017 	pte_pa = hat_mempte_setup(addr);
4018 	cpup->cpu_caddr2 = addr;
4019 	cpup->cpu_caddr2pte = pte_pa;
4020 
4021 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
4022 }
4023 
4024 /*
4025  * Undo setup_vaddr_for_ppcopy
4026  */
4027 void
4028 teardown_vaddr_for_ppcopy(struct cpu *cpup)
4029 {
4030 	mutex_destroy(&cpup->cpu_ppaddr_mutex);
4031 
4032 	hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
4033 	cpup->cpu_caddr2pte = 0;
4034 	vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
4035 	cpup->cpu_caddr2 = 0;
4036 
4037 	hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
4038 	cpup->cpu_caddr1pte = 0;
4039 	vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
4040 	cpup->cpu_caddr1 = 0;
4041 }
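
/*
 * Illustrative sketch (hypothetical caller, not from the original source):
 * the two routines above are meant to be paired around a CPU's lifetime,
 * setting up the private addresses before the CPU can take part in
 * ppcopy()/pfnzero() and tearing them down once it no longer can.
 */
static void
example_cpu_ppcopy_lifecycle(struct cpu *cp)
{
	setup_vaddr_for_ppcopy(cp);	/* allocate cpu_caddr1/2 and PTEs */

	/* ... the CPU comes online and may use ppcopy()/pagezero() ... */

	teardown_vaddr_for_ppcopy(cp);	/* release the PTEs and the VA */
}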
4042 
4043 /*
4044  * Function for flushing D-cache when performing module relocations
4045  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
4046  */
4047 void
4048 dcache_flushall()
4049 {}
4050 
4051 /*
4052  * Allocate a memory page.  The argument 'seed' can be any pseudo-random
4053  * number to vary where the pages come from.  This is quite a hacked up
4054  * method -- it works for now, but really needs to be fixed up a bit.
4055  *
4056  * We currently use page_create_va() on the kvp with fake offsets,
4057  * segments and virt address.  This is pretty bogus, but was copied from the
4058  * old hat_i86.c code.  A better approach would be to specify either mnode
4059  * random or mnode local and take a page from whatever color has the most
4060  * available; this would have a minimal impact on page coloring.
4061  */
4062 page_t *
4063 page_get_physical(uintptr_t seed)
4064 {
4065 	page_t *pp;
4066 	u_offset_t offset;
4067 	static struct seg tmpseg;
4068 	static uintptr_t ctr = 0;
4069 
4070 	/*
4071 	 * This code is gross; we really need a simpler page allocator.
4072 	 *
4073 	 * We need to assign an offset for the page in order to call
4074 	 * page_create_va().  To avoid conflicts with other pages, we get
4075 	 * creative with the offset: for 32 bits we need an offset > 4 Gig,
4076 	 * and for 64 bits we need an offset somewhere in the VA hole.
4077 	 */
4078 	offset = seed;
4079 	if (offset > kernelbase)
4080 		offset -= kernelbase;
4081 	offset <<= MMU_PAGESHIFT;
4082 #if defined(__amd64)
4083 	offset += mmu.hole_start;	/* something in VA hole */
4084 #else
4085 	offset += 1ULL << 40;	/* something > 4 Gig */
4086 #endif
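
	/*
	 * Illustrative example (not from the original source): on a 64-bit
	 * kernel a seed of kernelbase + 5, say, becomes offset =
	 * (5 << MMU_PAGESHIFT) + mmu.hole_start, i.e. a page-aligned offset
	 * up in the VA hole, so distinct seeds yield distinct offsets that
	 * should not collide with kvp offsets in normal use.
	 */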
4087 
4088 	if (page_resv(1, KM_NOSLEEP) == 0)
4089 		return (NULL);
4090 
4091 #ifdef	DEBUG
4092 	pp = page_exists(&kvp, offset);
4093 	if (pp != NULL)
4094 		panic("page already exists %p", (void *)pp);
4095 #endif
4096 
4097 	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
4098 	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */
4099 	if (pp != NULL) {
4100 		page_io_unlock(pp);
4101 		page_downgrade(pp);
4102 	}
4103 	return (pp);
4104 }
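
/*
 * Illustrative sketch (hypothetical consumer, not from the original source):
 * using page_get_physical().  On success the page is returned SHARED-locked
 * with one page of reservation held (see page_resv() above), so whoever
 * eventually frees it must undo both; that cleanup is deliberately not
 * shown here.
 */
static pfn_t
example_grab_physical_page(uintptr_t seed)
{
	page_t *pp;

	if ((pp = page_get_physical(seed)) == NULL)
		return (PFN_INVALID);

	return (page_pptonum(pp));
}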
4105