xref: /titanic_51/usr/src/uts/i86pc/vm/vm_machdep.c (revision 3906e0c22bea9bf690c20f62b0575c1b1d0ace2e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 /*
37  * UNIX machine dependent virtual memory support.
38  */
39 
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/user.h>
44 #include <sys/proc.h>
45 #include <sys/kmem.h>
46 #include <sys/vmem.h>
47 #include <sys/buf.h>
48 #include <sys/cpuvar.h>
49 #include <sys/lgrp.h>
50 #include <sys/disp.h>
51 #include <sys/vm.h>
52 #include <sys/mman.h>
53 #include <sys/vnode.h>
54 #include <sys/cred.h>
55 #include <sys/exec.h>
56 #include <sys/exechdr.h>
57 #include <sys/debug.h>
58 
59 #include <vm/hat.h>
60 #include <vm/as.h>
61 #include <vm/seg.h>
62 #include <vm/seg_kp.h>
63 #include <vm/seg_vn.h>
64 #include <vm/page.h>
65 #include <vm/seg_kmem.h>
66 #include <vm/seg_kpm.h>
67 #include <vm/vm_dep.h>
68 
69 #include <sys/cpu.h>
70 #include <sys/vm_machparam.h>
71 #include <sys/memlist.h>
72 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
73 #include <vm/hat_i86.h>
74 #include <sys/x86_archext.h>
75 #include <sys/elf_386.h>
76 #include <sys/cmn_err.h>
77 #include <sys/archsystm.h>
78 #include <sys/machsystm.h>
79 
80 #include <sys/vtrace.h>
81 #include <sys/ddidmareq.h>
82 #include <sys/promif.h>
83 #include <sys/memnode.h>
84 #include <sys/stack.h>
85 
86 uint_t vac_colors = 1;
87 
88 int largepagesupport = 0;
89 extern uint_t page_create_new;
90 extern uint_t page_create_exists;
91 extern uint_t page_create_putbacks;
93 extern uintptr_t eprom_kernelbase;
94 extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */
95 
96 /* 4g memory management */
97 pgcnt_t		maxmem4g;
98 pgcnt_t		freemem4g;
99 int		physmax4g;
100 int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
101 int		lotsfree4gshift = 3;
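/*
 * With the defaults above, DESFREE4G works out to maxmem4g >> 4 (1/16th of
 * the pages below 4G); LOTSFREE4G presumably follows the same pattern with
 * a shift of 3 (1/8th).
 */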
102 
103 /* 16m memory management: desired number of free pages below 16m. */
104 pgcnt_t		desfree16m = 0x380;
105 
106 #ifdef VM_STATS
107 struct {
108 	ulong_t	pga_alloc;
109 	ulong_t	pga_notfullrange;
110 	ulong_t	pga_nulldmaattr;
111 	ulong_t	pga_allocok;
112 	ulong_t	pga_allocfailed;
113 	ulong_t	pgma_alloc;
114 	ulong_t	pgma_allocok;
115 	ulong_t	pgma_allocfailed;
116 	ulong_t	pgma_allocempty;
117 } pga_vmstats;
118 #endif
119 
120 uint_t mmu_page_sizes;
121 
122 /* How many page sizes the users can see */
123 uint_t mmu_exported_page_sizes;
124 
125 size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
126 /*
127  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
128  * fewer than this many pages.
129  */
130 pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
131 
132 /*
133  * Return the optimum page size for a given mapping
134  */
135 /*ARGSUSED*/
136 size_t
137 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
138 {
139 	level_t l;
140 
141 	if (remap)
142 		*remap = 0;
143 
144 	switch (maptype) {
145 
146 	case MAPPGSZ_STK:
147 	case MAPPGSZ_HEAP:
148 	case MAPPGSZ_VA:
149 		/*
150 		 * use the page size that best fits len
151 		 */
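		/*
		 * For example, on a kernel where LEVEL_SIZE(1) is 2M and
		 * max_page_level is 1, a 3M request returns 2M, while a 1M
		 * request falls back to LEVEL_SIZE(0), the 4K base page.
		 */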
152 		for (l = mmu.max_page_level; l > 0; --l) {
153 			if (len < LEVEL_SIZE(l))
154 				continue;
155 			break;
156 		}
157 		return (LEVEL_SIZE(l));
158 
159 	/*
160 	 * for ISM use the 1st large page size.
161 	 */
162 	case MAPPGSZ_ISM:
163 		if (mmu.max_page_level == 0)
164 			return (MMU_PAGESIZE);
165 		return (LEVEL_SIZE(1));
166 	}
167 	return (0);
168 }
169 
170 /*
171  * This can be patched via /etc/system to allow large pages
172  * to be used for mapping application and libraries text segments.
173  */
174 int	use_text_largepages = 0;
175 int	use_shm_largepages = 0;
176 
177 /*
178  * Return a bit vector of large page size codes that
179  * can be used to map [addr, addr + len) region.
180  */
181 
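/*
 * The value (1 << 1) returned below sets bit 1 of the vector, i.e. page size
 * code 1 (the first large page size); bit 0 corresponds to the base page size.
 */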
182 /*ARGSUSED*/
183 uint_t
184 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
185 {
186 	size_t	pgsz;
187 	caddr_t a;
188 
189 	if (!text || !use_text_largepages ||
190 	    mmu.max_page_level == 0)
191 		return (0);
192 
193 	pgsz = LEVEL_SIZE(1);
194 	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
195 	if (a < addr || a >= addr + len) {
196 		return (0);
197 	}
198 	len -= (a - addr);
199 	if (len < pgsz) {
200 		return (0);
201 	}
202 	return (1 << 1);
203 }
204 
205 uint_t
206 map_shm_pgszcvec(caddr_t addr, size_t len, uintptr_t off)
207 {
208 	size_t	pgsz;
209 	caddr_t a;
210 
211 	if (!use_shm_largepages || mmu.max_page_level == 0) {
212 		return (0);
213 	}
214 
215 	pgsz = LEVEL_SIZE(1);
216 	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
217 	if (a < addr || a >= addr + len ||
218 	    P2PHASE((uintptr_t)addr ^ off, pgsz)) {
219 		return (0);
220 	}
221 	len -= (a - addr);
222 	if (len < pgsz) {
223 		return (0);
224 	}
225 	return (1 << 1);
226 }
227 
228 /*
229  * Handle a pagefault.
230  */
231 faultcode_t
232 pagefault(
233 	caddr_t addr,
234 	enum fault_type type,
235 	enum seg_rw rw,
236 	int iskernel)
237 {
238 	struct as *as;
239 	struct hat *hat;
240 	struct proc *p;
241 	kthread_t *t;
242 	faultcode_t res;
243 	caddr_t base;
244 	size_t len;
245 	int err;
246 	int mapped_red;
247 	uintptr_t ea;
248 
249 	ASSERT_STACK_ALIGNED();
250 
251 	if (INVALID_VADDR(addr))
252 		return (FC_NOMAP);
253 
254 	mapped_red = segkp_map_red();
255 
256 	if (iskernel) {
257 		as = &kas;
258 		hat = as->a_hat;
259 	} else {
260 		t = curthread;
261 		p = ttoproc(t);
262 		as = p->p_as;
263 		hat = as->a_hat;
264 	}
265 
266 	/*
267 	 * Dispatch pagefault.
268 	 */
269 	res = as_fault(hat, as, addr, 1, type, rw);
270 
271 	/*
272 	 * If this isn't a potential unmapped hole in the user's
273 	 * UNIX data or stack segments, just return status info.
274 	 */
275 	if (res != FC_NOMAP || iskernel)
276 		goto out;
277 
278 	/*
279 	 * Check to see if we happened to fault on a currently unmapped
280 	 * part of the UNIX data or stack segments.  If so, create a zfod
281 	 * mapping there and then try calling the fault routine again.
282 	 */
283 	base = p->p_brkbase;
284 	len = p->p_brksize;
285 
286 	if (addr < base || addr >= base + len) {		/* data seg? */
287 		base = (caddr_t)p->p_usrstack - p->p_stksize;
288 		len = p->p_stksize;
289 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
290 			/* not in either UNIX data or stack segments */
291 			res = FC_NOMAP;
292 			goto out;
293 		}
294 	}
295 
296 	/*
297 	 * The rest of this function implements 3.X/4.X/5.X compatibility.
298 	 * This code is probably not needed anymore.
299 	 */
300 	if (p->p_model == DATAMODEL_ILP32) {
301 
302 		/* expand the gap to the page boundaries on each side */
303 		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
304 		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
305 		len = ea - (uintptr_t)base;
306 
307 		as_rangelock(as);
308 		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
309 		    0) {
310 			err = as_map(as, base, len, segvn_create, zfod_argsp);
311 			as_rangeunlock(as);
312 			if (err) {
313 				res = FC_MAKE_ERR(err);
314 				goto out;
315 			}
316 		} else {
317 			/*
318 			 * This page is already mapped by another thread after
319 			 * we returned from as_fault() above.  We just fall
320 			 * through to as_fault() below.
321 			 */
322 			as_rangeunlock(as);
323 		}
324 
325 		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
326 	}
327 
328 out:
329 	if (mapped_red)
330 		segkp_unmap_red();
331 
332 	return (res);
333 }
334 
335 void
336 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
337 {
338 	struct proc *p = curproc;
339 	caddr_t userlimit = (flags & _MAP_LOW32) ?
340 	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
341 
342 	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
343 }
344 
345 /*ARGSUSED*/
346 int
347 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
348 {
349 	return (0);
350 }
351 
352 /*
353  * map_addr_proc() is the routine called when the system is to
354  * choose an address for the user.  We will pick an address
355  * range which is the highest available below kernelbase.
356  *
357  * addrp is a value/result parameter.
358  *	On input it is a hint from the user to be used in a completely
359  *	machine dependent fashion.  We decide to completely ignore this hint.
360  *
361  *	On output it is NULL if no address can be found in the current
362  *	process's address space or else an address that is currently
363  *	not mapped for len bytes with a page of red zone on either side.
364  *
365  *	align is not needed on x86 (it's for virtually addressed caches)
366  */
367 /*ARGSUSED*/
368 void
369 map_addr_proc(
370 	caddr_t *addrp,
371 	size_t len,
372 	offset_t off,
373 	int vacalign,
374 	caddr_t userlimit,
375 	struct proc *p,
376 	uint_t flags)
377 {
378 	struct as *as = p->p_as;
379 	caddr_t addr;
380 	caddr_t base;
381 	size_t slen;
382 	size_t align_amount;
383 
384 	ASSERT32(userlimit == as->a_userlimit);
385 
386 	base = p->p_brkbase;
387 #if defined(__amd64)
388 	/*
389 	 * XX64 Yes, this needs more work.
390 	 */
391 	if (p->p_model == DATAMODEL_NATIVE) {
392 		if (userlimit < as->a_userlimit) {
393 			/*
394 			 * This happens when a program wants to map
395 			 * something in a range that's accessible to a
396 			 * program in a smaller address space.  For example,
397 			 * a 64-bit program calling mmap32(2) to guarantee
398 			 * that the returned address is below 4Gbytes.
399 			 */
400 			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
401 
402 			if (userlimit > base)
403 				slen = userlimit - base;
404 			else {
405 				*addrp = NULL;
406 				return;
407 			}
408 		} else {
409 			/*
410 			 * XX64 This layout is probably wrong .. but in
411 			 * the event we make the amd64 address space look
412 			 * like sparcv9 i.e. with the stack -above- the
413 			 * heap, this bit of code might even be correct.
414 			 */
415 			slen = p->p_usrstack - base -
416 			    (((size_t)rctl_enforced_value(
417 			    rctlproc_legacy[RLIMIT_STACK],
418 			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
419 		}
420 	} else
421 #endif
422 		slen = userlimit - base;
423 
424 	len = (len + PAGEOFFSET) & PAGEMASK;
425 
426 	/*
427 	 * Redzone for each side of the request. This is done to leave
428 	 * one page unmapped between segments. This is not required, but
429 	 * it's useful for the user because if their program strays across
430  * a segment boundary, it will catch a fault immediately, making
431 	 * debugging a little easier.
432 	 */
433 	len += 2 * MMU_PAGESIZE;
434 
435 	/*
436 	 * figure out what the alignment should be
437 	 *
438 	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
439 	 */
440 	if (len <= ELF_386_MAXPGSZ) {
441 		/*
442 		 * Align virtual addresses to ensure that ELF shared libraries
443 		 * are mapped with the appropriate alignment constraints by
444 		 * the run-time linker.
445 		 */
446 		align_amount = ELF_386_MAXPGSZ;
447 	} else {
448 		int l = mmu.max_page_level;
449 
450 		while (l && len < LEVEL_SIZE(l))
451 			--l;
452 
453 		align_amount = LEVEL_SIZE(l);
454 	}
455 
456 	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
457 		align_amount = (uintptr_t)*addrp;
458 
459 	len += align_amount;
460 
461 	/*
462 	 * Look for a large enough hole starting below userlimit.
463 	 * After finding it, use the upper part.  Addition of PAGESIZE
464 	 * is for the redzone as described above.
465 	 */
466 	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
467 		caddr_t as_addr;
468 
469 		addr = base + slen - len + MMU_PAGESIZE;
470 		as_addr = addr;
471 		/*
472 		 * Round address DOWN to the alignment amount,
473 		 * add the offset, and if this address is less
474 		 * than the original address, add alignment amount.
475 		 */
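		/*
		 * Hypothetical example: with align_amount 0x10000 and
		 * (off & (align_amount - 1)) == 0x2000, a candidate of
		 * 0x7ffe5000 rounds down to 0x7ffe0000, becomes 0x7ffe2000
		 * after adding the offset, and since that is below the
		 * candidate, 0x10000 is added, giving 0x7fff2000.
		 */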
476 		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
477 		addr += (uintptr_t)(off & (align_amount - 1));
478 		if (addr < as_addr)
479 			addr += align_amount;
480 
481 		ASSERT(addr <= (as_addr + align_amount));
482 		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
483 		    ((uintptr_t)(off & (align_amount - 1))));
484 		*addrp = addr;
485 	} else {
486 		*addrp = NULL;	/* no more virtual space */
487 	}
488 }
489 
490 /*
491  * Determine whether [base, base+len] contains a valid range of
492  * addresses at least minlen long. base and len are adjusted if
493  * required to provide a valid range.
494  */
495 /*ARGSUSED3*/
496 int
497 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
498 {
499 	uintptr_t hi, lo;
500 
501 	lo = (uintptr_t)*basep;
502 	hi = lo + *lenp;
503 
504 	/*
505 	 * If hi rolled over the top, try cutting back.
506 	 */
507 	if (hi < lo) {
508 		if (0 - lo + hi < minlen)
509 			return (0);
510 		if (0 - lo < minlen)
511 			return (0);
512 		*lenp = 0 - lo;
513 	} else if (hi - lo < minlen) {
514 		return (0);
515 	}
516 #if defined(__amd64)
517 	/*
518 	 * Deal with a possible hole in the address range between
519 	 * hole_start and hole_end that should never be mapped.
520 	 */
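	/*
	 * This is the amd64 non-canonical address hole: virtual addresses
	 * whose upper bits are not a sign extension of bit 47 on processors
	 * with 48-bit virtual addressing.
	 */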
521 	if (lo < hole_start) {
522 		if (hi > hole_start) {
523 			if (hi < hole_end) {
524 				hi = hole_start;
525 			} else {
526 				/* lo < hole_start && hi >= hole_end */
527 				if (dir == AH_LO) {
528 					/*
529 					 * prefer lowest range
530 					 */
531 					if (hole_start - lo >= minlen)
532 						hi = hole_start;
533 					else if (hi - hole_end >= minlen)
534 						lo = hole_end;
535 					else
536 						return (0);
537 				} else {
538 					/*
539 					 * prefer highest range
540 					 */
541 					if (hi - hole_end >= minlen)
542 						lo = hole_end;
543 					else if (hole_start - lo >= minlen)
544 						hi = hole_start;
545 					else
546 						return (0);
547 				}
548 			}
549 		}
550 	} else {
551 		/* lo >= hole_start */
552 		if (hi < hole_end)
553 			return (0);
554 		if (lo < hole_end)
555 			lo = hole_end;
556 	}
557 
558 	if (hi - lo < minlen)
559 		return (0);
560 
561 	*basep = (caddr_t)lo;
562 	*lenp = hi - lo;
563 #endif
564 	return (1);
565 }
566 
567 /*
568  * Determine whether [addr, addr+len] are valid user addresses.
569  */
570 /*ARGSUSED*/
571 int
572 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
573     caddr_t userlimit)
574 {
575 	caddr_t eaddr = addr + len;
576 
577 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
578 		return (RANGE_BADADDR);
579 
580 #if defined(__amd64)
581 	/*
582 	 * Check for the VA hole
583 	 */
584 	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
585 		return (RANGE_BADADDR);
586 #endif
587 
588 	return (RANGE_OKAY);
589 }
590 
591 /*
592  * Return 1 if the page frame is onboard memory, else 0.
593  */
594 int
595 pf_is_memory(pfn_t pf)
596 {
597 	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
598 }
599 
600 
601 /*
602  * initialized by page_coloring_init().
603  */
604 uint_t	page_colors;
605 uint_t	page_colors_mask;
606 uint_t	page_coloring_shift;
607 int	cpu_page_colors;
608 static uint_t	l2_colors;
609 
610 /*
611  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
612  * and page_colors are calculated from the l2 cache n-way set size.  Within a
613  * mnode range, the page freelist and cachelist are hashed into bins based on
614  * color. This makes it easier to search for a page within a specific memory
615  * range.
616  */
617 #define	PAGE_COLORS_MIN	16
618 
619 page_t ****page_freelists;
620 page_t ***page_cachelists;
621 
622 /*
623  * As the PC architecture evolved, memory was clumped into several
624  * ranges for various historical I/O devices to do DMA.
625  * < 16Meg - ISA bus
626  * < 2Gig - ???
627  * < 4Gig - PCI bus or drivers that don't understand PAE mode
628  */
629 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
630     0x100000,	/* pfn range for 4G and above */
631     0x80000,	/* pfn range for 2G-4G */
632     0x01000,	/* pfn range for 16M-2G */
633     0x00000,	/* pfn range for 0-16M */
634 };
635 
636 /*
637  * These are changed during startup if the machine has limited memory.
638  */
639 pfn_t *memranges = &arch_memranges[0];
640 int nranges = NUM_MEM_RANGES;
641 
642 /*
643  * Used by page layer to know about page sizes
644  */
645 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
646 
647 /*
648  * This can be patched via /etc/system to allow old non-PAE aware device
649  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
650  */
651 #if defined(__i386)
652 int restricted_kmemalloc = 0;
653 #elif defined(__amd64)
654 int restricted_kmemalloc = 0;
655 #endif
656 
657 kmutex_t	*fpc_mutex[NPC_MUTEX];
658 kmutex_t	*cpc_mutex[NPC_MUTEX];
659 
660 
661 /*
662  * return the memrange containing pfn
663  */
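/*
 * memranges[] is ordered from highest to lowest starting pfn, so with the
 * arch_memranges[] defaults above, pfn 0x120000 maps to index 0 (4G+),
 * pfn 0x5000 to index 2 (16M-2G), and pfn 0x800 to index 3 (0-16M).
 */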
664 int
665 memrange_num(pfn_t pfn)
666 {
667 	int n;
668 
669 	for (n = 0; n < nranges - 1; ++n) {
670 		if (pfn >= memranges[n])
671 			break;
672 	}
673 	return (n);
674 }
675 
676 /*
677  * return the mnoderange containing pfn
678  */
679 int
680 pfn_2_mtype(pfn_t pfn)
681 {
682 	int	n;
683 
684 	for (n = mnoderangecnt - 1; n >= 0; n--) {
685 		if (pfn >= mnoderanges[n].mnr_pfnlo) {
686 			break;
687 		}
688 	}
689 	return (n);
690 }
691 
692 /*
693  * is_contigpage_free:
694  *	returns a page list of contiguous pages. It minimally has to return
695  *	minctg pages. Caller determines minctg based on the scatter-gather
696  *	list length.
697  *
698  *	pfnp is set to the next page frame to search on return.
699  */
700 static page_t *
701 is_contigpage_free(
702 	pfn_t *pfnp,
703 	pgcnt_t *pgcnt,
704 	pgcnt_t minctg,
705 	uint64_t pfnseg,
706 	int iolock)
707 {
708 	int	i = 0;
709 	pfn_t	pfn = *pfnp;
710 	page_t	*pp;
711 	page_t	*plist = NULL;
712 
713 	/*
714 	 * fail if pfn + minctg crosses a segment boundary.
715 	 * Adjust for next starting pfn to begin at segment boundary.
716 	 */
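	/*
	 * pfnseg is derived from dma_attr_seg, conventionally a power-of-two
	 * boundary minus one, so "& pfnseg" yields a pfn's offset within its
	 * DMA segment; if the last pfn's offset is smaller than the first's,
	 * the run wraps into the next segment.
	 */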
717 
718 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
719 		*pfnp = roundup(*pfnp, pfnseg + 1);
720 		return (NULL);
721 	}
722 
723 	do {
724 retry:
725 		pp = page_numtopp_nolock(pfn + i);
726 		if ((pp == NULL) ||
727 		    (page_trylock(pp, SE_EXCL) == 0)) {
728 			(*pfnp)++;
729 			break;
730 		}
731 		if (page_pptonum(pp) != pfn + i) {
732 			page_unlock(pp);
733 			goto retry;
734 		}
735 
736 		if (!(PP_ISFREE(pp))) {
737 			page_unlock(pp);
738 			(*pfnp)++;
739 			break;
740 		}
741 
742 		if (!PP_ISAGED(pp)) {
743 			page_list_sub(pp, PG_CACHE_LIST);
744 			page_hashout(pp, (kmutex_t *)NULL);
745 		} else {
746 			page_list_sub(pp, PG_FREE_LIST);
747 		}
748 
749 		if (iolock)
750 			page_io_lock(pp);
751 		page_list_concat(&plist, &pp);
752 
753 		/*
754 		 * exit loop when pgcnt satisfied or segment boundary reached.
755 		 */
756 
757 	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
758 
759 	*pfnp += i;		/* set to next pfn to search */
760 
761 	if (i >= minctg) {
762 		*pgcnt -= i;
763 		return (plist);
764 	}
765 
766 	/*
767 	 * failure: minctg not satisfied.
768 	 *
769 	 * if next request crosses segment boundary, set next pfn
770 	 * to search from the segment boundary.
771 	 */
772 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
773 		*pfnp = roundup(*pfnp, pfnseg + 1);
774 
775 	/* clean up any pages already allocated */
776 
777 	while (plist) {
778 		pp = plist;
779 		page_sub(&plist, pp);
780 		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
781 		if (iolock)
782 			page_io_unlock(pp);
783 		page_unlock(pp);
784 	}
785 
786 	return (NULL);
787 }
788 
789 /*
790  * verify that pages being returned from allocator have correct DMA attribute
791  */
792 #ifndef DEBUG
793 #define	check_dma(a, b, c) (0)
794 #else
795 static void
796 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
797 {
798 	if (dma_attr == NULL)
799 		return;
800 
801 	while (cnt-- > 0) {
802 		if (mmu_ptob((uint64_t)pp->p_pagenum) <
803 		    dma_attr->dma_attr_addr_lo)
804 			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
805 		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
806 		    dma_attr->dma_attr_addr_hi)
807 			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
808 		pp = pp->p_next;
809 	}
810 }
811 #endif
812 
813 static kmutex_t	contig_lock;
814 
815 #define	CONTIG_LOCK()	mutex_enter(&contig_lock);
816 #define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);
817 
818 #define	PFN_16M		(mmu_btop((uint64_t)0x1000000))
819 
820 static page_t *
821 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
822 {
823 	pfn_t		pfn;
824 	int		sgllen;
825 	uint64_t	pfnseg;
826 	pgcnt_t		minctg;
827 	page_t		*pplist = NULL, *plist;
828 	uint64_t	lo, hi;
829 	pgcnt_t		pfnalign = 0;
830 	static pfn_t	startpfn;
831 	static pgcnt_t	lastctgcnt;
832 	uintptr_t	align;
833 
834 	CONTIG_LOCK();
835 
836 	if (mattr) {
837 		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
838 		hi = mmu_btop(mattr->dma_attr_addr_hi);
839 		if (hi >= physmax)
840 			hi = physmax - 1;
841 		sgllen = mattr->dma_attr_sgllen;
842 		pfnseg = mmu_btop(mattr->dma_attr_seg);
843 
844 		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
845 		if (align > MMU_PAGESIZE)
846 			pfnalign = mmu_btop(align);
847 
848 		/*
849 		 * in order to satisfy the request, must minimally
850 		 * acquire minctg contiguous pages
851 		 */
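		/*
		 * e.g. a 64-page request with dma_attr_sgllen == 16 needs
		 * at least howmany(64, 16) == 4 contiguous pages per
		 * scatter-gather element.
		 */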
852 		minctg = howmany(*pgcnt, sgllen);
853 
854 		ASSERT(hi >= lo);
855 
856 		/*
857 		 * start from where last searched if the minctg >= lastctgcnt
858 		 */
859 		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
860 			startpfn = lo;
861 	} else {
862 		hi = physmax - 1;
863 		lo = 0;
864 		sgllen = 1;
865 		pfnseg = mmu.highest_pfn;
866 		minctg = *pgcnt;
867 
868 		if (minctg < lastctgcnt)
869 			startpfn = lo;
870 	}
871 	lastctgcnt = minctg;
872 
873 	ASSERT(pfnseg + 1 >= (uint64_t)minctg);
874 
875 	/* conserve 16m memory - start search above 16m when possible */
876 	if (hi > PFN_16M && startpfn < PFN_16M)
877 		startpfn = PFN_16M;
878 
879 	pfn = startpfn;
880 	if (pfnalign)
881 		pfn = P2ROUNDUP(pfn, pfnalign);
882 
883 	while (pfn + minctg - 1 <= hi) {
884 
885 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
886 		if (plist) {
887 			page_list_concat(&pplist, &plist);
888 			sgllen--;
889 			/*
890 			 * return when contig pages no longer needed
891 			 */
892 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
893 				startpfn = pfn;
894 				CONTIG_UNLOCK();
895 				check_dma(mattr, pplist, *pgcnt);
896 				return (pplist);
897 			}
898 			minctg = howmany(*pgcnt, sgllen);
899 		}
900 		if (pfnalign)
901 			pfn = P2ROUNDUP(pfn, pfnalign);
902 	}
903 
904 	/* cannot find contig pages in specified range */
905 	if (startpfn == lo) {
906 		CONTIG_UNLOCK();
907 		return (NULL);
908 	}
909 
910 	/* did not start with lo previously */
911 	pfn = lo;
912 	if (pfnalign)
913 		pfn = P2ROUNDUP(pfn, pfnalign);
914 
915 	/* allow search to go above startpfn */
916 	while (pfn < startpfn) {
917 
918 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
919 		if (plist != NULL) {
920 
921 			page_list_concat(&pplist, &plist);
922 			sgllen--;
923 
924 			/*
925 			 * return when contig pages no longer needed
926 			 */
927 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
928 				startpfn = pfn;
929 				CONTIG_UNLOCK();
930 				check_dma(mattr, pplist, *pgcnt);
931 				return (pplist);
932 			}
933 			minctg = howmany(*pgcnt, sgllen);
934 		}
935 		if (pfnalign)
936 			pfn = P2ROUNDUP(pfn, pfnalign);
937 	}
938 	CONTIG_UNLOCK();
939 	return (NULL);
940 }
941 
942 /*
943  * combine mem_node_config and memrange memory ranges into one data
944  * structure to be used for page list management.
945  *
946  * mnode_range_cnt() calculates the number of memory ranges for mnode and
947  * memranges[]. Used to determine the size of page lists and mnoderanges.
948  *
949  * mnode_range_setup() initializes mnoderanges.
950  */
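/*
 * For example, a single memory node spanning 0-8G intersects all four
 * arch_memranges[] entries above, so mnode_range_cnt() returns 4 and
 * mnode_range_setup() fills in four mnoderange_t entries for it.
 */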
951 mnoderange_t	*mnoderanges;
952 int		mnoderangecnt;
953 int		mtype4g;
954 
955 int
956 mnode_range_cnt(int mnode)
957 {
958 	int	mri;
959 	int	mnrcnt = 0;
960 
961 	if (mem_node_config[mnode].exists != 0) {
962 		mri = nranges - 1;
963 
964 		/* find the lowest memrange (highest index) that overlaps this mnode */
965 
966 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
967 			mri--;
968 
969 		/*
970 		 * increment mnode range counter when memranges or mnode
971 		 * boundary is reached.
972 		 */
973 		while (mri >= 0 &&
974 		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
975 			mnrcnt++;
976 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
977 				mri--;
978 			else
979 				break;
980 		}
981 	}
982 	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
983 	return (mnrcnt);
984 }
985 
986 void
987 mnode_range_setup(mnoderange_t *mnoderanges)
988 {
989 	int	mnode, mri;
990 
991 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
992 		if (mem_node_config[mnode].exists == 0)
993 			continue;
994 
995 		mri = nranges - 1;
996 
997 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
998 			mri--;
999 
1000 		while (mri >= 0 && mem_node_config[mnode].physmax >=
1001 		    MEMRANGELO(mri)) {
1002 			mnoderanges->mnr_pfnlo =
1003 			    MAX(MEMRANGELO(mri),
1004 				mem_node_config[mnode].physbase);
1005 			mnoderanges->mnr_pfnhi =
1006 			    MIN(MEMRANGEHI(mri),
1007 				mem_node_config[mnode].physmax);
1008 			mnoderanges->mnr_mnode = mnode;
1009 			mnoderanges->mnr_memrange = mri;
1010 			mnoderanges++;
1011 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1012 				mri--;
1013 			else
1014 				break;
1015 		}
1016 	}
1017 }
1018 
1019 /*
1020  * Determine if the mnode range specified in mtype contains memory belonging
1021  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1022  * the range of indices from high pfn to 0, 16m or 4g.
1023  *
1024  * Return the first mnode range type index found; otherwise return -1.
1025  */
1026 int
1027 mtype_func(int mnode, int mtype, uint_t flags)
1028 {
1029 	if (flags & PGI_MT_RANGE) {
1030 		int	mtlim;
1031 
1032 		if (flags & PGI_MT_NEXT)
1033 			mtype--;
1034 		if (flags & PGI_MT_RANGE0)
1035 			mtlim = 0;
1036 		else if (flags & PGI_MT_RANGE4G)
1037 			mtlim = mtype4g + 1;	/* exclude 0-4g range */
1038 		else if (flags & PGI_MT_RANGE16M)
1039 			mtlim = 1;		/* exclude 0-16m range */
1040 		while (mtype >= mtlim) {
1041 			if (mnoderanges[mtype].mnr_mnode == mnode)
1042 				return (mtype);
1043 			mtype--;
1044 		}
1045 	} else {
1046 		if (mnoderanges[mtype].mnr_mnode == mnode)
1047 			return (mtype);
1048 	}
1049 	return (-1);
1050 }
1051 
1052 /*
1053  * Update the page list max counts with the pfn range specified by the
1054  * input parameters.  Called from add_physmem() when physical memory with
1055  * page_t's are initially added to the page lists.
1056  */
1057 void
1058 mtype_modify_max(pfn_t startpfn, long cnt)
1059 {
1060 	int	mtype = 0;
1061 	pfn_t	endpfn = startpfn + cnt, pfn;
1062 	pgcnt_t	inc;
1063 
1064 	ASSERT(cnt > 0);
1065 
1066 	for (pfn = startpfn; pfn < endpfn; ) {
1067 		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
1068 			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
1069 				inc = endpfn - pfn;
1070 			} else {
1071 				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
1072 			}
1073 			mnoderanges[mtype].mnr_mt_pgmax += inc;
1074 			if (physmax4g && mtype <= mtype4g)
1075 				maxmem4g += inc;
1076 			pfn += inc;
1077 		}
1078 		mtype++;
1079 		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
1080 	}
1081 }
1082 
1083 /*
1084  * Returns the free page count for mnode
1085  */
1086 int
1087 mnode_pgcnt(int mnode)
1088 {
1089 	int	mtype = mnoderangecnt - 1;
1090 	int	flags = PGI_MT_RANGE0;
1091 	pgcnt_t	pgcnt = 0;
1092 
1093 	mtype = mtype_func(mnode, mtype, flags);
1094 
1095 	while (mtype != -1) {
1096 		pgcnt += MTYPE_FREEMEM(mtype);
1097 		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1098 	}
1099 	return (pgcnt);
1100 }
1101 
1102 /*
1103  * Initialize page coloring variables based on the l2 cache parameters.
1104  * Calculate and return memory needed for page coloring data structures.
1105  */
1106 size_t
1107 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1108 {
1109 	size_t	colorsz = 0;
1110 	int	i;
1111 	int	colors;
1112 
1113 	/*
1114 	 * Reduce the memory range lists if we don't have large amounts
1115 	 * of memory. This avoids searching known empty free lists.
1116 	 */
1117 	i = memrange_num(physmax);
1118 	memranges += i;
1119 	nranges -= i;
1120 #if defined(__i386)
1121 	if (i > 0)
1122 		restricted_kmemalloc = 0;
1123 #endif
1124 	/* physmax greater than 4g */
1125 	if (i == 0)
1126 		physmax4g = 1;
1127 
1128 	ASSERT(ISP2(l2_sz));
1129 	ASSERT(ISP2(l2_linesz));
1130 	ASSERT(l2_sz > MMU_PAGESIZE);
1131 
1132 	/* l2_assoc is 0 for fully associative l2 cache */
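	/*
	 * e.g. a 2MB 8-way L2 with 4K pages gives
	 * l2_colors = 2MB / (8 * 4KB) = 64.
	 */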
1133 	if (l2_assoc)
1134 		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1135 	else
1136 		l2_colors = 1;
1137 
1138 	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
1139 	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1140 
1141 	/*
1142 	 * cpu_page_colors is non-zero when a page color may be spread across
1143 	 * multiple bins.
1144 	 */
1145 	if (l2_colors < page_colors)
1146 		cpu_page_colors = l2_colors;
1147 
1148 	ASSERT(ISP2(page_colors));
1149 
1150 	page_colors_mask = page_colors - 1;
1151 
1152 	ASSERT(ISP2(CPUSETSIZE()));
1153 	page_coloring_shift = lowbit(CPUSETSIZE());
1154 
1155 	/* initialize number of colors per page size */
1156 	for (i = 0; i <= mmu.max_page_level; i++) {
1157 		hw_page_array[i].hp_size = LEVEL_SIZE(i);
1158 		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1159 		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1160 		hw_page_array[i].hp_colors = (page_colors_mask >>
1161 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1162 		    + 1;
1163 	}
1164 
1165 	/*
1166 	 * The value of cpu_page_colors determines if additional color bins
1167 	 * need to be checked for a particular color in the page_get routines.
1168 	 */
1169 	if (cpu_page_colors != 0) {
1170 
1171 		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1172 		ASSERT(a > 0);
1173 		ASSERT(a < 16);
1174 
1175 		for (i = 0; i <= mmu.max_page_level; i++) {
1176 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
1177 				colorequivszc[i] = 0;
1178 				continue;
1179 			}
1180 			while ((colors >> a) == 0)
1181 				a--;
1182 			ASSERT(a >= 0);
1183 
1184 			/* higher 4 bits encode the color equiv mask */
1185 			colorequivszc[i] = (a << 4);
1186 		}
1187 	}
1188 
1189 	/* factor in colorequiv to check additional 'equivalent' bins. */
1190 	if (colorequiv > 1) {
1191 
1192 		int a = lowbit(colorequiv) - 1;
1193 		if (a > 15)
1194 			a = 15;
1195 
1196 		for (i = 0; i <= mmu.max_page_level; i++) {
1197 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
1198 				continue;
1199 			}
1200 			while ((colors >> a) == 0)
1201 				a--;
1202 			if ((a << 4) > colorequivszc[i]) {
1203 				colorequivszc[i] = (a << 4);
1204 			}
1205 		}
1206 	}
1207 
1208 	/* size for mnoderanges */
1209 	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1210 		mnoderangecnt += mnode_range_cnt(i);
1211 	colorsz = mnoderangecnt * sizeof (mnoderange_t);
1212 
1213 	/* size for fpc_mutex and cpc_mutex */
1214 	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1215 
1216 	/* size of page_freelists */
1217 	colorsz += mnoderangecnt * sizeof (page_t ***);
1218 	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1219 
1220 	for (i = 0; i < mmu_page_sizes; i++) {
1221 		colors = page_get_pagecolors(i);
1222 		colorsz += mnoderangecnt * colors * sizeof (page_t *);
1223 	}
1224 
1225 	/* size of page_cachelists */
1226 	colorsz += mnoderangecnt * sizeof (page_t **);
1227 	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1228 
1229 	return (colorsz);
1230 }
1231 
1232 /*
1233  * Called once at startup to configure page_coloring data structures and
1234  * does the 1st page_free()/page_freelist_add().
1235  */
1236 void
1237 page_coloring_setup(caddr_t pcmemaddr)
1238 {
1239 	int	i;
1240 	int	j;
1241 	int	k;
1242 	caddr_t	addr;
1243 	int	colors;
1244 
1245 	/*
1246 	 * do page coloring setup
1247 	 */
1248 	addr = pcmemaddr;
1249 
1250 	mnoderanges = (mnoderange_t *)addr;
1251 	addr += (mnoderangecnt * sizeof (mnoderange_t));
1252 
1253 	mnode_range_setup(mnoderanges);
1254 
1255 	if (physmax4g)
1256 		mtype4g = pfn_2_mtype(0xfffff);
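	/* 0xfffff above is the highest pfn below 4G (4G / MMU_PAGESIZE - 1) */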
1257 
1258 	for (k = 0; k < NPC_MUTEX; k++) {
1259 		fpc_mutex[k] = (kmutex_t *)addr;
1260 		addr += (max_mem_nodes * sizeof (kmutex_t));
1261 	}
1262 	for (k = 0; k < NPC_MUTEX; k++) {
1263 		cpc_mutex[k] = (kmutex_t *)addr;
1264 		addr += (max_mem_nodes * sizeof (kmutex_t));
1265 	}
1266 	page_freelists = (page_t ****)addr;
1267 	addr += (mnoderangecnt * sizeof (page_t ***));
1268 
1269 	page_cachelists = (page_t ***)addr;
1270 	addr += (mnoderangecnt * sizeof (page_t **));
1271 
1272 	for (i = 0; i < mnoderangecnt; i++) {
1273 		page_freelists[i] = (page_t ***)addr;
1274 		addr += (mmu_page_sizes * sizeof (page_t **));
1275 
1276 		for (j = 0; j < mmu_page_sizes; j++) {
1277 			colors = page_get_pagecolors(j);
1278 			page_freelists[i][j] = (page_t **)addr;
1279 			addr += (colors * sizeof (page_t *));
1280 		}
1281 		page_cachelists[i] = (page_t **)addr;
1282 		addr += (page_colors * sizeof (page_t *));
1283 	}
1284 }
1285 
1286 /*ARGSUSED*/
1287 int
1288 bp_color(struct buf *bp)
1289 {
1290 	return (0);
1291 }
1292 
1293 /*
1294  * get a page from any list with the given mnode
1295  */
1296 page_t *
1297 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
1298     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
1299 {
1300 	kmutex_t		*pcm;
1301 	int			i;
1302 	page_t			*pp;
1303 	page_t			*first_pp;
1304 	uint64_t		pgaddr;
1305 	ulong_t			bin;
1306 	int			mtypestart;
1307 	int			plw_initialized;
1308 	page_list_walker_t	plw;
1309 
1310 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
1311 
1312 	ASSERT((flags & PG_MATCH_COLOR) == 0);
1313 	ASSERT(szc == 0);
1314 	ASSERT(dma_attr != NULL);
1315 
1316 	MTYPE_START(mnode, mtype, flags);
1317 	if (mtype < 0) {
1318 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
1319 		return (NULL);
1320 	}
1321 
1322 	mtypestart = mtype;
1323 
1324 	bin = origbin;
1325 
1326 	/*
1327 	 * check up to page_colors + 1 bins - origbin may be checked twice
1328 	 * because of BIN_STEP skip
1329 	 */
1330 	do {
1331 		plw_initialized = 0;
1332 
1333 		for (plw.plw_count = 0;
1334 		    plw.plw_count < page_colors; plw.plw_count++) {
1335 
1336 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
1337 				goto nextfreebin;
1338 
1339 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1340 			mutex_enter(pcm);
1341 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
1342 			first_pp = pp;
1343 			while (pp != NULL) {
1344 				if (page_trylock(pp, SE_EXCL) == 0) {
1345 					pp = pp->p_next;
1346 					if (pp == first_pp) {
1347 						pp = NULL;
1348 					}
1349 					continue;
1350 				}
1351 
1352 				ASSERT(PP_ISFREE(pp));
1353 				ASSERT(PP_ISAGED(pp));
1354 				ASSERT(pp->p_vnode == NULL);
1355 				ASSERT(pp->p_hash == NULL);
1356 				ASSERT(pp->p_offset == (u_offset_t)-1);
1357 				ASSERT(pp->p_szc == szc);
1358 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1359 				/* check if page within DMA attributes */
1360 				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));
1361 
1362 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1363 				    (pgaddr + MMU_PAGESIZE - 1 <=
1364 				    dma_attr->dma_attr_addr_hi)) {
1365 					break;
1366 				}
1367 
1368 				/* continue looking */
1369 				page_unlock(pp);
1370 				pp = pp->p_next;
1371 				if (pp == first_pp)
1372 					pp = NULL;
1373 
1374 			}
1375 			if (pp != NULL) {
1376 				ASSERT(mtype == PP_2_MTYPE(pp));
1377 				ASSERT(pp->p_szc == 0);
1378 
1379 				/* found a page with specified DMA attributes */
1380 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
1381 				    mtype), pp);
1382 				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1383 
1384 				if ((PP_ISFREE(pp) == 0) ||
1385 				    (PP_ISAGED(pp) == 0)) {
1386 					cmn_err(CE_PANIC, "page %p is not free",
1387 					    (void *)pp);
1388 				}
1389 
1390 				mutex_exit(pcm);
1391 				check_dma(dma_attr, pp, 1);
1392 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1393 				return (pp);
1394 			}
1395 			mutex_exit(pcm);
1396 nextfreebin:
1397 			if (plw_initialized == 0) {
1398 				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
1399 				ASSERT(plw.plw_ceq_dif == page_colors);
1400 				plw_initialized = 1;
1401 			}
1402 
1403 			if (plw.plw_do_split) {
1404 				pp = page_freelist_split(szc, bin, mnode,
1405 				    mtype,
1406 				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
1407 				    &plw);
1408 				if (pp != NULL)
1409 					return (pp);
1410 			}
1411 
1412 			bin = page_list_walk_next_bin(szc, bin, &plw);
1413 		}
1414 
1415 		MTYPE_NEXT(mnode, mtype, flags);
1416 	} while (mtype >= 0);
1417 
1418 	/* failed to find a page in the freelist; try it in the cachelist */
1419 
1420 	/* reset mtype start for cachelist search */
1421 	mtype = mtypestart;
1422 	ASSERT(mtype >= 0);
1423 
1424 	/* start with the bin of matching color */
1425 	bin = origbin;
1426 
1427 	do {
1428 		for (i = 0; i <= page_colors; i++) {
1429 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
1430 				goto nextcachebin;
1431 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
1432 			mutex_enter(pcm);
1433 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
1434 			first_pp = pp;
1435 			while (pp != NULL) {
1436 				if (page_trylock(pp, SE_EXCL) == 0) {
1437 					pp = pp->p_next;
1438 					if (pp == first_pp)
1439 						break;
1440 					continue;
1441 				}
1442 				ASSERT(pp->p_vnode);
1443 				ASSERT(PP_ISAGED(pp) == 0);
1444 				ASSERT(pp->p_szc == 0);
1445 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1446 
1447 				/* check if page within DMA attributes */
1448 
1449 				pgaddr = ptob((uint64_t)(pp->p_pagenum));
1450 
1451 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1452 				    (pgaddr + MMU_PAGESIZE - 1 <=
1453 				    dma_attr->dma_attr_addr_hi)) {
1454 					break;
1455 				}
1456 
1457 				/* continue looking */
1458 				page_unlock(pp);
1459 				pp = pp->p_next;
1460 				if (pp == first_pp)
1461 					pp = NULL;
1462 			}
1463 
1464 			if (pp != NULL) {
1465 				ASSERT(mtype == PP_2_MTYPE(pp));
1466 				ASSERT(pp->p_szc == 0);
1467 
1468 				/* found a page with specified DMA attributes */
1469 				page_sub(&PAGE_CACHELISTS(mnode, bin,
1470 				    mtype), pp);
1471 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
1472 
1473 				mutex_exit(pcm);
1474 				ASSERT(pp->p_vnode);
1475 				ASSERT(PP_ISAGED(pp) == 0);
1476 				check_dma(dma_attr, pp, 1);
1477 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1478 				return (pp);
1479 			}
1480 			mutex_exit(pcm);
1481 nextcachebin:
1482 			bin += (i == 0) ? BIN_STEP : 1;
1483 			bin &= page_colors_mask;
1484 		}
1485 		MTYPE_NEXT(mnode, mtype, flags);
1486 	} while (mtype >= 0);
1487 
1488 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
1489 	return (NULL);
1490 }
1491 
1492 /*
1493  * This function is similar to page_get_freelist()/page_get_cachelist()
1494  * but it searches both lists to find a page with the specified
1495  * color (or no color) and DMA attributes. The search is done in the
1496  * freelist first and then in the cache list within the highest memory
1497  * range (based on DMA attributes) before searching in the lower
1498  * memory ranges.
1499  *
1500  * Note: This function is called only by page_create_io().
1501  */
1502 /*ARGSUSED*/
1503 page_t *
1504 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
1505     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
1506 {
1507 	uint_t		bin;
1508 	int		mtype;
1509 	page_t		*pp;
1510 	int		n;
1511 	int		m;
1512 	int		szc;
1513 	int		fullrange;
1514 	int		mnode;
1515 	int		local_failed_stat = 0;
1516 	lgrp_mnode_cookie_t	lgrp_cookie;
1517 
1518 	VM_STAT_ADD(pga_vmstats.pga_alloc);
1519 
1520 	/* only base pagesize currently supported */
1521 	if (size != MMU_PAGESIZE)
1522 		return (NULL);
1523 
1524 	/*
1525 	 * If we're passed a specific lgroup, we use it.  Otherwise,
1526 	 * assume first-touch placement is desired.
1527 	 */
1528 	if (!LGRP_EXISTS(lgrp))
1529 		lgrp = lgrp_home_lgrp();
1530 
1531 	/* LINTED */
1532 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
1533 
1534 	/*
1535 	 * Only hold one freelist or cachelist lock at a time, that way we
1536 	 * can start anywhere and not have to worry about lock
1537 	 * ordering.
1538 	 */
1539 	if (dma_attr == NULL) {
1540 		n = 0;
1541 		m = mnoderangecnt - 1;
1542 		fullrange = 1;
1543 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
1544 	} else {
1545 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
1546 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
1547 
1548 		/*
1549 		 * We can guarantee alignment only to a page boundary.
1550 		 */
1551 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
1552 			return (NULL);
1553 
1554 		n = pfn_2_mtype(pfnlo);
1555 		m = pfn_2_mtype(pfnhi);
1556 
1557 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
1558 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
1559 	}
1560 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
1561 
1562 	if (n > m)
1563 		return (NULL);
1564 
1565 	szc = 0;
1566 
1567 	/* cycling thru mtype handled by RANGE0 if n == 0 */
1568 	if (n == 0) {
1569 		flags |= PGI_MT_RANGE0;
1570 		n = m;
1571 	}
1572 
1573 	/*
1574 	 * Try local memory node first, but try remote if we can't
1575 	 * get a page of the right color.
1576 	 */
1577 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
1578 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
1579 		/*
1580 		 * allocate pages from high pfn to low.
1581 		 */
1582 		for (mtype = m; mtype >= n; mtype--) {
1583 			if (fullrange != 0) {
1584 				pp = page_get_mnode_freelist(mnode,
1585 				    bin, mtype, szc, flags);
1586 				if (pp == NULL) {
1587 					pp = page_get_mnode_cachelist(
1588 						bin, flags, mnode, mtype);
1589 				}
1590 			} else {
1591 				pp = page_get_mnode_anylist(bin, szc,
1592 				    flags, mnode, mtype, dma_attr);
1593 			}
1594 			if (pp != NULL) {
1595 				VM_STAT_ADD(pga_vmstats.pga_allocok);
1596 				check_dma(dma_attr, pp, 1);
1597 				return (pp);
1598 			}
1599 		}
1600 		if (!local_failed_stat) {
1601 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
1602 			local_failed_stat = 1;
1603 		}
1604 	}
1605 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
1606 
1607 	return (NULL);
1608 }
1609 
1610 /*
1611  * page_create_io()
1612  *
1613  * This function is a copy of page_create_va() with an additional
1614  * argument 'mattr' that specifies DMA memory requirements to
1615  * the page list functions. This function is used by the segkmem
1616  * allocator, so it is used only to create new pages (i.e. PG_EXCL is
1617  * set).
1618  *
1619  * Note: This interface is currently used by x86 PSM only and is
1620  *	 not fully specified so the commitment level is only for
1621  *	 not fully specified, so the commitment level is that of a private
1622  *	 interface specific to x86. This interface uses the PSM-specific
1623  *	 page_get_anylist() interface.
1624 
1625 #define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
1626 	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
1627 		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
1628 			break; \
1629 	} \
1630 }
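/*
 * PAGE_HASH_SEARCH walks the page hash chain at 'index' looking for the page
 * that matches (vp, off); page_create_io() below calls it with the
 * corresponding PAGE_HASH_MUTEX held.
 */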
1631 
1632 
1633 page_t *
1634 page_create_io(
1635 	struct vnode	*vp,
1636 	u_offset_t	off,
1637 	uint_t		bytes,
1638 	uint_t		flags,
1639 	struct as	*as,
1640 	caddr_t		vaddr,
1641 	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
1642 {
1643 	page_t		*plist = NULL;
1644 	uint_t		plist_len = 0;
1645 	pgcnt_t		npages;
1646 	page_t		*npp = NULL;
1647 	uint_t		pages_req;
1648 	page_t		*pp;
1649 	kmutex_t	*phm = NULL;
1650 	uint_t		index;
1651 
1652 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
1653 		"page_create_start:vp %p off %llx bytes %u flags %x",
1654 		vp, off, bytes, flags);
1655 
1656 	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
1657 
1658 	pages_req = npages = mmu_btopr(bytes);
1659 
1660 	/*
1661 	 * Do the freemem and pcf accounting.
1662 	 */
1663 	if (!page_create_wait(npages, flags)) {
1664 		return (NULL);
1665 	}
1666 
1667 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
1668 		"page_create_success:vp %p off %llx",
1669 		vp, off);
1670 
1671 	/*
1672 	 * If satisfying this request has left us with too little
1673 	 * memory, start the wheels turning to get some back.  The
1674 	 * first clause of the test prevents waking up the pageout
1675 	 * daemon in situations where it would decide that there's
1676 	 * nothing to do.
1677 	 */
1678 	if (nscan < desscan && freemem < minfree) {
1679 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
1680 			"pageout_cv_signal:freemem %ld", freemem);
1681 		cv_signal(&proc_pageout->p_cv);
1682 	}
1683 
1684 	if (flags & PG_PHYSCONTIG) {
1685 
1686 		plist = page_get_contigpage(&npages, mattr, 1);
1687 		if (plist == NULL) {
1688 			page_create_putback(npages);
1689 			return (NULL);
1690 		}
1691 
1692 		pp = plist;
1693 
1694 		do {
1695 			if (!page_hashin(pp, vp, off, NULL)) {
1696 				panic("pg_creat_io: hashin failed %p %p %llx",
1697 				    (void *)pp, (void *)vp, off);
1698 			}
1699 			VM_STAT_ADD(page_create_new);
1700 			off += MMU_PAGESIZE;
1701 			PP_CLRFREE(pp);
1702 			PP_CLRAGED(pp);
1703 			page_set_props(pp, P_REF);
1704 			pp = pp->p_next;
1705 		} while (pp != plist);
1706 
1707 		if (!npages) {
1708 			check_dma(mattr, plist, pages_req);
1709 			return (plist);
1710 		} else {
1711 			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
1712 		}
1713 
1714 		/*
1715 		 * fall-thru:
1716 		 *
1717 		 * page_get_contigpage returns when npages <= sgllen.
1718 		 * Grab the rest of the non-contig pages below from anylist.
1719 		 */
1720 	}
1721 
1722 	/*
1723 	 * Loop around collecting the requested number of pages.
1724 	 * Most of the time, we have to `create' a new page. With
1725 	 * this in mind, pull the page off the free list before
1726 	 * getting the hash lock.  This will minimize the hash
1727 	 * lock hold time, nesting, and the like.  If it turns
1728 	 * out we don't need the page, we put it back at the end.
1729 	 */
1730 	while (npages--) {
1731 		phm = NULL;
1732 
1733 		index = PAGE_HASH_FUNC(vp, off);
1734 top:
1735 		ASSERT(phm == NULL);
1736 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
1737 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1738 
1739 		if (npp == NULL) {
1740 			/*
1741 			 * Try to get the page of any color either from
1742 			 * the freelist or from the cache list.
1743 			 */
1744 			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
1745 			    flags & ~PG_MATCH_COLOR, mattr, NULL);
1746 			if (npp == NULL) {
1747 				if (mattr == NULL) {
1748 					/*
1749 					 * Not looking for a special page;
1750 					 * panic!
1751 					 */
1752 					panic("no page found %d", (int)npages);
1753 				}
1754 				/*
1755 				 * No page found! This can happen
1756 				 * if we are looking for a page
1757 				 * within a specific memory range
1758 				 * for DMA purposes. If PG_WAIT is
1759 				 * specified then we wait for a
1760 				 * while and then try again. The
1761 				 * wait could be forever if we
1762 				 * don't get the page(s) we need.
1763 				 *
1764 				 * Note: XXX We really need a mechanism
1765 				 * to wait for pages in the desired
1766 				 * range. For now, we wait for any
1767 				 * pages and see if we can use them.
1768 				 */
1769 
1770 				if ((mattr != NULL) && (flags & PG_WAIT)) {
1771 					delay(10);
1772 					goto top;
1773 				}
1774 
1775 				goto fail; /* undo accounting stuff */
1776 			}
1777 
1778 			if (PP_ISAGED(npp) == 0) {
1779 				/*
1780 				 * Since this page came from the
1781 				 * cachelist, we must destroy the
1782 				 * old vnode association.
1783 				 */
1784 				page_hashout(npp, (kmutex_t *)NULL);
1785 			}
1786 		}
1787 
1788 		/*
1789 		 * We own this page!
1790 		 */
1791 		ASSERT(PAGE_EXCL(npp));
1792 		ASSERT(npp->p_vnode == NULL);
1793 		ASSERT(!hat_page_is_mapped(npp));
1794 		PP_CLRFREE(npp);
1795 		PP_CLRAGED(npp);
1796 
1797 		/*
1798 		 * Here we have a page in our hot little mitts and are
1799 		 * just waiting to stuff it on the appropriate lists.
1800 		 * Get the mutex and check to see if it really does
1801 		 * not exist.
1802 		 */
1803 		phm = PAGE_HASH_MUTEX(index);
1804 		mutex_enter(phm);
1805 		PAGE_HASH_SEARCH(index, pp, vp, off);
1806 		if (pp == NULL) {
1807 			VM_STAT_ADD(page_create_new);
1808 			pp = npp;
1809 			npp = NULL;
1810 			if (!page_hashin(pp, vp, off, phm)) {
1811 				/*
1812 				 * Since we hold the page hash mutex and
1813 				 * just searched for this page, page_hashin
1814 				 * had better not fail.  If it does, that
1815 				 * means some thread did not follow the
1816 				 * page hash mutex rules.  Panic now and
1817 				 * get it over with.  As usual, go down
1818 				 * holding all the locks.
1819 				 */
1820 				ASSERT(MUTEX_HELD(phm));
1821 				panic("page_create: hashin fail %p %p %llx %p",
1822 				    (void *)pp, (void *)vp, off, (void *)phm);
1823 
1824 			}
1825 			ASSERT(MUTEX_HELD(phm));
1826 			mutex_exit(phm);
1827 			phm = NULL;
1828 
1829 			/*
1830 			 * Hat layer locking need not be done to set
1831 			 * the following bits since the page is not hashed
1832 			 * and was on the free list (i.e., had no mappings).
1833 			 *
1834 			 * Set the reference bit to protect
1835 			 * against immediate pageout
1836 			 *
1837 			 * XXXmh modify freelist code to set reference
1838 			 * bit so we don't have to do it here.
1839 			 */
1840 			page_set_props(pp, P_REF);
1841 		} else {
1842 			ASSERT(MUTEX_HELD(phm));
1843 			mutex_exit(phm);
1844 			phm = NULL;
1845 			/*
1846 			 * NOTE: This should not happen for pages associated
1847 			 *	 with kernel vnode 'kvp'.
1848 			 */
1849 			/* XX64 - to debug why this happens! */
1850 			ASSERT(vp != &kvp);
1851 			if (vp == &kvp)
1852 				cmn_err(CE_NOTE,
1853 				    "page_create: page not expected "
1854 				    "in hash list for kernel vnode - pp 0x%p",
1855 				    (void *)pp);
1856 			VM_STAT_ADD(page_create_exists);
1857 			goto fail;
1858 		}
1859 
1860 		/*
1861 		 * Got a page!  It is locked.  Acquire the i/o
1862 		 * lock since we are going to use the p_next and
1863 		 * p_prev fields to link the requested pages together.
1864 		 */
1865 		page_io_lock(pp);
1866 		page_add(&plist, pp);
1867 		plist = plist->p_next;
1868 		off += MMU_PAGESIZE;
1869 		vaddr += MMU_PAGESIZE;
1870 	}
1871 
1872 	check_dma(mattr, plist, pages_req);
1873 	return (plist);
1874 
1875 fail:
1876 	if (npp != NULL) {
1877 		/*
1878 		 * Did not need this page after all.
1879 		 * Put it back on the free list.
1880 		 */
1881 		VM_STAT_ADD(page_create_putbacks);
1882 		PP_SETFREE(npp);
1883 		PP_SETAGED(npp);
1884 		npp->p_offset = (u_offset_t)-1;
1885 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
1886 		page_unlock(npp);
1887 	}
1888 
1889 	/*
1890 	 * Give up the pages we already got.
1891 	 */
1892 	while (plist != NULL) {
1893 		pp = plist;
1894 		page_sub(&plist, pp);
1895 		page_io_unlock(pp);
1896 		plist_len++;
1897 		/*LINTED: constant in conditional ctx*/
1898 		VN_DISPOSE(pp, B_INVAL, 0, kcred);
1899 	}
1900 
1901 	/*
1902 	 * VN_DISPOSE does freemem accounting for the pages in plist
1903 	 * by calling page_free. So, we need to undo the pcf accounting
1904 	 * for only the remaining pages.
1905 	 */
1906 	VM_STAT_ADD(page_create_putbacks);
1907 	page_create_putback(pages_req - plist_len);
1908 
1909 	return (NULL);
1910 }
1911 
1912 
1913 /*
1914  * Copy the data from the physical page represented by "frompp" to
1915  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
1916  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
1917  * level and no one sleeps with an active mapping there.
1918  *
1919  * Note that the ref/mod bits in the page_t's are not affected by
1920  * this operation, hence it is up to the caller to update them appropriately.
1921  */
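/*
 * Two paths: with segkpm enabled the pages already have permanent kernel
 * mappings (hat_kpm_page2va); otherwise the per-CPU cpu_caddr1/cpu_caddr2
 * scratch addresses are temporarily remapped, serialized by cpu_ppaddr_mutex.
 */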
1922 void
1923 ppcopy(page_t *frompp, page_t *topp)
1924 {
1925 	caddr_t		pp_addr1;
1926 	caddr_t		pp_addr2;
1927 	void		*pte1;
1928 	void		*pte2;
1929 	kmutex_t	*ppaddr_mutex;
1930 
1931 	ASSERT_STACK_ALIGNED();
1932 	ASSERT(PAGE_LOCKED(frompp));
1933 	ASSERT(PAGE_LOCKED(topp));
1934 
1935 	if (kpm_enable) {
1936 		pp_addr1 = hat_kpm_page2va(frompp, 0);
1937 		pp_addr2 = hat_kpm_page2va(topp, 0);
1938 		kpreempt_disable();
1939 	} else {
1940 		/*
1941 		 * disable preemption so that the CPU can't change
1942 		 */
1943 		kpreempt_disable();
1944 
1945 		pp_addr1 = CPU->cpu_caddr1;
1946 		pp_addr2 = CPU->cpu_caddr2;
1947 		pte1 = (void *)CPU->cpu_caddr1pte;
1948 		pte2 = (void *)CPU->cpu_caddr2pte;
1949 
1950 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1951 		mutex_enter(ppaddr_mutex);
1952 
1953 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
1954 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
1955 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
1956 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1957 		    HAT_LOAD_NOCONSIST);
1958 	}
1959 
1960 	if (use_sse_pagecopy)
1961 		hwblkpagecopy(pp_addr1, pp_addr2);
1962 	else
1963 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
1964 
1965 	if (!kpm_enable)
1966 		mutex_exit(ppaddr_mutex);
1967 	kpreempt_enable();
1968 }
1969 
1970 /*
1971  * Zero the physical page from off to off + len given by `pp'
1972  * without changing the reference and modified bits of page.
1973  *
1974  * We do this using the CPU private page address #2; see ppcopy() for more info.
1975  * pagezero() must not be called at interrupt level.
1976  */
1977 void
1978 pagezero(page_t *pp, uint_t off, uint_t len)
1979 {
1980 	caddr_t		pp_addr2;
1981 	void		*pte2;
1982 	kmutex_t	*ppaddr_mutex;
1983 
1984 	ASSERT_STACK_ALIGNED();
1985 	ASSERT(len <= MMU_PAGESIZE);
1986 	ASSERT(off <= MMU_PAGESIZE);
1987 	ASSERT(off + len <= MMU_PAGESIZE);
1988 	ASSERT(PAGE_LOCKED(pp));
1989 
1990 	if (kpm_enable) {
1991 		pp_addr2 = hat_kpm_page2va(pp, 0);
1992 		kpreempt_disable();
1993 	} else {
1994 		kpreempt_disable();
1995 
1996 		pp_addr2 = CPU->cpu_caddr2;
1997 		pte2 = (void *)CPU->cpu_caddr2pte;
1998 
1999 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
2000 		mutex_enter(ppaddr_mutex);
2001 
2002 		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
2003 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
2004 		    HAT_LOAD_NOCONSIST);
2005 	}
2006 
2007 	if (use_sse_pagezero)
2008 		hwblkclr(pp_addr2 + off, len);
2009 	else
2010 		bzero(pp_addr2 + off, len);
2011 
2012 	if (!kpm_enable)
2013 		mutex_exit(ppaddr_mutex);
2014 	kpreempt_enable();
2015 }
2016 
2017 /*
2018  * Platform-dependent page scrub call.
2019  */
2020 void
2021 pagescrub(page_t *pp, uint_t off, uint_t len)
2022 {
2023 	/*
2024 	 * For now, we rely on the fact that pagezero() will
2025 	 * always clear UEs.
2026 	 */
2027 	pagezero(pp, off, len);
2028 }
2029 
2030 /*
2031  * set up two private addresses on a given CPU for use in ppcopy()
2032  */
2033 void
2034 setup_vaddr_for_ppcopy(struct cpu *cpup)
2035 {
2036 	void *addr;
2037 	void *pte;
2038 
2039 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
2040 	pte = hat_mempte_setup(addr);
2041 	cpup->cpu_caddr1 = addr;
2042 	cpup->cpu_caddr1pte = (pteptr_t)pte;
2043 
2044 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
2045 	pte = hat_mempte_setup(addr);
2046 	cpup->cpu_caddr2 = addr;
2047 	cpup->cpu_caddr2pte = (pteptr_t)pte;
2048 
2049 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
2050 }
2051 
2052 
2053 /*
2054  * Create the pageout scanner thread.  The thread starts executing
2055  * 'procedure' in process pp at priority pri.
2056  */
2057 void
2058 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
2059 {
2060 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
2061 }
2062 
2063 /*
2064  * Function for flushing D-cache when performing module relocations
2065  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
2066  */
2067 void
2068 dcache_flushall()
2069 {}
2070