xref: /titanic_50/usr/src/uts/i86pc/vm/vm_machdep.c (revision 26706799998ca499307df9f2515e8be432a95eda)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved   */
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 /*
38  * UNIX machine dependent virtual memory support.
39  */
40 
41 #include <sys/types.h>
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/user.h>
45 #include <sys/proc.h>
46 #include <sys/kmem.h>
47 #include <sys/vmem.h>
48 #include <sys/buf.h>
49 #include <sys/cpuvar.h>
50 #include <sys/lgrp.h>
51 #include <sys/disp.h>
52 #include <sys/vm.h>
53 #include <sys/mman.h>
54 #include <sys/vnode.h>
55 #include <sys/cred.h>
56 #include <sys/exec.h>
57 #include <sys/exechdr.h>
58 #include <sys/debug.h>
59 
60 #include <vm/hat.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_kp.h>
64 #include <vm/seg_vn.h>
65 #include <vm/page.h>
66 #include <vm/seg_kmem.h>
67 #include <vm/seg_kpm.h>
68 #include <vm/vm_dep.h>
69 
70 #include <sys/cpu.h>
71 #include <sys/vm_machparam.h>
72 #include <sys/memlist.h>
73 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
74 #include <vm/hat_i86.h>
75 #include <sys/x86_archext.h>
76 #include <sys/elf_386.h>
77 #include <sys/cmn_err.h>
78 #include <sys/archsystm.h>
79 #include <sys/machsystm.h>
80 
81 #include <sys/vtrace.h>
82 #include <sys/ddidmareq.h>
83 #include <sys/promif.h>
84 #include <sys/memnode.h>
85 #include <sys/stack.h>
86 
87 uint_t vac_colors = 0;
88 
89 int largepagesupport = 0;
90 extern uint_t page_create_new;
91 extern uint_t page_create_exists;
92 extern uint_t page_create_putbacks;
93 extern uint_t page_create_putbacks;
94 extern uintptr_t eprom_kernelbase;
95 extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */
96 
97 /* 4g memory management */
98 pgcnt_t		maxmem4g;
99 pgcnt_t		freemem4g;
100 int		physmax4g;
101 int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
102 int		lotsfree4gshift = 3;
103 
104 /* 16m memory management: desired number of free pages below 16m. */
105 pgcnt_t		desfree16m = 0x380;
106 
107 #ifdef VM_STATS
108 struct {
109 	ulong_t	pga_alloc;
110 	ulong_t	pga_notfullrange;
111 	ulong_t	pga_nulldmaattr;
112 	ulong_t	pga_allocok;
113 	ulong_t	pga_allocfailed;
114 	ulong_t	pgma_alloc;
115 	ulong_t	pgma_allocok;
116 	ulong_t	pgma_allocfailed;
117 	ulong_t	pgma_allocempty;
118 } pga_vmstats;
119 #endif
120 
121 uint_t mmu_page_sizes;
122 
123 /* How many page sizes the users can see */
124 uint_t mmu_exported_page_sizes;
125 
126 size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
127 /*
128  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
129  * fewer than this many pages.
130  */
131 pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
132 
133 /*
134  * Return the optimum page size for a given mapping
135  */
136 /*ARGSUSED*/
137 size_t
138 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
139 {
140 	level_t l;
141 
142 	if (remap)
143 		*remap = 0;
144 
145 	switch (maptype) {
146 
147 	case MAPPGSZ_STK:
148 	case MAPPGSZ_HEAP:
149 	case MAPPGSZ_VA:
150 		/*
151 		 * use the pages size that best fits len
152 		 */
153 		for (l = mmu.max_page_level; l > 0; --l) {
154 			if (len < LEVEL_SIZE(l))
155 				continue;
156 			break;
157 		}
158 		return (LEVEL_SIZE(l));
159 
160 	/*
161 	 * for ISM use the 1st large page size.
162 	 */
163 	case MAPPGSZ_ISM:
164 		if (mmu.max_page_level == 0)
165 			return (MMU_PAGESIZE);
166 		return (LEVEL_SIZE(1));
167 	}
168 	return (0);
169 }
170 
171 /*
172  * This can be patched via /etc/system to allow large pages
173  * to be used for mapping application and libraries text segments.
174  */
175 int	use_text_largepages = 0;
176 
177 /*
178  * Return a bit vector of large page size codes that
179  * can be used to map [addr, addr + len) region.
180  */
181 
182 /*ARGSUSED*/
183 uint_t
184 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
185 {
186 	size_t	pgsz;
187 	caddr_t a;
188 
189 	if (!text || !use_text_largepages ||
190 	    mmu.max_page_level == 0)
191 		return (0);
192 
193 	pgsz = LEVEL_SIZE(1);
194 	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
195 	if (a < addr || a >= addr + len) {
196 		return (0);
197 	}
198 	len -= (a - addr);
199 	if (len < pgsz) {
200 		return (0);
201 	}
202 	return (1 << 1);
203 }
204 
205 /*
206  * Handle a pagefault.
207  */
208 faultcode_t
209 pagefault(
210 	caddr_t addr,
211 	enum fault_type type,
212 	enum seg_rw rw,
213 	int iskernel)
214 {
215 	struct as *as;
216 	struct hat *hat;
217 	struct proc *p;
218 	kthread_t *t;
219 	faultcode_t res;
220 	caddr_t base;
221 	size_t len;
222 	int err;
223 	int mapped_red;
224 	uintptr_t ea;
225 
226 	ASSERT_STACK_ALIGNED();
227 
228 	if (INVALID_VADDR(addr))
229 		return (FC_NOMAP);
230 
231 	mapped_red = segkp_map_red();
232 
233 	if (iskernel) {
234 		as = &kas;
235 		hat = as->a_hat;
236 	} else {
237 		t = curthread;
238 		p = ttoproc(t);
239 		as = p->p_as;
240 		hat = as->a_hat;
241 	}
242 
243 	/*
244 	 * Dispatch pagefault.
245 	 */
246 	res = as_fault(hat, as, addr, 1, type, rw);
247 
248 	/*
249 	 * If this isn't a potential unmapped hole in the user's
250 	 * UNIX data or stack segments, just return status info.
251 	 */
252 	if (res != FC_NOMAP || iskernel)
253 		goto out;
254 
255 	/*
256 	 * Check to see if we happened to faulted on a currently unmapped
257 	 * part of the UNIX data or stack segments.  If so, create a zfod
258 	 * mapping there and then try calling the fault routine again.
259 	 */
260 	base = p->p_brkbase;
261 	len = p->p_brksize;
262 
263 	if (addr < base || addr >= base + len) {		/* data seg? */
264 		base = (caddr_t)p->p_usrstack - p->p_stksize;
265 		len = p->p_stksize;
266 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
267 			/* not in either UNIX data or stack segments */
268 			res = FC_NOMAP;
269 			goto out;
270 		}
271 	}
272 
273 	/*
274 	 * the rest of this function implements a 3.X 4.X 5.X compatibility
275 	 * This code is probably not needed anymore
276 	 */
277 	if (p->p_model == DATAMODEL_ILP32) {
278 
279 		/* expand the gap to the page boundaries on each side */
280 		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
281 		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
282 		len = ea - (uintptr_t)base;
283 
284 		as_rangelock(as);
285 		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
286 		    0) {
287 			err = as_map(as, base, len, segvn_create, zfod_argsp);
288 			as_rangeunlock(as);
289 			if (err) {
290 				res = FC_MAKE_ERR(err);
291 				goto out;
292 			}
293 		} else {
294 			/*
295 			 * This page is already mapped by another thread after
296 			 * we returned from as_fault() above.  We just fall
297 			 * through as_fault() below.
298 			 */
299 			as_rangeunlock(as);
300 		}
301 
302 		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
303 	}
304 
305 out:
306 	if (mapped_red)
307 		segkp_unmap_red();
308 
309 	return (res);
310 }
311 
312 void
313 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
314 {
315 	struct proc *p = curproc;
316 	caddr_t userlimit = (flags & _MAP_LOW32) ?
317 	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
318 
319 	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
320 }
321 
322 /*ARGSUSED*/
323 int
324 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
325 {
326 	return (0);
327 }
328 
/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below userlimit.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  The hint is ignored, except that when
 *	MAP_ALIGN is set in flags, *addrp is taken as the requested
 *	alignment if it is larger than the alignment chosen here.
 *
 *	On output it is NULL if no address can be found in the current
 *	processes address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * len is the requested length; it is rounded up to a page multiple.
 * off is the object offset; the returned address has the same low-order
 *	bits as off within the chosen alignment.
 * vacalign is not used on x86 (it's for virtually addressed caches).
 * userlimit is the upper bound of the search.
 * p is the process whose address space is searched.
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	/* the search range starts at the base of the heap */
	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 *
			 * The search range stops short of the stack by the
			 * page-rounded RLIMIT_STACK value.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	/* round the request up to a whole number of pages */
	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		/* largest page size that is no bigger than the request */
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	/*
	 * With MAP_ALIGN the caller's *addrp is an alignment request; it
	 * only ever increases the alignment.
	 * NOTE(review): the masking below assumes align_amount is a power
	 * of two; nothing here validates the caller's value -- confirm the
	 * callers guarantee it.
	 */
	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	/* leave slack so the result can be moved up to an aligned address */
	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}
466 
/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 *
 * basep and lenp are value/result parameters.  dir (AH_LO or otherwise)
 * is only consulted on amd64, to choose which side of the VA hole to keep
 * when the range spans the entire hole.
 *
 * Returns 1 if a valid range was found, 0 otherwise.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 *
	 * NOTE(review): only *lenp is trimmed here; hi keeps its wrapped
	 * value, which is what the amd64 hole checks below then operate
	 * on -- confirm this is intended.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				/* range ends inside the hole: clip the top */
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		/* range starts inside the hole: clip the bottom */
		if (lo < hole_end)
			lo = hole_end;
	}

	/* re-check the length after clipping against the hole */
	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}
543 
544 /*
545  * Determine whether [addr, addr+len] are valid user addresses.
546  */
547 /*ARGSUSED*/
548 int
549 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
550     caddr_t userlimit)
551 {
552 	caddr_t eaddr = addr + len;
553 
554 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
555 		return (RANGE_BADADDR);
556 
557 #if defined(__amd64)
558 	/*
559 	 * Check for the VA hole
560 	 */
561 	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
562 		return (RANGE_BADADDR);
563 #endif
564 
565 	return (RANGE_OKAY);
566 }
567 
568 /*
569  * Return 1 if the page frame is onboard memory, else 0.
570  */
571 int
572 pf_is_memory(pfn_t pf)
573 {
574 	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
575 }
576 
577 
/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

/*
 * Both arrays are laid out by page_coloring_setup():
 * page_freelists[mnoderange][page size][color] and
 * page_cachelists[mnoderange][color].
 */
page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 *
 * Each entry is the lowest pfn of its range; entries are ordered from the
 * highest range down to the lowest.
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
    0x100000,	/* pfn range for 4G and above */
    0x80000,	/* pfn range for 2G-4G */
    0x01000,	/* pfn range for 16M-2G */
    0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 * page_coloring_init() advances memranges and shrinks nranges to skip
 * ranges above physmax.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 1;	/* XX64 re-examine with PSARC 2004/405 */
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

/*
 * Per-slot mutex arrays (one kmutex_t per memory node in each slot),
 * pointed into the page coloring memory by page_coloring_setup().
 */
kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];
636 
637 
638 /*
639  * return the memrange containing pfn
640  */
641 int
642 memrange_num(pfn_t pfn)
643 {
644 	int n;
645 
646 	for (n = 0; n < nranges - 1; ++n) {
647 		if (pfn >= memranges[n])
648 			break;
649 	}
650 	return (n);
651 }
652 
653 /*
654  * return the mnoderange containing pfn
655  */
656 int
657 pfn_2_mtype(pfn_t pfn)
658 {
659 	int	n;
660 
661 	for (n = mnoderangecnt - 1; n >= 0; n--) {
662 		if (pfn >= mnoderanges[n].mnr_pfnlo) {
663 			break;
664 		}
665 	}
666 	return (n);
667 }
668 
/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 *
 *	pgcnt is the number of pages still wanted; on success it is reduced
 *	by the number of pages returned.
 *	pfnseg is the DMA segment mask in page frames; a run never extends
 *	across a pfn whose masked value is zero.
 *	iolock, if non-zero, causes each returned page to be page_io_lock'ed.
 *
 *	Each returned page is held SE_EXCL and has been removed from its
 *	free or cache list.  Returns NULL if fewer than minctg contiguous
 *	free pages were found; any pages gathered are then put back on the
 *	free list and unlocked.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			/* no page_t or page busy: skip past this pfn */
			(*pfnp)++;
			break;
		}
		/* the lookup was unlocked; re-verify identity under lock */
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		/* non-aged free pages come off the cache list and are unhashed */
		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}
765 
/*
 * verify that pages being returned from allocator have correct DMA attribute
 *
 * DEBUG kernels walk cnt pages starting at pp (following p_next) and panic
 * if any page's physical address is below dma_attr_addr_lo or at/above
 * dma_attr_addr_hi.  A NULL dma_attr disables the check.  Non-DEBUG kernels
 * compile the call away; the stub is a void expression so that both
 * variants have the same (valueless) type.
 */
#ifndef DEBUG
#define	check_dma(a, b, c) ((void)0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		/* %p expects a void pointer, hence the casts */
		if (mmu_ptob((uint64_t)pp->p_pagenum) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo",
			    (void *)pp);
		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi",
			    (void *)pp);
		pp = pp->p_next;
	}
}
#endif
789 
/* Serializes page_get_contigpage(), including its static search state. */
static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

/* first page frame number at the 16 Mbyte boundary */
#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

/*
 * Gather *pgcnt pages as at most sgllen physically contiguous runs that
 * satisfy the DMA attributes in mattr (address range, segment boundary,
 * alignment).  With a NULL mattr a single run anywhere below physmax is
 * wanted.
 *
 * The search resumes from where the previous call stopped (startpfn) when
 * the minimum run length has not shrunk, skips memory below 16m when the
 * permitted range extends above it, and wraps around to the low end of
 * the range if the first pass fails.
 *
 * Pages are obtained through is_contigpage_free(), which reduces *pgcnt
 * by each run it returns.  iolock is passed through to it.
 *
 * Returns the page list on success, NULL on failure.
 */
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		/* convert the byte limits to page frame limits */
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		/* alignment only matters when it exceeds a page */
		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* first pass: from startpfn up to hi */
	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 *
			 * NOTE(review): *pgcnt is the count still
			 * outstanding at this point, not the number of
			 * pages on pplist, so check_dma() may examine
			 * fewer pages than are being returned -- confirm.
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/*
	 * cannot find contig pages in specified range
	 *
	 * NOTE(review): if earlier runs were already gathered onto pplist,
	 * this path (and the final return below) returns NULL without
	 * putting them back -- confirm whether callers account for that.
	 */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}
918 
919 /*
920  * combine mem_node_config and memrange memory ranges into one data
921  * structure to be used for page list management.
922  *
923  * mnode_range_cnt() calculates the number of memory ranges for mnode and
924  * memranges[]. Used to determine the size of page lists and mnoderanges.
925  *
926  * mnode_range_setup() initializes mnoderanges.
927  */
928 mnoderange_t	*mnoderanges;
929 int		mnoderangecnt;
930 int		mtype4g;
931 
932 int
933 mnode_range_cnt()
934 {
935 	int	mri;
936 	int	mnrcnt = 0;
937 	int	mnode;
938 
939 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
940 		if (mem_node_config[mnode].exists == 0)
941 			continue;
942 
943 		mri = nranges - 1;
944 
945 		/* find the memranges index below contained in mnode range */
946 
947 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
948 			mri--;
949 
950 		/*
951 		 * increment mnode range counter when memranges or mnode
952 		 * boundary is reached.
953 		 */
954 		while (mri >= 0 &&
955 		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
956 			mnrcnt++;
957 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
958 				mri--;
959 			else
960 				break;
961 		}
962 	}
963 	return (mnrcnt);
964 }
965 
966 void
967 mnode_range_setup(mnoderange_t *mnoderanges)
968 {
969 	int	mnode, mri;
970 
971 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
972 		if (mem_node_config[mnode].exists == 0)
973 			continue;
974 
975 		mri = nranges - 1;
976 
977 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
978 			mri--;
979 
980 		while (mri >= 0 && mem_node_config[mnode].physmax >=
981 		    MEMRANGELO(mri)) {
982 			mnoderanges->mnr_pfnlo =
983 			    MAX(MEMRANGELO(mri),
984 				mem_node_config[mnode].physbase);
985 			mnoderanges->mnr_pfnhi =
986 			    MIN(MEMRANGEHI(mri),
987 				mem_node_config[mnode].physmax);
988 			mnoderanges->mnr_mnode = mnode;
989 			mnoderanges->mnr_memrange = mri;
990 			mnoderanges++;
991 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
992 				mri--;
993 			else
994 				break;
995 		}
996 	}
997 }
998 
999 /*
1000  * Determine if the mnode range specified in mtype contains memory belonging
1001  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1002  * the range of indices from high pfn to 0, 16m or 4g.
1003  *
1004  * Return first mnode range type index found otherwise return -1 if none found.
1005  */
1006 int
1007 mtype_func(int mnode, int mtype, uint_t flags)
1008 {
1009 	if (flags & PGI_MT_RANGE) {
1010 		int	mtlim;
1011 
1012 		if (flags & PGI_MT_NEXT)
1013 			mtype--;
1014 		if (flags & PGI_MT_RANGE0)
1015 			mtlim = 0;
1016 		else if (flags & PGI_MT_RANGE4G)
1017 			mtlim = mtype4g + 1;	/* exclude 0-4g range */
1018 		else if (flags & PGI_MT_RANGE16M)
1019 			mtlim = 1;		/* exclude 0-16m range */
1020 		while (mtype >= mtlim) {
1021 			if (mnoderanges[mtype].mnr_mnode == mnode)
1022 				return (mtype);
1023 			mtype--;
1024 		}
1025 	} else {
1026 		if (mnoderanges[mtype].mnr_mnode == mnode)
1027 			return (mtype);
1028 	}
1029 	return (-1);
1030 }
1031 
1032 /*
1033  * Update the page list max counts with the pfn range specified by the
1034  * input parameters.  Called from add_physmem() when physical memory with
1035  * page_t's are initially added to the page lists.
1036  */
1037 void
1038 mtype_modify_max(pfn_t startpfn, long cnt)
1039 {
1040 	int	mtype = 0;
1041 	pfn_t	endpfn = startpfn + cnt, pfn;
1042 	pgcnt_t	inc;
1043 
1044 	ASSERT(cnt > 0);
1045 
1046 	for (pfn = startpfn; pfn < endpfn; ) {
1047 		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
1048 			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
1049 				inc = endpfn - pfn;
1050 			} else {
1051 				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
1052 			}
1053 			mnoderanges[mtype].mnr_mt_pgmax += inc;
1054 			if (physmax4g && mtype <= mtype4g)
1055 				maxmem4g += inc;
1056 			pfn += inc;
1057 		}
1058 		mtype++;
1059 		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
1060 	}
1061 }
1062 
1063 /*
1064  * Returns the free page count for mnode
1065  */
1066 int
1067 mnode_pgcnt(int mnode)
1068 {
1069 	int	mtype = mnoderangecnt - 1;
1070 	int	flags = PGI_MT_RANGE0;
1071 	pgcnt_t	pgcnt = 0;
1072 
1073 	mtype = mtype_func(mnode, mtype, flags);
1074 
1075 	while (mtype != -1) {
1076 		pgcnt += MTYPE_FREEMEM(mtype);
1077 		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1078 	}
1079 	return (pgcnt);
1080 }
1081 
/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 *
 * l2_sz is the l2 cache size, l2_linesz its line size and l2_assoc its
 * associativity (0 for a fully associative cache).
 *
 * Side effects: trims memranges/nranges to the ranges at or below physmax,
 * may clear restricted_kmemalloc (i386) and set physmax4g, fills in
 * hw_page_array[], and sets l2_colors, page_colors, cpu_page_colors,
 * page_colors_mask, page_coloring_shift and mnoderangecnt.
 *
 * The returned size must cover exactly what page_coloring_setup() carves
 * out of the buffer it is handed.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	/*
	 * setup pagesize for generic page layer
	 */
	for (i = 0; i <= mmu.max_page_level; ++i) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
	}

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* size for mnoderanges */
	mnoderangecnt = mnode_range_cnt();
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	/* one list head per color for every page size in every mnoderange */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}
1166 
/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
 *
 * pcmemaddr is the start of the buffer whose size page_coloring_init()
 * computed.  It is carved up sequentially, in this order: mnoderanges[],
 * the fpc_mutex arrays, the cpc_mutex arrays, the page_freelists top-level
 * array, the page_cachelists top-level array, and then, for each
 * mnoderange, its per-page-size freelist pointers, the per-color freelist
 * heads for each page size, and its per-color cachelist heads.
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	/* mnoderange index of the last page frame below 4g */
	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	/* each mutex slot gets one kmutex_t per memory node */
	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}
1220 
/*
 * Return the color associated with a buf.  This implementation ignores
 * bp and always reports color 0.
 */
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	int	color = 0;

	return (color);
}
1227 
1228 /*
1229  * get a page from any list with the given mnode
1230  */
1231 page_t *
1232 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
1233     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
1234 {
1235 	kmutex_t	*pcm;
1236 	int		i;
1237 	page_t		*pp;
1238 	page_t		*first_pp;
1239 	uint64_t	pgaddr;
1240 	ulong_t		bin;
1241 	int		mtypestart;
1242 
1243 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
1244 
1245 	ASSERT((flags & PG_MATCH_COLOR) == 0);
1246 	ASSERT(szc == 0);
1247 	ASSERT(dma_attr != NULL);
1248 
1249 
1250 	MTYPE_START(mnode, mtype, flags);
1251 	if (mtype < 0) {
1252 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
1253 		return (NULL);
1254 	}
1255 
1256 	mtypestart = mtype;
1257 
1258 	bin = origbin;
1259 
1260 	/*
1261 	 * check up to page_colors + 1 bins - origbin may be checked twice
1262 	 * because of BIN_STEP skip
1263 	 */
1264 	do {
1265 		i = 0;
1266 		while (i <= page_colors) {
1267 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
1268 				goto nextfreebin;
1269 
1270 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1271 			mutex_enter(pcm);
1272 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
1273 			first_pp = pp;
1274 			while (pp != NULL) {
1275 				if (page_trylock(pp, SE_EXCL) == 0) {
1276 					pp = pp->p_next;
1277 					if (pp == first_pp) {
1278 						pp = NULL;
1279 					}
1280 					continue;
1281 				}
1282 
1283 				ASSERT(PP_ISFREE(pp));
1284 				ASSERT(PP_ISAGED(pp));
1285 				ASSERT(pp->p_vnode == NULL);
1286 				ASSERT(pp->p_hash == NULL);
1287 				ASSERT(pp->p_offset == (u_offset_t)-1);
1288 				ASSERT(pp->p_szc == szc);
1289 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1290 				/* check if page within DMA attributes */
1291 				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));
1292 
1293 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1294 				    (pgaddr + MMU_PAGESIZE - 1 <=
1295 				    dma_attr->dma_attr_addr_hi)) {
1296 					break;
1297 				}
1298 
1299 				/* continue looking */
1300 				page_unlock(pp);
1301 				pp = pp->p_next;
1302 				if (pp == first_pp)
1303 					pp = NULL;
1304 
1305 			}
1306 			if (pp != NULL) {
1307 				ASSERT(mtype == PP_2_MTYPE(pp));
1308 				ASSERT(pp->p_szc == 0);
1309 
1310 				/* found a page with specified DMA attributes */
1311 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
1312 				    mtype), pp);
1313 				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1314 
1315 				if ((PP_ISFREE(pp) == 0) ||
1316 				    (PP_ISAGED(pp) == 0)) {
1317 					cmn_err(CE_PANIC, "page %p is not free",
1318 					    (void *)pp);
1319 				}
1320 
1321 				mutex_exit(pcm);
1322 				check_dma(dma_attr, pp, 1);
1323 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1324 				return (pp);
1325 			}
1326 			mutex_exit(pcm);
1327 nextfreebin:
1328 			pp = page_freelist_fill(szc, bin, mnode, mtype,
1329 			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
1330 			if (pp)
1331 				return (pp);
1332 
1333 			/* try next bin */
1334 			bin += (i == 0) ? BIN_STEP : 1;
1335 			bin &= page_colors_mask;
1336 			i++;
1337 		}
1338 		MTYPE_NEXT(mnode, mtype, flags);
1339 	} while (mtype >= 0);
1340 
1341 	/* failed to find a page in the freelist; try it in the cachelist */
1342 
1343 	/* reset mtype start for cachelist search */
1344 	mtype = mtypestart;
1345 	ASSERT(mtype >= 0);
1346 
1347 	/* start with the bin of matching color */
1348 	bin = origbin;
1349 
1350 	do {
1351 		for (i = 0; i <= page_colors; i++) {
1352 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
1353 				goto nextcachebin;
1354 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
1355 			mutex_enter(pcm);
1356 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
1357 			first_pp = pp;
1358 			while (pp != NULL) {
1359 				if (page_trylock(pp, SE_EXCL) == 0) {
1360 					pp = pp->p_next;
1361 					if (pp == first_pp)
1362 						break;
1363 					continue;
1364 				}
1365 				ASSERT(pp->p_vnode);
1366 				ASSERT(PP_ISAGED(pp) == 0);
1367 				ASSERT(pp->p_szc == 0);
1368 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1369 
1370 				/* check if page within DMA attributes */
1371 
1372 				pgaddr = ptob((uint64_t)(pp->p_pagenum));
1373 
1374 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1375 				    (pgaddr + MMU_PAGESIZE - 1 <=
1376 				    dma_attr->dma_attr_addr_hi)) {
1377 					break;
1378 				}
1379 
1380 				/* continue looking */
1381 				page_unlock(pp);
1382 				pp = pp->p_next;
1383 				if (pp == first_pp)
1384 					pp = NULL;
1385 			}
1386 
1387 			if (pp != NULL) {
1388 				ASSERT(mtype == PP_2_MTYPE(pp));
1389 				ASSERT(pp->p_szc == 0);
1390 
1391 				/* found a page with specified DMA attributes */
1392 				page_sub(&PAGE_CACHELISTS(mnode, bin,
1393 				    mtype), pp);
1394 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
1395 
1396 				mutex_exit(pcm);
1397 				ASSERT(pp->p_vnode);
1398 				ASSERT(PP_ISAGED(pp) == 0);
1399 				check_dma(dma_attr, pp, 1);
1400 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1401 				return (pp);
1402 			}
1403 			mutex_exit(pcm);
1404 nextcachebin:
1405 			bin += (i == 0) ? BIN_STEP : 1;
1406 			bin &= page_colors_mask;
1407 		}
1408 		MTYPE_NEXT(mnode, mtype, flags);
1409 	} while (mtype >= 0);
1410 
1411 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
1412 	return (NULL);
1413 }
1414 
1415 /*
1416  * This function is similar to page_get_freelist()/page_get_cachelist()
1417  * but it searches both the lists to find a page with the specified
1418  * color (or no color) and DMA attributes. The search is done in the
1419  * freelist first and then in the cache list within the highest memory
1420  * range (based on DMA attributes) before searching in the lower
1421  * memory ranges.
1422  *
1423  * Note: This function is called only by page_create_io().
1424  */
1425 /*ARGSUSED*/
1426 page_t *
1427 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
1428     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
1429 {
1430 	uint_t		bin;
1431 	int		mtype;
1432 	page_t		*pp;
1433 	int		n;
1434 	int		m;
1435 	int		szc;
1436 	int		fullrange;
1437 	int		mnode;
1438 	int		local_failed_stat = 0;
1439 	lgrp_mnode_cookie_t	lgrp_cookie;
1440 
1441 	VM_STAT_ADD(pga_vmstats.pga_alloc);
1442 
1443 	/* only base pagesize currently supported */
1444 	if (size != MMU_PAGESIZE)
1445 		return (NULL);
1446 
1447 	/*
1448 	 * If we're passed a specific lgroup, we use it.  Otherwise,
1449 	 * assume first-touch placement is desired.
1450 	 */
1451 	if (!LGRP_EXISTS(lgrp))
1452 		lgrp = lgrp_home_lgrp();
1453 
1454 	/* LINTED */
1455 	AS_2_BIN(as, seg, vp, vaddr, bin);
1456 
1457 	/*
1458 	 * Only hold one freelist or cachelist lock at a time, that way we
1459 	 * can start anywhere and not have to worry about lock
1460 	 * ordering.
1461 	 */
1462 	if (dma_attr == NULL) {
1463 		n = 0;
1464 		m = mnoderangecnt - 1;
1465 		fullrange = 1;
1466 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
1467 	} else {
1468 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
1469 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
1470 
1471 		/*
1472 		 * We can guarantee alignment only for page boundary.
1473 		 */
1474 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
1475 			return (NULL);
1476 
1477 		n = pfn_2_mtype(pfnlo);
1478 		m = pfn_2_mtype(pfnhi);
1479 
1480 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
1481 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
1482 	}
1483 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
1484 
1485 	if (n > m)
1486 		return (NULL);
1487 
1488 	szc = 0;
1489 
1490 	/* cylcing thru mtype handled by RANGE0 if n == 0 */
1491 	if (n == 0) {
1492 		flags |= PGI_MT_RANGE0;
1493 		n = m;
1494 	}
1495 
1496 	/*
1497 	 * Try local memory node first, but try remote if we can't
1498 	 * get a page of the right color.
1499 	 */
1500 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
1501 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
1502 		/*
1503 		 * allocate pages from high pfn to low.
1504 		 */
1505 		for (mtype = m; mtype >= n; mtype--) {
1506 			if (fullrange != 0) {
1507 				pp = page_get_mnode_freelist(mnode,
1508 				    bin, mtype, szc, flags);
1509 				if (pp == NULL) {
1510 					pp = page_get_mnode_cachelist(
1511 						bin, flags, mnode, mtype);
1512 				}
1513 			} else {
1514 				pp = page_get_mnode_anylist(bin, szc,
1515 				    flags, mnode, mtype, dma_attr);
1516 			}
1517 			if (pp != NULL) {
1518 				VM_STAT_ADD(pga_vmstats.pga_allocok);
1519 				check_dma(dma_attr, pp, 1);
1520 				return (pp);
1521 			}
1522 		}
1523 		if (!local_failed_stat) {
1524 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
1525 			local_failed_stat = 1;
1526 		}
1527 	}
1528 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
1529 
1530 	return (NULL);
1531 }
1532 
1533 /*
1534  * page_create_io()
1535  *
1536  * This function is a copy of page_create_va() with an additional
1537  * argument 'mattr' that specifies DMA memory requirements to
1538  * the page list functions. This function is used by the segkmem
1539  * allocator so it is only to create new pages (i.e PG_EXCL is
1540  * set).
1541  *
1542  * Note: This interface is currently used by x86 PSM only and is
1543  *	 not fully specified so the commitment level is only for
1544  *	 private interface specific to x86. This interface uses PSM
1545  *	 specific page_get_anylist() interface.
1546  */
1547 
/*
 * Walk the hash chain page_hash[index] and leave in pp the first page
 * whose identity is <vp, off>, or NULL when the chain has no such page.
 * The arguments are evaluated more than once.
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	(pp) = page_hash[(index)]; \
	while ((pp) && \
	    ((pp)->p_vnode != (vp) || (pp)->p_offset != (off))) { \
		(pp) = (pp)->p_hash; \
	} \
}
1554 
1555 
/*
 * Create pages named <vp, off> .. <vp, off + bytes> for I/O use.
 *
 *	vp, off		identity under which each new page is hashed in
 *	bytes		size of the request; rounded up to whole pages
 *	flags		only PG_EXCL, PG_WAIT and PG_PHYSCONTIG are accepted
 *	as, vaddr	passed on to page_get_anylist()
 *	mattr		DMA attributes handed to the page list functions;
 *			may be NULL
 *
 * Returns the list of created pages, or NULL on failure.  On failure the
 * pages gathered so far are disposed of and the remaining accounting is
 * put back.
 */
page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;	/* pages collected so far */
	uint_t		plist_len = 0;	/* pages given back on failure */
	pgcnt_t		npages;		/* pages still to be obtained */
	page_t		*npp = NULL;	/* page taken off a list, not yet hashed */
	uint_t		pages_req;	/* total pages originally requested */
	page_t		*pp;
	kmutex_t	*phm = NULL;	/* page hash mutex, when held */
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
		"page_create_start:vp %p off %llx bytes %u flags %x",
		vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
		"page_create_success:vp %p off %llx",
		vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
			"pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		/*
		 * npages is passed by reference; on return it is treated
		 * below as the number of pages still outstanding.
		 */
		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		/* hash in every page of the contiguous run under <vp, off> */
		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			/* the contiguous run satisfied the whole request */
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			/* skip vaddr past the pages already obtained */
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use it.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}

				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			/* npp is consumed; clear it so fail: won't free it */
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			/* a page named <vp, off> already exists: give up */
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 *	 with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(vp != &kvp);
			if (vp == &kvp)
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}
1834 
1835 
1836 /*
1837  * Copy the data from the physical page represented by "frompp" to
1838  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
1839  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
1840  * level and no one sleeps with an active mapping there.
1841  *
1842  * Note that the ref/mod bits in the page_t's are not affected by
1843  * this operation, hence it is up to the caller to update them appropriately.
1844  */
1845 void
1846 ppcopy(page_t *frompp, page_t *topp)
1847 {
1848 	caddr_t		pp_addr1;
1849 	caddr_t		pp_addr2;
1850 	void		*pte1;
1851 	void		*pte2;
1852 	kmutex_t	*ppaddr_mutex;
1853 
1854 	ASSERT_STACK_ALIGNED();
1855 	ASSERT(PAGE_LOCKED(frompp));
1856 	ASSERT(PAGE_LOCKED(topp));
1857 
1858 	if (kpm_enable) {
1859 		pp_addr1 = hat_kpm_page2va(frompp, 0);
1860 		pp_addr2 = hat_kpm_page2va(topp, 0);
1861 		kpreempt_disable();
1862 	} else {
1863 		/*
1864 		 * disable pre-emption so that CPU can't change
1865 		 */
1866 		kpreempt_disable();
1867 
1868 		pp_addr1 = CPU->cpu_caddr1;
1869 		pp_addr2 = CPU->cpu_caddr2;
1870 		pte1 = (void *)CPU->cpu_caddr1pte;
1871 		pte2 = (void *)CPU->cpu_caddr2pte;
1872 
1873 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1874 		mutex_enter(ppaddr_mutex);
1875 
1876 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
1877 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
1878 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
1879 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1880 		    HAT_LOAD_NOCONSIST);
1881 	}
1882 
1883 	if (use_sse_pagecopy)
1884 		hwblkpagecopy(pp_addr1, pp_addr2);
1885 	else
1886 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
1887 
1888 	if (!kpm_enable)
1889 		mutex_exit(ppaddr_mutex);
1890 	kpreempt_enable();
1891 }
1892 
1893 /*
1894  * Zero the physical page from off to off + len given by `pp'
1895  * without changing the reference and modified bits of page.
1896  *
1897  * We use this using CPU private page address #2, see ppcopy() for more info.
1898  * pagezero() must not be called at interrupt level.
1899  */
1900 void
1901 pagezero(page_t *pp, uint_t off, uint_t len)
1902 {
1903 	caddr_t		pp_addr2;
1904 	void		*pte2;
1905 	kmutex_t	*ppaddr_mutex;
1906 
1907 	ASSERT_STACK_ALIGNED();
1908 	ASSERT(len <= MMU_PAGESIZE);
1909 	ASSERT(off <= MMU_PAGESIZE);
1910 	ASSERT(off + len <= MMU_PAGESIZE);
1911 	ASSERT(PAGE_LOCKED(pp));
1912 
1913 	if (kpm_enable) {
1914 		pp_addr2 = hat_kpm_page2va(pp, 0);
1915 		kpreempt_disable();
1916 	} else {
1917 		kpreempt_disable();
1918 
1919 		pp_addr2 = CPU->cpu_caddr2;
1920 		pte2 = (void *)CPU->cpu_caddr2pte;
1921 
1922 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1923 		mutex_enter(ppaddr_mutex);
1924 
1925 		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
1926 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1927 		    HAT_LOAD_NOCONSIST);
1928 	}
1929 
1930 	if (use_sse_pagezero)
1931 		hwblkclr(pp_addr2 + off, len);
1932 	else
1933 		bzero(pp_addr2 + off, len);
1934 
1935 	if (!kpm_enable)
1936 		mutex_exit(ppaddr_mutex);
1937 	kpreempt_enable();
1938 }
1939 
1940 /*
1941  * Platform-dependent page scrub call.
1942  */
1943 void
1944 pagescrub(page_t *pp, uint_t off, uint_t len)
1945 {
1946 	/*
1947 	 * For now, we rely on the fact that pagezero() will
1948 	 * always clear UEs.
1949 	 */
1950 	pagezero(pp, off, len);
1951 }
1952 
1953 /*
1954  * set up two private addresses for use on a given CPU for use in ppcopy()
1955  */
1956 void
1957 setup_vaddr_for_ppcopy(struct cpu *cpup)
1958 {
1959 	void *addr;
1960 	void *pte;
1961 
1962 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
1963 	pte = hat_mempte_setup(addr);
1964 	cpup->cpu_caddr1 = addr;
1965 	cpup->cpu_caddr1pte = (pteptr_t)pte;
1966 
1967 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
1968 	pte = hat_mempte_setup(addr);
1969 	cpup->cpu_caddr2 = addr;
1970 	cpup->cpu_caddr2pte = (pteptr_t)pte;
1971 
1972 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
1973 }
1974 
1975 
1976 /*
1977  * Create the pageout scanner thread. The thread has to
1978  * start at procedure with process pp and priority pri.
1979  */
1980 void
1981 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
1982 {
1983 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
1984 }
1985 
/*
 * Hook for flushing the D-cache when module relocations are performed
 * through an alternate mapping.  Intel / AMD platforms do not need it,
 * so the function is intentionally empty.
 */
void
dcache_flushall()
{
	/* nothing to do on this platform */
}
1993