xref: /freebsd/sys/vm/vm_phys.c (revision 1de7b4b805ddbf2429da511c053686ac4591ed89)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2002-2006 Rice University
5  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
6  * All rights reserved.
7  *
8  * This software was developed for the FreeBSD Project by Alan L. Cox,
9  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
27  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
30  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  *	Physical memory system implementation
36  *
37  * Any external functions defined by this module are only to be used by the
38  * virtual memory system.
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_ddb.h"
45 #include "opt_vm.h"
46 
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/lock.h>
50 #include <sys/kernel.h>
51 #include <sys/malloc.h>
52 #include <sys/mutex.h>
53 #include <sys/proc.h>
54 #include <sys/queue.h>
55 #include <sys/rwlock.h>
56 #include <sys/sbuf.h>
57 #include <sys/sysctl.h>
58 #include <sys/tree.h>
59 #include <sys/vmmeter.h>
60 #include <sys/seq.h>
61 
62 #include <ddb/ddb.h>
63 
64 #include <vm/vm.h>
65 #include <vm/vm_param.h>
66 #include <vm/vm_kern.h>
67 #include <vm/vm_object.h>
68 #include <vm/vm_page.h>
69 #include <vm/vm_phys.h>
70 
71 #include <vm/vm_domain.h>
72 
73 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
74     "Too many physsegs.");
75 
76 #ifdef VM_NUMA_ALLOC
77 struct mem_affinity *mem_affinity;
78 int *mem_locality;
79 #endif
80 
81 int vm_ndomains = 1;
82 
83 struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
84 int vm_phys_nsegs;
85 
86 struct vm_phys_fictitious_seg;
87 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
88     struct vm_phys_fictitious_seg *);
89 
90 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
91     RB_INITIALIZER(_vm_phys_fictitious_tree);
92 
93 struct vm_phys_fictitious_seg {
94 	RB_ENTRY(vm_phys_fictitious_seg) node;
95 	/* Memory region data */
96 	vm_paddr_t	start;
97 	vm_paddr_t	end;
98 	vm_page_t	first_page;
99 };
100 
101 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
102     vm_phys_fictitious_cmp);
103 
104 static struct rwlock vm_phys_fictitious_reg_lock;
105 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
106 
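/*
 * The buddy allocator's free page queues, indexed by NUMA domain, free
 * list (flind), free pool (pind), and order (oind), in that order.
 */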
107 static struct vm_freelist
108     vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
109 
110 static int vm_nfreelists;
111 
112 /*
113  * Provides the mapping from VM_FREELIST_* to free list indices (flind).
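 *
 * vm_phys_init() first sets each entry to 0 or 1 to record whether the
 * corresponding free list is created, converts the array into a running
 * total, and then subtracts one from every entry, leaving each created
 * list's entry equal to its flind.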
114  */
115 static int vm_freelist_to_flind[VM_NFREELIST];
116 
117 CTASSERT(VM_FREELIST_DEFAULT == 0);
118 
119 #ifdef VM_FREELIST_ISADMA
120 #define	VM_ISADMA_BOUNDARY	16777216
121 #endif
122 #ifdef VM_FREELIST_DMA32
123 #define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
124 #endif
125 
126 /*
127  * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
128  * the ordering of the free list boundaries.
129  */
130 #if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY)
131 CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY);
132 #endif
133 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
134 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
135 #endif
136 
137 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
138 SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
139     NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
140 
141 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
142 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
143     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
144 
145 #ifdef VM_NUMA_ALLOC
146 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
147 SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
148     NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
149 #endif
150 
151 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
152     &vm_ndomains, 0, "Number of physical memory domains available.");
153 
154 /*
155  * Default to first-touch + round-robin.
156  */
157 static struct mtx vm_default_policy_mtx;
158 MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
159     MTX_DEF);
160 #ifdef VM_NUMA_ALLOC
161 static struct vm_domain_policy vm_default_policy =
162     VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
163 #else
164 /* Use round-robin so the domain policy code will only try once per allocation */
165 static struct vm_domain_policy vm_default_policy =
166     VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0);
167 #endif
168 
169 static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
170     int order);
171 static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
172     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
173     vm_paddr_t boundary);
174 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
175 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
176 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
177     int order);
178 
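/*
 * Sysctl handler for the default allocation policy.  A read returns the
 * current policy's name; a write accepts "rr", "first-touch", or
 * "first-touch-rr".
 */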
179 static int
180 sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS)
181 {
182 	char policy_name[32];
183 	int error;
184 
185 	mtx_lock(&vm_default_policy_mtx);
186 
187 	/* Map policy to output string */
188 	switch (vm_default_policy.p.policy) {
189 	case VM_POLICY_FIRST_TOUCH:
190 		strcpy(policy_name, "first-touch");
191 		break;
192 	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
193 		strcpy(policy_name, "first-touch-rr");
194 		break;
195 	case VM_POLICY_ROUND_ROBIN:
196 	default:
197 		strcpy(policy_name, "rr");
198 		break;
199 	}
200 	mtx_unlock(&vm_default_policy_mtx);
201 
202 	error = sysctl_handle_string(oidp, &policy_name[0],
203 	    sizeof(policy_name), req);
204 	if (error != 0 || req->newptr == NULL)
205 		return (error);
206 
207 	mtx_lock(&vm_default_policy_mtx);
208 	/* Set: match on the subset of policies that make sense as a default */
209 	if (strcmp("first-touch-rr", policy_name) == 0) {
210 		vm_domain_policy_set(&vm_default_policy,
211 		    VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
212 	} else if (strcmp("first-touch", policy_name) == 0) {
213 		vm_domain_policy_set(&vm_default_policy,
214 		    VM_POLICY_FIRST_TOUCH, 0);
215 	} else if (strcmp("rr", policy_name) == 0) {
216 		vm_domain_policy_set(&vm_default_policy,
217 		    VM_POLICY_ROUND_ROBIN, 0);
218 	} else {
219 		error = EINVAL;
220 		goto finish;
221 	}
222 
223 	error = 0;
224 finish:
225 	mtx_unlock(&vm_default_policy_mtx);
226 	return (error);
227 }
228 
229 SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW,
230     0, 0, sysctl_vm_default_policy, "A",
231     "Default policy (rr, first-touch, first-touch-rr)");
232 
233 /*
234  * Red-black tree helpers for vm fictitious range management.
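 *
 * A lookup key is a vm_phys_fictitious_seg with only "start" set and
 * "end" left as zero; vm_phys_fictitious_cmp() detects this case and
 * compares the single address against each registered range.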
235  */
236 static inline int
237 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
238     struct vm_phys_fictitious_seg *range)
239 {
240 
241 	KASSERT(range->start != 0 && range->end != 0,
242 	    ("Invalid range passed on search for vm_fictitious page"));
243 	if (p->start >= range->end)
244 		return (1);
245 	if (p->start < range->start)
246 		return (-1);
247 
248 	return (0);
249 }
250 
251 static int
252 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
253     struct vm_phys_fictitious_seg *p2)
254 {
255 
256 	/* Check if this is a search for a page */
257 	if (p1->end == 0)
258 		return (vm_phys_fictitious_in_range(p1, p2));
259 
260 	KASSERT(p2->end != 0,
261     ("Invalid range passed as second parameter to vm fictitious comparison"));
262 
263 	/* Searching to add a new range */
264 	if (p1->end <= p2->start)
265 		return (-1);
266 	if (p1->start >= p2->end)
267 		return (1);
268 
269 	panic("Trying to add overlapping vm fictitious ranges:\n"
270 	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
271 	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
272 }
273 
274 #ifdef notyet
275 static __inline int
276 vm_rr_selectdomain(void)
277 {
278 #ifdef VM_NUMA_ALLOC
279 	struct thread *td;
280 
281 	td = curthread;
282 
283 	td->td_dom_rr_idx++;
284 	td->td_dom_rr_idx %= vm_ndomains;
285 	return (td->td_dom_rr_idx);
286 #else
287 	return (0);
288 #endif
289 }
290 #endif /* notyet */
291 
292 /*
293  * Initialise a VM domain iterator.
294  *
295  * Check the thread policy, then the proc policy,
296  * then default to the system policy.
297  *
298  * Later on the various layers will have this logic
299  * plumbed into them and the phys code will be explicitly
300  * handed a VM domain policy to use.
301  */
302 static void
303 vm_policy_iterator_init(struct vm_domain_iterator *vi)
304 {
305 #ifdef VM_NUMA_ALLOC
306 	struct vm_domain_policy lcl;
307 #endif
308 
309 	vm_domain_iterator_init(vi);
310 
311 #ifdef VM_NUMA_ALLOC
312 	/* Copy out the thread policy */
313 	vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy);
314 	if (lcl.p.policy != VM_POLICY_NONE) {
315 		/* Thread policy is present; use it */
316 		vm_domain_iterator_set_policy(vi, &lcl);
317 		return;
318 	}
319 
320 	vm_domain_policy_localcopy(&lcl,
321 	    &curthread->td_proc->p_vm_dom_policy);
322 	if (lcl.p.policy != VM_POLICY_NONE) {
323 		/* Process policy is present; use it */
324 		vm_domain_iterator_set_policy(vi, &lcl);
325 		return;
326 	}
327 #endif
328 	/* Use system default policy */
329 	vm_domain_iterator_set_policy(vi, &vm_default_policy);
330 }
331 
332 static void
333 vm_policy_iterator_finish(struct vm_domain_iterator *vi)
334 {
335 
336 	vm_domain_iterator_cleanup(vi);
337 }
338 
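/*
 * Returns TRUE if any physical memory segment whose index bit is set in
 * "mask" overlaps the physical address range [low, high).
 */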
339 boolean_t
340 vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
341 {
342 	struct vm_phys_seg *s;
343 	int idx;
344 
345 	while ((idx = ffsl(mask)) != 0) {
346 		idx--;	/* ffsl counts from 1 */
347 		mask &= ~(1UL << idx);
348 		s = &vm_phys_segs[idx];
349 		if (low < s->end && high > s->start)
350 			return (TRUE);
351 	}
352 	return (FALSE);
353 }
354 
355 /*
356  * Outputs the state of the physical memory allocator, specifically,
357  * the amount of physical memory in each free list.
358  */
359 static int
360 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
361 {
362 	struct sbuf sbuf;
363 	struct vm_freelist *fl;
364 	int dom, error, flind, oind, pind;
365 
366 	error = sysctl_wire_old_buffer(req, 0);
367 	if (error != 0)
368 		return (error);
369 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
370 	for (dom = 0; dom < vm_ndomains; dom++) {
371 		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
372 		for (flind = 0; flind < vm_nfreelists; flind++) {
373 			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
374 			    "\n  ORDER (SIZE)  |  NUMBER"
375 			    "\n              ", flind);
376 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
377 				sbuf_printf(&sbuf, "  |  POOL %d", pind);
378 			sbuf_printf(&sbuf, "\n--            ");
379 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
380 				sbuf_printf(&sbuf, "-- --      ");
381 			sbuf_printf(&sbuf, "--\n");
382 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
383 				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
384 				    1 << (PAGE_SHIFT - 10 + oind));
385 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
386 				fl = vm_phys_free_queues[dom][flind][pind];
387 					sbuf_printf(&sbuf, "  |  %6d",
388 					    fl[oind].lcnt);
389 				}
390 				sbuf_printf(&sbuf, "\n");
391 			}
392 		}
393 	}
394 	error = sbuf_finish(&sbuf);
395 	sbuf_delete(&sbuf);
396 	return (error);
397 }
398 
399 /*
400  * Outputs the set of physical memory segments.
401  */
402 static int
403 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
404 {
405 	struct sbuf sbuf;
406 	struct vm_phys_seg *seg;
407 	int error, segind;
408 
409 	error = sysctl_wire_old_buffer(req, 0);
410 	if (error != 0)
411 		return (error);
412 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
413 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
414 		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
415 		seg = &vm_phys_segs[segind];
416 		sbuf_printf(&sbuf, "start:     %#jx\n",
417 		    (uintmax_t)seg->start);
418 		sbuf_printf(&sbuf, "end:       %#jx\n",
419 		    (uintmax_t)seg->end);
420 		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
421 		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
422 	}
423 	error = sbuf_finish(&sbuf);
424 	sbuf_delete(&sbuf);
425 	return (error);
426 }
427 
428 /*
429  * Return affinity, or -1 if there's no affinity information.
430  */
431 int
432 vm_phys_mem_affinity(int f, int t)
433 {
434 
435 #ifdef VM_NUMA_ALLOC
436 	if (mem_locality == NULL)
437 		return (-1);
438 	if (f >= vm_ndomains || t >= vm_ndomains)
439 		return (-1);
440 	return (mem_locality[f * vm_ndomains + t]);
441 #else
442 	return (-1);
443 #endif
444 }
445 
446 #ifdef VM_NUMA_ALLOC
447 /*
448  * Outputs the VM locality table.
449  */
450 static int
451 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
452 {
453 	struct sbuf sbuf;
454 	int error, i, j;
455 
456 	error = sysctl_wire_old_buffer(req, 0);
457 	if (error != 0)
458 		return (error);
459 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
460 
461 	sbuf_printf(&sbuf, "\n");
462 
463 	for (i = 0; i < vm_ndomains; i++) {
464 		sbuf_printf(&sbuf, "%d: ", i);
465 		for (j = 0; j < vm_ndomains; j++) {
466 			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
467 		}
468 		sbuf_printf(&sbuf, "\n");
469 	}
470 	error = sbuf_finish(&sbuf);
471 	sbuf_delete(&sbuf);
472 	return (error);
473 }
474 #endif
475 
476 static void
477 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
478 {
479 
480 	m->order = order;
481 	if (tail)
482 		TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
483 	else
484 		TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
485 	fl[order].lcnt++;
486 }
487 
488 static void
489 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
490 {
491 
492 	TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
493 	fl[order].lcnt--;
494 	m->order = VM_NFREEORDER;
495 }
496 
497 /*
498  * Create a physical memory segment.
499  */
500 static void
501 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
502 {
503 	struct vm_phys_seg *seg;
504 
505 	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
506 	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
507 	KASSERT(domain < vm_ndomains,
508 	    ("vm_phys_create_seg: invalid domain provided"));
509 	seg = &vm_phys_segs[vm_phys_nsegs++];
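	/*
	 * Shift any existing segments with a higher start address up by one
	 * slot so that vm_phys_segs[] remains sorted by start address.
	 */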
510 	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
511 		*seg = *(seg - 1);
512 		seg--;
513 	}
514 	seg->start = start;
515 	seg->end = end;
516 	seg->domain = domain;
517 }
518 
519 static void
520 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
521 {
522 #ifdef VM_NUMA_ALLOC
523 	int i;
524 
525 	if (mem_affinity == NULL) {
526 		_vm_phys_create_seg(start, end, 0);
527 		return;
528 	}
529 
530 	for (i = 0;; i++) {
531 		if (mem_affinity[i].end == 0)
532 			panic("Reached end of affinity info");
533 		if (mem_affinity[i].end <= start)
534 			continue;
535 		if (mem_affinity[i].start > start)
536 			panic("No affinity info for start %jx",
537 			    (uintmax_t)start);
538 		if (mem_affinity[i].end >= end) {
539 			_vm_phys_create_seg(start, end,
540 			    mem_affinity[i].domain);
541 			break;
542 		}
543 		_vm_phys_create_seg(start, mem_affinity[i].end,
544 		    mem_affinity[i].domain);
545 		start = mem_affinity[i].end;
546 	}
547 #else
548 	_vm_phys_create_seg(start, end, 0);
549 #endif
550 }
551 
552 /*
553  * Add a physical memory segment.
554  */
555 void
556 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
557 {
558 	vm_paddr_t paddr;
559 
560 	KASSERT((start & PAGE_MASK) == 0,
561 	    ("vm_phys_add_seg: start is not page aligned"));
562 	KASSERT((end & PAGE_MASK) == 0,
563 	    ("vm_phys_add_seg: end is not page aligned"));
564 
565 	/*
566 	 * Split the physical memory segment if it spans two or more free
567 	 * list boundaries.
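	 *
	 * For example, on a platform that defines both VM_FREELIST_LOWMEM
	 * and VM_FREELIST_DMA32, a single segment spanning both boundaries
	 * is created as three segments, split at VM_LOWMEM_BOUNDARY and
	 * VM_DMA32_BOUNDARY.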
568 	 */
569 	paddr = start;
570 #ifdef	VM_FREELIST_ISADMA
571 	if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) {
572 		vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY);
573 		paddr = VM_ISADMA_BOUNDARY;
574 	}
575 #endif
576 #ifdef	VM_FREELIST_LOWMEM
577 	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
578 		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
579 		paddr = VM_LOWMEM_BOUNDARY;
580 	}
581 #endif
582 #ifdef	VM_FREELIST_DMA32
583 	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
584 		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
585 		paddr = VM_DMA32_BOUNDARY;
586 	}
587 #endif
588 	vm_phys_create_seg(paddr, end);
589 }
590 
591 /*
592  * Initialize the physical memory allocator.
593  *
594  * Requires that vm_page_array is initialized!
595  */
596 void
597 vm_phys_init(void)
598 {
599 	struct vm_freelist *fl;
600 	struct vm_phys_seg *seg;
601 	u_long npages;
602 	int dom, flind, freelist, oind, pind, segind;
603 
604 	/*
605 	 * Compute the number of free lists, and generate the mapping from the
606 	 * manifest constants VM_FREELIST_* to the free list indices.
607 	 *
608 	 * Initially, the entries of vm_freelist_to_flind[] are set to either
609 	 * 0 or 1 to indicate which free lists should be created.
610 	 */
611 	npages = 0;
612 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
613 		seg = &vm_phys_segs[segind];
614 #ifdef	VM_FREELIST_ISADMA
615 		if (seg->end <= VM_ISADMA_BOUNDARY)
616 			vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1;
617 		else
618 #endif
619 #ifdef	VM_FREELIST_LOWMEM
620 		if (seg->end <= VM_LOWMEM_BOUNDARY)
621 			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
622 		else
623 #endif
624 #ifdef	VM_FREELIST_DMA32
625 		if (
626 #ifdef	VM_DMA32_NPAGES_THRESHOLD
627 		    /*
628 		     * Create the DMA32 free list only if the amount of
629 		     * physical memory above physical address 4G exceeds the
630 		     * given threshold.
631 		     */
632 		    npages > VM_DMA32_NPAGES_THRESHOLD &&
633 #endif
634 		    seg->end <= VM_DMA32_BOUNDARY)
635 			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
636 		else
637 #endif
638 		{
639 			npages += atop(seg->end - seg->start);
640 			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
641 		}
642 	}
643 	/* Change each entry into a running total of the free lists. */
644 	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
645 		vm_freelist_to_flind[freelist] +=
646 		    vm_freelist_to_flind[freelist - 1];
647 	}
648 	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
649 	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
650 	/* Change each entry into a free list index. */
651 	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
652 		vm_freelist_to_flind[freelist]--;
653 
654 	/*
655 	 * Initialize the first_page and free_queues fields of each physical
656 	 * memory segment.
657 	 */
658 #ifdef VM_PHYSSEG_SPARSE
659 	npages = 0;
660 #endif
661 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
662 		seg = &vm_phys_segs[segind];
663 #ifdef VM_PHYSSEG_SPARSE
664 		seg->first_page = &vm_page_array[npages];
665 		npages += atop(seg->end - seg->start);
666 #else
667 		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
668 #endif
669 #ifdef	VM_FREELIST_ISADMA
670 		if (seg->end <= VM_ISADMA_BOUNDARY) {
671 			flind = vm_freelist_to_flind[VM_FREELIST_ISADMA];
672 			KASSERT(flind >= 0,
673 			    ("vm_phys_init: ISADMA flind < 0"));
674 		} else
675 #endif
676 #ifdef	VM_FREELIST_LOWMEM
677 		if (seg->end <= VM_LOWMEM_BOUNDARY) {
678 			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
679 			KASSERT(flind >= 0,
680 			    ("vm_phys_init: LOWMEM flind < 0"));
681 		} else
682 #endif
683 #ifdef	VM_FREELIST_DMA32
684 		if (seg->end <= VM_DMA32_BOUNDARY) {
685 			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
686 			KASSERT(flind >= 0,
687 			    ("vm_phys_init: DMA32 flind < 0"));
688 		} else
689 #endif
690 		{
691 			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
692 			KASSERT(flind >= 0,
693 			    ("vm_phys_init: DEFAULT flind < 0"));
694 		}
695 		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
696 	}
697 
698 	/*
699 	 * Initialize the free queues.
700 	 */
701 	for (dom = 0; dom < vm_ndomains; dom++) {
702 		for (flind = 0; flind < vm_nfreelists; flind++) {
703 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
704 				fl = vm_phys_free_queues[dom][flind][pind];
705 				for (oind = 0; oind < VM_NFREEORDER; oind++)
706 					TAILQ_INIT(&fl[oind].pl);
707 			}
708 		}
709 	}
710 
711 	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
712 }
713 
714 /*
715  * Split a contiguous, power of two-sized set of physical pages.
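 *
 * "m" is the first page of an order "oind" block that has already been
 * removed from the free lists.  Each iteration halves the block, keeping
 * the lower half in "m" and returning the upper half (its buddy at the
 * reduced order) to the free list "fl", until "m" is an order "order"
 * block.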
716  */
717 static __inline void
718 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
719 {
720 	vm_page_t m_buddy;
721 
722 	while (oind > order) {
723 		oind--;
724 		m_buddy = &m[1 << oind];
725 		KASSERT(m_buddy->order == VM_NFREEORDER,
726 		    ("vm_phys_split_pages: page %p has unexpected order %d",
727 		    m_buddy, m_buddy->order));
728 		vm_freelist_add(fl, m_buddy, oind, 0);
729 	}
730 }
731 
732 /*
733  * Allocate a contiguous, power of two-sized set of physical pages
734  * from the free lists.
735  *
736  * The free page queues must be locked.
737  */
738 vm_page_t
739 vm_phys_alloc_pages(int pool, int order)
740 {
741 	vm_page_t m;
742 	int domain, flind;
743 	struct vm_domain_iterator vi;
744 
745 	KASSERT(pool < VM_NFREEPOOL,
746 	    ("vm_phys_alloc_pages: pool %d is out of range", pool));
747 	KASSERT(order < VM_NFREEORDER,
748 	    ("vm_phys_alloc_pages: order %d is out of range", order));
749 
750 	vm_policy_iterator_init(&vi);
751 
752 	while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
753 		for (flind = 0; flind < vm_nfreelists; flind++) {
754 			m = vm_phys_alloc_domain_pages(domain, flind, pool,
755 			    order);
756 			if (m != NULL)
757 				return (m);
758 		}
759 	}
760 
761 	vm_policy_iterator_finish(&vi);
762 	return (NULL);
763 }
764 
765 /*
766  * Allocate a contiguous, power of two-sized set of physical pages from the
767  * specified free list.  The free list must be specified using one of the
768  * manifest constants VM_FREELIST_*.
769  *
770  * The free page queues must be locked.
771  */
772 vm_page_t
773 vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
774 {
775 	vm_page_t m;
776 	struct vm_domain_iterator vi;
777 	int domain;
778 
779 	KASSERT(freelist < VM_NFREELIST,
780 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
781 	    freelist));
782 	KASSERT(pool < VM_NFREEPOOL,
783 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
784 	KASSERT(order < VM_NFREEORDER,
785 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
786 
787 	vm_policy_iterator_init(&vi);
788 
789 	while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
790 		m = vm_phys_alloc_domain_pages(domain,
791 		    vm_freelist_to_flind[freelist], pool, order);
792 		if (m != NULL)
793 			return (m);
794 	}
795 
796 	vm_policy_iterator_finish(&vi);
797 	return (NULL);
798 }
799 
800 static vm_page_t
801 vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order)
802 {
803 	struct vm_freelist *fl;
804 	struct vm_freelist *alt;
805 	int oind, pind;
806 	vm_page_t m;
807 
808 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
809 	fl = &vm_phys_free_queues[domain][flind][pool][0];
810 	for (oind = order; oind < VM_NFREEORDER; oind++) {
811 		m = TAILQ_FIRST(&fl[oind].pl);
812 		if (m != NULL) {
813 			vm_freelist_rem(fl, m, oind);
814 			vm_phys_split_pages(m, oind, fl, order);
815 			return (m);
816 		}
817 	}
818 
819 	/*
820 	 * The given pool was empty.  Find the largest
821 	 * contiguous, power-of-two-sized set of pages in any
822 	 * pool.  Transfer these pages to the given pool, and
823 	 * use them to satisfy the allocation.
824 	 */
825 	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
826 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
827 			alt = &vm_phys_free_queues[domain][flind][pind][0];
828 			m = TAILQ_FIRST(&alt[oind].pl);
829 			if (m != NULL) {
830 				vm_freelist_rem(alt, m, oind);
831 				vm_phys_set_pool(pool, m, oind);
832 				vm_phys_split_pages(m, oind, fl, order);
833 				return (m);
834 			}
835 		}
836 	}
837 	return (NULL);
838 }
839 
840 /*
841  * Find the vm_page corresponding to the given physical address.
842  */
843 vm_page_t
844 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
845 {
846 	struct vm_phys_seg *seg;
847 	int segind;
848 
849 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
850 		seg = &vm_phys_segs[segind];
851 		if (pa >= seg->start && pa < seg->end)
852 			return (&seg->first_page[atop(pa - seg->start)]);
853 	}
854 	return (NULL);
855 }
856 
857 vm_page_t
858 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
859 {
860 	struct vm_phys_fictitious_seg tmp, *seg;
861 	vm_page_t m;
862 
863 	m = NULL;
864 	tmp.start = pa;
865 	tmp.end = 0;
866 
867 	rw_rlock(&vm_phys_fictitious_reg_lock);
868 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
869 	rw_runlock(&vm_phys_fictitious_reg_lock);
870 	if (seg == NULL)
871 		return (NULL);
872 
873 	m = &seg->first_page[atop(pa - seg->start)];
874 	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
875 
876 	return (m);
877 }
878 
879 static inline void
880 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
881     long page_count, vm_memattr_t memattr)
882 {
883 	long i;
884 
885 	bzero(range, page_count * sizeof(*range));
886 	for (i = 0; i < page_count; i++) {
887 		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
888 		range[i].oflags &= ~VPO_UNMANAGED;
889 		range[i].busy_lock = VPB_UNBUSIED;
890 	}
891 }
892 
893 int
894 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
895     vm_memattr_t memattr)
896 {
897 	struct vm_phys_fictitious_seg *seg;
898 	vm_page_t fp;
899 	long page_count;
900 #ifdef VM_PHYSSEG_DENSE
901 	long pi, pe;
902 	long dpage_count;
903 #endif
904 
905 	KASSERT(start < end,
906 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
907 	    (uintmax_t)start, (uintmax_t)end));
908 
909 	page_count = (end - start) / PAGE_SIZE;
910 
911 #ifdef VM_PHYSSEG_DENSE
912 	pi = atop(start);
913 	pe = atop(end);
914 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
915 		fp = &vm_page_array[pi - first_page];
916 		if ((pe - first_page) > vm_page_array_size) {
917 			/*
918 			 * We have a segment that starts inside
919 			 * of vm_page_array, but ends outside of it.
920 			 *
921 			 * Use vm_page_array pages for those that are
922 			 * inside of the vm_page_array range, and
923 			 * allocate the remaining ones.
924 			 */
925 			dpage_count = vm_page_array_size - (pi - first_page);
926 			vm_phys_fictitious_init_range(fp, start, dpage_count,
927 			    memattr);
928 			page_count -= dpage_count;
929 			start += ptoa(dpage_count);
930 			goto alloc;
931 		}
932 		/*
933 		 * We can allocate the full range from vm_page_array,
934 		 * so there's no need to register the range in the tree.
935 		 */
936 		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
937 		return (0);
938 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
939 		/*
940 		 * We have a segment that ends inside of vm_page_array,
941 		 * but starts outside of it.
942 		 */
943 		fp = &vm_page_array[0];
944 		dpage_count = pe - first_page;
945 		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
946 		    memattr);
947 		end -= ptoa(dpage_count);
948 		page_count -= dpage_count;
949 		goto alloc;
950 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
951 		/*
952 		 * Trying to register a fictitious range that extends before
953 		 * and after vm_page_array.
954 		 */
955 		return (EINVAL);
956 	} else {
957 alloc:
958 #endif
959 		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
960 		    M_WAITOK);
961 #ifdef VM_PHYSSEG_DENSE
962 	}
963 #endif
964 	vm_phys_fictitious_init_range(fp, start, page_count, memattr);
965 
966 	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
967 	seg->start = start;
968 	seg->end = end;
969 	seg->first_page = fp;
970 
971 	rw_wlock(&vm_phys_fictitious_reg_lock);
972 	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
973 	rw_wunlock(&vm_phys_fictitious_reg_lock);
974 
975 	return (0);
976 }
977 
978 void
979 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
980 {
981 	struct vm_phys_fictitious_seg *seg, tmp;
982 #ifdef VM_PHYSSEG_DENSE
983 	long pi, pe;
984 #endif
985 
986 	KASSERT(start < end,
987 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
988 	    (uintmax_t)start, (uintmax_t)end));
989 
990 #ifdef VM_PHYSSEG_DENSE
991 	pi = atop(start);
992 	pe = atop(end);
993 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
994 		if ((pe - first_page) <= vm_page_array_size) {
995 			/*
996 			 * This segment was allocated using vm_page_array
997 			 * only; there's nothing to do since those pages
998 			 * were never added to the tree.
999 			 */
1000 			return;
1001 		}
1002 		/*
1003 		 * We have a segment that starts inside
1004 		 * of vm_page_array, but ends outside of it.
1005 		 *
1006 		 * Calculate how many pages were added to the
1007 		 * tree and free them.
1008 		 */
1009 		start = ptoa(first_page + vm_page_array_size);
1010 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1011 		/*
1012 		 * We have a segment that ends inside of vm_page_array,
1013 		 * but starts outside of it.
1014 		 */
1015 		end = ptoa(first_page);
1016 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1017 		/* Since it's not possible to register such a range, panic. */
1018 		panic(
1019 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
1020 		    (uintmax_t)start, (uintmax_t)end);
1021 	}
1022 #endif
1023 	tmp.start = start;
1024 	tmp.end = 0;
1025 
1026 	rw_wlock(&vm_phys_fictitious_reg_lock);
1027 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1028 	if (seg->start != start || seg->end != end) {
1029 		rw_wunlock(&vm_phys_fictitious_reg_lock);
1030 		panic(
1031 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
1032 		    (uintmax_t)start, (uintmax_t)end);
1033 	}
1034 	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
1035 	rw_wunlock(&vm_phys_fictitious_reg_lock);
1036 	free(seg->first_page, M_FICT_PAGES);
1037 	free(seg, M_FICT_PAGES);
1038 }
1039 
1040 /*
1041  * Find the segment containing the given physical address.
1042  */
1043 int
1044 vm_phys_paddr_to_segind(vm_paddr_t pa)
1045 {
1046 	struct vm_phys_seg *seg;
1047 	int segind;
1048 
1049 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
1050 		seg = &vm_phys_segs[segind];
1051 		if (pa >= seg->start && pa < seg->end)
1052 			return (segind);
1053 	}
1054 	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
1055 	    (uintmax_t)pa);
1056 }
1057 
1058 /*
1059  * Free a contiguous, power of two-sized set of physical pages.
1060  *
1061  * The free page queues must be locked.
1062  */
1063 void
1064 vm_phys_free_pages(vm_page_t m, int order)
1065 {
1066 	struct vm_freelist *fl;
1067 	struct vm_phys_seg *seg;
1068 	vm_paddr_t pa;
1069 	vm_page_t m_buddy;
1070 
1071 	KASSERT(m->order == VM_NFREEORDER,
1072 	    ("vm_phys_free_pages: page %p has unexpected order %d",
1073 	    m, m->order));
1074 	KASSERT(m->pool < VM_NFREEPOOL,
1075 	    ("vm_phys_free_pages: page %p has unexpected pool %d",
1076 	    m, m->pool));
1077 	KASSERT(order < VM_NFREEORDER,
1078 	    ("vm_phys_free_pages: order %d is out of range", order));
1079 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1080 	seg = &vm_phys_segs[m->segind];
1081 	if (order < VM_NFREEORDER - 1) {
1082 		pa = VM_PAGE_TO_PHYS(m);
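		/*
		 * Repeatedly coalesce with the buddy block: flipping bit
		 * (PAGE_SHIFT + order) of the physical address gives the
		 * buddy's address.  Stop when the buddy lies outside this
		 * segment or is not itself free at the same order;
		 * otherwise merge the two blocks and retry at the next
		 * higher order.
		 */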
1083 		do {
1084 			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
1085 			if (pa < seg->start || pa >= seg->end)
1086 				break;
1087 			m_buddy = &seg->first_page[atop(pa - seg->start)];
1088 			if (m_buddy->order != order)
1089 				break;
1090 			fl = (*seg->free_queues)[m_buddy->pool];
1091 			vm_freelist_rem(fl, m_buddy, order);
1092 			if (m_buddy->pool != m->pool)
1093 				vm_phys_set_pool(m->pool, m_buddy, order);
1094 			order++;
1095 			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
1096 			m = &seg->first_page[atop(pa - seg->start)];
1097 		} while (order < VM_NFREEORDER - 1);
1098 	}
1099 	fl = (*seg->free_queues)[m->pool];
1100 	vm_freelist_add(fl, m, order, 1);
1101 }
1102 
1103 /*
1104  * Free a contiguous, arbitrarily sized set of physical pages.
1105  *
1106  * The free page queues must be locked.
1107  */
1108 void
1109 vm_phys_free_contig(vm_page_t m, u_long npages)
1110 {
1111 	u_int n;
1112 	int order;
1113 
1114 	/*
1115 	 * Avoid unnecessary coalescing by freeing the pages in the largest
1116 	 * possible power-of-two-sized subsets.
1117 	 */
1118 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1119 	for (;; npages -= n) {
1120 		/*
1121 		 * Unsigned "min" is used here so that "order" is assigned
1122 		 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
1123 		 * or the low-order bits of its physical address are zero
1124 		 * because the size of a physical address exceeds the size of
1125 		 * a long.
1126 		 */
1127 		order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
1128 		    VM_NFREEORDER - 1);
1129 		n = 1 << order;
1130 		if (npages < n)
1131 			break;
1132 		vm_phys_free_pages(m, order);
1133 		m += n;
1134 	}
1135 	/* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
1136 	for (; npages > 0; npages -= n) {
1137 		order = flsl(npages) - 1;
1138 		n = 1 << order;
1139 		vm_phys_free_pages(m, order);
1140 		m += n;
1141 	}
1142 }
1143 
1144 /*
1145  * Scan physical memory between the specified addresses "low" and "high" for a
1146  * run of contiguous physical pages that satisfy the specified conditions, and
1147  * return the lowest page in the run.  The specified "alignment" determines
1148  * the alignment of the lowest physical page in the run.  If the specified
1149  * "boundary" is non-zero, then the run of physical pages cannot span a
1150  * physical address that is a multiple of "boundary".
1151  *
1152  * "npages" must be greater than zero.  Both "alignment" and "boundary" must
1153  * be a power of two.
1154  */
1155 vm_page_t
1156 vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
1157     u_long alignment, vm_paddr_t boundary, int options)
1158 {
1159 	vm_paddr_t pa_end;
1160 	vm_page_t m_end, m_run, m_start;
1161 	struct vm_phys_seg *seg;
1162 	int segind;
1163 
1164 	KASSERT(npages > 0, ("npages is 0"));
1165 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1166 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1167 	if (low >= high)
1168 		return (NULL);
1169 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
1170 		seg = &vm_phys_segs[segind];
1171 		if (seg->start >= high)
1172 			break;
1173 		if (low >= seg->end)
1174 			continue;
1175 		if (low <= seg->start)
1176 			m_start = seg->first_page;
1177 		else
1178 			m_start = &seg->first_page[atop(low - seg->start)];
1179 		if (high < seg->end)
1180 			pa_end = high;
1181 		else
1182 			pa_end = seg->end;
1183 		if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
1184 			continue;
1185 		m_end = &seg->first_page[atop(pa_end - seg->start)];
1186 		m_run = vm_page_scan_contig(npages, m_start, m_end,
1187 		    alignment, boundary, options);
1188 		if (m_run != NULL)
1189 			return (m_run);
1190 	}
1191 	return (NULL);
1192 }
1193 
1194 /*
1195  * Set the pool for a contiguous, power of two-sized set of physical pages.
1196  */
1197 void
1198 vm_phys_set_pool(int pool, vm_page_t m, int order)
1199 {
1200 	vm_page_t m_tmp;
1201 
1202 	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
1203 		m_tmp->pool = pool;
1204 }
1205 
1206 /*
1207  * Search for the given physical page "m" in the free lists.  If the search
1208  * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
1209  * FALSE, indicating that "m" is not in the free lists.
1210  *
1211  * The free page queues must be locked.
1212  */
1213 boolean_t
1214 vm_phys_unfree_page(vm_page_t m)
1215 {
1216 	struct vm_freelist *fl;
1217 	struct vm_phys_seg *seg;
1218 	vm_paddr_t pa, pa_half;
1219 	vm_page_t m_set, m_tmp;
1220 	int order;
1221 
1222 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1223 
1224 	/*
1225 	 * First, find the contiguous, power of two-sized set of free
1226 	 * physical pages containing the given physical page "m" and
1227 	 * assign it to "m_set".
1228 	 */
1229 	seg = &vm_phys_segs[m->segind];
1230 	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1231 	    order < VM_NFREEORDER - 1; ) {
1232 		order++;
1233 		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
1234 		if (pa >= seg->start)
1235 			m_set = &seg->first_page[atop(pa - seg->start)];
1236 		else
1237 			return (FALSE);
1238 	}
1239 	if (m_set->order < order)
1240 		return (FALSE);
1241 	if (m_set->order == VM_NFREEORDER)
1242 		return (FALSE);
1243 	KASSERT(m_set->order < VM_NFREEORDER,
1244 	    ("vm_phys_unfree_page: page %p has unexpected order %d",
1245 	    m_set, m_set->order));
1246 
1247 	/*
1248 	 * Next, remove "m_set" from the free lists.  Finally, extract
1249 	 * "m" from "m_set" using an iterative algorithm: While "m_set"
1250 	 * is larger than a page, shrink "m_set" by returning the half
1251 	 * of "m_set" that does not contain "m" to the free lists.
1252 	 */
1253 	fl = (*seg->free_queues)[m_set->pool];
1254 	order = m_set->order;
1255 	vm_freelist_rem(fl, m_set, order);
1256 	while (order > 0) {
1257 		order--;
1258 		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
1259 		if (m->phys_addr < pa_half)
1260 			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
1261 		else {
1262 			m_tmp = m_set;
1263 			m_set = &seg->first_page[atop(pa_half - seg->start)];
1264 		}
1265 		vm_freelist_add(fl, m_tmp, order, 0);
1266 	}
1267 	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
1268 	return (TRUE);
1269 }
1270 
1271 /*
1272  * Allocate a contiguous set of physical pages of the given size
1273  * "npages" from the free lists.  All of the physical pages must be at
1274  * or above the given physical address "low" and below the given
1275  * physical address "high".  The given value "alignment" determines the
1276  * alignment of the first physical page in the set.  If the given value
1277  * "boundary" is non-zero, then the set of physical pages cannot cross
1278  * any physical address boundary that is a multiple of that value.  Both
1279  * "alignment" and "boundary" must be a power of two.
1280  */
1281 vm_page_t
1282 vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
1283     u_long alignment, vm_paddr_t boundary)
1284 {
1285 	vm_paddr_t pa_end, pa_start;
1286 	vm_page_t m_run;
1287 	struct vm_domain_iterator vi;
1288 	struct vm_phys_seg *seg;
1289 	int domain, segind;
1290 
1291 	KASSERT(npages > 0, ("npages is 0"));
1292 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1293 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1294 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1295 	if (low >= high)
1296 		return (NULL);
1297 	vm_policy_iterator_init(&vi);
1298 restartdom:
1299 	if (vm_domain_iterator_run(&vi, &domain) != 0) {
1300 		vm_policy_iterator_finish(&vi);
1301 		return (NULL);
1302 	}
1303 	m_run = NULL;
1304 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1305 		seg = &vm_phys_segs[segind];
1306 		if (seg->start >= high || seg->domain != domain)
1307 			continue;
1308 		if (low >= seg->end)
1309 			break;
1310 		if (low <= seg->start)
1311 			pa_start = seg->start;
1312 		else
1313 			pa_start = low;
1314 		if (high < seg->end)
1315 			pa_end = high;
1316 		else
1317 			pa_end = seg->end;
1318 		if (pa_end - pa_start < ptoa(npages))
1319 			continue;
1320 		m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
1321 		    alignment, boundary);
1322 		if (m_run != NULL)
1323 			break;
1324 	}
1325 	if (m_run == NULL && !vm_domain_iterator_isdone(&vi))
1326 		goto restartdom;
1327 	vm_policy_iterator_finish(&vi);
1328 	return (m_run);
1329 }
1330 
1331 /*
1332  * Allocate a run of contiguous physical pages from the free list for the
1333  * specified segment.
1334  */
1335 static vm_page_t
1336 vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
1337     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1338 {
1339 	struct vm_freelist *fl;
1340 	vm_paddr_t pa, pa_end, size;
1341 	vm_page_t m, m_ret;
1342 	u_long npages_end;
1343 	int oind, order, pind;
1344 
1345 	KASSERT(npages > 0, ("npages is 0"));
1346 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1347 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1348 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1349 	/* Compute the queue that is the best fit for npages. */
1350 	for (order = 0; (1 << order) < npages; order++);
1351 	/* Search for a run satisfying the specified conditions. */
1352 	size = npages << PAGE_SHIFT;
1353 	for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
1354 	    oind++) {
1355 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1356 			fl = (*seg->free_queues)[pind];
1357 			TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
1358 				/*
1359 				 * Is the size of this allocation request
1360 				 * larger than the largest block size?
1361 				 */
1362 				if (order >= VM_NFREEORDER) {
1363 					/*
1364 					 * Determine if a sufficient number of
1365 					 * subsequent blocks to satisfy the
1366 					 * allocation request are free.
1367 					 */
1368 					pa = VM_PAGE_TO_PHYS(m_ret);
1369 					pa_end = pa + size;
1370 					for (;;) {
1371 						pa += 1 << (PAGE_SHIFT +
1372 						    VM_NFREEORDER - 1);
1373 						if (pa >= pa_end ||
1374 						    pa < seg->start ||
1375 						    pa >= seg->end)
1376 							break;
1377 						m = &seg->first_page[atop(pa -
1378 						    seg->start)];
1379 						if (m->order != VM_NFREEORDER -
1380 						    1)
1381 							break;
1382 					}
1383 					/* If not, go to the next block. */
1384 					if (pa < pa_end)
1385 						continue;
1386 				}
1387 
1388 				/*
1389 				 * Determine if the blocks are within the
1390 				 * given range, satisfy the given alignment,
1391 				 * and do not cross the given boundary.
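				 *
				 * The boundary test relies on the fact that
				 * rounddown2(pa ^ (pa_end - 1), boundary) is
				 * zero exactly when the first and last byte
				 * of the run agree in every address bit at
				 * or above log2(boundary), i.e., the run
				 * does not cross a boundary-aligned address
				 * (trivially so when "boundary" is zero).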
1392 				 */
1393 				pa = VM_PAGE_TO_PHYS(m_ret);
1394 				pa_end = pa + size;
1395 				if (pa >= low && pa_end <= high &&
1396 				    (pa & (alignment - 1)) == 0 &&
1397 				    rounddown2(pa ^ (pa_end - 1), boundary) == 0)
1398 					goto done;
1399 			}
1400 		}
1401 	}
1402 	return (NULL);
1403 done:
1404 	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
1405 		fl = (*seg->free_queues)[m->pool];
1406 		vm_freelist_rem(fl, m, m->order);
1407 	}
1408 	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
1409 		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
1410 	fl = (*seg->free_queues)[m_ret->pool];
1411 	vm_phys_split_pages(m_ret, oind, fl, order);
1412 	/* Return excess pages to the free lists. */
1413 	npages_end = roundup2(npages, 1 << imin(oind, order));
1414 	if (npages < npages_end)
1415 		vm_phys_free_contig(&m_ret[npages], npages_end - npages);
1416 	return (m_ret);
1417 }
1418 
1419 #ifdef DDB
1420 /*
1421  * Show the number of physical pages in each of the free lists.
1422  */
1423 DB_SHOW_COMMAND(freepages, db_show_freepages)
1424 {
1425 	struct vm_freelist *fl;
1426 	int flind, oind, pind, dom;
1427 
1428 	for (dom = 0; dom < vm_ndomains; dom++) {
1429 		db_printf("DOMAIN: %d\n", dom);
1430 		for (flind = 0; flind < vm_nfreelists; flind++) {
1431 			db_printf("FREE LIST %d:\n"
1432 			    "\n  ORDER (SIZE)  |  NUMBER"
1433 			    "\n              ", flind);
1434 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
1435 				db_printf("  |  POOL %d", pind);
1436 			db_printf("\n--            ");
1437 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
1438 				db_printf("-- --      ");
1439 			db_printf("--\n");
1440 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
1441 				db_printf("  %2.2d (%6.6dK)", oind,
1442 				    1 << (PAGE_SHIFT - 10 + oind));
1443 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1444 				fl = vm_phys_free_queues[dom][flind][pind];
1445 					db_printf("  |  %6.6d", fl[oind].lcnt);
1446 				}
1447 				db_printf("\n");
1448 			}
1449 			db_printf("\n");
1450 		}
1451 		db_printf("\n");
1452 	}
1453 }
1454 #endif
1455