xref: /freebsd/sys/vm/vm_phys.c (revision 574ef650695088d56ea12df7da76155370286f9f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2002-2006 Rice University
5  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
6  * All rights reserved.
7  *
8  * This software was developed for the FreeBSD Project by Alan L. Cox,
9  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
27  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
30  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  *	Physical memory system implementation
36  *
37  * Any external functions defined by this module are only to be used by the
38  * virtual memory system.
39  */
40 
41 #include <sys/cdefs.h>
42 #include "opt_ddb.h"
43 #include "opt_vm.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/domainset.h>
48 #include <sys/lock.h>
49 #include <sys/kernel.h>
50 #include <sys/malloc.h>
51 #include <sys/mutex.h>
52 #include <sys/proc.h>
53 #include <sys/queue.h>
54 #include <sys/rwlock.h>
55 #include <sys/sbuf.h>
56 #include <sys/sysctl.h>
57 #include <sys/tree.h>
58 #include <sys/vmmeter.h>
59 
60 #include <ddb/ddb.h>
61 
62 #include <vm/vm.h>
63 #include <vm/vm_extern.h>
64 #include <vm/vm_param.h>
65 #include <vm/vm_kern.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_phys.h>
69 #include <vm/vm_pagequeue.h>
70 
71 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
72     "Too many physsegs.");
73 _Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
74     "vm_paddr_t too big for ffsll, flsll.");
75 
76 #ifdef NUMA
77 struct mem_affinity __read_mostly *mem_affinity;
78 int __read_mostly *mem_locality;
79 
80 static int numa_disabled;
81 static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
82     "NUMA options");
83 SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
84     &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
85 #endif
86 
87 int __read_mostly vm_ndomains = 1;
88 domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);
89 
90 struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
91 int __read_mostly vm_phys_nsegs;
92 static struct vm_phys_seg vm_phys_early_segs[8];
93 static int vm_phys_early_nsegs;
94 
95 struct vm_phys_fictitious_seg;
96 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
97     struct vm_phys_fictitious_seg *);
98 
99 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
100     RB_INITIALIZER(&vm_phys_fictitious_tree);
101 
102 struct vm_phys_fictitious_seg {
103 	RB_ENTRY(vm_phys_fictitious_seg) node;
104 	/* Memory region data */
105 	vm_paddr_t	start;
106 	vm_paddr_t	end;
107 	vm_page_t	first_page;
108 };
109 
110 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
111     vm_phys_fictitious_cmp);
112 
113 static struct rwlock_padalign vm_phys_fictitious_reg_lock;
114 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
115 
116 static struct vm_freelist __aligned(CACHE_LINE_SIZE)
117     vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
118     [VM_NFREEORDER_MAX];
119 
120 static int __read_mostly vm_nfreelists;
121 
122 /*
123  * These "avail lists" are globals used to communicate boot-time physical
124  * memory layout to other parts of the kernel.  Each physically contiguous
125  * region of memory is defined by a start address at an even index and an
126  * end address at the following odd index.  Each list is terminated by a
127  * pair of zero entries.
128  *
129  * dump_avail tells the dump code what regions to include in a crash dump, and
130  * phys_avail is all of the remaining physical memory that is available for
131  * the vm system.
132  *
133  * Initially dump_avail and phys_avail are identical.  Boot time memory
134  * allocations remove extents from phys_avail that may still be included
135  * in dumps.
136  */
137 vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
138 vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
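
/*
 * Illustrative example (the addresses are hypothetical): a machine with
 * usable RAM at [4 KB, 636 KB) and [1 MB, 1 GB) would start out with
 *
 *	phys_avail[] = { 0x1000, 0x9f000, 0x100000, 0x40000000, 0, 0 };
 *
 * and consumers walk the start/end pairs until the zero terminator:
 *
 *	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 *		size += phys_avail[i + 1] - phys_avail[i];
 */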
139 
140 /*
141  * Provides the mapping from VM_FREELIST_* to free list indices (flind).
142  */
143 static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
144 
145 CTASSERT(VM_FREELIST_DEFAULT == 0);
146 
147 #ifdef VM_FREELIST_DMA32
148 #define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
149 #endif
150 
151 /*
152  * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
153  * the ordering of the free list boundaries.
154  */
155 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
156 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
157 #endif
158 
159 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
160 SYSCTL_OID(_vm, OID_AUTO, phys_free,
161     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
162     sysctl_vm_phys_free, "A",
163     "Phys Free Info");
164 
165 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
166 SYSCTL_OID(_vm, OID_AUTO, phys_segs,
167     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
168     sysctl_vm_phys_segs, "A",
169     "Phys Seg Info");
170 
171 #ifdef NUMA
172 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
173 SYSCTL_OID(_vm, OID_AUTO, phys_locality,
174     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
175     sysctl_vm_phys_locality, "A",
176     "Phys Locality Info");
177 #endif
178 
179 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
180     &vm_ndomains, 0, "Number of physical memory domains available.");
181 
182 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
183 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
184 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
185     int order, int tail);
186 
187 /*
188  * Red-black tree helpers for vm fictitious range management.
189  */
190 static inline int
191 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
192     struct vm_phys_fictitious_seg *range)
193 {
194 
195 	KASSERT(range->start != 0 && range->end != 0,
196 	    ("Invalid range passed on search for vm_fictitious page"));
197 	if (p->start >= range->end)
198 		return (1);
199 	if (p->start < range->start)
200 		return (-1);
201 
202 	return (0);
203 }
204 
205 static int
206 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
207     struct vm_phys_fictitious_seg *p2)
208 {
209 
210 	/* Check if this is a search for a page */
211 	if (p1->end == 0)
212 		return (vm_phys_fictitious_in_range(p1, p2));
213 
214 	KASSERT(p2->end != 0,
215     ("Invalid range passed as second parameter to vm fictitious comparison"));
216 
217 	/* Searching to add a new range */
218 	if (p1->end <= p2->start)
219 		return (-1);
220 	if (p1->start >= p2->end)
221 		return (1);
222 
223 	panic("Trying to add overlapping vm fictitious ranges:\n"
224 	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
225 	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
226 }
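
/*
 * A point lookup, as done by vm_phys_fictitious_to_vm_page() below, encodes
 * the query as a degenerate segment with end == 0 so that the comparator
 * above hands it to vm_phys_fictitious_in_range():
 *
 *	tmp.start = pa;
 *	tmp.end = 0;
 *	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 */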
227 
228 int
229 vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
230     vm_paddr_t high __numa_used)
231 {
232 #ifdef NUMA
233 	domainset_t mask;
234 	int i;
235 
236 	if (vm_ndomains == 1 || mem_affinity == NULL)
237 		return (0);
238 
239 	DOMAINSET_ZERO(&mask);
240 	/*
241 	 * Check for any memory that overlaps low, high.
242 	 */
243 	for (i = 0; mem_affinity[i].end != 0; i++)
244 		if (mem_affinity[i].start <= high &&
245 		    mem_affinity[i].end >= low)
246 			DOMAINSET_SET(mem_affinity[i].domain, &mask);
247 	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
248 		return (prefer);
249 	if (DOMAINSET_EMPTY(&mask))
250 		panic("vm_phys_domain_match:  Impossible constraint");
251 	return (DOMAINSET_FFS(&mask) - 1);
252 #else
253 	return (0);
254 #endif
255 }
256 
257 /*
258  * Outputs the state of the physical memory allocator, specifically,
259  * the amount of physical memory in each free list.
260  */
261 static int
262 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
263 {
264 	struct sbuf sbuf;
265 	struct vm_freelist *fl;
266 	int dom, error, flind, oind, pind;
267 
268 	error = sysctl_wire_old_buffer(req, 0);
269 	if (error != 0)
270 		return (error);
271 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
272 	for (dom = 0; dom < vm_ndomains; dom++) {
273 		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
274 		for (flind = 0; flind < vm_nfreelists; flind++) {
275 			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
276 			    "\n  ORDER (SIZE)  |  NUMBER"
277 			    "\n              ", flind);
278 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
279 				sbuf_printf(&sbuf, "  |  POOL %d", pind);
280 			sbuf_printf(&sbuf, "\n--            ");
281 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
282 				sbuf_printf(&sbuf, "-- --      ");
283 			sbuf_printf(&sbuf, "--\n");
284 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
285 				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
286 				    1 << (PAGE_SHIFT - 10 + oind));
287 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
288 				fl = vm_phys_free_queues[dom][flind][pind];
289 					sbuf_printf(&sbuf, "  |  %6d",
290 					    fl[oind].lcnt);
291 				}
292 				sbuf_printf(&sbuf, "\n");
293 			}
294 		}
295 	}
296 	error = sbuf_finish(&sbuf);
297 	sbuf_delete(&sbuf);
298 	return (error);
299 }
300 
301 /*
302  * Outputs the set of physical memory segments.
303  */
304 static int
305 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
306 {
307 	struct sbuf sbuf;
308 	struct vm_phys_seg *seg;
309 	int error, segind;
310 
311 	error = sysctl_wire_old_buffer(req, 0);
312 	if (error != 0)
313 		return (error);
314 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
315 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
316 		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
317 		seg = &vm_phys_segs[segind];
318 		sbuf_printf(&sbuf, "start:     %#jx\n",
319 		    (uintmax_t)seg->start);
320 		sbuf_printf(&sbuf, "end:       %#jx\n",
321 		    (uintmax_t)seg->end);
322 		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
323 		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
324 	}
325 	error = sbuf_finish(&sbuf);
326 	sbuf_delete(&sbuf);
327 	return (error);
328 }
329 
330 /*
331  * Return affinity, or -1 if there's no affinity information.
332  */
333 int
334 vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
335 {
336 
337 #ifdef NUMA
338 	if (mem_locality == NULL)
339 		return (-1);
340 	if (f >= vm_ndomains || t >= vm_ndomains)
341 		return (-1);
342 	return (mem_locality[f * vm_ndomains + t]);
343 #else
344 	return (-1);
345 #endif
346 }
347 
348 #ifdef NUMA
349 /*
350  * Outputs the VM locality table.
351  */
352 static int
353 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
354 {
355 	struct sbuf sbuf;
356 	int error, i, j;
357 
358 	error = sysctl_wire_old_buffer(req, 0);
359 	if (error != 0)
360 		return (error);
361 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
362 
363 	sbuf_printf(&sbuf, "\n");
364 
365 	for (i = 0; i < vm_ndomains; i++) {
366 		sbuf_printf(&sbuf, "%d: ", i);
367 		for (j = 0; j < vm_ndomains; j++) {
368 			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
369 		}
370 		sbuf_printf(&sbuf, "\n");
371 	}
372 	error = sbuf_finish(&sbuf);
373 	sbuf_delete(&sbuf);
374 	return (error);
375 }
376 #endif
377 
378 static void
379 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
380 {
381 
382 	m->order = order;
383 	if (tail)
384 		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
385 	else
386 		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
387 	fl[order].lcnt++;
388 }
389 
390 static void
391 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
392 {
393 
394 	TAILQ_REMOVE(&fl[order].pl, m, listq);
395 	fl[order].lcnt--;
396 	m->order = VM_NFREEORDER;
397 }
398 
399 /*
400  * Create a physical memory segment.
401  */
402 static void
403 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
404 {
405 	struct vm_phys_seg *seg;
406 
407 	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
408 	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
409 	KASSERT(domain >= 0 && domain < vm_ndomains,
410 	    ("vm_phys_create_seg: invalid domain provided"));
411 	seg = &vm_phys_segs[vm_phys_nsegs++];
412 	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
413 		*seg = *(seg - 1);
414 		seg--;
415 	}
416 	seg->start = start;
417 	seg->end = end;
418 	seg->domain = domain;
419 }
420 
421 static void
422 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
423 {
424 #ifdef NUMA
425 	int i;
426 
427 	if (mem_affinity == NULL) {
428 		_vm_phys_create_seg(start, end, 0);
429 		return;
430 	}
431 
432 	for (i = 0;; i++) {
433 		if (mem_affinity[i].end == 0)
434 			panic("Reached end of affinity info");
435 		if (mem_affinity[i].end <= start)
436 			continue;
437 		if (mem_affinity[i].start > start)
438 			panic("No affinity info for start %jx",
439 			    (uintmax_t)start);
440 		if (mem_affinity[i].end >= end) {
441 			_vm_phys_create_seg(start, end,
442 			    mem_affinity[i].domain);
443 			break;
444 		}
445 		_vm_phys_create_seg(start, mem_affinity[i].end,
446 		    mem_affinity[i].domain);
447 		start = mem_affinity[i].end;
448 	}
449 #else
450 	_vm_phys_create_seg(start, end, 0);
451 #endif
452 }
453 
454 /*
455  * Add a physical memory segment.
456  */
457 void
458 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
459 {
460 	vm_paddr_t paddr;
461 
462 	KASSERT((start & PAGE_MASK) == 0,
463 	    ("vm_phys_add_seg: start is not page aligned"));
464 	KASSERT((end & PAGE_MASK) == 0,
465 	    ("vm_phys_add_seg: end is not page aligned"));
466 
467 	/*
468 	 * Split the physical memory segment if it spans two or more free
469 	 * list boundaries.
470 	 */
471 	paddr = start;
472 #ifdef	VM_FREELIST_LOWMEM
473 	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
474 		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
475 		paddr = VM_LOWMEM_BOUNDARY;
476 	}
477 #endif
478 #ifdef	VM_FREELIST_DMA32
479 	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
480 		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
481 		paddr = VM_DMA32_BOUNDARY;
482 	}
483 #endif
484 	vm_phys_create_seg(paddr, end);
485 }
486 
487 /*
488  * Initialize the physical memory allocator.
489  *
490  * Requires that vm_page_array is initialized!
491  */
492 void
493 vm_phys_init(void)
494 {
495 	struct vm_freelist *fl;
496 	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
497 #if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
498 	u_long npages;
499 #endif
500 	int dom, flind, freelist, oind, pind, segind;
501 
502 	/*
503 	 * Compute the number of free lists, and generate the mapping from the
504 	 * manifest constants VM_FREELIST_* to the free list indices.
505 	 *
506 	 * Initially, the entries of vm_freelist_to_flind[] are set to either
507 	 * 0 or 1 to indicate which free lists should be created.
508 	 */
509 #ifdef	VM_DMA32_NPAGES_THRESHOLD
510 	npages = 0;
511 #endif
512 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
513 		seg = &vm_phys_segs[segind];
514 #ifdef	VM_FREELIST_LOWMEM
515 		if (seg->end <= VM_LOWMEM_BOUNDARY)
516 			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
517 		else
518 #endif
519 #ifdef	VM_FREELIST_DMA32
520 		if (
521 #ifdef	VM_DMA32_NPAGES_THRESHOLD
522 		    /*
523 		     * Create the DMA32 free list only if the amount of
524 		     * physical memory above physical address 4G exceeds the
525 		     * given threshold.
526 		     */
527 		    npages > VM_DMA32_NPAGES_THRESHOLD &&
528 #endif
529 		    seg->end <= VM_DMA32_BOUNDARY)
530 			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
531 		else
532 #endif
533 		{
534 #ifdef	VM_DMA32_NPAGES_THRESHOLD
535 			npages += atop(seg->end - seg->start);
536 #endif
537 			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
538 		}
539 	}
540 	/* Change each entry into a running total of the free lists. */
541 	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
542 		vm_freelist_to_flind[freelist] +=
543 		    vm_freelist_to_flind[freelist - 1];
544 	}
545 	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
546 	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
547 	/* Change each entry into a free list index. */
548 	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
549 		vm_freelist_to_flind[freelist]--;
550 
551 	/*
552 	 * Initialize the first_page and free_queues fields of each physical
553 	 * memory segment.
554 	 */
555 #ifdef VM_PHYSSEG_SPARSE
556 	npages = 0;
557 #endif
558 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
559 		seg = &vm_phys_segs[segind];
560 #ifdef VM_PHYSSEG_SPARSE
561 		seg->first_page = &vm_page_array[npages];
562 		npages += atop(seg->end - seg->start);
563 #else
564 		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
565 #endif
566 #ifdef	VM_FREELIST_LOWMEM
567 		if (seg->end <= VM_LOWMEM_BOUNDARY) {
568 			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
569 			KASSERT(flind >= 0,
570 			    ("vm_phys_init: LOWMEM flind < 0"));
571 		} else
572 #endif
573 #ifdef	VM_FREELIST_DMA32
574 		if (seg->end <= VM_DMA32_BOUNDARY) {
575 			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
576 			KASSERT(flind >= 0,
577 			    ("vm_phys_init: DMA32 flind < 0"));
578 		} else
579 #endif
580 		{
581 			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
582 			KASSERT(flind >= 0,
583 			    ("vm_phys_init: DEFAULT flind < 0"));
584 		}
585 		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
586 	}
587 
588 	/*
589 	 * Coalesce physical memory segments that are contiguous and share the
590 	 * same per-domain free queues.
591 	 */
592 	prev_seg = vm_phys_segs;
593 	seg = &vm_phys_segs[1];
594 	end_seg = &vm_phys_segs[vm_phys_nsegs];
595 	while (seg < end_seg) {
596 		if (prev_seg->end == seg->start &&
597 		    prev_seg->free_queues == seg->free_queues) {
598 			prev_seg->end = seg->end;
599 			KASSERT(prev_seg->domain == seg->domain,
600 			    ("vm_phys_init: free queues cannot span domains"));
601 			vm_phys_nsegs--;
602 			end_seg--;
603 			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
604 				*tmp_seg = *(tmp_seg + 1);
605 		} else {
606 			prev_seg = seg;
607 			seg++;
608 		}
609 	}
610 
611 	/*
612 	 * Initialize the free queues.
613 	 */
614 	for (dom = 0; dom < vm_ndomains; dom++) {
615 		for (flind = 0; flind < vm_nfreelists; flind++) {
616 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
617 				fl = vm_phys_free_queues[dom][flind][pind];
618 				for (oind = 0; oind < VM_NFREEORDER; oind++)
619 					TAILQ_INIT(&fl[oind].pl);
620 			}
621 		}
622 	}
623 
624 	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
625 }
626 
627 /*
628  * Register info about the NUMA topology of the system.
629  *
630  * Invoked by platform-dependent code prior to vm_phys_init().
631  */
632 void
633 vm_phys_register_domains(int ndomains __numa_used,
634     struct mem_affinity *affinity __numa_used, int *locality __numa_used)
635 {
636 #ifdef NUMA
637 	int i;
638 
639 	/*
640 	 * For now the only override value that we support is 1, which
641 	 * effectively disables NUMA-awareness in the allocators.
642 	 */
643 	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
644 	if (numa_disabled)
645 		ndomains = 1;
646 
647 	if (ndomains > 1) {
648 		vm_ndomains = ndomains;
649 		mem_affinity = affinity;
650 		mem_locality = locality;
651 	}
652 
653 	for (i = 0; i < vm_ndomains; i++)
654 		DOMAINSET_SET(i, &all_domains);
655 #endif
656 }
657 
658 /*
659  * Split a contiguous, power of two-sized set of physical pages.
660  *
661  * When this function is called by a page allocation function, the caller
662  * should request insertion at the head unless the order [order, oind) queues
663  * are known to be empty.  The objective is to reduce the likelihood of
664  * long-term fragmentation by promoting contemporaneous allocation and
665  * (hopefully) deallocation.
666  */
667 static __inline void
668 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
669     int tail)
670 {
671 	vm_page_t m_buddy;
672 	int pool = m->pool;
673 
674 	while (oind > order) {
675 		oind--;
676 		m_buddy = &m[1 << oind];
677 		KASSERT(m_buddy->order == VM_NFREEORDER,
678 		    ("vm_phys_split_pages: page %p has unexpected order %d",
679 		    m_buddy, m_buddy->order));
680 		KASSERT(m_buddy->pool == VM_NFREEPOOL,
681 		    ("vm_phys_split_pages: page %p has unexpected pool %d",
682 		    m_buddy, m_buddy->pool));
683 		m_buddy->pool = pool;
684 		vm_freelist_add(fl, m_buddy, oind, tail);
685 	}
686 }
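
/*
 * Worked example for vm_phys_split_pages() above: splitting an order 3
 * block (8 pages) down to an order 0 allocation of its first page returns
 * the upper halves to the free lists at successively smaller orders: pages
 * [4, 8) at order 2, pages [2, 4) at order 1, and page 1 at order 0,
 * leaving page 0 for the caller.
 */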
687 
688 /*
689  * Add the physical pages [m, m + npages) at the beginning of a power-of-two
690  * aligned and sized set to the specified free list.
691  *
692  * When this function is called by a page allocation function, the caller
693  * should request insertion at the head unless the lower-order queues are
694  * known to be empty.  The objective is to reduce the likelihood of long-
695  * term fragmentation by promoting contemporaneous allocation and (hopefully)
696  * deallocation.
697  *
698  * The physical page m's buddy must not be free.
699  */
700 static void
701 vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
702     int tail)
703 {
704 	int order;
705 
706 	KASSERT(npages == 0 ||
707 	    (VM_PAGE_TO_PHYS(m) &
708 	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
709 	    ("%s: page %p and npages %u are misaligned",
710 	    __func__, m, npages));
711 	while (npages > 0) {
712 		KASSERT(m->order == VM_NFREEORDER,
713 		    ("%s: page %p has unexpected order %d",
714 		    __func__, m, m->order));
715 		order = fls(npages) - 1;
716 		KASSERT(order < VM_NFREEORDER,
717 		    ("%s: order %d is out of range", __func__, order));
718 		m->pool = pool;
719 		vm_freelist_add(fl, m, order, tail);
720 		m += 1 << order;
721 		npages -= 1 << order;
722 	}
723 }
724 
725 /*
726  * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
727  * and sized set to the specified free list.
728  *
729  * When this function is called by a page allocation function, the caller
730  * should request insertion at the head unless the lower-order queues are
731  * known to be empty.  The objective is to reduce the likelihood of long-
732  * term fragmentation by promoting contemporaneous allocation and (hopefully)
733  * deallocation.
734  *
735  * If npages is zero, this function does nothing and ignores the physical page
736  * parameter m.  Otherwise, the physical page m's buddy must not be free.
737  */
738 static vm_page_t
739 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
740     int tail)
741 {
742 	int order;
743 
744 	KASSERT(npages == 0 ||
745 	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
746 	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
747 	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
748 	    m, npages));
749 	while (npages > 0) {
750 		KASSERT(m->order == VM_NFREEORDER,
751 		    ("vm_phys_enq_range: page %p has unexpected order %d",
752 		    m, m->order));
753 		order = ffs(npages) - 1;
754 		KASSERT(order < VM_NFREEORDER,
755 		    ("vm_phys_enq_range: order %d is out of range", order));
756 		m->pool = pool;
757 		vm_freelist_add(fl, m, order, tail);
758 		m += 1 << order;
759 		npages -= 1 << order;
760 	}
761 	return (m);
762 }
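
/*
 * Worked example contrasting the two helpers above: enqueueing npages = 11
 * (binary 1011) produces power-of-two blocks in opposite orders.
 * vm_phys_enq_beg() takes the largest block first (fls(npages) - 1), giving
 * blocks of 8, 2, and 1 pages from the start of a suitably aligned set,
 * while vm_phys_enq_range() takes the smallest block first (ffs(npages) - 1),
 * giving blocks of 1, 2, and 8 pages so that the range ends on a suitably
 * aligned boundary.
 */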
763 
764 /*
765  * Tries to allocate the specified number of pages from the specified pool
766  * within the specified domain.  Returns the actual number of allocated pages
767  * and a pointer to each page through the array ma[].
768  *
769  * The returned pages may not be physically contiguous.  However, in contrast
770  * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
771  * calling this function once to allocate the desired number of pages will
772  * avoid wasted time in vm_phys_split_pages().  Sets the pool field for
773  * every allocated page.
774  *
775  * The free page queues for the specified domain must be locked.
776  */
777 int
778 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
779 {
780 	struct vm_freelist *alt, *fl;
781 	vm_page_t m;
782 	int avail, end, flind, freelist, i, oind, pind;
783 
784 	KASSERT(domain >= 0 && domain < vm_ndomains,
785 	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
786 	KASSERT(pool < VM_NFREEPOOL,
787 	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
788 	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
789 	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
790 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
791 	i = 0;
792 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
793 		flind = vm_freelist_to_flind[freelist];
794 		if (flind < 0)
795 			continue;
796 		fl = vm_phys_free_queues[domain][flind][pool];
797 		for (oind = 0; oind < VM_NFREEORDER; oind++) {
798 			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
799 				vm_freelist_rem(fl, m, oind);
800 				avail = i + (1 << oind);
801 				end = imin(npages, avail);
802 				ma[i++] = m++;
803 				while (i < end) {
804 					m->pool = pool;
805 					ma[i++] = m++;
806 				}
807 				if (i == npages) {
808 					/*
809 					 * Return excess pages to fl.  Its order
810 					 * [0, oind) queues are empty.
811 					 */
812 					vm_phys_enq_range(m, avail - i, fl,
813 					    pool, 1);
814 					return (npages);
815 				}
816 			}
817 		}
818 		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
819 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
820 				alt = vm_phys_free_queues[domain][flind][pind];
821 				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
822 				    NULL) {
823 					vm_freelist_rem(alt, m, oind);
824 					avail = i + (1 << oind);
825 					end = imin(npages, avail);
826 					do {
827 						m->pool = pool;
828 						ma[i++] = m++;
829 					} while (i < end);
830 					if (i == npages) {
831 						/*
832 						 * Return excess pages to fl.
833 						 * Its order [0, oind) queues
834 						 * are empty.
835 						 */
836 						vm_phys_enq_range(m, avail - i,
837 						    fl, pool, 1);
838 						return (npages);
839 					}
840 				}
841 			}
842 		}
843 	}
844 	return (i);
845 }
846 
847 /*
848  * Allocate a contiguous, power of two-sized set of physical pages
849  * from the free lists.  Sets the pool field in the first page only.
850  *
851  * The free page queues must be locked.
852  */
853 vm_page_t
854 vm_phys_alloc_pages(int domain, int pool, int order)
855 {
856 	vm_page_t m;
857 	int freelist;
858 
859 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
860 		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
861 		if (m != NULL)
862 			return (m);
863 	}
864 	return (NULL);
865 }
866 
867 /*
868  * Allocate a contiguous, power of two-sized set of physical pages from the
869  * specified free list.  The free list must be specified using one of the
870  * manifest constants VM_FREELIST_*.  Sets the pool field in the first page
871  * only.
872  *
873  * The free page queues must be locked.
874  */
875 vm_page_t
876 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
877 {
878 	struct vm_freelist *alt, *fl;
879 	vm_page_t m;
880 	int oind, pind, flind;
881 
882 	KASSERT(domain >= 0 && domain < vm_ndomains,
883 	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
884 	    domain));
885 	KASSERT(freelist < VM_NFREELIST,
886 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
887 	    freelist));
888 	KASSERT(pool < VM_NFREEPOOL,
889 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
890 	KASSERT(order < VM_NFREEORDER,
891 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
892 
893 	flind = vm_freelist_to_flind[freelist];
894 	/* Check if freelist is present */
895 	if (flind < 0)
896 		return (NULL);
897 
898 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
899 	fl = &vm_phys_free_queues[domain][flind][pool][0];
900 	for (oind = order; oind < VM_NFREEORDER; oind++) {
901 		m = TAILQ_FIRST(&fl[oind].pl);
902 		if (m != NULL) {
903 			vm_freelist_rem(fl, m, oind);
904 			/* The order [order, oind) queues are empty. */
905 			vm_phys_split_pages(m, oind, fl, order, 1);
906 			return (m);
907 		}
908 	}
909 
910 	/*
911 	 * The given pool was empty.  Find the largest
912 	 * contiguous, power-of-two-sized set of pages in any
913 	 * pool.  Transfer these pages to the given pool, and
914 	 * use them to satisfy the allocation.
915 	 */
916 	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
917 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
918 			alt = &vm_phys_free_queues[domain][flind][pind][0];
919 			m = TAILQ_FIRST(&alt[oind].pl);
920 			if (m != NULL) {
921 				vm_freelist_rem(alt, m, oind);
922 				m->pool = pool;
923 				/* The order [order, oind) queues are empty. */
924 				vm_phys_split_pages(m, oind, fl, order, 1);
925 				return (m);
926 			}
927 		}
928 	}
929 	return (NULL);
930 }
931 
932 /*
933  * Find the vm_page corresponding to the given physical address.
934  */
935 vm_page_t
936 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
937 {
938 	struct vm_phys_seg *seg;
939 
940 	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
941 		return (&seg->first_page[atop(pa - seg->start)]);
942 	return (NULL);
943 }
944 
945 vm_page_t
946 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
947 {
948 	struct vm_phys_fictitious_seg tmp, *seg;
949 	vm_page_t m;
950 
951 	m = NULL;
952 	tmp.start = pa;
953 	tmp.end = 0;
954 
955 	rw_rlock(&vm_phys_fictitious_reg_lock);
956 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
957 	rw_runlock(&vm_phys_fictitious_reg_lock);
958 	if (seg == NULL)
959 		return (NULL);
960 
961 	m = &seg->first_page[atop(pa - seg->start)];
962 	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
963 
964 	return (m);
965 }
966 
967 static inline void
968 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
969     long page_count, vm_memattr_t memattr)
970 {
971 	long i;
972 
973 	bzero(range, page_count * sizeof(*range));
974 	for (i = 0; i < page_count; i++) {
975 		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
976 		range[i].oflags &= ~VPO_UNMANAGED;
977 		range[i].busy_lock = VPB_UNBUSIED;
978 	}
979 }
980 
981 int
982 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
983     vm_memattr_t memattr)
984 {
985 	struct vm_phys_fictitious_seg *seg;
986 	vm_page_t fp;
987 	long page_count;
988 #ifdef VM_PHYSSEG_DENSE
989 	long pi, pe;
990 	long dpage_count;
991 #endif
992 
993 	KASSERT(start < end,
994 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
995 	    (uintmax_t)start, (uintmax_t)end));
996 
997 	page_count = (end - start) / PAGE_SIZE;
998 
999 #ifdef VM_PHYSSEG_DENSE
1000 	pi = atop(start);
1001 	pe = atop(end);
1002 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1003 		fp = &vm_page_array[pi - first_page];
1004 		if ((pe - first_page) > vm_page_array_size) {
1005 			/*
1006 			 * We have a segment that starts inside
1007 			 * of vm_page_array, but ends outside of it.
1008 			 *
1009 			 * Use vm_page_array pages for those that are
1010 			 * inside of the vm_page_array range, and
1011 			 * allocate the remaining ones.
1012 			 */
1013 			dpage_count = vm_page_array_size - (pi - first_page);
1014 			vm_phys_fictitious_init_range(fp, start, dpage_count,
1015 			    memattr);
1016 			page_count -= dpage_count;
1017 			start += ptoa(dpage_count);
1018 			goto alloc;
1019 		}
1020 		/*
1021 		 * We can allocate the full range from vm_page_array,
1022 		 * so there's no need to register the range in the tree.
1023 		 */
1024 		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
1025 		return (0);
1026 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1027 		/*
1028 		 * We have a segment that ends inside of vm_page_array,
1029 		 * but starts outside of it.
1030 		 */
1031 		fp = &vm_page_array[0];
1032 		dpage_count = pe - first_page;
1033 		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
1034 		    memattr);
1035 		end -= ptoa(dpage_count);
1036 		page_count -= dpage_count;
1037 		goto alloc;
1038 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1039 		/*
1040 		 * Trying to register a fictitious range that expands before
1041 		 * and after vm_page_array.
1042 		 */
1043 		return (EINVAL);
1044 	} else {
1045 alloc:
1046 #endif
1047 		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
1048 		    M_WAITOK);
1049 #ifdef VM_PHYSSEG_DENSE
1050 	}
1051 #endif
1052 	vm_phys_fictitious_init_range(fp, start, page_count, memattr);
1053 
1054 	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
1055 	seg->start = start;
1056 	seg->end = end;
1057 	seg->first_page = fp;
1058 
1059 	rw_wlock(&vm_phys_fictitious_reg_lock);
1060 	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
1061 	rw_wunlock(&vm_phys_fictitious_reg_lock);
1062 
1063 	return (0);
1064 }
1065 
1066 void
1067 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
1068 {
1069 	struct vm_phys_fictitious_seg *seg, tmp;
1070 #ifdef VM_PHYSSEG_DENSE
1071 	long pi, pe;
1072 #endif
1073 
1074 	KASSERT(start < end,
1075 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
1076 	    (uintmax_t)start, (uintmax_t)end));
1077 
1078 #ifdef VM_PHYSSEG_DENSE
1079 	pi = atop(start);
1080 	pe = atop(end);
1081 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1082 		if ((pe - first_page) <= vm_page_array_size) {
1083 			/*
1084 			 * This segment was allocated using vm_page_array
1085 			 * only, there's nothing to do since those pages
1086 			 * were never added to the tree.
1087 			 */
1088 			return;
1089 		}
1090 		/*
1091 		 * We have a segment that starts inside
1092 		 * of vm_page_array, but ends outside of it.
1093 		 *
1094 		 * Calculate how many pages were added to the
1095 		 * tree and free them.
1096 		 */
1097 		start = ptoa(first_page + vm_page_array_size);
1098 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1099 		/*
1100 		 * We have a segment that ends inside of vm_page_array,
1101 		 * but starts outside of it.
1102 		 */
1103 		end = ptoa(first_page);
1104 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1105 		/* Since it's not possible to register such a range, panic. */
1106 		panic(
1107 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
1108 		    (uintmax_t)start, (uintmax_t)end);
1109 	}
1110 #endif
1111 	tmp.start = start;
1112 	tmp.end = 0;
1113 
1114 	rw_wlock(&vm_phys_fictitious_reg_lock);
1115 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1116 	if (seg == NULL || seg->start != start || seg->end != end) {
1117 		rw_wunlock(&vm_phys_fictitious_reg_lock);
1118 		panic(
1119 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
1120 		    (uintmax_t)start, (uintmax_t)end);
1121 	}
1122 	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
1123 	rw_wunlock(&vm_phys_fictitious_reg_lock);
1124 	free(seg->first_page, M_FICT_PAGES);
1125 	free(seg, M_FICT_PAGES);
1126 }
1127 
1128 /*
1129  * Free a contiguous, power of two-sized set of physical pages.  Assumes that
1130  * only the first page has a valid pool field.
1131  *
1132  * The free page queues must be locked.
1133  */
1134 void
1135 vm_phys_free_pages(vm_page_t m, int order)
1136 {
1137 	struct vm_freelist *fl;
1138 	struct vm_phys_seg *seg;
1139 	vm_paddr_t pa;
1140 	vm_page_t m_buddy;
1141 	int pool = m->pool;
1142 
1143 	KASSERT(m->order == VM_NFREEORDER,
1144 	    ("vm_phys_free_pages: page %p has unexpected order %d",
1145 	    m, m->order));
1146 	KASSERT(pool < VM_NFREEPOOL,
1147 	    ("vm_phys_free_pages: page %p has unexpected pool %d", m, pool));
1148 	KASSERT(order < VM_NFREEORDER,
1149 	    ("vm_phys_free_pages: order %d is out of range", order));
1150 	seg = &vm_phys_segs[m->segind];
1151 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1152 	if (order < VM_NFREEORDER - 1) {
1153 		vm_page_t m_start = m;
1154 		pa = VM_PAGE_TO_PHYS(m);
1155 		do {
1156 			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
1157 			if (pa < seg->start || pa >= seg->end)
1158 				break;
1159 			m_buddy = &seg->first_page[atop(pa - seg->start)];
1160 			if (m_buddy->order != order)
1161 				break;
1162 			fl = (*seg->free_queues)[m_buddy->pool];
1163 			vm_freelist_rem(fl, m_buddy, order);
1164 			m_buddy->pool = VM_NFREEPOOL;
1165 			order++;
1166 			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
1167 			m = &seg->first_page[atop(pa - seg->start)];
1168 		} while (order < VM_NFREEORDER - 1);
1169 		if (m != m_start) {
1170 			m_start->pool = VM_NFREEPOOL;
1171 			m->pool = pool;
1172 		}
1173 	}
1174 	fl = (*seg->free_queues)[pool];
1175 	vm_freelist_add(fl, m, order, 1);
1176 }
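
/*
 * Worked example of the buddy computation in vm_phys_free_pages() above,
 * assuming 4 KB pages (PAGE_SHIFT == 12): freeing the order 0 page at
 * 0x3000 flips bit 12 to find its buddy at 0x2000.  If that page is a free
 * order 0 page, the two merge into an order 1 block at 0x2000, whose buddy
 * (bit 13) is the order 1 block at 0x0000, and so on until a buddy is
 * missing from the segment, is not free, or the maximum order is reached.
 */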
1177 
1178 /*
1179  * Free a contiguous, arbitrarily sized set of physical pages, without merging
1180  * across set boundaries.  Assumes no pages have a valid pool field.
1181  *
1182  * The free page queues must be locked.
1183  */
1184 void
1185 vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages)
1186 {
1187 	struct vm_freelist *fl;
1188 	struct vm_phys_seg *seg;
1189 	vm_page_t m_end;
1190 	vm_paddr_t diff, lo;
1191 	int order;
1192 
1193 	/*
1194 	 * Avoid unnecessary coalescing by freeing the pages in the largest
1195 	 * possible power-of-two-sized subsets.
1196 	 */
1197 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1198 	seg = &vm_phys_segs[m->segind];
1199 	fl = (*seg->free_queues)[pool];
1200 	m_end = m + npages;
1201 	/* Free blocks of increasing size. */
1202 	lo = atop(VM_PAGE_TO_PHYS(m));
1203 	if (m < m_end &&
1204 	    (diff = lo ^ (lo + npages - 1)) != 0) {
1205 		order = min(flsll(diff) - 1, VM_NFREEORDER - 1);
1206 		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl,
1207 		    pool, 1);
1208 	}
1209 
1210 	/* Free blocks of maximum size. */
1211 	order = VM_NFREEORDER - 1;
1212 	while (m + (1 << order) <= m_end) {
1213 		KASSERT(seg == &vm_phys_segs[m->segind],
1214 		    ("%s: page range [%p,%p) spans multiple segments",
1215 		    __func__, m_end - npages, m));
1216 		m->pool = pool;
1217 		vm_freelist_add(fl, m, order, 1);
1218 		m += 1 << order;
1219 	}
1220 	/* Free blocks of diminishing size. */
1221 	vm_phys_enq_beg(m, m_end - m, fl, pool, 1);
1222 }
1223 
1224 /*
1225  * Free a contiguous, arbitrarily sized set of physical pages.
1226  * Assumes that every page has the same, valid, pool field value.
1227  *
1228  * The free page queues must be locked.
1229  */
1230 void
1231 vm_phys_free_contig(vm_page_t m, u_long npages)
1232 {
1233 	vm_paddr_t lo;
1234 	vm_page_t m_start, m_end;
1235 	unsigned max_order, order_start, order_end;
1236 	int pool = m->pool;
1237 
1238 	KASSERT(pool < VM_NFREEPOOL,
1239 	    ("%s: pool %d is out of range", __func__, pool));
1240 
1241 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1242 
1243 	lo = atop(VM_PAGE_TO_PHYS(m));
1244 	max_order = min(flsll(lo ^ (lo + npages)) - 1, VM_NFREEORDER - 1);
1245 	m_end = m + npages;
1246 	for (m_start = m; m < m_end; m++)
1247 		m->pool = VM_NFREEPOOL;
1248 	m = m_start;
1249 	order_start = ffsll(lo) - 1;
1250 	if (order_start < max_order)
1251 		m_start += 1 << order_start;
1252 	order_end = ffsll(lo + npages) - 1;
1253 	if (order_end < max_order)
1254 		m_end -= 1 << order_end;
1255 	/*
1256 	 * Avoid unnecessary coalescing by freeing the pages at the start and
1257 	 * end of the range last.
1258 	 */
1259 	if (m_start < m_end)
1260 		vm_phys_enqueue_contig(m_start, pool, m_end - m_start);
1261 	if (order_start < max_order) {
1262 		m->pool = pool;
1263 		vm_phys_free_pages(m, order_start);
1264 	}
1265 	if (order_end < max_order) {
1266 		m_end->pool = pool;
1267 		vm_phys_free_pages(m_end, order_end);
1268 	}
1269 }
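
/*
 * Worked example for vm_phys_free_contig() above: freeing the ten page
 * frames [3, 13) computes max_order = 3 (from flsll(3 ^ 13) - 1), so the
 * order 0 pages at frames 3 and 12 are split off as the head and tail, the
 * aligned frames [4, 12) are handed to vm_phys_enqueue_contig(), and the
 * head and tail are freed last through vm_phys_free_pages(), which may
 * coalesce them with free buddies outside the range.
 */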
1270 
1271 /*
1272  * Identify the first address range within segment segind or greater
1273  * that matches the domain, lies within the low/high range, and has
1274  * enough pages.  Return -1 if there is none.
1275  */
1276 int
1277 vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
1278     u_long npages, vm_paddr_t low, vm_paddr_t high)
1279 {
1280 	vm_paddr_t pa_end, pa_start;
1281 	struct vm_phys_seg *end_seg, *seg;
1282 
1283 	KASSERT(npages > 0, ("npages is zero"));
1284 	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
1285 	end_seg = &vm_phys_segs[vm_phys_nsegs];
1286 	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
1287 		if (seg->domain != domain)
1288 			continue;
1289 		if (seg->start >= high)
1290 			return (-1);
1291 		pa_start = MAX(low, seg->start);
1292 		pa_end = MIN(high, seg->end);
1293 		if (pa_end - pa_start < ptoa(npages))
1294 			continue;
1295 		bounds[0] = &seg->first_page[atop(pa_start - seg->start)];
1296 		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
1297 		return (seg - vm_phys_segs);
1298 	}
1299 	return (-1);
1300 }
1301 
1302 /*
1303  * Search for the given physical page "m" in the free lists.  If the search
1304  * succeeds, remove "m" from the free lists and return true.  Otherwise, return
1305  * false, indicating that "m" is not in the free lists.
1306  *
1307  * The free page queues must be locked.
1308  */
1309 bool
1310 vm_phys_unfree_page(vm_page_t m)
1311 {
1312 	struct vm_freelist *fl;
1313 	struct vm_phys_seg *seg;
1314 	vm_paddr_t pa, pa_half;
1315 	vm_page_t m_set, m_tmp;
1316 	int order, pool;
1317 
1318 	/*
1319 	 * First, find the contiguous, power of two-sized set of free
1320 	 * physical pages containing the given physical page "m" and
1321 	 * assign it to "m_set".
1322 	 */
1323 	seg = &vm_phys_segs[m->segind];
1324 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1325 	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1326 	    order < VM_NFREEORDER - 1; ) {
1327 		order++;
1328 		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
1329 		if (pa >= seg->start)
1330 			m_set = &seg->first_page[atop(pa - seg->start)];
1331 		else
1332 			return (false);
1333 	}
1334 	if (m_set->order < order)
1335 		return (false);
1336 	if (m_set->order == VM_NFREEORDER)
1337 		return (false);
1338 	KASSERT(m_set->order < VM_NFREEORDER,
1339 	    ("vm_phys_unfree_page: page %p has unexpected order %d",
1340 	    m_set, m_set->order));
1341 
1342 	/*
1343 	 * Next, remove "m_set" from the free lists.  Finally, extract
1344 	 * "m" from "m_set" using an iterative algorithm: While "m_set"
1345 	 * is larger than a page, shrink "m_set" by returning the half
1346 	 * of "m_set" that does not contain "m" to the free lists.
1347 	 */
1348 	pool = m_set->pool;
1349 	fl = (*seg->free_queues)[pool];
1350 	order = m_set->order;
1351 	vm_freelist_rem(fl, m_set, order);
1352 	while (order > 0) {
1353 		order--;
1354 		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
1355 		if (m->phys_addr < pa_half)
1356 			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
1357 		else {
1358 			m_tmp = m_set;
1359 			m_set = &seg->first_page[atop(pa_half - seg->start)];
1360 		}
1361 		m_tmp->pool = pool;
1362 		vm_freelist_add(fl, m_tmp, order, 0);
1363 	}
1364 	m_set->pool = pool;
1365 	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
1366 	return (true);
1367 }
1368 
1369 /*
1370  * Find a run of contiguous physical pages, meeting alignment requirements,
1371  * from a list of max-sized page blocks, where we need at least two
1372  * consecutive blocks to satisfy the (large) page request.
1373  */
1374 static vm_page_t
1375 vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
1376     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1377 {
1378 	struct vm_phys_seg *seg;
1379 	vm_page_t m, m_iter, m_ret;
1380 	vm_paddr_t max_size, size;
1381 	int max_order;
1382 
1383 	max_order = VM_NFREEORDER - 1;
1384 	size = npages << PAGE_SHIFT;
1385 	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
1386 	KASSERT(size > max_size, ("size is too small"));
1387 
1388 	/*
1389 	 * In order to avoid examining any free max-sized page block more than
1390 	 * twice, identify the ones that are first in a physically-contiguous
1391 	 * sequence of such blocks, and only for those walk the sequence to
1392 	 * check if there are enough free blocks starting at a properly aligned
1393 	 * block.  Thus, no block is checked for free-ness more than twice.
1394 	 */
1395 	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
1396 		/*
1397 		 * Skip m unless it is first in a sequence of free max page
1398 		 * blocks >= low in its segment.
1399 		 */
1400 		seg = &vm_phys_segs[m->segind];
1401 		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
1402 			continue;
1403 		if (VM_PAGE_TO_PHYS(m) >= max_size &&
1404 		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
1405 		    max_order == m[-1 << max_order].order)
1406 			continue;
1407 
1408 		/*
1409 		 * Advance m_ret from m to the first of the sequence, if any,
1410 		 * that satisfies alignment conditions and might leave enough
1411 		 * space.
1412 		 */
1413 		m_ret = m;
1414 		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
1415 		    size, alignment, boundary) &&
1416 		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
1417 		    max_order == m_ret[1 << max_order].order)
1418 			m_ret += 1 << max_order;
1419 
1420 		/*
1421 		 * Skip m unless some block m_ret in the sequence is properly
1422 		 * aligned, and begins a sequence of enough pages less than
1423 		 * high, and in the same segment.
1424 		 */
1425 		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
1426 			continue;
1427 
1428 		/*
1429 		 * Skip m unless the blocks to allocate starting at m_ret are
1430 		 * all free.
1431 		 */
1432 		for (m_iter = m_ret;
1433 		    m_iter < m_ret + npages && max_order == m_iter->order;
1434 		    m_iter += 1 << max_order) {
1435 		}
1436 		if (m_iter < m_ret + npages)
1437 			continue;
1438 		return (m_ret);
1439 	}
1440 	return (NULL);
1441 }
1442 
1443 /*
1444  * Find a run of contiguous physical pages from the specified free list
1445  * table.
1446  */
1447 static vm_page_t
1448 vm_phys_find_queues_contig(
1449     struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
1450     u_long npages, vm_paddr_t low, vm_paddr_t high,
1451     u_long alignment, vm_paddr_t boundary)
1452 {
1453 	struct vm_freelist *fl;
1454 	vm_page_t m_ret;
1455 	vm_paddr_t pa, pa_end, size;
1456 	int oind, order, pind;
1457 
1458 	KASSERT(npages > 0, ("npages is 0"));
1459 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1460 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1461 	/* Compute the queue that is the best fit for npages. */
1462 	order = flsl(npages - 1);
1463 	/* Search for a large enough free block. */
1464 	size = npages << PAGE_SHIFT;
1465 	for (oind = order; oind < VM_NFREEORDER; oind++) {
1466 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1467 			fl = (*queues)[pind];
1468 			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
1469 				/*
1470 				 * Determine if the address range starting at pa
1471 				 * is within the given range, satisfies the
1472 				 * given alignment, and does not cross the given
1473 				 * boundary.
1474 				 */
1475 				pa = VM_PAGE_TO_PHYS(m_ret);
1476 				pa_end = pa + size;
1477 				if (low <= pa && pa_end <= high &&
1478 				    vm_addr_ok(pa, size, alignment, boundary))
1479 					return (m_ret);
1480 			}
1481 		}
1482 	}
1483 	if (order < VM_NFREEORDER)
1484 		return (NULL);
1485 	/* Search for a long-enough sequence of max-order blocks. */
1486 	for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1487 		fl = (*queues)[pind];
1488 		m_ret = vm_phys_find_freelist_contig(fl, npages,
1489 		    low, high, alignment, boundary);
1490 		if (m_ret != NULL)
1491 			return (m_ret);
1492 	}
1493 	return (NULL);
1494 }
1495 
1496 /*
1497  * Allocate a contiguous set of physical pages of the given size
1498  * "npages" from the free lists.  All of the physical pages must be at
1499  * or above the given physical address "low" and below the given
1500  * physical address "high".  The given value "alignment" determines the
1501  * alignment of the first physical page in the set.  If the given value
1502  * "boundary" is non-zero, then the set of physical pages cannot cross
1503  * any physical address boundary that is a multiple of that value.  Both
1504  * "alignment" and "boundary" must be a power of two.  Sets the pool
1505  * field in every allocated page.
1506  */
1507 vm_page_t
1508 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
1509     u_long alignment, vm_paddr_t boundary)
1510 {
1511 	vm_paddr_t pa_end, pa_start;
1512 	struct vm_freelist *fl;
1513 	vm_page_t m, m_run;
1514 	struct vm_phys_seg *seg;
1515 	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
1516 	int oind, segind;
1517 
1518 	KASSERT(npages > 0, ("npages is 0"));
1519 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1520 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1521 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
1522 	if (low >= high)
1523 		return (NULL);
1524 	queues = NULL;
1525 	m_run = NULL;
1526 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1527 		seg = &vm_phys_segs[segind];
1528 		if (seg->start >= high || seg->domain != domain)
1529 			continue;
1530 		if (low >= seg->end)
1531 			break;
1532 		if (low <= seg->start)
1533 			pa_start = seg->start;
1534 		else
1535 			pa_start = low;
1536 		if (high < seg->end)
1537 			pa_end = high;
1538 		else
1539 			pa_end = seg->end;
1540 		if (pa_end - pa_start < ptoa(npages))
1541 			continue;
1542 		/*
1543 		 * If a previous segment led to a search using the same
1544 		 * free lists as this segment would, then we have
1545 		 * effectively already searched within this segment
1546 		 * too.  Skip it.
1547 		 */
1548 		if (seg->free_queues == queues)
1549 			continue;
1550 		queues = seg->free_queues;
1551 		m_run = vm_phys_find_queues_contig(queues, npages,
1552 		    low, high, alignment, boundary);
1553 		if (m_run != NULL)
1554 			break;
1555 	}
1556 	if (m_run == NULL)
1557 		return (NULL);
1558 
1559 	/* Allocate pages from the page-range found. */
1560 	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
1561 		fl = (*queues)[m->pool];
1562 		oind = m->order;
1563 		vm_freelist_rem(fl, m, oind);
1564 	}
1565 	/* Return excess pages to the free lists. */
1566 	fl = (*queues)[VM_FREEPOOL_DEFAULT];
1567 	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl,
1568 	    VM_FREEPOOL_DEFAULT, 0);
1569 
1570 	/* Set the pool field in every allocated page. */
1571 	for (m = m_run; m < &m_run[npages]; m++)
1572 		m->pool = VM_FREEPOOL_DEFAULT;
1573 
1574 	pa_start = VM_PAGE_TO_PHYS(m_run);
1575 	KASSERT(low <= pa_start,
1576 	    ("memory allocated below minimum requested range"));
1577 	KASSERT(pa_start + ptoa(npages) <= high,
1578 	    ("memory allocated above maximum requested range"));
1579 	seg = &vm_phys_segs[m_run->segind];
1580 	KASSERT(seg->domain == domain,
1581 	    ("memory not allocated from specified domain"));
1582 	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
1583 	    ("memory alignment/boundary constraints not satisfied"));
1584 	return (m_run);
1585 }
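
/*
 * Minimal usage sketch (callers normally reach this through
 * vm_page_alloc_contig()): allocating 16 MB of physically contiguous
 * memory below 4 GB from domain 0, assuming the vm_domain_free_lock() and
 * vm_domain_free_unlock() helpers from vm_pagequeue.h:
 *
 *	vm_domain_free_lock(VM_DOMAIN(0));
 *	m = vm_phys_alloc_contig(0, atop(16 * 1024 * 1024), 0,
 *	    (vm_paddr_t)1 << 32, PAGE_SIZE, 0);
 *	vm_domain_free_unlock(VM_DOMAIN(0));
 */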
1586 
1587 /*
1588  * Return the index of the first unused slot, which may be the terminating
1589  * entry.
1590  */
1591 static int
1592 vm_phys_avail_count(void)
1593 {
1594 	int i;
1595 
1596 	for (i = 0; phys_avail[i + 1]; i += 2)
1597 		continue;
1598 	if (i > PHYS_AVAIL_ENTRIES)
1599 		panic("Improperly terminated phys_avail %d entries", i);
1600 
1601 	return (i);
1602 }
1603 
1604 /*
1605  * Assert that a phys_avail entry is valid.
1606  */
1607 static void
1608 vm_phys_avail_check(int i)
1609 {
1610 	if (phys_avail[i] & PAGE_MASK)
1611 		panic("Unaligned phys_avail[%d]: %#jx", i,
1612 		    (uintmax_t)phys_avail[i]);
1613 	if (phys_avail[i + 1] & PAGE_MASK)
1614 		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
1615 		    (uintmax_t)phys_avail[i + 1]);
1616 	if (phys_avail[i + 1] < phys_avail[i])
1617 		panic("phys_avail[%d] start %#jx > end %#jx", i,
1618 		    (uintmax_t)phys_avail[i], (uintmax_t)phys_avail[i + 1]);
1619 }
1620 
1621 /*
1622  * Return the index of an overlapping phys_avail entry, or -1 if there is none.
1623  */
1624 #ifdef NUMA
1625 static int
1626 vm_phys_avail_find(vm_paddr_t pa)
1627 {
1628 	int i;
1629 
1630 	for (i = 0; phys_avail[i + 1]; i += 2)
1631 		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
1632 			return (i);
1633 	return (-1);
1634 }
1635 #endif
1636 
1637 /*
1638  * Return the index of the largest entry.
1639  */
1640 int
1641 vm_phys_avail_largest(void)
1642 {
1643 	vm_paddr_t sz, largesz;
1644 	int largest;
1645 	int i;
1646 
1647 	largest = 0;
1648 	largesz = 0;
1649 	for (i = 0; phys_avail[i + 1]; i += 2) {
1650 		sz = vm_phys_avail_size(i);
1651 		if (sz > largesz) {
1652 			largesz = sz;
1653 			largest = i;
1654 		}
1655 	}
1656 
1657 	return (largest);
1658 }
1659 
1660 vm_paddr_t
1661 vm_phys_avail_size(int i)
1662 {
1663 
1664 	return (phys_avail[i + 1] - phys_avail[i]);
1665 }
1666 
1667 /*
1668  * Split an entry at the address 'pa'.  Return zero on success or errno.
1669  */
1670 static int
1671 vm_phys_avail_split(vm_paddr_t pa, int i)
1672 {
1673 	int cnt;
1674 
1675 	vm_phys_avail_check(i);
1676 	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
1677 		panic("vm_phys_avail_split: invalid address");
1678 	cnt = vm_phys_avail_count();
1679 	if (cnt >= PHYS_AVAIL_ENTRIES)
1680 		return (ENOSPC);
1681 	memmove(&phys_avail[i + 2], &phys_avail[i],
1682 	    (cnt - i) * sizeof(phys_avail[0]));
1683 	phys_avail[i + 1] = pa;
1684 	phys_avail[i + 2] = pa;
1685 	vm_phys_avail_check(i);
1686 	vm_phys_avail_check(i+2);
1687 
1688 	return (0);
1689 }
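
/*
 * Worked example (hypothetical addresses): splitting the entry
 * { 0x100000, 0x400000 } at pa = 0x200000 shifts the subsequent entries up
 * by one pair and leaves
 *
 *	..., 0x100000, 0x200000, 0x200000, 0x400000, ...
 *
 * so the two halves can then be adjusted independently.
 */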
1690 
1691 /*
1692  * Check if a given physical address can be included as part of a crash dump.
1693  */
1694 bool
1695 vm_phys_is_dumpable(vm_paddr_t pa)
1696 {
1697 	vm_page_t m;
1698 	int i;
1699 
1700 	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
1701 		return ((m->flags & PG_NODUMP) == 0);
1702 
1703 	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
1704 		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
1705 			return (true);
1706 	}
1707 	return (false);
1708 }
1709 
1710 void
1711 vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
1712 {
1713 	struct vm_phys_seg *seg;
1714 
1715 	if (vm_phys_early_nsegs == -1)
1716 		panic("%s: called after initialization", __func__);
1717 	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
1718 		panic("%s: ran out of early segments", __func__);
1719 
1720 	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
1721 	seg->start = start;
1722 	seg->end = end;
1723 }
1724 
1725 /*
1726  * This routine allocates NUMA node specific memory before the page
1727  * allocator is bootstrapped.
1728  */
1729 vm_paddr_t
1730 vm_phys_early_alloc(int domain, size_t alloc_size)
1731 {
1732 #ifdef NUMA
1733 	int mem_index;
1734 #endif
1735 	int i, biggestone;
1736 	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;
1737 
1738 	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
1739 	    ("%s: invalid domain index %d", __func__, domain));
1740 
1741 	/*
1742 	 * Search the mem_affinity array for the biggest address
1743 	 * range in the desired domain.  This is used to constrain
1744 	 * the phys_avail selection below.
1745 	 */
1746 	biggestsize = 0;
1747 	mem_start = 0;
1748 	mem_end = -1;
1749 #ifdef NUMA
1750 	mem_index = 0;
1751 	if (mem_affinity != NULL) {
1752 		for (i = 0;; i++) {
1753 			size = mem_affinity[i].end - mem_affinity[i].start;
1754 			if (size == 0)
1755 				break;
1756 			if (domain != -1 && mem_affinity[i].domain != domain)
1757 				continue;
1758 			if (size > biggestsize) {
1759 				mem_index = i;
1760 				biggestsize = size;
1761 			}
1762 		}
1763 		mem_start = mem_affinity[mem_index].start;
1764 		mem_end = mem_affinity[mem_index].end;
1765 	}
1766 #endif
1767 
1768 	/*
1769 	 * Now find the biggest physical segment within the desired
1770 	 * NUMA domain.
1771 	 */
1772 	biggestsize = 0;
1773 	biggestone = 0;
1774 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1775 		/* skip regions that are out of range */
1776 		if (phys_avail[i+1] - alloc_size < mem_start ||
1777 		    phys_avail[i+1] > mem_end)
1778 			continue;
1779 		size = vm_phys_avail_size(i);
1780 		if (size > biggestsize) {
1781 			biggestone = i;
1782 			biggestsize = size;
1783 		}
1784 	}
1785 	alloc_size = round_page(alloc_size);
1786 
1787 	/*
1788 	 * Grab single pages from the front to reduce fragmentation.
1789 	 */
1790 	if (alloc_size == PAGE_SIZE) {
1791 		pa = phys_avail[biggestone];
1792 		phys_avail[biggestone] += PAGE_SIZE;
1793 		vm_phys_avail_check(biggestone);
1794 		return (pa);
1795 	}
1796 
1797 	/*
1798 	 * Naturally align large allocations.
1799 	 */
1800 	align = phys_avail[biggestone + 1] & (alloc_size - 1);
1801 	if (alloc_size + align > biggestsize)
1802 		panic("cannot find a large enough region");
1803 	if (align != 0 &&
1804 	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
1805 	    biggestone) != 0)
1806 		/* The split failed; waste the unaligned tail. */
1807 		phys_avail[biggestone + 1] -= align;
1808 
1809 	phys_avail[biggestone + 1] -= alloc_size;
1810 	vm_phys_avail_check(biggestone);
1811 	pa = phys_avail[biggestone + 1];
1812 	return (pa);
1813 }
1814 
1815 void
1816 vm_phys_early_startup(void)
1817 {
1818 	struct vm_phys_seg *seg;
1819 	int i;
1820 
1821 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1822 		phys_avail[i] = round_page(phys_avail[i]);
1823 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
1824 	}
1825 
1826 	for (i = 0; i < vm_phys_early_nsegs; i++) {
1827 		seg = &vm_phys_early_segs[i];
1828 		vm_phys_add_seg(seg->start, seg->end);
1829 	}
1830 	vm_phys_early_nsegs = -1;
1831 
1832 #ifdef NUMA
1833 	/* Force phys_avail to be split by domain. */
1834 	if (mem_affinity != NULL) {
1835 		int idx;
1836 
1837 		for (i = 0; mem_affinity[i].end != 0; i++) {
1838 			idx = vm_phys_avail_find(mem_affinity[i].start);
1839 			if (idx != -1 &&
1840 			    phys_avail[idx] != mem_affinity[i].start)
1841 				vm_phys_avail_split(mem_affinity[i].start, idx);
1842 			idx = vm_phys_avail_find(mem_affinity[i].end);
1843 			if (idx != -1 &&
1844 			    phys_avail[idx] != mem_affinity[i].end)
1845 				vm_phys_avail_split(mem_affinity[i].end, idx);
1846 		}
1847 	}
1848 #endif
1849 }
1850 
1851 #ifdef DDB
1852 /*
1853  * Show the number of physical pages in each of the free lists.
1854  */
1855 DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
1856 {
1857 	struct vm_freelist *fl;
1858 	int flind, oind, pind, dom;
1859 
1860 	for (dom = 0; dom < vm_ndomains; dom++) {
1861 		db_printf("DOMAIN: %d\n", dom);
1862 		for (flind = 0; flind < vm_nfreelists; flind++) {
1863 			db_printf("FREE LIST %d:\n"
1864 			    "\n  ORDER (SIZE)  |  NUMBER"
1865 			    "\n              ", flind);
1866 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
1867 				db_printf("  |  POOL %d", pind);
1868 			db_printf("\n--            ");
1869 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
1870 				db_printf("-- --      ");
1871 			db_printf("--\n");
1872 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
1873 				db_printf("  %2.2d (%6.6dK)", oind,
1874 				    1 << (PAGE_SHIFT - 10 + oind));
1875 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1876 				fl = vm_phys_free_queues[dom][flind][pind];
1877 					db_printf("  |  %6.6d", fl[oind].lcnt);
1878 				}
1879 				db_printf("\n");
1880 			}
1881 			db_printf("\n");
1882 		}
1883 		db_printf("\n");
1884 	}
1885 }
1886 #endif
1887