xref: /freebsd/sys/vm/vm_phys.c (revision 9b37d84c87e69dabc69d818aa4d2fea718bd8b74)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/tslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;

static int numa_disabled;
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NUMA options");
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
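
/*
 * Illustrative only: on a hypothetical machine with usable memory at
 * [4 KB, 640 KB) and [16 MB, 2 GB + 16 MB), phys_avail[] would hold
 *
 *	{ 0x1000, 0xa0000, 0x1000000, 0x81000000, 0, 0 },
 *
 * i.e., [start, end) pairs terminated by a pair of zeroes.  The actual
 * contents are machine-dependent.
 */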

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
static int __read_mostly vm_default_freepool;

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int pool, int tail);

static bool __diagused
vm_phys_pool_valid(int pool)
{
#ifdef VM_FREEPOOL_LAZYINIT
	if (pool == VM_FREEPOOL_LAZYINIT)
		return (false);
#endif
	return (pool >= 0 && pool < VM_NFREEPOOL);
}

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}
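
/*
 * Note that a lookup is encoded as a degenerate segment with end == 0, so
 * that this comparison doubles as a "which registered range contains this
 * address?" search.  For example, vm_phys_fictitious_to_vm_page() below
 * finds the segment containing a physical address pa with
 *
 *	tmp.start = pa;
 *	tmp.end = 0;
 *	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 */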

int
vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
    vm_paddr_t high __numa_used)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match:  Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}
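
/*
 * Worked example (hypothetical layout): if domain 0 covers [0, 4G) and
 * domain 1 covers [4G, 8G), then a call with low = 2G and high = 6G sets
 * both bits in the mask.  With prefer == 1, 1 is returned; with
 * prefer == -1, the lowest matching domain, 0, is returned.
 */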

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int pool,
    int tail)
{

	m->order = order;
	m->pool = pool;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	if (!(0 <= domain && domain < vm_ndomains))
		panic("%s: Invalid domain %d ('vm_ndomains' is %d)",
		    __func__, domain, vm_ndomains);
	if (vm_phys_nsegs >= VM_PHYSSEG_MAX)
		panic("Not enough storage for physical segments, "
		    "increase VM_PHYSSEG_MAX");

	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && seg[-1].start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
	if (seg != vm_phys_segs && seg[-1].end > start)
		panic("Overlapping physical segments: Current [%#jx,%#jx) "
		    "at index %zu, previous [%#jx,%#jx)",
		    (uintmax_t)start, (uintmax_t)end, seg - vm_phys_segs,
		    (uintmax_t)seg[-1].start, (uintmax_t)seg[-1].end);
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	if ((start & PAGE_MASK) != 0)
		panic("%s: start (%jx) is not page aligned", __func__,
		    (uintmax_t)start);
	if ((end & PAGE_MASK) != 0)
		panic("%s: end (%jx) is not page aligned", __func__,
		    (uintmax_t)end);
	if (start > end)
		panic("%s: start (%jx) > end (%jx)!", __func__,
		    (uintmax_t)start, (uintmax_t)end);

	if (start == end)
		return;

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef	VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef	VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}
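
/*
 * For example, on a platform that defines both boundaries (say, a 16 MB
 * VM_LOWMEM_BOUNDARY together with the 4 GB VM_DMA32_BOUNDARY), a call
 * with [1 MB, 5 GB) is recorded as the three segments [1 MB, 16 MB),
 * [16 MB, 4 GB), and [4 GB, 5 GB), so that no segment spans a free list
 * boundary.
 */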

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef	VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef	VM_FREELIST_DMA32
		if (
#ifdef	VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef	VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;
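
	/*
	 * Illustrative trace, assuming the amd64-style numbering
	 * DEFAULT = 0, DMA32 = 1, LOWMEM = 2 with all three lists
	 * present: the flags {1, 1, 1} become running totals {1, 2, 3}
	 * and finally flinds {0, 1, 2}.  A leading run of absent lists
	 * instead decrements to -1, which callers treat as "no such
	 * free list".
	 */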

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef	VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

#ifdef VM_FREEPOOL_LAZYINIT
	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
#else
	vm_default_freepool = VM_FREEPOOL_DEFAULT;
#endif

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains __numa_used,
    struct mem_affinity *affinity __numa_used, int *locality __numa_used)
{
#ifdef NUMA
	int i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
	if (numa_disabled)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#endif
}
/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective is to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int pool, int tail)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, pool, tail);
	}
}
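
/*
 * Worked example: satisfying an order 1 request from an order 4 block
 * frees the upper buddies of orders 3, 2, and 1, i.e., m[8], m[4], and
 * m[2], leaving [m, m + 2) for the caller.
 */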

static void
vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int pool,
    int tail)
{
	KASSERT(order >= 0 && order < VM_NFREEORDER,
	    ("%s: invalid order %d", __func__, order));

	vm_freelist_add(fl, m, order, pool, tail);
#ifdef VM_FREEPOOL_LAZYINIT
	if (__predict_false(pool == VM_FREEPOOL_LAZYINIT)) {
		vm_page_t m_next;
		vm_paddr_t pa;
		int npages;

		npages = 1 << order;
		m_next = m + npages;
		pa = m->phys_addr + ptoa(npages);
		if (pa < vm_phys_segs[m->segind].end) {
			vm_page_init_page(m_next, pa, m->segind,
			    VM_FREEPOOL_LAZYINIT);
		}
	}
#endif
}
/*
 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
 * aligned and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static void
vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
    int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    (VM_PAGE_TO_PHYS(m) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("%s: page %p and npages %u are misaligned",
	    __func__, m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("%s: page %p has unexpected order %d",
		    __func__, m, m->order));
		order = ilog2(npages);
		KASSERT(order < VM_NFREEORDER,
		    ("%s: order %d is out of range", __func__, order));
		vm_phys_enq_chunk(fl, m, order, pool, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
}
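
/*
 * For example, with npages = 11 and m suitably aligned, the chunks
 * decrease in size from the front: an order 3 chunk at m, an order 1
 * chunk at m[8], and an order 0 page at m[10].
 */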

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * If npages is zero, this function does nothing and ignores the physical page
 * parameter m.  Otherwise, the physical page m's buddy must not be free.
 */
static vm_page_t
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
    int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		vm_phys_enq_chunk(fl, m, order, pool, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
	return (m);
}
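
/*
 * For example, with npages = 11 and m + npages suitably aligned, the
 * chunks increase in size toward that boundary: an order 0 page at m,
 * an order 1 chunk at m[1], and an order 3 chunk at m[3].  Compare
 * vm_phys_enq_beg(), which works from the front.
 */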

/*
 * Complete the initialization of a contiguous, power of two-sized set of
 * physical pages.
 *
 * If the pages currently belong to the lazy init pool, then the corresponding
 * page structures must be initialized.  In this case it is assumed that the
 * first page in the run has already been initialized.
 */
static void
vm_phys_finish_init(vm_page_t m, int order)
{
#ifdef VM_FREEPOOL_LAZYINIT
	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
		vm_paddr_t pa;
		int segind;

		TSENTER();
		pa = m->phys_addr + PAGE_SIZE;
		segind = m->segind;
		for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
		    m_tmp++, pa += PAGE_SIZE)
			vm_page_init_page(m_tmp, pa, segind, VM_NFREEPOOL);
		TSEXIT();
	}
#endif
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().  The allocated pages have no
 * valid pool field set.
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(vm_phys_pool_valid(pool),
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = i + (1 << oind);
				end = imin(npages, avail);
				while (i < end)
					ma[i++] = m++;
				if (i == npages) {
					/*
					 * Return excess pages to fl.  Its order
					 * [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - i, fl,
					    pool, 1);
					return (npages);
				}
			}
		}
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
			    pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					vm_phys_finish_init(m, oind);
					avail = i + (1 << oind);
					end = imin(npages, avail);
					while (i < end)
						ma[i++] = m++;
					if (i == npages) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail - i,
						    fl, pool, 1);
						return (npages);
					}
				}
			}
		}
	}
	return (i);
}
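
/*
 * A minimal usage sketch (hypothetical caller; real callers also account
 * for the pages with the per-domain free page counts):
 *
 *	vm_page_t ma[8];
 *	int got;
 *
 *	vm_domain_free_lock(VM_DOMAIN(0));
 *	got = vm_phys_alloc_npages(0, VM_FREEPOOL_DEFAULT, 8, ma);
 *	vm_domain_free_unlock(VM_DOMAIN(0));
 *
 * The caller must be prepared for got < 8 on shortage.
 */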

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
static vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind, flind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(vm_phys_pool_valid(pool),
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	flind = vm_freelist_to_flind[freelist];
	/* Check if freelist is present */
	if (flind < 0)
		return (NULL);

	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			/* The order [order, oind) queues are empty. */
			vm_phys_split_pages(m, oind, fl, order, pool, 1);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_finish_init(m, oind);
				/* The order [order, oind) queues are empty. */
				vm_phys_split_pages(m, oind, fl, order, pool,
				    1);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int freelist;

	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address, which must lie
 * within the given physical memory segment.
 */
vm_page_t
vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
{
	KASSERT(pa >= seg->start && pa < seg->end,
	    ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));

	return (&seg->first_page[atop(pa - seg->start)]);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;

	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
		return (vm_phys_seg_paddr_to_vm_page(seg, pa));
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}
/*
 * Free a contiguous, power of two-sized set of physical pages, returning
 * them to the specified pool.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int pool, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("%s: page %p has unexpected order %d",
	    __func__, m, m->order));
	KASSERT(vm_phys_pool_valid(pool),
	    ("%s: unexpected pool param %d", __func__, pool));
	KASSERT(order < VM_NFREEORDER,
	    ("%s: order %d is out of range", __func__, order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			vm_phys_finish_init(m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = vm_phys_seg_paddr_to_vm_page(seg, pa);
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[pool];
	vm_freelist_add(fl, m, order, pool, 1);
}
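
/*
 * Coalescing example (4 KB pages): freeing the order 0 page at 0x3000
 * computes its buddy address 0x3000 ^ 0x1000 = 0x2000.  If that page is
 * free at order 0, the pair merges into an order 1 block at 0x2000, and
 * the search repeats one order higher until a buddy is missing or the
 * maximum order is reached.
 */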

#ifdef VM_FREEPOOL_LAZYINIT
/*
 * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
 * them to the default pool.  This is a prerequisite for some rare operations
 * which need to scan the page array and thus depend on all pages being
 * initialized.
 */
static void
vm_phys_lazy_init_domain(int domain, bool locked)
{
	static bool initdone[MAXMEMDOM];
	struct vm_domain *vmd;
	struct vm_freelist *fl;
	vm_page_t m;
	int pind;
	bool unlocked;

	if (__predict_true(atomic_load_bool(&initdone[domain])))
		return;

	vmd = VM_DOMAIN(domain);
	if (locked)
		vm_domain_free_assert_locked(vmd);
	else
		vm_domain_free_lock(vmd);
	if (atomic_load_bool(&initdone[domain]))
		goto out;
	pind = VM_FREEPOOL_LAZYINIT;
	for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
		int flind;

		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pind];
		for (int oind = 0; oind < VM_NFREEORDER; oind++) {
			if (atomic_load_int(&fl[oind].lcnt) == 0)
				continue;
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				/*
				 * Avoid holding the lock across the
				 * initialization unless there's a free page
				 * shortage.
				 */
				vm_freelist_rem(fl, m, oind);
				unlocked = vm_domain_allocate(vmd,
				    VM_ALLOC_NORMAL, 1 << oind);
				if (unlocked)
					vm_domain_free_unlock(vmd);
				vm_phys_finish_init(m, oind);
				if (unlocked) {
					vm_domain_freecnt_inc(vmd, 1 << oind);
					vm_domain_free_lock(vmd);
				}
				vm_phys_free_pages(m, VM_FREEPOOL_DEFAULT,
				    oind);
			}
		}
	}
	atomic_store_bool(&initdone[domain], true);
out:
	if (!locked)
		vm_domain_free_unlock(vmd);
}

static void
vm_phys_lazy_init(void)
{
	for (int domain = 0; domain < vm_ndomains; domain++)
		vm_phys_lazy_init_domain(domain, false);
	atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
}

static void
vm_phys_lazy_init_kthr(void *arg __unused)
{
	vm_phys_lazy_init();
	kthread_exit();
}

static void
vm_phys_lazy_sysinit(void *arg __unused)
{
	struct thread *td;
	int error;

	error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
	    RFSTOPPED, 0, "vmlazyinit");
	if (error == 0) {
		thread_lock(td);
		sched_prio(td, PRI_MIN_IDLE);
		sched_add(td, SRQ_BORING);
	} else {
		printf("%s: could not create lazy init thread: %d\n",
		    __func__, error);
		vm_phys_lazy_init();
	}
}
SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
    NULL);
#endif /* VM_FREEPOOL_LAZYINIT */

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.  Assumes no pages have a valid pool field.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_page_t m_end;
	vm_paddr_t diff, lo;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
	seg = &vm_phys_segs[m->segind];
	fl = (*seg->free_queues)[pool];
	m_end = m + npages;
	/* Free blocks of increasing size. */
	lo = atop(VM_PAGE_TO_PHYS(m));
	if (m < m_end &&
	    (diff = lo ^ (lo + npages - 1)) != 0) {
		order = min(ilog2(diff), VM_NFREEORDER - 1);
		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl,
		    pool, 1);
	}

	/* Free blocks of maximum size. */
	order = VM_NFREEORDER - 1;
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_phys_enq_chunk(fl, m, order, pool, 1);
		m += 1 << order;
	}
	/* Free blocks of diminishing size. */
	vm_phys_enq_beg(m, m_end - m, fl, pool, 1);
}
/*
 * Free a contiguous, arbitrarily sized set of physical pages, returning
 * them to the specified pool.  Assumes that the pages have no valid pool
 * field set.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, int pool, u_long npages)
{
	vm_paddr_t lo;
	vm_page_t m_start, m_end;
	unsigned max_order, order_start, order_end;

	vm_domain_free_assert_locked(vm_pagequeue_domain(m));

	lo = atop(VM_PAGE_TO_PHYS(m));
	max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);

	m_start = m;
	order_start = ffsll(lo) - 1;
	if (order_start < max_order)
		m_start += 1 << order_start;
	m_end = m + npages;
	order_end = ffsll(lo + npages) - 1;
	if (order_end < max_order)
		m_end -= 1 << order_end;
	/*
	 * Avoid unnecessary coalescing by freeing the pages at the start and
	 * end of the range last.
	 */
	if (m_start < m_end)
		vm_phys_enqueue_contig(m_start, pool, m_end - m_start);
	if (order_start < max_order)
		vm_phys_free_pages(m, pool, order_start);
	if (order_end < max_order)
		vm_phys_free_pages(m_end, pool, order_end);
}
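
/*
 * Worked example: freeing 6 pages starting at page frame 3 gives
 * max_order = ilog2(3 ^ 9) = 3, order_start = ffsll(3) - 1 = 0, and
 * order_end = ffsll(9) - 1 = 0, so the interior frames [4, 8) are
 * enqueued first and the order 0 pages at frames 3 and 8 are freed
 * last, minimizing coalescing work under the queue lock.
 */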

/*
 * Identify the first address range within segment segind or greater
 * that matches the domain, lies within the low/high range, and has
 * enough pages.  Return -1 if there is none.
 */
int
vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
    u_long npages, vm_paddr_t low, vm_paddr_t high)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_phys_seg *end_seg, *seg;

	KASSERT(npages > 0, ("npages is zero"));
	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
		if (seg->domain != domain)
			continue;
		if (seg->start >= high)
			return (-1);
		pa_start = MAX(low, seg->start);
		pa_end = MIN(high, seg->end);
		if (pa_end - pa_start < ptoa(npages))
			continue;
#ifdef VM_FREEPOOL_LAZYINIT
		/*
		 * The pages on the free lists must be initialized.
		 */
		vm_phys_lazy_init_domain(domain, false);
#endif
		bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
		return (seg - vm_phys_segs);
	}
	return (-1);
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return true.  Otherwise, return
 * false, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
bool
vm_phys_unfree_page(vm_paddr_t pa)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa_half;
	vm_page_t m, m_set, m_tmp;
	int order, pool;

	seg = vm_phys_paddr_to_seg(pa);
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));

#ifdef VM_FREEPOOL_LAZYINIT
	/*
	 * The pages on the free lists must be initialized.
	 */
	vm_phys_lazy_init_domain(seg->domain, true);
#endif

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	m = vm_phys_paddr_to_vm_page(pa);
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
		else
			return (false);
	}
	if (m_set->order < order)
		return (false);
	if (m_set->order == VM_NFREEORDER)
		return (false);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	pool = m_set->pool;
	fl = (*seg->free_queues)[pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
		else {
			m_tmp = m_set;
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
		}
		vm_freelist_add(fl, m_tmp, order, pool, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (true);
}
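
/*
 * Extraction example: to unfree a page m lying inside a free order 2
 * block, the whole block is removed and then halved repeatedly; at each
 * step the half that does not contain m (found by comparing m->phys_addr
 * with pa_half) is returned to the free lists, here at orders 1 and 0,
 * leaving only m allocated.
 */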
1558 
1559 /*
1560  * Find a run of contiguous physical pages, meeting alignment requirements, from
1561  * a list of max-sized page blocks, where we need at least two consecutive
1562  * blocks to satisfy the (large) page request.
1563  */
1564 static vm_page_t
1565 vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
1566     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1567 {
1568 	struct vm_phys_seg *seg;
1569 	vm_page_t m, m_iter, m_ret;
1570 	vm_paddr_t max_size, size;
1571 	int max_order;
1572 
1573 	max_order = VM_NFREEORDER - 1;
1574 	size = npages << PAGE_SHIFT;
1575 	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
1576 	KASSERT(size > max_size, ("size is too small"));
1577 
1578 	/*
1579 	 * In order to avoid examining any free max-sized page block more than
1580 	 * twice, identify the ones that are first in a physically-contiguous
1581 	 * sequence of such blocks, and only for those walk the sequence to
1582 	 * check if there are enough free blocks starting at a properly aligned
1583 	 * block.  Thus, no block is checked for free-ness more than twice.
1584 	 */
1585 	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
1586 		/*
1587 		 * Skip m unless it is first in a sequence of free max page
1588 		 * blocks >= low in its segment.
1589 		 */
1590 		seg = &vm_phys_segs[m->segind];
1591 		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
1592 			continue;
1593 		if (VM_PAGE_TO_PHYS(m) >= max_size &&
1594 		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
1595 		    max_order == m[-1 << max_order].order)
1596 			continue;
1597 
1598 		/*
1599 		 * Advance m_ret from m to the first of the sequence, if any,
1600 		 * that satisfies alignment conditions and might leave enough
1601 		 * space.
1602 		 */
1603 		m_ret = m;
1604 		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
1605 		    size, alignment, boundary) &&
1606 		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
1607 		    max_order == m_ret[1 << max_order].order)
1608 			m_ret += 1 << max_order;
1609 
1610 		/*
1611 		 * Skip m unless some block m_ret in the sequence is properly
1612 		 * aligned, and begins a sequence of enough pages less than
1613 		 * high, and in the same segment.
1614 		 */
1615 		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
1616 			continue;
1617 
1618 		/*
1619 		 * Skip m unless the blocks to allocate starting at m_ret are
1620 		 * all free.
1621 		 */
1622 		for (m_iter = m_ret;
1623 		    m_iter < m_ret + npages && max_order == m_iter->order;
1624 		    m_iter += 1 << max_order) {
1625 		}
1626 		if (m_iter < m_ret + npages)
1627 			continue;
1628 		return (m_ret);
1629 	}
1630 	return (NULL);
1631 }
1632 
1633 /*
1634  * Find a run of contiguous physical pages from the specified free list
1635  * table.
1636  */
1637 static vm_page_t
1638 vm_phys_find_queues_contig(
1639     struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
1640     u_long npages, vm_paddr_t low, vm_paddr_t high,
1641     u_long alignment, vm_paddr_t boundary)
1642 {
1643 	struct vm_freelist *fl;
1644 	vm_page_t m_ret;
1645 	vm_paddr_t pa, pa_end, size;
1646 	int oind, order, pind;
1647 
1648 	KASSERT(npages > 0, ("npages is 0"));
1649 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1650 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1651 	/* Compute the queue that is the best fit for npages. */
1652 	order = flsl(npages - 1);
1653 	/* Search for a large enough free block. */
1654 	size = npages << PAGE_SHIFT;
1655 	for (oind = order; oind < VM_NFREEORDER; oind++) {
1656 		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1657 			fl = (*queues)[pind];
1658 			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
1659 				/*
1660 				 * Determine if the address range starting at pa
1661 				 * is within the given range, satisfies the
1662 				 * given alignment, and does not cross the given
1663 				 * boundary.
1664 				 */
1665 				pa = VM_PAGE_TO_PHYS(m_ret);
1666 				pa_end = pa + size;
1667 				if (low <= pa && pa_end <= high &&
1668 				    vm_addr_ok(pa, size, alignment, boundary))
1669 					return (m_ret);
1670 			}
1671 		}
1672 	}
1673 	if (order < VM_NFREEORDER)
1674 		return (NULL);
1675 	/* Search for a long-enough sequence of max-order blocks. */
1676 	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1677 		fl = (*queues)[pind];
1678 		m_ret = vm_phys_find_freelist_contig(fl, npages,
1679 		    low, high, alignment, boundary);
1680 		if (m_ret != NULL)
1681 			return (m_ret);
1682 	}
1683 	return (NULL);
1684 }
1685 
1686 /*
1687  * Allocate a contiguous set of physical pages of the given size
1688  * "npages" from the free lists.  All of the physical pages must be at
1689  * or above the given physical address "low" and below the given
1690  * physical address "high".  The given value "alignment" determines the
1691  * alignment of the first physical page in the set.  If the given value
1692  * "boundary" is non-zero, then the set of physical pages cannot cross
1693  * any physical address boundary that is a multiple of that value.  Both
1694  * "alignment" and "boundary" must be a power of two.  Sets the pool
1695  * field to DEFAULT in the first allocated page.
1696  */
1697 vm_page_t
1698 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
1699     u_long alignment, vm_paddr_t boundary)
1700 {
1701 	vm_paddr_t pa_end, pa_start;
1702 	struct vm_freelist *fl;
1703 	vm_page_t m, m_run;
1704 	struct vm_phys_seg *seg;
1705 	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
1706 	int oind, segind;
1707 
1708 	KASSERT(npages > 0, ("npages is 0"));
1709 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1710 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1711 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
1712 	if (low >= high)
1713 		return (NULL);
1714 	queues = NULL;
1715 	m_run = NULL;
1716 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1717 		seg = &vm_phys_segs[segind];
1718 		if (seg->start >= high || seg->domain != domain)
1719 			continue;
1720 		if (low >= seg->end)
1721 			break;
1722 		if (low <= seg->start)
1723 			pa_start = seg->start;
1724 		else
1725 			pa_start = low;
1726 		if (high < seg->end)
1727 			pa_end = high;
1728 		else
1729 			pa_end = seg->end;
1730 		if (pa_end - pa_start < ptoa(npages))
1731 			continue;
1732 		/*
1733 		 * If a previous segment led to a search using
1734 		 * the same free lists as would this segment, then
1735 		 * we've actually already searched within this
1736 		 * segment too.  So skip it.
1737 		 */
1738 		if (seg->free_queues == queues)
1739 			continue;
1740 		queues = seg->free_queues;
1741 		m_run = vm_phys_find_queues_contig(queues, npages,
1742 		    low, high, alignment, boundary);
1743 		if (m_run != NULL)
1744 			break;
1745 	}
1746 	if (m_run == NULL)
1747 		return (NULL);
1748 
1749 	/* Allocate pages from the page range found. */
1750 	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
1751 		fl = (*queues)[m->pool];
1752 		oind = m->order;
1753 		vm_freelist_rem(fl, m, oind);
1754 		vm_phys_finish_init(m, oind);
1755 	}
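	/*
	 * The loop above removes whole buddy blocks, so "m" may have
	 * advanced past the end of the requested run; the difference
	 * "m - &m_run[npages]" is the number of surplus pages.
	 */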
1756 	/* Return excess pages to the free lists. */
1757 	fl = (*queues)[VM_FREEPOOL_DEFAULT];
1758 	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl,
1759 	    VM_FREEPOOL_DEFAULT, 0);
1760 
1761 	/* Verify that the run satisfies the request before returning it. */
1762 	pa_start = VM_PAGE_TO_PHYS(m_run);
1763 	KASSERT(low <= pa_start,
1764 	    ("memory allocated below minimum requested range"));
1765 	KASSERT(pa_start + ptoa(npages) <= high,
1766 	    ("memory allocated above maximum requested range"));
1767 	seg = &vm_phys_segs[m_run->segind];
1768 	KASSERT(seg->domain == domain,
1769 	    ("memory not allocated from specified domain"));
1770 	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
1771 	    ("memory alignment/boundary constraints not satisfied"));
1772 	return (m_run);
1773 }
1774 
1775 /*
1776  * Return the index of the first unused slot, which may be the terminating
1777  * entry.
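 *
 * For illustration (hypothetical values): phys_avail[] holds (start, end)
 * pairs and is terminated by a pair of zero entries, e.g.,
 *
 *	{ 0x1000, 0x9f000, 0x100000, 0x7fe00000, 0, 0 }
 *
 * describes two usable ranges, for which this function returns 4.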
1778  */
1779 static int
1780 vm_phys_avail_count(void)
1781 {
1782 	int i;
1783 
1784 	for (i = 0; i < PHYS_AVAIL_COUNT; i += 2)
1785 		if (phys_avail[i] == 0 && phys_avail[i + 1] == 0)
1786 			return (i);
1787 	panic("Improperly terminated phys_avail[]");
1788 }
1789 
1790 /*
1791  * Assert that a phys_avail entry is valid.
1792  */
1793 static void
1794 vm_phys_avail_check(int i)
1795 {
1796 	if (i % 2 != 0)
1797 		panic("Chunk start index %d is not even.", i);
1798 	if (phys_avail[i] & PAGE_MASK)
1799 		panic("Unaligned phys_avail[%d]: %#jx", i,
1800 		    (uintmax_t)phys_avail[i]);
1801 	if (phys_avail[i + 1] & PAGE_MASK)
1802 		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
1803 		    (uintmax_t)phys_avail[i + 1]);
1804 	if (phys_avail[i + 1] < phys_avail[i])
1805 		panic("phys_avail[%d]: start %#jx > end %#jx", i,
1806 		    (uintmax_t)phys_avail[i], (uintmax_t)phys_avail[i + 1]);
1807 }
1808 
1809 /*
1810  * Return the index of the phys_avail[] entry containing 'pa', or -1.
1811  */
1812 #ifdef NUMA
1813 static int
1814 vm_phys_avail_find(vm_paddr_t pa)
1815 {
1816 	int i;
1817 
1818 	for (i = 0; phys_avail[i + 1]; i += 2)
1819 		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
1820 			return (i);
1821 	return (-1);
1822 }
1823 #endif
1824 
1825 /*
1826  * Return the index of the largest phys_avail[] entry.
1827  */
1828 int
1829 vm_phys_avail_largest(void)
1830 {
1831 	vm_paddr_t sz, largesz;
1832 	int largest;
1833 	int i;
1834 
1835 	largest = 0;
1836 	largesz = 0;
1837 	for (i = 0; phys_avail[i + 1]; i += 2) {
1838 		sz = vm_phys_avail_size(i);
1839 		if (sz > largesz) {
1840 			largesz = sz;
1841 			largest = i;
1842 		}
1843 	}
1844 
1845 	return (largest);
1846 }
1847 
1848 vm_paddr_t
1849 vm_phys_avail_size(int i)
1850 {
1851 
1852 	return (phys_avail[i + 1] - phys_avail[i]);
1853 }
1854 
1855 /*
1856  * Split a chunk in phys_avail[] at the address 'pa'.
1857  *
1858  * 'pa' must be within a chunk (slots i and i + 1) or one of its boundaries.
1859  * Returns zero on an actual split, in which case the two new chunks occupy
1860  * slots i to i + 3; EJUSTRETURN if 'pa' was one of the boundaries, so no
1861  * split actually occurred; or ENOSPC if there are not enough slots in
1862  * phys_avail[] to represent the additional chunk caused by the split.
1863  */
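/*
 * Sketch (hypothetical values): if slots i and i + 1 hold the chunk
 * [0x100000, 0x500000) and 'pa' is 0x300000, the entries from slot i
 * onward are shifted up by two, leaving [0x100000, 0x300000) in slots
 * i and i + 1 and [0x300000, 0x500000) in slots i + 2 and i + 3.
 */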
1864 static int
1865 vm_phys_avail_split(vm_paddr_t pa, int i)
1866 {
1867 	int cnt;
1868 
1869 	vm_phys_avail_check(i);
1870 	if (pa < phys_avail[i] || pa > phys_avail[i + 1])
1871 		panic("%s: Address %#jx not in range at slot %d [%#jx;%#jx].",
1872 		    __func__, (uintmax_t)pa, i,
1873 		    (uintmax_t)phys_avail[i], (uintmax_t)phys_avail[i + 1]);
1874 	if (pa == phys_avail[i] || pa == phys_avail[i + 1])
1875 		return (EJUSTRETURN);
1876 	cnt = vm_phys_avail_count();
1877 	if (cnt >= PHYS_AVAIL_ENTRIES)
1878 		return (ENOSPC);
1879 	memmove(&phys_avail[i + 2], &phys_avail[i],
1880 	    (cnt - i) * sizeof(phys_avail[0]));
1881 	phys_avail[i + 1] = pa;
1882 	phys_avail[i + 2] = pa;
1883 	vm_phys_avail_check(i);
1884 	vm_phys_avail_check(i + 2);
1885 
1886 	return (0);
1887 }
1888 
1889 /*
1890  * Check if a given physical address can be included as part of a crash dump.
1891  */
1892 bool
1893 vm_phys_is_dumpable(vm_paddr_t pa)
1894 {
1895 	vm_page_t m;
1896 	int i;
1897 
1898 	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
1899 		return ((m->flags & PG_NODUMP) == 0);
1900 
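	/*
	 * No vm_page covers this address, so it is not managed by the
	 * page allocator; fall back to checking the dump_avail ranges.
	 */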
1901 	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
1902 		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
1903 			return (true);
1904 	}
1905 	return (false);
1906 }
1907 
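/*
 * Record a physical memory segment for later registration by
 * vm_phys_early_startup(), which afterwards sets vm_phys_early_nsegs
 * to -1 to reject further calls.
 */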
1908 void
1909 vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
1910 {
1911 	struct vm_phys_seg *seg;
1912 
1913 	if (vm_phys_early_nsegs == -1)
1914 		panic("%s: called after initialization", __func__);
1915 	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
1916 		panic("%s: ran out of early segments", __func__);
1917 
1918 	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
1919 	seg->start = start;
1920 	seg->end = end;
1921 }
1922 
1923 /*
1924  * This routine allocates NUMA node-specific memory before the page
1925  * allocator is bootstrapped.
1926  */
1927 vm_paddr_t
1928 vm_phys_early_alloc(int domain, size_t alloc_size)
1929 {
1930 #ifdef NUMA
1931 	int mem_index;
1932 #endif
1933 	int i, biggestone;
1934 	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;
1935 
1936 	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
1937 	    ("%s: invalid domain index %d", __func__, domain));
1938 
1939 	/*
1940 	 * Search the mem_affinity array for the biggest address
1941 	 * range in the desired domain.  This is used to constrain
1942 	 * the phys_avail selection below.
1943 	 */
1944 	biggestsize = 0;
1945 	mem_start = 0;
1946 	mem_end = -1;
1947 #ifdef NUMA
1948 	mem_index = 0;
1949 	if (mem_affinity != NULL) {
1950 		for (i = 0;; i++) {
1951 			size = mem_affinity[i].end - mem_affinity[i].start;
1952 			if (size == 0)
1953 				break;
1954 			if (domain != -1 && mem_affinity[i].domain != domain)
1955 				continue;
1956 			if (size > biggestsize) {
1957 				mem_index = i;
1958 				biggestsize = size;
1959 			}
1960 		}
1961 		mem_start = mem_affinity[mem_index].start;
1962 		mem_end = mem_affinity[mem_index].end;
1963 	}
1964 #endif
1965 
1966 	/*
1967 	 * Now find the biggest physical segment within the desired
1968 	 * NUMA domain.
1969 	 */
1970 	biggestsize = 0;
1971 	biggestone = 0;
1972 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1973 		/* Skip regions that are out of range. */
1974 		if (phys_avail[i + 1] - alloc_size < mem_start ||
1975 		    phys_avail[i + 1] > mem_end)
1976 			continue;
1977 		size = vm_phys_avail_size(i);
1978 		if (size > biggestsize) {
1979 			biggestone = i;
1980 			biggestsize = size;
1981 		}
1982 	}
1983 	alloc_size = round_page(alloc_size);
1984 
1985 	/*
1986 	 * Grab single pages from the front to reduce fragmentation.
1987 	 */
1988 	if (alloc_size == PAGE_SIZE) {
1989 		pa = phys_avail[biggestone];
1990 		phys_avail[biggestone] += PAGE_SIZE;
1991 		vm_phys_avail_check(biggestone);
1992 		return (pa);
1993 	}
1994 
1995 	/*
1996 	 * Naturally align large allocations.
1997 	 */
1998 	align = phys_avail[biggestone + 1] & (alloc_size - 1);
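	/*
	 * Note: this mask computation assumes that alloc_size is a power
	 * of two.  "align" is then the number of bytes by which the
	 * chunk's end overshoots a multiple of alloc_size; trimming it
	 * leaves the end, and hence the address returned below, aligned.
	 */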
1999 	if (alloc_size + align > biggestsize)
2000 		panic("cannot find a large enough size");
2001 	if (align != 0 &&
2002 	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
2003 	    biggestone) != 0)
2004 		/* The split failed for lack of slots; waste the unaligned tail. */
2005 		phys_avail[biggestone + 1] -= align;
2006 
2007 	phys_avail[biggestone + 1] -= alloc_size;
2008 	vm_phys_avail_check(biggestone);
2009 	pa = phys_avail[biggestone + 1];
2010 	return (pa);
2011 }
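
/*
 * Example (hypothetical): carve out a per-domain boot-time structure,
 * such as a page array, before the page allocator is bootstrapped:
 *
 *	pa = vm_phys_early_alloc(domain, npages * sizeof(struct vm_page));
 */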
2012 
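/*
 * Page-align the phys_avail[] ranges, register the segments recorded by
 * vm_phys_early_add_seg(), and, in NUMA kernels, split phys_avail[]
 * chunks at domain boundaries.
 */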
2013 void
2014 vm_phys_early_startup(void)
2015 {
2016 	struct vm_phys_seg *seg;
2017 	int i;
2018 
2019 	if (phys_avail[1] == 0)
2020 		panic("phys_avail[] is empty");
2021 
2022 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
2023 		phys_avail[i] = round_page(phys_avail[i]);
2024 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
2025 	}
2026 
2027 	for (i = 0; i < vm_phys_early_nsegs; i++) {
2028 		seg = &vm_phys_early_segs[i];
2029 		vm_phys_add_seg(seg->start, seg->end);
2030 	}
2031 	vm_phys_early_nsegs = -1;
2032 
2033 #ifdef NUMA
2034 	/* Split phys_avail[] at domain boundaries so no chunk spans domains. */
2035 	if (mem_affinity != NULL) {
2036 		int idx;
2037 
2038 		for (i = 0; mem_affinity[i].end != 0; i++) {
2039 			idx = vm_phys_avail_find(mem_affinity[i].start);
2040 			if (idx != -1)
2041 				vm_phys_avail_split(mem_affinity[i].start, idx);
2042 			idx = vm_phys_avail_find(mem_affinity[i].end);
2043 			if (idx != -1)
2044 				vm_phys_avail_split(mem_affinity[i].end, idx);
2045 		}
2046 	}
2047 #endif
2048 }
2049 
2050 #ifdef DDB
2051 /*
2052  * Show the number of physical pages in each of the free lists.
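 *
 * Invoked from the DDB prompt as "show freepages".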
2053  */
2054 DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
2055 {
2056 	struct vm_freelist *fl;
2057 	int flind, oind, pind, dom;
2058 
2059 	for (dom = 0; dom < vm_ndomains; dom++) {
2060 		db_printf("DOMAIN: %d\n", dom);
2061 		for (flind = 0; flind < vm_nfreelists; flind++) {
2062 			db_printf("FREE LIST %d:\n"
2063 			    "\n  ORDER (SIZE)  |  NUMBER"
2064 			    "\n              ", flind);
2065 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
2066 				db_printf("  |  POOL %d", pind);
2067 			db_printf("\n--            ");
2068 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
2069 				db_printf("-- --      ");
2070 			db_printf("--\n");
2071 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
2072 				db_printf("  %2.2d (%6.6dK)", oind,
2073 				    1 << (PAGE_SHIFT - 10 + oind));
2074 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
2075 					fl = vm_phys_free_queues[dom][flind][pind];
2076 					db_printf("  |  %6.6d", fl[oind].lcnt);
2077 				}
2078 				db_printf("\n");
2079 			}
2080 			db_printf("\n");
2081 		}
2082 		db_printf("\n");
2083 	}
2084 }
2085 #endif
2086