xref: /freebsd/sys/vm/vm_phys.c (revision 7a7741af18d6c8a804cc643cb7ecda9d730c6aa6)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2002-2006 Rice University
5  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
6  * All rights reserved.
7  *
8  * This software was developed for the FreeBSD Project by Alan L. Cox,
9  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
27  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
30  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  *	Physical memory system implementation
36  *
37  * Any external functions defined by this module are only to be used by the
38  * virtual memory system.
39  */
40 
41 #include <sys/cdefs.h>
42 #include "opt_ddb.h"
43 #include "opt_vm.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/domainset.h>
48 #include <sys/lock.h>
49 #include <sys/kernel.h>
50 #include <sys/kthread.h>
51 #include <sys/malloc.h>
52 #include <sys/mutex.h>
53 #include <sys/proc.h>
54 #include <sys/queue.h>
55 #include <sys/rwlock.h>
56 #include <sys/sbuf.h>
57 #include <sys/sched.h>
58 #include <sys/sysctl.h>
59 #include <sys/tree.h>
60 #include <sys/tslog.h>
61 #include <sys/unistd.h>
62 #include <sys/vmmeter.h>
63 
64 #include <ddb/ddb.h>
65 
66 #include <vm/vm.h>
67 #include <vm/vm_extern.h>
68 #include <vm/vm_param.h>
69 #include <vm/vm_kern.h>
70 #include <vm/vm_object.h>
71 #include <vm/vm_page.h>
72 #include <vm/vm_phys.h>
73 #include <vm/vm_pagequeue.h>
74 
/*
 * Compile-time sanity checks: a long must provide at least one bit per
 * possible physical segment, and a vm_paddr_t must fit in a long long so
 * that ffsll()/flsll() can be applied to physical addresses.
 */
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
/*
 * NUMA description installed by vm_phys_register_domains().  Readers walk
 * mem_affinity until they reach an entry whose end is zero, and
 * vm_phys_mem_affinity() indexes mem_locality as
 * [from * vm_ndomains + to].  Both remain NULL unless more than one domain
 * is registered.
 */
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;

/* Loaded from the "vm.numa.disabled" tunable by vm_phys_register_domains(). */
static int numa_disabled;
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NUMA options");
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
#endif

/* Number of memory domains; stays at 1 unless more are registered. */
int __read_mostly vm_ndomains = 1;
/* Set of all domains; the initializer has only bit 0 (domain 0) set. */
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

/*
 * Table of physical memory segments and the number of entries in use.
 * _vm_phys_create_seg() inserts new entries in address order.
 */
struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
/* NOTE(review): not referenced in the part of the file reviewed here. */
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;
98 
/*
 * Fictitious memory ranges are tracked in a red-black tree ordered by
 * vm_phys_fictitious_cmp().
 */
struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

/*
 * One registered fictitious range.  The comparison helpers treat the range
 * as [start, end), and lookups index first_page by the page offset of the
 * address from start.
 */
struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

/* Held (shared) around lookups in vm_phys_fictitious_tree. */
static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

/*
 * The free page queues, indexed by [domain][free list index][pool][order].
 */
static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

/* Number of free lists actually in use; computed by vm_phys_init(). */
static int __read_mostly vm_nfreelists;
125 
/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 * The table is filled in by vm_phys_init(); the allocators treat a negative
 * entry as "this free list is not present".
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
/*
 * First pool examined when an allocation has to take pages from another
 * pool; set by vm_phys_init().
 */
static int __read_mostly vm_default_freepool;

/*
 * NOTE(review): presumably required by the running-total scheme in
 * vm_phys_init(), which lets higher-numbered free lists fall back to the
 * index of a lower-numbered one -- confirm.
 */
CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
/* Physical address 4G: the upper bound of the DMA32 free list. */
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif
163 
/*
 * Read-only string sysctls under the "vm" node.  Each one is backed by a
 * handler that formats its report into an sbuf.
 */

/* vm.phys_free: free queue counters per domain, free list, pool and order. */
static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

/* vm.phys_segs: the table of physical memory segments. */
static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
/* vm.phys_locality: the domain-to-domain locality matrix. */
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

/* vm.ndomains: exports vm_ndomains. */
SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

/* Forward declarations of helpers that are defined later in this file. */
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);
191 
192 static bool __diagused
193 vm_phys_pool_valid(int pool)
194 {
195 #ifdef VM_FREEPOOL_LAZYINIT
196 	if (pool == VM_FREEPOOL_LAZYINIT)
197 		return (false);
198 #endif
199 	return (pool >= 0 && pool < VM_NFREEPOOL);
200 }
201 
202 /*
203  * Red-black tree helpers for vm fictitious range management.
204  */
205 static inline int
206 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
207     struct vm_phys_fictitious_seg *range)
208 {
209 
210 	KASSERT(range->start != 0 && range->end != 0,
211 	    ("Invalid range passed on search for vm_fictitious page"));
212 	if (p->start >= range->end)
213 		return (1);
214 	if (p->start < range->start)
215 		return (-1);
216 
217 	return (0);
218 }
219 
220 static int
221 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
222     struct vm_phys_fictitious_seg *p2)
223 {
224 
225 	/* Check if this is a search for a page */
226 	if (p1->end == 0)
227 		return (vm_phys_fictitious_in_range(p1, p2));
228 
229 	KASSERT(p2->end != 0,
230     ("Invalid range passed as second parameter to vm fictitious comparison"));
231 
232 	/* Searching to add a new range */
233 	if (p1->end <= p2->start)
234 		return (-1);
235 	if (p1->start >= p2->end)
236 		return (1);
237 
238 	panic("Trying to add overlapping vm fictitious ranges:\n"
239 	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
240 	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
241 }
242 
243 int
244 vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
245     vm_paddr_t high __numa_used)
246 {
247 #ifdef NUMA
248 	domainset_t mask;
249 	int i;
250 
251 	if (vm_ndomains == 1 || mem_affinity == NULL)
252 		return (0);
253 
254 	DOMAINSET_ZERO(&mask);
255 	/*
256 	 * Check for any memory that overlaps low, high.
257 	 */
258 	for (i = 0; mem_affinity[i].end != 0; i++)
259 		if (mem_affinity[i].start <= high &&
260 		    mem_affinity[i].end >= low)
261 			DOMAINSET_SET(mem_affinity[i].domain, &mask);
262 	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
263 		return (prefer);
264 	if (DOMAINSET_EMPTY(&mask))
265 		panic("vm_phys_domain_match:  Impossible constraint");
266 	return (DOMAINSET_FFS(&mask) - 1);
267 #else
268 	return (0);
269 #endif
270 }
271 
272 /*
273  * Outputs the state of the physical memory allocator, specifically,
274  * the amount of physical memory in each free list.
275  */
276 static int
277 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
278 {
279 	struct sbuf sbuf;
280 	struct vm_freelist *fl;
281 	int dom, error, flind, oind, pind;
282 
283 	error = sysctl_wire_old_buffer(req, 0);
284 	if (error != 0)
285 		return (error);
286 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
287 	for (dom = 0; dom < vm_ndomains; dom++) {
288 		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
289 		for (flind = 0; flind < vm_nfreelists; flind++) {
290 			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
291 			    "\n  ORDER (SIZE)  |  NUMBER"
292 			    "\n              ", flind);
293 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
294 				sbuf_printf(&sbuf, "  |  POOL %d", pind);
295 			sbuf_printf(&sbuf, "\n--            ");
296 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
297 				sbuf_printf(&sbuf, "-- --      ");
298 			sbuf_printf(&sbuf, "--\n");
299 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
300 				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
301 				    1 << (PAGE_SHIFT - 10 + oind));
302 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
303 				fl = vm_phys_free_queues[dom][flind][pind];
304 					sbuf_printf(&sbuf, "  |  %6d",
305 					    fl[oind].lcnt);
306 				}
307 				sbuf_printf(&sbuf, "\n");
308 			}
309 		}
310 	}
311 	error = sbuf_finish(&sbuf);
312 	sbuf_delete(&sbuf);
313 	return (error);
314 }
315 
316 /*
317  * Outputs the set of physical memory segments.
318  */
319 static int
320 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
321 {
322 	struct sbuf sbuf;
323 	struct vm_phys_seg *seg;
324 	int error, segind;
325 
326 	error = sysctl_wire_old_buffer(req, 0);
327 	if (error != 0)
328 		return (error);
329 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
330 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
331 		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
332 		seg = &vm_phys_segs[segind];
333 		sbuf_printf(&sbuf, "start:     %#jx\n",
334 		    (uintmax_t)seg->start);
335 		sbuf_printf(&sbuf, "end:       %#jx\n",
336 		    (uintmax_t)seg->end);
337 		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
338 		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
339 	}
340 	error = sbuf_finish(&sbuf);
341 	sbuf_delete(&sbuf);
342 	return (error);
343 }
344 
345 /*
346  * Return affinity, or -1 if there's no affinity information.
347  */
348 int
349 vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
350 {
351 
352 #ifdef NUMA
353 	if (mem_locality == NULL)
354 		return (-1);
355 	if (f >= vm_ndomains || t >= vm_ndomains)
356 		return (-1);
357 	return (mem_locality[f * vm_ndomains + t]);
358 #else
359 	return (-1);
360 #endif
361 }
362 
363 #ifdef NUMA
364 /*
365  * Outputs the VM locality table.
366  */
367 static int
368 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
369 {
370 	struct sbuf sbuf;
371 	int error, i, j;
372 
373 	error = sysctl_wire_old_buffer(req, 0);
374 	if (error != 0)
375 		return (error);
376 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
377 
378 	sbuf_printf(&sbuf, "\n");
379 
380 	for (i = 0; i < vm_ndomains; i++) {
381 		sbuf_printf(&sbuf, "%d: ", i);
382 		for (j = 0; j < vm_ndomains; j++) {
383 			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
384 		}
385 		sbuf_printf(&sbuf, "\n");
386 	}
387 	error = sbuf_finish(&sbuf);
388 	sbuf_delete(&sbuf);
389 	return (error);
390 }
391 #endif
392 
393 static void
394 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
395 {
396 
397 	m->order = order;
398 	if (tail)
399 		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
400 	else
401 		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
402 	fl[order].lcnt++;
403 }
404 
405 static void
406 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
407 {
408 
409 	TAILQ_REMOVE(&fl[order].pl, m, listq);
410 	fl[order].lcnt--;
411 	m->order = VM_NFREEORDER;
412 }
413 
414 /*
415  * Create a physical memory segment.
416  */
417 static void
418 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
419 {
420 	struct vm_phys_seg *seg;
421 
422 	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
423 	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
424 	KASSERT(domain >= 0 && domain < vm_ndomains,
425 	    ("vm_phys_create_seg: invalid domain provided"));
426 	seg = &vm_phys_segs[vm_phys_nsegs++];
427 	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
428 		*seg = *(seg - 1);
429 		seg--;
430 	}
431 	seg->start = start;
432 	seg->end = end;
433 	seg->domain = domain;
434 }
435 
436 static void
437 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
438 {
439 #ifdef NUMA
440 	int i;
441 
442 	if (mem_affinity == NULL) {
443 		_vm_phys_create_seg(start, end, 0);
444 		return;
445 	}
446 
447 	for (i = 0;; i++) {
448 		if (mem_affinity[i].end == 0)
449 			panic("Reached end of affinity info");
450 		if (mem_affinity[i].end <= start)
451 			continue;
452 		if (mem_affinity[i].start > start)
453 			panic("No affinity info for start %jx",
454 			    (uintmax_t)start);
455 		if (mem_affinity[i].end >= end) {
456 			_vm_phys_create_seg(start, end,
457 			    mem_affinity[i].domain);
458 			break;
459 		}
460 		_vm_phys_create_seg(start, mem_affinity[i].end,
461 		    mem_affinity[i].domain);
462 		start = mem_affinity[i].end;
463 	}
464 #else
465 	_vm_phys_create_seg(start, end, 0);
466 #endif
467 }
468 
469 /*
470  * Add a physical memory segment.
471  */
472 void
473 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
474 {
475 	vm_paddr_t paddr;
476 
477 	KASSERT((start & PAGE_MASK) == 0,
478 	    ("vm_phys_define_seg: start is not page aligned"));
479 	KASSERT((end & PAGE_MASK) == 0,
480 	    ("vm_phys_define_seg: end is not page aligned"));
481 
482 	/*
483 	 * Split the physical memory segment if it spans two or more free
484 	 * list boundaries.
485 	 */
486 	paddr = start;
487 #ifdef	VM_FREELIST_LOWMEM
488 	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
489 		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
490 		paddr = VM_LOWMEM_BOUNDARY;
491 	}
492 #endif
493 #ifdef	VM_FREELIST_DMA32
494 	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
495 		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
496 		paddr = VM_DMA32_BOUNDARY;
497 	}
498 #endif
499 	vm_phys_create_seg(paddr, end);
500 }
501 
/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 *
 * Working from the segments recorded in vm_phys_segs[], this function
 * decides which free lists exist and assigns them indices, binds every
 * segment to its first page and its free queues, merges adjacent segments
 * that share the same queues, initializes the queue heads, selects the
 * default free pool, and initializes the fictitious-range lock.
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef	VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	/*
	 * The segments are visited from the last table entry down to the
	 * first, so npages has already accumulated the pages of the
	 * later entries that took the default branch by the time an
	 * earlier entry is tested against the DMA32 threshold.
	 *
	 * Note that each "else" that precedes an #endif deliberately binds
	 * to whichever statement survives preprocessing after it.
	 */
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef	VM_FREELIST_DMA32
		if (
#ifdef	VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef	VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	/* The final running total is the number of free lists created. */
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/*
	 * Change each entry into a free list index.  A free list that was
	 * not flagged above ends up sharing the index of the nearest
	 * lower-numbered list that was, or gets -1 if there is none.
	 */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		/*
		 * The segments' pages are packed back to back in
		 * vm_page_array; npages is the running page count.
		 */
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
		/*
		 * Select the free list index by the segment's end address,
		 * using the same boundaries as the first pass (but without
		 * the DMA32 threshold test).
		 */
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef	VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			/*
			 * Absorb seg into prev_seg, then close the gap by
			 * shifting the following entries down.  seg is not
			 * advanced, so the entry that moved into its slot is
			 * examined on the next iteration.
			 */
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	/*
	 * Select the first pool that the allocators scan when they must
	 * take pages from a pool other than the requested one.
	 */
#ifdef VM_FREEPOOL_LAZYINIT
	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
#else
	vm_default_freepool = VM_FREEPOOL_DEFAULT;
#endif

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}
647 
648 /*
649  * Register info about the NUMA topology of the system.
650  *
651  * Invoked by platform-dependent code prior to vm_phys_init().
652  */
653 void
654 vm_phys_register_domains(int ndomains __numa_used,
655     struct mem_affinity *affinity __numa_used, int *locality __numa_used)
656 {
657 #ifdef NUMA
658 	int i;
659 
660 	/*
661 	 * For now the only override value that we support is 1, which
662 	 * effectively disables NUMA-awareness in the allocators.
663 	 */
664 	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
665 	if (numa_disabled)
666 		ndomains = 1;
667 
668 	if (ndomains > 1) {
669 		vm_ndomains = ndomains;
670 		mem_affinity = affinity;
671 		mem_locality = locality;
672 	}
673 
674 	for (i = 0; i < vm_ndomains; i++)
675 		DOMAINSET_SET(i, &all_domains);
676 #endif
677 }
678 
679 /*
680  * Split a contiguous, power of two-sized set of physical pages.
681  *
682  * When this function is called by a page allocation function, the caller
683  * should request insertion at the head unless the order [order, oind) queues
684  * are known to be empty.  The objective being to reduce the likelihood of
685  * long-term fragmentation by promoting contemporaneous allocation and
686  * (hopefully) deallocation.
687  */
688 static __inline void
689 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
690     int tail)
691 {
692 	vm_page_t m_buddy;
693 
694 	while (oind > order) {
695 		oind--;
696 		m_buddy = &m[1 << oind];
697 		KASSERT(m_buddy->order == VM_NFREEORDER,
698 		    ("vm_phys_split_pages: page %p has unexpected order %d",
699 		    m_buddy, m_buddy->order));
700 		vm_freelist_add(fl, m_buddy, oind, tail);
701         }
702 }
703 
704 static void
705 vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int tail)
706 {
707 	KASSERT(order >= 0 && order < VM_NFREEORDER,
708 	    ("%s: invalid order %d", __func__, order));
709 
710 	vm_freelist_add(fl, m, order, tail);
711 #ifdef VM_FREEPOOL_LAZYINIT
712 	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
713 		vm_page_t m_next;
714 		vm_paddr_t pa;
715 		int npages;
716 
717 		npages = 1 << order;
718 		m_next = m + npages;
719 		pa = m->phys_addr + ptoa(npages);
720 		if (pa < vm_phys_segs[m->segind].end) {
721 			vm_page_init_page(m_next, pa, m->segind,
722 			    VM_FREEPOOL_LAZYINIT);
723 		}
724 	}
725 #endif
726 }
727 
728 /*
729  * Add the physical pages [m, m + npages) at the beginning of a power-of-two
730  * aligned and sized set to the specified free list.
731  *
732  * When this function is called by a page allocation function, the caller
733  * should request insertion at the head unless the lower-order queues are
734  * known to be empty.  The objective being to reduce the likelihood of long-
735  * term fragmentation by promoting contemporaneous allocation and (hopefully)
736  * deallocation.
737  *
738  * The physical page m's buddy must not be free.
739  */
740 static void
741 vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
742 {
743         int order;
744 
745 	KASSERT(npages == 0 ||
746 	    (VM_PAGE_TO_PHYS(m) &
747 	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
748 	    ("%s: page %p and npages %u are misaligned",
749 	    __func__, m, npages));
750         while (npages > 0) {
751 		KASSERT(m->order == VM_NFREEORDER,
752 		    ("%s: page %p has unexpected order %d",
753 		    __func__, m, m->order));
754 		order = ilog2(npages);
755 		KASSERT(order < VM_NFREEORDER,
756 		    ("%s: order %d is out of range", __func__, order));
757 		vm_phys_enq_chunk(fl, m, order, tail);
758 		m += 1 << order;
759 		npages -= 1 << order;
760 	}
761 }
762 
763 /*
764  * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
765  * and sized set to the specified free list.
766  *
767  * When this function is called by a page allocation function, the caller
768  * should request insertion at the head unless the lower-order queues are
769  * known to be empty.  The objective being to reduce the likelihood of long-
770  * term fragmentation by promoting contemporaneous allocation and (hopefully)
771  * deallocation.
772  *
773  * If npages is zero, this function does nothing and ignores the physical page
774  * parameter m.  Otherwise, the physical page m's buddy must not be free.
775  */
776 static vm_page_t
777 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
778 {
779 	int order;
780 
781 	KASSERT(npages == 0 ||
782 	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
783 	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
784 	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
785 	    m, npages));
786 	while (npages > 0) {
787 		KASSERT(m->order == VM_NFREEORDER,
788 		    ("vm_phys_enq_range: page %p has unexpected order %d",
789 		    m, m->order));
790 		order = ffs(npages) - 1;
791 		vm_phys_enq_chunk(fl, m, order, tail);
792 		m += 1 << order;
793 		npages -= 1 << order;
794 	}
795 	return (m);
796 }
797 
798 /*
799  * Set the pool for a contiguous, power of two-sized set of physical pages.
800  *
801  * If the pages currently belong to the lazy init pool, then the corresponding
802  * page structures must be initialized.  In this case it is assumed that the
803  * first page in the run has already been initialized.
804  */
805 static void
806 vm_phys_set_pool(int pool, vm_page_t m, int order)
807 {
808 #ifdef VM_FREEPOOL_LAZYINIT
809 	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
810 		vm_paddr_t pa;
811 		int segind;
812 
813 		m->pool = pool;
814 
815 		TSENTER();
816 		pa = m->phys_addr + PAGE_SIZE;
817 		segind = m->segind;
818 		for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
819 		    m_tmp++, pa += PAGE_SIZE)
820 			vm_page_init_page(m_tmp, pa, segind, pool);
821 		TSEXIT();
822 	} else
823 #endif
824 		for (vm_page_t m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
825 			m_tmp->pool = pool;
826 }
827 
828 /*
829  * Tries to allocate the specified number of pages from the specified pool
830  * within the specified domain.  Returns the actual number of allocated pages
831  * and a pointer to each page through the array ma[].
832  *
833  * The returned pages may not be physically contiguous.  However, in contrast
834  * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
835  * calling this function once to allocate the desired number of pages will
836  * avoid wasted time in vm_phys_split_pages().
837  *
838  * The free page queues for the specified domain must be locked.
839  */
840 int
841 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
842 {
843 	struct vm_freelist *alt, *fl;
844 	vm_page_t m;
845 	int avail, end, flind, freelist, i, oind, pind;
846 
847 	KASSERT(domain >= 0 && domain < vm_ndomains,
848 	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
849 	KASSERT(vm_phys_pool_valid(pool),
850 	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
851 	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
852 	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
853 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
854 	i = 0;
855 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
856 		flind = vm_freelist_to_flind[freelist];
857 		if (flind < 0)
858 			continue;
859 		fl = vm_phys_free_queues[domain][flind][pool];
860 		for (oind = 0; oind < VM_NFREEORDER; oind++) {
861 			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
862 				vm_freelist_rem(fl, m, oind);
863 				avail = i + (1 << oind);
864 				end = imin(npages, avail);
865 				while (i < end)
866 					ma[i++] = m++;
867 				if (i == npages) {
868 					/*
869 					 * Return excess pages to fl.  Its order
870 					 * [0, oind) queues are empty.
871 					 */
872 					vm_phys_enq_range(m, avail - i, fl, 1);
873 					return (npages);
874 				}
875 			}
876 		}
877 		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
878 			for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
879 			    pind++) {
880 				alt = vm_phys_free_queues[domain][flind][pind];
881 				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
882 				    NULL) {
883 					vm_freelist_rem(alt, m, oind);
884 					vm_phys_set_pool(pool, m, oind);
885 					avail = i + (1 << oind);
886 					end = imin(npages, avail);
887 					while (i < end)
888 						ma[i++] = m++;
889 					if (i == npages) {
890 						/*
891 						 * Return excess pages to fl.
892 						 * Its order [0, oind) queues
893 						 * are empty.
894 						 */
895 						vm_phys_enq_range(m, avail - i,
896 						    fl, 1);
897 						return (npages);
898 					}
899 				}
900 			}
901 		}
902 	}
903 	return (i);
904 }
905 
906 /*
907  * Allocate a contiguous, power of two-sized set of physical pages from the
908  * specified free list.  The free list must be specified using one of the
909  * manifest constants VM_FREELIST_*.
910  *
911  * The free page queues must be locked.
912  */
913 static vm_page_t
914 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
915 {
916 	struct vm_freelist *alt, *fl;
917 	vm_page_t m;
918 	int oind, pind, flind;
919 
920 	KASSERT(domain >= 0 && domain < vm_ndomains,
921 	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
922 	    domain));
923 	KASSERT(freelist < VM_NFREELIST,
924 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
925 	    freelist));
926 	KASSERT(vm_phys_pool_valid(pool),
927 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
928 	KASSERT(order < VM_NFREEORDER,
929 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
930 
931 	flind = vm_freelist_to_flind[freelist];
932 	/* Check if freelist is present */
933 	if (flind < 0)
934 		return (NULL);
935 
936 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
937 	fl = &vm_phys_free_queues[domain][flind][pool][0];
938 	for (oind = order; oind < VM_NFREEORDER; oind++) {
939 		m = TAILQ_FIRST(&fl[oind].pl);
940 		if (m != NULL) {
941 			vm_freelist_rem(fl, m, oind);
942 			/* The order [order, oind) queues are empty. */
943 			vm_phys_split_pages(m, oind, fl, order, 1);
944 			return (m);
945 		}
946 	}
947 
948 	/*
949 	 * The given pool was empty.  Find the largest
950 	 * contiguous, power-of-two-sized set of pages in any
951 	 * pool.  Transfer these pages to the given pool, and
952 	 * use them to satisfy the allocation.
953 	 */
954 	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
955 		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
956 			alt = &vm_phys_free_queues[domain][flind][pind][0];
957 			m = TAILQ_FIRST(&alt[oind].pl);
958 			if (m != NULL) {
959 				vm_freelist_rem(alt, m, oind);
960 				vm_phys_set_pool(pool, m, oind);
961 				/* The order [order, oind) queues are empty. */
962 				vm_phys_split_pages(m, oind, fl, order, 1);
963 				return (m);
964 			}
965 		}
966 	}
967 	return (NULL);
968 }
969 
970 /*
971  * Allocate a contiguous, power of two-sized set of physical pages
972  * from the free lists.
973  *
974  * The free page queues must be locked.
975  */
976 vm_page_t
977 vm_phys_alloc_pages(int domain, int pool, int order)
978 {
979 	vm_page_t m;
980 	int freelist;
981 
982 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
983 		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
984 		if (m != NULL)
985 			return (m);
986 	}
987 	return (NULL);
988 }
989 
990 /*
991  * Find the vm_page corresponding to the given physical address, which must lie
992  * within the given physical memory segment.
993  */
994 vm_page_t
995 vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
996 {
997 	KASSERT(pa >= seg->start && pa < seg->end,
998 	    ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));
999 
1000 	return (&seg->first_page[atop(pa - seg->start)]);
1001 }
1002 
1003 /*
1004  * Find the vm_page corresponding to the given physical address.
1005  */
1006 vm_page_t
1007 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
1008 {
1009 	struct vm_phys_seg *seg;
1010 
1011 	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
1012 		return (vm_phys_seg_paddr_to_vm_page(seg, pa));
1013 	return (NULL);
1014 }
1015 
1016 vm_page_t
1017 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
1018 {
1019 	struct vm_phys_fictitious_seg tmp, *seg;
1020 	vm_page_t m;
1021 
1022 	m = NULL;
1023 	tmp.start = pa;
1024 	tmp.end = 0;
1025 
1026 	rw_rlock(&vm_phys_fictitious_reg_lock);
1027 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1028 	rw_runlock(&vm_phys_fictitious_reg_lock);
1029 	if (seg == NULL)
1030 		return (NULL);
1031 
1032 	m = &seg->first_page[atop(pa - seg->start)];
1033 	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
1034 
1035 	return (m);
1036 }
1037 
1038 static inline void
1039 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
1040     long page_count, vm_memattr_t memattr)
1041 {
1042 	long i;
1043 
1044 	bzero(range, page_count * sizeof(*range));
1045 	for (i = 0; i < page_count; i++) {
1046 		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
1047 		range[i].oflags &= ~VPO_UNMANAGED;
1048 		range[i].busy_lock = VPB_UNBUSIED;
1049 	}
1050 }
1051 
1052 int
1053 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
1054     vm_memattr_t memattr)
1055 {
1056 	struct vm_phys_fictitious_seg *seg;
1057 	vm_page_t fp;
1058 	long page_count;
1059 #ifdef VM_PHYSSEG_DENSE
1060 	long pi, pe;
1061 	long dpage_count;
1062 #endif
1063 
1064 	KASSERT(start < end,
1065 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
1066 	    (uintmax_t)start, (uintmax_t)end));
1067 
1068 	page_count = (end - start) / PAGE_SIZE;
1069 
1070 #ifdef VM_PHYSSEG_DENSE
1071 	pi = atop(start);
1072 	pe = atop(end);
1073 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1074 		fp = &vm_page_array[pi - first_page];
1075 		if ((pe - first_page) > vm_page_array_size) {
1076 			/*
1077 			 * We have a segment that starts inside
1078 			 * of vm_page_array, but ends outside of it.
1079 			 *
1080 			 * Use vm_page_array pages for those that are
1081 			 * inside of the vm_page_array range, and
1082 			 * allocate the remaining ones.
1083 			 */
1084 			dpage_count = vm_page_array_size - (pi - first_page);
1085 			vm_phys_fictitious_init_range(fp, start, dpage_count,
1086 			    memattr);
1087 			page_count -= dpage_count;
1088 			start += ptoa(dpage_count);
1089 			goto alloc;
1090 		}
1091 		/*
1092 		 * We can allocate the full range from vm_page_array,
1093 		 * so there's no need to register the range in the tree.
1094 		 */
1095 		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
1096 		return (0);
1097 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1098 		/*
1099 		 * We have a segment that ends inside of vm_page_array,
1100 		 * but starts outside of it.
1101 		 */
1102 		fp = &vm_page_array[0];
1103 		dpage_count = pe - first_page;
1104 		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
1105 		    memattr);
1106 		end -= ptoa(dpage_count);
1107 		page_count -= dpage_count;
1108 		goto alloc;
1109 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1110 		/*
1111 		 * Trying to register a fictitious range that expands before
1112 		 * and after vm_page_array.
1113 		 */
1114 		return (EINVAL);
1115 	} else {
1116 alloc:
1117 #endif
1118 		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
1119 		    M_WAITOK);
1120 #ifdef VM_PHYSSEG_DENSE
1121 	}
1122 #endif
1123 	vm_phys_fictitious_init_range(fp, start, page_count, memattr);
1124 
1125 	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
1126 	seg->start = start;
1127 	seg->end = end;
1128 	seg->first_page = fp;
1129 
1130 	rw_wlock(&vm_phys_fictitious_reg_lock);
1131 	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
1132 	rw_wunlock(&vm_phys_fictitious_reg_lock);
1133 
1134 	return (0);
1135 }
1136 
1137 void
1138 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
1139 {
1140 	struct vm_phys_fictitious_seg *seg, tmp;
1141 #ifdef VM_PHYSSEG_DENSE
1142 	long pi, pe;
1143 #endif
1144 
1145 	KASSERT(start < end,
1146 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
1147 	    (uintmax_t)start, (uintmax_t)end));
1148 
1149 #ifdef VM_PHYSSEG_DENSE
1150 	pi = atop(start);
1151 	pe = atop(end);
1152 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1153 		if ((pe - first_page) <= vm_page_array_size) {
1154 			/*
1155 			 * This segment was allocated using vm_page_array
1156 			 * only, there's nothing to do since those pages
1157 			 * were never added to the tree.
1158 			 */
1159 			return;
1160 		}
1161 		/*
1162 		 * We have a segment that starts inside
1163 		 * of vm_page_array, but ends outside of it.
1164 		 *
1165 		 * Calculate how many pages were added to the
1166 		 * tree and free them.
1167 		 */
1168 		start = ptoa(first_page + vm_page_array_size);
1169 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1170 		/*
1171 		 * We have a segment that ends inside of vm_page_array,
1172 		 * but starts outside of it.
1173 		 */
1174 		end = ptoa(first_page);
1175 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1176 		/* Since it's not possible to register such a range, panic. */
1177 		panic(
1178 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
1179 		    (uintmax_t)start, (uintmax_t)end);
1180 	}
1181 #endif
1182 	tmp.start = start;
1183 	tmp.end = 0;
1184 
1185 	rw_wlock(&vm_phys_fictitious_reg_lock);
1186 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1187 	if (seg->start != start || seg->end != end) {
1188 		rw_wunlock(&vm_phys_fictitious_reg_lock);
1189 		panic(
1190 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
1191 		    (uintmax_t)start, (uintmax_t)end);
1192 	}
1193 	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
1194 	rw_wunlock(&vm_phys_fictitious_reg_lock);
1195 	free(seg->first_page, M_FICT_PAGES);
1196 	free(seg, M_FICT_PAGES);
1197 }
1198 
1199 /*
1200  * Free a contiguous, power of two-sized set of physical pages.
1201  *
1202  * The free page queues must be locked.
1203  */
1204 void
1205 vm_phys_free_pages(vm_page_t m, int order)
1206 {
1207 	struct vm_freelist *fl;
1208 	struct vm_phys_seg *seg;
1209 	vm_paddr_t pa;
1210 	vm_page_t m_buddy;
1211 
1212 	KASSERT(m->order == VM_NFREEORDER,
1213 	    ("vm_phys_free_pages: page %p has unexpected order %d",
1214 	    m, m->order));
1215 	KASSERT(vm_phys_pool_valid(m->pool),
1216 	    ("vm_phys_free_pages: page %p has unexpected pool %d",
1217 	    m, m->pool));
1218 	KASSERT(order < VM_NFREEORDER,
1219 	    ("vm_phys_free_pages: order %d is out of range", order));
1220 	seg = &vm_phys_segs[m->segind];
1221 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1222 	if (order < VM_NFREEORDER - 1) {
1223 		pa = VM_PAGE_TO_PHYS(m);
1224 		do {
1225 			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
1226 			if (pa < seg->start || pa >= seg->end)
1227 				break;
1228 			m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
1229 			if (m_buddy->order != order)
1230 				break;
1231 			fl = (*seg->free_queues)[m_buddy->pool];
1232 			vm_freelist_rem(fl, m_buddy, order);
1233 			if (m_buddy->pool != m->pool)
1234 				vm_phys_set_pool(m->pool, m_buddy, order);
1235 			order++;
1236 			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
1237 			m = vm_phys_seg_paddr_to_vm_page(seg, pa);
1238 		} while (order < VM_NFREEORDER - 1);
1239 	}
1240 	fl = (*seg->free_queues)[m->pool];
1241 	vm_freelist_add(fl, m, order, 1);
1242 }
1243 
1244 #ifdef VM_FREEPOOL_LAZYINIT
1245 /*
1246  * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
1247  * them to the default pool.  This is a prerequisite for some rare operations
1248  * which need to scan the page array and thus depend on all pages being
1249  * initialized.
1250  */
1251 static void
1252 vm_phys_lazy_init_domain(int domain, bool locked)
1253 {
1254 	static bool initdone[MAXMEMDOM];
1255 	struct vm_domain *vmd;
1256 	struct vm_freelist *fl;
1257 	vm_page_t m;
1258 	int pind;
1259 	bool unlocked;
1260 
1261 	if (__predict_true(atomic_load_bool(&initdone[domain])))
1262 		return;
1263 
1264 	vmd = VM_DOMAIN(domain);
1265 	if (locked)
1266 		vm_domain_free_assert_locked(vmd);
1267 	else
1268 		vm_domain_free_lock(vmd);
1269 	if (atomic_load_bool(&initdone[domain]))
1270 		goto out;
1271 	pind = VM_FREEPOOL_LAZYINIT;
1272 	for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
1273 		int flind;
1274 
1275 		flind = vm_freelist_to_flind[freelist];
1276 		if (flind < 0)
1277 			continue;
1278 		fl = vm_phys_free_queues[domain][flind][pind];
1279 		for (int oind = 0; oind < VM_NFREEORDER; oind++) {
1280 			if (atomic_load_int(&fl[oind].lcnt) == 0)
1281 				continue;
1282 			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
1283 				/*
1284 				 * Avoid holding the lock across the
1285 				 * initialization unless there's a free page
1286 				 * shortage.
1287 				 */
1288 				vm_freelist_rem(fl, m, oind);
1289 				unlocked = vm_domain_allocate(vmd,
1290 				    VM_ALLOC_NORMAL, 1 << oind);
1291 				if (unlocked)
1292 					vm_domain_free_unlock(vmd);
1293 				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
1294 				if (unlocked) {
1295 					vm_domain_freecnt_inc(vmd, 1 << oind);
1296 					vm_domain_free_lock(vmd);
1297 				}
1298 				vm_phys_free_pages(m, oind);
1299 			}
1300 		}
1301 	}
1302 	atomic_store_bool(&initdone[domain], true);
1303 out:
1304 	if (!locked)
1305 		vm_domain_free_unlock(vmd);
1306 }
1307 
1308 static void
1309 vm_phys_lazy_init(void)
1310 {
1311 	for (int domain = 0; domain < vm_ndomains; domain++)
1312 		vm_phys_lazy_init_domain(domain, false);
1313 	atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
1314 }
1315 
1316 static void
1317 vm_phys_lazy_init_kthr(void *arg __unused)
1318 {
1319 	vm_phys_lazy_init();
1320 	kthread_exit();
1321 }
1322 
1323 static void
1324 vm_phys_lazy_sysinit(void *arg __unused)
1325 {
1326 	struct thread *td;
1327 	int error;
1328 
1329 	error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
1330 	    RFSTOPPED, 0, "vmlazyinit");
1331 	if (error == 0) {
1332 		thread_lock(td);
1333 		sched_prio(td, PRI_MIN_IDLE);
1334 		sched_add(td, SRQ_BORING);
1335 	} else {
1336 		printf("%s: could not create lazy init thread: %d\n",
1337 		    __func__, error);
1338 		vm_phys_lazy_init();
1339 	}
1340 }
1341 SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
1342     NULL);
1343 #endif /* VM_FREEPOOL_LAZYINIT */
1344 
1345 /*
1346  * Free a contiguous, arbitrarily sized set of physical pages, without
1347  * merging across set boundaries.
1348  *
1349  * The free page queues must be locked.
1350  */
1351 void
1352 vm_phys_enqueue_contig(vm_page_t m, u_long npages)
1353 {
1354 	struct vm_freelist *fl;
1355 	struct vm_phys_seg *seg;
1356 	vm_page_t m_end;
1357 	vm_paddr_t diff, lo;
1358 	int order;
1359 
1360 	/*
1361 	 * Avoid unnecessary coalescing by freeing the pages in the largest
1362 	 * possible power-of-two-sized subsets.
1363 	 */
1364 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1365 	seg = &vm_phys_segs[m->segind];
1366 	fl = (*seg->free_queues)[m->pool];
1367 	m_end = m + npages;
1368 	/* Free blocks of increasing size. */
1369 	lo = atop(VM_PAGE_TO_PHYS(m));
1370 	if (m < m_end &&
1371 	    (diff = lo ^ (lo + npages - 1)) != 0) {
1372 		order = min(ilog2(diff), VM_NFREEORDER - 1);
1373 		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1);
1374 	}
1375 
1376 	/* Free blocks of maximum size. */
1377 	order = VM_NFREEORDER - 1;
1378 	while (m + (1 << order) <= m_end) {
1379 		KASSERT(seg == &vm_phys_segs[m->segind],
1380 		    ("%s: page range [%p,%p) spans multiple segments",
1381 		    __func__, m_end - npages, m));
1382 		vm_phys_enq_chunk(fl, m, order, 1);
1383 		m += 1 << order;
1384 	}
1385 	/* Free blocks of diminishing size. */
1386 	vm_phys_enq_beg(m, m_end - m, fl, 1);
1387 }
1388 
1389 /*
1390  * Free a contiguous, arbitrarily sized set of physical pages.
1391  *
1392  * The free page queues must be locked.
1393  */
1394 void
1395 vm_phys_free_contig(vm_page_t m, u_long npages)
1396 {
1397 	vm_paddr_t lo;
1398 	vm_page_t m_start, m_end;
1399 	unsigned max_order, order_start, order_end;
1400 
1401 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1402 
1403 	lo = atop(VM_PAGE_TO_PHYS(m));
1404 	max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);
1405 
1406 	m_start = m;
1407 	order_start = ffsll(lo) - 1;
1408 	if (order_start < max_order)
1409 		m_start += 1 << order_start;
1410 	m_end = m + npages;
1411 	order_end = ffsll(lo + npages) - 1;
1412 	if (order_end < max_order)
1413 		m_end -= 1 << order_end;
1414 	/*
1415 	 * Avoid unnecessary coalescing by freeing the pages at the start and
1416 	 * end of the range last.
1417 	 */
1418 	if (m_start < m_end)
1419 		vm_phys_enqueue_contig(m_start, m_end - m_start);
1420 	if (order_start < max_order)
1421 		vm_phys_free_pages(m, order_start);
1422 	if (order_end < max_order)
1423 		vm_phys_free_pages(m_end, order_end);
1424 }
1425 
1426 /*
1427  * Identify the first address range within segment segind or greater
1428  * that matches the domain, lies within the low/high range, and has
1429  * enough pages.  Return -1 if there is none.
1430  */
1431 int
1432 vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
1433     u_long npages, vm_paddr_t low, vm_paddr_t high)
1434 {
1435 	vm_paddr_t pa_end, pa_start;
1436 	struct vm_phys_seg *end_seg, *seg;
1437 
1438 	KASSERT(npages > 0, ("npages is zero"));
1439 	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
1440 	end_seg = &vm_phys_segs[vm_phys_nsegs];
1441 	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
1442 		if (seg->domain != domain)
1443 			continue;
1444 		if (seg->start >= high)
1445 			return (-1);
1446 		pa_start = MAX(low, seg->start);
1447 		pa_end = MIN(high, seg->end);
1448 		if (pa_end - pa_start < ptoa(npages))
1449 			continue;
1450 #ifdef VM_FREEPOOL_LAZYINIT
1451 		/*
1452 		 * The pages on the free lists must be initialized.
1453 		 */
1454 		vm_phys_lazy_init_domain(domain, false);
1455 #endif
1456 		bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
1457 		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
1458 		return (seg - vm_phys_segs);
1459 	}
1460 	return (-1);
1461 }
1462 
1463 /*
1464  * Search for the given physical page "m" in the free lists.  If the search
1465  * succeeds, remove "m" from the free lists and return true.  Otherwise, return
1466  * false, indicating that "m" is not in the free lists.
1467  *
1468  * The free page queues must be locked.
1469  */
1470 bool
1471 vm_phys_unfree_page(vm_paddr_t pa)
1472 {
1473 	struct vm_freelist *fl;
1474 	struct vm_phys_seg *seg;
1475 	vm_paddr_t pa_half;
1476 	vm_page_t m, m_set, m_tmp;
1477 	int order;
1478 
1479 	seg = vm_phys_paddr_to_seg(pa);
1480 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1481 
1482 	/*
1483 	 * The pages on the free lists must be initialized.
1484 	 */
1485 #ifdef VM_FREEPOOL_LAZYINIT
1486 	vm_phys_lazy_init_domain(seg->domain, true);
1487 #endif
1488 
1489 	/*
1490 	 * First, find the contiguous, power of two-sized set of free
1491 	 * physical pages containing the given physical page "m" and
1492 	 * assign it to "m_set".
1493 	 */
1494 	m = vm_phys_paddr_to_vm_page(pa);
1495 	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1496 	    order < VM_NFREEORDER - 1; ) {
1497 		order++;
1498 		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
1499 		if (pa >= seg->start)
1500 			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
1501 		else
1502 			return (false);
1503 	}
1504 	if (m_set->order < order)
1505 		return (false);
1506 	if (m_set->order == VM_NFREEORDER)
1507 		return (false);
1508 	KASSERT(m_set->order < VM_NFREEORDER,
1509 	    ("vm_phys_unfree_page: page %p has unexpected order %d",
1510 	    m_set, m_set->order));
1511 
1512 	/*
1513 	 * Next, remove "m_set" from the free lists.  Finally, extract
1514 	 * "m" from "m_set" using an iterative algorithm: While "m_set"
1515 	 * is larger than a page, shrink "m_set" by returning the half
1516 	 * of "m_set" that does not contain "m" to the free lists.
1517 	 */
1518 	fl = (*seg->free_queues)[m_set->pool];
1519 	order = m_set->order;
1520 	vm_freelist_rem(fl, m_set, order);
1521 	while (order > 0) {
1522 		order--;
1523 		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
1524 		if (m->phys_addr < pa_half)
1525 			m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
1526 		else {
1527 			m_tmp = m_set;
1528 			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
1529 		}
1530 		vm_freelist_add(fl, m_tmp, order, 0);
1531 	}
1532 	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
1533 	return (true);
1534 }
1535 
1536 /*
1537  * Find a run of contiguous physical pages, meeting alignment requirements, from
1538  * a list of max-sized page blocks, where we need at least two consecutive
1539  * blocks to satisfy the (large) page request.
1540  */
1541 static vm_page_t
1542 vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
1543     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1544 {
1545 	struct vm_phys_seg *seg;
1546 	vm_page_t m, m_iter, m_ret;
1547 	vm_paddr_t max_size, size;
1548 	int max_order;
1549 
1550 	max_order = VM_NFREEORDER - 1;
1551 	size = npages << PAGE_SHIFT;
1552 	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
1553 	KASSERT(size > max_size, ("size is too small"));
1554 
1555 	/*
1556 	 * In order to avoid examining any free max-sized page block more than
1557 	 * twice, identify the ones that are first in a physically-contiguous
1558 	 * sequence of such blocks, and only for those walk the sequence to
1559 	 * check if there are enough free blocks starting at a properly aligned
1560 	 * block.  Thus, no block is checked for free-ness more than twice.
1561 	 */
1562 	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
1563 		/*
1564 		 * Skip m unless it is first in a sequence of free max page
1565 		 * blocks >= low in its segment.
1566 		 */
1567 		seg = &vm_phys_segs[m->segind];
1568 		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
1569 			continue;
1570 		if (VM_PAGE_TO_PHYS(m) >= max_size &&
1571 		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
1572 		    max_order == m[-1 << max_order].order)
1573 			continue;
1574 
1575 		/*
1576 		 * Advance m_ret from m to the first of the sequence, if any,
1577 		 * that satisfies alignment conditions and might leave enough
1578 		 * space.
1579 		 */
1580 		m_ret = m;
1581 		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
1582 		    size, alignment, boundary) &&
1583 		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
1584 		    max_order == m_ret[1 << max_order].order)
1585 			m_ret += 1 << max_order;
1586 
1587 		/*
1588 		 * Skip m unless some block m_ret in the sequence is properly
1589 		 * aligned, and begins a sequence of enough pages less than
1590 		 * high, and in the same segment.
1591 		 */
1592 		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
1593 			continue;
1594 
1595 		/*
1596 		 * Skip m unless the blocks to allocate starting at m_ret are
1597 		 * all free.
1598 		 */
1599 		for (m_iter = m_ret;
1600 		    m_iter < m_ret + npages && max_order == m_iter->order;
1601 		    m_iter += 1 << max_order) {
1602 		}
1603 		if (m_iter < m_ret + npages)
1604 			continue;
1605 		return (m_ret);
1606 	}
1607 	return (NULL);
1608 }
1609 
1610 /*
1611  * Find a run of contiguous physical pages from the specified free list
1612  * table.
1613  */
1614 static vm_page_t
1615 vm_phys_find_queues_contig(
1616     struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
1617     u_long npages, vm_paddr_t low, vm_paddr_t high,
1618     u_long alignment, vm_paddr_t boundary)
1619 {
1620 	struct vm_freelist *fl;
1621 	vm_page_t m_ret;
1622 	vm_paddr_t pa, pa_end, size;
1623 	int oind, order, pind;
1624 
1625 	KASSERT(npages > 0, ("npages is 0"));
1626 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1627 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1628 	/* Compute the queue that is the best fit for npages. */
1629 	order = flsl(npages - 1);
1630 	/* Search for a large enough free block. */
1631 	size = npages << PAGE_SHIFT;
1632 	for (oind = order; oind < VM_NFREEORDER; oind++) {
1633 		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1634 			fl = (*queues)[pind];
1635 			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
1636 				/*
1637 				 * Determine if the address range starting at pa
1638 				 * is within the given range, satisfies the
1639 				 * given alignment, and does not cross the given
1640 				 * boundary.
1641 				 */
1642 				pa = VM_PAGE_TO_PHYS(m_ret);
1643 				pa_end = pa + size;
1644 				if (low <= pa && pa_end <= high &&
1645 				    vm_addr_ok(pa, size, alignment, boundary))
1646 					return (m_ret);
1647 			}
1648 		}
1649 	}
1650 	if (order < VM_NFREEORDER)
1651 		return (NULL);
1652 	/* Search for a long-enough sequence of max-order blocks. */
1653 	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1654 		fl = (*queues)[pind];
1655 		m_ret = vm_phys_find_freelist_contig(fl, npages,
1656 		    low, high, alignment, boundary);
1657 		if (m_ret != NULL)
1658 			return (m_ret);
1659 	}
1660 	return (NULL);
1661 }
1662 
1663 /*
1664  * Allocate a contiguous set of physical pages of the given size
1665  * "npages" from the free lists.  All of the physical pages must be at
1666  * or above the given physical address "low" and below the given
1667  * physical address "high".  The given value "alignment" determines the
1668  * alignment of the first physical page in the set.  If the given value
1669  * "boundary" is non-zero, then the set of physical pages cannot cross
1670  * any physical address boundary that is a multiple of that value.  Both
1671  * "alignment" and "boundary" must be a power of two.
1672  */
1673 vm_page_t
1674 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
1675     u_long alignment, vm_paddr_t boundary)
1676 {
1677 	vm_paddr_t pa_end, pa_start;
1678 	struct vm_freelist *fl;
1679 	vm_page_t m, m_run;
1680 	struct vm_phys_seg *seg;
1681 	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
1682 	int oind, segind;
1683 
1684 	KASSERT(npages > 0, ("npages is 0"));
1685 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1686 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1687 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
1688 	if (low >= high)
1689 		return (NULL);
1690 	queues = NULL;
1691 	m_run = NULL;
1692 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1693 		seg = &vm_phys_segs[segind];
1694 		if (seg->start >= high || seg->domain != domain)
1695 			continue;
1696 		if (low >= seg->end)
1697 			break;
1698 		if (low <= seg->start)
1699 			pa_start = seg->start;
1700 		else
1701 			pa_start = low;
1702 		if (high < seg->end)
1703 			pa_end = high;
1704 		else
1705 			pa_end = seg->end;
1706 		if (pa_end - pa_start < ptoa(npages))
1707 			continue;
1708 		/*
1709 		 * If a previous segment led to a search using
1710 		 * the same free lists as would this segment, then
1711 		 * we've actually already searched within this
1712 		 * too.  So skip it.
1713 		 */
1714 		if (seg->free_queues == queues)
1715 			continue;
1716 		queues = seg->free_queues;
1717 		m_run = vm_phys_find_queues_contig(queues, npages,
1718 		    low, high, alignment, boundary);
1719 		if (m_run != NULL)
1720 			break;
1721 	}
1722 	if (m_run == NULL)
1723 		return (NULL);
1724 
1725 	/* Allocate pages from the page-range found. */
1726 	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
1727 		fl = (*queues)[m->pool];
1728 		oind = m->order;
1729 		vm_freelist_rem(fl, m, oind);
1730 		if (m->pool != VM_FREEPOOL_DEFAULT)
1731 			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
1732 	}
1733 	/* Return excess pages to the free lists. */
1734 	fl = (*queues)[VM_FREEPOOL_DEFAULT];
1735 	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0);
1736 
1737 	/* Return page verified to satisfy conditions of request. */
1738 	pa_start = VM_PAGE_TO_PHYS(m_run);
1739 	KASSERT(low <= pa_start,
1740 	    ("memory allocated below minimum requested range"));
1741 	KASSERT(pa_start + ptoa(npages) <= high,
1742 	    ("memory allocated above maximum requested range"));
1743 	seg = &vm_phys_segs[m_run->segind];
1744 	KASSERT(seg->domain == domain,
1745 	    ("memory not allocated from specified domain"));
1746 	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
1747 	    ("memory alignment/boundary constraints not satisfied"));
1748 	return (m_run);
1749 }
1750 
1751 /*
1752  * Return the index of the first unused slot which may be the terminating
1753  * entry.
1754  */
1755 static int
1756 vm_phys_avail_count(void)
1757 {
1758 	int i;
1759 
1760 	for (i = 0; phys_avail[i + 1]; i += 2)
1761 		continue;
1762 	if (i > PHYS_AVAIL_ENTRIES)
1763 		panic("Improperly terminated phys_avail %d entries", i);
1764 
1765 	return (i);
1766 }
1767 
1768 /*
1769  * Assert that a phys_avail entry is valid.
1770  */
1771 static void
1772 vm_phys_avail_check(int i)
1773 {
1774 	if (phys_avail[i] & PAGE_MASK)
1775 		panic("Unaligned phys_avail[%d]: %#jx", i,
1776 		    (intmax_t)phys_avail[i]);
1777 	if (phys_avail[i+1] & PAGE_MASK)
1778 		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
1779 		    (intmax_t)phys_avail[i]);
1780 	if (phys_avail[i + 1] < phys_avail[i])
1781 		panic("phys_avail[%d] start %#jx < end %#jx", i,
1782 		    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]);
1783 }
1784 
/*
 * Return the index of the phys_avail entry [start, end) that contains the
 * address "pa", or -1 if no entry contains it.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int idx;

	idx = 0;
	while (phys_avail[idx + 1] != 0) {
		if (pa >= phys_avail[idx] && pa < phys_avail[idx + 1])
			return (idx);
		idx += 2;
	}
	return (-1);
}
#endif
1800 
1801 /*
1802  * Return the index of the largest entry.
1803  */
1804 int
1805 vm_phys_avail_largest(void)
1806 {
1807 	vm_paddr_t sz, largesz;
1808 	int largest;
1809 	int i;
1810 
1811 	largest = 0;
1812 	largesz = 0;
1813 	for (i = 0; phys_avail[i + 1]; i += 2) {
1814 		sz = vm_phys_avail_size(i);
1815 		if (sz > largesz) {
1816 			largesz = sz;
1817 			largest = i;
1818 		}
1819 	}
1820 
1821 	return (largest);
1822 }
1823 
1824 vm_paddr_t
1825 vm_phys_avail_size(int i)
1826 {
1827 
1828 	return (phys_avail[i + 1] - phys_avail[i]);
1829 }
1830 
1831 /*
1832  * Split an entry at the address 'pa'.  Return zero on success or errno.
1833  */
1834 static int
1835 vm_phys_avail_split(vm_paddr_t pa, int i)
1836 {
1837 	int cnt;
1838 
1839 	vm_phys_avail_check(i);
1840 	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
1841 		panic("vm_phys_avail_split: invalid address");
1842 	cnt = vm_phys_avail_count();
1843 	if (cnt >= PHYS_AVAIL_ENTRIES)
1844 		return (ENOSPC);
1845 	memmove(&phys_avail[i + 2], &phys_avail[i],
1846 	    (cnt - i) * sizeof(phys_avail[0]));
1847 	phys_avail[i + 1] = pa;
1848 	phys_avail[i + 2] = pa;
1849 	vm_phys_avail_check(i);
1850 	vm_phys_avail_check(i+2);
1851 
1852 	return (0);
1853 }
1854 
1855 /*
1856  * Check if a given physical address can be included as part of a crash dump.
1857  */
1858 bool
1859 vm_phys_is_dumpable(vm_paddr_t pa)
1860 {
1861 	vm_page_t m;
1862 	int i;
1863 
1864 	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
1865 		return ((m->flags & PG_NODUMP) == 0);
1866 
1867 	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
1868 		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
1869 			return (true);
1870 	}
1871 	return (false);
1872 }
1873 
1874 void
1875 vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
1876 {
1877 	struct vm_phys_seg *seg;
1878 
1879 	if (vm_phys_early_nsegs == -1)
1880 		panic("%s: called after initialization", __func__);
1881 	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
1882 		panic("%s: ran out of early segments", __func__);
1883 
1884 	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
1885 	seg->start = start;
1886 	seg->end = end;
1887 }
1888 
1889 /*
1890  * This routine allocates NUMA node specific memory before the page
1891  * allocator is bootstrapped.
1892  */
1893 vm_paddr_t
1894 vm_phys_early_alloc(int domain, size_t alloc_size)
1895 {
1896 #ifdef NUMA
1897 	int mem_index;
1898 #endif
1899 	int i, biggestone;
1900 	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;
1901 
1902 	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
1903 	    ("%s: invalid domain index %d", __func__, domain));
1904 
1905 	/*
1906 	 * Search the mem_affinity array for the biggest address
1907 	 * range in the desired domain.  This is used to constrain
1908 	 * the phys_avail selection below.
1909 	 */
1910 	biggestsize = 0;
1911 	mem_start = 0;
1912 	mem_end = -1;
1913 #ifdef NUMA
1914 	mem_index = 0;
1915 	if (mem_affinity != NULL) {
1916 		for (i = 0;; i++) {
1917 			size = mem_affinity[i].end - mem_affinity[i].start;
1918 			if (size == 0)
1919 				break;
1920 			if (domain != -1 && mem_affinity[i].domain != domain)
1921 				continue;
1922 			if (size > biggestsize) {
1923 				mem_index = i;
1924 				biggestsize = size;
1925 			}
1926 		}
1927 		mem_start = mem_affinity[mem_index].start;
1928 		mem_end = mem_affinity[mem_index].end;
1929 	}
1930 #endif
1931 
1932 	/*
1933 	 * Now find biggest physical segment in within the desired
1934 	 * numa domain.
1935 	 */
1936 	biggestsize = 0;
1937 	biggestone = 0;
1938 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1939 		/* skip regions that are out of range */
1940 		if (phys_avail[i+1] - alloc_size < mem_start ||
1941 		    phys_avail[i+1] > mem_end)
1942 			continue;
1943 		size = vm_phys_avail_size(i);
1944 		if (size > biggestsize) {
1945 			biggestone = i;
1946 			biggestsize = size;
1947 		}
1948 	}
1949 	alloc_size = round_page(alloc_size);
1950 
1951 	/*
1952 	 * Grab single pages from the front to reduce fragmentation.
1953 	 */
1954 	if (alloc_size == PAGE_SIZE) {
1955 		pa = phys_avail[biggestone];
1956 		phys_avail[biggestone] += PAGE_SIZE;
1957 		vm_phys_avail_check(biggestone);
1958 		return (pa);
1959 	}
1960 
1961 	/*
1962 	 * Naturally align large allocations.
1963 	 */
1964 	align = phys_avail[biggestone + 1] & (alloc_size - 1);
1965 	if (alloc_size + align > biggestsize)
1966 		panic("cannot find a large enough size\n");
1967 	if (align != 0 &&
1968 	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
1969 	    biggestone) != 0)
1970 		/* Wasting memory. */
1971 		phys_avail[biggestone + 1] -= align;
1972 
1973 	phys_avail[biggestone + 1] -= alloc_size;
1974 	vm_phys_avail_check(biggestone);
1975 	pa = phys_avail[biggestone + 1];
1976 	return (pa);
1977 }
1978 
1979 void
1980 vm_phys_early_startup(void)
1981 {
1982 	struct vm_phys_seg *seg;
1983 	int i;
1984 
1985 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1986 		phys_avail[i] = round_page(phys_avail[i]);
1987 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
1988 	}
1989 
1990 	for (i = 0; i < vm_phys_early_nsegs; i++) {
1991 		seg = &vm_phys_early_segs[i];
1992 		vm_phys_add_seg(seg->start, seg->end);
1993 	}
1994 	vm_phys_early_nsegs = -1;
1995 
1996 #ifdef NUMA
1997 	/* Force phys_avail to be split by domain. */
1998 	if (mem_affinity != NULL) {
1999 		int idx;
2000 
2001 		for (i = 0; mem_affinity[i].end != 0; i++) {
2002 			idx = vm_phys_avail_find(mem_affinity[i].start);
2003 			if (idx != -1 &&
2004 			    phys_avail[idx] != mem_affinity[i].start)
2005 				vm_phys_avail_split(mem_affinity[i].start, idx);
2006 			idx = vm_phys_avail_find(mem_affinity[i].end);
2007 			if (idx != -1 &&
2008 			    phys_avail[idx] != mem_affinity[i].end)
2009 				vm_phys_avail_split(mem_affinity[i].end, idx);
2010 		}
2011 	}
2012 #endif
2013 }
2014 
#ifdef DDB
/*
 * "show freepages" debugger command: for every domain and every free list,
 * print a table with one row per order (largest first) and one column per
 * pool, showing the "lcnt" counter of the corresponding free queue.
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *queue;
	int dom, flind, oind, pind;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			/* Table header: one column per pool. */
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");

			/* Table body: one row per order, highest first. */
			oind = VM_NFREEORDER;
			while (oind-- > 0) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					queue =
					    vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d",
					    queue[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif
2051