xref: /freebsd/sys/vm/vm_phys.c (revision fbff6d54da146e98ec2ce4ebfbb86339d4f9fa21)
111752d88SAlan Cox /*-
24d846d26SWarner Losh  * SPDX-License-Identifier: BSD-2-Clause
3fe267a55SPedro F. Giffuni  *
411752d88SAlan Cox  * Copyright (c) 2002-2006 Rice University
511752d88SAlan Cox  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
611752d88SAlan Cox  * All rights reserved.
711752d88SAlan Cox  *
811752d88SAlan Cox  * This software was developed for the FreeBSD Project by Alan L. Cox,
911752d88SAlan Cox  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
1011752d88SAlan Cox  *
1111752d88SAlan Cox  * Redistribution and use in source and binary forms, with or without
1211752d88SAlan Cox  * modification, are permitted provided that the following conditions
1311752d88SAlan Cox  * are met:
1411752d88SAlan Cox  * 1. Redistributions of source code must retain the above copyright
1511752d88SAlan Cox  *    notice, this list of conditions and the following disclaimer.
1611752d88SAlan Cox  * 2. Redistributions in binary form must reproduce the above copyright
1711752d88SAlan Cox  *    notice, this list of conditions and the following disclaimer in the
1811752d88SAlan Cox  *    documentation and/or other materials provided with the distribution.
1911752d88SAlan Cox  *
2011752d88SAlan Cox  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
2111752d88SAlan Cox  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
2211752d88SAlan Cox  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
2311752d88SAlan Cox  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
2411752d88SAlan Cox  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
2511752d88SAlan Cox  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
2611752d88SAlan Cox  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
2711752d88SAlan Cox  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
2811752d88SAlan Cox  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2911752d88SAlan Cox  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
3011752d88SAlan Cox  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
3111752d88SAlan Cox  * POSSIBILITY OF SUCH DAMAGE.
3211752d88SAlan Cox  */
3311752d88SAlan Cox 
34fbd80bd0SAlan Cox /*
35fbd80bd0SAlan Cox  *	Physical memory system implementation
36fbd80bd0SAlan Cox  *
37fbd80bd0SAlan Cox  * Any external functions defined by this module are only to be used by the
38fbd80bd0SAlan Cox  * virtual memory system.
39fbd80bd0SAlan Cox  */
40fbd80bd0SAlan Cox 
4111752d88SAlan Cox #include <sys/cdefs.h>
4211752d88SAlan Cox #include "opt_ddb.h"
43174b5f38SJohn Baldwin #include "opt_vm.h"
4411752d88SAlan Cox 
4511752d88SAlan Cox #include <sys/param.h>
4611752d88SAlan Cox #include <sys/systm.h>
47662e7fa8SMark Johnston #include <sys/domainset.h>
4811752d88SAlan Cox #include <sys/lock.h>
4911752d88SAlan Cox #include <sys/kernel.h>
50b16b4c22SMark Johnston #include <sys/kthread.h>
5111752d88SAlan Cox #include <sys/malloc.h>
5211752d88SAlan Cox #include <sys/mutex.h>
537e226537SAttilio Rao #include <sys/proc.h>
5411752d88SAlan Cox #include <sys/queue.h>
5538d6b2dcSRoger Pau Monné #include <sys/rwlock.h>
5611752d88SAlan Cox #include <sys/sbuf.h>
57b16b4c22SMark Johnston #include <sys/sched.h>
5811752d88SAlan Cox #include <sys/sysctl.h>
5938d6b2dcSRoger Pau Monné #include <sys/tree.h>
60b16b4c22SMark Johnston #include <sys/tslog.h>
61b16b4c22SMark Johnston #include <sys/unistd.h>
6211752d88SAlan Cox #include <sys/vmmeter.h>
6311752d88SAlan Cox 
6411752d88SAlan Cox #include <ddb/ddb.h>
6511752d88SAlan Cox 
6611752d88SAlan Cox #include <vm/vm.h>
6701e115abSDoug Moore #include <vm/vm_extern.h>
6811752d88SAlan Cox #include <vm/vm_param.h>
6911752d88SAlan Cox #include <vm/vm_kern.h>
7011752d88SAlan Cox #include <vm/vm_object.h>
7111752d88SAlan Cox #include <vm/vm_page.h>
7211752d88SAlan Cox #include <vm/vm_phys.h>
73e2068d0bSJeff Roberson #include <vm/vm_pagequeue.h>
7411752d88SAlan Cox 
75449c2e92SKonstantin Belousov _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
76449c2e92SKonstantin Belousov     "Too many physsegs.");
77c9b06fa5SDoug Moore _Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
78c9b06fa5SDoug Moore     "vm_paddr_t too big for ffsll, flsll.");
7911752d88SAlan Cox 
80b6715dabSJeff Roberson #ifdef NUMA
81cdfeced8SJeff Roberson struct mem_affinity __read_mostly *mem_affinity;
82cdfeced8SJeff Roberson int __read_mostly *mem_locality;
83c415cfc8SZhenlei Huang 
84c415cfc8SZhenlei Huang static int numa_disabled;
85c415cfc8SZhenlei Huang static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
86c415cfc8SZhenlei Huang     "NUMA options");
87c415cfc8SZhenlei Huang SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
88c415cfc8SZhenlei Huang     &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
8962d70a81SJohn Baldwin #endif
90a3870a18SJohn Baldwin 
91cdfeced8SJeff Roberson int __read_mostly vm_ndomains = 1;
92463406acSMark Johnston domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);
937e226537SAttilio Rao 
94cdfeced8SJeff Roberson struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
95cdfeced8SJeff Roberson int __read_mostly vm_phys_nsegs;
9681302f1dSMark Johnston static struct vm_phys_seg vm_phys_early_segs[8];
9781302f1dSMark Johnston static int vm_phys_early_nsegs;
9811752d88SAlan Cox 
9938d6b2dcSRoger Pau Monné struct vm_phys_fictitious_seg;
10038d6b2dcSRoger Pau Monné static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
10138d6b2dcSRoger Pau Monné     struct vm_phys_fictitious_seg *);
10238d6b2dcSRoger Pau Monné 
10338d6b2dcSRoger Pau Monné RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
104b649c2acSDoug Moore     RB_INITIALIZER(&vm_phys_fictitious_tree);
10538d6b2dcSRoger Pau Monné 
10638d6b2dcSRoger Pau Monné struct vm_phys_fictitious_seg {
10738d6b2dcSRoger Pau Monné 	RB_ENTRY(vm_phys_fictitious_seg) node;
10838d6b2dcSRoger Pau Monné 	/* Memory region data */
109b6de32bdSKonstantin Belousov 	vm_paddr_t	start;
110b6de32bdSKonstantin Belousov 	vm_paddr_t	end;
111b6de32bdSKonstantin Belousov 	vm_page_t	first_page;
11238d6b2dcSRoger Pau Monné };
11338d6b2dcSRoger Pau Monné 
11438d6b2dcSRoger Pau Monné RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
11538d6b2dcSRoger Pau Monné     vm_phys_fictitious_cmp);
11638d6b2dcSRoger Pau Monné 
117cdfeced8SJeff Roberson static struct rwlock_padalign vm_phys_fictitious_reg_lock;
118c0432fc3SMark Johnston MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
119b6de32bdSKonstantin Belousov 
120cdfeced8SJeff Roberson static struct vm_freelist __aligned(CACHE_LINE_SIZE)
121f2a496d6SKonstantin Belousov     vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
122f2a496d6SKonstantin Belousov     [VM_NFREEORDER_MAX];
12311752d88SAlan Cox 
124cdfeced8SJeff Roberson static int __read_mostly vm_nfreelists;
125d866a563SAlan Cox 
126d866a563SAlan Cox /*
12721943937SJeff Roberson  * These "avail lists" are globals used to communicate boot-time physical
12821943937SJeff Roberson  * memory layout to other parts of the kernel.  Each physically contiguous
12921943937SJeff Roberson  * region of memory is defined by a start address at an even index and an
13021943937SJeff Roberson  * end address at the following odd index.  Each list is terminated by a
13121943937SJeff Roberson  * pair of zero entries.
13221943937SJeff Roberson  *
13321943937SJeff Roberson  * dump_avail tells the dump code what regions to include in a crash dump, and
13421943937SJeff Roberson  * phys_avail is all of the remaining physical memory that is available for
13521943937SJeff Roberson  * the vm system.
13621943937SJeff Roberson  *
13721943937SJeff Roberson  * Initially dump_avail and phys_avail are identical.  Boot time memory
13821943937SJeff Roberson  * allocations remove extents from phys_avail that may still be included
13921943937SJeff Roberson  * in dumps.
14021943937SJeff Roberson  */
14121943937SJeff Roberson vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
14221943937SJeff Roberson vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
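
/*
 * For illustration only (this example is not part of the interface above):
 * either array can be walked two entries at a time until the terminating
 * pair of zeroes, e.g.
 *
 *	vm_paddr_t total;
 *	int i;
 *
 *	total = 0;
 *	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 *		total += phys_avail[i + 1] - phys_avail[i];
 *
 * sums the physical memory still available to the VM system.
 */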
14321943937SJeff Roberson 
14421943937SJeff Roberson /*
145d866a563SAlan Cox  * Provides the mapping from VM_FREELIST_* to free list indices (flind).
146d866a563SAlan Cox  */
147cdfeced8SJeff Roberson static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
148b16b4c22SMark Johnston static int __read_mostly vm_default_freepool;
149d866a563SAlan Cox 
150d866a563SAlan Cox CTASSERT(VM_FREELIST_DEFAULT == 0);
151d866a563SAlan Cox 
152d866a563SAlan Cox #ifdef VM_FREELIST_DMA32
153d866a563SAlan Cox #define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
154d866a563SAlan Cox #endif
155d866a563SAlan Cox 
156d866a563SAlan Cox /*
157d866a563SAlan Cox  * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
158d866a563SAlan Cox  * the ordering of the free list boundaries.
159d866a563SAlan Cox  */
160d866a563SAlan Cox #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
161d866a563SAlan Cox CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
162d866a563SAlan Cox #endif
16311752d88SAlan Cox 
16411752d88SAlan Cox static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
1657029da5cSPawel Biernacki SYSCTL_OID(_vm, OID_AUTO, phys_free,
166114484b7SMark Johnston     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1677029da5cSPawel Biernacki     sysctl_vm_phys_free, "A",
1687029da5cSPawel Biernacki     "Phys Free Info");
16911752d88SAlan Cox 
17011752d88SAlan Cox static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
1717029da5cSPawel Biernacki SYSCTL_OID(_vm, OID_AUTO, phys_segs,
172114484b7SMark Johnston     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1737029da5cSPawel Biernacki     sysctl_vm_phys_segs, "A",
1747029da5cSPawel Biernacki     "Phys Seg Info");
17511752d88SAlan Cox 
176b6715dabSJeff Roberson #ifdef NUMA
177415d7ccaSAdrian Chadd static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
1787029da5cSPawel Biernacki SYSCTL_OID(_vm, OID_AUTO, phys_locality,
179114484b7SMark Johnston     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1807029da5cSPawel Biernacki     sysctl_vm_phys_locality, "A",
1817029da5cSPawel Biernacki     "Phys Locality Info");
1826520495aSAdrian Chadd #endif
183415d7ccaSAdrian Chadd 
1847e226537SAttilio Rao SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
1857e226537SAttilio Rao     &vm_ndomains, 0, "Number of physical memory domains available.");
186a3870a18SJohn Baldwin 
187d866a563SAlan Cox static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
188d866a563SAlan Cox static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
18911752d88SAlan Cox static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
190370a338aSAlan Cox     int order, int tail);
191c606ab59SDoug Moore 
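/*
 * Return true if "pool" names a free pool from which pages may be allocated.
 * The lazy-init pool, if configured, only holds pages whose vm_page
 * structures have not yet been initialized, so it is excluded.
 */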
192b16b4c22SMark Johnston static bool __diagused
193b16b4c22SMark Johnston vm_phys_pool_valid(int pool)
194b16b4c22SMark Johnston {
195b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
196b16b4c22SMark Johnston 	if (pool == VM_FREEPOOL_LAZYINIT)
197b16b4c22SMark Johnston 		return (false);
198b16b4c22SMark Johnston #endif
199b16b4c22SMark Johnston 	return (pool >= 0 && pool < VM_NFREEPOOL);
200b16b4c22SMark Johnston }
201b16b4c22SMark Johnston 
20238d6b2dcSRoger Pau Monné /*
20338d6b2dcSRoger Pau Monné  * Red-black tree helpers for vm fictitious range management.
20438d6b2dcSRoger Pau Monné  */
20538d6b2dcSRoger Pau Monné static inline int
20638d6b2dcSRoger Pau Monné vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
20738d6b2dcSRoger Pau Monné     struct vm_phys_fictitious_seg *range)
20838d6b2dcSRoger Pau Monné {
20938d6b2dcSRoger Pau Monné 
21038d6b2dcSRoger Pau Monné 	KASSERT(range->start != 0 && range->end != 0,
21138d6b2dcSRoger Pau Monné 	    ("Invalid range passed on search for vm_fictitious page"));
21238d6b2dcSRoger Pau Monné 	if (p->start >= range->end)
21338d6b2dcSRoger Pau Monné 		return (1);
21438d6b2dcSRoger Pau Monné 	if (p->start < range->start)
21538d6b2dcSRoger Pau Monné 		return (-1);
21638d6b2dcSRoger Pau Monné 
21738d6b2dcSRoger Pau Monné 	return (0);
21838d6b2dcSRoger Pau Monné }
21938d6b2dcSRoger Pau Monné 
22038d6b2dcSRoger Pau Monné static int
22138d6b2dcSRoger Pau Monné vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
22238d6b2dcSRoger Pau Monné     struct vm_phys_fictitious_seg *p2)
22338d6b2dcSRoger Pau Monné {
22438d6b2dcSRoger Pau Monné 
22538d6b2dcSRoger Pau Monné 	/* Check if this is a search for a page */
22638d6b2dcSRoger Pau Monné 	if (p1->end == 0)
22738d6b2dcSRoger Pau Monné 		return (vm_phys_fictitious_in_range(p1, p2));
22838d6b2dcSRoger Pau Monné 
22938d6b2dcSRoger Pau Monné 	KASSERT(p2->end != 0,
23038d6b2dcSRoger Pau Monné     ("Invalid range passed as second parameter to vm fictitious comparison"));
23138d6b2dcSRoger Pau Monné 
23238d6b2dcSRoger Pau Monné 	/* Searching to add a new range */
23338d6b2dcSRoger Pau Monné 	if (p1->end <= p2->start)
23438d6b2dcSRoger Pau Monné 		return (-1);
23538d6b2dcSRoger Pau Monné 	if (p1->start >= p2->end)
23638d6b2dcSRoger Pau Monné 		return (1);
23738d6b2dcSRoger Pau Monné 
23838d6b2dcSRoger Pau Monné 	panic("Trying to add overlapping vm fictitious ranges:\n"
23938d6b2dcSRoger Pau Monné 	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
24038d6b2dcSRoger Pau Monné 	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
24138d6b2dcSRoger Pau Monné }
24238d6b2dcSRoger Pau Monné 
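/*
 * Return the index of a memory domain containing physical memory that
 * overlaps the range [low, high], preferring "prefer" when it qualifies.
 * Without NUMA, domain 0 is always returned.
 */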
2436f4acaf4SJeff Roberson int
244cb20a74cSStephen J. Kiernan vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
245cb20a74cSStephen J. Kiernan     vm_paddr_t high __numa_used)
246449c2e92SKonstantin Belousov {
247b6715dabSJeff Roberson #ifdef NUMA
2486f4acaf4SJeff Roberson 	domainset_t mask;
2496f4acaf4SJeff Roberson 	int i;
250449c2e92SKonstantin Belousov 
2516f4acaf4SJeff Roberson 	if (vm_ndomains == 1 || mem_affinity == NULL)
2526f4acaf4SJeff Roberson 		return (0);
2536f4acaf4SJeff Roberson 
2546f4acaf4SJeff Roberson 	DOMAINSET_ZERO(&mask);
2556f4acaf4SJeff Roberson 	/*
2566f4acaf4SJeff Roberson 	 * Check for any memory that overlaps low, high.
2576f4acaf4SJeff Roberson 	 */
2586f4acaf4SJeff Roberson 	for (i = 0; mem_affinity[i].end != 0; i++)
2596f4acaf4SJeff Roberson 		if (mem_affinity[i].start <= high &&
2606f4acaf4SJeff Roberson 		    mem_affinity[i].end >= low)
2616f4acaf4SJeff Roberson 			DOMAINSET_SET(mem_affinity[i].domain, &mask);
2626f4acaf4SJeff Roberson 	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
2636f4acaf4SJeff Roberson 		return (prefer);
2646f4acaf4SJeff Roberson 	if (DOMAINSET_EMPTY(&mask))
2656f4acaf4SJeff Roberson 		panic("vm_phys_domain_match:  Impossible constraint");
2666f4acaf4SJeff Roberson 	return (DOMAINSET_FFS(&mask) - 1);
2676f4acaf4SJeff Roberson #else
2686f4acaf4SJeff Roberson 	return (0);
2696f4acaf4SJeff Roberson #endif
270449c2e92SKonstantin Belousov }
271449c2e92SKonstantin Belousov 
27211752d88SAlan Cox /*
27311752d88SAlan Cox  * Outputs the state of the physical memory allocator, specifically,
27411752d88SAlan Cox  * the amount of physical memory in each free list.
27511752d88SAlan Cox  */
27611752d88SAlan Cox static int
27711752d88SAlan Cox sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
27811752d88SAlan Cox {
27911752d88SAlan Cox 	struct sbuf sbuf;
28011752d88SAlan Cox 	struct vm_freelist *fl;
2817e226537SAttilio Rao 	int dom, error, flind, oind, pind;
28211752d88SAlan Cox 
28300f0e671SMatthew D Fleming 	error = sysctl_wire_old_buffer(req, 0);
28400f0e671SMatthew D Fleming 	if (error != 0)
28500f0e671SMatthew D Fleming 		return (error);
2867e226537SAttilio Rao 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
2877e226537SAttilio Rao 	for (dom = 0; dom < vm_ndomains; dom++) {
288eb2f42fbSAlan Cox 		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
28911752d88SAlan Cox 		for (flind = 0; flind < vm_nfreelists; flind++) {
290eb2f42fbSAlan Cox 			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
29111752d88SAlan Cox 			    "\n  ORDER (SIZE)  |  NUMBER"
29211752d88SAlan Cox 			    "\n              ", flind);
29311752d88SAlan Cox 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
29411752d88SAlan Cox 				sbuf_printf(&sbuf, "  |  POOL %d", pind);
29511752d88SAlan Cox 			sbuf_printf(&sbuf, "\n--            ");
29611752d88SAlan Cox 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
29711752d88SAlan Cox 				sbuf_printf(&sbuf, "-- --      ");
29811752d88SAlan Cox 			sbuf_printf(&sbuf, "--\n");
29911752d88SAlan Cox 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
300d689bc00SAlan Cox 				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
30111752d88SAlan Cox 				    1 << (PAGE_SHIFT - 10 + oind));
30211752d88SAlan Cox 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
3037e226537SAttilio Rao 				fl = vm_phys_free_queues[dom][flind][pind];
304eb2f42fbSAlan Cox 					sbuf_printf(&sbuf, "  |  %6d",
3057e226537SAttilio Rao 					    fl[oind].lcnt);
30611752d88SAlan Cox 				}
30711752d88SAlan Cox 				sbuf_printf(&sbuf, "\n");
30811752d88SAlan Cox 			}
3097e226537SAttilio Rao 		}
31011752d88SAlan Cox 	}
3114e657159SMatthew D Fleming 	error = sbuf_finish(&sbuf);
31211752d88SAlan Cox 	sbuf_delete(&sbuf);
31311752d88SAlan Cox 	return (error);
31411752d88SAlan Cox }
31511752d88SAlan Cox 
31611752d88SAlan Cox /*
31711752d88SAlan Cox  * Outputs the set of physical memory segments.
31811752d88SAlan Cox  */
31911752d88SAlan Cox static int
32011752d88SAlan Cox sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
32111752d88SAlan Cox {
32211752d88SAlan Cox 	struct sbuf sbuf;
32311752d88SAlan Cox 	struct vm_phys_seg *seg;
32411752d88SAlan Cox 	int error, segind;
32511752d88SAlan Cox 
32600f0e671SMatthew D Fleming 	error = sysctl_wire_old_buffer(req, 0);
32700f0e671SMatthew D Fleming 	if (error != 0)
32800f0e671SMatthew D Fleming 		return (error);
3294e657159SMatthew D Fleming 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
33011752d88SAlan Cox 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
33111752d88SAlan Cox 		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
33211752d88SAlan Cox 		seg = &vm_phys_segs[segind];
33311752d88SAlan Cox 		sbuf_printf(&sbuf, "start:     %#jx\n",
33411752d88SAlan Cox 		    (uintmax_t)seg->start);
33511752d88SAlan Cox 		sbuf_printf(&sbuf, "end:       %#jx\n",
33611752d88SAlan Cox 		    (uintmax_t)seg->end);
337a3870a18SJohn Baldwin 		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
33811752d88SAlan Cox 		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
33911752d88SAlan Cox 	}
3404e657159SMatthew D Fleming 	error = sbuf_finish(&sbuf);
34111752d88SAlan Cox 	sbuf_delete(&sbuf);
34211752d88SAlan Cox 	return (error);
34311752d88SAlan Cox }
34411752d88SAlan Cox 
345415d7ccaSAdrian Chadd /*
346415d7ccaSAdrian Chadd  * Return affinity, or -1 if there's no affinity information.
347415d7ccaSAdrian Chadd  */
3486520495aSAdrian Chadd int
349cb20a74cSStephen J. Kiernan vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
350415d7ccaSAdrian Chadd {
351415d7ccaSAdrian Chadd 
352b6715dabSJeff Roberson #ifdef NUMA
353415d7ccaSAdrian Chadd 	if (mem_locality == NULL)
354415d7ccaSAdrian Chadd 		return (-1);
355415d7ccaSAdrian Chadd 	if (f >= vm_ndomains || t >= vm_ndomains)
356415d7ccaSAdrian Chadd 		return (-1);
357415d7ccaSAdrian Chadd 	return (mem_locality[f * vm_ndomains + t]);
3586520495aSAdrian Chadd #else
3596520495aSAdrian Chadd 	return (-1);
3606520495aSAdrian Chadd #endif
361415d7ccaSAdrian Chadd }
362415d7ccaSAdrian Chadd 
363b6715dabSJeff Roberson #ifdef NUMA
364415d7ccaSAdrian Chadd /*
365415d7ccaSAdrian Chadd  * Outputs the VM locality table.
366415d7ccaSAdrian Chadd  */
367415d7ccaSAdrian Chadd static int
368415d7ccaSAdrian Chadd sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
369415d7ccaSAdrian Chadd {
370415d7ccaSAdrian Chadd 	struct sbuf sbuf;
371415d7ccaSAdrian Chadd 	int error, i, j;
372415d7ccaSAdrian Chadd 
373415d7ccaSAdrian Chadd 	error = sysctl_wire_old_buffer(req, 0);
374415d7ccaSAdrian Chadd 	if (error != 0)
375415d7ccaSAdrian Chadd 		return (error);
376415d7ccaSAdrian Chadd 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
377415d7ccaSAdrian Chadd 
378415d7ccaSAdrian Chadd 	sbuf_printf(&sbuf, "\n");
379415d7ccaSAdrian Chadd 
380415d7ccaSAdrian Chadd 	for (i = 0; i < vm_ndomains; i++) {
381415d7ccaSAdrian Chadd 		sbuf_printf(&sbuf, "%d: ", i);
382415d7ccaSAdrian Chadd 		for (j = 0; j < vm_ndomains; j++) {
383415d7ccaSAdrian Chadd 			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
384415d7ccaSAdrian Chadd 		}
385415d7ccaSAdrian Chadd 		sbuf_printf(&sbuf, "\n");
386415d7ccaSAdrian Chadd 	}
387415d7ccaSAdrian Chadd 	error = sbuf_finish(&sbuf);
388415d7ccaSAdrian Chadd 	sbuf_delete(&sbuf);
389415d7ccaSAdrian Chadd 	return (error);
390415d7ccaSAdrian Chadd }
3916520495aSAdrian Chadd #endif
392415d7ccaSAdrian Chadd 
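/*
 * Insert the page "m" into the free list "fl" at the given order, at either
 * the head or the tail of the queue as requested by "tail".
 */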
3937e226537SAttilio Rao static void
3947e226537SAttilio Rao vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
395a3870a18SJohn Baldwin {
396a3870a18SJohn Baldwin 
3977e226537SAttilio Rao 	m->order = order;
3987e226537SAttilio Rao 	if (tail)
3995cd29d0fSMark Johnston 		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
4007e226537SAttilio Rao 	else
4015cd29d0fSMark Johnston 		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
4027e226537SAttilio Rao 	fl[order].lcnt++;
403a3870a18SJohn Baldwin }
4047e226537SAttilio Rao 
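/*
 * Remove the page "m" from the free list "fl" at the given order and mark it
 * as no longer belonging to any free list.
 */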
4057e226537SAttilio Rao static void
4067e226537SAttilio Rao vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
4077e226537SAttilio Rao {
4087e226537SAttilio Rao 
4095cd29d0fSMark Johnston 	TAILQ_REMOVE(&fl[order].pl, m, listq);
4107e226537SAttilio Rao 	fl[order].lcnt--;
4117e226537SAttilio Rao 	m->order = VM_NFREEORDER;
412a3870a18SJohn Baldwin }
413a3870a18SJohn Baldwin 
41411752d88SAlan Cox /*
41511752d88SAlan Cox  * Create a physical memory segment.
41611752d88SAlan Cox  */
41711752d88SAlan Cox static void
418d866a563SAlan Cox _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
41911752d88SAlan Cox {
42011752d88SAlan Cox 	struct vm_phys_seg *seg;
42111752d88SAlan Cox 
42211752d88SAlan Cox 	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
42311752d88SAlan Cox 	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
424ef435ae7SJeff Roberson 	KASSERT(domain >= 0 && domain < vm_ndomains,
4257e226537SAttilio Rao 	    ("vm_phys_create_seg: invalid domain provided"));
42611752d88SAlan Cox 	seg = &vm_phys_segs[vm_phys_nsegs++];
427271f0f12SAlan Cox 	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
428271f0f12SAlan Cox 		*seg = *(seg - 1);
429271f0f12SAlan Cox 		seg--;
430271f0f12SAlan Cox 	}
43111752d88SAlan Cox 	seg->start = start;
43211752d88SAlan Cox 	seg->end = end;
433a3870a18SJohn Baldwin 	seg->domain = domain;
43411752d88SAlan Cox }
43511752d88SAlan Cox 
436a3870a18SJohn Baldwin static void
437d866a563SAlan Cox vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
438a3870a18SJohn Baldwin {
439b6715dabSJeff Roberson #ifdef NUMA
440a3870a18SJohn Baldwin 	int i;
441a3870a18SJohn Baldwin 
442a3870a18SJohn Baldwin 	if (mem_affinity == NULL) {
443d866a563SAlan Cox 		_vm_phys_create_seg(start, end, 0);
444a3870a18SJohn Baldwin 		return;
445a3870a18SJohn Baldwin 	}
446a3870a18SJohn Baldwin 
447a3870a18SJohn Baldwin 	for (i = 0;; i++) {
448a3870a18SJohn Baldwin 		if (mem_affinity[i].end == 0)
449a3870a18SJohn Baldwin 			panic("Reached end of affinity info");
450a3870a18SJohn Baldwin 		if (mem_affinity[i].end <= start)
451a3870a18SJohn Baldwin 			continue;
452a3870a18SJohn Baldwin 		if (mem_affinity[i].start > start)
453a3870a18SJohn Baldwin 			panic("No affinity info for start %jx",
454a3870a18SJohn Baldwin 			    (uintmax_t)start);
455a3870a18SJohn Baldwin 		if (mem_affinity[i].end >= end) {
456d866a563SAlan Cox 			_vm_phys_create_seg(start, end,
457a3870a18SJohn Baldwin 			    mem_affinity[i].domain);
458a3870a18SJohn Baldwin 			break;
459a3870a18SJohn Baldwin 		}
460d866a563SAlan Cox 		_vm_phys_create_seg(start, mem_affinity[i].end,
461a3870a18SJohn Baldwin 		    mem_affinity[i].domain);
462a3870a18SJohn Baldwin 		start = mem_affinity[i].end;
463a3870a18SJohn Baldwin 	}
46462d70a81SJohn Baldwin #else
46562d70a81SJohn Baldwin 	_vm_phys_create_seg(start, end, 0);
46662d70a81SJohn Baldwin #endif
467a3870a18SJohn Baldwin }
468a3870a18SJohn Baldwin 
46911752d88SAlan Cox /*
470271f0f12SAlan Cox  * Add a physical memory segment.
471271f0f12SAlan Cox  */
472271f0f12SAlan Cox void
473271f0f12SAlan Cox vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
474271f0f12SAlan Cox {
475d866a563SAlan Cox 	vm_paddr_t paddr;
476271f0f12SAlan Cox 
477271f0f12SAlan Cox 	KASSERT((start & PAGE_MASK) == 0,
478271f0f12SAlan Cox 	    ("vm_phys_define_seg: start is not page aligned"));
479271f0f12SAlan Cox 	KASSERT((end & PAGE_MASK) == 0,
480271f0f12SAlan Cox 	    ("vm_phys_define_seg: end is not page aligned"));
481d866a563SAlan Cox 
482d866a563SAlan Cox 	/*
483d866a563SAlan Cox 	 * Split the physical memory segment if it spans two or more free
484d866a563SAlan Cox 	 * list boundaries.
485d866a563SAlan Cox 	 */
486d866a563SAlan Cox 	paddr = start;
487d866a563SAlan Cox #ifdef	VM_FREELIST_LOWMEM
488d866a563SAlan Cox 	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
489d866a563SAlan Cox 		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
490d866a563SAlan Cox 		paddr = VM_LOWMEM_BOUNDARY;
491d866a563SAlan Cox 	}
492271f0f12SAlan Cox #endif
493d866a563SAlan Cox #ifdef	VM_FREELIST_DMA32
494d866a563SAlan Cox 	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
495d866a563SAlan Cox 		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
496d866a563SAlan Cox 		paddr = VM_DMA32_BOUNDARY;
497d866a563SAlan Cox 	}
498d866a563SAlan Cox #endif
499d866a563SAlan Cox 	vm_phys_create_seg(paddr, end);
500271f0f12SAlan Cox }
501271f0f12SAlan Cox 
502271f0f12SAlan Cox /*
50311752d88SAlan Cox  * Initialize the physical memory allocator.
504d866a563SAlan Cox  *
505d866a563SAlan Cox  * Requires that vm_page_array is initialized!
50611752d88SAlan Cox  */
50711752d88SAlan Cox void
50811752d88SAlan Cox vm_phys_init(void)
50911752d88SAlan Cox {
51011752d88SAlan Cox 	struct vm_freelist *fl;
51172aebdd7SAlan Cox 	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
51252526922SJohn Baldwin #if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
513d866a563SAlan Cox 	u_long npages;
51452526922SJohn Baldwin #endif
515d866a563SAlan Cox 	int dom, flind, freelist, oind, pind, segind;
51611752d88SAlan Cox 
517d866a563SAlan Cox 	/*
518d866a563SAlan Cox 	 * Compute the number of free lists, and generate the mapping from the
519d866a563SAlan Cox 	 * manifest constants VM_FREELIST_* to the free list indices.
520d866a563SAlan Cox 	 *
521d866a563SAlan Cox 	 * Initially, the entries of vm_freelist_to_flind[] are set to either
522d866a563SAlan Cox 	 * 0 or 1 to indicate which free lists should be created.
523d866a563SAlan Cox 	 */
52452526922SJohn Baldwin #ifdef	VM_DMA32_NPAGES_THRESHOLD
525d866a563SAlan Cox 	npages = 0;
52652526922SJohn Baldwin #endif
527d866a563SAlan Cox 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
528d866a563SAlan Cox 		seg = &vm_phys_segs[segind];
529d866a563SAlan Cox #ifdef	VM_FREELIST_LOWMEM
530d866a563SAlan Cox 		if (seg->end <= VM_LOWMEM_BOUNDARY)
531d866a563SAlan Cox 			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
532d866a563SAlan Cox 		else
533d866a563SAlan Cox #endif
534d866a563SAlan Cox #ifdef	VM_FREELIST_DMA32
535d866a563SAlan Cox 		if (
536d866a563SAlan Cox #ifdef	VM_DMA32_NPAGES_THRESHOLD
537d866a563SAlan Cox 		    /*
538d866a563SAlan Cox 		     * Create the DMA32 free list only if the amount of
539d866a563SAlan Cox 		     * physical memory above physical address 4G exceeds the
540d866a563SAlan Cox 		     * given threshold.
541d866a563SAlan Cox 		     */
542d866a563SAlan Cox 		    npages > VM_DMA32_NPAGES_THRESHOLD &&
543d866a563SAlan Cox #endif
544d866a563SAlan Cox 		    seg->end <= VM_DMA32_BOUNDARY)
545d866a563SAlan Cox 			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
546d866a563SAlan Cox 		else
547d866a563SAlan Cox #endif
548d866a563SAlan Cox 		{
54952526922SJohn Baldwin #ifdef	VM_DMA32_NPAGES_THRESHOLD
550d866a563SAlan Cox 			npages += atop(seg->end - seg->start);
55152526922SJohn Baldwin #endif
552d866a563SAlan Cox 			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
553d866a563SAlan Cox 		}
554d866a563SAlan Cox 	}
555d866a563SAlan Cox 	/* Change each entry into a running total of the free lists. */
556d866a563SAlan Cox 	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
557d866a563SAlan Cox 		vm_freelist_to_flind[freelist] +=
558d866a563SAlan Cox 		    vm_freelist_to_flind[freelist - 1];
559d866a563SAlan Cox 	}
560d866a563SAlan Cox 	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
561d866a563SAlan Cox 	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
562d866a563SAlan Cox 	/* Change each entry into a free list index. */
563d866a563SAlan Cox 	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
564d866a563SAlan Cox 		vm_freelist_to_flind[freelist]--;
565d866a563SAlan Cox 
566d866a563SAlan Cox 	/*
567d866a563SAlan Cox 	 * Initialize the first_page and free_queues fields of each physical
568d866a563SAlan Cox 	 * memory segment.
569d866a563SAlan Cox 	 */
570271f0f12SAlan Cox #ifdef VM_PHYSSEG_SPARSE
571d866a563SAlan Cox 	npages = 0;
57211752d88SAlan Cox #endif
573271f0f12SAlan Cox 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
574271f0f12SAlan Cox 		seg = &vm_phys_segs[segind];
575271f0f12SAlan Cox #ifdef VM_PHYSSEG_SPARSE
576d866a563SAlan Cox 		seg->first_page = &vm_page_array[npages];
577d866a563SAlan Cox 		npages += atop(seg->end - seg->start);
578271f0f12SAlan Cox #else
579271f0f12SAlan Cox 		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
58011752d88SAlan Cox #endif
581d866a563SAlan Cox #ifdef	VM_FREELIST_LOWMEM
582d866a563SAlan Cox 		if (seg->end <= VM_LOWMEM_BOUNDARY) {
583d866a563SAlan Cox 			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
584d866a563SAlan Cox 			KASSERT(flind >= 0,
585d866a563SAlan Cox 			    ("vm_phys_init: LOWMEM flind < 0"));
586d866a563SAlan Cox 		} else
587d866a563SAlan Cox #endif
588d866a563SAlan Cox #ifdef	VM_FREELIST_DMA32
589d866a563SAlan Cox 		if (seg->end <= VM_DMA32_BOUNDARY) {
590d866a563SAlan Cox 			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
591d866a563SAlan Cox 			KASSERT(flind >= 0,
592d866a563SAlan Cox 			    ("vm_phys_init: DMA32 flind < 0"));
593d866a563SAlan Cox 		} else
594d866a563SAlan Cox #endif
595d866a563SAlan Cox 		{
596d866a563SAlan Cox 			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
597d866a563SAlan Cox 			KASSERT(flind >= 0,
598d866a563SAlan Cox 			    ("vm_phys_init: DEFAULT flind < 0"));
59911752d88SAlan Cox 		}
600d866a563SAlan Cox 		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
601d866a563SAlan Cox 	}
602d866a563SAlan Cox 
603d866a563SAlan Cox 	/*
60472aebdd7SAlan Cox 	 * Coalesce physical memory segments that are contiguous and share the
60572aebdd7SAlan Cox 	 * same per-domain free queues.
60672aebdd7SAlan Cox 	 */
60772aebdd7SAlan Cox 	prev_seg = vm_phys_segs;
60872aebdd7SAlan Cox 	seg = &vm_phys_segs[1];
60972aebdd7SAlan Cox 	end_seg = &vm_phys_segs[vm_phys_nsegs];
61072aebdd7SAlan Cox 	while (seg < end_seg) {
61172aebdd7SAlan Cox 		if (prev_seg->end == seg->start &&
61272aebdd7SAlan Cox 		    prev_seg->free_queues == seg->free_queues) {
61372aebdd7SAlan Cox 			prev_seg->end = seg->end;
61472aebdd7SAlan Cox 			KASSERT(prev_seg->domain == seg->domain,
61572aebdd7SAlan Cox 			    ("vm_phys_init: free queues cannot span domains"));
61672aebdd7SAlan Cox 			vm_phys_nsegs--;
61772aebdd7SAlan Cox 			end_seg--;
61872aebdd7SAlan Cox 			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
61972aebdd7SAlan Cox 				*tmp_seg = *(tmp_seg + 1);
62072aebdd7SAlan Cox 		} else {
62172aebdd7SAlan Cox 			prev_seg = seg;
62272aebdd7SAlan Cox 			seg++;
62372aebdd7SAlan Cox 		}
62472aebdd7SAlan Cox 	}
62572aebdd7SAlan Cox 
62672aebdd7SAlan Cox 	/*
627d866a563SAlan Cox 	 * Initialize the free queues.
628d866a563SAlan Cox 	 */
6297e226537SAttilio Rao 	for (dom = 0; dom < vm_ndomains; dom++) {
63011752d88SAlan Cox 		for (flind = 0; flind < vm_nfreelists; flind++) {
63111752d88SAlan Cox 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
6327e226537SAttilio Rao 				fl = vm_phys_free_queues[dom][flind][pind];
63311752d88SAlan Cox 				for (oind = 0; oind < VM_NFREEORDER; oind++)
63411752d88SAlan Cox 					TAILQ_INIT(&fl[oind].pl);
63511752d88SAlan Cox 			}
63611752d88SAlan Cox 		}
637a3870a18SJohn Baldwin 	}
638d866a563SAlan Cox 
639b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
640b16b4c22SMark Johnston 	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
641b16b4c22SMark Johnston #else
642b16b4c22SMark Johnston 	vm_default_freepool = VM_FREEPOOL_DEFAULT;
643b16b4c22SMark Johnston #endif
644b16b4c22SMark Johnston 
64538d6b2dcSRoger Pau Monné 	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
64611752d88SAlan Cox }
64711752d88SAlan Cox 
64811752d88SAlan Cox /*
649662e7fa8SMark Johnston  * Register info about the NUMA topology of the system.
650662e7fa8SMark Johnston  *
651662e7fa8SMark Johnston  * Invoked by platform-dependent code prior to vm_phys_init().
652662e7fa8SMark Johnston  */
653662e7fa8SMark Johnston void
654cb20a74cSStephen J. Kiernan vm_phys_register_domains(int ndomains __numa_used,
655cb20a74cSStephen J. Kiernan     struct mem_affinity *affinity __numa_used, int *locality __numa_used)
656662e7fa8SMark Johnston {
657662e7fa8SMark Johnston #ifdef NUMA
658c415cfc8SZhenlei Huang 	int i;
659662e7fa8SMark Johnston 
660b61f3142SMark Johnston 	/*
661b61f3142SMark Johnston 	 * For now the only override value that we support is 1, which
662b61f3142SMark Johnston 	 * effectively disables NUMA-awareness in the allocators.
663b61f3142SMark Johnston 	 */
664c415cfc8SZhenlei Huang 	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
665c415cfc8SZhenlei Huang 	if (numa_disabled)
666b61f3142SMark Johnston 		ndomains = 1;
667b61f3142SMark Johnston 
668b61f3142SMark Johnston 	if (ndomains > 1) {
669662e7fa8SMark Johnston 		vm_ndomains = ndomains;
670662e7fa8SMark Johnston 		mem_affinity = affinity;
671662e7fa8SMark Johnston 		mem_locality = locality;
672b61f3142SMark Johnston 	}
673662e7fa8SMark Johnston 
674662e7fa8SMark Johnston 	for (i = 0; i < vm_ndomains; i++)
675662e7fa8SMark Johnston 		DOMAINSET_SET(i, &all_domains);
676662e7fa8SMark Johnston #endif
677662e7fa8SMark Johnston }
678662e7fa8SMark Johnston 
679662e7fa8SMark Johnston /*
68011752d88SAlan Cox  * Split a contiguous, power of two-sized set of physical pages.
681370a338aSAlan Cox  *
682370a338aSAlan Cox  * When this function is called by a page allocation function, the caller
683370a338aSAlan Cox  * should request insertion at the head unless the order [order, oind) queues
684370a338aSAlan Cox  * are known to be empty.  The objective is to reduce the likelihood of
685370a338aSAlan Cox  * long-term fragmentation by promoting contemporaneous allocation and
686370a338aSAlan Cox  * (hopefully) deallocation.
68711752d88SAlan Cox  */
68811752d88SAlan Cox static __inline void
689370a338aSAlan Cox vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
690370a338aSAlan Cox     int tail)
69111752d88SAlan Cox {
69211752d88SAlan Cox 	vm_page_t m_buddy;
69311752d88SAlan Cox 
69411752d88SAlan Cox 	while (oind > order) {
69511752d88SAlan Cox 		oind--;
69611752d88SAlan Cox 		m_buddy = &m[1 << oind];
69711752d88SAlan Cox 		KASSERT(m_buddy->order == VM_NFREEORDER,
69811752d88SAlan Cox 		    ("vm_phys_split_pages: page %p has unexpected order %d",
69911752d88SAlan Cox 		    m_buddy, m_buddy->order));
700370a338aSAlan Cox 		vm_freelist_add(fl, m_buddy, oind, tail);
70111752d88SAlan Cox         }
70211752d88SAlan Cox }
70311752d88SAlan Cox 
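/*
 * Add the 2^order pages starting at "m" to the free list "fl".  If "m"
 * belongs to the lazy-init pool, also initialize the vm_page structure
 * immediately following the chunk, so that the remainder of the lazily
 * initialized run still begins with an initialized page.
 */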
704d7ec4a88SMark Johnston static void
705d7ec4a88SMark Johnston vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int tail)
706d7ec4a88SMark Johnston {
707d7ec4a88SMark Johnston 	KASSERT(order >= 0 && order < VM_NFREEORDER,
708d7ec4a88SMark Johnston 	    ("%s: invalid order %d", __func__, order));
709d7ec4a88SMark Johnston 
710d7ec4a88SMark Johnston 	vm_freelist_add(fl, m, order, tail);
711b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
712b16b4c22SMark Johnston 	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
713b16b4c22SMark Johnston 		vm_page_t m_next;
714b16b4c22SMark Johnston 		int npages;
715b16b4c22SMark Johnston 
716b16b4c22SMark Johnston 		npages = 1 << order;
717b16b4c22SMark Johnston 		m_next = m + npages;
718b16b4c22SMark Johnston 		vm_page_init_page(m_next, m->phys_addr + ptoa(npages), m->segind,
719b16b4c22SMark Johnston 		    VM_FREEPOOL_LAZYINIT);
720b16b4c22SMark Johnston 	}
721b16b4c22SMark Johnston #endif
722d7ec4a88SMark Johnston }
723d7ec4a88SMark Johnston 
72411752d88SAlan Cox /*
725e77f4e7fSDoug Moore  * Add the physical pages [m, m + npages) at the beginning of a power-of-two
726e77f4e7fSDoug Moore  * aligned and sized set to the specified free list.
727e77f4e7fSDoug Moore  *
728e77f4e7fSDoug Moore  * When this function is called by a page allocation function, the caller
729e77f4e7fSDoug Moore  * should request insertion at the head unless the lower-order queues are
730e77f4e7fSDoug Moore  * known to be empty.  The objective is to reduce the likelihood of long-
731e77f4e7fSDoug Moore  * term fragmentation by promoting contemporaneous allocation and (hopefully)
732e77f4e7fSDoug Moore  * deallocation.
733e77f4e7fSDoug Moore  *
734e77f4e7fSDoug Moore  * The physical page m's buddy must not be free.
735e77f4e7fSDoug Moore  */
736e77f4e7fSDoug Moore static void
737e3537f92SDoug Moore vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
738e77f4e7fSDoug Moore {
739e77f4e7fSDoug Moore         int order;
740e77f4e7fSDoug Moore 
741e77f4e7fSDoug Moore 	KASSERT(npages == 0 ||
742e77f4e7fSDoug Moore 	    (VM_PAGE_TO_PHYS(m) &
743543d55d7SDoug Moore 	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
744e77f4e7fSDoug Moore 	    ("%s: page %p and npages %u are misaligned",
745e77f4e7fSDoug Moore 	    __func__, m, npages));
746e77f4e7fSDoug Moore         while (npages > 0) {
747e77f4e7fSDoug Moore 		KASSERT(m->order == VM_NFREEORDER,
748e77f4e7fSDoug Moore 		    ("%s: page %p has unexpected order %d",
749e77f4e7fSDoug Moore 		    __func__, m, m->order));
750543d55d7SDoug Moore 		order = ilog2(npages);
751e77f4e7fSDoug Moore 		KASSERT(order < VM_NFREEORDER,
752e77f4e7fSDoug Moore 		    ("%s: order %d is out of range", __func__, order));
753d7ec4a88SMark Johnston 		vm_phys_enq_chunk(fl, m, order, tail);
754e77f4e7fSDoug Moore 		m += 1 << order;
755e77f4e7fSDoug Moore 		npages -= 1 << order;
756e77f4e7fSDoug Moore 	}
757e77f4e7fSDoug Moore }
758e77f4e7fSDoug Moore 
759e77f4e7fSDoug Moore /*
7607493904eSAlan Cox  * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
7617493904eSAlan Cox  * and sized set to the specified free list.
7627493904eSAlan Cox  *
7637493904eSAlan Cox  * When this function is called by a page allocation function, the caller
7647493904eSAlan Cox  * should request insertion at the head unless the lower-order queues are
7657493904eSAlan Cox  * known to be empty.  The objective is to reduce the likelihood of long-
7667493904eSAlan Cox  * term fragmentation by promoting contemporaneous allocation and (hopefully)
7677493904eSAlan Cox  * deallocation.
7687493904eSAlan Cox  *
769ccdb2827SDoug Moore  * If npages is zero, this function does nothing and ignores the physical page
770ccdb2827SDoug Moore  * parameter m.  Otherwise, the physical page m's buddy must not be free.
7717493904eSAlan Cox  */
772c9b06fa5SDoug Moore static vm_page_t
773e3537f92SDoug Moore vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
7747493904eSAlan Cox {
7757493904eSAlan Cox 	int order;
7767493904eSAlan Cox 
777ccdb2827SDoug Moore 	KASSERT(npages == 0 ||
778ccdb2827SDoug Moore 	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
779543d55d7SDoug Moore 	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
7807493904eSAlan Cox 	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
7817493904eSAlan Cox 	    m, npages));
782c9b06fa5SDoug Moore 	while (npages > 0) {
7837493904eSAlan Cox 		KASSERT(m->order == VM_NFREEORDER,
7847493904eSAlan Cox 		    ("vm_phys_enq_range: page %p has unexpected order %d",
7857493904eSAlan Cox 		    m, m->order));
7867493904eSAlan Cox 		order = ffs(npages) - 1;
787d7ec4a88SMark Johnston 		vm_phys_enq_chunk(fl, m, order, tail);
788c9b06fa5SDoug Moore 		m += 1 << order;
789c9b06fa5SDoug Moore 		npages -= 1 << order;
790c9b06fa5SDoug Moore 	}
791c9b06fa5SDoug Moore 	return (m);
7927493904eSAlan Cox }
7937493904eSAlan Cox 
7947493904eSAlan Cox /*
795e3537f92SDoug Moore  * Set the pool for a contiguous, power of two-sized set of physical pages.
796b16b4c22SMark Johnston  *
797b16b4c22SMark Johnston  * If the pages currently belong to the lazy init pool, then the corresponding
798b16b4c22SMark Johnston  * page structures must be initialized.  In this case it is assumed that the
799b16b4c22SMark Johnston  * first page in the run has already been initialized.
800e3537f92SDoug Moore  */
801e3537f92SDoug Moore static void
802e3537f92SDoug Moore vm_phys_set_pool(int pool, vm_page_t m, int order)
803e3537f92SDoug Moore {
804b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
805b16b4c22SMark Johnston 	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
806b16b4c22SMark Johnston 		vm_paddr_t pa;
807b16b4c22SMark Johnston 		int segind;
808e3537f92SDoug Moore 
809b16b4c22SMark Johnston 		m->pool = pool;
810b16b4c22SMark Johnston 
811b16b4c22SMark Johnston 		TSENTER();
812b16b4c22SMark Johnston 		pa = m->phys_addr + PAGE_SIZE;
813b16b4c22SMark Johnston 		segind = m->segind;
814b16b4c22SMark Johnston 		for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
815b16b4c22SMark Johnston 		    m_tmp++, pa += PAGE_SIZE)
816b16b4c22SMark Johnston 			vm_page_init_page(m_tmp, pa, segind, pool);
817b16b4c22SMark Johnston 		TSEXIT();
818b16b4c22SMark Johnston 	} else
819b16b4c22SMark Johnston #endif
820b16b4c22SMark Johnston 		for (vm_page_t m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
821e3537f92SDoug Moore 			m_tmp->pool = pool;
822e3537f92SDoug Moore }
823e3537f92SDoug Moore 
824e3537f92SDoug Moore /*
82589ea39a7SAlan Cox  * Tries to allocate the specified number of pages from the specified pool
82689ea39a7SAlan Cox  * within the specified domain.  Returns the actual number of allocated pages
82789ea39a7SAlan Cox  * and a pointer to each page through the array ma[].
82889ea39a7SAlan Cox  *
82932d81f21SAlan Cox  * The returned pages may not be physically contiguous.  However, in contrast
83032d81f21SAlan Cox  * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
83132d81f21SAlan Cox  * calling this function once to allocate the desired number of pages will
832e3537f92SDoug Moore  * avoid wasted time in vm_phys_split_pages().
83389ea39a7SAlan Cox  *
83489ea39a7SAlan Cox  * The free page queues for the specified domain must be locked.
83589ea39a7SAlan Cox  */
83689ea39a7SAlan Cox int
83789ea39a7SAlan Cox vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
83889ea39a7SAlan Cox {
83989ea39a7SAlan Cox 	struct vm_freelist *alt, *fl;
84089ea39a7SAlan Cox 	vm_page_t m;
841c9b06fa5SDoug Moore 	int avail, end, flind, freelist, i, oind, pind;
84289ea39a7SAlan Cox 
84389ea39a7SAlan Cox 	KASSERT(domain >= 0 && domain < vm_ndomains,
84489ea39a7SAlan Cox 	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
845b16b4c22SMark Johnston 	KASSERT(vm_phys_pool_valid(pool),
84689ea39a7SAlan Cox 	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
84789ea39a7SAlan Cox 	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
84889ea39a7SAlan Cox 	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
84989ea39a7SAlan Cox 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
85089ea39a7SAlan Cox 	i = 0;
85189ea39a7SAlan Cox 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
85289ea39a7SAlan Cox 		flind = vm_freelist_to_flind[freelist];
85389ea39a7SAlan Cox 		if (flind < 0)
85489ea39a7SAlan Cox 			continue;
85589ea39a7SAlan Cox 		fl = vm_phys_free_queues[domain][flind][pool];
85689ea39a7SAlan Cox 		for (oind = 0; oind < VM_NFREEORDER; oind++) {
85789ea39a7SAlan Cox 			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
85889ea39a7SAlan Cox 				vm_freelist_rem(fl, m, oind);
859c9b06fa5SDoug Moore 				avail = i + (1 << oind);
860c9b06fa5SDoug Moore 				end = imin(npages, avail);
861e3537f92SDoug Moore 				while (i < end)
86289ea39a7SAlan Cox 					ma[i++] = m++;
863c9b06fa5SDoug Moore 				if (i == npages) {
8647493904eSAlan Cox 					/*
865c9b06fa5SDoug Moore 					 * Return excess pages to fl.  Its order
866c9b06fa5SDoug Moore 					 * [0, oind) queues are empty.
8677493904eSAlan Cox 					 */
868e3537f92SDoug Moore 					vm_phys_enq_range(m, avail - i, fl, 1);
86989ea39a7SAlan Cox 					return (npages);
870c9b06fa5SDoug Moore 				}
87189ea39a7SAlan Cox 			}
87289ea39a7SAlan Cox 		}
87389ea39a7SAlan Cox 		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
874b16b4c22SMark Johnston 			for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
875b16b4c22SMark Johnston 			    pind++) {
87689ea39a7SAlan Cox 				alt = vm_phys_free_queues[domain][flind][pind];
87789ea39a7SAlan Cox 				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
87889ea39a7SAlan Cox 				    NULL) {
87989ea39a7SAlan Cox 					vm_freelist_rem(alt, m, oind);
880e3537f92SDoug Moore 					vm_phys_set_pool(pool, m, oind);
881c9b06fa5SDoug Moore 					avail = i + (1 << oind);
882c9b06fa5SDoug Moore 					end = imin(npages, avail);
883e3537f92SDoug Moore 					while (i < end)
88489ea39a7SAlan Cox 						ma[i++] = m++;
885c9b06fa5SDoug Moore 					if (i == npages) {
8867493904eSAlan Cox 						/*
8877493904eSAlan Cox 						 * Return excess pages to fl.
8887493904eSAlan Cox 						 * Its order [0, oind) queues
8897493904eSAlan Cox 						 * are empty.
8907493904eSAlan Cox 						 */
891c9b06fa5SDoug Moore 						vm_phys_enq_range(m, avail - i,
892e3537f92SDoug Moore 						    fl, 1);
89389ea39a7SAlan Cox 						return (npages);
894c9b06fa5SDoug Moore 					}
89589ea39a7SAlan Cox 				}
89689ea39a7SAlan Cox 			}
89789ea39a7SAlan Cox 		}
89889ea39a7SAlan Cox 	}
89989ea39a7SAlan Cox 	return (i);
90089ea39a7SAlan Cox }
90189ea39a7SAlan Cox 
90289ea39a7SAlan Cox /*
90311752d88SAlan Cox  * Allocate a contiguous, power of two-sized set of physical pages
904e3537f92SDoug Moore  * from the free lists.
9058941dc44SAlan Cox  *
9068941dc44SAlan Cox  * The free page queues must be locked.
90711752d88SAlan Cox  */
90811752d88SAlan Cox vm_page_t
909ef435ae7SJeff Roberson vm_phys_alloc_pages(int domain, int pool, int order)
91011752d88SAlan Cox {
91149ca10d4SJayachandran C. 	vm_page_t m;
9120db2102aSMichael Zhilin 	int freelist;
91349ca10d4SJayachandran C. 
9140db2102aSMichael Zhilin 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
9150db2102aSMichael Zhilin 		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
91649ca10d4SJayachandran C. 		if (m != NULL)
91749ca10d4SJayachandran C. 			return (m);
91849ca10d4SJayachandran C. 	}
91949ca10d4SJayachandran C. 	return (NULL);
92049ca10d4SJayachandran C. }
92149ca10d4SJayachandran C. 
92249ca10d4SJayachandran C. /*
923d866a563SAlan Cox  * Allocate a contiguous, power of two-sized set of physical pages from the
924d866a563SAlan Cox  * specified free list.  The free list must be specified using one of the
925e3537f92SDoug Moore  * manifest constants VM_FREELIST_*.
926d866a563SAlan Cox  *
927d866a563SAlan Cox  * The free page queues must be locked.
92849ca10d4SJayachandran C.  */
92949ca10d4SJayachandran C. vm_page_t
9300db2102aSMichael Zhilin vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
93149ca10d4SJayachandran C. {
932ef435ae7SJeff Roberson 	struct vm_freelist *alt, *fl;
93311752d88SAlan Cox 	vm_page_t m;
9340db2102aSMichael Zhilin 	int oind, pind, flind;
93511752d88SAlan Cox 
936ef435ae7SJeff Roberson 	KASSERT(domain >= 0 && domain < vm_ndomains,
937ef435ae7SJeff Roberson 	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
938ef435ae7SJeff Roberson 	    domain));
9390db2102aSMichael Zhilin 	KASSERT(freelist < VM_NFREELIST,
940d866a563SAlan Cox 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
9415be93778SAndrew Turner 	    freelist));
942b16b4c22SMark Johnston 	KASSERT(vm_phys_pool_valid(pool),
94349ca10d4SJayachandran C. 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
94411752d88SAlan Cox 	KASSERT(order < VM_NFREEORDER,
94549ca10d4SJayachandran C. 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
9466520495aSAdrian Chadd 
9470db2102aSMichael Zhilin 	flind = vm_freelist_to_flind[freelist];
9480db2102aSMichael Zhilin 	/* Check if freelist is present */
9490db2102aSMichael Zhilin 	if (flind < 0)
9500db2102aSMichael Zhilin 		return (NULL);
9510db2102aSMichael Zhilin 
952e2068d0bSJeff Roberson 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
9537e226537SAttilio Rao 	fl = &vm_phys_free_queues[domain][flind][pool][0];
95411752d88SAlan Cox 	for (oind = order; oind < VM_NFREEORDER; oind++) {
95511752d88SAlan Cox 		m = TAILQ_FIRST(&fl[oind].pl);
95611752d88SAlan Cox 		if (m != NULL) {
9577e226537SAttilio Rao 			vm_freelist_rem(fl, m, oind);
958370a338aSAlan Cox 			/* The order [order, oind) queues are empty. */
959370a338aSAlan Cox 			vm_phys_split_pages(m, oind, fl, order, 1);
96011752d88SAlan Cox 			return (m);
96111752d88SAlan Cox 		}
96211752d88SAlan Cox 	}
96311752d88SAlan Cox 
96411752d88SAlan Cox 	/*
96511752d88SAlan Cox 	 * The given pool was empty.  Find the largest
96611752d88SAlan Cox 	 * contiguous, power-of-two-sized set of pages in any
96711752d88SAlan Cox 	 * pool.  Transfer these pages to the given pool, and
96811752d88SAlan Cox 	 * use them to satisfy the allocation.
96911752d88SAlan Cox 	 */
97011752d88SAlan Cox 	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
971b16b4c22SMark Johnston 		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
9727e226537SAttilio Rao 			alt = &vm_phys_free_queues[domain][flind][pind][0];
97311752d88SAlan Cox 			m = TAILQ_FIRST(&alt[oind].pl);
97411752d88SAlan Cox 			if (m != NULL) {
9757e226537SAttilio Rao 				vm_freelist_rem(alt, m, oind);
976e3537f92SDoug Moore 				vm_phys_set_pool(pool, m, oind);
977370a338aSAlan Cox 				/* The order [order, oind) queues are empty. */
978370a338aSAlan Cox 				vm_phys_split_pages(m, oind, fl, order, 1);
97911752d88SAlan Cox 				return (m);
98011752d88SAlan Cox 			}
98111752d88SAlan Cox 		}
98211752d88SAlan Cox 	}
98311752d88SAlan Cox 	return (NULL);
98411752d88SAlan Cox }
98511752d88SAlan Cox 
98611752d88SAlan Cox /*
98769cbb187SMark Johnston  * Find the vm_page corresponding to the given physical address, which must lie
98869cbb187SMark Johnston  * within the given physical memory segment.
98969cbb187SMark Johnston  */
99069cbb187SMark Johnston vm_page_t
99169cbb187SMark Johnston vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
99269cbb187SMark Johnston {
99369cbb187SMark Johnston 	KASSERT(pa >= seg->start && pa < seg->end,
99469cbb187SMark Johnston 	    ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));
99569cbb187SMark Johnston 
99669cbb187SMark Johnston 	return (&seg->first_page[atop(pa - seg->start)]);
99769cbb187SMark Johnston }
99869cbb187SMark Johnston 
99969cbb187SMark Johnston /*
100011752d88SAlan Cox  * Find the vm_page corresponding to the given physical address.
100111752d88SAlan Cox  */
100211752d88SAlan Cox vm_page_t
100311752d88SAlan Cox vm_phys_paddr_to_vm_page(vm_paddr_t pa)
100411752d88SAlan Cox {
100511752d88SAlan Cox 	struct vm_phys_seg *seg;
100611752d88SAlan Cox 
10079e817428SDoug Moore 	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
100869cbb187SMark Johnston 		return (vm_phys_seg_paddr_to_vm_page(seg, pa));
1009f06a3a36SAndrew Thompson 	return (NULL);
101011752d88SAlan Cox }
101111752d88SAlan Cox 
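/*
 * Find the fictitious vm_page corresponding to the given physical address by
 * searching the tree of registered fictitious ranges.  Return NULL if the
 * address is not covered by a registered range.
 */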
1012b6de32bdSKonstantin Belousov vm_page_t
1013b6de32bdSKonstantin Belousov vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
1014b6de32bdSKonstantin Belousov {
101538d6b2dcSRoger Pau Monné 	struct vm_phys_fictitious_seg tmp, *seg;
1016b6de32bdSKonstantin Belousov 	vm_page_t m;
1017b6de32bdSKonstantin Belousov 
1018b6de32bdSKonstantin Belousov 	m = NULL;
101938d6b2dcSRoger Pau Monné 	tmp.start = pa;
102038d6b2dcSRoger Pau Monné 	tmp.end = 0;
102138d6b2dcSRoger Pau Monné 
102238d6b2dcSRoger Pau Monné 	rw_rlock(&vm_phys_fictitious_reg_lock);
102338d6b2dcSRoger Pau Monné 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
102438d6b2dcSRoger Pau Monné 	rw_runlock(&vm_phys_fictitious_reg_lock);
102538d6b2dcSRoger Pau Monné 	if (seg == NULL)
102638d6b2dcSRoger Pau Monné 		return (NULL);
102738d6b2dcSRoger Pau Monné 
1028b6de32bdSKonstantin Belousov 	m = &seg->first_page[atop(pa - seg->start)];
102938d6b2dcSRoger Pau Monné 	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
103038d6b2dcSRoger Pau Monné 
1031b6de32bdSKonstantin Belousov 	return (m);
1032b6de32bdSKonstantin Belousov }
1033b6de32bdSKonstantin Belousov 
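/*
 * Initialize "page_count" fictitious page structures starting at "range" to
 * cover the physical addresses beginning at "start", using the given memory
 * attribute.
 */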
10345ebe728dSRoger Pau Monné static inline void
10355ebe728dSRoger Pau Monné vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
10365ebe728dSRoger Pau Monné     long page_count, vm_memattr_t memattr)
10375ebe728dSRoger Pau Monné {
10385ebe728dSRoger Pau Monné 	long i;
10395ebe728dSRoger Pau Monné 
1040f93f7cf1SMark Johnston 	bzero(range, page_count * sizeof(*range));
10415ebe728dSRoger Pau Monné 	for (i = 0; i < page_count; i++) {
10425ebe728dSRoger Pau Monné 		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
10435ebe728dSRoger Pau Monné 		range[i].oflags &= ~VPO_UNMANAGED;
10445ebe728dSRoger Pau Monné 		range[i].busy_lock = VPB_UNBUSIED;
10455ebe728dSRoger Pau Monné 	}
10465ebe728dSRoger Pau Monné }
10475ebe728dSRoger Pau Monné 
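/*
 * Register the physical address range [start, end) as fictitious memory with
 * the given memory attribute.  Where possible the page structures are taken
 * from vm_page_array; otherwise they are allocated, and the range is recorded
 * in the fictitious range tree.  Returns 0 on success or an errno value on
 * failure.
 */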
1048b6de32bdSKonstantin Belousov int
1049b6de32bdSKonstantin Belousov vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
1050b6de32bdSKonstantin Belousov     vm_memattr_t memattr)
1051b6de32bdSKonstantin Belousov {
1052b6de32bdSKonstantin Belousov 	struct vm_phys_fictitious_seg *seg;
1053b6de32bdSKonstantin Belousov 	vm_page_t fp;
10545ebe728dSRoger Pau Monné 	long page_count;
1055b6de32bdSKonstantin Belousov #ifdef VM_PHYSSEG_DENSE
10565ebe728dSRoger Pau Monné 	long pi, pe;
10575ebe728dSRoger Pau Monné 	long dpage_count;
1058b6de32bdSKonstantin Belousov #endif
1059b6de32bdSKonstantin Belousov 
10605ebe728dSRoger Pau Monné 	KASSERT(start < end,
10615ebe728dSRoger Pau Monné 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
10625ebe728dSRoger Pau Monné 	    (uintmax_t)start, (uintmax_t)end));
10635ebe728dSRoger Pau Monné 
1064b6de32bdSKonstantin Belousov 	page_count = (end - start) / PAGE_SIZE;
1065b6de32bdSKonstantin Belousov 
1066b6de32bdSKonstantin Belousov #ifdef VM_PHYSSEG_DENSE
1067b6de32bdSKonstantin Belousov 	pi = atop(start);
10685ebe728dSRoger Pau Monné 	pe = atop(end);
10695ebe728dSRoger Pau Monné 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1070b6de32bdSKonstantin Belousov 		fp = &vm_page_array[pi - first_page];
10715ebe728dSRoger Pau Monné 		if ((pe - first_page) > vm_page_array_size) {
10725ebe728dSRoger Pau Monné 			/*
10735ebe728dSRoger Pau Monné 			 * We have a segment that starts inside
10745ebe728dSRoger Pau Monné 			 * of vm_page_array, but ends outside of it.
10755ebe728dSRoger Pau Monné 			 *
10765ebe728dSRoger Pau Monné 			 * Use vm_page_array pages for those that are
10775ebe728dSRoger Pau Monné 			 * inside of the vm_page_array range, and
10785ebe728dSRoger Pau Monné 			 * allocate the remaining ones.
10795ebe728dSRoger Pau Monné 			 */
10805ebe728dSRoger Pau Monné 			dpage_count = vm_page_array_size - (pi - first_page);
10815ebe728dSRoger Pau Monné 			vm_phys_fictitious_init_range(fp, start, dpage_count,
10825ebe728dSRoger Pau Monné 			    memattr);
10835ebe728dSRoger Pau Monné 			page_count -= dpage_count;
10845ebe728dSRoger Pau Monné 			start += ptoa(dpage_count);
10855ebe728dSRoger Pau Monné 			goto alloc;
10865ebe728dSRoger Pau Monné 		}
10875ebe728dSRoger Pau Monné 		/*
10885ebe728dSRoger Pau Monné 		 * We can allocate the full range from vm_page_array,
10895ebe728dSRoger Pau Monné 		 * so there's no need to register the range in the tree.
10905ebe728dSRoger Pau Monné 		 */
10915ebe728dSRoger Pau Monné 		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
10925ebe728dSRoger Pau Monné 		return (0);
10935ebe728dSRoger Pau Monné 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
10945ebe728dSRoger Pau Monné 		/*
10955ebe728dSRoger Pau Monné 		 * We have a segment that ends inside of vm_page_array,
10965ebe728dSRoger Pau Monné 		 * but starts outside of it.
10975ebe728dSRoger Pau Monné 		 */
10985ebe728dSRoger Pau Monné 		fp = &vm_page_array[0];
10995ebe728dSRoger Pau Monné 		dpage_count = pe - first_page;
11005ebe728dSRoger Pau Monné 		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
11015ebe728dSRoger Pau Monné 		    memattr);
11025ebe728dSRoger Pau Monné 		end -= ptoa(dpage_count);
11035ebe728dSRoger Pau Monné 		page_count -= dpage_count;
11045ebe728dSRoger Pau Monné 		goto alloc;
11055ebe728dSRoger Pau Monné 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
11065ebe728dSRoger Pau Monné 		/*
11075ebe728dSRoger Pau Monné 		 * Trying to register a fictitious range that extends before
11085ebe728dSRoger Pau Monné 		 * and after vm_page_array.
11095ebe728dSRoger Pau Monné 		 */
11105ebe728dSRoger Pau Monné 		return (EINVAL);
11115ebe728dSRoger Pau Monné 	} else {
11125ebe728dSRoger Pau Monné alloc:
1113b6de32bdSKonstantin Belousov #endif
1114b6de32bdSKonstantin Belousov 		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
1115f93f7cf1SMark Johnston 		    M_WAITOK);
11165ebe728dSRoger Pau Monné #ifdef VM_PHYSSEG_DENSE
1117b6de32bdSKonstantin Belousov 	}
11185ebe728dSRoger Pau Monné #endif
11195ebe728dSRoger Pau Monné 	vm_phys_fictitious_init_range(fp, start, page_count, memattr);
112038d6b2dcSRoger Pau Monné 
112138d6b2dcSRoger Pau Monné 	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
1122b6de32bdSKonstantin Belousov 	seg->start = start;
1123b6de32bdSKonstantin Belousov 	seg->end = end;
1124b6de32bdSKonstantin Belousov 	seg->first_page = fp;
112538d6b2dcSRoger Pau Monné 
112638d6b2dcSRoger Pau Monné 	rw_wlock(&vm_phys_fictitious_reg_lock);
112738d6b2dcSRoger Pau Monné 	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
112838d6b2dcSRoger Pau Monné 	rw_wunlock(&vm_phys_fictitious_reg_lock);
112938d6b2dcSRoger Pau Monné 
1130b6de32bdSKonstantin Belousov 	return (0);
1131b6de32bdSKonstantin Belousov }
1132b6de32bdSKonstantin Belousov 
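/*
 * A minimal, userspace-only sketch of the classification performed by the
 * VM_PHYSSEG_DENSE cases above: a requested [start, end) range is compared
 * against the span covered by vm_page_array to decide whether its pages come
 * from vm_page_array, from malloc(), or from a mix of both.  The names below
 * are assumptions for illustration; the kernel code works on page indices
 * (atop()) rather than raw addresses.
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <stdio.h>

enum overlap { FULLY_INSIDE, STARTS_INSIDE, ENDS_INSIDE, COVERS, DISJOINT };

static enum overlap
classify(unsigned long start, unsigned long end, unsigned long arr_start,
    unsigned long arr_end)
{
	if (start >= arr_start && end <= arr_end)
		return (FULLY_INSIDE);	/* vm_page_array pages only */
	if (start >= arr_start && start < arr_end)
		return (STARTS_INSIDE);	/* array pages first, then malloc */
	if (end > arr_start && end <= arr_end)
		return (ENDS_INSIDE);	/* malloc first, then array pages */
	if (start < arr_start && end > arr_end)
		return (COVERS);	/* rejected with EINVAL above */
	return (DISJOINT);		/* plain malloc'ed pages */
}

int
main(void)
{
	/* Assumed vm_page_array span [0x1000, 0x9000) for the demo. */
	printf("%d\n", classify(0x2000, 0x4000, 0x1000, 0x9000));
	return (0);
}
#endif
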
1133b6de32bdSKonstantin Belousov void
1134b6de32bdSKonstantin Belousov vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
1135b6de32bdSKonstantin Belousov {
113638d6b2dcSRoger Pau Monné 	struct vm_phys_fictitious_seg *seg, tmp;
1137b6de32bdSKonstantin Belousov #ifdef VM_PHYSSEG_DENSE
11385ebe728dSRoger Pau Monné 	long pi, pe;
1139b6de32bdSKonstantin Belousov #endif
1140b6de32bdSKonstantin Belousov 
11415ebe728dSRoger Pau Monné 	KASSERT(start < end,
11425ebe728dSRoger Pau Monné 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
11435ebe728dSRoger Pau Monné 	    (uintmax_t)start, (uintmax_t)end));
11445ebe728dSRoger Pau Monné 
1145b6de32bdSKonstantin Belousov #ifdef VM_PHYSSEG_DENSE
1146b6de32bdSKonstantin Belousov 	pi = atop(start);
11475ebe728dSRoger Pau Monné 	pe = atop(end);
11485ebe728dSRoger Pau Monné 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
11495ebe728dSRoger Pau Monné 		if ((pe - first_page) <= vm_page_array_size) {
11505ebe728dSRoger Pau Monné 			/*
11515ebe728dSRoger Pau Monné 			 * This segment was allocated using vm_page_array
11525ebe728dSRoger Pau Monné 			 * only; there's nothing to do, since those pages
11535ebe728dSRoger Pau Monné 			 * were never added to the tree.
11545ebe728dSRoger Pau Monné 			 */
11555ebe728dSRoger Pau Monné 			return;
11565ebe728dSRoger Pau Monné 		}
11575ebe728dSRoger Pau Monné 		/*
11585ebe728dSRoger Pau Monné 		 * We have a segment that starts inside
11595ebe728dSRoger Pau Monné 		 * of vm_page_array, but ends outside of it.
11605ebe728dSRoger Pau Monné 		 *
11615ebe728dSRoger Pau Monné 		 * Calculate how many pages were added to the
11625ebe728dSRoger Pau Monné 		 * tree and free them.
11635ebe728dSRoger Pau Monné 		 */
11645ebe728dSRoger Pau Monné 		start = ptoa(first_page + vm_page_array_size);
11655ebe728dSRoger Pau Monné 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
11665ebe728dSRoger Pau Monné 		/*
11675ebe728dSRoger Pau Monné 		 * We have a segment that ends inside of vm_page_array,
11685ebe728dSRoger Pau Monné 		 * but starts outside of it.
11695ebe728dSRoger Pau Monné 		 */
11705ebe728dSRoger Pau Monné 		end = ptoa(first_page);
11715ebe728dSRoger Pau Monné 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
11725ebe728dSRoger Pau Monné 		/* Since it's not possible to register such a range, panic. */
11735ebe728dSRoger Pau Monné 		panic(
11745ebe728dSRoger Pau Monné 		    "Unregistering an unregistered fictitious range [%#jx:%#jx]",
11755ebe728dSRoger Pau Monné 		    (uintmax_t)start, (uintmax_t)end);
11765ebe728dSRoger Pau Monné 	}
1177b6de32bdSKonstantin Belousov #endif
117838d6b2dcSRoger Pau Monné 	tmp.start = start;
117938d6b2dcSRoger Pau Monné 	tmp.end = 0;
1180b6de32bdSKonstantin Belousov 
118138d6b2dcSRoger Pau Monné 	rw_wlock(&vm_phys_fictitious_reg_lock);
118238d6b2dcSRoger Pau Monné 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
118338d6b2dcSRoger Pau Monné 	if (seg == NULL || seg->start != start || seg->end != end) {
118438d6b2dcSRoger Pau Monné 		rw_wunlock(&vm_phys_fictitious_reg_lock);
118538d6b2dcSRoger Pau Monné 		panic(
118638d6b2dcSRoger Pau Monné 		    "Unregistering an unregistered fictitious range [%#jx:%#jx]",
118738d6b2dcSRoger Pau Monné 		    (uintmax_t)start, (uintmax_t)end);
118838d6b2dcSRoger Pau Monné 	}
118938d6b2dcSRoger Pau Monné 	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
119038d6b2dcSRoger Pau Monné 	rw_wunlock(&vm_phys_fictitious_reg_lock);
119138d6b2dcSRoger Pau Monné 	free(seg->first_page, M_FICT_PAGES);
119238d6b2dcSRoger Pau Monné 	free(seg, M_FICT_PAGES);
1193b6de32bdSKonstantin Belousov }
1194b6de32bdSKonstantin Belousov 
119511752d88SAlan Cox /*
1196e3537f92SDoug Moore  * Free a contiguous, power of two-sized set of physical pages.
11978941dc44SAlan Cox  *
11988941dc44SAlan Cox  * The free page queues must be locked.
119911752d88SAlan Cox  */
120011752d88SAlan Cox void
120111752d88SAlan Cox vm_phys_free_pages(vm_page_t m, int order)
120211752d88SAlan Cox {
120311752d88SAlan Cox 	struct vm_freelist *fl;
120411752d88SAlan Cox 	struct vm_phys_seg *seg;
12055c1f2cc4SAlan Cox 	vm_paddr_t pa;
120611752d88SAlan Cox 	vm_page_t m_buddy;
120711752d88SAlan Cox 
120811752d88SAlan Cox 	KASSERT(m->order == VM_NFREEORDER,
12093921068fSJeff Roberson 	    ("vm_phys_free_pages: page %p has unexpected order %d",
12103921068fSJeff Roberson 	    m, m->order));
1211b16b4c22SMark Johnston 	KASSERT(vm_phys_pool_valid(m->pool),
1212e3537f92SDoug Moore 	    ("vm_phys_free_pages: page %p has unexpected pool %d",
1213e3537f92SDoug Moore 	    m, m->pool));
121411752d88SAlan Cox 	KASSERT(order < VM_NFREEORDER,
12158941dc44SAlan Cox 	    ("vm_phys_free_pages: order %d is out of range", order));
121611752d88SAlan Cox 	seg = &vm_phys_segs[m->segind];
1217e2068d0bSJeff Roberson 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
12185c1f2cc4SAlan Cox 	if (order < VM_NFREEORDER - 1) {
12195c1f2cc4SAlan Cox 		pa = VM_PAGE_TO_PHYS(m);
12205c1f2cc4SAlan Cox 		do {
12215c1f2cc4SAlan Cox 			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
12225c1f2cc4SAlan Cox 			if (pa < seg->start || pa >= seg->end)
122311752d88SAlan Cox 				break;
122469cbb187SMark Johnston 			m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
122511752d88SAlan Cox 			if (m_buddy->order != order)
122611752d88SAlan Cox 				break;
122711752d88SAlan Cox 			fl = (*seg->free_queues)[m_buddy->pool];
12287e226537SAttilio Rao 			vm_freelist_rem(fl, m_buddy, order);
1229e3537f92SDoug Moore 			if (m_buddy->pool != m->pool)
1230e3537f92SDoug Moore 				vm_phys_set_pool(m->pool, m_buddy, order);
123111752d88SAlan Cox 			order++;
12325c1f2cc4SAlan Cox 			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
123369cbb187SMark Johnston 			m = vm_phys_seg_paddr_to_vm_page(seg, pa);
12345c1f2cc4SAlan Cox 		} while (order < VM_NFREEORDER - 1);
123511752d88SAlan Cox 	}
1236e3537f92SDoug Moore 	fl = (*seg->free_queues)[m->pool];
12377e226537SAttilio Rao 	vm_freelist_add(fl, m, order, 1);
123811752d88SAlan Cox }
123911752d88SAlan Cox 
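/*
 * A minimal, userspace-only sketch of the buddy arithmetic used above: the
 * buddy of a 2^order-page block is found by flipping bit (PAGE_SHIFT + order)
 * of its physical address, and the merged block starts at the address with
 * the low (PAGE_SHIFT + order + 1) bits cleared.  DEMO_PAGE_SHIFT and the
 * demo addresses are assumptions for illustration.
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <stdint.h>
#include <stdio.h>

#define	DEMO_PAGE_SHIFT	12	/* assumed 4 KB pages */

static uint64_t
buddy_of(uint64_t pa, int order)
{
	/* Flip the single bit that distinguishes a block from its buddy. */
	return (pa ^ ((uint64_t)1 << (DEMO_PAGE_SHIFT + order)));
}

static uint64_t
merged_start(uint64_t pa, int order)
{
	/* Clear the low bits so the result is aligned to the merged size. */
	return (pa & ~(((uint64_t)1 << (DEMO_PAGE_SHIFT + order + 1)) - 1));
}

int
main(void)
{
	uint64_t pa = 0x5000;	/* page 5: its order-0 buddy is page 4 */

	printf("buddy  %#jx\n", (uintmax_t)buddy_of(pa, 0));
	printf("merged %#jx\n", (uintmax_t)merged_start(pa, 0));
	return (0);
}
#endif
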
1240b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
1241b16b4c22SMark Johnston /*
1242b16b4c22SMark Johnston  * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
1243b16b4c22SMark Johnston  * them to the default pool.  This is a prerequisite for some rare operations
1244b16b4c22SMark Johnston  * which need to scan the page array and thus depend on all pages being
1245b16b4c22SMark Johnston  * initialized.
1246b16b4c22SMark Johnston  */
1247b16b4c22SMark Johnston static void
1248b16b4c22SMark Johnston vm_phys_lazy_init_domain(int domain, bool locked)
1249b16b4c22SMark Johnston {
1250b16b4c22SMark Johnston 	static bool initdone[MAXMEMDOM];
1251b16b4c22SMark Johnston 	struct vm_domain *vmd;
1252b16b4c22SMark Johnston 	struct vm_freelist *fl;
1253b16b4c22SMark Johnston 	vm_page_t m;
1254b16b4c22SMark Johnston 	int pind;
1255b16b4c22SMark Johnston 	bool unlocked;
1256b16b4c22SMark Johnston 
1257b16b4c22SMark Johnston 	if (__predict_true(atomic_load_bool(&initdone[domain])))
1258b16b4c22SMark Johnston 		return;
1259b16b4c22SMark Johnston 
1260b16b4c22SMark Johnston 	vmd = VM_DOMAIN(domain);
1261b16b4c22SMark Johnston 	if (locked)
1262b16b4c22SMark Johnston 		vm_domain_free_assert_locked(vmd);
1263b16b4c22SMark Johnston 	else
1264b16b4c22SMark Johnston 		vm_domain_free_lock(vmd);
1265b16b4c22SMark Johnston 	if (atomic_load_bool(&initdone[domain]))
1266b16b4c22SMark Johnston 		goto out;
1267b16b4c22SMark Johnston 	pind = VM_FREEPOOL_LAZYINIT;
1268b16b4c22SMark Johnston 	for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
1269b16b4c22SMark Johnston 		int flind;
1270b16b4c22SMark Johnston 
1271b16b4c22SMark Johnston 		flind = vm_freelist_to_flind[freelist];
1272b16b4c22SMark Johnston 		if (flind < 0)
1273b16b4c22SMark Johnston 			continue;
1274b16b4c22SMark Johnston 		fl = vm_phys_free_queues[domain][flind][pind];
1275b16b4c22SMark Johnston 		for (int oind = 0; oind < VM_NFREEORDER; oind++) {
1276b16b4c22SMark Johnston 			if (atomic_load_int(&fl[oind].lcnt) == 0)
1277b16b4c22SMark Johnston 				continue;
1278b16b4c22SMark Johnston 			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
1279b16b4c22SMark Johnston 				/*
1280b16b4c22SMark Johnston 				 * Avoid holding the lock across the
1281b16b4c22SMark Johnston 				 * initialization unless there's a free page
1282b16b4c22SMark Johnston 				 * shortage.
1283b16b4c22SMark Johnston 				 */
1284b16b4c22SMark Johnston 				vm_freelist_rem(fl, m, oind);
1285b16b4c22SMark Johnston 				unlocked = vm_domain_allocate(vmd,
1286b16b4c22SMark Johnston 				    VM_ALLOC_NORMAL, 1 << oind);
1287b16b4c22SMark Johnston 				if (unlocked)
1288b16b4c22SMark Johnston 					vm_domain_free_unlock(vmd);
1289b16b4c22SMark Johnston 				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
1290b16b4c22SMark Johnston 				if (unlocked) {
1291b16b4c22SMark Johnston 					vm_domain_freecnt_inc(vmd, 1 << oind);
1292b16b4c22SMark Johnston 					vm_domain_free_lock(vmd);
1293b16b4c22SMark Johnston 				}
1294b16b4c22SMark Johnston 				vm_phys_free_pages(m, oind);
1295b16b4c22SMark Johnston 			}
1296b16b4c22SMark Johnston 		}
1297b16b4c22SMark Johnston 	}
1298b16b4c22SMark Johnston 	atomic_store_bool(&initdone[domain], true);
1299b16b4c22SMark Johnston out:
1300b16b4c22SMark Johnston 	if (!locked)
1301b16b4c22SMark Johnston 		vm_domain_free_unlock(vmd);
1302b16b4c22SMark Johnston }
1303b16b4c22SMark Johnston 
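/*
 * A minimal, standalone sketch (assuming POSIX threads and C11 atomics) of
 * the "initialize once" pattern used above: an unlocked atomic fast path,
 * followed by a re-check under the lock so that concurrent callers never
 * repeat the work.  The kernel routine uses the per-domain free-queue lock
 * and atomic_load_bool()/atomic_store_bool() instead.
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool initdone;
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;

static void
init_once(void (*do_init)(void))
{
	/* Fast path: already initialized, no lock needed. */
	if (atomic_load(&initdone))
		return;
	pthread_mutex_lock(&init_lock);
	/* Re-check: another thread may have finished while we slept. */
	if (!atomic_load(&initdone)) {
		do_init();
		atomic_store(&initdone, true);
	}
	pthread_mutex_unlock(&init_lock);
}
#endif
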
1304b16b4c22SMark Johnston static void
1305b16b4c22SMark Johnston vm_phys_lazy_init(void)
1306b16b4c22SMark Johnston {
1307b16b4c22SMark Johnston 	for (int domain = 0; domain < vm_ndomains; domain++)
1308b16b4c22SMark Johnston 		vm_phys_lazy_init_domain(domain, false);
1309b16b4c22SMark Johnston 	atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
1310b16b4c22SMark Johnston }
1311b16b4c22SMark Johnston 
1312b16b4c22SMark Johnston static void
1313b16b4c22SMark Johnston vm_phys_lazy_init_kthr(void *arg __unused)
1314b16b4c22SMark Johnston {
1315b16b4c22SMark Johnston 	vm_phys_lazy_init();
1316b16b4c22SMark Johnston 	kthread_exit();
1317b16b4c22SMark Johnston }
1318b16b4c22SMark Johnston 
1319b16b4c22SMark Johnston static void
1320b16b4c22SMark Johnston vm_phys_lazy_sysinit(void *arg __unused)
1321b16b4c22SMark Johnston {
1322b16b4c22SMark Johnston 	struct thread *td;
1323b16b4c22SMark Johnston 	int error;
1324b16b4c22SMark Johnston 
1325b16b4c22SMark Johnston 	error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
1326b16b4c22SMark Johnston 	    RFSTOPPED, 0, "vmlazyinit");
1327b16b4c22SMark Johnston 	if (error == 0) {
1328b16b4c22SMark Johnston 		thread_lock(td);
1329b16b4c22SMark Johnston 		sched_prio(td, PRI_MIN_IDLE);
1330b16b4c22SMark Johnston 		sched_add(td, SRQ_BORING);
1331b16b4c22SMark Johnston 	} else {
1332b16b4c22SMark Johnston 		printf("%s: could not create lazy init thread: %d\n",
1333b16b4c22SMark Johnston 		    __func__, error);
1334b16b4c22SMark Johnston 		vm_phys_lazy_init();
1335b16b4c22SMark Johnston 	}
1336b16b4c22SMark Johnston }
1337b16b4c22SMark Johnston SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
1338b16b4c22SMark Johnston     NULL);
1339b16b4c22SMark Johnston #endif /* VM_FREEPOOL_LAZYINIT */
1340b16b4c22SMark Johnston 
134111752d88SAlan Cox /*
1342e3537f92SDoug Moore  * Free a contiguous, arbitrarily sized set of physical pages, without
1343e3537f92SDoug Moore  * merging across set boundaries.
1344b8590daeSDoug Moore  *
1345b8590daeSDoug Moore  * The free page queues must be locked.
1346b8590daeSDoug Moore  */
1347b8590daeSDoug Moore void
1348e3537f92SDoug Moore vm_phys_enqueue_contig(vm_page_t m, u_long npages)
1349b8590daeSDoug Moore {
1350b8590daeSDoug Moore 	struct vm_freelist *fl;
1351b8590daeSDoug Moore 	struct vm_phys_seg *seg;
1352b8590daeSDoug Moore 	vm_page_t m_end;
1353c9b06fa5SDoug Moore 	vm_paddr_t diff, lo;
1354b8590daeSDoug Moore 	int order;
1355b8590daeSDoug Moore 
1356b8590daeSDoug Moore 	/*
1357b8590daeSDoug Moore 	 * Avoid unnecessary coalescing by freeing the pages in the largest
1358b8590daeSDoug Moore 	 * possible power-of-two-sized subsets.
1359b8590daeSDoug Moore 	 */
1360b8590daeSDoug Moore 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1361b8590daeSDoug Moore 	seg = &vm_phys_segs[m->segind];
1362e3537f92SDoug Moore 	fl = (*seg->free_queues)[m->pool];
1363b8590daeSDoug Moore 	m_end = m + npages;
1364b8590daeSDoug Moore 	/* Free blocks of increasing size. */
13656dd15b7aSDoug Moore 	lo = atop(VM_PAGE_TO_PHYS(m));
1366c9b06fa5SDoug Moore 	if (m < m_end &&
1367c9b06fa5SDoug Moore 	    (diff = lo ^ (lo + npages - 1)) != 0) {
1368543d55d7SDoug Moore 		order = min(ilog2(diff), VM_NFREEORDER - 1);
1369e3537f92SDoug Moore 		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1);
13705c1f2cc4SAlan Cox 	}
1371c9b06fa5SDoug Moore 
1372b8590daeSDoug Moore 	/* Free blocks of maximum size. */
1373c9b06fa5SDoug Moore 	order = VM_NFREEORDER - 1;
1374b8590daeSDoug Moore 	while (m + (1 << order) <= m_end) {
1375b8590daeSDoug Moore 		KASSERT(seg == &vm_phys_segs[m->segind],
1376b8590daeSDoug Moore 		    ("%s: page range [%p,%p) spans multiple segments",
1377b8590daeSDoug Moore 		    __func__, m_end - npages, m));
1378d7ec4a88SMark Johnston 		vm_phys_enq_chunk(fl, m, order, 1);
1379b8590daeSDoug Moore 		m += 1 << order;
1380b8590daeSDoug Moore 	}
1381b8590daeSDoug Moore 	/* Free blocks of diminishing size. */
1382e3537f92SDoug Moore 	vm_phys_enq_beg(m, m_end - m, fl, 1);
1383b8590daeSDoug Moore }
1384b8590daeSDoug Moore 
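/*
 * A minimal, userspace-only sketch of the block decomposition used above: an
 * arbitrary run of pages starting at page frame number "pfn" is split into
 * power-of-two blocks, each aligned to its own size, giving blocks of
 * increasing size at the front, maximum-size blocks in the middle, and blocks
 * of diminishing size at the end.  MAX_ORDER and the demo values are
 * assumptions; the kernel routine additionally chooses the order in which the
 * blocks are enqueued so as to limit later coalescing work.
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <stdio.h>

#define	MAX_ORDER	10	/* assumed largest buddy order */

static void
decompose(unsigned long pfn, unsigned long npages)
{
	int order;

	while (npages > 0) {
		/* Largest order permitted by the alignment of pfn ... */
		order = (pfn == 0) ? MAX_ORDER : __builtin_ctzl(pfn);
		if (order > MAX_ORDER)
			order = MAX_ORDER;
		/* ... and by the number of pages that remain. */
		while ((1UL << order) > npages)
			order--;
		printf("block: pfn %lu order %d\n", pfn, order);
		pfn += 1UL << order;
		npages -= 1UL << order;
	}
}

int
main(void)
{
	decompose(5, 13);	/* pages [5, 18): orders 0, 1, 3, 1 */
	return (0);
}
#endif
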
1385b8590daeSDoug Moore /*
1386b8590daeSDoug Moore  * Free a contiguous, arbitrarily sized set of physical pages.
1387b8590daeSDoug Moore  *
1388b8590daeSDoug Moore  * The free page queues must be locked.
1389b8590daeSDoug Moore  */
1390b8590daeSDoug Moore void
1391b8590daeSDoug Moore vm_phys_free_contig(vm_page_t m, u_long npages)
1392b8590daeSDoug Moore {
13936dd15b7aSDoug Moore 	vm_paddr_t lo;
1394b8590daeSDoug Moore 	vm_page_t m_start, m_end;
13956dd15b7aSDoug Moore 	unsigned max_order, order_start, order_end;
1396b8590daeSDoug Moore 
1397b8590daeSDoug Moore 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1398b8590daeSDoug Moore 
13996dd15b7aSDoug Moore 	lo = atop(VM_PAGE_TO_PHYS(m));
1400543d55d7SDoug Moore 	max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);
1401e3537f92SDoug Moore 
1402e3537f92SDoug Moore 	m_start = m;
14036dd15b7aSDoug Moore 	order_start = ffsll(lo) - 1;
14046dd15b7aSDoug Moore 	if (order_start < max_order)
1405b8590daeSDoug Moore 		m_start += 1 << order_start;
1406e3537f92SDoug Moore 	m_end = m + npages;
14076dd15b7aSDoug Moore 	order_end = ffsll(lo + npages) - 1;
14086dd15b7aSDoug Moore 	if (order_end < max_order)
1409b8590daeSDoug Moore 		m_end -= 1 << order_end;
1410b8590daeSDoug Moore 	/*
1411b8590daeSDoug Moore 	 * Avoid unnecessary coalescing by freeing the pages at the start and
1412b8590daeSDoug Moore 	 * end of the range last.
1413b8590daeSDoug Moore 	 */
1414b8590daeSDoug Moore 	if (m_start < m_end)
1415e3537f92SDoug Moore 		vm_phys_enqueue_contig(m_start, m_end - m_start);
1416e3537f92SDoug Moore 	if (order_start < max_order)
1417b8590daeSDoug Moore 		vm_phys_free_pages(m, order_start);
1418e3537f92SDoug Moore 	if (order_end < max_order)
1419b8590daeSDoug Moore 		vm_phys_free_pages(m_end, order_end);
14205c1f2cc4SAlan Cox }
14215c1f2cc4SAlan Cox 
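/*
 * A minimal, userspace-only sketch of the order arithmetic above, assuming
 * 64-bit page frame numbers: ffsll(lo) - 1 is the order of the largest
 * self-aligned block that can begin at page lo, ffsll(lo + npages) - 1 the
 * same for a block ending at lo + npages, and ilog2(lo ^ (lo + npages))
 * bounds the order of any block that fits entirely inside the run.
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <stdio.h>
#include <strings.h>

int
main(void)
{
	unsigned long long lo = 6, npages = 26;	/* demo run [6, 32) */
	int max_order, order_start, order_end;

	max_order = 63 - __builtin_clzll(lo ^ (lo + npages));	/* ilog2 */
	order_start = ffsll(lo) - 1;
	order_end = ffsll(lo + npages) - 1;
	/*
	 * Prints "max 5 start 1 end 5": only the leading order-1 block is
	 * split off and freed last; the trailing order is not below
	 * max_order, so no trailing block is split off.
	 */
	printf("max %d start %d end %d\n", max_order, order_start, order_end);
	return (0);
}
#endif
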
14225c1f2cc4SAlan Cox /*
14239e817428SDoug Moore  * Identify the first address range within segment segind or greater
14249e817428SDoug Moore  * that matches the domain, lies within the low/high range, and has
14259e817428SDoug Moore  * enough pages.  Return -1 if there is none.
1426c869e672SAlan Cox  */
14279e817428SDoug Moore int
14289e817428SDoug Moore vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
14299e817428SDoug Moore     u_long npages, vm_paddr_t low, vm_paddr_t high)
1430c869e672SAlan Cox {
14319e817428SDoug Moore 	vm_paddr_t pa_end, pa_start;
14329e817428SDoug Moore 	struct vm_phys_seg *end_seg, *seg;
1433c869e672SAlan Cox 
14349e817428SDoug Moore 	KASSERT(npages > 0, ("npages is zero"));
143558d42717SAlan Cox 	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
14369e817428SDoug Moore 	end_seg = &vm_phys_segs[vm_phys_nsegs];
14379e817428SDoug Moore 	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
14383f289c3fSJeff Roberson 		if (seg->domain != domain)
14393f289c3fSJeff Roberson 			continue;
1440c869e672SAlan Cox 		if (seg->start >= high)
14419e817428SDoug Moore 			return (-1);
14429e817428SDoug Moore 		pa_start = MAX(low, seg->start);
14439e817428SDoug Moore 		pa_end = MIN(high, seg->end);
14449e817428SDoug Moore 		if (pa_end - pa_start < ptoa(npages))
1445c869e672SAlan Cox 			continue;
1446b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
1447b16b4c22SMark Johnston 		/*
1448b16b4c22SMark Johnston 		 * The pages on the free lists must be initialized.
1449b16b4c22SMark Johnston 		 */
1450b16b4c22SMark Johnston 		vm_phys_lazy_init_domain(domain, false);
1451b16b4c22SMark Johnston #endif
145269cbb187SMark Johnston 		bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
1453*fbff6d54SMark Johnston 		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
14549e817428SDoug Moore 		return (seg - vm_phys_segs);
1455c869e672SAlan Cox 	}
14569e817428SDoug Moore 	return (-1);
1457c869e672SAlan Cox }
1458c869e672SAlan Cox 
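/*
 * A minimal, standalone sketch (with an assumed 4 KB page size) of the
 * per-segment test above: clamp the caller's [low, high) window to the
 * segment and check whether the intersection can hold npages pages.
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <stdbool.h>
#include <stdint.h>

#define	DEMO_PAGE_SIZE	4096UL			/* assumed page size */
#define	DEMO_MAX(a, b)	((a) > (b) ? (a) : (b))
#define	DEMO_MIN(a, b)	((a) < (b) ? (a) : (b))

static bool
segment_can_hold(uint64_t seg_start, uint64_t seg_end, uint64_t low,
    uint64_t high, unsigned long npages)
{
	uint64_t pa_start, pa_end;

	pa_start = DEMO_MAX(low, seg_start);
	pa_end = DEMO_MIN(high, seg_end);
	if (pa_end <= pa_start)
		return (false);		/* window misses the segment */
	return (pa_end - pa_start >= npages * DEMO_PAGE_SIZE);
}
#endif
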
1459c869e672SAlan Cox /*
14609742373aSAlan Cox  * Search for the given physical page "m" in the free lists.  If the search
14616062d9faSMark Johnston  * succeeds, remove "m" from the free lists and return true.  Otherwise, return
14626062d9faSMark Johnston  * false, indicating that "m" is not in the free lists.
14637bfda801SAlan Cox  *
14647bfda801SAlan Cox  * The free page queues must be locked.
14657bfda801SAlan Cox  */
14666062d9faSMark Johnston bool
1467b16b4c22SMark Johnston vm_phys_unfree_page(vm_paddr_t pa)
14687bfda801SAlan Cox {
14697bfda801SAlan Cox 	struct vm_freelist *fl;
14707bfda801SAlan Cox 	struct vm_phys_seg *seg;
1471b16b4c22SMark Johnston 	vm_paddr_t pa_half;
1472b16b4c22SMark Johnston 	vm_page_t m, m_set, m_tmp;
1473e3537f92SDoug Moore 	int order;
14747bfda801SAlan Cox 
1475b16b4c22SMark Johnston 	seg = vm_phys_paddr_to_seg(pa);
1476b16b4c22SMark Johnston 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1477b16b4c22SMark Johnston 
1478b16b4c22SMark Johnston 	/*
1479b16b4c22SMark Johnston 	 * The pages on the free lists must be initialized.
1480b16b4c22SMark Johnston 	 */
1481b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
1482b16b4c22SMark Johnston 	vm_phys_lazy_init_domain(seg->domain, true);
1483b16b4c22SMark Johnston #endif
1484b16b4c22SMark Johnston 
14857bfda801SAlan Cox 	/*
14867bfda801SAlan Cox 	 * First, find the contiguous, power of two-sized set of free
14877bfda801SAlan Cox 	 * physical pages containing the given physical page "m" and
14887bfda801SAlan Cox 	 * assign it to "m_set".
14897bfda801SAlan Cox 	 */
1490b16b4c22SMark Johnston 	m = vm_phys_paddr_to_vm_page(pa);
14917bfda801SAlan Cox 	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1492bc8794a1SAlan Cox 	    order < VM_NFREEORDER - 1; ) {
14937bfda801SAlan Cox 		order++;
14947bfda801SAlan Cox 		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
14952fbced65SAlan Cox 		if (pa >= seg->start)
149669cbb187SMark Johnston 			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
1497e35395ceSAlan Cox 		else
14986062d9faSMark Johnston 			return (false);
14997bfda801SAlan Cox 	}
1500e35395ceSAlan Cox 	if (m_set->order < order)
15016062d9faSMark Johnston 		return (false);
1502e35395ceSAlan Cox 	if (m_set->order == VM_NFREEORDER)
15036062d9faSMark Johnston 		return (false);
15047bfda801SAlan Cox 	KASSERT(m_set->order < VM_NFREEORDER,
15057bfda801SAlan Cox 	    ("vm_phys_unfree_page: page %p has unexpected order %d",
15067bfda801SAlan Cox 	    m_set, m_set->order));
15077bfda801SAlan Cox 
15087bfda801SAlan Cox 	/*
15097bfda801SAlan Cox 	 * Next, remove "m_set" from the free lists.  Finally, extract
15107bfda801SAlan Cox 	 * "m" from "m_set" using an iterative algorithm: While "m_set"
15117bfda801SAlan Cox 	 * is larger than a page, shrink "m_set" by returning the half
15127bfda801SAlan Cox 	 * of "m_set" that does not contain "m" to the free lists.
15137bfda801SAlan Cox 	 */
1514e3537f92SDoug Moore 	fl = (*seg->free_queues)[m_set->pool];
15157bfda801SAlan Cox 	order = m_set->order;
15167e226537SAttilio Rao 	vm_freelist_rem(fl, m_set, order);
15177bfda801SAlan Cox 	while (order > 0) {
15187bfda801SAlan Cox 		order--;
15197bfda801SAlan Cox 		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
15207bfda801SAlan Cox 		if (m->phys_addr < pa_half)
152169cbb187SMark Johnston 			m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
15227bfda801SAlan Cox 		else {
15237bfda801SAlan Cox 			m_tmp = m_set;
152469cbb187SMark Johnston 			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
15257bfda801SAlan Cox 		}
15267e226537SAttilio Rao 		vm_freelist_add(fl, m_tmp, order, 0);
15277bfda801SAlan Cox 	}
15287bfda801SAlan Cox 	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
15296062d9faSMark Johnston 	return (true);
15307bfda801SAlan Cox }
15317bfda801SAlan Cox 
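/*
 * A minimal, userspace-only sketch of the splitting loop above: a free
 * 2^order block containing the target address is repeatedly halved, and the
 * half that does not contain the target is returned to the free lists (here
 * just printed), until only the target page remains.  DEMO_PAGE_SHIFT and
 * the demo addresses are assumptions for illustration.
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <stdint.h>
#include <stdio.h>

#define	DEMO_PAGE_SHIFT	12	/* assumed 4 KB pages */

static void
carve_out(uint64_t set_pa, int order, uint64_t target_pa)
{
	uint64_t pa_half;

	while (order > 0) {
		order--;
		pa_half = set_pa ^ ((uint64_t)1 << (DEMO_PAGE_SHIFT + order));
		if (target_pa < pa_half) {
			/* Target is in the lower half; free the upper one. */
			printf("free %#jx order %d\n", (uintmax_t)pa_half,
			    order);
		} else {
			/* Target is in the upper half; free the lower one. */
			printf("free %#jx order %d\n", (uintmax_t)set_pa,
			    order);
			set_pa = pa_half;
		}
	}
}

int
main(void)
{
	carve_out(0x8000, 3, 0xd000);	/* isolate the page at 0xd000 */
	return (0);
}
#endif
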
15327bfda801SAlan Cox /*
15332a4897bdSDoug Moore  * Find a run of contiguous physical pages, meeting alignment requirements, from
15342a4897bdSDoug Moore  * a list of max-sized page blocks, where we need at least two consecutive
15352a4897bdSDoug Moore  * blocks to satisfy the (large) page request.
1536fa8a6585SDoug Moore  */
1537fa8a6585SDoug Moore static vm_page_t
15382a4897bdSDoug Moore vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
1539fa8a6585SDoug Moore     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1540fa8a6585SDoug Moore {
1541fa8a6585SDoug Moore 	struct vm_phys_seg *seg;
15422a4897bdSDoug Moore 	vm_page_t m, m_iter, m_ret;
15432a4897bdSDoug Moore 	vm_paddr_t max_size, size;
15442a4897bdSDoug Moore 	int max_order;
1545fa8a6585SDoug Moore 
15462a4897bdSDoug Moore 	max_order = VM_NFREEORDER - 1;
1547fa8a6585SDoug Moore 	size = npages << PAGE_SHIFT;
15482a4897bdSDoug Moore 	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
15492a4897bdSDoug Moore 	KASSERT(size > max_size, ("size is too small"));
15502a4897bdSDoug Moore 
1551fa8a6585SDoug Moore 	/*
15522a4897bdSDoug Moore 	 * In order to avoid examining any free max-sized page block more than
15532a4897bdSDoug Moore 	 * twice, identify the ones that are first in a physically-contiguous
15542a4897bdSDoug Moore 	 * sequence of such blocks, and only for those walk the sequence to
15552a4897bdSDoug Moore 	 * check if there are enough free blocks starting at a properly aligned
15562a4897bdSDoug Moore 	 * block.  Thus, no block is checked for free-ness more than twice.
1557fa8a6585SDoug Moore 	 */
15582a4897bdSDoug Moore 	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
15592a4897bdSDoug Moore 		/*
15602a4897bdSDoug Moore 		 * Skip m unless it is first in a sequence of free max page
15612a4897bdSDoug Moore 		 * blocks >= low in its segment.
15622a4897bdSDoug Moore 		 */
15632a4897bdSDoug Moore 		seg = &vm_phys_segs[m->segind];
15642a4897bdSDoug Moore 		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
15652a4897bdSDoug Moore 			continue;
15662a4897bdSDoug Moore 		if (VM_PAGE_TO_PHYS(m) >= max_size &&
15672a4897bdSDoug Moore 		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
15682a4897bdSDoug Moore 		    max_order == m[-1 << max_order].order)
1569fa8a6585SDoug Moore 			continue;
1570fa8a6585SDoug Moore 
1571fa8a6585SDoug Moore 		/*
15722a4897bdSDoug Moore 		 * Advance m_ret from m to the first of the sequence, if any,
15732a4897bdSDoug Moore 		 * that satisfies alignment conditions and might leave enough
15742a4897bdSDoug Moore 		 * space.
1575fa8a6585SDoug Moore 		 */
15762a4897bdSDoug Moore 		m_ret = m;
15772a4897bdSDoug Moore 		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
15782a4897bdSDoug Moore 		    size, alignment, boundary) &&
15792a4897bdSDoug Moore 		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
15802a4897bdSDoug Moore 		    max_order == m_ret[1 << max_order].order)
15812a4897bdSDoug Moore 			m_ret += 1 << max_order;
15822a4897bdSDoug Moore 
15832a4897bdSDoug Moore 		/*
15842a4897bdSDoug Moore 		 * Skip m unless some block m_ret in the sequence is properly
15852a4897bdSDoug Moore 		 * aligned, and begins a sequence of enough pages less than
15862a4897bdSDoug Moore 		 * high, and in the same segment.
15872a4897bdSDoug Moore 		 */
15882a4897bdSDoug Moore 		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
1589fa8a6585SDoug Moore 			continue;
1590fa8a6585SDoug Moore 
1591fa8a6585SDoug Moore 		/*
15922a4897bdSDoug Moore 		 * Skip m unless the blocks to allocate starting at m_ret are
15932a4897bdSDoug Moore 		 * all free.
1594fa8a6585SDoug Moore 		 */
15952a4897bdSDoug Moore 		for (m_iter = m_ret;
15962a4897bdSDoug Moore 		    m_iter < m_ret + npages && max_order == m_iter->order;
15972a4897bdSDoug Moore 		    m_iter += 1 << max_order) {
1598fa8a6585SDoug Moore 		}
15992a4897bdSDoug Moore 		if (m_iter < m_ret + npages)
1600fa8a6585SDoug Moore 			continue;
1601fa8a6585SDoug Moore 		return (m_ret);
1602fa8a6585SDoug Moore 	}
1603fa8a6585SDoug Moore 	return (NULL);
1604fa8a6585SDoug Moore }
1605fa8a6585SDoug Moore 
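/*
 * A minimal, userspace-only sketch of the free-run check above: given an
 * array recording the buddy order stored in the first page of each free
 * block (and -1 elsewhere), step through the candidate run in strides of one
 * max-order block and verify that every such block is free.  The array
 * encoding and DEMO_MAX_ORDER are assumptions, not the kernel's vm_page
 * layout.
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <stdbool.h>

#define	DEMO_MAX_ORDER	3	/* assumed largest buddy order */

static bool
run_of_max_blocks_free(const int *order, unsigned long start,
    unsigned long npages)
{
	unsigned long i;

	for (i = start; i < start + npages; i += 1UL << DEMO_MAX_ORDER) {
		if (order[i] != DEMO_MAX_ORDER)
			return (false);	/* this max-order block is not free */
	}
	return (true);
}
#endif
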
1606fa8a6585SDoug Moore /*
1607fa8a6585SDoug Moore  * Find a run of contiguous physical pages from the specified free list
1608342056faSDoug Moore  * table.
1609c869e672SAlan Cox  */
1610c869e672SAlan Cox static vm_page_t
1611fa8a6585SDoug Moore vm_phys_find_queues_contig(
1612342056faSDoug Moore     struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
1613342056faSDoug Moore     u_long npages, vm_paddr_t low, vm_paddr_t high,
1614342056faSDoug Moore     u_long alignment, vm_paddr_t boundary)
1615c869e672SAlan Cox {
1616c869e672SAlan Cox 	struct vm_freelist *fl;
1617fa8a6585SDoug Moore 	vm_page_t m_ret;
1618c869e672SAlan Cox 	vm_paddr_t pa, pa_end, size;
1619c869e672SAlan Cox 	int oind, order, pind;
1620c869e672SAlan Cox 
1621c869e672SAlan Cox 	KASSERT(npages > 0, ("npages is 0"));
1622c869e672SAlan Cox 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1623c869e672SAlan Cox 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1624c869e672SAlan Cox 	/* Compute the queue that is the best fit for npages. */
16259161b4deSAlan Cox 	order = flsl(npages - 1);
1626fa8a6585SDoug Moore 	/* Search for a large enough free block. */
1627c869e672SAlan Cox 	size = npages << PAGE_SHIFT;
1628fa8a6585SDoug Moore 	for (oind = order; oind < VM_NFREEORDER; oind++) {
1629b16b4c22SMark Johnston 		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1630342056faSDoug Moore 			fl = (*queues)[pind];
16315cd29d0fSMark Johnston 			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
1632c869e672SAlan Cox 				/*
1633da92ecbcSDoug Moore 				 * Determine if the address range starting at pa
1634da92ecbcSDoug Moore 				 * is within the given range, satisfies the
1635da92ecbcSDoug Moore 				 * given alignment, and does not cross the given
1636da92ecbcSDoug Moore 				 * boundary.
163711752d88SAlan Cox 				 */
1638da92ecbcSDoug Moore 				pa = VM_PAGE_TO_PHYS(m_ret);
1639da92ecbcSDoug Moore 				pa_end = pa + size;
1640fa8a6585SDoug Moore 				if (low <= pa && pa_end <= high &&
1641fa8a6585SDoug Moore 				    vm_addr_ok(pa, size, alignment, boundary))
1642fa8a6585SDoug Moore 					return (m_ret);
1643fa8a6585SDoug Moore 			}
1644fa8a6585SDoug Moore 		}
1645fa8a6585SDoug Moore 	}
1646da92ecbcSDoug Moore 	if (order < VM_NFREEORDER)
1647fa8a6585SDoug Moore 		return (NULL);
16482a4897bdSDoug Moore 	/* Search for a long-enough sequence of max-order blocks. */
1649b16b4c22SMark Johnston 	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1650fa8a6585SDoug Moore 		fl = (*queues)[pind];
16512a4897bdSDoug Moore 		m_ret = vm_phys_find_freelist_contig(fl, npages,
1652fa8a6585SDoug Moore 		    low, high, alignment, boundary);
1653fa8a6585SDoug Moore 		if (m_ret != NULL)
1654fa8a6585SDoug Moore 			return (m_ret);
165511752d88SAlan Cox 	}
165611752d88SAlan Cox 	return (NULL);
165711752d88SAlan Cox }
165811752d88SAlan Cox 
1659b7565d44SJeff Roberson /*
1660342056faSDoug Moore  * Allocate a contiguous set of physical pages of the given size
1661342056faSDoug Moore  * "npages" from the free lists.  All of the physical pages must be at
1662342056faSDoug Moore  * or above the given physical address "low" and below the given
1663342056faSDoug Moore  * physical address "high".  The given value "alignment" determines the
1664342056faSDoug Moore  * alignment of the first physical page in the set.  If the given value
1665342056faSDoug Moore  * "boundary" is non-zero, then the set of physical pages cannot cross
1666342056faSDoug Moore  * any physical address boundary that is a multiple of that value.  Both
1667e3537f92SDoug Moore  * "alignment" and "boundary" must be powers of two.
1668342056faSDoug Moore  */
1669342056faSDoug Moore vm_page_t
1670342056faSDoug Moore vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
1671342056faSDoug Moore     u_long alignment, vm_paddr_t boundary)
1672342056faSDoug Moore {
1673342056faSDoug Moore 	vm_paddr_t pa_end, pa_start;
1674fa8a6585SDoug Moore 	struct vm_freelist *fl;
1675fa8a6585SDoug Moore 	vm_page_t m, m_run;
1676342056faSDoug Moore 	struct vm_phys_seg *seg;
1677342056faSDoug Moore 	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
1678fa8a6585SDoug Moore 	int oind, segind;
1679342056faSDoug Moore 
1680342056faSDoug Moore 	KASSERT(npages > 0, ("npages is 0"));
1681342056faSDoug Moore 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1682342056faSDoug Moore 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1683342056faSDoug Moore 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
1684342056faSDoug Moore 	if (low >= high)
1685342056faSDoug Moore 		return (NULL);
1686342056faSDoug Moore 	queues = NULL;
1687342056faSDoug Moore 	m_run = NULL;
1688342056faSDoug Moore 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1689342056faSDoug Moore 		seg = &vm_phys_segs[segind];
1690342056faSDoug Moore 		if (seg->start >= high || seg->domain != domain)
1691342056faSDoug Moore 			continue;
1692342056faSDoug Moore 		if (low >= seg->end)
1693342056faSDoug Moore 			break;
1694342056faSDoug Moore 		if (low <= seg->start)
1695342056faSDoug Moore 			pa_start = seg->start;
1696342056faSDoug Moore 		else
1697342056faSDoug Moore 			pa_start = low;
1698342056faSDoug Moore 		if (high < seg->end)
1699342056faSDoug Moore 			pa_end = high;
1700342056faSDoug Moore 		else
1701342056faSDoug Moore 			pa_end = seg->end;
1702342056faSDoug Moore 		if (pa_end - pa_start < ptoa(npages))
1703342056faSDoug Moore 			continue;
1704342056faSDoug Moore 		/*
1705342056faSDoug Moore 		 * If a previous segment led to a search using
1706342056faSDoug Moore 		 * the same free lists as would this segment, then
1707342056faSDoug Moore 		 * we've actually already searched within this
1708342056faSDoug Moore 		 * too.  So skip it.
1709342056faSDoug Moore 		 */
1710342056faSDoug Moore 		if (seg->free_queues == queues)
1711342056faSDoug Moore 			continue;
1712342056faSDoug Moore 		queues = seg->free_queues;
1713fa8a6585SDoug Moore 		m_run = vm_phys_find_queues_contig(queues, npages,
1714342056faSDoug Moore 		    low, high, alignment, boundary);
1715342056faSDoug Moore 		if (m_run != NULL)
1716342056faSDoug Moore 			break;
1717342056faSDoug Moore 	}
1718fa8a6585SDoug Moore 	if (m_run == NULL)
1719fa8a6585SDoug Moore 		return (NULL);
1720fa8a6585SDoug Moore 
1721fa8a6585SDoug Moore 	/* Allocate pages from the page-range found. */
1722fa8a6585SDoug Moore 	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
1723fa8a6585SDoug Moore 		fl = (*queues)[m->pool];
1724fa8a6585SDoug Moore 		oind = m->order;
1725fa8a6585SDoug Moore 		vm_freelist_rem(fl, m, oind);
1726e3537f92SDoug Moore 		if (m->pool != VM_FREEPOOL_DEFAULT)
1727e3537f92SDoug Moore 			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
1728fa8a6585SDoug Moore 	}
1729fa8a6585SDoug Moore 	/* Return excess pages to the free lists. */
1730fa8a6585SDoug Moore 	fl = (*queues)[VM_FREEPOOL_DEFAULT];
1731e3537f92SDoug Moore 	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0);
17322a4897bdSDoug Moore 
17332a4897bdSDoug Moore 	/* Return page verified to satisfy conditions of request. */
17342a4897bdSDoug Moore 	pa_start = VM_PAGE_TO_PHYS(m_run);
17352a4897bdSDoug Moore 	KASSERT(low <= pa_start,
17362a4897bdSDoug Moore 	    ("memory allocated below minimum requested range"));
17372a4897bdSDoug Moore 	KASSERT(pa_start + ptoa(npages) <= high,
17382a4897bdSDoug Moore 	    ("memory allocated above maximum requested range"));
17392a4897bdSDoug Moore 	seg = &vm_phys_segs[m_run->segind];
17402a4897bdSDoug Moore 	KASSERT(seg->domain == domain,
17412a4897bdSDoug Moore 	    ("memory not allocated from specified domain"));
17422a4897bdSDoug Moore 	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
17432a4897bdSDoug Moore 	    ("memory alignment/boundary constraints not satisfied"));
1744342056faSDoug Moore 	return (m_run);
1745342056faSDoug Moore }
1746342056faSDoug Moore 
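/*
 * A minimal, standalone re-statement of the contract described above: the
 * first physical address must be a multiple of "alignment", and when
 * "boundary" is non-zero the run [pa, pa + size) must not cross a multiple
 * of it.  Both are assumed to be powers of two; this sketch is for
 * illustration only and is not the kernel's vm_addr_ok().
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <stdbool.h>
#include <stdint.h>

static bool
addr_ok(uint64_t pa, uint64_t size, uint64_t alignment, uint64_t boundary)
{
	if ((pa & (alignment - 1)) != 0)
		return (false);		/* first page is misaligned */
	if (boundary != 0 &&
	    (pa & ~(boundary - 1)) != ((pa + size - 1) & ~(boundary - 1)))
		return (false);		/* run straddles a boundary multiple */
	return (true);
}
#endif
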
1747342056faSDoug Moore /*
1748b7565d44SJeff Roberson  * Return the index of the first unused slot which may be the terminating
1749b7565d44SJeff Roberson  * entry.
1750b7565d44SJeff Roberson  */
1751b7565d44SJeff Roberson static int
1752b7565d44SJeff Roberson vm_phys_avail_count(void)
1753b7565d44SJeff Roberson {
1754b7565d44SJeff Roberson 	int i;
1755b7565d44SJeff Roberson 
1756b7565d44SJeff Roberson 	for (i = 0; phys_avail[i + 1]; i += 2)
1757b7565d44SJeff Roberson 		continue;
1758b7565d44SJeff Roberson 	if (i > PHYS_AVAIL_ENTRIES)
1759b7565d44SJeff Roberson 		panic("Improperly terminated phys_avail %d entries", i);
1760b7565d44SJeff Roberson 
1761b7565d44SJeff Roberson 	return (i);
1762b7565d44SJeff Roberson }
1763b7565d44SJeff Roberson 
1764b7565d44SJeff Roberson /*
1765b7565d44SJeff Roberson  * Assert that a phys_avail entry is valid.
1766b7565d44SJeff Roberson  */
1767b7565d44SJeff Roberson static void
1768b7565d44SJeff Roberson vm_phys_avail_check(int i)
1769b7565d44SJeff Roberson {
1770b7565d44SJeff Roberson 	if (phys_avail[i] & PAGE_MASK)
1771b7565d44SJeff Roberson 		panic("Unaligned phys_avail[%d]: %#jx", i,
1772b7565d44SJeff Roberson 		    (intmax_t)phys_avail[i]);
1773b7565d44SJeff Roberson 	if (phys_avail[i+1] & PAGE_MASK)
1774b7565d44SJeff Roberson 		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
1775b7565d44SJeff Roberson 		    (intmax_t)phys_avail[i + 1]);
1776b7565d44SJeff Roberson 	if (phys_avail[i + 1] < phys_avail[i])
1777b7565d44SJeff Roberson 		panic("phys_avail[%d] start %#jx > end %#jx", i,
1778b7565d44SJeff Roberson 		    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]);
1779b7565d44SJeff Roberson }
1780b7565d44SJeff Roberson 
1781b7565d44SJeff Roberson /*
1782b7565d44SJeff Roberson  * Return the index of an overlapping phys_avail entry or -1.
1783b7565d44SJeff Roberson  */
1784be3f5f29SJeff Roberson #ifdef NUMA
1785b7565d44SJeff Roberson static int
1786b7565d44SJeff Roberson vm_phys_avail_find(vm_paddr_t pa)
1787b7565d44SJeff Roberson {
1788b7565d44SJeff Roberson 	int i;
1789b7565d44SJeff Roberson 
1790b7565d44SJeff Roberson 	for (i = 0; phys_avail[i + 1]; i += 2)
1791b7565d44SJeff Roberson 		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
1792b7565d44SJeff Roberson 			return (i);
1793b7565d44SJeff Roberson 	return (-1);
1794b7565d44SJeff Roberson }
1795be3f5f29SJeff Roberson #endif
1796b7565d44SJeff Roberson 
1797b7565d44SJeff Roberson /*
1798b7565d44SJeff Roberson  * Return the index of the largest entry.
1799b7565d44SJeff Roberson  */
1800b7565d44SJeff Roberson int
1801b7565d44SJeff Roberson vm_phys_avail_largest(void)
1802b7565d44SJeff Roberson {
1803b7565d44SJeff Roberson 	vm_paddr_t sz, largesz;
1804b7565d44SJeff Roberson 	int largest;
1805b7565d44SJeff Roberson 	int i;
1806b7565d44SJeff Roberson 
1807b7565d44SJeff Roberson 	largest = 0;
1808b7565d44SJeff Roberson 	largesz = 0;
1809b7565d44SJeff Roberson 	for (i = 0; phys_avail[i + 1]; i += 2) {
1810b7565d44SJeff Roberson 		sz = vm_phys_avail_size(i);
1811b7565d44SJeff Roberson 		if (sz > largesz) {
1812b7565d44SJeff Roberson 			largesz = sz;
1813b7565d44SJeff Roberson 			largest = i;
1814b7565d44SJeff Roberson 		}
1815b7565d44SJeff Roberson 	}
1816b7565d44SJeff Roberson 
1817b7565d44SJeff Roberson 	return (largest);
1818b7565d44SJeff Roberson }
1819b7565d44SJeff Roberson 
1820b7565d44SJeff Roberson vm_paddr_t
1821b7565d44SJeff Roberson vm_phys_avail_size(int i)
1822b7565d44SJeff Roberson {
1823b7565d44SJeff Roberson 
1824b7565d44SJeff Roberson 	return (phys_avail[i + 1] - phys_avail[i]);
1825b7565d44SJeff Roberson }
1826b7565d44SJeff Roberson 
1827b7565d44SJeff Roberson /*
1828b7565d44SJeff Roberson  * Split an entry at the address 'pa'.  Return zero on success or errno.
1829b7565d44SJeff Roberson  */
1830b7565d44SJeff Roberson static int
1831b7565d44SJeff Roberson vm_phys_avail_split(vm_paddr_t pa, int i)
1832b7565d44SJeff Roberson {
1833b7565d44SJeff Roberson 	int cnt;
1834b7565d44SJeff Roberson 
1835b7565d44SJeff Roberson 	vm_phys_avail_check(i);
1836b7565d44SJeff Roberson 	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
1837b7565d44SJeff Roberson 		panic("vm_phys_avail_split: invalid address");
1838b7565d44SJeff Roberson 	cnt = vm_phys_avail_count();
1839b7565d44SJeff Roberson 	if (cnt >= PHYS_AVAIL_ENTRIES)
1840b7565d44SJeff Roberson 		return (ENOSPC);
1841b7565d44SJeff Roberson 	memmove(&phys_avail[i + 2], &phys_avail[i],
1842b7565d44SJeff Roberson 	    (cnt - i) * sizeof(phys_avail[0]));
1843b7565d44SJeff Roberson 	phys_avail[i + 1] = pa;
1844b7565d44SJeff Roberson 	phys_avail[i + 2] = pa;
1845b7565d44SJeff Roberson 	vm_phys_avail_check(i);
1846b7565d44SJeff Roberson 	vm_phys_avail_check(i+2);
1847b7565d44SJeff Roberson 
1848b7565d44SJeff Roberson 	return (0);
1849b7565d44SJeff Roberson }
1850b7565d44SJeff Roberson 
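/*
 * A minimal, userspace-only sketch of the split above on a flat array of
 * [start, end) pairs terminated by a zero end: the entries from slot i on
 * are shifted right by one pair, and "pa" becomes both the new end of the
 * old entry and the start of the new one.  The demo array and its capacity
 * are assumptions for illustration.
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <errno.h>
#include <stdint.h>
#include <string.h>

#define	DEMO_AVAIL_SLOTS	16	/* assumed array capacity, in slots */

static uint64_t demo_avail[DEMO_AVAIL_SLOTS] = {
	0x1000, 0x8000, 0x10000, 0x20000
};

static int
demo_avail_split(uint64_t pa, int i)
{
	int cnt;

	/* Count used slots up to the terminating zero entry. */
	for (cnt = 0; demo_avail[cnt + 1] != 0; cnt += 2)
		continue;
	if (cnt + 4 > DEMO_AVAIL_SLOTS)
		return (ENOSPC);	/* no room for pair + terminator */
	memmove(&demo_avail[i + 2], &demo_avail[i],
	    (cnt - i) * sizeof(demo_avail[0]));
	demo_avail[i + 1] = pa;		/* old entry now ends at pa */
	demo_avail[i + 2] = pa;		/* new entry starts at pa */
	return (0);
}
#endif
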
185131991a5aSMitchell Horne /*
185231991a5aSMitchell Horne  * Check if a given physical address can be included as part of a crash dump.
185331991a5aSMitchell Horne  */
185431991a5aSMitchell Horne bool
185531991a5aSMitchell Horne vm_phys_is_dumpable(vm_paddr_t pa)
185631991a5aSMitchell Horne {
185731991a5aSMitchell Horne 	vm_page_t m;
185831991a5aSMitchell Horne 	int i;
185931991a5aSMitchell Horne 
186031991a5aSMitchell Horne 	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
186131991a5aSMitchell Horne 		return ((m->flags & PG_NODUMP) == 0);
186231991a5aSMitchell Horne 
186331991a5aSMitchell Horne 	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
186431991a5aSMitchell Horne 		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
186531991a5aSMitchell Horne 			return (true);
186631991a5aSMitchell Horne 	}
186731991a5aSMitchell Horne 	return (false);
186831991a5aSMitchell Horne }
186931991a5aSMitchell Horne 
187081302f1dSMark Johnston void
187181302f1dSMark Johnston vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
187281302f1dSMark Johnston {
187381302f1dSMark Johnston 	struct vm_phys_seg *seg;
187481302f1dSMark Johnston 
187581302f1dSMark Johnston 	if (vm_phys_early_nsegs == -1)
187681302f1dSMark Johnston 		panic("%s: called after initialization", __func__);
187781302f1dSMark Johnston 	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
187881302f1dSMark Johnston 		panic("%s: ran out of early segments", __func__);
187981302f1dSMark Johnston 
188081302f1dSMark Johnston 	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
188181302f1dSMark Johnston 	seg->start = start;
188281302f1dSMark Johnston 	seg->end = end;
188381302f1dSMark Johnston }
188481302f1dSMark Johnston 
1885b7565d44SJeff Roberson /*
1886b7565d44SJeff Roberson  * This routine allocates NUMA node specific memory before the page
1887b7565d44SJeff Roberson  * allocator is bootstrapped.
1888b7565d44SJeff Roberson  */
1889b7565d44SJeff Roberson vm_paddr_t
1890b7565d44SJeff Roberson vm_phys_early_alloc(int domain, size_t alloc_size)
1891b7565d44SJeff Roberson {
18922e7838aeSJohn Baldwin #ifdef NUMA
18932e7838aeSJohn Baldwin 	int mem_index;
18942e7838aeSJohn Baldwin #endif
18952e7838aeSJohn Baldwin 	int i, biggestone;
1896b7565d44SJeff Roberson 	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;
1897b7565d44SJeff Roberson 
189881302f1dSMark Johnston 	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
189981302f1dSMark Johnston 	    ("%s: invalid domain index %d", __func__, domain));
1900b7565d44SJeff Roberson 
1901b7565d44SJeff Roberson 	/*
1902b7565d44SJeff Roberson 	 * Search the mem_affinity array for the biggest address
1903b7565d44SJeff Roberson 	 * range in the desired domain.  This is used to constrain
1904b7565d44SJeff Roberson 	 * the phys_avail selection below.
1905b7565d44SJeff Roberson 	 */
1906b7565d44SJeff Roberson 	biggestsize = 0;
1907b7565d44SJeff Roberson 	mem_start = 0;
1908b7565d44SJeff Roberson 	mem_end = -1;
1909b7565d44SJeff Roberson #ifdef NUMA
19102e7838aeSJohn Baldwin 	mem_index = 0;
1911b7565d44SJeff Roberson 	if (mem_affinity != NULL) {
1912b7565d44SJeff Roberson 		for (i = 0;; i++) {
1913b7565d44SJeff Roberson 			size = mem_affinity[i].end - mem_affinity[i].start;
1914b7565d44SJeff Roberson 			if (size == 0)
1915b7565d44SJeff Roberson 				break;
191681302f1dSMark Johnston 			if (domain != -1 && mem_affinity[i].domain != domain)
1917b7565d44SJeff Roberson 				continue;
1918b7565d44SJeff Roberson 			if (size > biggestsize) {
1919b7565d44SJeff Roberson 				mem_index = i;
1920b7565d44SJeff Roberson 				biggestsize = size;
1921b7565d44SJeff Roberson 			}
1922b7565d44SJeff Roberson 		}
1923b7565d44SJeff Roberson 		mem_start = mem_affinity[mem_index].start;
1924b7565d44SJeff Roberson 		mem_end = mem_affinity[mem_index].end;
1925b7565d44SJeff Roberson 	}
1926b7565d44SJeff Roberson #endif
1927b7565d44SJeff Roberson 
1928b7565d44SJeff Roberson 	/*
1929b7565d44SJeff Roberson 	 * Now find the biggest physical segment within the desired
1930b7565d44SJeff Roberson 	 * NUMA domain.
1931b7565d44SJeff Roberson 	 */
1932b7565d44SJeff Roberson 	biggestsize = 0;
1933b7565d44SJeff Roberson 	biggestone = 0;
1934b7565d44SJeff Roberson 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1935b7565d44SJeff Roberson 		/* skip regions that are out of range */
1936b7565d44SJeff Roberson 		if (phys_avail[i+1] - alloc_size < mem_start ||
1937b7565d44SJeff Roberson 		    phys_avail[i+1] > mem_end)
1938b7565d44SJeff Roberson 			continue;
1939b7565d44SJeff Roberson 		size = vm_phys_avail_size(i);
1940b7565d44SJeff Roberson 		if (size > biggestsize) {
1941b7565d44SJeff Roberson 			biggestone = i;
1942b7565d44SJeff Roberson 			biggestsize = size;
1943b7565d44SJeff Roberson 		}
1944b7565d44SJeff Roberson 	}
1945b7565d44SJeff Roberson 	alloc_size = round_page(alloc_size);
1946b7565d44SJeff Roberson 
1947b7565d44SJeff Roberson 	/*
1948b7565d44SJeff Roberson 	 * Grab single pages from the front to reduce fragmentation.
1949b7565d44SJeff Roberson 	 */
1950b7565d44SJeff Roberson 	if (alloc_size == PAGE_SIZE) {
1951b7565d44SJeff Roberson 		pa = phys_avail[biggestone];
1952b7565d44SJeff Roberson 		phys_avail[biggestone] += PAGE_SIZE;
1953b7565d44SJeff Roberson 		vm_phys_avail_check(biggestone);
1954b7565d44SJeff Roberson 		return (pa);
1955b7565d44SJeff Roberson 	}
1956b7565d44SJeff Roberson 
1957b7565d44SJeff Roberson 	/*
1958b7565d44SJeff Roberson 	 * Naturally align large allocations.
1959b7565d44SJeff Roberson 	 */
1960b7565d44SJeff Roberson 	align = phys_avail[biggestone + 1] & (alloc_size - 1);
1961b7565d44SJeff Roberson 	if (alloc_size + align > biggestsize)
1962b7565d44SJeff Roberson 		panic("cannot find a large enough size\n");
1963b7565d44SJeff Roberson 	if (align != 0 &&
1964b7565d44SJeff Roberson 	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
1965b7565d44SJeff Roberson 	    biggestone) != 0)
1966b7565d44SJeff Roberson 		/* Wasting memory. */
1967b7565d44SJeff Roberson 		phys_avail[biggestone + 1] -= align;
1968b7565d44SJeff Roberson 
1969b7565d44SJeff Roberson 	phys_avail[biggestone + 1] -= alloc_size;
1970b7565d44SJeff Roberson 	vm_phys_avail_check(biggestone);
1971b7565d44SJeff Roberson 	pa = phys_avail[biggestone + 1];
1972b7565d44SJeff Roberson 	return (pa);
1973b7565d44SJeff Roberson }
1974b7565d44SJeff Roberson 
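/*
 * A minimal, standalone sketch of the "naturally align large allocations"
 * step above: the allocation is carved from the end of a [start, end)
 * region, after first trimming the end down to a multiple of the
 * (power-of-two) size so that the returned address is size-aligned.  The
 * demo values are assumptions for illustration.
 */
#if 0	/* illustrative sketch; not compiled into the kernel */
#include <stdint.h>
#include <stdio.h>

static uint64_t
alloc_from_end(uint64_t start, uint64_t *end, uint64_t size)
{
	uint64_t align;

	align = *end & (size - 1);	/* bytes beyond the last aligned end */
	if (size + align > *end - start)
		return (0);		/* region is too small */
	*end -= align;			/* waste the unaligned tail */
	*end -= size;			/* carve out the allocation */
	return (*end);			/* naturally aligned to size */
}

int
main(void)
{
	uint64_t end = 0x23000;

	/* A 0x4000-byte allocation from [0x10000, 0x23000) returns 0x1c000. */
	printf("%#jx\n", (uintmax_t)alloc_from_end(0x10000, &end, 0x4000));
	return (0);
}
#endif
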
1975b7565d44SJeff Roberson void
1976b7565d44SJeff Roberson vm_phys_early_startup(void)
1977b7565d44SJeff Roberson {
197881302f1dSMark Johnston 	struct vm_phys_seg *seg;
1979b7565d44SJeff Roberson 	int i;
1980b7565d44SJeff Roberson 
1981b7565d44SJeff Roberson 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1982b7565d44SJeff Roberson 		phys_avail[i] = round_page(phys_avail[i]);
1983b7565d44SJeff Roberson 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
1984b7565d44SJeff Roberson 	}
1985b7565d44SJeff Roberson 
198681302f1dSMark Johnston 	for (i = 0; i < vm_phys_early_nsegs; i++) {
198781302f1dSMark Johnston 		seg = &vm_phys_early_segs[i];
198881302f1dSMark Johnston 		vm_phys_add_seg(seg->start, seg->end);
198981302f1dSMark Johnston 	}
199081302f1dSMark Johnston 	vm_phys_early_nsegs = -1;
199181302f1dSMark Johnston 
1992b7565d44SJeff Roberson #ifdef NUMA
1993b7565d44SJeff Roberson 	/* Force phys_avail to be split by domain. */
1994b7565d44SJeff Roberson 	if (mem_affinity != NULL) {
1995b7565d44SJeff Roberson 		int idx;
1996b7565d44SJeff Roberson 
1997b7565d44SJeff Roberson 		for (i = 0; mem_affinity[i].end != 0; i++) {
1998b7565d44SJeff Roberson 			idx = vm_phys_avail_find(mem_affinity[i].start);
1999b7565d44SJeff Roberson 			if (idx != -1 &&
2000b7565d44SJeff Roberson 			    phys_avail[idx] != mem_affinity[i].start)
2001b7565d44SJeff Roberson 				vm_phys_avail_split(mem_affinity[i].start, idx);
2002b7565d44SJeff Roberson 			idx = vm_phys_avail_find(mem_affinity[i].end);
2003b7565d44SJeff Roberson 			if (idx != -1 &&
2004b7565d44SJeff Roberson 			    phys_avail[idx] != mem_affinity[i].end)
2005b7565d44SJeff Roberson 				vm_phys_avail_split(mem_affinity[i].end, idx);
2006b7565d44SJeff Roberson 		}
2007b7565d44SJeff Roberson 	}
2008b7565d44SJeff Roberson #endif
2009b7565d44SJeff Roberson }
2010b7565d44SJeff Roberson 
201111752d88SAlan Cox #ifdef DDB
201211752d88SAlan Cox /*
201311752d88SAlan Cox  * Show the number of physical pages in each of the free lists.
201411752d88SAlan Cox  */
2015c84c5e00SMitchell Horne DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
201611752d88SAlan Cox {
201711752d88SAlan Cox 	struct vm_freelist *fl;
20187e226537SAttilio Rao 	int flind, oind, pind, dom;
201911752d88SAlan Cox 
20207e226537SAttilio Rao 	for (dom = 0; dom < vm_ndomains; dom++) {
20217e226537SAttilio Rao 		db_printf("DOMAIN: %d\n", dom);
202211752d88SAlan Cox 		for (flind = 0; flind < vm_nfreelists; flind++) {
202311752d88SAlan Cox 			db_printf("FREE LIST %d:\n"
202411752d88SAlan Cox 			    "\n  ORDER (SIZE)  |  NUMBER"
202511752d88SAlan Cox 			    "\n              ", flind);
202611752d88SAlan Cox 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
202711752d88SAlan Cox 				db_printf("  |  POOL %d", pind);
202811752d88SAlan Cox 			db_printf("\n--            ");
202911752d88SAlan Cox 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
203011752d88SAlan Cox 				db_printf("-- --      ");
203111752d88SAlan Cox 			db_printf("--\n");
203211752d88SAlan Cox 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
203311752d88SAlan Cox 				db_printf("  %2.2d (%6.6dK)", oind,
203411752d88SAlan Cox 				    1 << (PAGE_SHIFT - 10 + oind));
203511752d88SAlan Cox 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
20367e226537SAttilio Rao 				fl = vm_phys_free_queues[dom][flind][pind];
203711752d88SAlan Cox 					db_printf("  |  %6.6d", fl[oind].lcnt);
203811752d88SAlan Cox 				}
203911752d88SAlan Cox 				db_printf("\n");
204011752d88SAlan Cox 			}
204111752d88SAlan Cox 			db_printf("\n");
204211752d88SAlan Cox 		}
20437e226537SAttilio Rao 		db_printf("\n");
20447e226537SAttilio Rao 	}
204511752d88SAlan Cox }
204611752d88SAlan Cox #endif
2047