xref: /freebsd/sys/vm/vm_phys.c (revision 32e77bcdec5c034a9252876aa018f0bf34b36dbc)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/tslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;

static int numa_disabled;
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NUMA options");
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
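/*
 * For example (hypothetical values), a machine with a reserved hole below
 * 1MB might boot with
 *
 *	phys_avail[] = { 0x1000, 0x9f000, 0x100000, 0x20000000, 0, 0 };
 *
 * where each pair is a [start, end) extent of usable memory and the
 * trailing pair of zeroes terminates the list.
 */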
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
static int __read_mostly vm_default_freepool;

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int pool, int tail);

static bool __diagused
vm_phys_pool_valid(int pool)
{
#ifdef VM_FREEPOOL_LAZYINIT
	if (pool == VM_FREEPOOL_LAZYINIT)
		return (false);
#endif
	return (pool >= 0 && pool < VM_NFREEPOOL);
}

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}
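
/*
 * A point lookup is thus performed by passing a key whose "end" is zero,
 * following the convention used elsewhere in this file, e.g.:
 *
 *	tmp.start = pa;
 *	tmp.end = 0;
 *	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 */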

int
vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
    vm_paddr_t high __numa_used)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match:  Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif
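
/*
 * Example output (hypothetical two-domain system reporting SLIT-style
 * distances):
 *
 *	# sysctl vm.phys_locality
 *	vm.phys_locality:
 *	0: 10 21
 *	1: 21 10
 */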

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int pool,
    int tail)
{

	m->order = order;
	m->pool = pool;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	if (!(0 <= domain && domain < vm_ndomains))
		panic("%s: Invalid domain %d ('vm_ndomains' is %d)",
		    __func__, domain, vm_ndomains);
	if (vm_phys_nsegs >= VM_PHYSSEG_MAX)
		panic("Not enough storage for physical segments, "
		    "increase VM_PHYSSEG_MAX");

	seg = &vm_phys_segs[vm_phys_nsegs++];
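	/*
	 * Insertion sort: shift any existing segments whose start address
	 * is at or above "end" up by one slot, so that vm_phys_segs[]
	 * remains sorted by start address.
	 */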
	while (seg > vm_phys_segs && seg[-1].start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
	if (seg != vm_phys_segs && seg[-1].end > start)
		panic("Overlapping physical segments: Current [%#jx,%#jx) "
		    "at index %zu, previous [%#jx,%#jx)",
		    (uintmax_t)start, (uintmax_t)end, seg - vm_phys_segs,
		    (uintmax_t)seg[-1].start, (uintmax_t)seg[-1].end);
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	if ((start & PAGE_MASK) != 0)
		panic("%s: start (%jx) is not page aligned", __func__,
		    (uintmax_t)start);
	if ((end & PAGE_MASK) != 0)
		panic("%s: end (%jx) is not page aligned", __func__,
		    (uintmax_t)end);
	if (start > end)
		panic("%s: start (%jx) > end (%jx)!", __func__,
		    (uintmax_t)start, (uintmax_t)end);

	if (start == end)
		return;

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef	VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef	VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef	VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef	VM_FREELIST_DMA32
		if (
#ifdef	VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef	VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef	VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

#ifdef VM_FREEPOOL_LAZYINIT
	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
#else
	vm_default_freepool = VM_FREEPOOL_DEFAULT;
#endif

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains __numa_used,
    struct mem_affinity *affinity __numa_used, int *locality __numa_used)
{
#ifdef NUMA
	int i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
	if (numa_disabled)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective is to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
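 *
 * For example (hypothetical request), splitting an order-3 block to satisfy
 * an order-0 allocation enqueues the unused buddies of orders 2, 1, and 0,
 * leaving the first page for the caller.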
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int pool, int tail)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, pool, tail);
	}
}

static void
vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int pool,
    int tail)
{
	KASSERT(order >= 0 && order < VM_NFREEORDER,
	    ("%s: invalid order %d", __func__, order));

	vm_freelist_add(fl, m, order, pool, tail);
#ifdef VM_FREEPOOL_LAZYINIT
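	/*
	 * For the lazy-init pool, only the first page of each free block is
	 * guaranteed to have an initialized page structure.  Initialize the
	 * page immediately following this chunk so that the remainder of
	 * the block it was carved from keeps that invariant.
	 */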
	if (__predict_false(pool == VM_FREEPOOL_LAZYINIT)) {
		vm_page_t m_next;
		vm_paddr_t pa;
		int npages;

		npages = 1 << order;
		m_next = m + npages;
		pa = m->phys_addr + ptoa(npages);
		if (pa < vm_phys_segs[m->segind].end) {
			vm_page_init_page(m_next, pa, m->segind,
			    VM_FREEPOOL_LAZYINIT);
		}
	}
#endif
}

/*
 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
 * aligned and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
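 *
 * For example (hypothetical call), npages = 11 is enqueued as blocks of 8,
 * 2, and 1 pages (orders 3, 1, and 0), taken from the front of the range.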
 */
static void
vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
    int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    (VM_PAGE_TO_PHYS(m) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("%s: page %p and npages %u are misaligned",
	    __func__, m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("%s: page %p has unexpected order %d",
		    __func__, m, m->order));
		order = ilog2(npages);
		KASSERT(order < VM_NFREEORDER,
		    ("%s: order %d is out of range", __func__, order));
		vm_phys_enq_chunk(fl, m, order, pool, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * If npages is zero, this function does nothing and ignores the physical page
 * parameter m.  Otherwise, the physical page m's buddy must not be free.
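 *
 * For example (hypothetical call), npages = 11 is enqueued as blocks of 1,
 * 2, and 8 pages (orders 0, 1, and 3), smallest first, taken from the
 * front of a range whose end is power-of-two aligned.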
 */
static vm_page_t
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
    int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		vm_phys_enq_chunk(fl, m, order, pool, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
	return (m);
}

/*
 * Complete initialization of a contiguous, power of two-sized set of physical
 * pages.
 *
 * If the pages currently belong to the lazy init pool, then the corresponding
 * page structures must be initialized.  In this case it is assumed that the
 * first page in the run has already been initialized.
 */
static void
vm_phys_finish_init(vm_page_t m, int order)
{
#ifdef VM_FREEPOOL_LAZYINIT
	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
		vm_paddr_t pa;
		int segind;

		TSENTER();
		pa = m->phys_addr + PAGE_SIZE;
		segind = m->segind;
		for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
		    m_tmp++, pa += PAGE_SIZE)
			vm_page_init_page(m_tmp, pa, segind, VM_NFREEPOOL);
		TSEXIT();
	}
#endif
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().  The allocated pages have no
 * valid pool field set.
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(vm_phys_pool_valid(pool),
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = i + (1 << oind);
				end = imin(npages, avail);
				while (i < end)
					ma[i++] = m++;
				if (i == npages) {
					/*
					 * Return excess pages to fl.  Its order
					 * [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - i, fl,
					    pool, 1);
					return (npages);
				}
			}
		}
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
			    pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					vm_phys_finish_init(m, oind);
					avail = i + (1 << oind);
					end = imin(npages, avail);
					while (i < end)
						ma[i++] = m++;
					if (i == npages) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail - i,
						    fl, pool, 1);
						return (npages);
					}
				}
			}
		}
	}
	return (i);
}
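
/*
 * Usage sketch (hypothetical caller, with the domain's free queues locked):
 *
 *	vm_page_t ma[8];
 *	int got;
 *
 *	got = vm_phys_alloc_npages(domain, VM_FREEPOOL_DEFAULT, 8, ma);
 *	if (got < 8)
 *		(free ma[0 .. got - 1] and retry, or fail)
 */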

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
static vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind, flind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(vm_phys_pool_valid(pool),
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	flind = vm_freelist_to_flind[freelist];
	/* Check if freelist is present */
	if (flind < 0)
		return (NULL);

	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			/* The order [order, oind) queues are empty. */
			vm_phys_split_pages(m, oind, fl, order, pool, 1);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_finish_init(m, oind);
				/* The order [order, oind) queues are empty. */
				vm_phys_split_pages(m, oind, fl, order, pool, 1);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int freelist;

	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address, which must lie
 * within the given physical memory segment.
 */
vm_page_t
vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
{
	KASSERT(pa >= seg->start && pa < seg->end,
	    ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));

	return (&seg->first_page[atop(pa - seg->start)]);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;

	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
		return (vm_phys_seg_paddr_to_vm_page(seg, pa));
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}
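
/*
 * Usage sketch (hypothetical driver exposing a device aperture as
 * fictitious, write-combined memory):
 *
 *	error = vm_phys_fictitious_reg_range(bar_base, bar_base + bar_size,
 *	    VM_MEMATTR_WRITE_COMBINING);
 *	if (error != 0)
 *		return (error);
 */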

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}

/*
 * Free a contiguous, power of two-sized set of physical pages to the
 * specified pool.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int pool, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("%s: page %p has unexpected order %d",
	    __func__, m, m->order));
	KASSERT(vm_phys_pool_valid(pool),
	    ("%s: unexpected pool param %d", __func__, pool));
	KASSERT(order < VM_NFREEORDER,
	    ("%s: order %d is out of range", __func__, order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
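		/*
		 * Coalesce with free buddies of increasing order for as
		 * long as possible.  A buddy's physical address is computed
		 * by flipping the single order bit of this block's address.
		 */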
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			vm_phys_finish_init(m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = vm_phys_seg_paddr_to_vm_page(seg, pa);
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[pool];
	vm_freelist_add(fl, m, order, pool, 1);
}

#ifdef VM_FREEPOOL_LAZYINIT
/*
 * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
 * them to the default pool.  This is a prerequisite for some rare operations
 * which need to scan the page array and thus depend on all pages being
 * initialized.
 */
static void
vm_phys_lazy_init_domain(int domain, bool locked)
{
	static bool initdone[MAXMEMDOM];
	struct vm_domain *vmd;
	struct vm_freelist *fl;
	vm_page_t m;
	int pind;
	bool unlocked;

	if (__predict_true(atomic_load_bool(&initdone[domain])))
		return;

	vmd = VM_DOMAIN(domain);
	if (locked)
		vm_domain_free_assert_locked(vmd);
	else
		vm_domain_free_lock(vmd);
	if (atomic_load_bool(&initdone[domain]))
		goto out;
	pind = VM_FREEPOOL_LAZYINIT;
	for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
		int flind;

		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pind];
		for (int oind = 0; oind < VM_NFREEORDER; oind++) {
			if (atomic_load_int(&fl[oind].lcnt) == 0)
				continue;
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				/*
				 * Avoid holding the lock across the
				 * initialization unless there's a free page
				 * shortage.
				 */
				vm_freelist_rem(fl, m, oind);
				unlocked = vm_domain_allocate(vmd,
				    VM_ALLOC_NORMAL, 1 << oind);
				if (unlocked)
					vm_domain_free_unlock(vmd);
				vm_phys_finish_init(m, oind);
				if (unlocked) {
					vm_domain_freecnt_inc(vmd, 1 << oind);
					vm_domain_free_lock(vmd);
				}
				vm_phys_free_pages(m, VM_FREEPOOL_DEFAULT,
				    oind);
			}
		}
	}
	atomic_store_bool(&initdone[domain], true);
out:
	if (!locked)
		vm_domain_free_unlock(vmd);
}

static void
vm_phys_lazy_init(void)
{
	for (int domain = 0; domain < vm_ndomains; domain++)
		vm_phys_lazy_init_domain(domain, false);
	atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
}

static void
vm_phys_lazy_init_kthr(void *arg __unused)
{
	vm_phys_lazy_init();
	kthread_exit();
}

static void
vm_phys_lazy_sysinit(void *arg __unused)
{
	struct thread *td;
	int error;

	error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
	    RFSTOPPED, 0, "vmlazyinit");
	if (error == 0) {
		thread_lock(td);
		sched_prio(td, PRI_MIN_IDLE);
		sched_add(td, SRQ_BORING);
	} else {
		printf("%s: could not create lazy init thread: %d\n",
		    __func__, error);
		vm_phys_lazy_init();
	}
}
SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
    NULL);
#endif /* VM_FREEPOOL_LAZYINIT */

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.  Assumes no pages have a valid pool field.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_page_t m_end;
	vm_paddr_t diff, lo;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
	seg = &vm_phys_segs[m->segind];
	fl = (*seg->free_queues)[pool];
	m_end = m + npages;
	/* Free blocks of increasing size. */
	lo = atop(VM_PAGE_TO_PHYS(m));
	if (m < m_end &&
	    (diff = lo ^ (lo + npages - 1)) != 0) {
		order = min(ilog2(diff), VM_NFREEORDER - 1);
		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl,
		    pool, 1);
	}

	/* Free blocks of maximum size. */
	order = VM_NFREEORDER - 1;
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_phys_enq_chunk(fl, m, order, pool, 1);
		m += 1 << order;
	}
	/* Free blocks of diminishing size. */
	vm_phys_enq_beg(m, m_end - m, fl, pool, 1);
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages to the
 * specified pool.  Assumes that no page has a valid pool field set.
 *
 * The free page queues must be locked.
 */
1417 void
vm_phys_free_contig(vm_page_t m,int pool,u_long npages)1418 vm_phys_free_contig(vm_page_t m, int pool, u_long npages)
1419 {
1420 	vm_paddr_t lo;
1421 	vm_page_t m_start, m_end;
1422 	unsigned max_order, order_start, order_end;
1423 
1424 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1425 
1426 	lo = atop(VM_PAGE_TO_PHYS(m));
1427 	max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);
1428 
1429 	m_start = m;
1430 	order_start = ffsll(lo) - 1;
1431 	if (order_start < max_order)
1432 		m_start += 1 << order_start;
1433 	m_end = m + npages;
1434 	order_end = ffsll(lo + npages) - 1;
1435 	if (order_end < max_order)
1436 		m_end -= 1 << order_end;
1437 	/*
1438 	 * Avoid unnecessary coalescing by freeing the pages at the start and
1439 	 * end of the range last.
1440 	 */
1441 	if (m_start < m_end)
1442 		vm_phys_enqueue_contig(m_start, pool, m_end - m_start);
1443 	if (order_start < max_order)
1444 		vm_phys_free_pages(m, pool, order_start);
1445 	if (order_end < max_order)
1446 		vm_phys_free_pages(m_end, pool, order_end);
1447 }
1448 
1449 /*
1450  * Identify the first address range within segment segind or greater
1451  * that matches the domain, lies within the low/high range, and has
1452  * enough pages.  Return -1 if there is none.
1453  */
1454 int
vm_phys_find_range(vm_page_t bounds[],int segind,int domain,u_long npages,vm_paddr_t low,vm_paddr_t high)1455 vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
1456     u_long npages, vm_paddr_t low, vm_paddr_t high)
1457 {
1458 	vm_paddr_t pa_end, pa_start;
1459 	struct vm_phys_seg *end_seg, *seg;
1460 
1461 	KASSERT(npages > 0, ("npages is zero"));
1462 	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
1463 	end_seg = &vm_phys_segs[vm_phys_nsegs];
1464 	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
1465 		if (seg->domain != domain)
1466 			continue;
1467 		if (seg->start >= high)
1468 			return (-1);
1469 		pa_start = MAX(low, seg->start);
1470 		pa_end = MIN(high, seg->end);
1471 		if (pa_end - pa_start < ptoa(npages))
1472 			continue;
1473 #ifdef VM_FREEPOOL_LAZYINIT
1474 		/*
1475 		 * The pages on the free lists must be initialized.
1476 		 */
1477 		vm_phys_lazy_init_domain(domain, false);
1478 #endif
1479 		bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
1480 		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
1481 		return (seg - vm_phys_segs);
1482 	}
1483 	return (-1);
1484 }
1485 
1486 /*
1487  * Search for the given physical page "m" in the free lists.  If the search
1488  * succeeds, remove "m" from the free lists and return true.  Otherwise, return
1489  * false, indicating that "m" is not in the free lists.
1490  *
1491  * The free page queues must be locked.
1492  */
1493 bool
vm_phys_unfree_page(vm_paddr_t pa)1494 vm_phys_unfree_page(vm_paddr_t pa)
1495 {
1496 	struct vm_freelist *fl;
1497 	struct vm_phys_seg *seg;
1498 	vm_paddr_t pa_half;
1499 	vm_page_t m, m_set, m_tmp;
1500 	int order, pool;
1501 
1502 	seg = vm_phys_paddr_to_seg(pa);
1503 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1504 
1505 #ifdef VM_FREEPOOL_LAZYINIT
1506 	/*
1507 	 * The pages on the free lists must be initialized.
1508 	 */
1509 	vm_phys_lazy_init_domain(seg->domain, true);
1510 #endif
1511 
1512 	/*
1513 	 * First, find the contiguous, power of two-sized set of free
1514 	 * physical pages containing the given physical page "m" and
1515 	 * assign it to "m_set".
1516 	 */
1517 	m = vm_phys_paddr_to_vm_page(pa);
1518 	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1519 	    order < VM_NFREEORDER - 1; ) {
1520 		order++;
1521 		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
1522 		if (pa >= seg->start)
1523 			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
1524 		else
1525 			return (false);
1526 	}
1527 	if (m_set->order < order)
1528 		return (false);
1529 	if (m_set->order == VM_NFREEORDER)
1530 		return (false);
1531 	KASSERT(m_set->order < VM_NFREEORDER,
1532 	    ("vm_phys_unfree_page: page %p has unexpected order %d",
1533 	    m_set, m_set->order));
1534 
1535 	/*
1536 	 * Next, remove "m_set" from the free lists.  Finally, extract
1537 	 * "m" from "m_set" using an iterative algorithm: While "m_set"
1538 	 * is larger than a page, shrink "m_set" by returning the half
1539 	 * of "m_set" that does not contain "m" to the free lists.
1540 	 */
1541 	pool = m_set->pool;
1542 	fl = (*seg->free_queues)[pool];
1543 	order = m_set->order;
1544 	vm_freelist_rem(fl, m_set, order);
1545 	while (order > 0) {
1546 		order--;
1547 		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
1548 		if (m->phys_addr < pa_half)
1549 			m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
1550 		else {
1551 			m_tmp = m_set;
1552 			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
1553 		}
1554 		vm_freelist_add(fl, m_tmp, order, pool, 0);
1555 	}
1556 	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
1557 	return (true);
1558 }
1559 
1560 /*
1561  * Find a run of contiguous physical pages, meeting alignment requirements, from
1562  * a list of max-sized page blocks, where we need at least two consecutive
1563  * blocks to satisfy the (large) page request.
1564  */
1565 static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist * fl,u_long npages,vm_paddr_t low,vm_paddr_t high,u_long alignment,vm_paddr_t boundary)1566 vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
1567     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1568 {
1569 	struct vm_phys_seg *seg;
1570 	vm_page_t m, m_iter, m_ret;
1571 	vm_paddr_t max_size, size;
1572 	int max_order;
1573 
1574 	max_order = VM_NFREEORDER - 1;
1575 	size = npages << PAGE_SHIFT;
1576 	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
1577 	KASSERT(size > max_size, ("size is too small"));
1578 
1579 	/*
1580 	 * In order to avoid examining any free max-sized page block more than
1581 	 * twice, identify the ones that are first in a physically-contiguous
1582 	 * sequence of such blocks, and only for those walk the sequence to
1583 	 * check if there are enough free blocks starting at a properly aligned
1584 	 * block.  Thus, no block is checked for free-ness more than twice.
1585 	 */
1586 	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
1587 		/*
1588 		 * Skip m unless it is first in a sequence of free max page
1589 		 * blocks >= low in its segment.
1590 		 */
1591 		seg = &vm_phys_segs[m->segind];
1592 		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
1593 			continue;
1594 		if (VM_PAGE_TO_PHYS(m) >= max_size &&
1595 		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
1596 		    max_order == m[-1 << max_order].order)
1597 			continue;
1598 
1599 		/*
1600 		 * Advance m_ret from m to the first of the sequence, if any,
1601 		 * that satisfies alignment conditions and might leave enough
1602 		 * space.
1603 		 */
1604 		m_ret = m;
1605 		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
1606 		    size, alignment, boundary) &&
1607 		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
1608 		    max_order == m_ret[1 << max_order].order)
1609 			m_ret += 1 << max_order;
1610 
1611 		/*
1612 		 * Skip m unless some block m_ret in the sequence is properly
1613 		 * aligned, and begins a sequence of enough pages less than
1614 		 * high, and in the same segment.
1615 		 */
1616 		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
1617 			continue;
1618 
1619 		/*
1620 		 * Skip m unless the blocks to allocate starting at m_ret are
1621 		 * all free.
1622 		 */
1623 		for (m_iter = m_ret;
1624 		    m_iter < m_ret + npages && max_order == m_iter->order;
1625 		    m_iter += 1 << max_order) {
1626 		}
1627 		if (m_iter < m_ret + npages)
1628 			continue;
1629 		return (m_ret);
1630 	}
1631 	return (NULL);
1632 }
1633 
1634 /*
1635  * Find a run of contiguous physical pages from the specified free list
1636  * table.
1637  */
1638 static vm_page_t
vm_phys_find_queues_contig(struct vm_freelist (* queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],u_long npages,vm_paddr_t low,vm_paddr_t high,u_long alignment,vm_paddr_t boundary)1639 vm_phys_find_queues_contig(
1640     struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
1641     u_long npages, vm_paddr_t low, vm_paddr_t high,
1642     u_long alignment, vm_paddr_t boundary)
1643 {
1644 	struct vm_freelist *fl;
1645 	vm_page_t m_ret;
1646 	vm_paddr_t pa, pa_end, size;
1647 	int oind, order, pind;
1648 
1649 	KASSERT(npages > 0, ("npages is 0"));
1650 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1651 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1652 	/* Compute the queue that is the best fit for npages. */
1653 	order = flsl(npages - 1);
1654 	/* Search for a large enough free block. */
1655 	size = npages << PAGE_SHIFT;
1656 	for (oind = order; oind < VM_NFREEORDER; oind++) {
1657 		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1658 			fl = (*queues)[pind];
1659 			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
1660 				/*
1661 				 * Determine if the address range starting at pa
1662 				 * is within the given range, satisfies the
1663 				 * given alignment, and does not cross the given
1664 				 * boundary.
1665 				 */
1666 				pa = VM_PAGE_TO_PHYS(m_ret);
1667 				pa_end = pa + size;
1668 				if (low <= pa && pa_end <= high &&
1669 				    vm_addr_ok(pa, size, alignment, boundary))
1670 					return (m_ret);
1671 			}
1672 		}
1673 	}
1674 	if (order < VM_NFREEORDER)
1675 		return (NULL);
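	/*
	 * Reaching here means order >= VM_NFREEORDER, so npages exceeds the
	 * largest buddy block and any run must span several max-order blocks.
	 */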
1676 	/* Search for a long-enough sequence of max-order blocks. */
1677 	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1678 		fl = (*queues)[pind];
1679 		m_ret = vm_phys_find_freelist_contig(fl, npages,
1680 		    low, high, alignment, boundary);
1681 		if (m_ret != NULL)
1682 			return (m_ret);
1683 	}
1684 	return (NULL);
1685 }
1686 
1687 /*
1688  * Allocate a contiguous set of physical pages of the given size
1689  * "npages" from the free lists.  All of the physical pages must be at
1690  * or above the given physical address "low" and below the given
1691  * physical address "high".  The given value "alignment" determines the
1692  * alignment of the first physical page in the set.  If the given value
1693  * "boundary" is non-zero, then the set of physical pages cannot cross
1694  * any physical address boundary that is a multiple of that value.  Both
1695  * "alignment" and "boundary" must be a power of two.  Sets the pool
1696  * field to DEFAULT in the first allocated page.
1697  */
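/*
 * Example (hypothetical values, for illustration only): with the domain free
 * lock held, request a 16-page run below 4GB, aligned to 64KB and not
 * crossing a 64KB boundary:
 *
 *	m = vm_phys_alloc_contig(0, 16, 0, (vm_paddr_t)1 << 32,
 *	    64 * 1024, 64 * 1024);
 *	if (m == NULL)
 *		(fall back, e.g., retry in another domain)
 */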
1698 vm_page_t
1699 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
1700     u_long alignment, vm_paddr_t boundary)
1701 {
1702 	vm_paddr_t pa_end, pa_start;
1703 	struct vm_freelist *fl;
1704 	vm_page_t m, m_run;
1705 	struct vm_phys_seg *seg;
1706 	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
1707 	int oind, segind;
1708 
1709 	KASSERT(npages > 0, ("npages is 0"));
1710 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1711 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1712 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
1713 	if (low >= high)
1714 		return (NULL);
1715 	queues = NULL;
1716 	m_run = NULL;
1717 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1718 		seg = &vm_phys_segs[segind];
1719 		if (seg->start >= high || seg->domain != domain)
1720 			continue;
1721 		if (low >= seg->end)
1722 			break;
1723 		if (low <= seg->start)
1724 			pa_start = seg->start;
1725 		else
1726 			pa_start = low;
1727 		if (high < seg->end)
1728 			pa_end = high;
1729 		else
1730 			pa_end = seg->end;
1731 		if (pa_end - pa_start < ptoa(npages))
1732 			continue;
1733 		/*
1734 		 * If a previous segment led to a search using the same
1735 		 * free lists as this segment would, then we have already
1736 		 * effectively searched within this segment too.  So skip
1737 		 * it.
1738 		 */
1739 		if (seg->free_queues == queues)
1740 			continue;
1741 		queues = seg->free_queues;
1742 		m_run = vm_phys_find_queues_contig(queues, npages,
1743 		    low, high, alignment, boundary);
1744 		if (m_run != NULL)
1745 			break;
1746 	}
1747 	if (m_run == NULL)
1748 		return (NULL);
1749 
1750 	/* Allocate pages from the page-range found. */
1751 	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
1752 		fl = (*queues)[m->pool];
1753 		oind = m->order;
1754 		vm_freelist_rem(fl, m, oind);
1755 		vm_phys_finish_init(m, oind);
1756 	}
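	/*
	 * The loop above removes whole buddy blocks of 1 << oind pages at a
	 * time, so m may now point past &m_run[npages].
	 */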
1757 	/* Return excess pages to the free lists. */
1758 	fl = (*queues)[VM_FREEPOOL_DEFAULT];
1759 	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl,
1760 	    VM_FREEPOOL_DEFAULT, 0);
1761 
1762 	/* Return page verified to satisfy conditions of request. */
1763 	pa_start = VM_PAGE_TO_PHYS(m_run);
1764 	KASSERT(low <= pa_start,
1765 	    ("memory allocated below minimum requested range"));
1766 	KASSERT(pa_start + ptoa(npages) <= high,
1767 	    ("memory allocated above maximum requested range"));
1768 	seg = &vm_phys_segs[m_run->segind];
1769 	KASSERT(seg->domain == domain,
1770 	    ("memory not allocated from specified domain"));
1771 	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
1772 	    ("memory alignment/boundary constraints not satisfied"));
1773 	return (m_run);
1774 }
1775 
1776 /*
1777  * Return the index of the first unused slot, which may be the terminating
1778  * entry.
1779  */
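/*
 * phys_avail[] holds (start, end) address pairs and is terminated by a pair
 * of zero entries, e.g. { 0x1000, 0x9f000, 0x100000, 0x7ffff000, 0, 0 }
 * (hypothetical values, for illustration).
 */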
1780 static int
1781 vm_phys_avail_count(void)
1782 {
1783 	int i;
1784 
1785 	for (i = 0; i < PHYS_AVAIL_COUNT; i += 2)
1786 		if (phys_avail[i] == 0 && phys_avail[i + 1] == 0)
1787 			return (i);
1788 	panic("Improperly terminated phys_avail[]");
1789 }
1790 
1791 /*
1792  * Assert that a phys_avail entry is valid.
1793  */
1794 static void
1795 vm_phys_avail_check(int i)
1796 {
1797 	if (i % 2 != 0)
1798 		panic("Chunk start index %d is not even.", i);
1799 	if (phys_avail[i] & PAGE_MASK)
1800 		panic("Unaligned phys_avail[%d]: %#jx", i,
1801 		    (intmax_t)phys_avail[i]);
1802 	if (phys_avail[i + 1] & PAGE_MASK)
1803 		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
1804 		    (intmax_t)phys_avail[i + 1]);
1805 	if (phys_avail[i + 1] < phys_avail[i])
1806 		panic("phys_avail[%d]: start %#jx > end %#jx", i,
1807 		    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i + 1]);
1808 }
1809 
1810 /*
1811  * Return the index of an overlapping phys_avail entry or -1.
1812  */
1813 #ifdef NUMA
1814 static int
1815 vm_phys_avail_find(vm_paddr_t pa)
1816 {
1817 	int i;
1818 
1819 	for (i = 0; phys_avail[i + 1]; i += 2)
1820 		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
1821 			return (i);
1822 	return (-1);
1823 }
1824 #endif
1825 
1826 /*
1827  * Return the index of the largest entry.
1828  */
1829 int
1830 vm_phys_avail_largest(void)
1831 {
1832 	vm_paddr_t sz, largesz;
1833 	int largest;
1834 	int i;
1835 
1836 	largest = 0;
1837 	largesz = 0;
1838 	for (i = 0; phys_avail[i + 1]; i += 2) {
1839 		sz = vm_phys_avail_size(i);
1840 		if (sz > largesz) {
1841 			largesz = sz;
1842 			largest = i;
1843 		}
1844 	}
1845 
1846 	return (largest);
1847 }
1848 
1849 vm_paddr_t
1850 vm_phys_avail_size(int i)
1851 {
1852 
1853 	return (phys_avail[i + 1] - phys_avail[i]);
1854 }
1855 
1856 /*
1857  * Split a chunk in phys_avail[] at the address 'pa'.
1858  *
1859  * 'pa' must be within a chunk (slots i and i + 1) or one of its boundaries.
1860  * Returns zero on actual split, in which case the two new chunks occupy slots
1861  * i to i + 3, else EJUSTRETURN if 'pa' was one of the boundaries (and no split
1862  * actually occurred) else ENOSPC if there are not enough slots in phys_avail[]
1863  * to represent the additional chunk caused by the split.
1864  */
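/*
 * Example (hypothetical values): splitting the chunk { 0x1000, 0x8000 } held
 * in slots i and i + 1 at pa = 0x4000 shifts the later entries up by two
 * slots, leaving { 0x1000, 0x4000 } in slots i and i + 1 and
 * { 0x4000, 0x8000 } in slots i + 2 and i + 3.
 */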
1865 static int
1866 vm_phys_avail_split(vm_paddr_t pa, int i)
1867 {
1868 	int cnt;
1869 
1870 	vm_phys_avail_check(i);
1871 	if (pa < phys_avail[i] || pa > phys_avail[i + 1])
1872 		panic("%s: Address %#jx not in range at slot %d [%#jx;%#jx].",
1873 		    __func__, (uintmax_t)pa, i,
1874 		    (uintmax_t)phys_avail[i], (uintmax_t)phys_avail[i + 1]);
1875 	if (pa == phys_avail[i] || pa == phys_avail[i + 1])
1876 		return (EJUSTRETURN);
1877 	cnt = vm_phys_avail_count();
1878 	if (cnt >= PHYS_AVAIL_ENTRIES)
1879 		return (ENOSPC);
1880 	memmove(&phys_avail[i + 2], &phys_avail[i],
1881 	    (cnt - i) * sizeof(phys_avail[0]));
1882 	phys_avail[i + 1] = pa;
1883 	phys_avail[i + 2] = pa;
1884 	vm_phys_avail_check(i);
1885 	vm_phys_avail_check(i + 2);
1886 
1887 	return (0);
1888 }
1889 
1890 /*
1891  * Check if a given physical address can be included as part of a crash dump.
1892  */
1893 bool
1894 vm_phys_is_dumpable(vm_paddr_t pa)
1895 {
1896 	vm_page_t m;
1897 	int i;
1898 
1899 	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
1900 		return ((m->flags & PG_NODUMP) == 0);
1901 
1902 	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
1903 		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
1904 			return (true);
1905 	}
1906 	return (false);
1907 }
1908 
1909 void
1910 vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
1911 {
1912 	struct vm_phys_seg *seg;
1913 
1914 	if (vm_phys_early_nsegs == -1)
1915 		panic("%s: called after initialization", __func__);
1916 	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
1917 		panic("%s: ran out of early segments", __func__);
1918 
1919 	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
1920 	seg->start = start;
1921 	seg->end = end;
1922 }
1923 
1924 /*
1925  * This routine allocates NUMA node specific memory before the page
1926  * allocator is bootstrapped.
1927  */
1928 vm_paddr_t
1929 vm_phys_early_alloc(int domain, size_t alloc_size)
1930 {
1931 #ifdef NUMA
1932 	int mem_index;
1933 #endif
1934 	int i, biggestone;
1935 	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;
1936 
1937 	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
1938 	    ("%s: invalid domain index %d", __func__, domain));
1939 
1940 	/*
1941 	 * Search the mem_affinity array for the biggest address
1942 	 * range in the desired domain.  This is used to constrain
1943 	 * the phys_avail selection below.
1944 	 */
1945 	biggestsize = 0;
1946 	mem_start = 0;
1947 	mem_end = -1;
1948 #ifdef NUMA
1949 	mem_index = 0;
1950 	if (mem_affinity != NULL) {
1951 		for (i = 0;; i++) {
1952 			size = mem_affinity[i].end - mem_affinity[i].start;
1953 			if (size == 0)
1954 				break;
1955 			if (domain != -1 && mem_affinity[i].domain != domain)
1956 				continue;
1957 			if (size > biggestsize) {
1958 				mem_index = i;
1959 				biggestsize = size;
1960 			}
1961 		}
1962 		mem_start = mem_affinity[mem_index].start;
1963 		mem_end = mem_affinity[mem_index].end;
1964 	}
1965 #endif
1966 
1967 	/*
1968 	 * Now find the biggest physical segment within the desired
1969 	 * NUMA domain.
1970 	 */
1971 	biggestsize = 0;
1972 	biggestone = 0;
1973 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1974 		/* skip regions that are out of range */
1975 		if (phys_avail[i + 1] - alloc_size < mem_start ||
1976 		    phys_avail[i + 1] > mem_end)
1977 			continue;
1978 		size = vm_phys_avail_size(i);
1979 		if (size > biggestsize) {
1980 			biggestone = i;
1981 			biggestsize = size;
1982 		}
1983 	}
1984 	alloc_size = round_page(alloc_size);
1985 
1986 	/*
1987 	 * Grab single pages from the front to reduce fragmentation.
1988 	 */
1989 	if (alloc_size == PAGE_SIZE) {
1990 		pa = phys_avail[biggestone];
1991 		phys_avail[biggestone] += PAGE_SIZE;
1992 		vm_phys_avail_check(biggestone);
1993 		return (pa);
1994 	}
1995 
1996 	/*
1997 	 * Naturally align large allocations.
1998 	 */
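	/*
	 * align is the chunk end's offset within an alloc_size-sized window;
	 * trimming it from the end makes the allocation, taken from the new
	 * end, start at an alloc_size-aligned address.
	 */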
1999 	align = phys_avail[biggestone + 1] & (alloc_size - 1);
2000 	if (alloc_size + align > biggestsize)
2001 		panic("cannot find a large enough size");
2002 	if (align != 0 &&
2003 	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
2004 	    biggestone) != 0)
2005 		/* Wasting memory. */
2006 		phys_avail[biggestone + 1] -= align;
2007 
2008 	phys_avail[biggestone + 1] -= alloc_size;
2009 	vm_phys_avail_check(biggestone);
2010 	pa = phys_avail[biggestone + 1];
2011 	return (pa);
2012 }
2013 
2014 void
2015 vm_phys_early_startup(void)
2016 {
2017 	struct vm_phys_seg *seg;
2018 	int i;
2019 
2020 	if (phys_avail[1] == 0)
2021 		panic("phys_avail[] is empty");
2022 
2023 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
2024 		phys_avail[i] = round_page(phys_avail[i]);
2025 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
2026 	}
2027 
2028 	for (i = 0; i < vm_phys_early_nsegs; i++) {
2029 		seg = &vm_phys_early_segs[i];
2030 		vm_phys_add_seg(seg->start, seg->end);
2031 	}
2032 	vm_phys_early_nsegs = -1;
2033 
2034 #ifdef NUMA
2035 	/* Force phys_avail to be split by domain. */
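	/*
	 * After these splits no phys_avail[] chunk straddles a mem_affinity
	 * boundary, so each chunk lies within a single domain (provided the
	 * splits did not run out of phys_avail[] slots).
	 */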
2036 	if (mem_affinity != NULL) {
2037 		int idx;
2038 
2039 		for (i = 0; mem_affinity[i].end != 0; i++) {
2040 			idx = vm_phys_avail_find(mem_affinity[i].start);
2041 			if (idx != -1)
2042 				vm_phys_avail_split(mem_affinity[i].start, idx);
2043 			idx = vm_phys_avail_find(mem_affinity[i].end);
2044 			if (idx != -1)
2045 				vm_phys_avail_split(mem_affinity[i].end, idx);
2046 		}
2047 	}
2048 #endif
2049 }
2050 
2051 #ifdef DDB
2052 /*
2053  * Show the number of physical pages in each of the free lists.
2054  */
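/*
 * Invoked from the ddb prompt as "show freepages"; prints a table of free
 * block counts per order and pool for each free list in each domain.
 */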
2055 DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
2056 {
2057 	struct vm_freelist *fl;
2058 	int flind, oind, pind, dom;
2059 
2060 	for (dom = 0; dom < vm_ndomains; dom++) {
2061 		db_printf("DOMAIN: %d\n", dom);
2062 		for (flind = 0; flind < vm_nfreelists; flind++) {
2063 			db_printf("FREE LIST %d:\n"
2064 			    "\n  ORDER (SIZE)  |  NUMBER"
2065 			    "\n              ", flind);
2066 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
2067 				db_printf("  |  POOL %d", pind);
2068 			db_printf("\n--            ");
2069 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
2070 				db_printf("-- --      ");
2071 			db_printf("--\n");
2072 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
2073 				db_printf("  %2.2d (%6.6dK)", oind,
2074 				    1 << (PAGE_SHIFT - 10 + oind));
2075 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
2076 					fl = vm_phys_free_queues[dom][flind][pind];
2077 					db_printf("  |  %6.6d", fl[oind].lcnt);
2078 				}
2079 				db_printf("\n");
2080 			}
2081 			db_printf("\n");
2082 		}
2083 		db_printf("\n");
2084 	}
2085 }
2086 #endif
2087