/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/tslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;

static int numa_disabled;
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NUMA options");
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
        RB_ENTRY(vm_phys_fictitious_seg) node;
        /* Memory region data */
        vm_paddr_t start;
        vm_paddr_t end;
        vm_page_t first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
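
/*
 * Illustrative sketch only (not taken from the original sources): on a
 * hypothetical machine with two usable RAM ranges, the array might be
 * populated as
 *
 *      phys_avail[0] = 0x0000000000001000;  phys_avail[1] = 0x000000007fe00000;
 *      phys_avail[2] = 0x0000000100000000;  phys_avail[3] = 0x0000000180000000;
 *      phys_avail[4] = 0;                   phys_avail[5] = 0;
 *
 * that is, page-aligned start/end pairs terminated by a pair of zeroes.  The
 * addresses above are made up for illustration.
 */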

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
static int __read_mostly vm_default_freepool;

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int pool, int tail);

static bool __diagused
vm_phys_pool_valid(int pool)
{
#ifdef VM_FREEPOOL_LAZYINIT
        if (pool == VM_FREEPOOL_LAZYINIT)
                return (false);
#endif
        return (pool >= 0 && pool < VM_NFREEPOOL);
}

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

        KASSERT(range->start != 0 && range->end != 0,
            ("Invalid range passed on search for vm_fictitious page"));
        if (p->start >= range->end)
                return (1);
        if (p->start < range->start)
                return (-1);

        return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

        /* Check if this is a search for a page */
        if (p1->end == 0)
                return (vm_phys_fictitious_in_range(p1, p2));

        KASSERT(p2->end != 0,
            ("Invalid range passed as second parameter to vm fictitious comparison"));

        /* Searching to add a new range */
        if (p1->end <= p2->start)
                return (-1);
        if (p1->start >= p2->end)
                return (1);

        panic("Trying to add overlapping vm fictitious ranges:\n"
            "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
            (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

int
vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
    vm_paddr_t high __numa_used)
{
#ifdef NUMA
        domainset_t mask;
        int i;

        if (vm_ndomains == 1 || mem_affinity == NULL)
                return (0);

        DOMAINSET_ZERO(&mask);
        /*
         * Check for any memory that overlaps low, high.
         */
        for (i = 0; mem_affinity[i].end != 0; i++)
                if (mem_affinity[i].start <= high &&
                    mem_affinity[i].end >= low)
                        DOMAINSET_SET(mem_affinity[i].domain, &mask);
        if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
                return (prefer);
        if (DOMAINSET_EMPTY(&mask))
                panic("vm_phys_domain_match: Impossible constraint");
        return (DOMAINSET_FFS(&mask) - 1);
#else
        return (0);
#endif
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        struct vm_freelist *fl;
        int dom, error, flind, oind, pind;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
        for (dom = 0; dom < vm_ndomains; dom++) {
                sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
                for (flind = 0; flind < vm_nfreelists; flind++) {
                        sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
                            "\n  ORDER (SIZE)  |  NUMBER"
                            "\n              ", flind);
                        for (pind = 0; pind < VM_NFREEPOOL; pind++)
                                sbuf_printf(&sbuf, "  |  POOL %d", pind);
                        sbuf_printf(&sbuf, "\n--            ");
                        for (pind = 0; pind < VM_NFREEPOOL; pind++)
                                sbuf_printf(&sbuf, "-- --      ");
                        sbuf_printf(&sbuf, "--\n");
                        for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
                                sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
                                    1 << (PAGE_SHIFT - 10 + oind));
                                for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                        fl = vm_phys_free_queues[dom][flind][pind];
                                        sbuf_printf(&sbuf, "  |  %6d",
                                            fl[oind].lcnt);
                                }
                                sbuf_printf(&sbuf, "\n");
                        }
                }
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        struct vm_phys_seg *seg;
        int error, segind;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
                seg = &vm_phys_segs[segind];
                sbuf_printf(&sbuf, "start:     %#jx\n",
                    (uintmax_t)seg->start);
                sbuf_printf(&sbuf, "end:       %#jx\n",
                    (uintmax_t)seg->end);
                sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
                sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
{

#ifdef NUMA
        if (mem_locality == NULL)
                return (-1);
        if (f >= vm_ndomains || t >= vm_ndomains)
                return (-1);
        return (mem_locality[f * vm_ndomains + t]);
#else
        return (-1);
#endif
}
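
/*
 * Illustrative note (not from the original source): with two domains,
 * vm.phys_locality prints a 2x2 matrix whose entry (i, j) is
 * mem_locality[i * vm_ndomains + j].  Using the conventional ACPI SLIT scale,
 * where 10 means local and larger values mean more distant, the output might
 * look like
 *
 *      0: 10 21
 *      1: 21 10
 *
 * The exact distances are platform-dependent; the values above are only an
 * example.
 */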

#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        int error, i, j;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

        sbuf_printf(&sbuf, "\n");

        for (i = 0; i < vm_ndomains; i++) {
                sbuf_printf(&sbuf, "%d: ", i);
                for (j = 0; j < vm_ndomains; j++) {
                        sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
                }
                sbuf_printf(&sbuf, "\n");
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int pool,
    int tail)
{
        /*
         * The paging queues and the free page lists utilize the same field,
         * plinks.q, within the vm_page structure.  When a physical page is
         * freed, it is lazily removed from the paging queues to reduce the
         * cost of removal through batching.  Here, we must ensure that any
         * deferred dequeue on the physical page has completed before using
         * its plinks.q field.
         */
        if (__predict_false(vm_page_astate_load(m).queue != PQ_NONE))
                vm_page_dequeue(m);

        m->order = order;
        m->pool = pool;
        if (tail)
                TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
        else
                TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
        fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

        TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
        fl[order].lcnt--;
        m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
        struct vm_phys_seg *seg;

        if (!(0 <= domain && domain < vm_ndomains))
                panic("%s: Invalid domain %d ('vm_ndomains' is %d)",
                    __func__, domain, vm_ndomains);
        if (vm_phys_nsegs >= VM_PHYSSEG_MAX)
                panic("Not enough storage for physical segments, "
                    "increase VM_PHYSSEG_MAX");

        seg = &vm_phys_segs[vm_phys_nsegs++];
        while (seg > vm_phys_segs && seg[-1].start >= end) {
                *seg = *(seg - 1);
                seg--;
        }
        seg->start = start;
        seg->end = end;
        seg->domain = domain;
        if (seg != vm_phys_segs && seg[-1].end > start)
                panic("Overlapping physical segments: Current [%#jx,%#jx) "
                    "at index %zu, previous [%#jx,%#jx)",
                    (uintmax_t)start, (uintmax_t)end, seg - vm_phys_segs,
                    (uintmax_t)seg[-1].start, (uintmax_t)seg[-1].end);
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
        int i;

        if (mem_affinity == NULL) {
                _vm_phys_create_seg(start, end, 0);
                return;
        }

        for (i = 0;; i++) {
                if (mem_affinity[i].end == 0)
                        panic("Reached end of affinity info");
                if (mem_affinity[i].end <= start)
                        continue;
                if (mem_affinity[i].start > start)
                        panic("No affinity info for start %jx",
                            (uintmax_t)start);
                if (mem_affinity[i].end >= end) {
                        _vm_phys_create_seg(start, end,
                            mem_affinity[i].domain);
                        break;
                }
                _vm_phys_create_seg(start, mem_affinity[i].end,
                    mem_affinity[i].domain);
                start = mem_affinity[i].end;
        }
#else
        _vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
        vm_paddr_t paddr;

        if ((start & PAGE_MASK) != 0)
                panic("%s: start (%jx) is not page aligned", __func__,
                    (uintmax_t)start);
        if ((end & PAGE_MASK) != 0)
                panic("%s: end (%jx) is not page aligned", __func__,
                    (uintmax_t)end);
        if (start > end)
                panic("%s: start (%jx) > end (%jx)!", __func__,
                    (uintmax_t)start, (uintmax_t)end);

        if (start == end)
                return;

        /*
         * Split the physical memory segment if it spans two or more free
         * list boundaries.
         */
        paddr = start;
#ifdef VM_FREELIST_LOWMEM
        if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
                vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
                paddr = VM_LOWMEM_BOUNDARY;
        }
#endif
#ifdef VM_FREELIST_DMA32
        if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
                vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
                paddr = VM_DMA32_BOUNDARY;
        }
#endif
        vm_phys_create_seg(paddr, end);
}
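
/*
 * Illustrative example (hypothetical addresses): on a platform that defines
 * VM_FREELIST_DMA32, a call such as
 *
 *      vm_phys_add_seg(0xc0000000, 0x140000000);
 *
 * spans the 4G boundary (VM_DMA32_BOUNDARY) and is therefore recorded as two
 * segments, [0xc0000000, 0x100000000) and [0x100000000, 0x140000000), so that
 * each segment maps to a single free list.
 */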

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
        u_long npages;
#endif
        int dom, flind, freelist, oind, pind, segind;

        /*
         * Compute the number of free lists, and generate the mapping from the
         * manifest constants VM_FREELIST_* to the free list indices.
         *
         * Initially, the entries of vm_freelist_to_flind[] are set to either
         * 0 or 1 to indicate which free lists should be created.
         */
#ifdef VM_DMA32_NPAGES_THRESHOLD
        npages = 0;
#endif
        for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
                seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_LOWMEM
                if (seg->end <= VM_LOWMEM_BOUNDARY)
                        vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
                else
#endif
#ifdef VM_FREELIST_DMA32
                if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
                    /*
                     * Create the DMA32 free list only if the amount of
                     * physical memory above physical address 4G exceeds the
                     * given threshold.
                     */
                    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
                    seg->end <= VM_DMA32_BOUNDARY)
                        vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
                else
#endif
                {
#ifdef VM_DMA32_NPAGES_THRESHOLD
                        npages += atop(seg->end - seg->start);
#endif
                        vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
                }
        }
        /* Change each entry into a running total of the free lists. */
        for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
                vm_freelist_to_flind[freelist] +=
                    vm_freelist_to_flind[freelist - 1];
        }
        vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
        KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
        /* Change each entry into a free list index. */
        for (freelist = 0; freelist < VM_NFREELIST; freelist++)
                vm_freelist_to_flind[freelist]--;

        /*
         * Initialize the first_page and free_queues fields of each physical
         * memory segment.
         */
#ifdef VM_PHYSSEG_SPARSE
        npages = 0;
#endif
        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
                seg->first_page = &vm_page_array[npages];
                npages += atop(seg->end - seg->start);
#else
                seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef VM_FREELIST_LOWMEM
                if (seg->end <= VM_LOWMEM_BOUNDARY) {
                        flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
                        KASSERT(flind >= 0,
                            ("vm_phys_init: LOWMEM flind < 0"));
                } else
#endif
#ifdef VM_FREELIST_DMA32
                if (seg->end <= VM_DMA32_BOUNDARY) {
                        flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
                        KASSERT(flind >= 0,
                            ("vm_phys_init: DMA32 flind < 0"));
                } else
#endif
                {
                        flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
                        KASSERT(flind >= 0,
                            ("vm_phys_init: DEFAULT flind < 0"));
                }
                seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
        }

        /*
         * Coalesce physical memory segments that are contiguous and share the
         * same per-domain free queues.
         */
        prev_seg = vm_phys_segs;
        seg = &vm_phys_segs[1];
        end_seg = &vm_phys_segs[vm_phys_nsegs];
        while (seg < end_seg) {
                if (prev_seg->end == seg->start &&
                    prev_seg->free_queues == seg->free_queues) {
                        prev_seg->end = seg->end;
                        KASSERT(prev_seg->domain == seg->domain,
                            ("vm_phys_init: free queues cannot span domains"));
                        vm_phys_nsegs--;
                        end_seg--;
                        for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
                                *tmp_seg = *(tmp_seg + 1);
                } else {
                        prev_seg = seg;
                        seg++;
                }
        }

        /*
         * Initialize the free queues.
         */
        for (dom = 0; dom < vm_ndomains; dom++) {
                for (flind = 0; flind < vm_nfreelists; flind++) {
                        for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                fl = vm_phys_free_queues[dom][flind][pind];
                                for (oind = 0; oind < VM_NFREEORDER; oind++)
                                        TAILQ_INIT(&fl[oind].pl);
                        }
                }
        }

#ifdef VM_FREEPOOL_LAZYINIT
        vm_default_freepool = VM_FREEPOOL_LAZYINIT;
#else
        vm_default_freepool = VM_FREEPOOL_DEFAULT;
#endif

        rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains __numa_used,
    struct mem_affinity *affinity __numa_used, int *locality __numa_used)
{
#ifdef NUMA
        int i;

        /*
         * For now the only override value that we support is 1, which
         * effectively disables NUMA-awareness in the allocators.
         */
        TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
        if (numa_disabled)
                ndomains = 1;

        if (ndomains > 1) {
                vm_ndomains = ndomains;
                mem_affinity = affinity;
                mem_locality = locality;
        }

        for (i = 0; i < vm_ndomains; i++)
                DOMAINSET_SET(i, &all_domains);
#endif
}
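
/*
 * Hedged usage sketch (not taken from any platform's sources): MD code that
 * has parsed its NUMA tables might register two domains roughly as follows,
 * where the table contents are hypothetical.
 *
 *      static struct mem_affinity affinity[] = {
 *              { .start = 0x000000000, .end = 0x080000000, .domain = 0 },
 *              { .start = 0x080000000, .end = 0x100000000, .domain = 1 },
 *              { .start = 0, .end = 0, .domain = 0 },
 *      };
 *      static int locality[2 * 2] = { 10, 21, 21, 10 };
 *
 *      vm_phys_register_domains(2, affinity, locality);
 *
 * This must happen before vm_phys_init() runs, as noted above.
 */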

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective being to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int pool, int tail)
{
        vm_page_t m_buddy;

        while (oind > order) {
                oind--;
                m_buddy = &m[1 << oind];
                KASSERT(m_buddy->order == VM_NFREEORDER,
                    ("vm_phys_split_pages: page %p has unexpected order %d",
                    m_buddy, m_buddy->order));
                vm_freelist_add(fl, m_buddy, oind, pool, tail);
        }
}
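
/*
 * Worked example (illustrative): if an order-3 block (8 pages) is used to
 * satisfy an order-1 request, vm_phys_split_pages(m, 3, fl, 1, ...) frees the
 * upper halves as buddies of decreasing size: pages [m+4, m+8) go back at
 * order 2 and pages [m+2, m+4) at order 1, leaving [m, m+2) as the order-1
 * allocation.
 */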

static void
vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int pool,
    int tail)
{
        KASSERT(order >= 0 && order < VM_NFREEORDER,
            ("%s: invalid order %d", __func__, order));

        vm_freelist_add(fl, m, order, pool, tail);
#ifdef VM_FREEPOOL_LAZYINIT
        if (__predict_false(pool == VM_FREEPOOL_LAZYINIT)) {
                vm_page_t m_next;
                vm_paddr_t pa;
                int npages;

                npages = 1 << order;
                m_next = m + npages;
                pa = m->phys_addr + ptoa(npages);
                if (pa < vm_phys_segs[m->segind].end) {
                        vm_page_init_page(m_next, pa, m->segind,
                            VM_FREEPOOL_LAZYINIT);
                }
        }
#endif
}

/*
 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
 * aligned and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static void
vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
    int tail)
{
        int order;

        KASSERT(npages == 0 ||
            (VM_PAGE_TO_PHYS(m) &
            ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
            ("%s: page %p and npages %u are misaligned",
            __func__, m, npages));
        while (npages > 0) {
                KASSERT(m->order == VM_NFREEORDER,
                    ("%s: page %p has unexpected order %d",
                    __func__, m, m->order));
                order = ilog2(npages);
                KASSERT(order < VM_NFREEORDER,
                    ("%s: order %d is out of range", __func__, order));
                vm_phys_enq_chunk(fl, m, order, pool, tail);
                m += 1 << order;
                npages -= 1 << order;
        }
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * If npages is zero, this function does nothing and ignores the physical page
 * parameter m.  Otherwise, the physical page m's buddy must not be free.
 */
static vm_page_t
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
    int tail)
{
        int order;

        KASSERT(npages == 0 ||
            ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
            ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
            ("vm_phys_enq_range: page %p and npages %u are misaligned",
            m, npages));
        while (npages > 0) {
                KASSERT(m->order == VM_NFREEORDER,
                    ("vm_phys_enq_range: page %p has unexpected order %d",
                    m, m->order));
                order = ffs(npages) - 1;
                vm_phys_enq_chunk(fl, m, order, pool, tail);
                m += 1 << order;
                npages -= 1 << order;
        }
        return (m);
}
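
/*
 * Worked example (illustrative): both helpers decompose npages into
 * power-of-two chunks, but from opposite directions.  For npages = 13,
 * vm_phys_enq_beg() frees the largest chunks first (8 + 4 + 1 pages from the
 * front of the run), whereas vm_phys_enq_range() frees the smallest chunks
 * first (1 + 4 + 8 pages), which keeps the chunks aligned relative to the end
 * of the power-of-two set.
 */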

/*
 * Complete initialization of a contiguous, power of two-sized set of physical
 * pages.
 *
 * If the pages currently belong to the lazy init pool, then the corresponding
 * page structures must be initialized.  In this case it is assumed that the
 * first page in the run has already been initialized.
 */
static void
vm_phys_finish_init(vm_page_t m, int order)
{
#ifdef VM_FREEPOOL_LAZYINIT
        if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
                vm_paddr_t pa;
                int segind;

                TSENTER();
                pa = m->phys_addr + PAGE_SIZE;
                segind = m->segind;
                for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
                    m_tmp++, pa += PAGE_SIZE)
                        vm_page_init_page(m_tmp, pa, segind, VM_NFREEPOOL);
                TSEXIT();
        }
#endif
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().  The allocated pages have no
 * valid pool field set.
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
        struct vm_freelist *alt, *fl;
        vm_page_t m;
        int avail, end, flind, freelist, i, oind, pind;

        KASSERT(domain >= 0 && domain < vm_ndomains,
            ("vm_phys_alloc_npages: domain %d is out of range", domain));
        KASSERT(vm_phys_pool_valid(pool),
            ("vm_phys_alloc_npages: pool %d is out of range", pool));
        KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
            ("vm_phys_alloc_npages: npages %d is out of range", npages));
        vm_domain_free_assert_locked(VM_DOMAIN(domain));
        i = 0;
        for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
                flind = vm_freelist_to_flind[freelist];
                if (flind < 0)
                        continue;
                fl = vm_phys_free_queues[domain][flind][pool];
                for (oind = 0; oind < VM_NFREEORDER; oind++) {
                        while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
                                vm_freelist_rem(fl, m, oind);
                                avail = i + (1 << oind);
                                end = imin(npages, avail);
                                while (i < end)
                                        ma[i++] = m++;
                                if (i == npages) {
                                        /*
                                         * Return excess pages to fl.  Its order
                                         * [0, oind) queues are empty.
                                         */
                                        vm_phys_enq_range(m, avail - i, fl,
                                            pool, 1);
                                        return (npages);
                                }
                        }
                }
                for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
                        for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
                            pind++) {
                                alt = vm_phys_free_queues[domain][flind][pind];
                                while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
                                    NULL) {
                                        vm_freelist_rem(alt, m, oind);
                                        vm_phys_finish_init(m, oind);
                                        avail = i + (1 << oind);
                                        end = imin(npages, avail);
                                        while (i < end)
                                                ma[i++] = m++;
                                        if (i == npages) {
                                                /*
                                                 * Return excess pages to fl.
                                                 * Its order [0, oind) queues
                                                 * are empty.
                                                 */
                                                vm_phys_enq_range(m, avail - i,
                                                    fl, pool, 1);
                                                return (npages);
                                        }
                                }
                        }
                }
        }
        return (i);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
static vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
        struct vm_freelist *alt, *fl;
        vm_page_t m;
        int oind, pind, flind;

        KASSERT(domain >= 0 && domain < vm_ndomains,
            ("vm_phys_alloc_freelist_pages: domain %d is out of range",
            domain));
        KASSERT(freelist < VM_NFREELIST,
            ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
            freelist));
        KASSERT(vm_phys_pool_valid(pool),
            ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
        KASSERT(order < VM_NFREEORDER,
            ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

        flind = vm_freelist_to_flind[freelist];
        /* Check if freelist is present */
        if (flind < 0)
                return (NULL);

        vm_domain_free_assert_locked(VM_DOMAIN(domain));
        fl = &vm_phys_free_queues[domain][flind][pool][0];
        for (oind = order; oind < VM_NFREEORDER; oind++) {
                m = TAILQ_FIRST(&fl[oind].pl);
                if (m != NULL) {
                        vm_freelist_rem(fl, m, oind);
                        /* The order [order, oind) queues are empty. */
                        vm_phys_split_pages(m, oind, fl, order, pool, 1);
                        return (m);
                }
        }

        /*
         * The given pool was empty.  Find the largest
         * contiguous, power-of-two-sized set of pages in any
         * pool.  Transfer these pages to the given pool, and
         * use them to satisfy the allocation.
         */
        for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
                for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
                        alt = &vm_phys_free_queues[domain][flind][pind][0];
                        m = TAILQ_FIRST(&alt[oind].pl);
                        if (m != NULL) {
                                vm_freelist_rem(alt, m, oind);
                                vm_phys_finish_init(m, oind);
                                /* The order [order, oind) queues are empty. */
                                vm_phys_split_pages(m, oind, fl, order, pool, 1);
                                return (m);
                        }
                }
        }
        return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
        vm_page_t m;
        int freelist;

        for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
                m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
                if (m != NULL)
                        return (m);
        }
        return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address, which must lie
 * within the given physical memory segment.
 */
vm_page_t
vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
{
        KASSERT(pa >= seg->start && pa < seg->end,
            ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));

        return (&seg->first_page[atop(pa - seg->start)]);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
        struct vm_phys_seg *seg;

        if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
                return (vm_phys_seg_paddr_to_vm_page(seg, pa));
        return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
        struct vm_phys_fictitious_seg tmp, *seg;
        vm_page_t m;

        m = NULL;
        tmp.start = pa;
        tmp.end = 0;

        rw_rlock(&vm_phys_fictitious_reg_lock);
        seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
        rw_runlock(&vm_phys_fictitious_reg_lock);
        if (seg == NULL)
                return (NULL);

        m = &seg->first_page[atop(pa - seg->start)];
        KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

        return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
        long i;

        bzero(range, page_count * sizeof(*range));
        for (i = 0; i < page_count; i++) {
                vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
                range[i].oflags &= ~VPO_UNMANAGED;
                range[i].busy_lock = VPB_UNBUSIED;
        }
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
        struct vm_phys_fictitious_seg *seg;
        vm_page_t fp;
        long page_count;
#ifdef VM_PHYSSEG_DENSE
        long pi, pe;
        long dpage_count;
#endif

        KASSERT(start < end,
            ("Start of segment isn't less than end (start: %jx end: %jx)",
            (uintmax_t)start, (uintmax_t)end));

        page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
        pi = atop(start);
        pe = atop(end);
        if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
                fp = &vm_page_array[pi - first_page];
                if ((pe - first_page) > vm_page_array_size) {
                        /*
                         * We have a segment that starts inside
                         * of vm_page_array, but ends outside of it.
                         *
                         * Use vm_page_array pages for those that are
                         * inside of the vm_page_array range, and
                         * allocate the remaining ones.
                         */
                        dpage_count = vm_page_array_size - (pi - first_page);
                        vm_phys_fictitious_init_range(fp, start, dpage_count,
                            memattr);
                        page_count -= dpage_count;
                        start += ptoa(dpage_count);
                        goto alloc;
                }
                /*
                 * We can allocate the full range from vm_page_array,
                 * so there's no need to register the range in the tree.
                 */
                vm_phys_fictitious_init_range(fp, start, page_count, memattr);
                return (0);
        } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
                /*
                 * We have a segment that ends inside of vm_page_array,
                 * but starts outside of it.
                 */
                fp = &vm_page_array[0];
                dpage_count = pe - first_page;
                vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
                    memattr);
                end -= ptoa(dpage_count);
                page_count -= dpage_count;
                goto alloc;
        } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
                /*
                 * Trying to register a fictitious range that expands before
                 * and after vm_page_array.
                 */
                return (EINVAL);
        } else {
alloc:
#endif
                fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
                    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
        }
#endif
        vm_phys_fictitious_init_range(fp, start, page_count, memattr);

        seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
        seg->start = start;
        seg->end = end;
        seg->first_page = fp;

        rw_wlock(&vm_phys_fictitious_reg_lock);
        RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
        rw_wunlock(&vm_phys_fictitious_reg_lock);

        return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
        struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
        long pi, pe;
#endif

        KASSERT(start < end,
            ("Start of segment isn't less than end (start: %jx end: %jx)",
            (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
        pi = atop(start);
        pe = atop(end);
        if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
                if ((pe - first_page) <= vm_page_array_size) {
                        /*
                         * This segment was allocated using vm_page_array
                         * only, there's nothing to do since those pages
                         * were never added to the tree.
                         */
                        return;
                }
                /*
                 * We have a segment that starts inside
                 * of vm_page_array, but ends outside of it.
                 *
                 * Calculate how many pages were added to the
                 * tree and free them.
                 */
                start = ptoa(first_page + vm_page_array_size);
        } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
                /*
                 * We have a segment that ends inside of vm_page_array,
                 * but starts outside of it.
                 */
                end = ptoa(first_page);
        } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
                /* Since it's not possible to register such a range, panic. */
                panic(
                    "Unregistering not registered fictitious range [%#jx:%#jx]",
                    (uintmax_t)start, (uintmax_t)end);
        }
#endif
        tmp.start = start;
        tmp.end = 0;

        rw_wlock(&vm_phys_fictitious_reg_lock);
        seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
        if (seg->start != start || seg->end != end) {
                rw_wunlock(&vm_phys_fictitious_reg_lock);
                panic(
                    "Unregistering not registered fictitious range [%#jx:%#jx]",
                    (uintmax_t)start, (uintmax_t)end);
        }
        RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
        rw_wunlock(&vm_phys_fictitious_reg_lock);
        free(seg->first_page, M_FICT_PAGES);
        free(seg, M_FICT_PAGES);
}
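
/*
 * Hedged usage sketch (hypothetical addresses and attribute): a driver that
 * wants fictitious pages backing a 1 MB device aperture might do something
 * like
 *
 *      error = vm_phys_fictitious_reg_range(0xd0000000, 0xd0100000,
 *          VM_MEMATTR_UNCACHEABLE);
 *      ...
 *      m = vm_phys_fictitious_to_vm_page(0xd0004000);
 *      ...
 *      vm_phys_fictitious_unreg_range(0xd0000000, 0xd0100000);
 *
 * The unregister call must use exactly the same [start, end) pair that was
 * registered, as enforced by the panic above.
 */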

/*
 * Free a contiguous, power of two-sized set of physical pages.
 * The pool field in the first page determines the destination pool.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int pool, int order)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        vm_paddr_t pa;
        vm_page_t m_buddy;

        KASSERT(m->order == VM_NFREEORDER,
            ("%s: page %p has unexpected order %d",
            __func__, m, m->order));
        KASSERT(vm_phys_pool_valid(pool),
            ("%s: unexpected pool param %d", __func__, pool));
        KASSERT(order < VM_NFREEORDER,
            ("%s: order %d is out of range", __func__, order));
        seg = &vm_phys_segs[m->segind];
        vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
        if (order < VM_NFREEORDER - 1) {
                pa = VM_PAGE_TO_PHYS(m);
                do {
                        pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
                        if (pa < seg->start || pa >= seg->end)
                                break;
                        m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
                        if (m_buddy->order != order)
                                break;
                        fl = (*seg->free_queues)[m_buddy->pool];
                        vm_freelist_rem(fl, m_buddy, order);
                        vm_phys_finish_init(m_buddy, order);
                        order++;
                        pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
                        m = vm_phys_seg_paddr_to_vm_page(seg, pa);
                } while (order < VM_NFREEORDER - 1);
        }
        fl = (*seg->free_queues)[pool];
        vm_freelist_add(fl, m, order, pool, 1);
}
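
/*
 * Worked example (illustrative): freeing a single page at physical address pa
 * with order 0 first checks the buddy at pa ^ PAGE_SIZE.  If that buddy is
 * also a free order-0 block in the same segment, the two are merged into an
 * order-1 block at pa rounded down to a 2-page boundary, and the loop repeats
 * with the order-1 buddy, continuing until a buddy is missing, busy, or of a
 * different order, or VM_NFREEORDER - 1 is reached.
 */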

#ifdef VM_FREEPOOL_LAZYINIT
/*
 * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
 * them to the default pool.  This is a prerequisite for some rare operations
 * which need to scan the page array and thus depend on all pages being
 * initialized.
 */
static void
vm_phys_lazy_init_domain(int domain, bool locked)
{
        static bool initdone[MAXMEMDOM];
        struct vm_domain *vmd;
        struct vm_freelist *fl;
        vm_page_t m;
        int pind;
        bool unlocked;

        if (__predict_true(atomic_load_bool(&initdone[domain])))
                return;

        vmd = VM_DOMAIN(domain);
        if (locked)
                vm_domain_free_assert_locked(vmd);
        else
                vm_domain_free_lock(vmd);
        if (atomic_load_bool(&initdone[domain]))
                goto out;
        pind = VM_FREEPOOL_LAZYINIT;
        for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
                int flind;

                flind = vm_freelist_to_flind[freelist];
                if (flind < 0)
                        continue;
                fl = vm_phys_free_queues[domain][flind][pind];
                for (int oind = 0; oind < VM_NFREEORDER; oind++) {
                        if (atomic_load_int(&fl[oind].lcnt) == 0)
                                continue;
                        while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
                                /*
                                 * Avoid holding the lock across the
                                 * initialization unless there's a free page
                                 * shortage.
                                 */
                                vm_freelist_rem(fl, m, oind);
                                unlocked = vm_domain_allocate(vmd,
                                    VM_ALLOC_NORMAL, 1 << oind);
                                if (unlocked)
                                        vm_domain_free_unlock(vmd);
                                vm_phys_finish_init(m, oind);
                                if (unlocked) {
                                        vm_domain_freecnt_inc(vmd, 1 << oind);
                                        vm_domain_free_lock(vmd);
                                }
                                vm_phys_free_pages(m, VM_FREEPOOL_DEFAULT,
                                    oind);
                        }
                }
        }
        atomic_store_bool(&initdone[domain], true);
out:
        if (!locked)
                vm_domain_free_unlock(vmd);
}

static void
vm_phys_lazy_init(void)
{
        for (int domain = 0; domain < vm_ndomains; domain++)
                vm_phys_lazy_init_domain(domain, false);
        atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
}

static void
vm_phys_lazy_init_kthr(void *arg __unused)
{
        vm_phys_lazy_init();
        kthread_exit();
}

static void
vm_phys_lazy_sysinit(void *arg __unused)
{
        struct thread *td;
        int error;

        error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
            RFSTOPPED, 0, "vmlazyinit");
        if (error == 0) {
                thread_lock(td);
                sched_prio(td, PRI_MIN_IDLE);
                sched_add(td, SRQ_BORING);
        } else {
                printf("%s: could not create lazy init thread: %d\n",
                    __func__, error);
                vm_phys_lazy_init();
        }
}
SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
    NULL);
#endif /* VM_FREEPOOL_LAZYINIT */

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.  Assumes no pages have a valid pool field.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        vm_page_t m_end;
        vm_paddr_t diff, lo;
        int order;

        /*
         * Avoid unnecessary coalescing by freeing the pages in the largest
         * possible power-of-two-sized subsets.
         */
        vm_domain_free_assert_locked(vm_pagequeue_domain(m));
        seg = &vm_phys_segs[m->segind];
        fl = (*seg->free_queues)[pool];
        m_end = m + npages;
        /* Free blocks of increasing size. */
        lo = atop(VM_PAGE_TO_PHYS(m));
        if (m < m_end &&
            (diff = lo ^ (lo + npages - 1)) != 0) {
                order = min(ilog2(diff), VM_NFREEORDER - 1);
                m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl,
                    pool, 1);
        }

        /* Free blocks of maximum size. */
        order = VM_NFREEORDER - 1;
        while (m + (1 << order) <= m_end) {
                KASSERT(seg == &vm_phys_segs[m->segind],
                    ("%s: page range [%p,%p) spans multiple segments",
                    __func__, m_end - npages, m));
                vm_phys_enq_chunk(fl, m, order, pool, 1);
                m += 1 << order;
        }
        /* Free blocks of diminishing size. */
        vm_phys_enq_beg(m, m_end - m, fl, pool, 1);
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 * Assumes that every page but the first has no valid pool field.
 * Uses the pool value in the first page if valid, otherwise default.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, int pool, u_long npages)
{
        vm_paddr_t lo;
        vm_page_t m_start, m_end;
        unsigned max_order, order_start, order_end;

        vm_domain_free_assert_locked(vm_pagequeue_domain(m));

        lo = atop(VM_PAGE_TO_PHYS(m));
        max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);

        m_start = m;
        order_start = ffsll(lo) - 1;
        if (order_start < max_order)
                m_start += 1 << order_start;
        m_end = m + npages;
        order_end = ffsll(lo + npages) - 1;
        if (order_end < max_order)
                m_end -= 1 << order_end;
        /*
         * Avoid unnecessary coalescing by freeing the pages at the start and
         * end of the range last.
         */
        if (m_start < m_end)
                vm_phys_enqueue_contig(m_start, pool, m_end - m_start);
        if (order_start < max_order)
                vm_phys_free_pages(m, pool, order_start);
        if (order_end < max_order)
                vm_phys_free_pages(m_end, pool, order_end);
}
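
/*
 * Worked example (illustrative): for a run starting at page frame 6 with
 * npages = 9, max_order = ilog2(6 ^ 15) = 3, order_start = ffs(6) - 1 = 1 and
 * order_end = ffs(15) - 1 = 0, so the middle pages [m + 2, m + 8) are enqueued
 * without coalescing while the order-1 block at the start and the final page
 * are freed through vm_phys_free_pages(), where they may merge with
 * neighboring free blocks outside the run.
 */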

/*
 * Identify the first address range within segment segind or greater
 * that matches the domain, lies within the low/high range, and has
 * enough pages.  Return -1 if there is none.
 */
int
vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
    u_long npages, vm_paddr_t low, vm_paddr_t high)
{
        vm_paddr_t pa_end, pa_start;
        struct vm_phys_seg *end_seg, *seg;

        KASSERT(npages > 0, ("npages is zero"));
        KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
        end_seg = &vm_phys_segs[vm_phys_nsegs];
        for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
                if (seg->domain != domain)
                        continue;
                if (seg->start >= high)
                        return (-1);
                pa_start = MAX(low, seg->start);
                pa_end = MIN(high, seg->end);
                if (pa_end - pa_start < ptoa(npages))
                        continue;
#ifdef VM_FREEPOOL_LAZYINIT
                /*
                 * The pages on the free lists must be initialized.
                 */
                vm_phys_lazy_init_domain(domain, false);
#endif
                bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
                bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
                return (seg - vm_phys_segs);
        }
        return (-1);
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return true.  Otherwise, return
 * false, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
bool
vm_phys_unfree_page(vm_paddr_t pa)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        vm_paddr_t pa_half;
        vm_page_t m, m_set, m_tmp;
        int order, pool;

        seg = vm_phys_paddr_to_seg(pa);
        vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));

#ifdef VM_FREEPOOL_LAZYINIT
        /*
         * The pages on the free lists must be initialized.
         */
        vm_phys_lazy_init_domain(seg->domain, true);
#endif

        /*
         * First, find the contiguous, power of two-sized set of free
         * physical pages containing the given physical page "m" and
         * assign it to "m_set".
         */
        m = vm_phys_paddr_to_vm_page(pa);
        for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
            order < VM_NFREEORDER - 1; ) {
                order++;
                pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
                if (pa >= seg->start)
                        m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
                else
                        return (false);
        }
        if (m_set->order < order)
                return (false);
        if (m_set->order == VM_NFREEORDER)
                return (false);
        KASSERT(m_set->order < VM_NFREEORDER,
            ("vm_phys_unfree_page: page %p has unexpected order %d",
            m_set, m_set->order));

        /*
         * Next, remove "m_set" from the free lists.  Finally, extract
         * "m" from "m_set" using an iterative algorithm: While "m_set"
         * is larger than a page, shrink "m_set" by returning the half
         * of "m_set" that does not contain "m" to the free lists.
         */
        pool = m_set->pool;
        fl = (*seg->free_queues)[pool];
        order = m_set->order;
        vm_freelist_rem(fl, m_set, order);
        while (order > 0) {
                order--;
                pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
                if (m->phys_addr < pa_half)
                        m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
                else {
                        m_tmp = m_set;
                        m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
                }
                vm_freelist_add(fl, m_tmp, order, pool, 0);
        }
        KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
        return (true);
}
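
/*
 * Worked example (illustrative): if "m" lies inside a free order-2 block
 * "m_set" of four pages, the block is removed from its free list and then
 * split twice: the half (two pages) not containing "m" is returned at order 1,
 * the remaining buddy page is returned at order 0, and "m" itself is left off
 * the free lists, as if it had been allocated.
 */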

/*
 * Find a run of contiguous physical pages, meeting alignment requirements, from
 * a list of max-sized page blocks, where we need at least two consecutive
 * blocks to satisfy the (large) page request.
 */
static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
        struct vm_phys_seg *seg;
        vm_page_t m, m_iter, m_ret;
        vm_paddr_t max_size, size;
        int max_order;

        max_order = VM_NFREEORDER - 1;
        size = npages << PAGE_SHIFT;
        max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
        KASSERT(size > max_size, ("size is too small"));

        /*
         * In order to avoid examining any free max-sized page block more than
         * twice, identify the ones that are first in a physically-contiguous
         * sequence of such blocks, and only for those walk the sequence to
         * check if there are enough free blocks starting at a properly aligned
         * block.  Thus, no block is checked for free-ness more than twice.
         */
        TAILQ_FOREACH(m, &fl[max_order].pl, plinks.q) {
                /*
                 * Skip m unless it is first in a sequence of free max page
                 * blocks >= low in its segment.
                 */
                seg = &vm_phys_segs[m->segind];
                if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
                        continue;
                if (VM_PAGE_TO_PHYS(m) >= max_size &&
                    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
                    max_order == m[-1 << max_order].order)
                        continue;

                /*
                 * Advance m_ret from m to the first of the sequence, if any,
                 * that satisfies alignment conditions and might leave enough
                 * space.
                 */
                m_ret = m;
                while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
                    size, alignment, boundary) &&
                    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
                    max_order == m_ret[1 << max_order].order)
                        m_ret += 1 << max_order;

                /*
                 * Skip m unless some block m_ret in the sequence is properly
                 * aligned, and begins a sequence of enough pages less than
                 * high, and in the same segment.
                 */
                if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
                        continue;

                /*
                 * Skip m unless the blocks to allocate starting at m_ret are
                 * all free.
                 */
                for (m_iter = m_ret;
                    m_iter < m_ret + npages && max_order == m_iter->order;
                    m_iter += 1 << max_order) {
                }
                if (m_iter < m_ret + npages)
                        continue;
                return (m_ret);
        }
        return (NULL);
}

/*
 * Find a run of contiguous physical pages from the specified free list
 * table.
 */
static vm_page_t
vm_phys_find_queues_contig(
    struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
    u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
        struct vm_freelist *fl;
        vm_page_t m_ret;
        vm_paddr_t pa, pa_end, size;
        int oind, order, pind;

        KASSERT(npages > 0, ("npages is 0"));
        KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
        KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
        /* Compute the queue that is the best fit for npages. */
        order = flsl(npages - 1);
        /* Search for a large enough free block. */
        size = npages << PAGE_SHIFT;
        for (oind = order; oind < VM_NFREEORDER; oind++) {
                for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
                        fl = (*queues)[pind];
                        TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
                                /*
                                 * Determine if the address range starting at pa
                                 * is within the given range, satisfies the
                                 * given alignment, and does not cross the given
                                 * boundary.
                                 */
                                pa = VM_PAGE_TO_PHYS(m_ret);
                                pa_end = pa + size;
                                if (low <= pa && pa_end <= high &&
                                    vm_addr_ok(pa, size, alignment, boundary))
                                        return (m_ret);
                        }
                }
        }
        if (order < VM_NFREEORDER)
                return (NULL);
        /* Search for a long-enough sequence of max-order blocks. */
        for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
                fl = (*queues)[pind];
                m_ret = vm_phys_find_freelist_contig(fl, npages,
                    low, high, alignment, boundary);
                if (m_ret != NULL)
                        return (m_ret);
        }
        return (NULL);
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.  Sets the pool
 * field to DEFAULT in the first allocated page.
 */
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
        vm_paddr_t pa_end, pa_start;
        struct vm_freelist *fl;
        vm_page_t m, m_run;
        struct vm_phys_seg *seg;
        struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
        int oind, segind;

        KASSERT(npages > 0, ("npages is 0"));
        KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
        KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
        vm_domain_free_assert_locked(VM_DOMAIN(domain));
        if (low >= high)
                return (NULL);
        queues = NULL;
        m_run = NULL;
        for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
                seg = &vm_phys_segs[segind];
                if (seg->start >= high || seg->domain != domain)
                        continue;
                if (low >= seg->end)
                        break;
                if (low <= seg->start)
                        pa_start = seg->start;
                else
                        pa_start = low;
                if (high < seg->end)
                        pa_end = high;
                else
                        pa_end = seg->end;
                if (pa_end - pa_start < ptoa(npages))
                        continue;
                /*
                 * If a previous segment led to a search using
                 * the same free lists as would this segment, then
                 * we've actually already searched within this
                 * too.  So skip it.
                 */
                if (seg->free_queues == queues)
                        continue;
                queues = seg->free_queues;
                m_run = vm_phys_find_queues_contig(queues, npages,
                    low, high, alignment, boundary);
                if (m_run != NULL)
                        break;
        }
        if (m_run == NULL)
                return (NULL);

        /* Allocate pages from the page-range found. */
        for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
                fl = (*queues)[m->pool];
                oind = m->order;
                vm_freelist_rem(fl, m, oind);
                vm_phys_finish_init(m, oind);
        }
        /* Return excess pages to the free lists. */
        fl = (*queues)[VM_FREEPOOL_DEFAULT];
        vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl,
            VM_FREEPOOL_DEFAULT, 0);

        /* Return page verified to satisfy conditions of request. */
        pa_start = VM_PAGE_TO_PHYS(m_run);
        KASSERT(low <= pa_start,
            ("memory allocated below minimum requested range"));
        KASSERT(pa_start + ptoa(npages) <= high,
            ("memory allocated above maximum requested range"));
        seg = &vm_phys_segs[m_run->segind];
        KASSERT(seg->domain == domain,
            ("memory not allocated from specified domain"));
        KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
            ("memory alignment/boundary constraints not satisfied"));
        return (m_run);
}
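
/*
 * Hedged usage sketch (hypothetical parameters): callers normally reach this
 * function through higher-level page allocators, but a direct request for 512
 * contiguous pages below 4G, aligned to 2 MB, from domain 0 would look roughly
 * like
 *
 *      vm_domain_free_lock(VM_DOMAIN(0));
 *      m = vm_phys_alloc_contig(0, 512, 0, (vm_paddr_t)1 << 32,
 *          2 * 1024 * 1024, 0);
 *      vm_domain_free_unlock(VM_DOMAIN(0));
 *
 * with m == NULL indicating that no suitable run was found.
 */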

/*
 * Return the index of the first unused slot, which may be the terminating
 * entry.
 */
static int
vm_phys_avail_count(void)
{
        int i;

        for (i = 0; i < PHYS_AVAIL_COUNT; i += 2)
                if (phys_avail[i] == 0 && phys_avail[i + 1] == 0)
                        return (i);
        panic("Improperly terminated phys_avail[]");
}

/*
 * Assert that a phys_avail entry is valid.
 */
static void
vm_phys_avail_check(int i)
{
        if (i % 2 != 0)
                panic("Chunk start index %d is not even.", i);
        if (phys_avail[i] & PAGE_MASK)
                panic("Unaligned phys_avail[%d]: %#jx", i,
                    (intmax_t)phys_avail[i]);
        if (phys_avail[i + 1] & PAGE_MASK)
                panic("Unaligned phys_avail[%d + 1]: %#jx", i,
                    (intmax_t)phys_avail[i + 1]);
        if (phys_avail[i + 1] < phys_avail[i])
                panic("phys_avail[%d]: start %#jx > end %#jx", i,
                    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i + 1]);
}

/*
 * Return the index of an overlapping phys_avail entry or -1.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
        int i;

        for (i = 0; phys_avail[i + 1]; i += 2)
                if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
                        return (i);
        return (-1);
}
#endif

/*
 * Return the index of the largest entry.
 */
int
vm_phys_avail_largest(void)
{
        vm_paddr_t sz, largesz;
        int largest;
        int i;

        largest = 0;
        largesz = 0;
        for (i = 0; phys_avail[i + 1]; i += 2) {
                sz = vm_phys_avail_size(i);
                if (sz > largesz) {
                        largesz = sz;
                        largest = i;
                }
        }

        return (largest);
}

vm_paddr_t
vm_phys_avail_size(int i)
{

        return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split a chunk in phys_avail[] at the address 'pa'.
 *
 * 'pa' must be within a chunk (slots i and i + 1) or one of its boundaries.
 * Returns zero on an actual split, in which case the two new chunks occupy
 * slots i to i + 3; EJUSTRETURN if 'pa' was one of the boundaries (and no
 * split actually occurred); or ENOSPC if there are not enough slots in
 * phys_avail[] to represent the additional chunk caused by the split.
 */
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
        int cnt;

        vm_phys_avail_check(i);
        if (pa < phys_avail[i] || pa > phys_avail[i + 1])
                panic("%s: Address %#jx not in range at slot %d [%#jx;%#jx].",
                    __func__, (uintmax_t)pa, i,
                    (uintmax_t)phys_avail[i], (uintmax_t)phys_avail[i + 1]);
        if (pa == phys_avail[i] || pa == phys_avail[i + 1])
                return (EJUSTRETURN);
        cnt = vm_phys_avail_count();
        if (cnt >= PHYS_AVAIL_ENTRIES)
                return (ENOSPC);
        memmove(&phys_avail[i + 2], &phys_avail[i],
            (cnt - i) * sizeof(phys_avail[0]));
        phys_avail[i + 1] = pa;
        phys_avail[i + 2] = pa;
        vm_phys_avail_check(i);
        vm_phys_avail_check(i + 2);

        return (0);
}
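
/*
 * Worked example (illustrative): if slot i describes the chunk
 * [0x100000, 0x400000) and vm_phys_avail_split(0x200000, i) succeeds, the
 * following entries are shifted up by two slots and the array then contains
 * [0x100000, 0x200000) at slots i, i + 1 and [0x200000, 0x400000) at slots
 * i + 2, i + 3.
 */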

/*
 * Check if a given physical address can be included as part of a crash dump.
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);

	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}

void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
#ifdef NUMA
	int mem_index;
#endif
	int i, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_start = 0;
	mem_end = -1;
#ifdef NUMA
	mem_index = 0;
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find the biggest physical segment within the desired
	 * NUMA domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* Skip regions that are out of range. */
		if (phys_avail[i + 1] - alloc_size < mem_start ||
		    phys_avail[i + 1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* The split failed; waste the unaligned tail. */
		phys_avail[biggestone + 1] -= align;

	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}
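
/*
 * Illustrative example (not part of the original source) for the alignment
 * logic above, assuming alloc_size is a power of two as the masking
 * requires: suppose the biggest suitable chunk is [0x40000000, 0x7ff43000)
 * and alloc_size is 2MB (0x200000).  Then align = 0x7ff43000 & 0x1fffff =
 * 0x143000, so the chunk is split at 0x7fe00000 and the 0x143000-byte tail
 * becomes its own chunk.  The allocation is carved from the new end:
 * pa = 0x7fe00000 - 0x200000 = 0x7fc00000, which is naturally (2MB)
 * aligned.  If the split fails for lack of slots, the tail is simply
 * dropped from phys_avail[] instead.
 */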

void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	if (phys_avail[1] == 0)
		panic("phys_avail[] is empty");

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif
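
/*
 * Usage note (illustrative, not part of the original source): on a kernel
 * built with DDB, the table above can be produced from the debugger prompt
 * with:
 *
 *	db> show freepages
 *
 * which prints, for each domain and free list, the number of free blocks of
 * each order in each pool.
 */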